diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,31636 @@ +{ + "epoch": 0.9999831931831551, + "global_step": 22312, + "max_steps": 22312, + "logging_steps": 5, + "eval_steps": 500, + "save_steps": 500, + "train_batch_size": 32, + "num_train_epochs": 1, + "num_input_tokens_seen": 0, + "total_flos": 1.364299850107899e+19, + "log_history": [ + { + "loss": 14.6143, + "grad_norm": 3.2611544132232666, + "learning_rate": 0.0005, + "epoch": 0.00022409089126549728, + "step": 5 + }, + { + "loss": 14.4676, + "grad_norm": 2.4582467079162598, + "learning_rate": 0.0005, + "epoch": 0.00044818178253099456, + "step": 10 + }, + { + "loss": 14.4128, + "grad_norm": 2.327120065689087, + "learning_rate": 0.0005, + "epoch": 0.0006722726737964918, + "step": 15 + }, + { + "loss": 14.3947, + "grad_norm": 2.182023048400879, + "learning_rate": 0.0005, + "epoch": 0.0008963635650619891, + "step": 20 + }, + { + "loss": 14.3541, + "grad_norm": 2.0157508850097656, + "learning_rate": 0.0005, + "epoch": 0.0011204544563274863, + "step": 25 + }, + { + "loss": 14.2979, + "grad_norm": 2.0835866928100586, + "learning_rate": 0.0005, + "epoch": 0.0013445453475929836, + "step": 30 + }, + { + "loss": 14.3136, + "grad_norm": 2.124310255050659, + "learning_rate": 0.0005, + "epoch": 0.001568636238858481, + "step": 35 + }, + { + "loss": 14.3155, + "grad_norm": 2.100733757019043, + "learning_rate": 0.0005, + "epoch": 0.0017927271301239783, + "step": 40 + }, + { + "loss": 14.2023, + "grad_norm": 2.044724702835083, + "learning_rate": 0.0005, + "epoch": 0.0020168180213894758, + "step": 45 + }, + { + "loss": 14.3092, + "grad_norm": 2.281487464904785, + "learning_rate": 0.0005, + "epoch": 0.0022409089126549727, + "step": 50 + }, + { + "loss": 14.1618, + "grad_norm": 1.9946085214614868, + "learning_rate": 0.0005, + "epoch": 0.00246499980392047, + "step": 55 + }, + { + "loss": 14.3028, + "grad_norm": 2.176398754119873, + "learning_rate": 0.0005, + "epoch": 0.0026890906951859673, + "step": 60 + }, + { + "loss": 14.3326, + "grad_norm": 1.999497890472412, + "learning_rate": 0.0005, + "epoch": 0.0029131815864514646, + "step": 65 + }, + { + "loss": 14.4064, + "grad_norm": 1.9895137548446655, + "learning_rate": 0.0005, + "epoch": 0.003137272477716962, + "step": 70 + }, + { + "loss": 14.3487, + "grad_norm": 1.9479091167449951, + "learning_rate": 0.0005, + "epoch": 0.003361363368982459, + "step": 75 + }, + { + "loss": 14.2659, + "grad_norm": 2.2668464183807373, + "learning_rate": 0.0005, + "epoch": 0.0035854542602479565, + "step": 80 + }, + { + "loss": 14.2306, + "grad_norm": 2.197277069091797, + "learning_rate": 0.0005, + "epoch": 0.003809545151513454, + "step": 85 + }, + { + "loss": 14.2925, + "grad_norm": 2.349569797515869, + "learning_rate": 0.0005, + "epoch": 0.0040336360427789516, + "step": 90 + }, + { + "loss": 14.2346, + "grad_norm": 2.1121737957000732, + "learning_rate": 0.0005, + "epoch": 0.004257726934044448, + "step": 95 + }, + { + "loss": 14.2444, + "grad_norm": 2.3515448570251465, + "learning_rate": 0.0005, + "epoch": 0.004481817825309945, + "step": 100 + }, + { + "loss": 14.2322, + "grad_norm": 2.0645346641540527, + "learning_rate": 0.0005, + "epoch": 0.004705908716575443, + "step": 105 + }, + { + "loss": 14.2183, + "grad_norm": 2.159444808959961, + "learning_rate": 0.0005, + "epoch": 0.00492999960784094, + "step": 110 + }, + { + "loss": 14.3575, + "grad_norm": 2.2058215141296387, + "learning_rate": 0.0005, + "epoch": 0.005154090499106438, + "step": 115 + }, + { + "loss": 14.3028, + "grad_norm": 2.4209940433502197, + "learning_rate": 0.0005, + "epoch": 0.0053781813903719345, + "step": 120 + }, + { + "loss": 14.2203, + "grad_norm": 2.0729739665985107, + "learning_rate": 0.0005, + "epoch": 0.005602272281637432, + "step": 125 + }, + { + "loss": 14.2853, + "grad_norm": 2.0709404945373535, + "learning_rate": 0.0005, + "epoch": 0.005826363172902929, + "step": 130 + }, + { + "loss": 14.3667, + "grad_norm": 1.994726300239563, + "learning_rate": 0.0005, + "epoch": 0.006050454064168427, + "step": 135 + }, + { + "loss": 14.2726, + "grad_norm": 2.0086257457733154, + "learning_rate": 0.0005, + "epoch": 0.006274544955433924, + "step": 140 + }, + { + "loss": 14.3021, + "grad_norm": 2.103579044342041, + "learning_rate": 0.0005, + "epoch": 0.0064986358466994215, + "step": 145 + }, + { + "loss": 14.2639, + "grad_norm": 2.1623342037200928, + "learning_rate": 0.0005, + "epoch": 0.006722726737964918, + "step": 150 + }, + { + "loss": 14.3839, + "grad_norm": 2.345151662826538, + "learning_rate": 0.0005, + "epoch": 0.006946817629230416, + "step": 155 + }, + { + "loss": 14.3292, + "grad_norm": 2.145923137664795, + "learning_rate": 0.0005, + "epoch": 0.007170908520495913, + "step": 160 + }, + { + "loss": 14.2732, + "grad_norm": 2.088350296020508, + "learning_rate": 0.0005, + "epoch": 0.007394999411761411, + "step": 165 + }, + { + "loss": 14.25, + "grad_norm": 2.0546774864196777, + "learning_rate": 0.0005, + "epoch": 0.007619090303026908, + "step": 170 + }, + { + "loss": 14.3347, + "grad_norm": 1.9553978443145752, + "learning_rate": 0.0005, + "epoch": 0.007843181194292404, + "step": 175 + }, + { + "loss": 14.3019, + "grad_norm": 2.0136640071868896, + "learning_rate": 0.0005, + "epoch": 0.008067272085557903, + "step": 180 + }, + { + "loss": 14.3584, + "grad_norm": 2.148486614227295, + "learning_rate": 0.0005, + "epoch": 0.0082913629768234, + "step": 185 + }, + { + "loss": 14.2891, + "grad_norm": 2.0075933933258057, + "learning_rate": 0.0005, + "epoch": 0.008515453868088897, + "step": 190 + }, + { + "loss": 14.2923, + "grad_norm": 1.9557173252105713, + "learning_rate": 0.0005, + "epoch": 0.008739544759354394, + "step": 195 + }, + { + "loss": 14.386, + "grad_norm": 2.001337766647339, + "learning_rate": 0.0005, + "epoch": 0.00896363565061989, + "step": 200 + }, + { + "loss": 14.2192, + "grad_norm": 2.044628381729126, + "learning_rate": 0.0005, + "epoch": 0.00918772654188539, + "step": 205 + }, + { + "loss": 14.3734, + "grad_norm": 2.002190351486206, + "learning_rate": 0.0005, + "epoch": 0.009411817433150886, + "step": 210 + }, + { + "loss": 14.3697, + "grad_norm": 2.089207172393799, + "learning_rate": 0.0005, + "epoch": 0.009635908324416383, + "step": 215 + }, + { + "loss": 14.2986, + "grad_norm": 1.9658682346343994, + "learning_rate": 0.0005, + "epoch": 0.00985999921568188, + "step": 220 + }, + { + "loss": 14.3278, + "grad_norm": 2.009406566619873, + "learning_rate": 0.0005, + "epoch": 0.010084090106947378, + "step": 225 + }, + { + "loss": 14.347, + "grad_norm": 2.0346028804779053, + "learning_rate": 0.0005, + "epoch": 0.010308180998212875, + "step": 230 + }, + { + "loss": 14.3171, + "grad_norm": 2.057018756866455, + "learning_rate": 0.0005, + "epoch": 0.010532271889478372, + "step": 235 + }, + { + "loss": 14.3144, + "grad_norm": 2.078429937362671, + "learning_rate": 0.0005, + "epoch": 0.010756362780743869, + "step": 240 + }, + { + "loss": 14.2827, + "grad_norm": 2.3576812744140625, + "learning_rate": 0.0005, + "epoch": 0.010980453672009368, + "step": 245 + }, + { + "loss": 14.2424, + "grad_norm": 2.1042275428771973, + "learning_rate": 0.0005, + "epoch": 0.011204544563274865, + "step": 250 + }, + { + "loss": 14.2774, + "grad_norm": 2.023210048675537, + "learning_rate": 0.0005, + "epoch": 0.011428635454540361, + "step": 255 + }, + { + "loss": 14.2762, + "grad_norm": 1.9983354806900024, + "learning_rate": 0.0005, + "epoch": 0.011652726345805858, + "step": 260 + }, + { + "loss": 14.3338, + "grad_norm": 1.9574092626571655, + "learning_rate": 0.0005, + "epoch": 0.011876817237071357, + "step": 265 + }, + { + "loss": 14.2085, + "grad_norm": 1.9682737588882446, + "learning_rate": 0.0005, + "epoch": 0.012100908128336854, + "step": 270 + }, + { + "loss": 14.2391, + "grad_norm": 2.040682077407837, + "learning_rate": 0.0005, + "epoch": 0.01232499901960235, + "step": 275 + }, + { + "loss": 14.2422, + "grad_norm": 2.072660446166992, + "learning_rate": 0.0005, + "epoch": 0.012549089910867848, + "step": 280 + }, + { + "loss": 14.2232, + "grad_norm": 1.9866284132003784, + "learning_rate": 0.0005, + "epoch": 0.012773180802133344, + "step": 285 + }, + { + "loss": 14.3504, + "grad_norm": 2.0564022064208984, + "learning_rate": 0.0005, + "epoch": 0.012997271693398843, + "step": 290 + }, + { + "loss": 14.3313, + "grad_norm": 2.2206408977508545, + "learning_rate": 0.0005, + "epoch": 0.01322136258466434, + "step": 295 + }, + { + "loss": 14.2808, + "grad_norm": 2.1048293113708496, + "learning_rate": 0.0005, + "epoch": 0.013445453475929837, + "step": 300 + }, + { + "loss": 14.2852, + "grad_norm": 1.9940338134765625, + "learning_rate": 0.0005, + "epoch": 0.013669544367195334, + "step": 305 + }, + { + "loss": 14.2704, + "grad_norm": 2.0356545448303223, + "learning_rate": 0.0005, + "epoch": 0.013893635258460832, + "step": 310 + }, + { + "loss": 14.3152, + "grad_norm": 2.060826063156128, + "learning_rate": 0.0005, + "epoch": 0.01411772614972633, + "step": 315 + }, + { + "loss": 14.158, + "grad_norm": 2.0694780349731445, + "learning_rate": 0.0005, + "epoch": 0.014341817040991826, + "step": 320 + }, + { + "loss": 14.385, + "grad_norm": 2.0996909141540527, + "learning_rate": 0.0005, + "epoch": 0.014565907932257323, + "step": 325 + }, + { + "loss": 14.2717, + "grad_norm": 2.0667507648468018, + "learning_rate": 0.0005, + "epoch": 0.014789998823522821, + "step": 330 + }, + { + "loss": 14.2032, + "grad_norm": 2.118894577026367, + "learning_rate": 0.0005, + "epoch": 0.015014089714788318, + "step": 335 + }, + { + "loss": 14.3441, + "grad_norm": 2.37672758102417, + "learning_rate": 0.0005, + "epoch": 0.015238180606053815, + "step": 340 + }, + { + "loss": 14.2123, + "grad_norm": 2.0683722496032715, + "learning_rate": 0.0005, + "epoch": 0.015462271497319312, + "step": 345 + }, + { + "loss": 14.2399, + "grad_norm": 1.9291362762451172, + "learning_rate": 0.0005, + "epoch": 0.01568636238858481, + "step": 350 + }, + { + "loss": 14.2786, + "grad_norm": 2.073303461074829, + "learning_rate": 0.0005, + "epoch": 0.015910453279850308, + "step": 355 + }, + { + "loss": 14.1956, + "grad_norm": 2.015531539916992, + "learning_rate": 0.0005, + "epoch": 0.016134544171115806, + "step": 360 + }, + { + "loss": 14.2929, + "grad_norm": 1.978757381439209, + "learning_rate": 0.0005, + "epoch": 0.0163586350623813, + "step": 365 + }, + { + "loss": 14.3134, + "grad_norm": 2.015641927719116, + "learning_rate": 0.0005, + "epoch": 0.0165827259536468, + "step": 370 + }, + { + "loss": 14.316, + "grad_norm": 2.088158130645752, + "learning_rate": 0.0005, + "epoch": 0.016806816844912295, + "step": 375 + }, + { + "loss": 14.2379, + "grad_norm": 2.0318617820739746, + "learning_rate": 0.0005, + "epoch": 0.017030907736177794, + "step": 380 + }, + { + "loss": 14.3892, + "grad_norm": 1.9806574583053589, + "learning_rate": 0.0005, + "epoch": 0.017254998627443292, + "step": 385 + }, + { + "loss": 14.2584, + "grad_norm": 1.9797101020812988, + "learning_rate": 0.0005, + "epoch": 0.017479089518708787, + "step": 390 + }, + { + "loss": 14.227, + "grad_norm": 1.906246542930603, + "learning_rate": 0.0005, + "epoch": 0.017703180409974286, + "step": 395 + }, + { + "loss": 14.3051, + "grad_norm": 2.0552897453308105, + "learning_rate": 0.0005, + "epoch": 0.01792727130123978, + "step": 400 + }, + { + "loss": 14.2216, + "grad_norm": 2.062342405319214, + "learning_rate": 0.0005, + "epoch": 0.01815136219250528, + "step": 405 + }, + { + "loss": 14.2193, + "grad_norm": 1.9998198747634888, + "learning_rate": 0.0005, + "epoch": 0.01837545308377078, + "step": 410 + }, + { + "loss": 14.2426, + "grad_norm": 2.0867531299591064, + "learning_rate": 0.0005, + "epoch": 0.018599543975036274, + "step": 415 + }, + { + "loss": 14.3968, + "grad_norm": 2.1598596572875977, + "learning_rate": 0.0005, + "epoch": 0.018823634866301772, + "step": 420 + }, + { + "loss": 14.2109, + "grad_norm": 2.1043760776519775, + "learning_rate": 0.0005, + "epoch": 0.01904772575756727, + "step": 425 + }, + { + "loss": 14.2744, + "grad_norm": 2.138683557510376, + "learning_rate": 0.0005, + "epoch": 0.019271816648832766, + "step": 430 + }, + { + "loss": 14.1959, + "grad_norm": 2.0307869911193848, + "learning_rate": 0.0005, + "epoch": 0.019495907540098265, + "step": 435 + }, + { + "loss": 14.2097, + "grad_norm": 1.9247474670410156, + "learning_rate": 0.0005, + "epoch": 0.01971999843136376, + "step": 440 + }, + { + "loss": 14.2815, + "grad_norm": 2.0889439582824707, + "learning_rate": 0.0005, + "epoch": 0.01994408932262926, + "step": 445 + }, + { + "loss": 14.1621, + "grad_norm": 2.0505826473236084, + "learning_rate": 0.0005, + "epoch": 0.020168180213894757, + "step": 450 + }, + { + "loss": 14.3243, + "grad_norm": 2.0084340572357178, + "learning_rate": 0.0005, + "epoch": 0.020392271105160252, + "step": 455 + }, + { + "loss": 14.2527, + "grad_norm": 1.934202790260315, + "learning_rate": 0.0005, + "epoch": 0.02061636199642575, + "step": 460 + }, + { + "loss": 14.2158, + "grad_norm": 2.235403537750244, + "learning_rate": 0.0005, + "epoch": 0.02084045288769125, + "step": 465 + }, + { + "loss": 14.3807, + "grad_norm": 2.1436288356781006, + "learning_rate": 0.0005, + "epoch": 0.021064543778956744, + "step": 470 + }, + { + "loss": 14.2475, + "grad_norm": 2.0434954166412354, + "learning_rate": 0.0005, + "epoch": 0.021288634670222243, + "step": 475 + }, + { + "loss": 14.3062, + "grad_norm": 2.029393434524536, + "learning_rate": 0.0005, + "epoch": 0.021512725561487738, + "step": 480 + }, + { + "loss": 14.2349, + "grad_norm": 2.0223193168640137, + "learning_rate": 0.0005, + "epoch": 0.021736816452753237, + "step": 485 + }, + { + "loss": 14.2309, + "grad_norm": 1.913985013961792, + "learning_rate": 0.0005, + "epoch": 0.021960907344018735, + "step": 490 + }, + { + "loss": 14.3093, + "grad_norm": 2.0677883625030518, + "learning_rate": 0.0005, + "epoch": 0.02218499823528423, + "step": 495 + }, + { + "loss": 14.3713, + "grad_norm": 2.2938449382781982, + "learning_rate": 0.0005, + "epoch": 0.02240908912654973, + "step": 500 + }, + { + "eval_loss": 1.7718801498413086, + "eval_runtime": 18.8403, + "eval_samples_per_second": 869.624, + "eval_steps_per_second": 7.802, + "epoch": 0.02240908912654973, + "step": 500 + }, + { + "loss": 14.2151, + "grad_norm": 2.096050500869751, + "learning_rate": 0.0005, + "epoch": 0.022633180017815224, + "step": 505 + }, + { + "loss": 14.3219, + "grad_norm": 2.2422337532043457, + "learning_rate": 0.0005, + "epoch": 0.022857270909080723, + "step": 510 + }, + { + "loss": 14.2732, + "grad_norm": 2.0611579418182373, + "learning_rate": 0.0005, + "epoch": 0.02308136180034622, + "step": 515 + }, + { + "loss": 14.2907, + "grad_norm": 1.9564974308013916, + "learning_rate": 0.0005, + "epoch": 0.023305452691611717, + "step": 520 + }, + { + "loss": 14.2737, + "grad_norm": 2.0616042613983154, + "learning_rate": 0.0005, + "epoch": 0.023529543582877215, + "step": 525 + }, + { + "loss": 14.32, + "grad_norm": 2.1140406131744385, + "learning_rate": 0.0005, + "epoch": 0.023753634474142714, + "step": 530 + }, + { + "loss": 14.2913, + "grad_norm": 1.8935840129852295, + "learning_rate": 0.0005, + "epoch": 0.02397772536540821, + "step": 535 + }, + { + "loss": 14.3307, + "grad_norm": 2.028771162033081, + "learning_rate": 0.0005, + "epoch": 0.024201816256673708, + "step": 540 + }, + { + "loss": 14.2717, + "grad_norm": 1.976778507232666, + "learning_rate": 0.0005, + "epoch": 0.024425907147939203, + "step": 545 + }, + { + "loss": 14.2348, + "grad_norm": 2.082973003387451, + "learning_rate": 0.0005, + "epoch": 0.0246499980392047, + "step": 550 + }, + { + "loss": 14.2249, + "grad_norm": 2.036085367202759, + "learning_rate": 0.0005, + "epoch": 0.0248740889304702, + "step": 555 + }, + { + "loss": 14.2696, + "grad_norm": 1.8578675985336304, + "learning_rate": 0.0005, + "epoch": 0.025098179821735695, + "step": 560 + }, + { + "loss": 14.2171, + "grad_norm": 2.2652697563171387, + "learning_rate": 0.0005, + "epoch": 0.025322270713001194, + "step": 565 + }, + { + "loss": 14.353, + "grad_norm": 2.3761842250823975, + "learning_rate": 0.0005, + "epoch": 0.02554636160426669, + "step": 570 + }, + { + "loss": 14.3057, + "grad_norm": 1.9577360153198242, + "learning_rate": 0.0005, + "epoch": 0.025770452495532187, + "step": 575 + }, + { + "loss": 14.3683, + "grad_norm": 1.9947212934494019, + "learning_rate": 0.0005, + "epoch": 0.025994543386797686, + "step": 580 + }, + { + "loss": 14.291, + "grad_norm": 2.12111234664917, + "learning_rate": 0.0005, + "epoch": 0.02621863427806318, + "step": 585 + }, + { + "loss": 14.3058, + "grad_norm": 2.155707836151123, + "learning_rate": 0.0005, + "epoch": 0.02644272516932868, + "step": 590 + }, + { + "loss": 14.2909, + "grad_norm": 2.0623831748962402, + "learning_rate": 0.0005, + "epoch": 0.02666681606059418, + "step": 595 + }, + { + "loss": 14.3209, + "grad_norm": 2.009176254272461, + "learning_rate": 0.0005, + "epoch": 0.026890906951859674, + "step": 600 + }, + { + "loss": 14.2734, + "grad_norm": 2.1131885051727295, + "learning_rate": 0.0005, + "epoch": 0.027114997843125172, + "step": 605 + }, + { + "loss": 14.3535, + "grad_norm": 2.228571653366089, + "learning_rate": 0.0005, + "epoch": 0.027339088734390667, + "step": 610 + }, + { + "loss": 14.2595, + "grad_norm": 2.2658498287200928, + "learning_rate": 0.0005, + "epoch": 0.027563179625656166, + "step": 615 + }, + { + "loss": 14.2853, + "grad_norm": 2.1453394889831543, + "learning_rate": 0.0005, + "epoch": 0.027787270516921665, + "step": 620 + }, + { + "loss": 14.383, + "grad_norm": 1.982365369796753, + "learning_rate": 0.0005, + "epoch": 0.02801136140818716, + "step": 625 + }, + { + "loss": 14.3929, + "grad_norm": 2.0140199661254883, + "learning_rate": 0.0005, + "epoch": 0.02823545229945266, + "step": 630 + }, + { + "loss": 14.1989, + "grad_norm": 1.998089075088501, + "learning_rate": 0.0005, + "epoch": 0.028459543190718157, + "step": 635 + }, + { + "loss": 14.2745, + "grad_norm": 2.0165891647338867, + "learning_rate": 0.0005, + "epoch": 0.028683634081983652, + "step": 640 + }, + { + "loss": 14.2618, + "grad_norm": 2.0754010677337646, + "learning_rate": 0.0005, + "epoch": 0.02890772497324915, + "step": 645 + }, + { + "loss": 14.2327, + "grad_norm": 2.3930583000183105, + "learning_rate": 0.0005, + "epoch": 0.029131815864514646, + "step": 650 + }, + { + "loss": 14.2808, + "grad_norm": 2.123385190963745, + "learning_rate": 0.0005, + "epoch": 0.029355906755780144, + "step": 655 + }, + { + "loss": 14.1984, + "grad_norm": 1.9736683368682861, + "learning_rate": 0.0005, + "epoch": 0.029579997647045643, + "step": 660 + }, + { + "loss": 14.3508, + "grad_norm": 2.030437707901001, + "learning_rate": 0.0005, + "epoch": 0.029804088538311138, + "step": 665 + }, + { + "loss": 14.131, + "grad_norm": 1.8928433656692505, + "learning_rate": 0.0005, + "epoch": 0.030028179429576637, + "step": 670 + }, + { + "loss": 14.1608, + "grad_norm": 1.977250099182129, + "learning_rate": 0.0005, + "epoch": 0.030252270320842132, + "step": 675 + }, + { + "loss": 14.4085, + "grad_norm": 2.0198206901550293, + "learning_rate": 0.0005, + "epoch": 0.03047636121210763, + "step": 680 + }, + { + "loss": 14.2706, + "grad_norm": 2.0707778930664062, + "learning_rate": 0.0005, + "epoch": 0.03070045210337313, + "step": 685 + }, + { + "loss": 14.1478, + "grad_norm": 2.046593189239502, + "learning_rate": 0.0005, + "epoch": 0.030924542994638624, + "step": 690 + }, + { + "loss": 14.1779, + "grad_norm": 2.0935871601104736, + "learning_rate": 0.0005, + "epoch": 0.031148633885904123, + "step": 695 + }, + { + "loss": 14.2842, + "grad_norm": 2.1891965866088867, + "learning_rate": 0.0005, + "epoch": 0.03137272477716962, + "step": 700 + }, + { + "loss": 14.2074, + "grad_norm": 2.070681095123291, + "learning_rate": 0.0005, + "epoch": 0.03159681566843512, + "step": 705 + }, + { + "loss": 14.2629, + "grad_norm": 2.0730481147766113, + "learning_rate": 0.0005, + "epoch": 0.031820906559700615, + "step": 710 + }, + { + "loss": 14.252, + "grad_norm": 1.9831678867340088, + "learning_rate": 0.0005, + "epoch": 0.03204499745096611, + "step": 715 + }, + { + "loss": 14.3001, + "grad_norm": 1.9226957559585571, + "learning_rate": 0.0005, + "epoch": 0.03226908834223161, + "step": 720 + }, + { + "loss": 14.3861, + "grad_norm": 1.9222818613052368, + "learning_rate": 0.0005, + "epoch": 0.03249317923349711, + "step": 725 + }, + { + "loss": 14.1986, + "grad_norm": 2.1250648498535156, + "learning_rate": 0.0005, + "epoch": 0.0327172701247626, + "step": 730 + }, + { + "loss": 14.2355, + "grad_norm": 2.0338075160980225, + "learning_rate": 0.0005, + "epoch": 0.0329413610160281, + "step": 735 + }, + { + "loss": 14.2037, + "grad_norm": 1.9043233394622803, + "learning_rate": 0.0005, + "epoch": 0.0331654519072936, + "step": 740 + }, + { + "loss": 14.3108, + "grad_norm": 1.9609804153442383, + "learning_rate": 0.0005, + "epoch": 0.033389542798559095, + "step": 745 + }, + { + "loss": 14.3283, + "grad_norm": 1.9094796180725098, + "learning_rate": 0.0005, + "epoch": 0.03361363368982459, + "step": 750 + }, + { + "loss": 14.2745, + "grad_norm": 2.0199835300445557, + "learning_rate": 0.0005, + "epoch": 0.03383772458109009, + "step": 755 + }, + { + "loss": 14.248, + "grad_norm": 1.8590503931045532, + "learning_rate": 0.0005, + "epoch": 0.03406181547235559, + "step": 760 + }, + { + "loss": 14.2879, + "grad_norm": 1.8757070302963257, + "learning_rate": 0.0005, + "epoch": 0.03428590636362108, + "step": 765 + }, + { + "loss": 14.2926, + "grad_norm": 2.2693803310394287, + "learning_rate": 0.0005, + "epoch": 0.034509997254886585, + "step": 770 + }, + { + "loss": 14.4677, + "grad_norm": 2.126627206802368, + "learning_rate": 0.0005, + "epoch": 0.03473408814615208, + "step": 775 + }, + { + "loss": 14.3263, + "grad_norm": 2.0534279346466064, + "learning_rate": 0.0005, + "epoch": 0.034958179037417575, + "step": 780 + }, + { + "loss": 14.3332, + "grad_norm": 2.061572313308716, + "learning_rate": 0.0005, + "epoch": 0.03518226992868308, + "step": 785 + }, + { + "loss": 14.3802, + "grad_norm": 2.1430110931396484, + "learning_rate": 0.0005, + "epoch": 0.03540636081994857, + "step": 790 + }, + { + "loss": 14.1992, + "grad_norm": 2.114748477935791, + "learning_rate": 0.0005, + "epoch": 0.03563045171121407, + "step": 795 + }, + { + "loss": 14.239, + "grad_norm": 1.9971671104431152, + "learning_rate": 0.0005, + "epoch": 0.03585454260247956, + "step": 800 + }, + { + "loss": 14.2222, + "grad_norm": 1.95136559009552, + "learning_rate": 0.0005, + "epoch": 0.036078633493745065, + "step": 805 + }, + { + "loss": 14.1883, + "grad_norm": 2.1996357440948486, + "learning_rate": 0.0005, + "epoch": 0.03630272438501056, + "step": 810 + }, + { + "loss": 14.2557, + "grad_norm": 2.095583200454712, + "learning_rate": 0.0005, + "epoch": 0.036526815276276055, + "step": 815 + }, + { + "loss": 14.2175, + "grad_norm": 2.0955843925476074, + "learning_rate": 0.0005, + "epoch": 0.03675090616754156, + "step": 820 + }, + { + "loss": 14.2187, + "grad_norm": 1.9569734334945679, + "learning_rate": 0.0005, + "epoch": 0.03697499705880705, + "step": 825 + }, + { + "loss": 14.3528, + "grad_norm": 1.9772740602493286, + "learning_rate": 0.0005, + "epoch": 0.03719908795007255, + "step": 830 + }, + { + "loss": 14.2456, + "grad_norm": 2.1345417499542236, + "learning_rate": 0.0005, + "epoch": 0.03742317884133805, + "step": 835 + }, + { + "loss": 14.2124, + "grad_norm": 1.975181221961975, + "learning_rate": 0.0005, + "epoch": 0.037647269732603544, + "step": 840 + }, + { + "loss": 14.2323, + "grad_norm": 2.1009271144866943, + "learning_rate": 0.0005, + "epoch": 0.03787136062386904, + "step": 845 + }, + { + "loss": 14.251, + "grad_norm": 2.0016536712646484, + "learning_rate": 0.0005, + "epoch": 0.03809545151513454, + "step": 850 + }, + { + "loss": 14.2497, + "grad_norm": 1.9874367713928223, + "learning_rate": 0.0005, + "epoch": 0.03831954240640004, + "step": 855 + }, + { + "loss": 14.1931, + "grad_norm": 2.083117961883545, + "learning_rate": 0.0005, + "epoch": 0.03854363329766553, + "step": 860 + }, + { + "loss": 14.24, + "grad_norm": 2.0738794803619385, + "learning_rate": 0.0005, + "epoch": 0.03876772418893103, + "step": 865 + }, + { + "loss": 14.2042, + "grad_norm": 1.9692766666412354, + "learning_rate": 0.0005, + "epoch": 0.03899181508019653, + "step": 870 + }, + { + "loss": 14.3241, + "grad_norm": 2.271322727203369, + "learning_rate": 0.0005, + "epoch": 0.039215905971462024, + "step": 875 + }, + { + "loss": 14.2399, + "grad_norm": 2.0843918323516846, + "learning_rate": 0.0005, + "epoch": 0.03943999686272752, + "step": 880 + }, + { + "loss": 14.2721, + "grad_norm": 1.9291036128997803, + "learning_rate": 0.0005, + "epoch": 0.03966408775399302, + "step": 885 + }, + { + "loss": 14.2569, + "grad_norm": 2.0397393703460693, + "learning_rate": 0.0005, + "epoch": 0.03988817864525852, + "step": 890 + }, + { + "loss": 14.2288, + "grad_norm": 2.144932508468628, + "learning_rate": 0.0005, + "epoch": 0.04011226953652401, + "step": 895 + }, + { + "loss": 14.191, + "grad_norm": 1.9231230020523071, + "learning_rate": 0.0005, + "epoch": 0.040336360427789514, + "step": 900 + }, + { + "loss": 14.2027, + "grad_norm": 1.8474692106246948, + "learning_rate": 0.0005, + "epoch": 0.04056045131905501, + "step": 905 + }, + { + "loss": 14.2725, + "grad_norm": 2.227571725845337, + "learning_rate": 0.0005, + "epoch": 0.040784542210320504, + "step": 910 + }, + { + "loss": 14.291, + "grad_norm": 2.0123486518859863, + "learning_rate": 0.0005, + "epoch": 0.041008633101586006, + "step": 915 + }, + { + "loss": 14.3685, + "grad_norm": 2.12306809425354, + "learning_rate": 0.0005, + "epoch": 0.0412327239928515, + "step": 920 + }, + { + "loss": 14.3251, + "grad_norm": 1.951163411140442, + "learning_rate": 0.0005, + "epoch": 0.041456814884116996, + "step": 925 + }, + { + "loss": 14.3223, + "grad_norm": 2.004356861114502, + "learning_rate": 0.0005, + "epoch": 0.0416809057753825, + "step": 930 + }, + { + "loss": 14.2391, + "grad_norm": 2.049684524536133, + "learning_rate": 0.0005, + "epoch": 0.041904996666647994, + "step": 935 + }, + { + "loss": 14.3082, + "grad_norm": 2.043984889984131, + "learning_rate": 0.0005, + "epoch": 0.04212908755791349, + "step": 940 + }, + { + "loss": 14.2308, + "grad_norm": 2.0580008029937744, + "learning_rate": 0.0005, + "epoch": 0.042353178449178984, + "step": 945 + }, + { + "loss": 14.2479, + "grad_norm": 2.0241260528564453, + "learning_rate": 0.0005, + "epoch": 0.042577269340444486, + "step": 950 + }, + { + "loss": 14.2361, + "grad_norm": 2.0930376052856445, + "learning_rate": 0.0005, + "epoch": 0.04280136023170998, + "step": 955 + }, + { + "loss": 14.2378, + "grad_norm": 2.057032585144043, + "learning_rate": 0.0005, + "epoch": 0.043025451122975476, + "step": 960 + }, + { + "loss": 14.2969, + "grad_norm": 2.033273696899414, + "learning_rate": 0.0005, + "epoch": 0.04324954201424098, + "step": 965 + }, + { + "loss": 14.1837, + "grad_norm": 1.8709105253219604, + "learning_rate": 0.0005, + "epoch": 0.043473632905506474, + "step": 970 + }, + { + "loss": 14.2079, + "grad_norm": 2.157222270965576, + "learning_rate": 0.0005, + "epoch": 0.04369772379677197, + "step": 975 + }, + { + "loss": 14.4298, + "grad_norm": 2.202359437942505, + "learning_rate": 0.0005, + "epoch": 0.04392181468803747, + "step": 980 + }, + { + "loss": 14.1975, + "grad_norm": 2.245966672897339, + "learning_rate": 0.0005, + "epoch": 0.044145905579302966, + "step": 985 + }, + { + "loss": 14.2658, + "grad_norm": 2.1800920963287354, + "learning_rate": 0.0005, + "epoch": 0.04436999647056846, + "step": 990 + }, + { + "loss": 14.3301, + "grad_norm": 2.060558795928955, + "learning_rate": 0.0005, + "epoch": 0.04459408736183396, + "step": 995 + }, + { + "loss": 14.2039, + "grad_norm": 2.113269329071045, + "learning_rate": 0.0005, + "epoch": 0.04481817825309946, + "step": 1000 + }, + { + "eval_loss": 1.7727333307266235, + "eval_runtime": 18.7595, + "eval_samples_per_second": 873.37, + "eval_steps_per_second": 7.836, + "epoch": 0.04481817825309946, + "step": 1000 + }, + { + "loss": 14.2288, + "grad_norm": 2.1471664905548096, + "learning_rate": 0.0005, + "epoch": 0.04504226914436495, + "step": 1005 + }, + { + "loss": 14.3332, + "grad_norm": 2.2036454677581787, + "learning_rate": 0.0005, + "epoch": 0.04526636003563045, + "step": 1010 + }, + { + "loss": 14.1147, + "grad_norm": 2.000432252883911, + "learning_rate": 0.0005, + "epoch": 0.04549045092689595, + "step": 1015 + }, + { + "loss": 14.2888, + "grad_norm": 1.8297663927078247, + "learning_rate": 0.0005, + "epoch": 0.045714541818161446, + "step": 1020 + }, + { + "loss": 14.3685, + "grad_norm": 1.954213261604309, + "learning_rate": 0.0005, + "epoch": 0.04593863270942694, + "step": 1025 + }, + { + "loss": 14.2465, + "grad_norm": 2.0441508293151855, + "learning_rate": 0.0005, + "epoch": 0.04616272360069244, + "step": 1030 + }, + { + "loss": 14.1629, + "grad_norm": 1.9861886501312256, + "learning_rate": 0.0005, + "epoch": 0.04638681449195794, + "step": 1035 + }, + { + "loss": 14.3546, + "grad_norm": 1.9641690254211426, + "learning_rate": 0.0005, + "epoch": 0.04661090538322343, + "step": 1040 + }, + { + "loss": 14.3615, + "grad_norm": 1.914635419845581, + "learning_rate": 0.0005, + "epoch": 0.046834996274488935, + "step": 1045 + }, + { + "loss": 14.1544, + "grad_norm": 2.138408660888672, + "learning_rate": 0.0005, + "epoch": 0.04705908716575443, + "step": 1050 + }, + { + "loss": 14.1677, + "grad_norm": 2.0267415046691895, + "learning_rate": 0.0005, + "epoch": 0.047283178057019926, + "step": 1055 + }, + { + "loss": 14.2248, + "grad_norm": 2.190242290496826, + "learning_rate": 0.0005, + "epoch": 0.04750726894828543, + "step": 1060 + }, + { + "loss": 14.2402, + "grad_norm": 2.30466628074646, + "learning_rate": 0.0005, + "epoch": 0.04773135983955092, + "step": 1065 + }, + { + "loss": 14.1104, + "grad_norm": 2.035266876220703, + "learning_rate": 0.0005, + "epoch": 0.04795545073081642, + "step": 1070 + }, + { + "loss": 14.1833, + "grad_norm": 2.0810415744781494, + "learning_rate": 0.0005, + "epoch": 0.04817954162208191, + "step": 1075 + }, + { + "loss": 14.2342, + "grad_norm": 2.0724008083343506, + "learning_rate": 0.0005, + "epoch": 0.048403632513347415, + "step": 1080 + }, + { + "loss": 14.2719, + "grad_norm": 1.9949944019317627, + "learning_rate": 0.0005, + "epoch": 0.04862772340461291, + "step": 1085 + }, + { + "loss": 14.2835, + "grad_norm": 2.058635950088501, + "learning_rate": 0.0005, + "epoch": 0.048851814295878405, + "step": 1090 + }, + { + "loss": 14.1962, + "grad_norm": 1.8565870523452759, + "learning_rate": 0.0005, + "epoch": 0.04907590518714391, + "step": 1095 + }, + { + "loss": 14.2335, + "grad_norm": 1.9683058261871338, + "learning_rate": 0.0005, + "epoch": 0.0492999960784094, + "step": 1100 + }, + { + "loss": 14.2524, + "grad_norm": 1.9165276288986206, + "learning_rate": 0.0005, + "epoch": 0.0495240869696749, + "step": 1105 + }, + { + "loss": 14.1797, + "grad_norm": 1.9913203716278076, + "learning_rate": 0.0005, + "epoch": 0.0497481778609404, + "step": 1110 + }, + { + "loss": 14.1801, + "grad_norm": 2.0073564052581787, + "learning_rate": 0.0005, + "epoch": 0.049972268752205895, + "step": 1115 + }, + { + "loss": 14.2034, + "grad_norm": 2.0174570083618164, + "learning_rate": 0.0005, + "epoch": 0.05019635964347139, + "step": 1120 + }, + { + "loss": 14.2805, + "grad_norm": 2.1646080017089844, + "learning_rate": 0.0005, + "epoch": 0.05042045053473689, + "step": 1125 + }, + { + "loss": 14.3219, + "grad_norm": 1.9945608377456665, + "learning_rate": 0.0005, + "epoch": 0.05064454142600239, + "step": 1130 + }, + { + "loss": 14.3191, + "grad_norm": 2.1684176921844482, + "learning_rate": 0.0005, + "epoch": 0.05086863231726788, + "step": 1135 + }, + { + "loss": 14.1874, + "grad_norm": 2.009509325027466, + "learning_rate": 0.0005, + "epoch": 0.05109272320853338, + "step": 1140 + }, + { + "loss": 14.2761, + "grad_norm": 1.8036493062973022, + "learning_rate": 0.0005, + "epoch": 0.05131681409979888, + "step": 1145 + }, + { + "loss": 14.1475, + "grad_norm": 1.93241548538208, + "learning_rate": 0.0005, + "epoch": 0.051540904991064375, + "step": 1150 + }, + { + "loss": 14.1615, + "grad_norm": 1.9887784719467163, + "learning_rate": 0.0005, + "epoch": 0.05176499588232987, + "step": 1155 + }, + { + "loss": 14.1796, + "grad_norm": 2.110835552215576, + "learning_rate": 0.0005, + "epoch": 0.05198908677359537, + "step": 1160 + }, + { + "loss": 14.3305, + "grad_norm": 1.9870234727859497, + "learning_rate": 0.0005, + "epoch": 0.05221317766486087, + "step": 1165 + }, + { + "loss": 14.2401, + "grad_norm": 2.033839225769043, + "learning_rate": 0.0005, + "epoch": 0.05243726855612636, + "step": 1170 + }, + { + "loss": 14.2614, + "grad_norm": 2.0544512271881104, + "learning_rate": 0.0005, + "epoch": 0.052661359447391864, + "step": 1175 + }, + { + "loss": 14.2981, + "grad_norm": 2.0397698879241943, + "learning_rate": 0.0005, + "epoch": 0.05288545033865736, + "step": 1180 + }, + { + "loss": 14.2508, + "grad_norm": 2.0876481533050537, + "learning_rate": 0.0005, + "epoch": 0.053109541229922855, + "step": 1185 + }, + { + "loss": 14.2421, + "grad_norm": 2.0604329109191895, + "learning_rate": 0.0005, + "epoch": 0.05333363212118836, + "step": 1190 + }, + { + "loss": 14.2969, + "grad_norm": 2.262632131576538, + "learning_rate": 0.0005, + "epoch": 0.05355772301245385, + "step": 1195 + }, + { + "loss": 14.2506, + "grad_norm": 2.0581042766571045, + "learning_rate": 0.0005, + "epoch": 0.05378181390371935, + "step": 1200 + }, + { + "loss": 14.2416, + "grad_norm": 1.9745397567749023, + "learning_rate": 0.0005, + "epoch": 0.05400590479498485, + "step": 1205 + }, + { + "loss": 14.1699, + "grad_norm": 1.9858930110931396, + "learning_rate": 0.0005, + "epoch": 0.054229995686250344, + "step": 1210 + }, + { + "loss": 14.3006, + "grad_norm": 2.047473907470703, + "learning_rate": 0.0005, + "epoch": 0.05445408657751584, + "step": 1215 + }, + { + "loss": 14.3145, + "grad_norm": 2.091883420944214, + "learning_rate": 0.0005, + "epoch": 0.054678177468781335, + "step": 1220 + }, + { + "loss": 14.1586, + "grad_norm": 1.9428058862686157, + "learning_rate": 0.0005, + "epoch": 0.05490226836004684, + "step": 1225 + }, + { + "loss": 14.3759, + "grad_norm": 1.914755940437317, + "learning_rate": 0.0005, + "epoch": 0.05512635925131233, + "step": 1230 + }, + { + "loss": 14.3249, + "grad_norm": 1.89765465259552, + "learning_rate": 0.0005, + "epoch": 0.05535045014257783, + "step": 1235 + }, + { + "loss": 14.2483, + "grad_norm": 2.0791454315185547, + "learning_rate": 0.0005, + "epoch": 0.05557454103384333, + "step": 1240 + }, + { + "loss": 14.2269, + "grad_norm": 2.1523244380950928, + "learning_rate": 0.0005, + "epoch": 0.055798631925108824, + "step": 1245 + }, + { + "loss": 14.2138, + "grad_norm": 2.0883595943450928, + "learning_rate": 0.0005, + "epoch": 0.05602272281637432, + "step": 1250 + }, + { + "loss": 14.1858, + "grad_norm": 2.0705084800720215, + "learning_rate": 0.0005, + "epoch": 0.05624681370763982, + "step": 1255 + }, + { + "loss": 14.1589, + "grad_norm": 1.935402274131775, + "learning_rate": 0.0005, + "epoch": 0.05647090459890532, + "step": 1260 + }, + { + "loss": 14.1945, + "grad_norm": 2.0258562564849854, + "learning_rate": 0.0005, + "epoch": 0.05669499549017081, + "step": 1265 + }, + { + "loss": 14.2134, + "grad_norm": 2.161383628845215, + "learning_rate": 0.0005, + "epoch": 0.056919086381436314, + "step": 1270 + }, + { + "loss": 14.1014, + "grad_norm": 1.868166446685791, + "learning_rate": 0.0005, + "epoch": 0.05714317727270181, + "step": 1275 + }, + { + "loss": 14.1909, + "grad_norm": 2.053658962249756, + "learning_rate": 0.0005, + "epoch": 0.057367268163967304, + "step": 1280 + }, + { + "loss": 14.1878, + "grad_norm": 1.993074655532837, + "learning_rate": 0.0005, + "epoch": 0.0575913590552328, + "step": 1285 + }, + { + "loss": 14.1786, + "grad_norm": 1.9569101333618164, + "learning_rate": 0.0005, + "epoch": 0.0578154499464983, + "step": 1290 + }, + { + "loss": 14.1818, + "grad_norm": 1.9939252138137817, + "learning_rate": 0.0005, + "epoch": 0.058039540837763796, + "step": 1295 + }, + { + "loss": 14.2783, + "grad_norm": 1.876535177230835, + "learning_rate": 0.0005, + "epoch": 0.05826363172902929, + "step": 1300 + }, + { + "loss": 14.2104, + "grad_norm": 2.0808701515197754, + "learning_rate": 0.0005, + "epoch": 0.058487722620294794, + "step": 1305 + }, + { + "loss": 14.3005, + "grad_norm": 2.094132900238037, + "learning_rate": 0.0005, + "epoch": 0.05871181351156029, + "step": 1310 + }, + { + "loss": 14.2414, + "grad_norm": 2.143573522567749, + "learning_rate": 0.0005, + "epoch": 0.058935904402825784, + "step": 1315 + }, + { + "loss": 14.1906, + "grad_norm": 1.943663477897644, + "learning_rate": 0.0005, + "epoch": 0.059159995294091286, + "step": 1320 + }, + { + "loss": 14.233, + "grad_norm": 1.9655219316482544, + "learning_rate": 0.0005, + "epoch": 0.05938408618535678, + "step": 1325 + }, + { + "loss": 14.1973, + "grad_norm": 1.943224549293518, + "learning_rate": 0.0005, + "epoch": 0.059608177076622276, + "step": 1330 + }, + { + "loss": 14.2429, + "grad_norm": 2.0061824321746826, + "learning_rate": 0.0005, + "epoch": 0.05983226796788778, + "step": 1335 + }, + { + "loss": 14.2179, + "grad_norm": 1.8484687805175781, + "learning_rate": 0.0005, + "epoch": 0.060056358859153273, + "step": 1340 + }, + { + "loss": 14.2897, + "grad_norm": 2.1448802947998047, + "learning_rate": 0.0005, + "epoch": 0.06028044975041877, + "step": 1345 + }, + { + "loss": 14.3255, + "grad_norm": 2.0925004482269287, + "learning_rate": 0.0005, + "epoch": 0.060504540641684264, + "step": 1350 + }, + { + "loss": 14.2182, + "grad_norm": 2.072070598602295, + "learning_rate": 0.0005, + "epoch": 0.060728631532949766, + "step": 1355 + }, + { + "loss": 14.2734, + "grad_norm": 2.3992390632629395, + "learning_rate": 0.0005, + "epoch": 0.06095272242421526, + "step": 1360 + }, + { + "loss": 14.2596, + "grad_norm": 2.0187509059906006, + "learning_rate": 0.0005, + "epoch": 0.061176813315480756, + "step": 1365 + }, + { + "loss": 14.2198, + "grad_norm": 2.126812219619751, + "learning_rate": 0.0005, + "epoch": 0.06140090420674626, + "step": 1370 + }, + { + "loss": 14.3122, + "grad_norm": 1.9537678956985474, + "learning_rate": 0.0005, + "epoch": 0.06162499509801175, + "step": 1375 + }, + { + "loss": 14.1976, + "grad_norm": 1.9577600955963135, + "learning_rate": 0.0005, + "epoch": 0.06184908598927725, + "step": 1380 + }, + { + "loss": 14.1317, + "grad_norm": 1.8684802055358887, + "learning_rate": 0.0005, + "epoch": 0.06207317688054275, + "step": 1385 + }, + { + "loss": 14.2812, + "grad_norm": 2.074007511138916, + "learning_rate": 0.0005, + "epoch": 0.062297267771808246, + "step": 1390 + }, + { + "loss": 14.2356, + "grad_norm": 2.1334567070007324, + "learning_rate": 0.0005, + "epoch": 0.06252135866307375, + "step": 1395 + }, + { + "loss": 14.1932, + "grad_norm": 2.039788007736206, + "learning_rate": 0.0005, + "epoch": 0.06274544955433924, + "step": 1400 + }, + { + "loss": 14.2866, + "grad_norm": 1.9726324081420898, + "learning_rate": 0.0005, + "epoch": 0.06296954044560474, + "step": 1405 + }, + { + "loss": 14.2492, + "grad_norm": 2.065645217895508, + "learning_rate": 0.0005, + "epoch": 0.06319363133687024, + "step": 1410 + }, + { + "loss": 14.3002, + "grad_norm": 2.189345598220825, + "learning_rate": 0.0005, + "epoch": 0.06341772222813573, + "step": 1415 + }, + { + "loss": 14.1713, + "grad_norm": 1.8538919687271118, + "learning_rate": 0.0005, + "epoch": 0.06364181311940123, + "step": 1420 + }, + { + "loss": 14.2846, + "grad_norm": 2.042672634124756, + "learning_rate": 0.0005, + "epoch": 0.06386590401066673, + "step": 1425 + }, + { + "loss": 14.2022, + "grad_norm": 2.0098628997802734, + "learning_rate": 0.0005, + "epoch": 0.06408999490193222, + "step": 1430 + }, + { + "loss": 14.2969, + "grad_norm": 1.9617176055908203, + "learning_rate": 0.0005, + "epoch": 0.06431408579319772, + "step": 1435 + }, + { + "loss": 14.282, + "grad_norm": 2.022548198699951, + "learning_rate": 0.0005, + "epoch": 0.06453817668446322, + "step": 1440 + }, + { + "loss": 14.157, + "grad_norm": 1.9941786527633667, + "learning_rate": 0.0005, + "epoch": 0.06476226757572871, + "step": 1445 + }, + { + "loss": 14.297, + "grad_norm": 2.028557777404785, + "learning_rate": 0.0005, + "epoch": 0.06498635846699422, + "step": 1450 + }, + { + "loss": 14.1691, + "grad_norm": 1.9549771547317505, + "learning_rate": 0.0005, + "epoch": 0.0652104493582597, + "step": 1455 + }, + { + "loss": 14.3092, + "grad_norm": 2.1454029083251953, + "learning_rate": 0.0005, + "epoch": 0.0654345402495252, + "step": 1460 + }, + { + "loss": 14.2483, + "grad_norm": 2.0175013542175293, + "learning_rate": 0.0005, + "epoch": 0.06565863114079071, + "step": 1465 + }, + { + "loss": 14.3039, + "grad_norm": 1.8648537397384644, + "learning_rate": 0.0005, + "epoch": 0.0658827220320562, + "step": 1470 + }, + { + "loss": 14.2233, + "grad_norm": 2.2553863525390625, + "learning_rate": 0.0005, + "epoch": 0.0661068129233217, + "step": 1475 + }, + { + "loss": 14.2414, + "grad_norm": 2.15773606300354, + "learning_rate": 0.0005, + "epoch": 0.0663309038145872, + "step": 1480 + }, + { + "loss": 14.1754, + "grad_norm": 1.9377132654190063, + "learning_rate": 0.0005, + "epoch": 0.06655499470585269, + "step": 1485 + }, + { + "loss": 14.2484, + "grad_norm": 2.170342445373535, + "learning_rate": 0.0005, + "epoch": 0.06677908559711819, + "step": 1490 + }, + { + "loss": 14.2683, + "grad_norm": 1.8721458911895752, + "learning_rate": 0.0005, + "epoch": 0.06700317648838369, + "step": 1495 + }, + { + "loss": 14.1945, + "grad_norm": 1.9463225603103638, + "learning_rate": 0.0005, + "epoch": 0.06722726737964918, + "step": 1500 + }, + { + "eval_loss": 1.7756863832473755, + "eval_runtime": 18.6591, + "eval_samples_per_second": 878.068, + "eval_steps_per_second": 7.878, + "epoch": 0.06722726737964918, + "step": 1500 + }, + { + "loss": 14.1917, + "grad_norm": 1.997578501701355, + "learning_rate": 0.0005, + "epoch": 0.06745135827091468, + "step": 1505 + }, + { + "loss": 14.3386, + "grad_norm": 2.0206363201141357, + "learning_rate": 0.0005, + "epoch": 0.06767544916218018, + "step": 1510 + }, + { + "loss": 14.2589, + "grad_norm": 2.2400431632995605, + "learning_rate": 0.0005, + "epoch": 0.06789954005344567, + "step": 1515 + }, + { + "loss": 14.2423, + "grad_norm": 2.181386709213257, + "learning_rate": 0.0005, + "epoch": 0.06812363094471117, + "step": 1520 + }, + { + "loss": 14.1931, + "grad_norm": 1.9889214038848877, + "learning_rate": 0.0005, + "epoch": 0.06834772183597668, + "step": 1525 + }, + { + "loss": 14.3355, + "grad_norm": 2.175074577331543, + "learning_rate": 0.0005, + "epoch": 0.06857181272724217, + "step": 1530 + }, + { + "loss": 14.1702, + "grad_norm": 2.0602669715881348, + "learning_rate": 0.0005, + "epoch": 0.06879590361850767, + "step": 1535 + }, + { + "loss": 14.2198, + "grad_norm": 2.050475597381592, + "learning_rate": 0.0005, + "epoch": 0.06901999450977317, + "step": 1540 + }, + { + "loss": 14.2265, + "grad_norm": 2.1217849254608154, + "learning_rate": 0.0005, + "epoch": 0.06924408540103866, + "step": 1545 + }, + { + "loss": 14.212, + "grad_norm": 2.244414806365967, + "learning_rate": 0.0005, + "epoch": 0.06946817629230416, + "step": 1550 + }, + { + "loss": 14.2356, + "grad_norm": 2.1231167316436768, + "learning_rate": 0.0005, + "epoch": 0.06969226718356966, + "step": 1555 + }, + { + "loss": 14.2127, + "grad_norm": 1.9955865144729614, + "learning_rate": 0.0005, + "epoch": 0.06991635807483515, + "step": 1560 + }, + { + "loss": 14.2554, + "grad_norm": 2.26926851272583, + "learning_rate": 0.0005, + "epoch": 0.07014044896610065, + "step": 1565 + }, + { + "loss": 14.087, + "grad_norm": 1.944664478302002, + "learning_rate": 0.0005, + "epoch": 0.07036453985736615, + "step": 1570 + }, + { + "loss": 14.1392, + "grad_norm": 2.1763548851013184, + "learning_rate": 0.0005, + "epoch": 0.07058863074863164, + "step": 1575 + }, + { + "loss": 14.1809, + "grad_norm": 2.0459864139556885, + "learning_rate": 0.0005, + "epoch": 0.07081272163989714, + "step": 1580 + }, + { + "loss": 14.1929, + "grad_norm": 1.9332237243652344, + "learning_rate": 0.0005, + "epoch": 0.07103681253116265, + "step": 1585 + }, + { + "loss": 14.1885, + "grad_norm": 1.9467579126358032, + "learning_rate": 0.0005, + "epoch": 0.07126090342242813, + "step": 1590 + }, + { + "loss": 14.1799, + "grad_norm": 1.8630837202072144, + "learning_rate": 0.0005, + "epoch": 0.07148499431369364, + "step": 1595 + }, + { + "loss": 14.1846, + "grad_norm": 2.0461738109588623, + "learning_rate": 0.0005, + "epoch": 0.07170908520495912, + "step": 1600 + }, + { + "loss": 14.3046, + "grad_norm": 1.9873324632644653, + "learning_rate": 0.0005, + "epoch": 0.07193317609622463, + "step": 1605 + }, + { + "loss": 14.1908, + "grad_norm": 2.094851493835449, + "learning_rate": 0.0005, + "epoch": 0.07215726698749013, + "step": 1610 + }, + { + "loss": 14.269, + "grad_norm": 2.1090633869171143, + "learning_rate": 0.0005, + "epoch": 0.07238135787875562, + "step": 1615 + }, + { + "loss": 14.2144, + "grad_norm": 2.027017831802368, + "learning_rate": 0.0005, + "epoch": 0.07260544877002112, + "step": 1620 + }, + { + "loss": 14.287, + "grad_norm": 2.1863245964050293, + "learning_rate": 0.0005, + "epoch": 0.07282953966128662, + "step": 1625 + }, + { + "loss": 14.264, + "grad_norm": 2.1034693717956543, + "learning_rate": 0.0005, + "epoch": 0.07305363055255211, + "step": 1630 + }, + { + "loss": 14.1631, + "grad_norm": 1.9393339157104492, + "learning_rate": 0.0005, + "epoch": 0.07327772144381761, + "step": 1635 + }, + { + "loss": 14.2625, + "grad_norm": 1.9777086973190308, + "learning_rate": 0.0005, + "epoch": 0.07350181233508311, + "step": 1640 + }, + { + "loss": 14.201, + "grad_norm": 2.212585687637329, + "learning_rate": 0.0005, + "epoch": 0.0737259032263486, + "step": 1645 + }, + { + "loss": 14.1729, + "grad_norm": 2.052217483520508, + "learning_rate": 0.0005, + "epoch": 0.0739499941176141, + "step": 1650 + }, + { + "loss": 14.1748, + "grad_norm": 2.1643266677856445, + "learning_rate": 0.0005, + "epoch": 0.0741740850088796, + "step": 1655 + }, + { + "loss": 14.2851, + "grad_norm": 2.2242891788482666, + "learning_rate": 0.0005, + "epoch": 0.0743981759001451, + "step": 1660 + }, + { + "loss": 14.2938, + "grad_norm": 2.176030158996582, + "learning_rate": 0.0005, + "epoch": 0.0746222667914106, + "step": 1665 + }, + { + "loss": 14.4237, + "grad_norm": 2.0001564025878906, + "learning_rate": 0.0005, + "epoch": 0.0748463576826761, + "step": 1670 + }, + { + "loss": 14.2654, + "grad_norm": 2.1057164669036865, + "learning_rate": 0.0005, + "epoch": 0.07507044857394159, + "step": 1675 + }, + { + "loss": 14.2158, + "grad_norm": 2.168222427368164, + "learning_rate": 0.0005, + "epoch": 0.07529453946520709, + "step": 1680 + }, + { + "loss": 14.3471, + "grad_norm": 2.0431439876556396, + "learning_rate": 0.0005, + "epoch": 0.07551863035647259, + "step": 1685 + }, + { + "loss": 14.2348, + "grad_norm": 2.1417438983917236, + "learning_rate": 0.0005, + "epoch": 0.07574272124773808, + "step": 1690 + }, + { + "loss": 14.3011, + "grad_norm": 1.8830145597457886, + "learning_rate": 0.0005, + "epoch": 0.07596681213900358, + "step": 1695 + }, + { + "loss": 14.1981, + "grad_norm": 2.20562744140625, + "learning_rate": 0.0005, + "epoch": 0.07619090303026908, + "step": 1700 + }, + { + "loss": 14.1587, + "grad_norm": 2.0206198692321777, + "learning_rate": 0.0005, + "epoch": 0.07641499392153457, + "step": 1705 + }, + { + "loss": 14.2649, + "grad_norm": 1.962499976158142, + "learning_rate": 0.0005, + "epoch": 0.07663908481280007, + "step": 1710 + }, + { + "loss": 14.2564, + "grad_norm": 2.036729574203491, + "learning_rate": 0.0005, + "epoch": 0.07686317570406558, + "step": 1715 + }, + { + "loss": 14.1581, + "grad_norm": 2.094439744949341, + "learning_rate": 0.0005, + "epoch": 0.07708726659533106, + "step": 1720 + }, + { + "loss": 14.1704, + "grad_norm": 2.143383502960205, + "learning_rate": 0.0005, + "epoch": 0.07731135748659657, + "step": 1725 + }, + { + "loss": 14.2263, + "grad_norm": 1.9057375192642212, + "learning_rate": 0.0005, + "epoch": 0.07753544837786205, + "step": 1730 + }, + { + "loss": 14.3187, + "grad_norm": 1.9055688381195068, + "learning_rate": 0.0005, + "epoch": 0.07775953926912756, + "step": 1735 + }, + { + "loss": 14.3005, + "grad_norm": 2.220723867416382, + "learning_rate": 0.0005, + "epoch": 0.07798363016039306, + "step": 1740 + }, + { + "loss": 14.2552, + "grad_norm": 2.2899348735809326, + "learning_rate": 0.0005, + "epoch": 0.07820772105165855, + "step": 1745 + }, + { + "loss": 14.1751, + "grad_norm": 1.9819159507751465, + "learning_rate": 0.0005, + "epoch": 0.07843181194292405, + "step": 1750 + }, + { + "loss": 14.1918, + "grad_norm": 1.9067010879516602, + "learning_rate": 0.0005, + "epoch": 0.07865590283418955, + "step": 1755 + }, + { + "loss": 14.2655, + "grad_norm": 1.8417835235595703, + "learning_rate": 0.0005, + "epoch": 0.07887999372545504, + "step": 1760 + }, + { + "loss": 14.2191, + "grad_norm": 2.03226637840271, + "learning_rate": 0.0005, + "epoch": 0.07910408461672054, + "step": 1765 + }, + { + "loss": 14.2526, + "grad_norm": 1.9686570167541504, + "learning_rate": 0.0005, + "epoch": 0.07932817550798604, + "step": 1770 + }, + { + "loss": 14.2191, + "grad_norm": 1.992022156715393, + "learning_rate": 0.0005, + "epoch": 0.07955226639925153, + "step": 1775 + }, + { + "loss": 14.222, + "grad_norm": 1.9003782272338867, + "learning_rate": 0.0005, + "epoch": 0.07977635729051703, + "step": 1780 + }, + { + "loss": 14.2151, + "grad_norm": 2.0168862342834473, + "learning_rate": 0.0005, + "epoch": 0.08000044818178254, + "step": 1785 + }, + { + "loss": 14.2644, + "grad_norm": 2.1465072631835938, + "learning_rate": 0.0005, + "epoch": 0.08022453907304802, + "step": 1790 + }, + { + "loss": 14.2751, + "grad_norm": 1.987831473350525, + "learning_rate": 0.0005, + "epoch": 0.08044862996431353, + "step": 1795 + }, + { + "loss": 14.3681, + "grad_norm": 2.0240437984466553, + "learning_rate": 0.0005, + "epoch": 0.08067272085557903, + "step": 1800 + }, + { + "loss": 14.3111, + "grad_norm": 2.202488422393799, + "learning_rate": 0.0005, + "epoch": 0.08089681174684452, + "step": 1805 + }, + { + "loss": 14.1726, + "grad_norm": 2.3818199634552, + "learning_rate": 0.0005, + "epoch": 0.08112090263811002, + "step": 1810 + }, + { + "loss": 14.2812, + "grad_norm": 2.1590795516967773, + "learning_rate": 0.0005, + "epoch": 0.08134499352937552, + "step": 1815 + }, + { + "loss": 14.1505, + "grad_norm": 2.025921106338501, + "learning_rate": 0.0005, + "epoch": 0.08156908442064101, + "step": 1820 + }, + { + "loss": 14.2326, + "grad_norm": 1.9268773794174194, + "learning_rate": 0.0005, + "epoch": 0.08179317531190651, + "step": 1825 + }, + { + "loss": 14.2276, + "grad_norm": 1.9694938659667969, + "learning_rate": 0.0005, + "epoch": 0.08201726620317201, + "step": 1830 + }, + { + "loss": 14.244, + "grad_norm": 2.059058904647827, + "learning_rate": 0.0005, + "epoch": 0.0822413570944375, + "step": 1835 + }, + { + "loss": 14.264, + "grad_norm": 1.882689118385315, + "learning_rate": 0.0005, + "epoch": 0.082465447985703, + "step": 1840 + }, + { + "loss": 14.2633, + "grad_norm": 2.167781114578247, + "learning_rate": 0.0005, + "epoch": 0.0826895388769685, + "step": 1845 + }, + { + "loss": 14.2088, + "grad_norm": 2.1264965534210205, + "learning_rate": 0.0005, + "epoch": 0.08291362976823399, + "step": 1850 + }, + { + "loss": 14.1594, + "grad_norm": 2.0307443141937256, + "learning_rate": 0.0005, + "epoch": 0.0831377206594995, + "step": 1855 + }, + { + "loss": 14.3297, + "grad_norm": 2.053874969482422, + "learning_rate": 0.0005, + "epoch": 0.083361811550765, + "step": 1860 + }, + { + "loss": 14.1621, + "grad_norm": 1.9825525283813477, + "learning_rate": 0.0005, + "epoch": 0.08358590244203049, + "step": 1865 + }, + { + "loss": 14.18, + "grad_norm": 1.9070043563842773, + "learning_rate": 0.0005, + "epoch": 0.08380999333329599, + "step": 1870 + }, + { + "loss": 14.2022, + "grad_norm": 1.9118187427520752, + "learning_rate": 0.0005, + "epoch": 0.08403408422456148, + "step": 1875 + }, + { + "loss": 14.3087, + "grad_norm": 2.05552339553833, + "learning_rate": 0.0005, + "epoch": 0.08425817511582698, + "step": 1880 + }, + { + "loss": 14.1496, + "grad_norm": 2.0132551193237305, + "learning_rate": 0.0005, + "epoch": 0.08448226600709248, + "step": 1885 + }, + { + "loss": 14.1777, + "grad_norm": 1.9496753215789795, + "learning_rate": 0.0005, + "epoch": 0.08470635689835797, + "step": 1890 + }, + { + "loss": 14.2595, + "grad_norm": 1.9793522357940674, + "learning_rate": 0.0005, + "epoch": 0.08493044778962347, + "step": 1895 + }, + { + "loss": 14.1772, + "grad_norm": 2.1838932037353516, + "learning_rate": 0.0005, + "epoch": 0.08515453868088897, + "step": 1900 + }, + { + "loss": 14.2392, + "grad_norm": 2.0018186569213867, + "learning_rate": 0.0005, + "epoch": 0.08537862957215446, + "step": 1905 + }, + { + "loss": 14.2405, + "grad_norm": 1.9946718215942383, + "learning_rate": 0.0005, + "epoch": 0.08560272046341996, + "step": 1910 + }, + { + "loss": 14.1868, + "grad_norm": 1.9519858360290527, + "learning_rate": 0.0005, + "epoch": 0.08582681135468546, + "step": 1915 + }, + { + "loss": 14.2879, + "grad_norm": 2.1353161334991455, + "learning_rate": 0.0005, + "epoch": 0.08605090224595095, + "step": 1920 + }, + { + "loss": 14.2344, + "grad_norm": 1.9332703351974487, + "learning_rate": 0.0005, + "epoch": 0.08627499313721645, + "step": 1925 + }, + { + "loss": 14.1706, + "grad_norm": 1.8814945220947266, + "learning_rate": 0.0005, + "epoch": 0.08649908402848196, + "step": 1930 + }, + { + "loss": 14.3248, + "grad_norm": 2.1232025623321533, + "learning_rate": 0.0005, + "epoch": 0.08672317491974744, + "step": 1935 + }, + { + "loss": 14.2242, + "grad_norm": 2.008812665939331, + "learning_rate": 0.0005, + "epoch": 0.08694726581101295, + "step": 1940 + }, + { + "loss": 14.3041, + "grad_norm": 2.014477014541626, + "learning_rate": 0.0005, + "epoch": 0.08717135670227845, + "step": 1945 + }, + { + "loss": 14.2186, + "grad_norm": 1.984898328781128, + "learning_rate": 0.0005, + "epoch": 0.08739544759354394, + "step": 1950 + }, + { + "loss": 14.1764, + "grad_norm": 1.9279865026474, + "learning_rate": 0.0005, + "epoch": 0.08761953848480944, + "step": 1955 + }, + { + "loss": 14.2195, + "grad_norm": 1.8694874048233032, + "learning_rate": 0.0005, + "epoch": 0.08784362937607494, + "step": 1960 + }, + { + "loss": 14.2286, + "grad_norm": 1.8479489088058472, + "learning_rate": 0.0005, + "epoch": 0.08806772026734043, + "step": 1965 + }, + { + "loss": 14.3218, + "grad_norm": 1.998494267463684, + "learning_rate": 0.0005, + "epoch": 0.08829181115860593, + "step": 1970 + }, + { + "loss": 14.1551, + "grad_norm": 1.9892079830169678, + "learning_rate": 0.0005, + "epoch": 0.08851590204987143, + "step": 1975 + }, + { + "loss": 14.2086, + "grad_norm": 1.9579588174819946, + "learning_rate": 0.0005, + "epoch": 0.08873999294113692, + "step": 1980 + }, + { + "loss": 14.2593, + "grad_norm": 2.2442996501922607, + "learning_rate": 0.0005, + "epoch": 0.08896408383240242, + "step": 1985 + }, + { + "loss": 14.2019, + "grad_norm": 2.1113250255584717, + "learning_rate": 0.0005, + "epoch": 0.08918817472366793, + "step": 1990 + }, + { + "loss": 14.3084, + "grad_norm": 1.873713731765747, + "learning_rate": 0.0005, + "epoch": 0.08941226561493341, + "step": 1995 + }, + { + "loss": 14.2092, + "grad_norm": 1.936440110206604, + "learning_rate": 0.0005, + "epoch": 0.08963635650619892, + "step": 2000 + }, + { + "eval_loss": 1.7694858312606812, + "eval_runtime": 18.5104, + "eval_samples_per_second": 885.125, + "eval_steps_per_second": 7.941, + "epoch": 0.08963635650619892, + "step": 2000 + }, + { + "loss": 14.2115, + "grad_norm": 2.13944673538208, + "learning_rate": 0.0005, + "epoch": 0.0898604473974644, + "step": 2005 + }, + { + "loss": 14.2865, + "grad_norm": 2.0662755966186523, + "learning_rate": 0.0005, + "epoch": 0.0900845382887299, + "step": 2010 + }, + { + "loss": 14.2349, + "grad_norm": 2.0008041858673096, + "learning_rate": 0.0005, + "epoch": 0.09030862917999541, + "step": 2015 + }, + { + "loss": 14.315, + "grad_norm": 2.110163688659668, + "learning_rate": 0.0005, + "epoch": 0.0905327200712609, + "step": 2020 + }, + { + "loss": 14.249, + "grad_norm": 2.0696797370910645, + "learning_rate": 0.0005, + "epoch": 0.0907568109625264, + "step": 2025 + }, + { + "loss": 14.2679, + "grad_norm": 2.0506837368011475, + "learning_rate": 0.0005, + "epoch": 0.0909809018537919, + "step": 2030 + }, + { + "loss": 14.2995, + "grad_norm": 1.9853943586349487, + "learning_rate": 0.0005, + "epoch": 0.09120499274505739, + "step": 2035 + }, + { + "loss": 14.1841, + "grad_norm": 1.9887142181396484, + "learning_rate": 0.0005, + "epoch": 0.09142908363632289, + "step": 2040 + }, + { + "loss": 14.1799, + "grad_norm": 1.916782259941101, + "learning_rate": 0.0005, + "epoch": 0.0916531745275884, + "step": 2045 + }, + { + "loss": 14.2815, + "grad_norm": 1.9294121265411377, + "learning_rate": 0.0005, + "epoch": 0.09187726541885388, + "step": 2050 + }, + { + "loss": 14.3043, + "grad_norm": 2.086944580078125, + "learning_rate": 0.0005, + "epoch": 0.09210135631011938, + "step": 2055 + }, + { + "loss": 14.0751, + "grad_norm": 2.0607666969299316, + "learning_rate": 0.0005, + "epoch": 0.09232544720138489, + "step": 2060 + }, + { + "loss": 14.0893, + "grad_norm": 1.8890804052352905, + "learning_rate": 0.0005, + "epoch": 0.09254953809265037, + "step": 2065 + }, + { + "loss": 14.2719, + "grad_norm": 2.1385536193847656, + "learning_rate": 0.0005, + "epoch": 0.09277362898391588, + "step": 2070 + }, + { + "loss": 14.1656, + "grad_norm": 2.054858446121216, + "learning_rate": 0.0005, + "epoch": 0.09299771987518138, + "step": 2075 + }, + { + "loss": 14.1492, + "grad_norm": 2.176318645477295, + "learning_rate": 0.0005, + "epoch": 0.09322181076644687, + "step": 2080 + }, + { + "loss": 14.1967, + "grad_norm": 2.424211263656616, + "learning_rate": 0.0005, + "epoch": 0.09344590165771237, + "step": 2085 + }, + { + "loss": 14.2942, + "grad_norm": 2.1716487407684326, + "learning_rate": 0.0005, + "epoch": 0.09366999254897787, + "step": 2090 + }, + { + "loss": 14.2212, + "grad_norm": 1.8533152341842651, + "learning_rate": 0.0005, + "epoch": 0.09389408344024336, + "step": 2095 + }, + { + "loss": 14.2678, + "grad_norm": 1.9658597707748413, + "learning_rate": 0.0005, + "epoch": 0.09411817433150886, + "step": 2100 + }, + { + "loss": 14.3178, + "grad_norm": 1.9647737741470337, + "learning_rate": 0.0005, + "epoch": 0.09434226522277436, + "step": 2105 + }, + { + "loss": 14.307, + "grad_norm": 1.9707311391830444, + "learning_rate": 0.0005, + "epoch": 0.09456635611403985, + "step": 2110 + }, + { + "loss": 14.2303, + "grad_norm": 1.9236079454421997, + "learning_rate": 0.0005, + "epoch": 0.09479044700530535, + "step": 2115 + }, + { + "loss": 14.1953, + "grad_norm": 1.9698580503463745, + "learning_rate": 0.0005, + "epoch": 0.09501453789657086, + "step": 2120 + }, + { + "loss": 14.2416, + "grad_norm": 2.0320985317230225, + "learning_rate": 0.0005, + "epoch": 0.09523862878783634, + "step": 2125 + }, + { + "loss": 14.3115, + "grad_norm": 2.0945701599121094, + "learning_rate": 0.0005, + "epoch": 0.09546271967910185, + "step": 2130 + }, + { + "loss": 14.2766, + "grad_norm": 1.9204962253570557, + "learning_rate": 0.0005, + "epoch": 0.09568681057036735, + "step": 2135 + }, + { + "loss": 14.1301, + "grad_norm": 2.120319366455078, + "learning_rate": 0.0005, + "epoch": 0.09591090146163284, + "step": 2140 + }, + { + "loss": 14.1893, + "grad_norm": 2.18697452545166, + "learning_rate": 0.0005, + "epoch": 0.09613499235289834, + "step": 2145 + }, + { + "loss": 14.2259, + "grad_norm": 2.278101682662964, + "learning_rate": 0.0005, + "epoch": 0.09635908324416383, + "step": 2150 + }, + { + "loss": 14.2309, + "grad_norm": 2.2018258571624756, + "learning_rate": 0.0005, + "epoch": 0.09658317413542933, + "step": 2155 + }, + { + "loss": 14.2609, + "grad_norm": 2.056438684463501, + "learning_rate": 0.0005, + "epoch": 0.09680726502669483, + "step": 2160 + }, + { + "loss": 14.1959, + "grad_norm": 2.1732802391052246, + "learning_rate": 0.0005, + "epoch": 0.09703135591796032, + "step": 2165 + }, + { + "loss": 14.2884, + "grad_norm": 2.277907133102417, + "learning_rate": 0.0005, + "epoch": 0.09725544680922582, + "step": 2170 + }, + { + "loss": 14.1614, + "grad_norm": 2.2980709075927734, + "learning_rate": 0.0005, + "epoch": 0.09747953770049132, + "step": 2175 + }, + { + "loss": 14.1949, + "grad_norm": 2.000433921813965, + "learning_rate": 0.0005, + "epoch": 0.09770362859175681, + "step": 2180 + }, + { + "loss": 14.1517, + "grad_norm": 2.084157705307007, + "learning_rate": 0.0005, + "epoch": 0.09792771948302231, + "step": 2185 + }, + { + "loss": 14.0785, + "grad_norm": 2.104269504547119, + "learning_rate": 0.0005, + "epoch": 0.09815181037428782, + "step": 2190 + }, + { + "loss": 14.24, + "grad_norm": 2.062222719192505, + "learning_rate": 0.0005, + "epoch": 0.0983759012655533, + "step": 2195 + }, + { + "loss": 14.2399, + "grad_norm": 1.9244170188903809, + "learning_rate": 0.0005, + "epoch": 0.0985999921568188, + "step": 2200 + }, + { + "loss": 14.2444, + "grad_norm": 2.064002752304077, + "learning_rate": 0.0005, + "epoch": 0.09882408304808431, + "step": 2205 + }, + { + "loss": 14.2101, + "grad_norm": 1.9590471982955933, + "learning_rate": 0.0005, + "epoch": 0.0990481739393498, + "step": 2210 + }, + { + "loss": 14.3417, + "grad_norm": 2.0932776927948, + "learning_rate": 0.0005, + "epoch": 0.0992722648306153, + "step": 2215 + }, + { + "loss": 14.1232, + "grad_norm": 2.11871600151062, + "learning_rate": 0.0005, + "epoch": 0.0994963557218808, + "step": 2220 + }, + { + "loss": 14.2301, + "grad_norm": 2.137220859527588, + "learning_rate": 0.0005, + "epoch": 0.09972044661314629, + "step": 2225 + }, + { + "loss": 14.204, + "grad_norm": 2.1775405406951904, + "learning_rate": 0.0005, + "epoch": 0.09994453750441179, + "step": 2230 + }, + { + "loss": 14.1153, + "grad_norm": 1.8135520219802856, + "learning_rate": 0.0005, + "epoch": 0.10016862839567729, + "step": 2235 + }, + { + "loss": 14.0921, + "grad_norm": 1.9595603942871094, + "learning_rate": 0.0005, + "epoch": 0.10039271928694278, + "step": 2240 + }, + { + "loss": 14.1785, + "grad_norm": 2.0768790245056152, + "learning_rate": 0.0005, + "epoch": 0.10061681017820828, + "step": 2245 + }, + { + "loss": 14.3008, + "grad_norm": 1.988590955734253, + "learning_rate": 0.0005, + "epoch": 0.10084090106947378, + "step": 2250 + }, + { + "loss": 14.2555, + "grad_norm": 2.1865451335906982, + "learning_rate": 0.0005, + "epoch": 0.10106499196073927, + "step": 2255 + }, + { + "loss": 14.1862, + "grad_norm": 2.0482969284057617, + "learning_rate": 0.0005, + "epoch": 0.10128908285200477, + "step": 2260 + }, + { + "loss": 14.2719, + "grad_norm": 2.082878351211548, + "learning_rate": 0.0005, + "epoch": 0.10151317374327028, + "step": 2265 + }, + { + "loss": 14.1601, + "grad_norm": 2.1429758071899414, + "learning_rate": 0.0005, + "epoch": 0.10173726463453577, + "step": 2270 + }, + { + "loss": 14.3036, + "grad_norm": 1.9471749067306519, + "learning_rate": 0.0005, + "epoch": 0.10196135552580127, + "step": 2275 + }, + { + "loss": 14.1512, + "grad_norm": 1.9544740915298462, + "learning_rate": 0.0005, + "epoch": 0.10218544641706676, + "step": 2280 + }, + { + "loss": 14.1185, + "grad_norm": 1.9881949424743652, + "learning_rate": 0.0005, + "epoch": 0.10240953730833226, + "step": 2285 + }, + { + "loss": 14.1819, + "grad_norm": 2.0492491722106934, + "learning_rate": 0.0005, + "epoch": 0.10263362819959776, + "step": 2290 + }, + { + "loss": 14.1772, + "grad_norm": 1.8486850261688232, + "learning_rate": 0.0005, + "epoch": 0.10285771909086325, + "step": 2295 + }, + { + "loss": 14.2211, + "grad_norm": 1.8952183723449707, + "learning_rate": 0.0005, + "epoch": 0.10308180998212875, + "step": 2300 + }, + { + "loss": 14.2313, + "grad_norm": 2.058659076690674, + "learning_rate": 0.0005, + "epoch": 0.10330590087339425, + "step": 2305 + }, + { + "loss": 14.1636, + "grad_norm": 2.078249454498291, + "learning_rate": 0.0005, + "epoch": 0.10352999176465974, + "step": 2310 + }, + { + "loss": 14.2255, + "grad_norm": 2.116319417953491, + "learning_rate": 0.0005, + "epoch": 0.10375408265592524, + "step": 2315 + }, + { + "loss": 14.1742, + "grad_norm": 1.9836976528167725, + "learning_rate": 0.0005, + "epoch": 0.10397817354719074, + "step": 2320 + }, + { + "loss": 14.2929, + "grad_norm": 1.8880807161331177, + "learning_rate": 0.0005, + "epoch": 0.10420226443845623, + "step": 2325 + }, + { + "loss": 14.2256, + "grad_norm": 2.033595323562622, + "learning_rate": 0.0005, + "epoch": 0.10442635532972173, + "step": 2330 + }, + { + "loss": 14.2659, + "grad_norm": 1.9538575410842896, + "learning_rate": 0.0005, + "epoch": 0.10465044622098724, + "step": 2335 + }, + { + "loss": 14.2364, + "grad_norm": 2.214423179626465, + "learning_rate": 0.0005, + "epoch": 0.10487453711225272, + "step": 2340 + }, + { + "loss": 14.2227, + "grad_norm": 2.110269546508789, + "learning_rate": 0.0005, + "epoch": 0.10509862800351823, + "step": 2345 + }, + { + "loss": 14.2708, + "grad_norm": 2.208690643310547, + "learning_rate": 0.0005, + "epoch": 0.10532271889478373, + "step": 2350 + }, + { + "loss": 14.2478, + "grad_norm": 2.119637966156006, + "learning_rate": 0.0005, + "epoch": 0.10554680978604922, + "step": 2355 + }, + { + "loss": 14.2098, + "grad_norm": 1.8906817436218262, + "learning_rate": 0.0005, + "epoch": 0.10577090067731472, + "step": 2360 + }, + { + "loss": 14.2196, + "grad_norm": 1.96303129196167, + "learning_rate": 0.0005, + "epoch": 0.10599499156858022, + "step": 2365 + }, + { + "loss": 14.3115, + "grad_norm": 1.972203254699707, + "learning_rate": 0.0005, + "epoch": 0.10621908245984571, + "step": 2370 + }, + { + "loss": 14.2014, + "grad_norm": 2.158946990966797, + "learning_rate": 0.0005, + "epoch": 0.10644317335111121, + "step": 2375 + }, + { + "loss": 14.2611, + "grad_norm": 2.0782086849212646, + "learning_rate": 0.0005, + "epoch": 0.10666726424237671, + "step": 2380 + }, + { + "loss": 14.1658, + "grad_norm": 1.9304105043411255, + "learning_rate": 0.0005, + "epoch": 0.1068913551336422, + "step": 2385 + }, + { + "loss": 14.1135, + "grad_norm": 1.9931169748306274, + "learning_rate": 0.0005, + "epoch": 0.1071154460249077, + "step": 2390 + }, + { + "loss": 14.2804, + "grad_norm": 2.020009994506836, + "learning_rate": 0.0005, + "epoch": 0.1073395369161732, + "step": 2395 + }, + { + "loss": 14.2276, + "grad_norm": 2.0295846462249756, + "learning_rate": 0.0005, + "epoch": 0.1075636278074387, + "step": 2400 + }, + { + "loss": 14.1819, + "grad_norm": 1.9559621810913086, + "learning_rate": 0.0005, + "epoch": 0.1077877186987042, + "step": 2405 + }, + { + "loss": 14.2124, + "grad_norm": 2.1522719860076904, + "learning_rate": 0.0005, + "epoch": 0.1080118095899697, + "step": 2410 + }, + { + "loss": 14.2209, + "grad_norm": 1.9611594676971436, + "learning_rate": 0.0005, + "epoch": 0.10823590048123519, + "step": 2415 + }, + { + "loss": 14.2033, + "grad_norm": 2.0471651554107666, + "learning_rate": 0.0005, + "epoch": 0.10845999137250069, + "step": 2420 + }, + { + "loss": 14.1608, + "grad_norm": 2.0396370887756348, + "learning_rate": 0.0005, + "epoch": 0.10868408226376618, + "step": 2425 + }, + { + "loss": 14.2029, + "grad_norm": 1.9847370386123657, + "learning_rate": 0.0005, + "epoch": 0.10890817315503168, + "step": 2430 + }, + { + "loss": 14.3484, + "grad_norm": 1.928900122642517, + "learning_rate": 0.0005, + "epoch": 0.10913226404629718, + "step": 2435 + }, + { + "loss": 14.1839, + "grad_norm": 1.9697222709655762, + "learning_rate": 0.0005, + "epoch": 0.10935635493756267, + "step": 2440 + }, + { + "loss": 14.3031, + "grad_norm": 1.9415478706359863, + "learning_rate": 0.0005, + "epoch": 0.10958044582882817, + "step": 2445 + }, + { + "loss": 14.1281, + "grad_norm": 2.1670243740081787, + "learning_rate": 0.0005, + "epoch": 0.10980453672009367, + "step": 2450 + }, + { + "loss": 14.2247, + "grad_norm": 2.067964792251587, + "learning_rate": 0.0005, + "epoch": 0.11002862761135916, + "step": 2455 + }, + { + "loss": 14.1618, + "grad_norm": 1.9133504629135132, + "learning_rate": 0.0005, + "epoch": 0.11025271850262466, + "step": 2460 + }, + { + "loss": 14.1767, + "grad_norm": 2.117112159729004, + "learning_rate": 0.0005, + "epoch": 0.11047680939389017, + "step": 2465 + }, + { + "loss": 14.2068, + "grad_norm": 2.132375717163086, + "learning_rate": 0.0005, + "epoch": 0.11070090028515565, + "step": 2470 + }, + { + "loss": 14.1987, + "grad_norm": 1.9348459243774414, + "learning_rate": 0.0005, + "epoch": 0.11092499117642116, + "step": 2475 + }, + { + "loss": 14.0825, + "grad_norm": 2.089294910430908, + "learning_rate": 0.0005, + "epoch": 0.11114908206768666, + "step": 2480 + }, + { + "loss": 14.2549, + "grad_norm": 2.2194576263427734, + "learning_rate": 0.0005, + "epoch": 0.11137317295895215, + "step": 2485 + }, + { + "loss": 14.3019, + "grad_norm": 2.019630193710327, + "learning_rate": 0.0005, + "epoch": 0.11159726385021765, + "step": 2490 + }, + { + "loss": 14.219, + "grad_norm": 1.942373275756836, + "learning_rate": 0.0005, + "epoch": 0.11182135474148315, + "step": 2495 + }, + { + "loss": 14.1376, + "grad_norm": 2.0612740516662598, + "learning_rate": 0.0005, + "epoch": 0.11204544563274864, + "step": 2500 + }, + { + "eval_loss": 1.7685763835906982, + "eval_runtime": 18.3338, + "eval_samples_per_second": 893.65, + "eval_steps_per_second": 8.018, + "epoch": 0.11204544563274864, + "step": 2500 + }, + { + "loss": 14.202, + "grad_norm": 2.05464768409729, + "learning_rate": 0.0005, + "epoch": 0.11226953652401414, + "step": 2505 + }, + { + "loss": 14.1914, + "grad_norm": 1.9068650007247925, + "learning_rate": 0.0005, + "epoch": 0.11249362741527964, + "step": 2510 + }, + { + "loss": 14.1411, + "grad_norm": 1.9879950284957886, + "learning_rate": 0.0005, + "epoch": 0.11271771830654513, + "step": 2515 + }, + { + "loss": 14.1364, + "grad_norm": 2.096574306488037, + "learning_rate": 0.0005, + "epoch": 0.11294180919781063, + "step": 2520 + }, + { + "loss": 14.1867, + "grad_norm": 1.994155764579773, + "learning_rate": 0.0005, + "epoch": 0.11316590008907614, + "step": 2525 + }, + { + "loss": 14.1919, + "grad_norm": 2.00042724609375, + "learning_rate": 0.0005, + "epoch": 0.11338999098034162, + "step": 2530 + }, + { + "loss": 14.1076, + "grad_norm": 2.0063529014587402, + "learning_rate": 0.0005, + "epoch": 0.11361408187160713, + "step": 2535 + }, + { + "loss": 14.2082, + "grad_norm": 1.990536093711853, + "learning_rate": 0.0005, + "epoch": 0.11383817276287263, + "step": 2540 + }, + { + "loss": 14.2478, + "grad_norm": 2.0046889781951904, + "learning_rate": 0.0005, + "epoch": 0.11406226365413812, + "step": 2545 + }, + { + "loss": 14.223, + "grad_norm": 1.8907625675201416, + "learning_rate": 0.0005, + "epoch": 0.11428635454540362, + "step": 2550 + }, + { + "loss": 14.1406, + "grad_norm": 1.927342176437378, + "learning_rate": 0.0005, + "epoch": 0.11451044543666912, + "step": 2555 + }, + { + "loss": 14.1912, + "grad_norm": 1.8561776876449585, + "learning_rate": 0.0005, + "epoch": 0.11473453632793461, + "step": 2560 + }, + { + "loss": 14.218, + "grad_norm": 1.9795992374420166, + "learning_rate": 0.0005, + "epoch": 0.11495862721920011, + "step": 2565 + }, + { + "loss": 14.1991, + "grad_norm": 2.0302844047546387, + "learning_rate": 0.0005, + "epoch": 0.1151827181104656, + "step": 2570 + }, + { + "loss": 14.1874, + "grad_norm": 1.9552420377731323, + "learning_rate": 0.0005, + "epoch": 0.1154068090017311, + "step": 2575 + }, + { + "loss": 14.1369, + "grad_norm": 2.0299878120422363, + "learning_rate": 0.0005, + "epoch": 0.1156308998929966, + "step": 2580 + }, + { + "loss": 14.2594, + "grad_norm": 2.055478096008301, + "learning_rate": 0.0005, + "epoch": 0.11585499078426209, + "step": 2585 + }, + { + "loss": 14.2619, + "grad_norm": 1.9360374212265015, + "learning_rate": 0.0005, + "epoch": 0.11607908167552759, + "step": 2590 + }, + { + "loss": 14.0849, + "grad_norm": 2.006324529647827, + "learning_rate": 0.0005, + "epoch": 0.1163031725667931, + "step": 2595 + }, + { + "loss": 14.1619, + "grad_norm": 1.85250985622406, + "learning_rate": 0.0005, + "epoch": 0.11652726345805858, + "step": 2600 + }, + { + "loss": 14.1659, + "grad_norm": 1.9738417863845825, + "learning_rate": 0.0005, + "epoch": 0.11675135434932409, + "step": 2605 + }, + { + "loss": 14.2338, + "grad_norm": 1.9255121946334839, + "learning_rate": 0.0005, + "epoch": 0.11697544524058959, + "step": 2610 + }, + { + "loss": 14.176, + "grad_norm": 1.98264741897583, + "learning_rate": 0.0005, + "epoch": 0.11719953613185508, + "step": 2615 + }, + { + "loss": 14.1467, + "grad_norm": 1.8770341873168945, + "learning_rate": 0.0005, + "epoch": 0.11742362702312058, + "step": 2620 + }, + { + "loss": 14.3661, + "grad_norm": 2.094547986984253, + "learning_rate": 0.0005, + "epoch": 0.11764771791438608, + "step": 2625 + }, + { + "loss": 14.246, + "grad_norm": 2.261746644973755, + "learning_rate": 0.0005, + "epoch": 0.11787180880565157, + "step": 2630 + }, + { + "loss": 14.2903, + "grad_norm": 1.9414910078048706, + "learning_rate": 0.0005, + "epoch": 0.11809589969691707, + "step": 2635 + }, + { + "loss": 14.1791, + "grad_norm": 1.837276577949524, + "learning_rate": 0.0005, + "epoch": 0.11831999058818257, + "step": 2640 + }, + { + "loss": 14.2062, + "grad_norm": 2.1287856101989746, + "learning_rate": 0.0005, + "epoch": 0.11854408147944806, + "step": 2645 + }, + { + "loss": 14.2858, + "grad_norm": 2.1157543659210205, + "learning_rate": 0.0005, + "epoch": 0.11876817237071356, + "step": 2650 + }, + { + "loss": 14.142, + "grad_norm": 2.023026704788208, + "learning_rate": 0.0005, + "epoch": 0.11899226326197906, + "step": 2655 + }, + { + "loss": 14.1913, + "grad_norm": 1.9373962879180908, + "learning_rate": 0.0005, + "epoch": 0.11921635415324455, + "step": 2660 + }, + { + "loss": 14.1843, + "grad_norm": 2.1472551822662354, + "learning_rate": 0.0005, + "epoch": 0.11944044504451005, + "step": 2665 + }, + { + "loss": 14.1469, + "grad_norm": 2.104508876800537, + "learning_rate": 0.0005, + "epoch": 0.11966453593577556, + "step": 2670 + }, + { + "loss": 14.2021, + "grad_norm": 1.955299735069275, + "learning_rate": 0.0005, + "epoch": 0.11988862682704104, + "step": 2675 + }, + { + "loss": 14.2431, + "grad_norm": 2.3675167560577393, + "learning_rate": 0.0005, + "epoch": 0.12011271771830655, + "step": 2680 + }, + { + "loss": 14.2884, + "grad_norm": 2.3444430828094482, + "learning_rate": 0.0005, + "epoch": 0.12033680860957205, + "step": 2685 + }, + { + "loss": 14.187, + "grad_norm": 1.919924259185791, + "learning_rate": 0.0005, + "epoch": 0.12056089950083754, + "step": 2690 + }, + { + "loss": 14.153, + "grad_norm": 1.9042751789093018, + "learning_rate": 0.0005, + "epoch": 0.12078499039210304, + "step": 2695 + }, + { + "loss": 14.2259, + "grad_norm": 1.8784211874008179, + "learning_rate": 0.0005, + "epoch": 0.12100908128336853, + "step": 2700 + }, + { + "loss": 14.1874, + "grad_norm": 2.0663719177246094, + "learning_rate": 0.0005, + "epoch": 0.12123317217463403, + "step": 2705 + }, + { + "loss": 14.2434, + "grad_norm": 1.931662917137146, + "learning_rate": 0.0005, + "epoch": 0.12145726306589953, + "step": 2710 + }, + { + "loss": 14.0864, + "grad_norm": 2.011249303817749, + "learning_rate": 0.0005, + "epoch": 0.12168135395716502, + "step": 2715 + }, + { + "loss": 14.1069, + "grad_norm": 2.1195309162139893, + "learning_rate": 0.0005, + "epoch": 0.12190544484843052, + "step": 2720 + }, + { + "loss": 14.2632, + "grad_norm": 2.1942484378814697, + "learning_rate": 0.0005, + "epoch": 0.12212953573969602, + "step": 2725 + }, + { + "loss": 14.1839, + "grad_norm": 2.2845983505249023, + "learning_rate": 0.0005, + "epoch": 0.12235362663096151, + "step": 2730 + }, + { + "loss": 14.1554, + "grad_norm": 1.9804823398590088, + "learning_rate": 0.0005, + "epoch": 0.12257771752222701, + "step": 2735 + }, + { + "loss": 14.2848, + "grad_norm": 2.084580421447754, + "learning_rate": 0.0005, + "epoch": 0.12280180841349252, + "step": 2740 + }, + { + "loss": 14.2366, + "grad_norm": 1.894793152809143, + "learning_rate": 0.0005, + "epoch": 0.123025899304758, + "step": 2745 + }, + { + "loss": 14.3029, + "grad_norm": 2.0080316066741943, + "learning_rate": 0.0005, + "epoch": 0.1232499901960235, + "step": 2750 + }, + { + "loss": 14.1409, + "grad_norm": 1.9836593866348267, + "learning_rate": 0.0005, + "epoch": 0.12347408108728901, + "step": 2755 + }, + { + "loss": 14.1903, + "grad_norm": 2.1319408416748047, + "learning_rate": 0.0005, + "epoch": 0.1236981719785545, + "step": 2760 + }, + { + "loss": 14.1102, + "grad_norm": 2.1290483474731445, + "learning_rate": 0.0005, + "epoch": 0.12392226286982, + "step": 2765 + }, + { + "loss": 14.2279, + "grad_norm": 1.875629186630249, + "learning_rate": 0.0005, + "epoch": 0.1241463537610855, + "step": 2770 + }, + { + "loss": 14.1881, + "grad_norm": 2.0723519325256348, + "learning_rate": 0.0005, + "epoch": 0.12437044465235099, + "step": 2775 + }, + { + "loss": 14.1952, + "grad_norm": 1.951786756515503, + "learning_rate": 0.0005, + "epoch": 0.12459453554361649, + "step": 2780 + }, + { + "loss": 14.2668, + "grad_norm": 2.084223985671997, + "learning_rate": 0.0005, + "epoch": 0.124818626434882, + "step": 2785 + }, + { + "loss": 14.2497, + "grad_norm": 1.9113136529922485, + "learning_rate": 0.0005, + "epoch": 0.1250427173261475, + "step": 2790 + }, + { + "loss": 14.2409, + "grad_norm": 2.0159406661987305, + "learning_rate": 0.0005, + "epoch": 0.12526680821741298, + "step": 2795 + }, + { + "loss": 14.2057, + "grad_norm": 2.006678819656372, + "learning_rate": 0.0005, + "epoch": 0.12549089910867847, + "step": 2800 + }, + { + "loss": 14.2058, + "grad_norm": 1.7884042263031006, + "learning_rate": 0.0005, + "epoch": 0.125714989999944, + "step": 2805 + }, + { + "loss": 14.2556, + "grad_norm": 1.9461385011672974, + "learning_rate": 0.0005, + "epoch": 0.12593908089120948, + "step": 2810 + }, + { + "loss": 14.1465, + "grad_norm": 1.9860858917236328, + "learning_rate": 0.0005, + "epoch": 0.12616317178247496, + "step": 2815 + }, + { + "loss": 14.2526, + "grad_norm": 2.0649843215942383, + "learning_rate": 0.0005, + "epoch": 0.12638726267374048, + "step": 2820 + }, + { + "loss": 14.1878, + "grad_norm": 1.8778871297836304, + "learning_rate": 0.0005, + "epoch": 0.12661135356500597, + "step": 2825 + }, + { + "loss": 14.1535, + "grad_norm": 2.0370824337005615, + "learning_rate": 0.0005, + "epoch": 0.12683544445627146, + "step": 2830 + }, + { + "loss": 14.2271, + "grad_norm": 2.323042154312134, + "learning_rate": 0.0005, + "epoch": 0.12705953534753697, + "step": 2835 + }, + { + "loss": 14.154, + "grad_norm": 2.1211202144622803, + "learning_rate": 0.0005, + "epoch": 0.12728362623880246, + "step": 2840 + }, + { + "loss": 14.4302, + "grad_norm": 2.069763660430908, + "learning_rate": 0.0005, + "epoch": 0.12750771713006795, + "step": 2845 + }, + { + "loss": 14.1709, + "grad_norm": 2.1002161502838135, + "learning_rate": 0.0005, + "epoch": 0.12773180802133347, + "step": 2850 + }, + { + "loss": 14.1417, + "grad_norm": 1.8982101678848267, + "learning_rate": 0.0005, + "epoch": 0.12795589891259895, + "step": 2855 + }, + { + "loss": 14.1567, + "grad_norm": 2.0299010276794434, + "learning_rate": 0.0005, + "epoch": 0.12817998980386444, + "step": 2860 + }, + { + "loss": 14.2246, + "grad_norm": 1.9968671798706055, + "learning_rate": 0.0005, + "epoch": 0.12840408069512996, + "step": 2865 + }, + { + "loss": 14.1685, + "grad_norm": 2.1153697967529297, + "learning_rate": 0.0005, + "epoch": 0.12862817158639545, + "step": 2870 + }, + { + "loss": 14.1018, + "grad_norm": 1.9605722427368164, + "learning_rate": 0.0005, + "epoch": 0.12885226247766093, + "step": 2875 + }, + { + "loss": 14.2357, + "grad_norm": 2.1596014499664307, + "learning_rate": 0.0005, + "epoch": 0.12907635336892645, + "step": 2880 + }, + { + "loss": 14.1059, + "grad_norm": 2.115142583847046, + "learning_rate": 0.0005, + "epoch": 0.12930044426019194, + "step": 2885 + }, + { + "loss": 14.2208, + "grad_norm": 1.9209206104278564, + "learning_rate": 0.0005, + "epoch": 0.12952453515145743, + "step": 2890 + }, + { + "loss": 14.0913, + "grad_norm": 2.015691041946411, + "learning_rate": 0.0005, + "epoch": 0.12974862604272294, + "step": 2895 + }, + { + "loss": 14.2086, + "grad_norm": 2.330225944519043, + "learning_rate": 0.0005, + "epoch": 0.12997271693398843, + "step": 2900 + }, + { + "loss": 14.1818, + "grad_norm": 2.37501859664917, + "learning_rate": 0.0005, + "epoch": 0.13019680782525392, + "step": 2905 + }, + { + "loss": 14.302, + "grad_norm": 1.8006333112716675, + "learning_rate": 0.0005, + "epoch": 0.1304208987165194, + "step": 2910 + }, + { + "loss": 14.1393, + "grad_norm": 1.9731910228729248, + "learning_rate": 0.0005, + "epoch": 0.13064498960778492, + "step": 2915 + }, + { + "loss": 14.1278, + "grad_norm": 2.001185894012451, + "learning_rate": 0.0005, + "epoch": 0.1308690804990504, + "step": 2920 + }, + { + "loss": 14.2546, + "grad_norm": 2.10178542137146, + "learning_rate": 0.0005, + "epoch": 0.1310931713903159, + "step": 2925 + }, + { + "loss": 14.2663, + "grad_norm": 1.9712029695510864, + "learning_rate": 0.0005, + "epoch": 0.13131726228158142, + "step": 2930 + }, + { + "loss": 14.248, + "grad_norm": 2.0637047290802, + "learning_rate": 0.0005, + "epoch": 0.1315413531728469, + "step": 2935 + }, + { + "loss": 14.2202, + "grad_norm": 1.9773246049880981, + "learning_rate": 0.0005, + "epoch": 0.1317654440641124, + "step": 2940 + }, + { + "loss": 14.1655, + "grad_norm": 1.958693027496338, + "learning_rate": 0.0005, + "epoch": 0.1319895349553779, + "step": 2945 + }, + { + "loss": 14.1329, + "grad_norm": 1.9771935939788818, + "learning_rate": 0.0005, + "epoch": 0.1322136258466434, + "step": 2950 + }, + { + "loss": 14.1144, + "grad_norm": 2.0255813598632812, + "learning_rate": 0.0005, + "epoch": 0.13243771673790888, + "step": 2955 + }, + { + "loss": 14.1486, + "grad_norm": 1.958465337753296, + "learning_rate": 0.0005, + "epoch": 0.1326618076291744, + "step": 2960 + }, + { + "loss": 14.1397, + "grad_norm": 1.9276237487792969, + "learning_rate": 0.0005, + "epoch": 0.1328858985204399, + "step": 2965 + }, + { + "loss": 14.2054, + "grad_norm": 2.038614511489868, + "learning_rate": 0.0005, + "epoch": 0.13310998941170538, + "step": 2970 + }, + { + "loss": 14.1933, + "grad_norm": 2.030726671218872, + "learning_rate": 0.0005, + "epoch": 0.1333340803029709, + "step": 2975 + }, + { + "loss": 14.1404, + "grad_norm": 2.0541908740997314, + "learning_rate": 0.0005, + "epoch": 0.13355817119423638, + "step": 2980 + }, + { + "loss": 14.1175, + "grad_norm": 1.962197184562683, + "learning_rate": 0.0005, + "epoch": 0.13378226208550187, + "step": 2985 + }, + { + "loss": 14.227, + "grad_norm": 1.9372152090072632, + "learning_rate": 0.0005, + "epoch": 0.13400635297676738, + "step": 2990 + }, + { + "loss": 14.2248, + "grad_norm": 1.856194019317627, + "learning_rate": 0.0005, + "epoch": 0.13423044386803287, + "step": 2995 + }, + { + "loss": 14.2369, + "grad_norm": 2.008424758911133, + "learning_rate": 0.0005, + "epoch": 0.13445453475929836, + "step": 3000 + }, + { + "eval_loss": 1.766704797744751, + "eval_runtime": 18.3043, + "eval_samples_per_second": 895.089, + "eval_steps_per_second": 8.031, + "epoch": 0.13445453475929836, + "step": 3000 + }, + { + "loss": 14.2562, + "grad_norm": 1.9771119356155396, + "learning_rate": 0.0005, + "epoch": 0.13467862565056388, + "step": 3005 + }, + { + "loss": 14.1959, + "grad_norm": 2.1328437328338623, + "learning_rate": 0.0005, + "epoch": 0.13490271654182937, + "step": 3010 + }, + { + "loss": 14.1737, + "grad_norm": 2.1573870182037354, + "learning_rate": 0.0005, + "epoch": 0.13512680743309485, + "step": 3015 + }, + { + "loss": 14.165, + "grad_norm": 1.8604843616485596, + "learning_rate": 0.0005, + "epoch": 0.13535089832436037, + "step": 3020 + }, + { + "loss": 14.2836, + "grad_norm": 1.8671879768371582, + "learning_rate": 0.0005, + "epoch": 0.13557498921562586, + "step": 3025 + }, + { + "loss": 14.2551, + "grad_norm": 2.0154945850372314, + "learning_rate": 0.0005, + "epoch": 0.13579908010689135, + "step": 3030 + }, + { + "loss": 14.2651, + "grad_norm": 1.9040732383728027, + "learning_rate": 0.0005, + "epoch": 0.13602317099815686, + "step": 3035 + }, + { + "loss": 14.1997, + "grad_norm": 2.0263047218322754, + "learning_rate": 0.0005, + "epoch": 0.13624726188942235, + "step": 3040 + }, + { + "loss": 14.2974, + "grad_norm": 1.902062177658081, + "learning_rate": 0.0005, + "epoch": 0.13647135278068784, + "step": 3045 + }, + { + "loss": 14.1647, + "grad_norm": 1.8822523355484009, + "learning_rate": 0.0005, + "epoch": 0.13669544367195335, + "step": 3050 + }, + { + "loss": 14.1171, + "grad_norm": 1.9626497030258179, + "learning_rate": 0.0005, + "epoch": 0.13691953456321884, + "step": 3055 + }, + { + "loss": 14.1639, + "grad_norm": 1.9847556352615356, + "learning_rate": 0.0005, + "epoch": 0.13714362545448433, + "step": 3060 + }, + { + "loss": 14.3451, + "grad_norm": 2.1711082458496094, + "learning_rate": 0.0005, + "epoch": 0.13736771634574985, + "step": 3065 + }, + { + "loss": 14.2022, + "grad_norm": 1.9800173044204712, + "learning_rate": 0.0005, + "epoch": 0.13759180723701533, + "step": 3070 + }, + { + "loss": 14.3151, + "grad_norm": 1.9356379508972168, + "learning_rate": 0.0005, + "epoch": 0.13781589812828082, + "step": 3075 + }, + { + "loss": 14.194, + "grad_norm": 2.027348518371582, + "learning_rate": 0.0005, + "epoch": 0.13803998901954634, + "step": 3080 + }, + { + "loss": 14.202, + "grad_norm": 2.1039271354675293, + "learning_rate": 0.0005, + "epoch": 0.13826407991081183, + "step": 3085 + }, + { + "loss": 14.2074, + "grad_norm": 1.9769928455352783, + "learning_rate": 0.0005, + "epoch": 0.13848817080207732, + "step": 3090 + }, + { + "loss": 14.1429, + "grad_norm": 1.9912936687469482, + "learning_rate": 0.0005, + "epoch": 0.13871226169334283, + "step": 3095 + }, + { + "loss": 14.2189, + "grad_norm": 1.9611599445343018, + "learning_rate": 0.0005, + "epoch": 0.13893635258460832, + "step": 3100 + }, + { + "loss": 14.2479, + "grad_norm": 1.8536235094070435, + "learning_rate": 0.0005, + "epoch": 0.1391604434758738, + "step": 3105 + }, + { + "loss": 14.2304, + "grad_norm": 1.8334366083145142, + "learning_rate": 0.0005, + "epoch": 0.13938453436713932, + "step": 3110 + }, + { + "loss": 14.1922, + "grad_norm": 2.020998239517212, + "learning_rate": 0.0005, + "epoch": 0.1396086252584048, + "step": 3115 + }, + { + "loss": 14.2457, + "grad_norm": 2.0635135173797607, + "learning_rate": 0.0005, + "epoch": 0.1398327161496703, + "step": 3120 + }, + { + "loss": 14.2277, + "grad_norm": 1.9335616827011108, + "learning_rate": 0.0005, + "epoch": 0.14005680704093582, + "step": 3125 + }, + { + "loss": 14.1735, + "grad_norm": 2.0182814598083496, + "learning_rate": 0.0005, + "epoch": 0.1402808979322013, + "step": 3130 + }, + { + "loss": 14.1644, + "grad_norm": 1.9300106763839722, + "learning_rate": 0.0005, + "epoch": 0.1405049888234668, + "step": 3135 + }, + { + "loss": 14.105, + "grad_norm": 1.9784296751022339, + "learning_rate": 0.0005, + "epoch": 0.1407290797147323, + "step": 3140 + }, + { + "loss": 14.2727, + "grad_norm": 1.8297818899154663, + "learning_rate": 0.0005, + "epoch": 0.1409531706059978, + "step": 3145 + }, + { + "loss": 14.2687, + "grad_norm": 2.1447770595550537, + "learning_rate": 0.0005, + "epoch": 0.14117726149726328, + "step": 3150 + }, + { + "loss": 14.2114, + "grad_norm": 1.9135013818740845, + "learning_rate": 0.0005, + "epoch": 0.1414013523885288, + "step": 3155 + }, + { + "loss": 14.1907, + "grad_norm": 2.194002389907837, + "learning_rate": 0.0005, + "epoch": 0.1416254432797943, + "step": 3160 + }, + { + "loss": 14.1548, + "grad_norm": 2.1245758533477783, + "learning_rate": 0.0005, + "epoch": 0.14184953417105978, + "step": 3165 + }, + { + "loss": 14.1323, + "grad_norm": 2.1875216960906982, + "learning_rate": 0.0005, + "epoch": 0.1420736250623253, + "step": 3170 + }, + { + "loss": 14.1936, + "grad_norm": 2.0083394050598145, + "learning_rate": 0.0005, + "epoch": 0.14229771595359078, + "step": 3175 + }, + { + "loss": 14.1293, + "grad_norm": 2.0901074409484863, + "learning_rate": 0.0005, + "epoch": 0.14252180684485627, + "step": 3180 + }, + { + "loss": 14.2272, + "grad_norm": 2.2300620079040527, + "learning_rate": 0.0005, + "epoch": 0.14274589773612176, + "step": 3185 + }, + { + "loss": 14.178, + "grad_norm": 1.8876138925552368, + "learning_rate": 0.0005, + "epoch": 0.14296998862738727, + "step": 3190 + }, + { + "loss": 14.0993, + "grad_norm": 1.8890095949172974, + "learning_rate": 0.0005, + "epoch": 0.14319407951865276, + "step": 3195 + }, + { + "loss": 14.2384, + "grad_norm": 2.007490396499634, + "learning_rate": 0.0005, + "epoch": 0.14341817040991825, + "step": 3200 + }, + { + "loss": 14.1718, + "grad_norm": 1.9662436246871948, + "learning_rate": 0.0005, + "epoch": 0.14364226130118377, + "step": 3205 + }, + { + "loss": 14.2349, + "grad_norm": 2.0031325817108154, + "learning_rate": 0.0005, + "epoch": 0.14386635219244925, + "step": 3210 + }, + { + "loss": 14.2658, + "grad_norm": 2.1796069145202637, + "learning_rate": 0.0005, + "epoch": 0.14409044308371474, + "step": 3215 + }, + { + "loss": 14.1664, + "grad_norm": 2.0716702938079834, + "learning_rate": 0.0005, + "epoch": 0.14431453397498026, + "step": 3220 + }, + { + "loss": 14.2758, + "grad_norm": 1.9255475997924805, + "learning_rate": 0.0005, + "epoch": 0.14453862486624575, + "step": 3225 + }, + { + "loss": 14.1191, + "grad_norm": 2.288928508758545, + "learning_rate": 0.0005, + "epoch": 0.14476271575751123, + "step": 3230 + }, + { + "loss": 14.0554, + "grad_norm": 2.1082675457000732, + "learning_rate": 0.0005, + "epoch": 0.14498680664877675, + "step": 3235 + }, + { + "loss": 14.1859, + "grad_norm": 2.0232932567596436, + "learning_rate": 0.0005, + "epoch": 0.14521089754004224, + "step": 3240 + }, + { + "loss": 14.1005, + "grad_norm": 1.8354517221450806, + "learning_rate": 0.0005, + "epoch": 0.14543498843130773, + "step": 3245 + }, + { + "loss": 14.1885, + "grad_norm": 2.031891345977783, + "learning_rate": 0.0005, + "epoch": 0.14565907932257324, + "step": 3250 + }, + { + "loss": 14.1925, + "grad_norm": 1.9492149353027344, + "learning_rate": 0.0005, + "epoch": 0.14588317021383873, + "step": 3255 + }, + { + "loss": 14.294, + "grad_norm": 2.073660373687744, + "learning_rate": 0.0005, + "epoch": 0.14610726110510422, + "step": 3260 + }, + { + "loss": 14.1864, + "grad_norm": 1.9290986061096191, + "learning_rate": 0.0005, + "epoch": 0.14633135199636974, + "step": 3265 + }, + { + "loss": 14.1303, + "grad_norm": 2.0566937923431396, + "learning_rate": 0.0005, + "epoch": 0.14655544288763522, + "step": 3270 + }, + { + "loss": 14.027, + "grad_norm": 1.9406923055648804, + "learning_rate": 0.0005, + "epoch": 0.1467795337789007, + "step": 3275 + }, + { + "loss": 14.0723, + "grad_norm": 2.021090507507324, + "learning_rate": 0.0005, + "epoch": 0.14700362467016623, + "step": 3280 + }, + { + "loss": 14.162, + "grad_norm": 2.272169589996338, + "learning_rate": 0.0005, + "epoch": 0.14722771556143172, + "step": 3285 + }, + { + "loss": 14.2405, + "grad_norm": 1.9472681283950806, + "learning_rate": 0.0005, + "epoch": 0.1474518064526972, + "step": 3290 + }, + { + "loss": 14.229, + "grad_norm": 1.8901585340499878, + "learning_rate": 0.0005, + "epoch": 0.14767589734396272, + "step": 3295 + }, + { + "loss": 14.0907, + "grad_norm": 1.9590394496917725, + "learning_rate": 0.0005, + "epoch": 0.1478999882352282, + "step": 3300 + }, + { + "loss": 14.1363, + "grad_norm": 2.068774461746216, + "learning_rate": 0.0005, + "epoch": 0.1481240791264937, + "step": 3305 + }, + { + "loss": 14.2616, + "grad_norm": 2.1157450675964355, + "learning_rate": 0.0005, + "epoch": 0.1483481700177592, + "step": 3310 + }, + { + "loss": 14.167, + "grad_norm": 2.029632091522217, + "learning_rate": 0.0005, + "epoch": 0.1485722609090247, + "step": 3315 + }, + { + "loss": 14.1601, + "grad_norm": 2.0851261615753174, + "learning_rate": 0.0005, + "epoch": 0.1487963518002902, + "step": 3320 + }, + { + "loss": 14.225, + "grad_norm": 2.05302357673645, + "learning_rate": 0.0005, + "epoch": 0.1490204426915557, + "step": 3325 + }, + { + "loss": 14.0864, + "grad_norm": 1.9321824312210083, + "learning_rate": 0.0005, + "epoch": 0.1492445335828212, + "step": 3330 + }, + { + "loss": 14.1827, + "grad_norm": 1.9291865825653076, + "learning_rate": 0.0005, + "epoch": 0.14946862447408668, + "step": 3335 + }, + { + "loss": 14.2404, + "grad_norm": 2.1532223224639893, + "learning_rate": 0.0005, + "epoch": 0.1496927153653522, + "step": 3340 + }, + { + "loss": 14.2338, + "grad_norm": 2.166241407394409, + "learning_rate": 0.0005, + "epoch": 0.14991680625661769, + "step": 3345 + }, + { + "loss": 14.1989, + "grad_norm": 2.0716960430145264, + "learning_rate": 0.0005, + "epoch": 0.15014089714788317, + "step": 3350 + }, + { + "loss": 14.2566, + "grad_norm": 2.158968448638916, + "learning_rate": 0.0005, + "epoch": 0.1503649880391487, + "step": 3355 + }, + { + "loss": 14.2069, + "grad_norm": 2.1141438484191895, + "learning_rate": 0.0005, + "epoch": 0.15058907893041418, + "step": 3360 + }, + { + "loss": 14.151, + "grad_norm": 2.0348339080810547, + "learning_rate": 0.0005, + "epoch": 0.15081316982167967, + "step": 3365 + }, + { + "loss": 14.1337, + "grad_norm": 2.09519100189209, + "learning_rate": 0.0005, + "epoch": 0.15103726071294518, + "step": 3370 + }, + { + "loss": 14.1808, + "grad_norm": 2.012434720993042, + "learning_rate": 0.0005, + "epoch": 0.15126135160421067, + "step": 3375 + }, + { + "loss": 14.0764, + "grad_norm": 2.0529088973999023, + "learning_rate": 0.0005, + "epoch": 0.15148544249547616, + "step": 3380 + }, + { + "loss": 14.1444, + "grad_norm": 2.0532617568969727, + "learning_rate": 0.0005, + "epoch": 0.15170953338674167, + "step": 3385 + }, + { + "loss": 14.2286, + "grad_norm": 1.7985155582427979, + "learning_rate": 0.0005, + "epoch": 0.15193362427800716, + "step": 3390 + }, + { + "loss": 14.1809, + "grad_norm": 1.9286701679229736, + "learning_rate": 0.0005, + "epoch": 0.15215771516927265, + "step": 3395 + }, + { + "loss": 14.1947, + "grad_norm": 2.126664161682129, + "learning_rate": 0.0005, + "epoch": 0.15238180606053817, + "step": 3400 + }, + { + "loss": 14.0935, + "grad_norm": 2.0395383834838867, + "learning_rate": 0.0005, + "epoch": 0.15260589695180365, + "step": 3405 + }, + { + "loss": 14.1364, + "grad_norm": 2.0696709156036377, + "learning_rate": 0.0005, + "epoch": 0.15282998784306914, + "step": 3410 + }, + { + "loss": 14.21, + "grad_norm": 1.9847348928451538, + "learning_rate": 0.0005, + "epoch": 0.15305407873433466, + "step": 3415 + }, + { + "loss": 14.1458, + "grad_norm": 1.9850157499313354, + "learning_rate": 0.0005, + "epoch": 0.15327816962560015, + "step": 3420 + }, + { + "loss": 14.2044, + "grad_norm": 2.1267752647399902, + "learning_rate": 0.0005, + "epoch": 0.15350226051686564, + "step": 3425 + }, + { + "loss": 14.246, + "grad_norm": 2.027884006500244, + "learning_rate": 0.0005, + "epoch": 0.15372635140813115, + "step": 3430 + }, + { + "loss": 14.1692, + "grad_norm": 2.0583317279815674, + "learning_rate": 0.0005, + "epoch": 0.15395044229939664, + "step": 3435 + }, + { + "loss": 14.2038, + "grad_norm": 2.07718825340271, + "learning_rate": 0.0005, + "epoch": 0.15417453319066213, + "step": 3440 + }, + { + "loss": 14.1482, + "grad_norm": 1.9352562427520752, + "learning_rate": 0.0005, + "epoch": 0.15439862408192764, + "step": 3445 + }, + { + "loss": 14.2268, + "grad_norm": 1.9634325504302979, + "learning_rate": 0.0005, + "epoch": 0.15462271497319313, + "step": 3450 + }, + { + "loss": 14.2241, + "grad_norm": 2.0379881858825684, + "learning_rate": 0.0005, + "epoch": 0.15484680586445862, + "step": 3455 + }, + { + "loss": 14.1091, + "grad_norm": 1.9497126340866089, + "learning_rate": 0.0005, + "epoch": 0.1550708967557241, + "step": 3460 + }, + { + "loss": 14.0976, + "grad_norm": 2.1084482669830322, + "learning_rate": 0.0005, + "epoch": 0.15529498764698962, + "step": 3465 + }, + { + "loss": 14.2549, + "grad_norm": 1.9887515306472778, + "learning_rate": 0.0005, + "epoch": 0.1555190785382551, + "step": 3470 + }, + { + "loss": 14.209, + "grad_norm": 2.1256349086761475, + "learning_rate": 0.0005, + "epoch": 0.1557431694295206, + "step": 3475 + }, + { + "loss": 14.1249, + "grad_norm": 2.1030185222625732, + "learning_rate": 0.0005, + "epoch": 0.15596726032078612, + "step": 3480 + }, + { + "loss": 14.2972, + "grad_norm": 1.9382493495941162, + "learning_rate": 0.0005, + "epoch": 0.1561913512120516, + "step": 3485 + }, + { + "loss": 14.226, + "grad_norm": 1.9482671022415161, + "learning_rate": 0.0005, + "epoch": 0.1564154421033171, + "step": 3490 + }, + { + "loss": 14.1851, + "grad_norm": 1.9536066055297852, + "learning_rate": 0.0005, + "epoch": 0.1566395329945826, + "step": 3495 + }, + { + "loss": 14.2262, + "grad_norm": 2.197573184967041, + "learning_rate": 0.0005, + "epoch": 0.1568636238858481, + "step": 3500 + }, + { + "eval_loss": 1.7696559429168701, + "eval_runtime": 18.3621, + "eval_samples_per_second": 892.271, + "eval_steps_per_second": 8.006, + "epoch": 0.1568636238858481, + "step": 3500 + }, + { + "loss": 14.1501, + "grad_norm": 2.1332786083221436, + "learning_rate": 0.0005, + "epoch": 0.15708771477711359, + "step": 3505 + }, + { + "loss": 14.1858, + "grad_norm": 2.1712486743927, + "learning_rate": 0.0005, + "epoch": 0.1573118056683791, + "step": 3510 + }, + { + "loss": 14.2522, + "grad_norm": 1.9922101497650146, + "learning_rate": 0.0005, + "epoch": 0.1575358965596446, + "step": 3515 + }, + { + "loss": 14.1805, + "grad_norm": 2.2779488563537598, + "learning_rate": 0.0005, + "epoch": 0.15775998745091008, + "step": 3520 + }, + { + "loss": 14.1793, + "grad_norm": 1.9923431873321533, + "learning_rate": 0.0005, + "epoch": 0.1579840783421756, + "step": 3525 + }, + { + "loss": 14.2578, + "grad_norm": 2.1077773571014404, + "learning_rate": 0.0005, + "epoch": 0.15820816923344108, + "step": 3530 + }, + { + "loss": 14.2033, + "grad_norm": 2.0996055603027344, + "learning_rate": 0.0005, + "epoch": 0.15843226012470657, + "step": 3535 + }, + { + "loss": 14.2472, + "grad_norm": 1.8849451541900635, + "learning_rate": 0.0005, + "epoch": 0.15865635101597209, + "step": 3540 + }, + { + "loss": 14.1729, + "grad_norm": 1.9497867822647095, + "learning_rate": 0.0005, + "epoch": 0.15888044190723757, + "step": 3545 + }, + { + "loss": 14.1113, + "grad_norm": 2.1263110637664795, + "learning_rate": 0.0005, + "epoch": 0.15910453279850306, + "step": 3550 + }, + { + "loss": 14.3157, + "grad_norm": 2.093867063522339, + "learning_rate": 0.0005, + "epoch": 0.15932862368976858, + "step": 3555 + }, + { + "loss": 14.1378, + "grad_norm": 1.9930814504623413, + "learning_rate": 0.0005, + "epoch": 0.15955271458103407, + "step": 3560 + }, + { + "loss": 14.0981, + "grad_norm": 1.9953348636627197, + "learning_rate": 0.0005, + "epoch": 0.15977680547229955, + "step": 3565 + }, + { + "loss": 14.3871, + "grad_norm": 2.055655002593994, + "learning_rate": 0.0005, + "epoch": 0.16000089636356507, + "step": 3570 + }, + { + "loss": 14.2805, + "grad_norm": 1.884310007095337, + "learning_rate": 0.0005, + "epoch": 0.16022498725483056, + "step": 3575 + }, + { + "loss": 14.0668, + "grad_norm": 1.9147753715515137, + "learning_rate": 0.0005, + "epoch": 0.16044907814609605, + "step": 3580 + }, + { + "loss": 14.2115, + "grad_norm": 2.0674216747283936, + "learning_rate": 0.0005, + "epoch": 0.16067316903736156, + "step": 3585 + }, + { + "loss": 14.1922, + "grad_norm": 2.0138802528381348, + "learning_rate": 0.0005, + "epoch": 0.16089725992862705, + "step": 3590 + }, + { + "loss": 14.1058, + "grad_norm": 1.9173427820205688, + "learning_rate": 0.0005, + "epoch": 0.16112135081989254, + "step": 3595 + }, + { + "loss": 14.0634, + "grad_norm": 1.9749242067337036, + "learning_rate": 0.0005, + "epoch": 0.16134544171115806, + "step": 3600 + }, + { + "loss": 14.2739, + "grad_norm": 1.9603943824768066, + "learning_rate": 0.0005, + "epoch": 0.16156953260242354, + "step": 3605 + }, + { + "loss": 14.1966, + "grad_norm": 2.004307508468628, + "learning_rate": 0.0005, + "epoch": 0.16179362349368903, + "step": 3610 + }, + { + "loss": 14.1685, + "grad_norm": 1.9626902341842651, + "learning_rate": 0.0005, + "epoch": 0.16201771438495455, + "step": 3615 + }, + { + "loss": 14.2082, + "grad_norm": 1.9217684268951416, + "learning_rate": 0.0005, + "epoch": 0.16224180527622004, + "step": 3620 + }, + { + "loss": 14.1972, + "grad_norm": 2.047498941421509, + "learning_rate": 0.0005, + "epoch": 0.16246589616748552, + "step": 3625 + }, + { + "loss": 14.1222, + "grad_norm": 1.9391909837722778, + "learning_rate": 0.0005, + "epoch": 0.16268998705875104, + "step": 3630 + }, + { + "loss": 14.1984, + "grad_norm": 2.028310775756836, + "learning_rate": 0.0005, + "epoch": 0.16291407795001653, + "step": 3635 + }, + { + "loss": 14.217, + "grad_norm": 2.000718355178833, + "learning_rate": 0.0005, + "epoch": 0.16313816884128202, + "step": 3640 + }, + { + "loss": 14.2582, + "grad_norm": 1.9638919830322266, + "learning_rate": 0.0005, + "epoch": 0.16336225973254753, + "step": 3645 + }, + { + "loss": 14.1799, + "grad_norm": 1.9330832958221436, + "learning_rate": 0.0005, + "epoch": 0.16358635062381302, + "step": 3650 + }, + { + "loss": 14.1262, + "grad_norm": 1.9113759994506836, + "learning_rate": 0.0005, + "epoch": 0.1638104415150785, + "step": 3655 + }, + { + "loss": 14.2052, + "grad_norm": 1.8158360719680786, + "learning_rate": 0.0005, + "epoch": 0.16403453240634402, + "step": 3660 + }, + { + "loss": 14.201, + "grad_norm": 2.2180416584014893, + "learning_rate": 0.0005, + "epoch": 0.1642586232976095, + "step": 3665 + }, + { + "loss": 14.1913, + "grad_norm": 2.4512360095977783, + "learning_rate": 0.0005, + "epoch": 0.164482714188875, + "step": 3670 + }, + { + "loss": 14.1491, + "grad_norm": 2.017465829849243, + "learning_rate": 0.0005, + "epoch": 0.16470680508014052, + "step": 3675 + }, + { + "loss": 14.1776, + "grad_norm": 2.021597146987915, + "learning_rate": 0.0005, + "epoch": 0.164930895971406, + "step": 3680 + }, + { + "loss": 14.2879, + "grad_norm": 2.0513081550598145, + "learning_rate": 0.0005, + "epoch": 0.1651549868626715, + "step": 3685 + }, + { + "loss": 14.2206, + "grad_norm": 1.86613929271698, + "learning_rate": 0.0005, + "epoch": 0.165379077753937, + "step": 3690 + }, + { + "loss": 14.2824, + "grad_norm": 1.9146097898483276, + "learning_rate": 0.0005, + "epoch": 0.1656031686452025, + "step": 3695 + }, + { + "loss": 14.2447, + "grad_norm": 1.893530249595642, + "learning_rate": 0.0005, + "epoch": 0.16582725953646799, + "step": 3700 + }, + { + "loss": 14.2238, + "grad_norm": 2.099381685256958, + "learning_rate": 0.0005, + "epoch": 0.1660513504277335, + "step": 3705 + }, + { + "loss": 14.2613, + "grad_norm": 1.8816338777542114, + "learning_rate": 0.0005, + "epoch": 0.166275441318999, + "step": 3710 + }, + { + "loss": 14.2852, + "grad_norm": 1.8955073356628418, + "learning_rate": 0.0005, + "epoch": 0.16649953221026448, + "step": 3715 + }, + { + "loss": 14.183, + "grad_norm": 2.0466463565826416, + "learning_rate": 0.0005, + "epoch": 0.16672362310153, + "step": 3720 + }, + { + "loss": 14.2067, + "grad_norm": 1.945846438407898, + "learning_rate": 0.0005, + "epoch": 0.16694771399279548, + "step": 3725 + }, + { + "loss": 14.2256, + "grad_norm": 2.201847553253174, + "learning_rate": 0.0005, + "epoch": 0.16717180488406097, + "step": 3730 + }, + { + "loss": 14.224, + "grad_norm": 1.9679917097091675, + "learning_rate": 0.0005, + "epoch": 0.16739589577532646, + "step": 3735 + }, + { + "loss": 14.2026, + "grad_norm": 2.029130697250366, + "learning_rate": 0.0005, + "epoch": 0.16761998666659197, + "step": 3740 + }, + { + "loss": 14.2399, + "grad_norm": 2.055572271347046, + "learning_rate": 0.0005, + "epoch": 0.16784407755785746, + "step": 3745 + }, + { + "loss": 14.2184, + "grad_norm": 2.423161506652832, + "learning_rate": 0.0005, + "epoch": 0.16806816844912295, + "step": 3750 + }, + { + "loss": 14.2343, + "grad_norm": 1.8970690965652466, + "learning_rate": 0.0005, + "epoch": 0.16829225934038847, + "step": 3755 + }, + { + "loss": 14.1596, + "grad_norm": 1.9343767166137695, + "learning_rate": 0.0005, + "epoch": 0.16851635023165396, + "step": 3760 + }, + { + "loss": 14.1946, + "grad_norm": 1.874110221862793, + "learning_rate": 0.0005, + "epoch": 0.16874044112291944, + "step": 3765 + }, + { + "loss": 14.1475, + "grad_norm": 1.7971092462539673, + "learning_rate": 0.0005, + "epoch": 0.16896453201418496, + "step": 3770 + }, + { + "loss": 14.161, + "grad_norm": 1.8645752668380737, + "learning_rate": 0.0005, + "epoch": 0.16918862290545045, + "step": 3775 + }, + { + "loss": 14.104, + "grad_norm": 1.9900041818618774, + "learning_rate": 0.0005, + "epoch": 0.16941271379671594, + "step": 3780 + }, + { + "loss": 14.1374, + "grad_norm": 2.0856997966766357, + "learning_rate": 0.0005, + "epoch": 0.16963680468798145, + "step": 3785 + }, + { + "loss": 14.1522, + "grad_norm": 2.178510904312134, + "learning_rate": 0.0005, + "epoch": 0.16986089557924694, + "step": 3790 + }, + { + "loss": 14.1531, + "grad_norm": 2.0625481605529785, + "learning_rate": 0.0005, + "epoch": 0.17008498647051243, + "step": 3795 + }, + { + "loss": 14.2027, + "grad_norm": 2.0520877838134766, + "learning_rate": 0.0005, + "epoch": 0.17030907736177794, + "step": 3800 + }, + { + "loss": 14.2521, + "grad_norm": 2.1085047721862793, + "learning_rate": 0.0005, + "epoch": 0.17053316825304343, + "step": 3805 + }, + { + "loss": 14.1758, + "grad_norm": 1.987500548362732, + "learning_rate": 0.0005, + "epoch": 0.17075725914430892, + "step": 3810 + }, + { + "loss": 14.1554, + "grad_norm": 2.0057876110076904, + "learning_rate": 0.0005, + "epoch": 0.17098135003557444, + "step": 3815 + }, + { + "loss": 14.2989, + "grad_norm": 1.9037858247756958, + "learning_rate": 0.0005, + "epoch": 0.17120544092683992, + "step": 3820 + }, + { + "loss": 14.1809, + "grad_norm": 1.977766513824463, + "learning_rate": 0.0005, + "epoch": 0.1714295318181054, + "step": 3825 + }, + { + "loss": 14.3139, + "grad_norm": 1.839013934135437, + "learning_rate": 0.0005, + "epoch": 0.17165362270937093, + "step": 3830 + }, + { + "loss": 14.1497, + "grad_norm": 1.875419020652771, + "learning_rate": 0.0005, + "epoch": 0.17187771360063642, + "step": 3835 + }, + { + "loss": 14.3065, + "grad_norm": 1.92861807346344, + "learning_rate": 0.0005, + "epoch": 0.1721018044919019, + "step": 3840 + }, + { + "loss": 14.1558, + "grad_norm": 1.94759202003479, + "learning_rate": 0.0005, + "epoch": 0.17232589538316742, + "step": 3845 + }, + { + "loss": 14.1194, + "grad_norm": 1.9916903972625732, + "learning_rate": 0.0005, + "epoch": 0.1725499862744329, + "step": 3850 + }, + { + "loss": 14.1304, + "grad_norm": 2.2031240463256836, + "learning_rate": 0.0005, + "epoch": 0.1727740771656984, + "step": 3855 + }, + { + "loss": 14.1791, + "grad_norm": 2.153580904006958, + "learning_rate": 0.0005, + "epoch": 0.1729981680569639, + "step": 3860 + }, + { + "loss": 14.1746, + "grad_norm": 2.063750743865967, + "learning_rate": 0.0005, + "epoch": 0.1732222589482294, + "step": 3865 + }, + { + "loss": 14.1255, + "grad_norm": 1.8618475198745728, + "learning_rate": 0.0005, + "epoch": 0.1734463498394949, + "step": 3870 + }, + { + "loss": 14.1702, + "grad_norm": 1.898120641708374, + "learning_rate": 0.0005, + "epoch": 0.1736704407307604, + "step": 3875 + }, + { + "loss": 14.2209, + "grad_norm": 2.050619125366211, + "learning_rate": 0.0005, + "epoch": 0.1738945316220259, + "step": 3880 + }, + { + "loss": 14.2072, + "grad_norm": 2.2624967098236084, + "learning_rate": 0.0005, + "epoch": 0.17411862251329138, + "step": 3885 + }, + { + "loss": 14.3059, + "grad_norm": 1.9534329175949097, + "learning_rate": 0.0005, + "epoch": 0.1743427134045569, + "step": 3890 + }, + { + "loss": 14.0746, + "grad_norm": 1.9066346883773804, + "learning_rate": 0.0005, + "epoch": 0.1745668042958224, + "step": 3895 + }, + { + "loss": 14.0933, + "grad_norm": 2.0286290645599365, + "learning_rate": 0.0005, + "epoch": 0.17479089518708787, + "step": 3900 + }, + { + "loss": 14.272, + "grad_norm": 1.9771586656570435, + "learning_rate": 0.0005, + "epoch": 0.1750149860783534, + "step": 3905 + }, + { + "loss": 14.1956, + "grad_norm": 2.023655652999878, + "learning_rate": 0.0005, + "epoch": 0.17523907696961888, + "step": 3910 + }, + { + "loss": 14.1492, + "grad_norm": 2.0308282375335693, + "learning_rate": 0.0005, + "epoch": 0.17546316786088437, + "step": 3915 + }, + { + "loss": 14.116, + "grad_norm": 1.9718906879425049, + "learning_rate": 0.0005, + "epoch": 0.17568725875214988, + "step": 3920 + }, + { + "loss": 14.1377, + "grad_norm": 2.0543808937072754, + "learning_rate": 0.0005, + "epoch": 0.17591134964341537, + "step": 3925 + }, + { + "loss": 14.2587, + "grad_norm": 1.874050259590149, + "learning_rate": 0.0005, + "epoch": 0.17613544053468086, + "step": 3930 + }, + { + "loss": 14.0803, + "grad_norm": 2.186093807220459, + "learning_rate": 0.0005, + "epoch": 0.17635953142594638, + "step": 3935 + }, + { + "loss": 14.1585, + "grad_norm": 2.1141676902770996, + "learning_rate": 0.0005, + "epoch": 0.17658362231721186, + "step": 3940 + }, + { + "loss": 14.1433, + "grad_norm": 2.111907482147217, + "learning_rate": 0.0005, + "epoch": 0.17680771320847735, + "step": 3945 + }, + { + "loss": 14.1331, + "grad_norm": 2.1772894859313965, + "learning_rate": 0.0005, + "epoch": 0.17703180409974287, + "step": 3950 + }, + { + "loss": 14.1855, + "grad_norm": 2.2366697788238525, + "learning_rate": 0.0005, + "epoch": 0.17725589499100836, + "step": 3955 + }, + { + "loss": 14.1047, + "grad_norm": 2.192366600036621, + "learning_rate": 0.0005, + "epoch": 0.17747998588227384, + "step": 3960 + }, + { + "loss": 14.1799, + "grad_norm": 2.024726152420044, + "learning_rate": 0.0005, + "epoch": 0.17770407677353936, + "step": 3965 + }, + { + "loss": 14.0991, + "grad_norm": 1.8307974338531494, + "learning_rate": 0.0005, + "epoch": 0.17792816766480485, + "step": 3970 + }, + { + "loss": 14.1659, + "grad_norm": 1.9299472570419312, + "learning_rate": 0.0005, + "epoch": 0.17815225855607034, + "step": 3975 + }, + { + "loss": 14.1707, + "grad_norm": 1.914947748184204, + "learning_rate": 0.0005, + "epoch": 0.17837634944733585, + "step": 3980 + }, + { + "loss": 14.1733, + "grad_norm": 2.022740602493286, + "learning_rate": 0.0005, + "epoch": 0.17860044033860134, + "step": 3985 + }, + { + "loss": 14.2043, + "grad_norm": 2.0922834873199463, + "learning_rate": 0.0005, + "epoch": 0.17882453122986683, + "step": 3990 + }, + { + "loss": 14.2815, + "grad_norm": 1.8319189548492432, + "learning_rate": 0.0005, + "epoch": 0.17904862212113234, + "step": 3995 + }, + { + "loss": 14.2127, + "grad_norm": 1.9187498092651367, + "learning_rate": 0.0005, + "epoch": 0.17927271301239783, + "step": 4000 + }, + { + "eval_loss": 1.7681186199188232, + "eval_runtime": 18.1442, + "eval_samples_per_second": 902.987, + "eval_steps_per_second": 8.102, + "epoch": 0.17927271301239783, + "step": 4000 + }, + { + "loss": 14.1477, + "grad_norm": 2.096644401550293, + "learning_rate": 0.0005, + "epoch": 0.17949680390366332, + "step": 4005 + }, + { + "loss": 14.1563, + "grad_norm": 2.2901999950408936, + "learning_rate": 0.0005, + "epoch": 0.1797208947949288, + "step": 4010 + }, + { + "loss": 14.0974, + "grad_norm": 2.0580334663391113, + "learning_rate": 0.0005, + "epoch": 0.17994498568619433, + "step": 4015 + }, + { + "loss": 14.1193, + "grad_norm": 1.9283686876296997, + "learning_rate": 0.0005, + "epoch": 0.1801690765774598, + "step": 4020 + }, + { + "loss": 14.2224, + "grad_norm": 2.0037317276000977, + "learning_rate": 0.0005, + "epoch": 0.1803931674687253, + "step": 4025 + }, + { + "loss": 14.1634, + "grad_norm": 2.235990285873413, + "learning_rate": 0.0005, + "epoch": 0.18061725835999082, + "step": 4030 + }, + { + "loss": 14.1742, + "grad_norm": 2.029142141342163, + "learning_rate": 0.0005, + "epoch": 0.1808413492512563, + "step": 4035 + }, + { + "loss": 14.1899, + "grad_norm": 1.978773832321167, + "learning_rate": 0.0005, + "epoch": 0.1810654401425218, + "step": 4040 + }, + { + "loss": 14.1891, + "grad_norm": 2.089905261993408, + "learning_rate": 0.0005, + "epoch": 0.1812895310337873, + "step": 4045 + }, + { + "loss": 14.218, + "grad_norm": 1.9286154508590698, + "learning_rate": 0.0005, + "epoch": 0.1815136219250528, + "step": 4050 + }, + { + "loss": 14.2234, + "grad_norm": 1.9573795795440674, + "learning_rate": 0.0005, + "epoch": 0.1817377128163183, + "step": 4055 + }, + { + "loss": 14.1842, + "grad_norm": 1.9163328409194946, + "learning_rate": 0.0005, + "epoch": 0.1819618037075838, + "step": 4060 + }, + { + "loss": 14.2729, + "grad_norm": 1.882692575454712, + "learning_rate": 0.0005, + "epoch": 0.1821858945988493, + "step": 4065 + }, + { + "loss": 14.1909, + "grad_norm": 2.1156723499298096, + "learning_rate": 0.0005, + "epoch": 0.18240998549011478, + "step": 4070 + }, + { + "loss": 14.2205, + "grad_norm": 1.8549857139587402, + "learning_rate": 0.0005, + "epoch": 0.1826340763813803, + "step": 4075 + }, + { + "loss": 14.143, + "grad_norm": 1.9605876207351685, + "learning_rate": 0.0005, + "epoch": 0.18285816727264578, + "step": 4080 + }, + { + "loss": 14.1288, + "grad_norm": 2.1343271732330322, + "learning_rate": 0.0005, + "epoch": 0.18308225816391127, + "step": 4085 + }, + { + "loss": 14.2586, + "grad_norm": 1.905978798866272, + "learning_rate": 0.0005, + "epoch": 0.1833063490551768, + "step": 4090 + }, + { + "loss": 14.2387, + "grad_norm": 1.9031362533569336, + "learning_rate": 0.0005, + "epoch": 0.18353043994644228, + "step": 4095 + }, + { + "loss": 14.2686, + "grad_norm": 2.0088846683502197, + "learning_rate": 0.0005, + "epoch": 0.18375453083770776, + "step": 4100 + }, + { + "loss": 14.2028, + "grad_norm": 1.8141952753067017, + "learning_rate": 0.0005, + "epoch": 0.18397862172897328, + "step": 4105 + }, + { + "loss": 14.1979, + "grad_norm": 1.85496985912323, + "learning_rate": 0.0005, + "epoch": 0.18420271262023877, + "step": 4110 + }, + { + "loss": 14.0222, + "grad_norm": 1.909380316734314, + "learning_rate": 0.0005, + "epoch": 0.18442680351150426, + "step": 4115 + }, + { + "loss": 14.0876, + "grad_norm": 1.9430705308914185, + "learning_rate": 0.0005, + "epoch": 0.18465089440276977, + "step": 4120 + }, + { + "loss": 14.1747, + "grad_norm": 2.063159704208374, + "learning_rate": 0.0005, + "epoch": 0.18487498529403526, + "step": 4125 + }, + { + "loss": 14.1689, + "grad_norm": 1.9237209558486938, + "learning_rate": 0.0005, + "epoch": 0.18509907618530075, + "step": 4130 + }, + { + "loss": 14.1428, + "grad_norm": 1.9635990858078003, + "learning_rate": 0.0005, + "epoch": 0.18532316707656626, + "step": 4135 + }, + { + "loss": 14.2495, + "grad_norm": 1.9368425607681274, + "learning_rate": 0.0005, + "epoch": 0.18554725796783175, + "step": 4140 + }, + { + "loss": 14.2331, + "grad_norm": 1.946762204170227, + "learning_rate": 0.0005, + "epoch": 0.18577134885909724, + "step": 4145 + }, + { + "loss": 14.1397, + "grad_norm": 1.9915481805801392, + "learning_rate": 0.0005, + "epoch": 0.18599543975036276, + "step": 4150 + }, + { + "loss": 14.2306, + "grad_norm": 2.0128185749053955, + "learning_rate": 0.0005, + "epoch": 0.18621953064162824, + "step": 4155 + }, + { + "loss": 14.3031, + "grad_norm": 1.8977795839309692, + "learning_rate": 0.0005, + "epoch": 0.18644362153289373, + "step": 4160 + }, + { + "loss": 14.0981, + "grad_norm": 1.9194146394729614, + "learning_rate": 0.0005, + "epoch": 0.18666771242415925, + "step": 4165 + }, + { + "loss": 14.1679, + "grad_norm": 1.9983175992965698, + "learning_rate": 0.0005, + "epoch": 0.18689180331542474, + "step": 4170 + }, + { + "loss": 14.0315, + "grad_norm": 2.0112874507904053, + "learning_rate": 0.0005, + "epoch": 0.18711589420669023, + "step": 4175 + }, + { + "loss": 14.2212, + "grad_norm": 2.002476453781128, + "learning_rate": 0.0005, + "epoch": 0.18733998509795574, + "step": 4180 + }, + { + "loss": 13.9643, + "grad_norm": 1.9648174047470093, + "learning_rate": 0.0005, + "epoch": 0.18756407598922123, + "step": 4185 + }, + { + "loss": 14.2087, + "grad_norm": 1.9648782014846802, + "learning_rate": 0.0005, + "epoch": 0.18778816688048672, + "step": 4190 + }, + { + "loss": 14.1372, + "grad_norm": 1.9486149549484253, + "learning_rate": 0.0005, + "epoch": 0.18801225777175223, + "step": 4195 + }, + { + "loss": 14.0997, + "grad_norm": 1.9666614532470703, + "learning_rate": 0.0005, + "epoch": 0.18823634866301772, + "step": 4200 + }, + { + "loss": 14.1642, + "grad_norm": 2.030273675918579, + "learning_rate": 0.0005, + "epoch": 0.1884604395542832, + "step": 4205 + }, + { + "loss": 14.1444, + "grad_norm": 1.8118653297424316, + "learning_rate": 0.0005, + "epoch": 0.18868453044554873, + "step": 4210 + }, + { + "loss": 14.0818, + "grad_norm": 1.9407061338424683, + "learning_rate": 0.0005, + "epoch": 0.18890862133681421, + "step": 4215 + }, + { + "loss": 14.277, + "grad_norm": 1.949091911315918, + "learning_rate": 0.0005, + "epoch": 0.1891327122280797, + "step": 4220 + }, + { + "loss": 14.1532, + "grad_norm": 1.961521863937378, + "learning_rate": 0.0005, + "epoch": 0.18935680311934522, + "step": 4225 + }, + { + "loss": 14.1745, + "grad_norm": 2.182128667831421, + "learning_rate": 0.0005, + "epoch": 0.1895808940106107, + "step": 4230 + }, + { + "loss": 14.2068, + "grad_norm": 2.066096305847168, + "learning_rate": 0.0005, + "epoch": 0.1898049849018762, + "step": 4235 + }, + { + "loss": 14.1577, + "grad_norm": 1.9394439458847046, + "learning_rate": 0.0005, + "epoch": 0.1900290757931417, + "step": 4240 + }, + { + "loss": 14.1978, + "grad_norm": 1.9918142557144165, + "learning_rate": 0.0005, + "epoch": 0.1902531666844072, + "step": 4245 + }, + { + "loss": 14.1594, + "grad_norm": 2.0450599193573, + "learning_rate": 0.0005, + "epoch": 0.1904772575756727, + "step": 4250 + }, + { + "loss": 14.1575, + "grad_norm": 2.207885980606079, + "learning_rate": 0.0005, + "epoch": 0.1907013484669382, + "step": 4255 + }, + { + "loss": 14.1581, + "grad_norm": 1.9571201801300049, + "learning_rate": 0.0005, + "epoch": 0.1909254393582037, + "step": 4260 + }, + { + "loss": 14.1871, + "grad_norm": 2.14985728263855, + "learning_rate": 0.0005, + "epoch": 0.19114953024946918, + "step": 4265 + }, + { + "loss": 14.2034, + "grad_norm": 2.0554187297821045, + "learning_rate": 0.0005, + "epoch": 0.1913736211407347, + "step": 4270 + }, + { + "loss": 14.07, + "grad_norm": 1.8668701648712158, + "learning_rate": 0.0005, + "epoch": 0.19159771203200018, + "step": 4275 + }, + { + "loss": 14.0869, + "grad_norm": 1.7568072080612183, + "learning_rate": 0.0005, + "epoch": 0.19182180292326567, + "step": 4280 + }, + { + "loss": 14.1629, + "grad_norm": 1.8690704107284546, + "learning_rate": 0.0005, + "epoch": 0.19204589381453116, + "step": 4285 + }, + { + "loss": 14.1132, + "grad_norm": 2.0354042053222656, + "learning_rate": 0.0005, + "epoch": 0.19226998470579668, + "step": 4290 + }, + { + "loss": 14.1111, + "grad_norm": 2.053537368774414, + "learning_rate": 0.0005, + "epoch": 0.19249407559706216, + "step": 4295 + }, + { + "loss": 14.2885, + "grad_norm": 1.9747086763381958, + "learning_rate": 0.0005, + "epoch": 0.19271816648832765, + "step": 4300 + }, + { + "loss": 14.2183, + "grad_norm": 2.169306755065918, + "learning_rate": 0.0005, + "epoch": 0.19294225737959317, + "step": 4305 + }, + { + "loss": 14.1, + "grad_norm": 2.0829429626464844, + "learning_rate": 0.0005, + "epoch": 0.19316634827085866, + "step": 4310 + }, + { + "loss": 14.1872, + "grad_norm": 2.137615919113159, + "learning_rate": 0.0005, + "epoch": 0.19339043916212414, + "step": 4315 + }, + { + "loss": 14.194, + "grad_norm": 2.0153584480285645, + "learning_rate": 0.0005, + "epoch": 0.19361453005338966, + "step": 4320 + }, + { + "loss": 14.2101, + "grad_norm": 1.9413926601409912, + "learning_rate": 0.0005, + "epoch": 0.19383862094465515, + "step": 4325 + }, + { + "loss": 14.1603, + "grad_norm": 1.9266632795333862, + "learning_rate": 0.0005, + "epoch": 0.19406271183592064, + "step": 4330 + }, + { + "loss": 14.1888, + "grad_norm": 1.8601921796798706, + "learning_rate": 0.0005, + "epoch": 0.19428680272718615, + "step": 4335 + }, + { + "loss": 14.0737, + "grad_norm": 2.0529356002807617, + "learning_rate": 0.0005, + "epoch": 0.19451089361845164, + "step": 4340 + }, + { + "loss": 14.1909, + "grad_norm": 2.106046438217163, + "learning_rate": 0.0005, + "epoch": 0.19473498450971713, + "step": 4345 + }, + { + "loss": 14.141, + "grad_norm": 1.9242192506790161, + "learning_rate": 0.0005, + "epoch": 0.19495907540098265, + "step": 4350 + }, + { + "loss": 14.2139, + "grad_norm": 2.0597519874572754, + "learning_rate": 0.0005, + "epoch": 0.19518316629224813, + "step": 4355 + }, + { + "loss": 14.1508, + "grad_norm": 2.0324819087982178, + "learning_rate": 0.0005, + "epoch": 0.19540725718351362, + "step": 4360 + }, + { + "loss": 14.0463, + "grad_norm": 1.981508731842041, + "learning_rate": 0.0005, + "epoch": 0.19563134807477914, + "step": 4365 + }, + { + "loss": 14.1125, + "grad_norm": 1.9829838275909424, + "learning_rate": 0.0005, + "epoch": 0.19585543896604463, + "step": 4370 + }, + { + "loss": 14.1608, + "grad_norm": 2.0116312503814697, + "learning_rate": 0.0005, + "epoch": 0.19607952985731011, + "step": 4375 + }, + { + "loss": 14.1478, + "grad_norm": 1.931014895439148, + "learning_rate": 0.0005, + "epoch": 0.19630362074857563, + "step": 4380 + }, + { + "loss": 14.1709, + "grad_norm": 2.123720407485962, + "learning_rate": 0.0005, + "epoch": 0.19652771163984112, + "step": 4385 + }, + { + "loss": 14.1629, + "grad_norm": 2.0556654930114746, + "learning_rate": 0.0005, + "epoch": 0.1967518025311066, + "step": 4390 + }, + { + "loss": 14.2394, + "grad_norm": 1.9314510822296143, + "learning_rate": 0.0005, + "epoch": 0.19697589342237212, + "step": 4395 + }, + { + "loss": 14.1073, + "grad_norm": 2.0276386737823486, + "learning_rate": 0.0005, + "epoch": 0.1971999843136376, + "step": 4400 + }, + { + "loss": 14.1801, + "grad_norm": 1.922781229019165, + "learning_rate": 0.0005, + "epoch": 0.1974240752049031, + "step": 4405 + }, + { + "loss": 14.2932, + "grad_norm": 1.8780455589294434, + "learning_rate": 0.0005, + "epoch": 0.19764816609616861, + "step": 4410 + }, + { + "loss": 14.2366, + "grad_norm": 2.770186185836792, + "learning_rate": 0.0005, + "epoch": 0.1978722569874341, + "step": 4415 + }, + { + "loss": 14.1106, + "grad_norm": 1.9934104681015015, + "learning_rate": 0.0005, + "epoch": 0.1980963478786996, + "step": 4420 + }, + { + "loss": 14.1122, + "grad_norm": 1.9693650007247925, + "learning_rate": 0.0005, + "epoch": 0.1983204387699651, + "step": 4425 + }, + { + "loss": 14.1088, + "grad_norm": 2.167682647705078, + "learning_rate": 0.0005, + "epoch": 0.1985445296612306, + "step": 4430 + }, + { + "loss": 14.1959, + "grad_norm": 1.8895680904388428, + "learning_rate": 0.0005, + "epoch": 0.19876862055249608, + "step": 4435 + }, + { + "loss": 14.2289, + "grad_norm": 1.9719932079315186, + "learning_rate": 0.0005, + "epoch": 0.1989927114437616, + "step": 4440 + }, + { + "loss": 14.112, + "grad_norm": 1.9747672080993652, + "learning_rate": 0.0005, + "epoch": 0.1992168023350271, + "step": 4445 + }, + { + "loss": 14.0386, + "grad_norm": 2.0173239707946777, + "learning_rate": 0.0005, + "epoch": 0.19944089322629258, + "step": 4450 + }, + { + "loss": 14.1635, + "grad_norm": 1.942533254623413, + "learning_rate": 0.0005, + "epoch": 0.1996649841175581, + "step": 4455 + }, + { + "loss": 14.2371, + "grad_norm": 1.8232591152191162, + "learning_rate": 0.0005, + "epoch": 0.19988907500882358, + "step": 4460 + }, + { + "loss": 14.225, + "grad_norm": 1.9957948923110962, + "learning_rate": 0.0005, + "epoch": 0.20011316590008907, + "step": 4465 + }, + { + "loss": 14.1964, + "grad_norm": 1.9297046661376953, + "learning_rate": 0.0005, + "epoch": 0.20033725679135458, + "step": 4470 + }, + { + "loss": 14.1573, + "grad_norm": 2.030123472213745, + "learning_rate": 0.0005, + "epoch": 0.20056134768262007, + "step": 4475 + }, + { + "loss": 14.2168, + "grad_norm": 2.060288906097412, + "learning_rate": 0.0005, + "epoch": 0.20078543857388556, + "step": 4480 + }, + { + "loss": 14.1218, + "grad_norm": 2.0320048332214355, + "learning_rate": 0.0005, + "epoch": 0.20100952946515108, + "step": 4485 + }, + { + "loss": 14.2131, + "grad_norm": 2.079967975616455, + "learning_rate": 0.0005, + "epoch": 0.20123362035641656, + "step": 4490 + }, + { + "loss": 14.1652, + "grad_norm": 2.0145998001098633, + "learning_rate": 0.0005, + "epoch": 0.20145771124768205, + "step": 4495 + }, + { + "loss": 14.3127, + "grad_norm": 1.9463013410568237, + "learning_rate": 0.0005, + "epoch": 0.20168180213894757, + "step": 4500 + }, + { + "eval_loss": 1.7675774097442627, + "eval_runtime": 18.483, + "eval_samples_per_second": 886.435, + "eval_steps_per_second": 7.953, + "epoch": 0.20168180213894757, + "step": 4500 + }, + { + "loss": 14.1472, + "grad_norm": 2.087540626525879, + "learning_rate": 0.0005, + "epoch": 0.20190589303021306, + "step": 4505 + }, + { + "loss": 14.2054, + "grad_norm": 1.9299111366271973, + "learning_rate": 0.0005, + "epoch": 0.20212998392147855, + "step": 4510 + }, + { + "loss": 14.2457, + "grad_norm": 1.780938744544983, + "learning_rate": 0.0005, + "epoch": 0.20235407481274406, + "step": 4515 + }, + { + "loss": 14.075, + "grad_norm": 1.9856529235839844, + "learning_rate": 0.0005, + "epoch": 0.20257816570400955, + "step": 4520 + }, + { + "loss": 14.1967, + "grad_norm": 1.8442602157592773, + "learning_rate": 0.0005, + "epoch": 0.20280225659527504, + "step": 4525 + }, + { + "loss": 14.1787, + "grad_norm": 1.90349280834198, + "learning_rate": 0.0005, + "epoch": 0.20302634748654055, + "step": 4530 + }, + { + "loss": 14.1255, + "grad_norm": 2.013941526412964, + "learning_rate": 0.0005, + "epoch": 0.20325043837780604, + "step": 4535 + }, + { + "loss": 14.208, + "grad_norm": 2.1116743087768555, + "learning_rate": 0.0005, + "epoch": 0.20347452926907153, + "step": 4540 + }, + { + "loss": 14.2598, + "grad_norm": 2.4251699447631836, + "learning_rate": 0.0005, + "epoch": 0.20369862016033705, + "step": 4545 + }, + { + "loss": 14.1809, + "grad_norm": 1.930247187614441, + "learning_rate": 0.0005, + "epoch": 0.20392271105160253, + "step": 4550 + }, + { + "loss": 14.1545, + "grad_norm": 1.8743910789489746, + "learning_rate": 0.0005, + "epoch": 0.20414680194286802, + "step": 4555 + }, + { + "loss": 14.0868, + "grad_norm": 2.250181198120117, + "learning_rate": 0.0005, + "epoch": 0.2043708928341335, + "step": 4560 + }, + { + "loss": 14.1351, + "grad_norm": 1.8496135473251343, + "learning_rate": 0.0005, + "epoch": 0.20459498372539903, + "step": 4565 + }, + { + "loss": 14.2764, + "grad_norm": 2.0180184841156006, + "learning_rate": 0.0005, + "epoch": 0.20481907461666451, + "step": 4570 + }, + { + "loss": 14.2002, + "grad_norm": 1.9459477663040161, + "learning_rate": 0.0005, + "epoch": 0.20504316550793, + "step": 4575 + }, + { + "loss": 14.1513, + "grad_norm": 1.914696455001831, + "learning_rate": 0.0005, + "epoch": 0.20526725639919552, + "step": 4580 + }, + { + "loss": 14.2149, + "grad_norm": 1.929095983505249, + "learning_rate": 0.0005, + "epoch": 0.205491347290461, + "step": 4585 + }, + { + "loss": 14.2086, + "grad_norm": 2.1694610118865967, + "learning_rate": 0.0005, + "epoch": 0.2057154381817265, + "step": 4590 + }, + { + "loss": 14.117, + "grad_norm": 1.9861886501312256, + "learning_rate": 0.0005, + "epoch": 0.205939529072992, + "step": 4595 + }, + { + "loss": 14.1129, + "grad_norm": 2.0941860675811768, + "learning_rate": 0.0005, + "epoch": 0.2061636199642575, + "step": 4600 + }, + { + "loss": 14.2026, + "grad_norm": 1.8497081995010376, + "learning_rate": 0.0005, + "epoch": 0.206387710855523, + "step": 4605 + }, + { + "loss": 14.2216, + "grad_norm": 2.1611194610595703, + "learning_rate": 0.0005, + "epoch": 0.2066118017467885, + "step": 4610 + }, + { + "loss": 14.2318, + "grad_norm": 2.006192684173584, + "learning_rate": 0.0005, + "epoch": 0.206835892638054, + "step": 4615 + }, + { + "loss": 14.251, + "grad_norm": 2.033385992050171, + "learning_rate": 0.0005, + "epoch": 0.20705998352931948, + "step": 4620 + }, + { + "loss": 14.2138, + "grad_norm": 1.9639863967895508, + "learning_rate": 0.0005, + "epoch": 0.207284074420585, + "step": 4625 + }, + { + "loss": 14.1249, + "grad_norm": 1.7755391597747803, + "learning_rate": 0.0005, + "epoch": 0.20750816531185048, + "step": 4630 + }, + { + "loss": 14.2106, + "grad_norm": 2.0013914108276367, + "learning_rate": 0.0005, + "epoch": 0.20773225620311597, + "step": 4635 + }, + { + "loss": 14.2025, + "grad_norm": 2.1361021995544434, + "learning_rate": 0.0005, + "epoch": 0.2079563470943815, + "step": 4640 + }, + { + "loss": 14.0911, + "grad_norm": 1.9296854734420776, + "learning_rate": 0.0005, + "epoch": 0.20818043798564698, + "step": 4645 + }, + { + "loss": 14.1535, + "grad_norm": 1.9083620309829712, + "learning_rate": 0.0005, + "epoch": 0.20840452887691246, + "step": 4650 + }, + { + "loss": 14.1754, + "grad_norm": 2.1370511054992676, + "learning_rate": 0.0005, + "epoch": 0.20862861976817798, + "step": 4655 + }, + { + "loss": 14.1165, + "grad_norm": 2.0198771953582764, + "learning_rate": 0.0005, + "epoch": 0.20885271065944347, + "step": 4660 + }, + { + "loss": 14.1984, + "grad_norm": 2.125767230987549, + "learning_rate": 0.0005, + "epoch": 0.20907680155070896, + "step": 4665 + }, + { + "loss": 14.1492, + "grad_norm": 2.205829620361328, + "learning_rate": 0.0005, + "epoch": 0.20930089244197447, + "step": 4670 + }, + { + "loss": 14.2795, + "grad_norm": 2.207564353942871, + "learning_rate": 0.0005, + "epoch": 0.20952498333323996, + "step": 4675 + }, + { + "loss": 14.0954, + "grad_norm": 2.261016368865967, + "learning_rate": 0.0005, + "epoch": 0.20974907422450545, + "step": 4680 + }, + { + "loss": 14.1751, + "grad_norm": 2.4380059242248535, + "learning_rate": 0.0005, + "epoch": 0.20997316511577097, + "step": 4685 + }, + { + "loss": 14.1239, + "grad_norm": 2.4530105590820312, + "learning_rate": 0.0005, + "epoch": 0.21019725600703645, + "step": 4690 + }, + { + "loss": 14.0755, + "grad_norm": 2.1958963871002197, + "learning_rate": 0.0005, + "epoch": 0.21042134689830194, + "step": 4695 + }, + { + "loss": 14.1825, + "grad_norm": 1.959038257598877, + "learning_rate": 0.0005, + "epoch": 0.21064543778956746, + "step": 4700 + }, + { + "loss": 14.0358, + "grad_norm": 1.8986763954162598, + "learning_rate": 0.0005, + "epoch": 0.21086952868083295, + "step": 4705 + }, + { + "loss": 14.033, + "grad_norm": 1.932701826095581, + "learning_rate": 0.0005, + "epoch": 0.21109361957209843, + "step": 4710 + }, + { + "loss": 14.1792, + "grad_norm": 1.8619905710220337, + "learning_rate": 0.0005, + "epoch": 0.21131771046336395, + "step": 4715 + }, + { + "loss": 14.0447, + "grad_norm": 1.9640110731124878, + "learning_rate": 0.0005, + "epoch": 0.21154180135462944, + "step": 4720 + }, + { + "loss": 14.0722, + "grad_norm": 1.8437641859054565, + "learning_rate": 0.0005, + "epoch": 0.21176589224589493, + "step": 4725 + }, + { + "loss": 14.1856, + "grad_norm": 1.8629305362701416, + "learning_rate": 0.0005, + "epoch": 0.21198998313716044, + "step": 4730 + }, + { + "loss": 14.1092, + "grad_norm": 2.0573370456695557, + "learning_rate": 0.0005, + "epoch": 0.21221407402842593, + "step": 4735 + }, + { + "loss": 14.1854, + "grad_norm": 1.941266417503357, + "learning_rate": 0.0005, + "epoch": 0.21243816491969142, + "step": 4740 + }, + { + "loss": 14.2779, + "grad_norm": 1.9443109035491943, + "learning_rate": 0.0005, + "epoch": 0.21266225581095693, + "step": 4745 + }, + { + "loss": 14.1528, + "grad_norm": 2.157406806945801, + "learning_rate": 0.0005, + "epoch": 0.21288634670222242, + "step": 4750 + }, + { + "loss": 14.2898, + "grad_norm": 1.987005591392517, + "learning_rate": 0.0005, + "epoch": 0.2131104375934879, + "step": 4755 + }, + { + "loss": 14.2019, + "grad_norm": 1.8471060991287231, + "learning_rate": 0.0005, + "epoch": 0.21333452848475343, + "step": 4760 + }, + { + "loss": 14.1338, + "grad_norm": 1.946158766746521, + "learning_rate": 0.0005, + "epoch": 0.21355861937601892, + "step": 4765 + }, + { + "loss": 14.1958, + "grad_norm": 1.9621422290802002, + "learning_rate": 0.0005, + "epoch": 0.2137827102672844, + "step": 4770 + }, + { + "loss": 14.1579, + "grad_norm": 1.9724212884902954, + "learning_rate": 0.0005, + "epoch": 0.21400680115854992, + "step": 4775 + }, + { + "loss": 14.1578, + "grad_norm": 2.133413076400757, + "learning_rate": 0.0005, + "epoch": 0.2142308920498154, + "step": 4780 + }, + { + "loss": 14.1889, + "grad_norm": 2.089862823486328, + "learning_rate": 0.0005, + "epoch": 0.2144549829410809, + "step": 4785 + }, + { + "loss": 14.1868, + "grad_norm": 2.0941824913024902, + "learning_rate": 0.0005, + "epoch": 0.2146790738323464, + "step": 4790 + }, + { + "loss": 14.1852, + "grad_norm": 2.1381120681762695, + "learning_rate": 0.0005, + "epoch": 0.2149031647236119, + "step": 4795 + }, + { + "loss": 14.1893, + "grad_norm": 1.9739441871643066, + "learning_rate": 0.0005, + "epoch": 0.2151272556148774, + "step": 4800 + }, + { + "loss": 14.1212, + "grad_norm": 2.1892285346984863, + "learning_rate": 0.0005, + "epoch": 0.2153513465061429, + "step": 4805 + }, + { + "loss": 14.216, + "grad_norm": 1.9279417991638184, + "learning_rate": 0.0005, + "epoch": 0.2155754373974084, + "step": 4810 + }, + { + "loss": 14.0294, + "grad_norm": 1.9074068069458008, + "learning_rate": 0.0005, + "epoch": 0.21579952828867388, + "step": 4815 + }, + { + "loss": 14.2165, + "grad_norm": 1.9555555582046509, + "learning_rate": 0.0005, + "epoch": 0.2160236191799394, + "step": 4820 + }, + { + "loss": 14.1804, + "grad_norm": 1.9695144891738892, + "learning_rate": 0.0005, + "epoch": 0.21624771007120488, + "step": 4825 + }, + { + "loss": 14.0645, + "grad_norm": 2.063330888748169, + "learning_rate": 0.0005, + "epoch": 0.21647180096247037, + "step": 4830 + }, + { + "loss": 14.1582, + "grad_norm": 1.9090768098831177, + "learning_rate": 0.0005, + "epoch": 0.2166958918537359, + "step": 4835 + }, + { + "loss": 14.1237, + "grad_norm": 2.066309690475464, + "learning_rate": 0.0005, + "epoch": 0.21691998274500138, + "step": 4840 + }, + { + "loss": 14.121, + "grad_norm": 1.9688724279403687, + "learning_rate": 0.0005, + "epoch": 0.21714407363626687, + "step": 4845 + }, + { + "loss": 14.0986, + "grad_norm": 2.0572214126586914, + "learning_rate": 0.0005, + "epoch": 0.21736816452753235, + "step": 4850 + }, + { + "loss": 14.1856, + "grad_norm": 1.8715074062347412, + "learning_rate": 0.0005, + "epoch": 0.21759225541879787, + "step": 4855 + }, + { + "loss": 14.1442, + "grad_norm": 1.880017876625061, + "learning_rate": 0.0005, + "epoch": 0.21781634631006336, + "step": 4860 + }, + { + "loss": 14.0823, + "grad_norm": 2.1221022605895996, + "learning_rate": 0.0005, + "epoch": 0.21804043720132885, + "step": 4865 + }, + { + "loss": 14.1402, + "grad_norm": 1.8102777004241943, + "learning_rate": 0.0005, + "epoch": 0.21826452809259436, + "step": 4870 + }, + { + "loss": 14.0999, + "grad_norm": 2.0177509784698486, + "learning_rate": 0.0005, + "epoch": 0.21848861898385985, + "step": 4875 + }, + { + "loss": 14.156, + "grad_norm": 1.8971713781356812, + "learning_rate": 0.0005, + "epoch": 0.21871270987512534, + "step": 4880 + }, + { + "loss": 14.1406, + "grad_norm": 1.9269769191741943, + "learning_rate": 0.0005, + "epoch": 0.21893680076639085, + "step": 4885 + }, + { + "loss": 14.062, + "grad_norm": 1.8797369003295898, + "learning_rate": 0.0005, + "epoch": 0.21916089165765634, + "step": 4890 + }, + { + "loss": 14.1249, + "grad_norm": 1.9738274812698364, + "learning_rate": 0.0005, + "epoch": 0.21938498254892183, + "step": 4895 + }, + { + "loss": 14.0797, + "grad_norm": 1.9180667400360107, + "learning_rate": 0.0005, + "epoch": 0.21960907344018735, + "step": 4900 + }, + { + "loss": 14.1443, + "grad_norm": 1.7913436889648438, + "learning_rate": 0.0005, + "epoch": 0.21983316433145283, + "step": 4905 + }, + { + "loss": 14.1768, + "grad_norm": 2.0899739265441895, + "learning_rate": 0.0005, + "epoch": 0.22005725522271832, + "step": 4910 + }, + { + "loss": 14.1679, + "grad_norm": 2.132495164871216, + "learning_rate": 0.0005, + "epoch": 0.22028134611398384, + "step": 4915 + }, + { + "loss": 14.1287, + "grad_norm": 1.8545218706130981, + "learning_rate": 0.0005, + "epoch": 0.22050543700524933, + "step": 4920 + }, + { + "loss": 14.0069, + "grad_norm": 1.779749870300293, + "learning_rate": 0.0005, + "epoch": 0.22072952789651482, + "step": 4925 + }, + { + "loss": 14.0672, + "grad_norm": 1.7925384044647217, + "learning_rate": 0.0005, + "epoch": 0.22095361878778033, + "step": 4930 + }, + { + "loss": 14.0624, + "grad_norm": 1.9922071695327759, + "learning_rate": 0.0005, + "epoch": 0.22117770967904582, + "step": 4935 + }, + { + "loss": 14.0915, + "grad_norm": 1.8443610668182373, + "learning_rate": 0.0005, + "epoch": 0.2214018005703113, + "step": 4940 + }, + { + "loss": 14.1989, + "grad_norm": 2.111323833465576, + "learning_rate": 0.0005, + "epoch": 0.22162589146157682, + "step": 4945 + }, + { + "loss": 14.2195, + "grad_norm": 2.143103837966919, + "learning_rate": 0.0005, + "epoch": 0.2218499823528423, + "step": 4950 + }, + { + "loss": 14.0956, + "grad_norm": 1.8921234607696533, + "learning_rate": 0.0005, + "epoch": 0.2220740732441078, + "step": 4955 + }, + { + "loss": 14.1515, + "grad_norm": 1.8157905340194702, + "learning_rate": 0.0005, + "epoch": 0.22229816413537332, + "step": 4960 + }, + { + "loss": 14.192, + "grad_norm": 1.8960933685302734, + "learning_rate": 0.0005, + "epoch": 0.2225222550266388, + "step": 4965 + }, + { + "loss": 14.2119, + "grad_norm": 2.09665584564209, + "learning_rate": 0.0005, + "epoch": 0.2227463459179043, + "step": 4970 + }, + { + "loss": 14.0471, + "grad_norm": 1.929456114768982, + "learning_rate": 0.0005, + "epoch": 0.2229704368091698, + "step": 4975 + }, + { + "loss": 14.2054, + "grad_norm": 1.9969825744628906, + "learning_rate": 0.0005, + "epoch": 0.2231945277004353, + "step": 4980 + }, + { + "loss": 14.1176, + "grad_norm": 2.014631748199463, + "learning_rate": 0.0005, + "epoch": 0.22341861859170078, + "step": 4985 + }, + { + "loss": 14.1314, + "grad_norm": 2.0011658668518066, + "learning_rate": 0.0005, + "epoch": 0.2236427094829663, + "step": 4990 + }, + { + "loss": 14.0723, + "grad_norm": 1.908673882484436, + "learning_rate": 0.0005, + "epoch": 0.2238668003742318, + "step": 4995 + }, + { + "loss": 14.1676, + "grad_norm": 1.9253995418548584, + "learning_rate": 0.0005, + "epoch": 0.22409089126549728, + "step": 5000 + }, + { + "eval_loss": 1.75691819190979, + "eval_runtime": 18.4767, + "eval_samples_per_second": 886.736, + "eval_steps_per_second": 7.956, + "epoch": 0.22409089126549728, + "step": 5000 + }, + { + "loss": 14.1952, + "grad_norm": 1.9407540559768677, + "learning_rate": 0.0005, + "epoch": 0.2243149821567628, + "step": 5005 + }, + { + "loss": 14.2406, + "grad_norm": 1.813610315322876, + "learning_rate": 0.0005, + "epoch": 0.22453907304802828, + "step": 5010 + }, + { + "loss": 14.143, + "grad_norm": 1.9335557222366333, + "learning_rate": 0.0005, + "epoch": 0.22476316393929377, + "step": 5015 + }, + { + "loss": 14.1897, + "grad_norm": 1.9015038013458252, + "learning_rate": 0.0005, + "epoch": 0.22498725483055929, + "step": 5020 + }, + { + "loss": 14.2873, + "grad_norm": 2.3834633827209473, + "learning_rate": 0.0005, + "epoch": 0.22521134572182477, + "step": 5025 + }, + { + "loss": 14.0845, + "grad_norm": 1.96584951877594, + "learning_rate": 0.0005, + "epoch": 0.22543543661309026, + "step": 5030 + }, + { + "loss": 14.1636, + "grad_norm": 2.146692991256714, + "learning_rate": 0.0005, + "epoch": 0.22565952750435578, + "step": 5035 + }, + { + "loss": 14.2176, + "grad_norm": 1.9806510210037231, + "learning_rate": 0.0005, + "epoch": 0.22588361839562127, + "step": 5040 + }, + { + "loss": 14.2031, + "grad_norm": 1.9274729490280151, + "learning_rate": 0.0005, + "epoch": 0.22610770928688675, + "step": 5045 + }, + { + "loss": 14.1723, + "grad_norm": 1.7561886310577393, + "learning_rate": 0.0005, + "epoch": 0.22633180017815227, + "step": 5050 + }, + { + "loss": 14.1663, + "grad_norm": 1.9986470937728882, + "learning_rate": 0.0005, + "epoch": 0.22655589106941776, + "step": 5055 + }, + { + "loss": 14.1284, + "grad_norm": 2.1279730796813965, + "learning_rate": 0.0005, + "epoch": 0.22677998196068325, + "step": 5060 + }, + { + "loss": 14.0967, + "grad_norm": 1.9476654529571533, + "learning_rate": 0.0005, + "epoch": 0.22700407285194876, + "step": 5065 + }, + { + "loss": 14.1572, + "grad_norm": 1.9759037494659424, + "learning_rate": 0.0005, + "epoch": 0.22722816374321425, + "step": 5070 + }, + { + "loss": 14.0725, + "grad_norm": 2.016960620880127, + "learning_rate": 0.0005, + "epoch": 0.22745225463447974, + "step": 5075 + }, + { + "loss": 14.2644, + "grad_norm": 2.0249178409576416, + "learning_rate": 0.0005, + "epoch": 0.22767634552574526, + "step": 5080 + }, + { + "loss": 14.1046, + "grad_norm": 1.9025015830993652, + "learning_rate": 0.0005, + "epoch": 0.22790043641701074, + "step": 5085 + }, + { + "loss": 14.135, + "grad_norm": 2.043222665786743, + "learning_rate": 0.0005, + "epoch": 0.22812452730827623, + "step": 5090 + }, + { + "loss": 14.2218, + "grad_norm": 1.9227269887924194, + "learning_rate": 0.0005, + "epoch": 0.22834861819954175, + "step": 5095 + }, + { + "loss": 14.1908, + "grad_norm": 1.9823837280273438, + "learning_rate": 0.0005, + "epoch": 0.22857270909080724, + "step": 5100 + }, + { + "loss": 14.2474, + "grad_norm": 2.08957839012146, + "learning_rate": 0.0005, + "epoch": 0.22879679998207272, + "step": 5105 + }, + { + "loss": 14.1789, + "grad_norm": 1.8854748010635376, + "learning_rate": 0.0005, + "epoch": 0.22902089087333824, + "step": 5110 + }, + { + "loss": 14.143, + "grad_norm": 1.9586538076400757, + "learning_rate": 0.0005, + "epoch": 0.22924498176460373, + "step": 5115 + }, + { + "loss": 14.2054, + "grad_norm": 2.168851137161255, + "learning_rate": 0.0005, + "epoch": 0.22946907265586922, + "step": 5120 + }, + { + "loss": 14.2061, + "grad_norm": 1.938035249710083, + "learning_rate": 0.0005, + "epoch": 0.2296931635471347, + "step": 5125 + }, + { + "loss": 14.113, + "grad_norm": 1.8732202053070068, + "learning_rate": 0.0005, + "epoch": 0.22991725443840022, + "step": 5130 + }, + { + "loss": 14.0918, + "grad_norm": 1.9816375970840454, + "learning_rate": 0.0005, + "epoch": 0.2301413453296657, + "step": 5135 + }, + { + "loss": 14.1993, + "grad_norm": 2.076524257659912, + "learning_rate": 0.0005, + "epoch": 0.2303654362209312, + "step": 5140 + }, + { + "loss": 14.2595, + "grad_norm": 2.0364131927490234, + "learning_rate": 0.0005, + "epoch": 0.2305895271121967, + "step": 5145 + }, + { + "loss": 14.1889, + "grad_norm": 1.949928641319275, + "learning_rate": 0.0005, + "epoch": 0.2308136180034622, + "step": 5150 + }, + { + "loss": 14.0974, + "grad_norm": 1.966451644897461, + "learning_rate": 0.0005, + "epoch": 0.2310377088947277, + "step": 5155 + }, + { + "loss": 14.2536, + "grad_norm": 1.9429701566696167, + "learning_rate": 0.0005, + "epoch": 0.2312617997859932, + "step": 5160 + }, + { + "loss": 14.0996, + "grad_norm": 1.9530205726623535, + "learning_rate": 0.0005, + "epoch": 0.2314858906772587, + "step": 5165 + }, + { + "loss": 14.2591, + "grad_norm": 2.3076071739196777, + "learning_rate": 0.0005, + "epoch": 0.23170998156852418, + "step": 5170 + }, + { + "loss": 14.2217, + "grad_norm": 2.1827287673950195, + "learning_rate": 0.0005, + "epoch": 0.2319340724597897, + "step": 5175 + }, + { + "loss": 14.1556, + "grad_norm": 1.8181138038635254, + "learning_rate": 0.0005, + "epoch": 0.23215816335105519, + "step": 5180 + }, + { + "loss": 14.1436, + "grad_norm": 2.006176233291626, + "learning_rate": 0.0005, + "epoch": 0.23238225424232067, + "step": 5185 + }, + { + "loss": 14.1115, + "grad_norm": 1.8907212018966675, + "learning_rate": 0.0005, + "epoch": 0.2326063451335862, + "step": 5190 + }, + { + "loss": 14.1021, + "grad_norm": 1.937179684638977, + "learning_rate": 0.0005, + "epoch": 0.23283043602485168, + "step": 5195 + }, + { + "loss": 14.2557, + "grad_norm": 2.1188528537750244, + "learning_rate": 0.0005, + "epoch": 0.23305452691611717, + "step": 5200 + }, + { + "loss": 14.1039, + "grad_norm": 2.041637420654297, + "learning_rate": 0.0005, + "epoch": 0.23327861780738268, + "step": 5205 + }, + { + "loss": 14.3222, + "grad_norm": 1.9130808115005493, + "learning_rate": 0.0005, + "epoch": 0.23350270869864817, + "step": 5210 + }, + { + "loss": 14.1975, + "grad_norm": 1.881244421005249, + "learning_rate": 0.0005, + "epoch": 0.23372679958991366, + "step": 5215 + }, + { + "loss": 14.0401, + "grad_norm": 1.8800253868103027, + "learning_rate": 0.0005, + "epoch": 0.23395089048117917, + "step": 5220 + }, + { + "loss": 14.157, + "grad_norm": 2.026484489440918, + "learning_rate": 0.0005, + "epoch": 0.23417498137244466, + "step": 5225 + }, + { + "loss": 14.1267, + "grad_norm": 1.979641318321228, + "learning_rate": 0.0005, + "epoch": 0.23439907226371015, + "step": 5230 + }, + { + "loss": 14.1186, + "grad_norm": 1.9195035696029663, + "learning_rate": 0.0005, + "epoch": 0.23462316315497567, + "step": 5235 + }, + { + "loss": 14.2415, + "grad_norm": 1.9984934329986572, + "learning_rate": 0.0005, + "epoch": 0.23484725404624116, + "step": 5240 + }, + { + "loss": 14.2466, + "grad_norm": 1.9173070192337036, + "learning_rate": 0.0005, + "epoch": 0.23507134493750664, + "step": 5245 + }, + { + "loss": 14.2014, + "grad_norm": 2.020751714706421, + "learning_rate": 0.0005, + "epoch": 0.23529543582877216, + "step": 5250 + }, + { + "loss": 14.0758, + "grad_norm": 2.14182448387146, + "learning_rate": 0.0005, + "epoch": 0.23551952672003765, + "step": 5255 + }, + { + "loss": 14.0696, + "grad_norm": 2.078418731689453, + "learning_rate": 0.0005, + "epoch": 0.23574361761130314, + "step": 5260 + }, + { + "loss": 14.1406, + "grad_norm": 2.12013840675354, + "learning_rate": 0.0005, + "epoch": 0.23596770850256865, + "step": 5265 + }, + { + "loss": 14.0492, + "grad_norm": 1.7853872776031494, + "learning_rate": 0.0005, + "epoch": 0.23619179939383414, + "step": 5270 + }, + { + "loss": 14.0794, + "grad_norm": 1.743823766708374, + "learning_rate": 0.0005, + "epoch": 0.23641589028509963, + "step": 5275 + }, + { + "loss": 14.2286, + "grad_norm": 1.872301459312439, + "learning_rate": 0.0005, + "epoch": 0.23663998117636514, + "step": 5280 + }, + { + "loss": 14.2119, + "grad_norm": 2.0029842853546143, + "learning_rate": 0.0005, + "epoch": 0.23686407206763063, + "step": 5285 + }, + { + "loss": 14.1721, + "grad_norm": 2.0815510749816895, + "learning_rate": 0.0005, + "epoch": 0.23708816295889612, + "step": 5290 + }, + { + "loss": 14.1907, + "grad_norm": 1.9053215980529785, + "learning_rate": 0.0005, + "epoch": 0.23731225385016164, + "step": 5295 + }, + { + "loss": 14.1224, + "grad_norm": 1.9221818447113037, + "learning_rate": 0.0005, + "epoch": 0.23753634474142712, + "step": 5300 + }, + { + "loss": 14.1437, + "grad_norm": 2.031381607055664, + "learning_rate": 0.0005, + "epoch": 0.2377604356326926, + "step": 5305 + }, + { + "loss": 14.1592, + "grad_norm": 1.8390190601348877, + "learning_rate": 0.0005, + "epoch": 0.23798452652395813, + "step": 5310 + }, + { + "loss": 14.3036, + "grad_norm": 1.8330848217010498, + "learning_rate": 0.0005, + "epoch": 0.23820861741522362, + "step": 5315 + }, + { + "loss": 14.0815, + "grad_norm": 1.9881000518798828, + "learning_rate": 0.0005, + "epoch": 0.2384327083064891, + "step": 5320 + }, + { + "loss": 14.1096, + "grad_norm": 2.018603563308716, + "learning_rate": 0.0005, + "epoch": 0.23865679919775462, + "step": 5325 + }, + { + "loss": 14.2211, + "grad_norm": 1.9145399332046509, + "learning_rate": 0.0005, + "epoch": 0.2388808900890201, + "step": 5330 + }, + { + "loss": 14.2118, + "grad_norm": 1.8877010345458984, + "learning_rate": 0.0005, + "epoch": 0.2391049809802856, + "step": 5335 + }, + { + "loss": 14.1218, + "grad_norm": 1.8736507892608643, + "learning_rate": 0.0005, + "epoch": 0.2393290718715511, + "step": 5340 + }, + { + "loss": 14.2243, + "grad_norm": 1.9161192178726196, + "learning_rate": 0.0005, + "epoch": 0.2395531627628166, + "step": 5345 + }, + { + "loss": 14.0829, + "grad_norm": 2.1973557472229004, + "learning_rate": 0.0005, + "epoch": 0.2397772536540821, + "step": 5350 + }, + { + "loss": 14.1577, + "grad_norm": 1.886846661567688, + "learning_rate": 0.0005, + "epoch": 0.2400013445453476, + "step": 5355 + }, + { + "loss": 14.2135, + "grad_norm": 1.8616987466812134, + "learning_rate": 0.0005, + "epoch": 0.2402254354366131, + "step": 5360 + }, + { + "loss": 14.1345, + "grad_norm": 2.0593018531799316, + "learning_rate": 0.0005, + "epoch": 0.24044952632787858, + "step": 5365 + }, + { + "loss": 14.1975, + "grad_norm": 1.9629650115966797, + "learning_rate": 0.0005, + "epoch": 0.2406736172191441, + "step": 5370 + }, + { + "loss": 14.1527, + "grad_norm": 1.9321669340133667, + "learning_rate": 0.0005, + "epoch": 0.2408977081104096, + "step": 5375 + }, + { + "loss": 14.2791, + "grad_norm": 1.8482413291931152, + "learning_rate": 0.0005, + "epoch": 0.24112179900167507, + "step": 5380 + }, + { + "loss": 14.1751, + "grad_norm": 1.824684500694275, + "learning_rate": 0.0005, + "epoch": 0.2413458898929406, + "step": 5385 + }, + { + "loss": 14.1206, + "grad_norm": 1.8810324668884277, + "learning_rate": 0.0005, + "epoch": 0.24156998078420608, + "step": 5390 + }, + { + "loss": 14.1139, + "grad_norm": 2.046349048614502, + "learning_rate": 0.0005, + "epoch": 0.24179407167547157, + "step": 5395 + }, + { + "loss": 14.2159, + "grad_norm": 2.128713369369507, + "learning_rate": 0.0005, + "epoch": 0.24201816256673706, + "step": 5400 + }, + { + "loss": 14.2577, + "grad_norm": 2.089057445526123, + "learning_rate": 0.0005, + "epoch": 0.24224225345800257, + "step": 5405 + }, + { + "loss": 14.149, + "grad_norm": 1.968741536140442, + "learning_rate": 0.0005, + "epoch": 0.24246634434926806, + "step": 5410 + }, + { + "loss": 14.1253, + "grad_norm": 1.9236301183700562, + "learning_rate": 0.0005, + "epoch": 0.24269043524053355, + "step": 5415 + }, + { + "loss": 14.1906, + "grad_norm": 2.0584657192230225, + "learning_rate": 0.0005, + "epoch": 0.24291452613179906, + "step": 5420 + }, + { + "loss": 14.1517, + "grad_norm": 2.031520128250122, + "learning_rate": 0.0005, + "epoch": 0.24313861702306455, + "step": 5425 + }, + { + "loss": 14.1071, + "grad_norm": 2.004542827606201, + "learning_rate": 0.0005, + "epoch": 0.24336270791433004, + "step": 5430 + }, + { + "loss": 14.1991, + "grad_norm": 1.9510637521743774, + "learning_rate": 0.0005, + "epoch": 0.24358679880559556, + "step": 5435 + }, + { + "loss": 14.1279, + "grad_norm": 1.9296494722366333, + "learning_rate": 0.0005, + "epoch": 0.24381088969686104, + "step": 5440 + }, + { + "loss": 14.1353, + "grad_norm": 1.9912152290344238, + "learning_rate": 0.0005, + "epoch": 0.24403498058812653, + "step": 5445 + }, + { + "loss": 14.0887, + "grad_norm": 1.7792458534240723, + "learning_rate": 0.0005, + "epoch": 0.24425907147939205, + "step": 5450 + }, + { + "loss": 14.1501, + "grad_norm": 1.7846481800079346, + "learning_rate": 0.0005, + "epoch": 0.24448316237065754, + "step": 5455 + }, + { + "loss": 14.3272, + "grad_norm": 1.8782302141189575, + "learning_rate": 0.0005, + "epoch": 0.24470725326192302, + "step": 5460 + }, + { + "loss": 14.1075, + "grad_norm": 1.8768726587295532, + "learning_rate": 0.0005, + "epoch": 0.24493134415318854, + "step": 5465 + }, + { + "loss": 14.1498, + "grad_norm": 1.882405400276184, + "learning_rate": 0.0005, + "epoch": 0.24515543504445403, + "step": 5470 + }, + { + "loss": 14.0709, + "grad_norm": 2.045891046524048, + "learning_rate": 0.0005, + "epoch": 0.24537952593571952, + "step": 5475 + }, + { + "loss": 14.2524, + "grad_norm": 1.9826222658157349, + "learning_rate": 0.0005, + "epoch": 0.24560361682698503, + "step": 5480 + }, + { + "loss": 14.1392, + "grad_norm": 2.0543572902679443, + "learning_rate": 0.0005, + "epoch": 0.24582770771825052, + "step": 5485 + }, + { + "loss": 14.2088, + "grad_norm": 1.989733338356018, + "learning_rate": 0.0005, + "epoch": 0.246051798609516, + "step": 5490 + }, + { + "loss": 14.2357, + "grad_norm": 1.967455506324768, + "learning_rate": 0.0005, + "epoch": 0.24627588950078153, + "step": 5495 + }, + { + "loss": 14.2458, + "grad_norm": 2.254347085952759, + "learning_rate": 0.0005, + "epoch": 0.246499980392047, + "step": 5500 + }, + { + "eval_loss": 1.76084303855896, + "eval_runtime": 18.6083, + "eval_samples_per_second": 880.468, + "eval_steps_per_second": 7.9, + "epoch": 0.246499980392047, + "step": 5500 + }, + { + "loss": 14.0829, + "grad_norm": 2.1639139652252197, + "learning_rate": 0.0005, + "epoch": 0.2467240712833125, + "step": 5505 + }, + { + "loss": 14.1356, + "grad_norm": 1.9651415348052979, + "learning_rate": 0.0005, + "epoch": 0.24694816217457802, + "step": 5510 + }, + { + "loss": 14.2047, + "grad_norm": 1.8461552858352661, + "learning_rate": 0.0005, + "epoch": 0.2471722530658435, + "step": 5515 + }, + { + "loss": 14.2647, + "grad_norm": 1.8990920782089233, + "learning_rate": 0.0005, + "epoch": 0.247396343957109, + "step": 5520 + }, + { + "loss": 14.2771, + "grad_norm": 1.8398399353027344, + "learning_rate": 0.0005, + "epoch": 0.2476204348483745, + "step": 5525 + }, + { + "loss": 14.2495, + "grad_norm": 1.9693412780761719, + "learning_rate": 0.0005, + "epoch": 0.24784452573964, + "step": 5530 + }, + { + "loss": 14.2004, + "grad_norm": 2.0449378490448, + "learning_rate": 0.0005, + "epoch": 0.2480686166309055, + "step": 5535 + }, + { + "loss": 14.1347, + "grad_norm": 2.1678719520568848, + "learning_rate": 0.0005, + "epoch": 0.248292707522171, + "step": 5540 + }, + { + "loss": 14.0804, + "grad_norm": 1.9390939474105835, + "learning_rate": 0.0005, + "epoch": 0.2485167984134365, + "step": 5545 + }, + { + "loss": 14.1556, + "grad_norm": 2.0248236656188965, + "learning_rate": 0.0005, + "epoch": 0.24874088930470198, + "step": 5550 + }, + { + "loss": 14.118, + "grad_norm": 2.0306644439697266, + "learning_rate": 0.0005, + "epoch": 0.2489649801959675, + "step": 5555 + }, + { + "loss": 14.1897, + "grad_norm": 1.973796010017395, + "learning_rate": 0.0005, + "epoch": 0.24918907108723298, + "step": 5560 + }, + { + "loss": 14.1298, + "grad_norm": 2.283743381500244, + "learning_rate": 0.0005, + "epoch": 0.24941316197849847, + "step": 5565 + }, + { + "loss": 14.1013, + "grad_norm": 1.961428165435791, + "learning_rate": 0.0005, + "epoch": 0.249637252869764, + "step": 5570 + }, + { + "loss": 14.1541, + "grad_norm": 1.8481162786483765, + "learning_rate": 0.0005, + "epoch": 0.24986134376102948, + "step": 5575 + }, + { + "loss": 14.1204, + "grad_norm": 1.7814725637435913, + "learning_rate": 0.0005, + "epoch": 0.250085434652295, + "step": 5580 + }, + { + "loss": 14.1446, + "grad_norm": 1.8474041223526, + "learning_rate": 0.0005, + "epoch": 0.25030952554356045, + "step": 5585 + }, + { + "loss": 14.1353, + "grad_norm": 1.9465347528457642, + "learning_rate": 0.0005, + "epoch": 0.25053361643482597, + "step": 5590 + }, + { + "loss": 14.1041, + "grad_norm": 2.047680616378784, + "learning_rate": 0.0005, + "epoch": 0.2507577073260915, + "step": 5595 + }, + { + "loss": 14.2028, + "grad_norm": 1.9223048686981201, + "learning_rate": 0.0005, + "epoch": 0.25098179821735694, + "step": 5600 + }, + { + "loss": 14.17, + "grad_norm": 2.0348899364471436, + "learning_rate": 0.0005, + "epoch": 0.25120588910862246, + "step": 5605 + }, + { + "loss": 14.0867, + "grad_norm": 2.129124164581299, + "learning_rate": 0.0005, + "epoch": 0.251429979999888, + "step": 5610 + }, + { + "loss": 14.1084, + "grad_norm": 1.8007903099060059, + "learning_rate": 0.0005, + "epoch": 0.25165407089115344, + "step": 5615 + }, + { + "loss": 14.1026, + "grad_norm": 1.8596608638763428, + "learning_rate": 0.0005, + "epoch": 0.25187816178241895, + "step": 5620 + }, + { + "loss": 14.1257, + "grad_norm": 1.837965726852417, + "learning_rate": 0.0005, + "epoch": 0.25210225267368447, + "step": 5625 + }, + { + "loss": 14.0864, + "grad_norm": 2.020282506942749, + "learning_rate": 0.0005, + "epoch": 0.25232634356494993, + "step": 5630 + }, + { + "loss": 14.2164, + "grad_norm": 2.035090446472168, + "learning_rate": 0.0005, + "epoch": 0.25255043445621544, + "step": 5635 + }, + { + "loss": 14.1225, + "grad_norm": 1.9386943578720093, + "learning_rate": 0.0005, + "epoch": 0.25277452534748096, + "step": 5640 + }, + { + "loss": 14.2029, + "grad_norm": 1.9536880254745483, + "learning_rate": 0.0005, + "epoch": 0.2529986162387464, + "step": 5645 + }, + { + "loss": 14.1768, + "grad_norm": 1.7979294061660767, + "learning_rate": 0.0005, + "epoch": 0.25322270713001194, + "step": 5650 + }, + { + "loss": 14.1672, + "grad_norm": 2.0136592388153076, + "learning_rate": 0.0005, + "epoch": 0.25344679802127745, + "step": 5655 + }, + { + "loss": 14.0883, + "grad_norm": 1.9120242595672607, + "learning_rate": 0.0005, + "epoch": 0.2536708889125429, + "step": 5660 + }, + { + "loss": 14.1071, + "grad_norm": 1.9633128643035889, + "learning_rate": 0.0005, + "epoch": 0.25389497980380843, + "step": 5665 + }, + { + "loss": 14.1785, + "grad_norm": 2.052507162094116, + "learning_rate": 0.0005, + "epoch": 0.25411907069507395, + "step": 5670 + }, + { + "loss": 14.1423, + "grad_norm": 1.8795636892318726, + "learning_rate": 0.0005, + "epoch": 0.2543431615863394, + "step": 5675 + }, + { + "loss": 14.1653, + "grad_norm": 1.8552874326705933, + "learning_rate": 0.0005, + "epoch": 0.2545672524776049, + "step": 5680 + }, + { + "loss": 14.0681, + "grad_norm": 1.9075721502304077, + "learning_rate": 0.0005, + "epoch": 0.25479134336887044, + "step": 5685 + }, + { + "loss": 14.2344, + "grad_norm": 2.0484118461608887, + "learning_rate": 0.0005, + "epoch": 0.2550154342601359, + "step": 5690 + }, + { + "loss": 14.177, + "grad_norm": 1.8938542604446411, + "learning_rate": 0.0005, + "epoch": 0.2552395251514014, + "step": 5695 + }, + { + "loss": 14.1684, + "grad_norm": 1.949268102645874, + "learning_rate": 0.0005, + "epoch": 0.25546361604266693, + "step": 5700 + }, + { + "loss": 14.0736, + "grad_norm": 1.8290852308273315, + "learning_rate": 0.0005, + "epoch": 0.2556877069339324, + "step": 5705 + }, + { + "loss": 14.1068, + "grad_norm": 1.9694098234176636, + "learning_rate": 0.0005, + "epoch": 0.2559117978251979, + "step": 5710 + }, + { + "loss": 14.1696, + "grad_norm": 2.0308942794799805, + "learning_rate": 0.0005, + "epoch": 0.2561358887164634, + "step": 5715 + }, + { + "loss": 14.1764, + "grad_norm": 1.825493335723877, + "learning_rate": 0.0005, + "epoch": 0.2563599796077289, + "step": 5720 + }, + { + "loss": 14.0856, + "grad_norm": 1.9709550142288208, + "learning_rate": 0.0005, + "epoch": 0.2565840704989944, + "step": 5725 + }, + { + "loss": 14.18, + "grad_norm": 2.001124620437622, + "learning_rate": 0.0005, + "epoch": 0.2568081613902599, + "step": 5730 + }, + { + "loss": 14.2103, + "grad_norm": 2.108546733856201, + "learning_rate": 0.0005, + "epoch": 0.2570322522815254, + "step": 5735 + }, + { + "loss": 14.0931, + "grad_norm": 2.1841468811035156, + "learning_rate": 0.0005, + "epoch": 0.2572563431727909, + "step": 5740 + }, + { + "loss": 14.1396, + "grad_norm": 2.1639816761016846, + "learning_rate": 0.0005, + "epoch": 0.2574804340640564, + "step": 5745 + }, + { + "loss": 14.1319, + "grad_norm": 2.002530336380005, + "learning_rate": 0.0005, + "epoch": 0.25770452495532187, + "step": 5750 + }, + { + "loss": 14.1058, + "grad_norm": 2.0346779823303223, + "learning_rate": 0.0005, + "epoch": 0.2579286158465874, + "step": 5755 + }, + { + "loss": 14.1493, + "grad_norm": 2.0625033378601074, + "learning_rate": 0.0005, + "epoch": 0.2581527067378529, + "step": 5760 + }, + { + "loss": 14.1357, + "grad_norm": 1.9248889684677124, + "learning_rate": 0.0005, + "epoch": 0.25837679762911836, + "step": 5765 + }, + { + "loss": 14.098, + "grad_norm": 2.009620428085327, + "learning_rate": 0.0005, + "epoch": 0.2586008885203839, + "step": 5770 + }, + { + "loss": 14.1985, + "grad_norm": 2.120296001434326, + "learning_rate": 0.0005, + "epoch": 0.2588249794116494, + "step": 5775 + }, + { + "loss": 14.1445, + "grad_norm": 2.1229355335235596, + "learning_rate": 0.0005, + "epoch": 0.25904907030291485, + "step": 5780 + }, + { + "loss": 14.1839, + "grad_norm": 2.3002853393554688, + "learning_rate": 0.0005, + "epoch": 0.25927316119418037, + "step": 5785 + }, + { + "loss": 14.0847, + "grad_norm": 1.961533784866333, + "learning_rate": 0.0005, + "epoch": 0.2594972520854459, + "step": 5790 + }, + { + "loss": 14.127, + "grad_norm": 2.012505531311035, + "learning_rate": 0.0005, + "epoch": 0.25972134297671134, + "step": 5795 + }, + { + "loss": 14.1247, + "grad_norm": 2.105245590209961, + "learning_rate": 0.0005, + "epoch": 0.25994543386797686, + "step": 5800 + }, + { + "loss": 14.2372, + "grad_norm": 1.9416728019714355, + "learning_rate": 0.0005, + "epoch": 0.2601695247592423, + "step": 5805 + }, + { + "loss": 14.139, + "grad_norm": 2.037787437438965, + "learning_rate": 0.0005, + "epoch": 0.26039361565050784, + "step": 5810 + }, + { + "loss": 14.0952, + "grad_norm": 1.8043665885925293, + "learning_rate": 0.0005, + "epoch": 0.26061770654177335, + "step": 5815 + }, + { + "loss": 14.0973, + "grad_norm": 2.161391496658325, + "learning_rate": 0.0005, + "epoch": 0.2608417974330388, + "step": 5820 + }, + { + "loss": 14.096, + "grad_norm": 2.1336889266967773, + "learning_rate": 0.0005, + "epoch": 0.26106588832430433, + "step": 5825 + }, + { + "loss": 14.2098, + "grad_norm": 1.9870775938034058, + "learning_rate": 0.0005, + "epoch": 0.26128997921556985, + "step": 5830 + }, + { + "loss": 14.1767, + "grad_norm": 1.848604440689087, + "learning_rate": 0.0005, + "epoch": 0.2615140701068353, + "step": 5835 + }, + { + "loss": 14.1075, + "grad_norm": 1.8512866497039795, + "learning_rate": 0.0005, + "epoch": 0.2617381609981008, + "step": 5840 + }, + { + "loss": 14.2583, + "grad_norm": 2.112514019012451, + "learning_rate": 0.0005, + "epoch": 0.26196225188936634, + "step": 5845 + }, + { + "loss": 14.1369, + "grad_norm": 2.0032267570495605, + "learning_rate": 0.0005, + "epoch": 0.2621863427806318, + "step": 5850 + }, + { + "loss": 14.1011, + "grad_norm": 2.060760259628296, + "learning_rate": 0.0005, + "epoch": 0.2624104336718973, + "step": 5855 + }, + { + "loss": 14.1127, + "grad_norm": 1.7968379259109497, + "learning_rate": 0.0005, + "epoch": 0.26263452456316283, + "step": 5860 + }, + { + "loss": 14.2577, + "grad_norm": 1.912384271621704, + "learning_rate": 0.0005, + "epoch": 0.2628586154544283, + "step": 5865 + }, + { + "loss": 14.0495, + "grad_norm": 1.9313682317733765, + "learning_rate": 0.0005, + "epoch": 0.2630827063456938, + "step": 5870 + }, + { + "loss": 14.232, + "grad_norm": 1.9164438247680664, + "learning_rate": 0.0005, + "epoch": 0.2633067972369593, + "step": 5875 + }, + { + "loss": 14.159, + "grad_norm": 2.094454526901245, + "learning_rate": 0.0005, + "epoch": 0.2635308881282248, + "step": 5880 + }, + { + "loss": 14.0457, + "grad_norm": 1.9597326517105103, + "learning_rate": 0.0005, + "epoch": 0.2637549790194903, + "step": 5885 + }, + { + "loss": 14.0497, + "grad_norm": 1.7545689344406128, + "learning_rate": 0.0005, + "epoch": 0.2639790699107558, + "step": 5890 + }, + { + "loss": 14.1202, + "grad_norm": 1.9736093282699585, + "learning_rate": 0.0005, + "epoch": 0.2642031608020213, + "step": 5895 + }, + { + "loss": 14.1309, + "grad_norm": 2.03840970993042, + "learning_rate": 0.0005, + "epoch": 0.2644272516932868, + "step": 5900 + }, + { + "loss": 14.1956, + "grad_norm": 1.9478352069854736, + "learning_rate": 0.0005, + "epoch": 0.2646513425845523, + "step": 5905 + }, + { + "loss": 14.1178, + "grad_norm": 2.0999906063079834, + "learning_rate": 0.0005, + "epoch": 0.26487543347581777, + "step": 5910 + }, + { + "loss": 14.0856, + "grad_norm": 2.1087260246276855, + "learning_rate": 0.0005, + "epoch": 0.2650995243670833, + "step": 5915 + }, + { + "loss": 14.1338, + "grad_norm": 1.7892274856567383, + "learning_rate": 0.0005, + "epoch": 0.2653236152583488, + "step": 5920 + }, + { + "loss": 14.1355, + "grad_norm": 2.0246315002441406, + "learning_rate": 0.0005, + "epoch": 0.26554770614961426, + "step": 5925 + }, + { + "loss": 14.1132, + "grad_norm": 1.98887038230896, + "learning_rate": 0.0005, + "epoch": 0.2657717970408798, + "step": 5930 + }, + { + "loss": 14.2338, + "grad_norm": 2.2294838428497314, + "learning_rate": 0.0005, + "epoch": 0.2659958879321453, + "step": 5935 + }, + { + "loss": 14.1011, + "grad_norm": 1.8237981796264648, + "learning_rate": 0.0005, + "epoch": 0.26621997882341075, + "step": 5940 + }, + { + "loss": 14.0735, + "grad_norm": 2.0049431324005127, + "learning_rate": 0.0005, + "epoch": 0.26644406971467627, + "step": 5945 + }, + { + "loss": 14.0917, + "grad_norm": 2.1553215980529785, + "learning_rate": 0.0005, + "epoch": 0.2666681606059418, + "step": 5950 + }, + { + "loss": 14.0225, + "grad_norm": 2.1619985103607178, + "learning_rate": 0.0005, + "epoch": 0.26689225149720724, + "step": 5955 + }, + { + "loss": 14.1076, + "grad_norm": 1.9701801538467407, + "learning_rate": 0.0005, + "epoch": 0.26711634238847276, + "step": 5960 + }, + { + "loss": 14.0973, + "grad_norm": 2.117084264755249, + "learning_rate": 0.0005, + "epoch": 0.2673404332797383, + "step": 5965 + }, + { + "loss": 14.1038, + "grad_norm": 1.9590649604797363, + "learning_rate": 0.0005, + "epoch": 0.26756452417100374, + "step": 5970 + }, + { + "loss": 14.1344, + "grad_norm": 1.9777534008026123, + "learning_rate": 0.0005, + "epoch": 0.26778861506226925, + "step": 5975 + }, + { + "loss": 14.0876, + "grad_norm": 2.003319025039673, + "learning_rate": 0.0005, + "epoch": 0.26801270595353477, + "step": 5980 + }, + { + "loss": 14.1786, + "grad_norm": 1.934256911277771, + "learning_rate": 0.0005, + "epoch": 0.26823679684480023, + "step": 5985 + }, + { + "loss": 14.1779, + "grad_norm": 1.8747920989990234, + "learning_rate": 0.0005, + "epoch": 0.26846088773606575, + "step": 5990 + }, + { + "loss": 13.9991, + "grad_norm": 2.0487060546875, + "learning_rate": 0.0005, + "epoch": 0.26868497862733126, + "step": 5995 + }, + { + "loss": 14.0573, + "grad_norm": 2.062567949295044, + "learning_rate": 0.0005, + "epoch": 0.2689090695185967, + "step": 6000 + }, + { + "eval_loss": 1.7624342441558838, + "eval_runtime": 18.6015, + "eval_samples_per_second": 880.791, + "eval_steps_per_second": 7.903, + "epoch": 0.2689090695185967, + "step": 6000 + }, + { + "loss": 14.0626, + "grad_norm": 1.9452993869781494, + "learning_rate": 0.0005, + "epoch": 0.26913316040986224, + "step": 6005 + }, + { + "loss": 14.1218, + "grad_norm": 1.8791625499725342, + "learning_rate": 0.0005, + "epoch": 0.26935725130112775, + "step": 6010 + }, + { + "loss": 14.042, + "grad_norm": 1.9103970527648926, + "learning_rate": 0.0005, + "epoch": 0.2695813421923932, + "step": 6015 + }, + { + "loss": 14.1955, + "grad_norm": 1.7610692977905273, + "learning_rate": 0.0005, + "epoch": 0.26980543308365873, + "step": 6020 + }, + { + "loss": 14.2647, + "grad_norm": 1.9507147073745728, + "learning_rate": 0.0005, + "epoch": 0.27002952397492425, + "step": 6025 + }, + { + "loss": 14.1833, + "grad_norm": 1.9560884237289429, + "learning_rate": 0.0005, + "epoch": 0.2702536148661897, + "step": 6030 + }, + { + "loss": 14.2063, + "grad_norm": 1.982217788696289, + "learning_rate": 0.0005, + "epoch": 0.2704777057574552, + "step": 6035 + }, + { + "loss": 14.2817, + "grad_norm": 1.8248435258865356, + "learning_rate": 0.0005, + "epoch": 0.27070179664872074, + "step": 6040 + }, + { + "loss": 14.0953, + "grad_norm": 2.075336456298828, + "learning_rate": 0.0005, + "epoch": 0.2709258875399862, + "step": 6045 + }, + { + "loss": 14.1181, + "grad_norm": 1.8482534885406494, + "learning_rate": 0.0005, + "epoch": 0.2711499784312517, + "step": 6050 + }, + { + "loss": 14.1048, + "grad_norm": 1.7765411138534546, + "learning_rate": 0.0005, + "epoch": 0.27137406932251723, + "step": 6055 + }, + { + "loss": 14.1664, + "grad_norm": 2.0339853763580322, + "learning_rate": 0.0005, + "epoch": 0.2715981602137827, + "step": 6060 + }, + { + "loss": 14.0792, + "grad_norm": 2.048095464706421, + "learning_rate": 0.0005, + "epoch": 0.2718222511050482, + "step": 6065 + }, + { + "loss": 14.1056, + "grad_norm": 1.952912449836731, + "learning_rate": 0.0005, + "epoch": 0.2720463419963137, + "step": 6070 + }, + { + "loss": 14.2421, + "grad_norm": 2.003805160522461, + "learning_rate": 0.0005, + "epoch": 0.2722704328875792, + "step": 6075 + }, + { + "loss": 14.1109, + "grad_norm": 1.9438632726669312, + "learning_rate": 0.0005, + "epoch": 0.2724945237788447, + "step": 6080 + }, + { + "loss": 14.1035, + "grad_norm": 2.0806822776794434, + "learning_rate": 0.0005, + "epoch": 0.2727186146701102, + "step": 6085 + }, + { + "loss": 14.1132, + "grad_norm": 1.8243623971939087, + "learning_rate": 0.0005, + "epoch": 0.2729427055613757, + "step": 6090 + }, + { + "loss": 14.0947, + "grad_norm": 2.221346139907837, + "learning_rate": 0.0005, + "epoch": 0.2731667964526412, + "step": 6095 + }, + { + "loss": 14.1149, + "grad_norm": 1.9295768737792969, + "learning_rate": 0.0005, + "epoch": 0.2733908873439067, + "step": 6100 + }, + { + "loss": 14.0679, + "grad_norm": 1.9353907108306885, + "learning_rate": 0.0005, + "epoch": 0.27361497823517217, + "step": 6105 + }, + { + "loss": 14.1163, + "grad_norm": 1.8101950883865356, + "learning_rate": 0.0005, + "epoch": 0.2738390691264377, + "step": 6110 + }, + { + "loss": 14.184, + "grad_norm": 1.8892567157745361, + "learning_rate": 0.0005, + "epoch": 0.2740631600177032, + "step": 6115 + }, + { + "loss": 14.191, + "grad_norm": 1.8542805910110474, + "learning_rate": 0.0005, + "epoch": 0.27428725090896866, + "step": 6120 + }, + { + "loss": 14.1419, + "grad_norm": 1.95559823513031, + "learning_rate": 0.0005, + "epoch": 0.2745113418002342, + "step": 6125 + }, + { + "loss": 14.1413, + "grad_norm": 2.100402355194092, + "learning_rate": 0.0005, + "epoch": 0.2747354326914997, + "step": 6130 + }, + { + "loss": 14.1712, + "grad_norm": 1.8818026781082153, + "learning_rate": 0.0005, + "epoch": 0.27495952358276515, + "step": 6135 + }, + { + "loss": 14.1133, + "grad_norm": 2.1112377643585205, + "learning_rate": 0.0005, + "epoch": 0.27518361447403067, + "step": 6140 + }, + { + "loss": 14.1174, + "grad_norm": 1.9072048664093018, + "learning_rate": 0.0005, + "epoch": 0.2754077053652962, + "step": 6145 + }, + { + "loss": 14.16, + "grad_norm": 1.8370167016983032, + "learning_rate": 0.0005, + "epoch": 0.27563179625656165, + "step": 6150 + }, + { + "loss": 14.1799, + "grad_norm": 1.7252795696258545, + "learning_rate": 0.0005, + "epoch": 0.27585588714782716, + "step": 6155 + }, + { + "loss": 14.152, + "grad_norm": 1.9398436546325684, + "learning_rate": 0.0005, + "epoch": 0.2760799780390927, + "step": 6160 + }, + { + "loss": 14.0982, + "grad_norm": 1.8849056959152222, + "learning_rate": 0.0005, + "epoch": 0.27630406893035814, + "step": 6165 + }, + { + "loss": 14.2026, + "grad_norm": 2.120668888092041, + "learning_rate": 0.0005, + "epoch": 0.27652815982162365, + "step": 6170 + }, + { + "loss": 14.1554, + "grad_norm": 2.057431221008301, + "learning_rate": 0.0005, + "epoch": 0.27675225071288917, + "step": 6175 + }, + { + "loss": 14.0881, + "grad_norm": 1.9021568298339844, + "learning_rate": 0.0005, + "epoch": 0.27697634160415463, + "step": 6180 + }, + { + "loss": 14.1402, + "grad_norm": 1.9208106994628906, + "learning_rate": 0.0005, + "epoch": 0.27720043249542015, + "step": 6185 + }, + { + "loss": 14.1664, + "grad_norm": 2.031012773513794, + "learning_rate": 0.0005, + "epoch": 0.27742452338668566, + "step": 6190 + }, + { + "loss": 14.0986, + "grad_norm": 1.9070667028427124, + "learning_rate": 0.0005, + "epoch": 0.2776486142779511, + "step": 6195 + }, + { + "loss": 14.1555, + "grad_norm": 2.1638131141662598, + "learning_rate": 0.0005, + "epoch": 0.27787270516921664, + "step": 6200 + }, + { + "loss": 14.0978, + "grad_norm": 1.982803225517273, + "learning_rate": 0.0005, + "epoch": 0.27809679606048215, + "step": 6205 + }, + { + "loss": 14.1707, + "grad_norm": 1.9024537801742554, + "learning_rate": 0.0005, + "epoch": 0.2783208869517476, + "step": 6210 + }, + { + "loss": 14.1495, + "grad_norm": 1.9770755767822266, + "learning_rate": 0.0005, + "epoch": 0.27854497784301313, + "step": 6215 + }, + { + "loss": 14.0563, + "grad_norm": 1.8895354270935059, + "learning_rate": 0.0005, + "epoch": 0.27876906873427865, + "step": 6220 + }, + { + "loss": 13.9898, + "grad_norm": 1.9661836624145508, + "learning_rate": 0.0005, + "epoch": 0.2789931596255441, + "step": 6225 + }, + { + "loss": 14.1714, + "grad_norm": 2.0561423301696777, + "learning_rate": 0.0005, + "epoch": 0.2792172505168096, + "step": 6230 + }, + { + "loss": 14.2114, + "grad_norm": 2.0155813694000244, + "learning_rate": 0.0005, + "epoch": 0.27944134140807514, + "step": 6235 + }, + { + "loss": 14.1845, + "grad_norm": 1.8753061294555664, + "learning_rate": 0.0005, + "epoch": 0.2796654322993406, + "step": 6240 + }, + { + "loss": 14.2542, + "grad_norm": 1.7123141288757324, + "learning_rate": 0.0005, + "epoch": 0.2798895231906061, + "step": 6245 + }, + { + "loss": 14.1507, + "grad_norm": 1.9749904870986938, + "learning_rate": 0.0005, + "epoch": 0.28011361408187163, + "step": 6250 + }, + { + "loss": 14.1013, + "grad_norm": 1.9573017358779907, + "learning_rate": 0.0005, + "epoch": 0.2803377049731371, + "step": 6255 + }, + { + "loss": 14.107, + "grad_norm": 1.9311197996139526, + "learning_rate": 0.0005, + "epoch": 0.2805617958644026, + "step": 6260 + }, + { + "loss": 14.1186, + "grad_norm": 1.9432008266448975, + "learning_rate": 0.0005, + "epoch": 0.2807858867556681, + "step": 6265 + }, + { + "loss": 14.1111, + "grad_norm": 2.0310680866241455, + "learning_rate": 0.0005, + "epoch": 0.2810099776469336, + "step": 6270 + }, + { + "loss": 14.0357, + "grad_norm": 1.7939568758010864, + "learning_rate": 0.0005, + "epoch": 0.2812340685381991, + "step": 6275 + }, + { + "loss": 14.1525, + "grad_norm": 1.9812079668045044, + "learning_rate": 0.0005, + "epoch": 0.2814581594294646, + "step": 6280 + }, + { + "loss": 14.0599, + "grad_norm": 1.7790424823760986, + "learning_rate": 0.0005, + "epoch": 0.2816822503207301, + "step": 6285 + }, + { + "loss": 14.1505, + "grad_norm": 1.9458509683609009, + "learning_rate": 0.0005, + "epoch": 0.2819063412119956, + "step": 6290 + }, + { + "loss": 14.1274, + "grad_norm": 1.8281060457229614, + "learning_rate": 0.0005, + "epoch": 0.2821304321032611, + "step": 6295 + }, + { + "loss": 14.0824, + "grad_norm": 2.062748908996582, + "learning_rate": 0.0005, + "epoch": 0.28235452299452657, + "step": 6300 + }, + { + "loss": 14.1564, + "grad_norm": 2.144453763961792, + "learning_rate": 0.0005, + "epoch": 0.2825786138857921, + "step": 6305 + }, + { + "loss": 14.2729, + "grad_norm": 2.056806802749634, + "learning_rate": 0.0005, + "epoch": 0.2828027047770576, + "step": 6310 + }, + { + "loss": 14.0951, + "grad_norm": 1.949935793876648, + "learning_rate": 0.0005, + "epoch": 0.28302679566832306, + "step": 6315 + }, + { + "loss": 14.2044, + "grad_norm": 1.8974496126174927, + "learning_rate": 0.0005, + "epoch": 0.2832508865595886, + "step": 6320 + }, + { + "loss": 14.0844, + "grad_norm": 1.9938383102416992, + "learning_rate": 0.0005, + "epoch": 0.2834749774508541, + "step": 6325 + }, + { + "loss": 14.0717, + "grad_norm": 2.000389575958252, + "learning_rate": 0.0005, + "epoch": 0.28369906834211955, + "step": 6330 + }, + { + "loss": 14.1252, + "grad_norm": 1.9282065629959106, + "learning_rate": 0.0005, + "epoch": 0.28392315923338507, + "step": 6335 + }, + { + "loss": 14.1557, + "grad_norm": 1.9459840059280396, + "learning_rate": 0.0005, + "epoch": 0.2841472501246506, + "step": 6340 + }, + { + "loss": 14.1244, + "grad_norm": 2.0548715591430664, + "learning_rate": 0.0005, + "epoch": 0.28437134101591605, + "step": 6345 + }, + { + "loss": 14.2648, + "grad_norm": 1.9488264322280884, + "learning_rate": 0.0005, + "epoch": 0.28459543190718156, + "step": 6350 + }, + { + "loss": 14.2416, + "grad_norm": 1.9849375486373901, + "learning_rate": 0.0005, + "epoch": 0.2848195227984471, + "step": 6355 + }, + { + "loss": 14.0532, + "grad_norm": 2.030043840408325, + "learning_rate": 0.0005, + "epoch": 0.28504361368971254, + "step": 6360 + }, + { + "loss": 14.1358, + "grad_norm": 1.9630738496780396, + "learning_rate": 0.0005, + "epoch": 0.28526770458097805, + "step": 6365 + }, + { + "loss": 14.1398, + "grad_norm": 1.936179757118225, + "learning_rate": 0.0005, + "epoch": 0.2854917954722435, + "step": 6370 + }, + { + "loss": 14.147, + "grad_norm": 1.8546801805496216, + "learning_rate": 0.0005, + "epoch": 0.28571588636350903, + "step": 6375 + }, + { + "loss": 14.1357, + "grad_norm": 1.7892138957977295, + "learning_rate": 0.0005, + "epoch": 0.28593997725477455, + "step": 6380 + }, + { + "loss": 14.1072, + "grad_norm": 2.2381319999694824, + "learning_rate": 0.0005, + "epoch": 0.28616406814604, + "step": 6385 + }, + { + "loss": 14.2096, + "grad_norm": 1.8750665187835693, + "learning_rate": 0.0005, + "epoch": 0.2863881590373055, + "step": 6390 + }, + { + "loss": 14.1466, + "grad_norm": 1.993774652481079, + "learning_rate": 0.0005, + "epoch": 0.28661224992857104, + "step": 6395 + }, + { + "loss": 14.1791, + "grad_norm": 1.9932512044906616, + "learning_rate": 0.0005, + "epoch": 0.2868363408198365, + "step": 6400 + }, + { + "loss": 13.9481, + "grad_norm": 1.9475047588348389, + "learning_rate": 0.0005, + "epoch": 0.287060431711102, + "step": 6405 + }, + { + "loss": 14.1085, + "grad_norm": 1.8606120347976685, + "learning_rate": 0.0005, + "epoch": 0.28728452260236753, + "step": 6410 + }, + { + "loss": 14.0986, + "grad_norm": 1.98119056224823, + "learning_rate": 0.0005, + "epoch": 0.287508613493633, + "step": 6415 + }, + { + "loss": 14.0864, + "grad_norm": 1.93229341506958, + "learning_rate": 0.0005, + "epoch": 0.2877327043848985, + "step": 6420 + }, + { + "loss": 14.1505, + "grad_norm": 2.0001397132873535, + "learning_rate": 0.0005, + "epoch": 0.287956795276164, + "step": 6425 + }, + { + "loss": 14.1825, + "grad_norm": 2.1840929985046387, + "learning_rate": 0.0005, + "epoch": 0.2881808861674295, + "step": 6430 + }, + { + "loss": 14.0742, + "grad_norm": 1.9578980207443237, + "learning_rate": 0.0005, + "epoch": 0.288404977058695, + "step": 6435 + }, + { + "loss": 14.0583, + "grad_norm": 1.9192719459533691, + "learning_rate": 0.0005, + "epoch": 0.2886290679499605, + "step": 6440 + }, + { + "loss": 14.1828, + "grad_norm": 1.7977638244628906, + "learning_rate": 0.0005, + "epoch": 0.288853158841226, + "step": 6445 + }, + { + "loss": 14.1839, + "grad_norm": 1.9880512952804565, + "learning_rate": 0.0005, + "epoch": 0.2890772497324915, + "step": 6450 + }, + { + "loss": 14.1521, + "grad_norm": 1.9016677141189575, + "learning_rate": 0.0005, + "epoch": 0.289301340623757, + "step": 6455 + }, + { + "loss": 14.0941, + "grad_norm": 2.009845018386841, + "learning_rate": 0.0005, + "epoch": 0.28952543151502247, + "step": 6460 + }, + { + "loss": 14.0867, + "grad_norm": 1.9944508075714111, + "learning_rate": 0.0005, + "epoch": 0.289749522406288, + "step": 6465 + }, + { + "loss": 14.12, + "grad_norm": 1.9572532176971436, + "learning_rate": 0.0005, + "epoch": 0.2899736132975535, + "step": 6470 + }, + { + "loss": 14.1033, + "grad_norm": 1.8286199569702148, + "learning_rate": 0.0005, + "epoch": 0.29019770418881896, + "step": 6475 + }, + { + "loss": 14.238, + "grad_norm": 1.8490030765533447, + "learning_rate": 0.0005, + "epoch": 0.2904217950800845, + "step": 6480 + }, + { + "loss": 14.0584, + "grad_norm": 1.883685827255249, + "learning_rate": 0.0005, + "epoch": 0.29064588597135, + "step": 6485 + }, + { + "loss": 14.1139, + "grad_norm": 1.8208873271942139, + "learning_rate": 0.0005, + "epoch": 0.29086997686261545, + "step": 6490 + }, + { + "loss": 14.1691, + "grad_norm": 1.79149329662323, + "learning_rate": 0.0005, + "epoch": 0.29109406775388097, + "step": 6495 + }, + { + "loss": 14.131, + "grad_norm": 1.8157325983047485, + "learning_rate": 0.0005, + "epoch": 0.2913181586451465, + "step": 6500 + }, + { + "eval_loss": 1.760825514793396, + "eval_runtime": 18.5425, + "eval_samples_per_second": 883.592, + "eval_steps_per_second": 7.928, + "epoch": 0.2913181586451465, + "step": 6500 + }, + { + "loss": 14.1269, + "grad_norm": 1.978583812713623, + "learning_rate": 0.0005, + "epoch": 0.29154224953641195, + "step": 6505 + }, + { + "loss": 14.1458, + "grad_norm": 1.9699724912643433, + "learning_rate": 0.0005, + "epoch": 0.29176634042767746, + "step": 6510 + }, + { + "loss": 14.127, + "grad_norm": 1.8420236110687256, + "learning_rate": 0.0005, + "epoch": 0.291990431318943, + "step": 6515 + }, + { + "loss": 14.1576, + "grad_norm": 2.1266531944274902, + "learning_rate": 0.0005, + "epoch": 0.29221452221020844, + "step": 6520 + }, + { + "loss": 14.057, + "grad_norm": 1.9557487964630127, + "learning_rate": 0.0005, + "epoch": 0.29243861310147395, + "step": 6525 + }, + { + "loss": 14.0618, + "grad_norm": 2.1013238430023193, + "learning_rate": 0.0005, + "epoch": 0.29266270399273947, + "step": 6530 + }, + { + "loss": 14.1422, + "grad_norm": 1.920631766319275, + "learning_rate": 0.0005, + "epoch": 0.29288679488400493, + "step": 6535 + }, + { + "loss": 14.0785, + "grad_norm": 1.8109145164489746, + "learning_rate": 0.0005, + "epoch": 0.29311088577527045, + "step": 6540 + }, + { + "loss": 14.036, + "grad_norm": 2.0602829456329346, + "learning_rate": 0.0005, + "epoch": 0.29333497666653596, + "step": 6545 + }, + { + "loss": 14.0582, + "grad_norm": 2.0971171855926514, + "learning_rate": 0.0005, + "epoch": 0.2935590675578014, + "step": 6550 + }, + { + "loss": 14.1295, + "grad_norm": 1.8590223789215088, + "learning_rate": 0.0005, + "epoch": 0.29378315844906694, + "step": 6555 + }, + { + "loss": 14.0767, + "grad_norm": 1.782094120979309, + "learning_rate": 0.0005, + "epoch": 0.29400724934033245, + "step": 6560 + }, + { + "loss": 14.1499, + "grad_norm": 2.0229482650756836, + "learning_rate": 0.0005, + "epoch": 0.2942313402315979, + "step": 6565 + }, + { + "loss": 14.0854, + "grad_norm": 2.0149924755096436, + "learning_rate": 0.0005, + "epoch": 0.29445543112286343, + "step": 6570 + }, + { + "loss": 14.1079, + "grad_norm": 1.915675401687622, + "learning_rate": 0.0005, + "epoch": 0.29467952201412895, + "step": 6575 + }, + { + "loss": 14.088, + "grad_norm": 1.950971245765686, + "learning_rate": 0.0005, + "epoch": 0.2949036129053944, + "step": 6580 + }, + { + "loss": 14.1089, + "grad_norm": 1.8409794569015503, + "learning_rate": 0.0005, + "epoch": 0.2951277037966599, + "step": 6585 + }, + { + "loss": 14.1737, + "grad_norm": 1.8910725116729736, + "learning_rate": 0.0005, + "epoch": 0.29535179468792544, + "step": 6590 + }, + { + "loss": 14.212, + "grad_norm": 2.172785997390747, + "learning_rate": 0.0005, + "epoch": 0.2955758855791909, + "step": 6595 + }, + { + "loss": 14.1142, + "grad_norm": 1.8371375799179077, + "learning_rate": 0.0005, + "epoch": 0.2957999764704564, + "step": 6600 + }, + { + "loss": 14.1355, + "grad_norm": 2.132636308670044, + "learning_rate": 0.0005, + "epoch": 0.29602406736172193, + "step": 6605 + }, + { + "loss": 14.1586, + "grad_norm": 1.8999865055084229, + "learning_rate": 0.0005, + "epoch": 0.2962481582529874, + "step": 6610 + }, + { + "loss": 14.0566, + "grad_norm": 1.8841404914855957, + "learning_rate": 0.0005, + "epoch": 0.2964722491442529, + "step": 6615 + }, + { + "loss": 14.1065, + "grad_norm": 1.9028798341751099, + "learning_rate": 0.0005, + "epoch": 0.2966963400355184, + "step": 6620 + }, + { + "loss": 14.221, + "grad_norm": 1.945740818977356, + "learning_rate": 0.0005, + "epoch": 0.2969204309267839, + "step": 6625 + }, + { + "loss": 14.1517, + "grad_norm": 1.882527232170105, + "learning_rate": 0.0005, + "epoch": 0.2971445218180494, + "step": 6630 + }, + { + "loss": 14.0765, + "grad_norm": 1.7825379371643066, + "learning_rate": 0.0005, + "epoch": 0.2973686127093149, + "step": 6635 + }, + { + "loss": 14.2537, + "grad_norm": 1.8401821851730347, + "learning_rate": 0.0005, + "epoch": 0.2975927036005804, + "step": 6640 + }, + { + "loss": 14.1541, + "grad_norm": 2.0787389278411865, + "learning_rate": 0.0005, + "epoch": 0.2978167944918459, + "step": 6645 + }, + { + "loss": 14.118, + "grad_norm": 1.9669815301895142, + "learning_rate": 0.0005, + "epoch": 0.2980408853831114, + "step": 6650 + }, + { + "loss": 14.1166, + "grad_norm": 1.958560824394226, + "learning_rate": 0.0005, + "epoch": 0.29826497627437687, + "step": 6655 + }, + { + "loss": 14.0878, + "grad_norm": 1.839176893234253, + "learning_rate": 0.0005, + "epoch": 0.2984890671656424, + "step": 6660 + }, + { + "loss": 14.0742, + "grad_norm": 1.9698538780212402, + "learning_rate": 0.0005, + "epoch": 0.2987131580569079, + "step": 6665 + }, + { + "loss": 14.11, + "grad_norm": 1.977521300315857, + "learning_rate": 0.0005, + "epoch": 0.29893724894817336, + "step": 6670 + }, + { + "loss": 14.0945, + "grad_norm": 1.8831443786621094, + "learning_rate": 0.0005, + "epoch": 0.2991613398394389, + "step": 6675 + }, + { + "loss": 14.1022, + "grad_norm": 1.9751718044281006, + "learning_rate": 0.0005, + "epoch": 0.2993854307307044, + "step": 6680 + }, + { + "loss": 14.0919, + "grad_norm": 1.9496080875396729, + "learning_rate": 0.0005, + "epoch": 0.29960952162196985, + "step": 6685 + }, + { + "loss": 14.1258, + "grad_norm": 1.9235432147979736, + "learning_rate": 0.0005, + "epoch": 0.29983361251323537, + "step": 6690 + }, + { + "loss": 14.2083, + "grad_norm": 2.081127166748047, + "learning_rate": 0.0005, + "epoch": 0.3000577034045009, + "step": 6695 + }, + { + "loss": 14.1192, + "grad_norm": 1.8495169878005981, + "learning_rate": 0.0005, + "epoch": 0.30028179429576635, + "step": 6700 + }, + { + "loss": 14.1334, + "grad_norm": 1.945540189743042, + "learning_rate": 0.0005, + "epoch": 0.30050588518703186, + "step": 6705 + }, + { + "loss": 14.0117, + "grad_norm": 1.8660441637039185, + "learning_rate": 0.0005, + "epoch": 0.3007299760782974, + "step": 6710 + }, + { + "loss": 14.0557, + "grad_norm": 2.1192219257354736, + "learning_rate": 0.0005, + "epoch": 0.30095406696956284, + "step": 6715 + }, + { + "loss": 14.144, + "grad_norm": 2.134988307952881, + "learning_rate": 0.0005, + "epoch": 0.30117815786082835, + "step": 6720 + }, + { + "loss": 14.2177, + "grad_norm": 1.8217895030975342, + "learning_rate": 0.0005, + "epoch": 0.30140224875209387, + "step": 6725 + }, + { + "loss": 14.1873, + "grad_norm": 1.897010087966919, + "learning_rate": 0.0005, + "epoch": 0.30162633964335933, + "step": 6730 + }, + { + "loss": 14.0938, + "grad_norm": 1.7802810668945312, + "learning_rate": 0.0005, + "epoch": 0.30185043053462485, + "step": 6735 + }, + { + "loss": 14.1105, + "grad_norm": 1.8884286880493164, + "learning_rate": 0.0005, + "epoch": 0.30207452142589036, + "step": 6740 + }, + { + "loss": 14.0623, + "grad_norm": 1.925511360168457, + "learning_rate": 0.0005, + "epoch": 0.3022986123171558, + "step": 6745 + }, + { + "loss": 14.078, + "grad_norm": 1.9921625852584839, + "learning_rate": 0.0005, + "epoch": 0.30252270320842134, + "step": 6750 + }, + { + "loss": 14.1964, + "grad_norm": 2.1286556720733643, + "learning_rate": 0.0005, + "epoch": 0.30274679409968686, + "step": 6755 + }, + { + "loss": 14.2371, + "grad_norm": 1.959270715713501, + "learning_rate": 0.0005, + "epoch": 0.3029708849909523, + "step": 6760 + }, + { + "loss": 13.9704, + "grad_norm": 2.141359806060791, + "learning_rate": 0.0005, + "epoch": 0.30319497588221783, + "step": 6765 + }, + { + "loss": 14.1439, + "grad_norm": 1.8885819911956787, + "learning_rate": 0.0005, + "epoch": 0.30341906677348335, + "step": 6770 + }, + { + "loss": 14.1109, + "grad_norm": 1.8617545366287231, + "learning_rate": 0.0005, + "epoch": 0.3036431576647488, + "step": 6775 + }, + { + "loss": 14.101, + "grad_norm": 2.027916431427002, + "learning_rate": 0.0005, + "epoch": 0.3038672485560143, + "step": 6780 + }, + { + "loss": 13.9948, + "grad_norm": 1.952297568321228, + "learning_rate": 0.0005, + "epoch": 0.30409133944727984, + "step": 6785 + }, + { + "loss": 14.0892, + "grad_norm": 1.88018000125885, + "learning_rate": 0.0005, + "epoch": 0.3043154303385453, + "step": 6790 + }, + { + "loss": 14.0721, + "grad_norm": 2.2838850021362305, + "learning_rate": 0.0005, + "epoch": 0.3045395212298108, + "step": 6795 + }, + { + "loss": 14.1025, + "grad_norm": 2.2587685585021973, + "learning_rate": 0.0005, + "epoch": 0.30476361212107633, + "step": 6800 + }, + { + "loss": 14.1222, + "grad_norm": 2.1795482635498047, + "learning_rate": 0.0005, + "epoch": 0.3049877030123418, + "step": 6805 + }, + { + "loss": 14.2536, + "grad_norm": 1.8609881401062012, + "learning_rate": 0.0005, + "epoch": 0.3052117939036073, + "step": 6810 + }, + { + "loss": 14.0862, + "grad_norm": 2.1692614555358887, + "learning_rate": 0.0005, + "epoch": 0.3054358847948728, + "step": 6815 + }, + { + "loss": 14.059, + "grad_norm": 2.0320756435394287, + "learning_rate": 0.0005, + "epoch": 0.3056599756861383, + "step": 6820 + }, + { + "loss": 14.0452, + "grad_norm": 2.1237852573394775, + "learning_rate": 0.0005, + "epoch": 0.3058840665774038, + "step": 6825 + }, + { + "loss": 14.0988, + "grad_norm": 1.8583446741104126, + "learning_rate": 0.0005, + "epoch": 0.3061081574686693, + "step": 6830 + }, + { + "loss": 14.0915, + "grad_norm": 2.0603761672973633, + "learning_rate": 0.0005, + "epoch": 0.3063322483599348, + "step": 6835 + }, + { + "loss": 14.0048, + "grad_norm": 2.0896894931793213, + "learning_rate": 0.0005, + "epoch": 0.3065563392512003, + "step": 6840 + }, + { + "loss": 14.2185, + "grad_norm": 2.165130376815796, + "learning_rate": 0.0005, + "epoch": 0.3067804301424658, + "step": 6845 + }, + { + "loss": 14.157, + "grad_norm": 2.0199732780456543, + "learning_rate": 0.0005, + "epoch": 0.30700452103373127, + "step": 6850 + }, + { + "loss": 14.1059, + "grad_norm": 2.059459686279297, + "learning_rate": 0.0005, + "epoch": 0.3072286119249968, + "step": 6855 + }, + { + "loss": 14.1432, + "grad_norm": 2.1732242107391357, + "learning_rate": 0.0005, + "epoch": 0.3074527028162623, + "step": 6860 + }, + { + "loss": 14.0289, + "grad_norm": 1.854026198387146, + "learning_rate": 0.0005, + "epoch": 0.30767679370752776, + "step": 6865 + }, + { + "loss": 14.0782, + "grad_norm": 1.9383268356323242, + "learning_rate": 0.0005, + "epoch": 0.3079008845987933, + "step": 6870 + }, + { + "loss": 14.0815, + "grad_norm": 1.9054582118988037, + "learning_rate": 0.0005, + "epoch": 0.3081249754900588, + "step": 6875 + }, + { + "loss": 14.1134, + "grad_norm": 1.9737333059310913, + "learning_rate": 0.0005, + "epoch": 0.30834906638132425, + "step": 6880 + }, + { + "loss": 14.0502, + "grad_norm": 1.8631350994110107, + "learning_rate": 0.0005, + "epoch": 0.30857315727258977, + "step": 6885 + }, + { + "loss": 14.1507, + "grad_norm": 1.9060652256011963, + "learning_rate": 0.0005, + "epoch": 0.3087972481638553, + "step": 6890 + }, + { + "loss": 14.0591, + "grad_norm": 2.0781631469726562, + "learning_rate": 0.0005, + "epoch": 0.30902133905512075, + "step": 6895 + }, + { + "loss": 14.012, + "grad_norm": 1.8879958391189575, + "learning_rate": 0.0005, + "epoch": 0.30924542994638626, + "step": 6900 + }, + { + "loss": 14.168, + "grad_norm": 1.9677551984786987, + "learning_rate": 0.0005, + "epoch": 0.3094695208376518, + "step": 6905 + }, + { + "loss": 14.0628, + "grad_norm": 2.168494462966919, + "learning_rate": 0.0005, + "epoch": 0.30969361172891724, + "step": 6910 + }, + { + "loss": 14.1082, + "grad_norm": 2.0503041744232178, + "learning_rate": 0.0005, + "epoch": 0.30991770262018276, + "step": 6915 + }, + { + "loss": 14.1525, + "grad_norm": 1.9112447500228882, + "learning_rate": 0.0005, + "epoch": 0.3101417935114482, + "step": 6920 + }, + { + "loss": 14.0622, + "grad_norm": 2.0286989212036133, + "learning_rate": 0.0005, + "epoch": 0.31036588440271373, + "step": 6925 + }, + { + "loss": 14.0246, + "grad_norm": 2.076118230819702, + "learning_rate": 0.0005, + "epoch": 0.31058997529397925, + "step": 6930 + }, + { + "loss": 14.1809, + "grad_norm": 2.0124707221984863, + "learning_rate": 0.0005, + "epoch": 0.3108140661852447, + "step": 6935 + }, + { + "loss": 14.0884, + "grad_norm": 1.8748708963394165, + "learning_rate": 0.0005, + "epoch": 0.3110381570765102, + "step": 6940 + }, + { + "loss": 14.0756, + "grad_norm": 1.9558440446853638, + "learning_rate": 0.0005, + "epoch": 0.31126224796777574, + "step": 6945 + }, + { + "loss": 14.082, + "grad_norm": 1.950158715248108, + "learning_rate": 0.0005, + "epoch": 0.3114863388590412, + "step": 6950 + }, + { + "loss": 14.0608, + "grad_norm": 2.2592644691467285, + "learning_rate": 0.0005, + "epoch": 0.3117104297503067, + "step": 6955 + }, + { + "loss": 14.1437, + "grad_norm": 1.9274147748947144, + "learning_rate": 0.0005, + "epoch": 0.31193452064157223, + "step": 6960 + }, + { + "loss": 14.1519, + "grad_norm": 1.9249687194824219, + "learning_rate": 0.0005, + "epoch": 0.3121586115328377, + "step": 6965 + }, + { + "loss": 14.1133, + "grad_norm": 1.9677976369857788, + "learning_rate": 0.0005, + "epoch": 0.3123827024241032, + "step": 6970 + }, + { + "loss": 14.027, + "grad_norm": 1.7901554107666016, + "learning_rate": 0.0005, + "epoch": 0.3126067933153687, + "step": 6975 + }, + { + "loss": 14.0051, + "grad_norm": 2.0552713871002197, + "learning_rate": 0.0005, + "epoch": 0.3128308842066342, + "step": 6980 + }, + { + "loss": 14.0906, + "grad_norm": 1.7620432376861572, + "learning_rate": 0.0005, + "epoch": 0.3130549750978997, + "step": 6985 + }, + { + "loss": 14.0703, + "grad_norm": 2.0076029300689697, + "learning_rate": 0.0005, + "epoch": 0.3132790659891652, + "step": 6990 + }, + { + "loss": 14.0815, + "grad_norm": 1.863505244255066, + "learning_rate": 0.0005, + "epoch": 0.3135031568804307, + "step": 6995 + }, + { + "loss": 14.1361, + "grad_norm": 1.9563758373260498, + "learning_rate": 0.0005, + "epoch": 0.3137272477716962, + "step": 7000 + }, + { + "eval_loss": 1.755889892578125, + "eval_runtime": 18.4089, + "eval_samples_per_second": 890.003, + "eval_steps_per_second": 7.985, + "epoch": 0.3137272477716962, + "step": 7000 + }, + { + "loss": 14.1046, + "grad_norm": 1.793610692024231, + "learning_rate": 0.0005, + "epoch": 0.3139513386629617, + "step": 7005 + }, + { + "loss": 14.0576, + "grad_norm": 1.8872146606445312, + "learning_rate": 0.0005, + "epoch": 0.31417542955422717, + "step": 7010 + }, + { + "loss": 14.2113, + "grad_norm": 2.029392719268799, + "learning_rate": 0.0005, + "epoch": 0.3143995204454927, + "step": 7015 + }, + { + "loss": 14.1018, + "grad_norm": 1.9868321418762207, + "learning_rate": 0.0005, + "epoch": 0.3146236113367582, + "step": 7020 + }, + { + "loss": 14.2401, + "grad_norm": 1.868456244468689, + "learning_rate": 0.0005, + "epoch": 0.31484770222802366, + "step": 7025 + }, + { + "loss": 14.1273, + "grad_norm": 1.926793098449707, + "learning_rate": 0.0005, + "epoch": 0.3150717931192892, + "step": 7030 + }, + { + "loss": 14.0381, + "grad_norm": 2.0602972507476807, + "learning_rate": 0.0005, + "epoch": 0.3152958840105547, + "step": 7035 + }, + { + "loss": 14.1099, + "grad_norm": 1.9188587665557861, + "learning_rate": 0.0005, + "epoch": 0.31551997490182015, + "step": 7040 + }, + { + "loss": 14.0984, + "grad_norm": 1.8581831455230713, + "learning_rate": 0.0005, + "epoch": 0.31574406579308567, + "step": 7045 + }, + { + "loss": 14.094, + "grad_norm": 1.9398704767227173, + "learning_rate": 0.0005, + "epoch": 0.3159681566843512, + "step": 7050 + }, + { + "loss": 14.1393, + "grad_norm": 2.011657953262329, + "learning_rate": 0.0005, + "epoch": 0.31619224757561665, + "step": 7055 + }, + { + "loss": 14.1263, + "grad_norm": 1.9346157312393188, + "learning_rate": 0.0005, + "epoch": 0.31641633846688216, + "step": 7060 + }, + { + "loss": 14.2194, + "grad_norm": 2.0041491985321045, + "learning_rate": 0.0005, + "epoch": 0.3166404293581477, + "step": 7065 + }, + { + "loss": 14.1546, + "grad_norm": 1.8866181373596191, + "learning_rate": 0.0005, + "epoch": 0.31686452024941314, + "step": 7070 + }, + { + "loss": 14.1474, + "grad_norm": 1.9563933610916138, + "learning_rate": 0.0005, + "epoch": 0.31708861114067866, + "step": 7075 + }, + { + "loss": 14.1298, + "grad_norm": 2.010119915008545, + "learning_rate": 0.0005, + "epoch": 0.31731270203194417, + "step": 7080 + }, + { + "loss": 14.1502, + "grad_norm": 2.1588711738586426, + "learning_rate": 0.0005, + "epoch": 0.31753679292320963, + "step": 7085 + }, + { + "loss": 14.1125, + "grad_norm": 2.150607109069824, + "learning_rate": 0.0005, + "epoch": 0.31776088381447515, + "step": 7090 + }, + { + "loss": 14.1021, + "grad_norm": 2.117875099182129, + "learning_rate": 0.0005, + "epoch": 0.31798497470574066, + "step": 7095 + }, + { + "loss": 14.0199, + "grad_norm": 1.9777783155441284, + "learning_rate": 0.0005, + "epoch": 0.3182090655970061, + "step": 7100 + }, + { + "loss": 14.0376, + "grad_norm": 2.0438952445983887, + "learning_rate": 0.0005, + "epoch": 0.31843315648827164, + "step": 7105 + }, + { + "loss": 14.1883, + "grad_norm": 2.117734432220459, + "learning_rate": 0.0005, + "epoch": 0.31865724737953716, + "step": 7110 + }, + { + "loss": 14.0348, + "grad_norm": 1.8688557147979736, + "learning_rate": 0.0005, + "epoch": 0.3188813382708026, + "step": 7115 + }, + { + "loss": 14.108, + "grad_norm": 1.9363583326339722, + "learning_rate": 0.0005, + "epoch": 0.31910542916206813, + "step": 7120 + }, + { + "loss": 14.11, + "grad_norm": 1.835010051727295, + "learning_rate": 0.0005, + "epoch": 0.31932952005333365, + "step": 7125 + }, + { + "loss": 14.0241, + "grad_norm": 1.6924667358398438, + "learning_rate": 0.0005, + "epoch": 0.3195536109445991, + "step": 7130 + }, + { + "loss": 14.0406, + "grad_norm": 1.8923516273498535, + "learning_rate": 0.0005, + "epoch": 0.3197777018358646, + "step": 7135 + }, + { + "loss": 14.1686, + "grad_norm": 2.049652338027954, + "learning_rate": 0.0005, + "epoch": 0.32000179272713014, + "step": 7140 + }, + { + "loss": 14.0831, + "grad_norm": 1.9465097188949585, + "learning_rate": 0.0005, + "epoch": 0.3202258836183956, + "step": 7145 + }, + { + "loss": 14.1339, + "grad_norm": 1.9414552450180054, + "learning_rate": 0.0005, + "epoch": 0.3204499745096611, + "step": 7150 + }, + { + "loss": 14.0689, + "grad_norm": 1.7531272172927856, + "learning_rate": 0.0005, + "epoch": 0.32067406540092663, + "step": 7155 + }, + { + "loss": 14.1045, + "grad_norm": 1.9618552923202515, + "learning_rate": 0.0005, + "epoch": 0.3208981562921921, + "step": 7160 + }, + { + "loss": 14.0208, + "grad_norm": 1.8808432817459106, + "learning_rate": 0.0005, + "epoch": 0.3211222471834576, + "step": 7165 + }, + { + "loss": 14.0674, + "grad_norm": 2.10398530960083, + "learning_rate": 0.0005, + "epoch": 0.3213463380747231, + "step": 7170 + }, + { + "loss": 14.1006, + "grad_norm": 1.930496096611023, + "learning_rate": 0.0005, + "epoch": 0.3215704289659886, + "step": 7175 + }, + { + "loss": 14.101, + "grad_norm": 2.353649139404297, + "learning_rate": 0.0005, + "epoch": 0.3217945198572541, + "step": 7180 + }, + { + "loss": 14.0608, + "grad_norm": 2.090111017227173, + "learning_rate": 0.0005, + "epoch": 0.3220186107485196, + "step": 7185 + }, + { + "loss": 14.1003, + "grad_norm": 1.959304928779602, + "learning_rate": 0.0005, + "epoch": 0.3222427016397851, + "step": 7190 + }, + { + "loss": 14.0667, + "grad_norm": 1.8106657266616821, + "learning_rate": 0.0005, + "epoch": 0.3224667925310506, + "step": 7195 + }, + { + "loss": 14.1066, + "grad_norm": 1.7654136419296265, + "learning_rate": 0.0005, + "epoch": 0.3226908834223161, + "step": 7200 + }, + { + "loss": 14.1614, + "grad_norm": 1.8367727994918823, + "learning_rate": 0.0005, + "epoch": 0.32291497431358157, + "step": 7205 + }, + { + "loss": 14.1704, + "grad_norm": 1.9710397720336914, + "learning_rate": 0.0005, + "epoch": 0.3231390652048471, + "step": 7210 + }, + { + "loss": 14.1573, + "grad_norm": 1.7852445840835571, + "learning_rate": 0.0005, + "epoch": 0.3233631560961126, + "step": 7215 + }, + { + "loss": 14.2409, + "grad_norm": 1.9539028406143188, + "learning_rate": 0.0005, + "epoch": 0.32358724698737806, + "step": 7220 + }, + { + "loss": 14.0829, + "grad_norm": 1.8820106983184814, + "learning_rate": 0.0005, + "epoch": 0.3238113378786436, + "step": 7225 + }, + { + "loss": 13.9979, + "grad_norm": 1.8565396070480347, + "learning_rate": 0.0005, + "epoch": 0.3240354287699091, + "step": 7230 + }, + { + "loss": 13.9935, + "grad_norm": 1.9052975177764893, + "learning_rate": 0.0005, + "epoch": 0.32425951966117456, + "step": 7235 + }, + { + "loss": 14.1653, + "grad_norm": 1.9659631252288818, + "learning_rate": 0.0005, + "epoch": 0.32448361055244007, + "step": 7240 + }, + { + "loss": 14.1506, + "grad_norm": 2.118605375289917, + "learning_rate": 0.0005, + "epoch": 0.3247077014437056, + "step": 7245 + }, + { + "loss": 14.2135, + "grad_norm": 2.056361436843872, + "learning_rate": 0.0005, + "epoch": 0.32493179233497105, + "step": 7250 + }, + { + "loss": 14.1589, + "grad_norm": 2.0376381874084473, + "learning_rate": 0.0005, + "epoch": 0.32515588322623656, + "step": 7255 + }, + { + "loss": 14.0433, + "grad_norm": 2.0109989643096924, + "learning_rate": 0.0005, + "epoch": 0.3253799741175021, + "step": 7260 + }, + { + "loss": 14.1103, + "grad_norm": 1.9517523050308228, + "learning_rate": 0.0005, + "epoch": 0.32560406500876754, + "step": 7265 + }, + { + "loss": 14.052, + "grad_norm": 2.0111052989959717, + "learning_rate": 0.0005, + "epoch": 0.32582815590003306, + "step": 7270 + }, + { + "loss": 14.0868, + "grad_norm": 1.9086047410964966, + "learning_rate": 0.0005, + "epoch": 0.32605224679129857, + "step": 7275 + }, + { + "loss": 14.1952, + "grad_norm": 1.7707585096359253, + "learning_rate": 0.0005, + "epoch": 0.32627633768256403, + "step": 7280 + }, + { + "loss": 14.149, + "grad_norm": 1.7972558736801147, + "learning_rate": 0.0005, + "epoch": 0.32650042857382955, + "step": 7285 + }, + { + "loss": 14.0481, + "grad_norm": 1.899881362915039, + "learning_rate": 0.0005, + "epoch": 0.32672451946509506, + "step": 7290 + }, + { + "loss": 14.1443, + "grad_norm": 1.952229619026184, + "learning_rate": 0.0005, + "epoch": 0.3269486103563605, + "step": 7295 + }, + { + "loss": 14.1473, + "grad_norm": 1.8101708889007568, + "learning_rate": 0.0005, + "epoch": 0.32717270124762604, + "step": 7300 + }, + { + "loss": 14.0977, + "grad_norm": 2.0748109817504883, + "learning_rate": 0.0005, + "epoch": 0.32739679213889156, + "step": 7305 + }, + { + "loss": 14.1556, + "grad_norm": 1.8937422037124634, + "learning_rate": 0.0005, + "epoch": 0.327620883030157, + "step": 7310 + }, + { + "loss": 13.9949, + "grad_norm": 2.060307502746582, + "learning_rate": 0.0005, + "epoch": 0.32784497392142253, + "step": 7315 + }, + { + "loss": 14.193, + "grad_norm": 1.9886871576309204, + "learning_rate": 0.0005, + "epoch": 0.32806906481268805, + "step": 7320 + }, + { + "loss": 14.1288, + "grad_norm": 1.7452255487442017, + "learning_rate": 0.0005, + "epoch": 0.3282931557039535, + "step": 7325 + }, + { + "loss": 14.0731, + "grad_norm": 1.8011245727539062, + "learning_rate": 0.0005, + "epoch": 0.328517246595219, + "step": 7330 + }, + { + "loss": 14.1108, + "grad_norm": 1.8402856588363647, + "learning_rate": 0.0005, + "epoch": 0.32874133748648454, + "step": 7335 + }, + { + "loss": 14.1586, + "grad_norm": 1.8013571500778198, + "learning_rate": 0.0005, + "epoch": 0.32896542837775, + "step": 7340 + }, + { + "loss": 14.0897, + "grad_norm": 1.759786605834961, + "learning_rate": 0.0005, + "epoch": 0.3291895192690155, + "step": 7345 + }, + { + "loss": 14.1886, + "grad_norm": 1.9093493223190308, + "learning_rate": 0.0005, + "epoch": 0.32941361016028103, + "step": 7350 + }, + { + "loss": 14.0965, + "grad_norm": 1.7506341934204102, + "learning_rate": 0.0005, + "epoch": 0.3296377010515465, + "step": 7355 + }, + { + "loss": 14.007, + "grad_norm": 1.922635793685913, + "learning_rate": 0.0005, + "epoch": 0.329861791942812, + "step": 7360 + }, + { + "loss": 14.127, + "grad_norm": 1.9030444622039795, + "learning_rate": 0.0005, + "epoch": 0.3300858828340775, + "step": 7365 + }, + { + "loss": 14.0548, + "grad_norm": 1.9263081550598145, + "learning_rate": 0.0005, + "epoch": 0.330309973725343, + "step": 7370 + }, + { + "loss": 14.1499, + "grad_norm": 1.7502036094665527, + "learning_rate": 0.0005, + "epoch": 0.3305340646166085, + "step": 7375 + }, + { + "loss": 14.0007, + "grad_norm": 2.1209959983825684, + "learning_rate": 0.0005, + "epoch": 0.330758155507874, + "step": 7380 + }, + { + "loss": 14.0438, + "grad_norm": 2.0219545364379883, + "learning_rate": 0.0005, + "epoch": 0.3309822463991395, + "step": 7385 + }, + { + "loss": 14.1204, + "grad_norm": 1.984423041343689, + "learning_rate": 0.0005, + "epoch": 0.331206337290405, + "step": 7390 + }, + { + "loss": 14.1094, + "grad_norm": 2.0163533687591553, + "learning_rate": 0.0005, + "epoch": 0.3314304281816705, + "step": 7395 + }, + { + "loss": 14.1005, + "grad_norm": 1.7359834909439087, + "learning_rate": 0.0005, + "epoch": 0.33165451907293597, + "step": 7400 + }, + { + "loss": 14.1199, + "grad_norm": 1.8637721538543701, + "learning_rate": 0.0005, + "epoch": 0.3318786099642015, + "step": 7405 + }, + { + "loss": 14.1242, + "grad_norm": 1.927276372909546, + "learning_rate": 0.0005, + "epoch": 0.332102700855467, + "step": 7410 + }, + { + "loss": 14.1951, + "grad_norm": 2.265612840652466, + "learning_rate": 0.0005, + "epoch": 0.33232679174673246, + "step": 7415 + }, + { + "loss": 14.1503, + "grad_norm": 1.9447145462036133, + "learning_rate": 0.0005, + "epoch": 0.332550882637998, + "step": 7420 + }, + { + "loss": 14.0934, + "grad_norm": 1.9777686595916748, + "learning_rate": 0.0005, + "epoch": 0.3327749735292635, + "step": 7425 + }, + { + "loss": 14.0585, + "grad_norm": 1.8262908458709717, + "learning_rate": 0.0005, + "epoch": 0.33299906442052896, + "step": 7430 + }, + { + "loss": 14.0943, + "grad_norm": 2.1320831775665283, + "learning_rate": 0.0005, + "epoch": 0.33322315531179447, + "step": 7435 + }, + { + "loss": 14.1044, + "grad_norm": 2.061400890350342, + "learning_rate": 0.0005, + "epoch": 0.33344724620306, + "step": 7440 + }, + { + "loss": 14.0905, + "grad_norm": 1.9482510089874268, + "learning_rate": 0.0005, + "epoch": 0.33367133709432545, + "step": 7445 + }, + { + "loss": 14.1586, + "grad_norm": 1.8687303066253662, + "learning_rate": 0.0005, + "epoch": 0.33389542798559096, + "step": 7450 + }, + { + "loss": 13.9945, + "grad_norm": 1.830513834953308, + "learning_rate": 0.0005, + "epoch": 0.3341195188768565, + "step": 7455 + }, + { + "loss": 14.1178, + "grad_norm": 1.9088151454925537, + "learning_rate": 0.0005, + "epoch": 0.33434360976812194, + "step": 7460 + }, + { + "loss": 14.0652, + "grad_norm": 1.9446756839752197, + "learning_rate": 0.0005, + "epoch": 0.33456770065938746, + "step": 7465 + }, + { + "loss": 14.0836, + "grad_norm": 1.9063694477081299, + "learning_rate": 0.0005, + "epoch": 0.3347917915506529, + "step": 7470 + }, + { + "loss": 14.1995, + "grad_norm": 2.0292277336120605, + "learning_rate": 0.0005, + "epoch": 0.33501588244191843, + "step": 7475 + }, + { + "loss": 14.0583, + "grad_norm": 1.8030997514724731, + "learning_rate": 0.0005, + "epoch": 0.33523997333318395, + "step": 7480 + }, + { + "loss": 14.0631, + "grad_norm": 1.8953999280929565, + "learning_rate": 0.0005, + "epoch": 0.3354640642244494, + "step": 7485 + }, + { + "loss": 14.0377, + "grad_norm": 1.8723889589309692, + "learning_rate": 0.0005, + "epoch": 0.3356881551157149, + "step": 7490 + }, + { + "loss": 14.1068, + "grad_norm": 2.212399959564209, + "learning_rate": 0.0005, + "epoch": 0.33591224600698044, + "step": 7495 + }, + { + "loss": 14.2251, + "grad_norm": 1.9140340089797974, + "learning_rate": 0.0005, + "epoch": 0.3361363368982459, + "step": 7500 + }, + { + "eval_loss": 1.758628249168396, + "eval_runtime": 18.346, + "eval_samples_per_second": 893.054, + "eval_steps_per_second": 8.013, + "epoch": 0.3361363368982459, + "step": 7500 + }, + { + "loss": 14.0754, + "grad_norm": 1.972398042678833, + "learning_rate": 0.0005, + "epoch": 0.3363604277895114, + "step": 7505 + }, + { + "loss": 14.1095, + "grad_norm": 2.093316078186035, + "learning_rate": 0.0005, + "epoch": 0.33658451868077693, + "step": 7510 + }, + { + "loss": 14.1565, + "grad_norm": 1.9581537246704102, + "learning_rate": 0.0005, + "epoch": 0.3368086095720424, + "step": 7515 + }, + { + "loss": 14.0715, + "grad_norm": 1.897660732269287, + "learning_rate": 0.0005, + "epoch": 0.3370327004633079, + "step": 7520 + }, + { + "loss": 14.1641, + "grad_norm": 1.992495059967041, + "learning_rate": 0.0005, + "epoch": 0.3372567913545734, + "step": 7525 + }, + { + "loss": 14.1386, + "grad_norm": 1.9933212995529175, + "learning_rate": 0.0005, + "epoch": 0.3374808822458389, + "step": 7530 + }, + { + "loss": 14.0411, + "grad_norm": 2.030996561050415, + "learning_rate": 0.0005, + "epoch": 0.3377049731371044, + "step": 7535 + }, + { + "loss": 14.1321, + "grad_norm": 1.9262065887451172, + "learning_rate": 0.0005, + "epoch": 0.3379290640283699, + "step": 7540 + }, + { + "loss": 14.1055, + "grad_norm": 2.041747808456421, + "learning_rate": 0.0005, + "epoch": 0.3381531549196354, + "step": 7545 + }, + { + "loss": 14.103, + "grad_norm": 1.9697359800338745, + "learning_rate": 0.0005, + "epoch": 0.3383772458109009, + "step": 7550 + }, + { + "loss": 14.0462, + "grad_norm": 1.863049864768982, + "learning_rate": 0.0005, + "epoch": 0.3386013367021664, + "step": 7555 + }, + { + "loss": 13.9718, + "grad_norm": 1.8301945924758911, + "learning_rate": 0.0005, + "epoch": 0.33882542759343187, + "step": 7560 + }, + { + "loss": 13.9904, + "grad_norm": 1.8847441673278809, + "learning_rate": 0.0005, + "epoch": 0.3390495184846974, + "step": 7565 + }, + { + "loss": 14.0646, + "grad_norm": 1.8905744552612305, + "learning_rate": 0.0005, + "epoch": 0.3392736093759629, + "step": 7570 + }, + { + "loss": 14.0983, + "grad_norm": 2.0466771125793457, + "learning_rate": 0.0005, + "epoch": 0.33949770026722836, + "step": 7575 + }, + { + "loss": 14.0606, + "grad_norm": 1.8803582191467285, + "learning_rate": 0.0005, + "epoch": 0.3397217911584939, + "step": 7580 + }, + { + "loss": 14.078, + "grad_norm": 1.9058480262756348, + "learning_rate": 0.0005, + "epoch": 0.3399458820497594, + "step": 7585 + }, + { + "loss": 13.9842, + "grad_norm": 1.855478286743164, + "learning_rate": 0.0005, + "epoch": 0.34016997294102486, + "step": 7590 + }, + { + "loss": 14.0922, + "grad_norm": 1.8349817991256714, + "learning_rate": 0.0005, + "epoch": 0.34039406383229037, + "step": 7595 + }, + { + "loss": 14.0174, + "grad_norm": 2.0688560009002686, + "learning_rate": 0.0005, + "epoch": 0.3406181547235559, + "step": 7600 + }, + { + "loss": 14.0327, + "grad_norm": 2.0723955631256104, + "learning_rate": 0.0005, + "epoch": 0.34084224561482135, + "step": 7605 + }, + { + "loss": 14.0648, + "grad_norm": 1.9300533533096313, + "learning_rate": 0.0005, + "epoch": 0.34106633650608686, + "step": 7610 + }, + { + "loss": 14.1158, + "grad_norm": 1.8659693002700806, + "learning_rate": 0.0005, + "epoch": 0.3412904273973524, + "step": 7615 + }, + { + "loss": 14.0362, + "grad_norm": 2.035356283187866, + "learning_rate": 0.0005, + "epoch": 0.34151451828861784, + "step": 7620 + }, + { + "loss": 14.1092, + "grad_norm": 1.8583754301071167, + "learning_rate": 0.0005, + "epoch": 0.34173860917988336, + "step": 7625 + }, + { + "loss": 14.0497, + "grad_norm": 1.897011637687683, + "learning_rate": 0.0005, + "epoch": 0.3419627000711489, + "step": 7630 + }, + { + "loss": 14.1952, + "grad_norm": 1.9849909543991089, + "learning_rate": 0.0005, + "epoch": 0.34218679096241433, + "step": 7635 + }, + { + "loss": 14.0395, + "grad_norm": 1.7245944738388062, + "learning_rate": 0.0005, + "epoch": 0.34241088185367985, + "step": 7640 + }, + { + "loss": 14.0689, + "grad_norm": 1.7866997718811035, + "learning_rate": 0.0005, + "epoch": 0.34263497274494537, + "step": 7645 + }, + { + "loss": 14.1168, + "grad_norm": 2.009934186935425, + "learning_rate": 0.0005, + "epoch": 0.3428590636362108, + "step": 7650 + }, + { + "loss": 14.1508, + "grad_norm": 1.8619006872177124, + "learning_rate": 0.0005, + "epoch": 0.34308315452747634, + "step": 7655 + }, + { + "loss": 14.1192, + "grad_norm": 1.9013493061065674, + "learning_rate": 0.0005, + "epoch": 0.34330724541874186, + "step": 7660 + }, + { + "loss": 14.0782, + "grad_norm": 1.7894394397735596, + "learning_rate": 0.0005, + "epoch": 0.3435313363100073, + "step": 7665 + }, + { + "loss": 14.0461, + "grad_norm": 1.9507958889007568, + "learning_rate": 0.0005, + "epoch": 0.34375542720127283, + "step": 7670 + }, + { + "loss": 14.0361, + "grad_norm": 1.9026128053665161, + "learning_rate": 0.0005, + "epoch": 0.34397951809253835, + "step": 7675 + }, + { + "loss": 14.1364, + "grad_norm": 2.021289825439453, + "learning_rate": 0.0005, + "epoch": 0.3442036089838038, + "step": 7680 + }, + { + "loss": 14.15, + "grad_norm": 2.0237743854522705, + "learning_rate": 0.0005, + "epoch": 0.3444276998750693, + "step": 7685 + }, + { + "loss": 14.0614, + "grad_norm": 1.9201725721359253, + "learning_rate": 0.0005, + "epoch": 0.34465179076633484, + "step": 7690 + }, + { + "loss": 14.2131, + "grad_norm": 1.8731049299240112, + "learning_rate": 0.0005, + "epoch": 0.3448758816576003, + "step": 7695 + }, + { + "loss": 14.0291, + "grad_norm": 1.8014097213745117, + "learning_rate": 0.0005, + "epoch": 0.3450999725488658, + "step": 7700 + }, + { + "loss": 14.0754, + "grad_norm": 1.9392848014831543, + "learning_rate": 0.0005, + "epoch": 0.34532406344013133, + "step": 7705 + }, + { + "loss": 14.1579, + "grad_norm": 2.0457603931427, + "learning_rate": 0.0005, + "epoch": 0.3455481543313968, + "step": 7710 + }, + { + "loss": 14.1939, + "grad_norm": 1.9583426713943481, + "learning_rate": 0.0005, + "epoch": 0.3457722452226623, + "step": 7715 + }, + { + "loss": 14.1407, + "grad_norm": 2.2009177207946777, + "learning_rate": 0.0005, + "epoch": 0.3459963361139278, + "step": 7720 + }, + { + "loss": 14.0798, + "grad_norm": 1.8904290199279785, + "learning_rate": 0.0005, + "epoch": 0.3462204270051933, + "step": 7725 + }, + { + "loss": 14.1414, + "grad_norm": 1.900882601737976, + "learning_rate": 0.0005, + "epoch": 0.3464445178964588, + "step": 7730 + }, + { + "loss": 14.0602, + "grad_norm": 1.8346198797225952, + "learning_rate": 0.0005, + "epoch": 0.3466686087877243, + "step": 7735 + }, + { + "loss": 13.9476, + "grad_norm": 1.9995858669281006, + "learning_rate": 0.0005, + "epoch": 0.3468926996789898, + "step": 7740 + }, + { + "loss": 14.1434, + "grad_norm": 2.0434787273406982, + "learning_rate": 0.0005, + "epoch": 0.3471167905702553, + "step": 7745 + }, + { + "loss": 14.1186, + "grad_norm": 2.11923885345459, + "learning_rate": 0.0005, + "epoch": 0.3473408814615208, + "step": 7750 + }, + { + "loss": 14.0013, + "grad_norm": 2.0023956298828125, + "learning_rate": 0.0005, + "epoch": 0.34756497235278627, + "step": 7755 + }, + { + "loss": 14.1206, + "grad_norm": 2.100276470184326, + "learning_rate": 0.0005, + "epoch": 0.3477890632440518, + "step": 7760 + }, + { + "loss": 14.1161, + "grad_norm": 1.7902406454086304, + "learning_rate": 0.0005, + "epoch": 0.3480131541353173, + "step": 7765 + }, + { + "loss": 14.0546, + "grad_norm": 2.0395708084106445, + "learning_rate": 0.0005, + "epoch": 0.34823724502658276, + "step": 7770 + }, + { + "loss": 14.1542, + "grad_norm": 2.015235185623169, + "learning_rate": 0.0005, + "epoch": 0.3484613359178483, + "step": 7775 + }, + { + "loss": 14.0569, + "grad_norm": 1.918387770652771, + "learning_rate": 0.0005, + "epoch": 0.3486854268091138, + "step": 7780 + }, + { + "loss": 14.145, + "grad_norm": 1.950944185256958, + "learning_rate": 0.0005, + "epoch": 0.34890951770037926, + "step": 7785 + }, + { + "loss": 13.9847, + "grad_norm": 2.028846263885498, + "learning_rate": 0.0005, + "epoch": 0.3491336085916448, + "step": 7790 + }, + { + "loss": 14.1951, + "grad_norm": 1.9557349681854248, + "learning_rate": 0.0005, + "epoch": 0.3493576994829103, + "step": 7795 + }, + { + "loss": 14.1204, + "grad_norm": 2.0673904418945312, + "learning_rate": 0.0005, + "epoch": 0.34958179037417575, + "step": 7800 + }, + { + "loss": 14.0529, + "grad_norm": 1.918513298034668, + "learning_rate": 0.0005, + "epoch": 0.34980588126544127, + "step": 7805 + }, + { + "loss": 14.0581, + "grad_norm": 1.9200891256332397, + "learning_rate": 0.0005, + "epoch": 0.3500299721567068, + "step": 7810 + }, + { + "loss": 14.0796, + "grad_norm": 1.8107129335403442, + "learning_rate": 0.0005, + "epoch": 0.35025406304797224, + "step": 7815 + }, + { + "loss": 14.2017, + "grad_norm": 1.9424742460250854, + "learning_rate": 0.0005, + "epoch": 0.35047815393923776, + "step": 7820 + }, + { + "loss": 14.0347, + "grad_norm": 1.8949542045593262, + "learning_rate": 0.0005, + "epoch": 0.3507022448305033, + "step": 7825 + }, + { + "loss": 14.1288, + "grad_norm": 1.9397120475769043, + "learning_rate": 0.0005, + "epoch": 0.35092633572176873, + "step": 7830 + }, + { + "loss": 14.0256, + "grad_norm": 1.869429349899292, + "learning_rate": 0.0005, + "epoch": 0.35115042661303425, + "step": 7835 + }, + { + "loss": 14.1089, + "grad_norm": 1.8402559757232666, + "learning_rate": 0.0005, + "epoch": 0.35137451750429977, + "step": 7840 + }, + { + "loss": 14.1165, + "grad_norm": 2.3390116691589355, + "learning_rate": 0.0005, + "epoch": 0.3515986083955652, + "step": 7845 + }, + { + "loss": 14.0348, + "grad_norm": 1.8177026510238647, + "learning_rate": 0.0005, + "epoch": 0.35182269928683074, + "step": 7850 + }, + { + "loss": 14.0949, + "grad_norm": 1.8412022590637207, + "learning_rate": 0.0005, + "epoch": 0.35204679017809626, + "step": 7855 + }, + { + "loss": 14.1576, + "grad_norm": 1.8210394382476807, + "learning_rate": 0.0005, + "epoch": 0.3522708810693617, + "step": 7860 + }, + { + "loss": 14.027, + "grad_norm": 2.008986234664917, + "learning_rate": 0.0005, + "epoch": 0.35249497196062723, + "step": 7865 + }, + { + "loss": 14.0451, + "grad_norm": 1.8268238306045532, + "learning_rate": 0.0005, + "epoch": 0.35271906285189275, + "step": 7870 + }, + { + "loss": 14.0394, + "grad_norm": 1.906351923942566, + "learning_rate": 0.0005, + "epoch": 0.3529431537431582, + "step": 7875 + }, + { + "loss": 14.0353, + "grad_norm": 1.8411533832550049, + "learning_rate": 0.0005, + "epoch": 0.3531672446344237, + "step": 7880 + }, + { + "loss": 14.0102, + "grad_norm": 1.8553107976913452, + "learning_rate": 0.0005, + "epoch": 0.35339133552568924, + "step": 7885 + }, + { + "loss": 14.1753, + "grad_norm": 1.7922219038009644, + "learning_rate": 0.0005, + "epoch": 0.3536154264169547, + "step": 7890 + }, + { + "loss": 14.2017, + "grad_norm": 1.9880884885787964, + "learning_rate": 0.0005, + "epoch": 0.3538395173082202, + "step": 7895 + }, + { + "loss": 14.1431, + "grad_norm": 1.9502806663513184, + "learning_rate": 0.0005, + "epoch": 0.35406360819948574, + "step": 7900 + }, + { + "loss": 14.1458, + "grad_norm": 1.8457231521606445, + "learning_rate": 0.0005, + "epoch": 0.3542876990907512, + "step": 7905 + }, + { + "loss": 14.2067, + "grad_norm": 1.8743624687194824, + "learning_rate": 0.0005, + "epoch": 0.3545117899820167, + "step": 7910 + }, + { + "loss": 14.0933, + "grad_norm": 1.9435484409332275, + "learning_rate": 0.0005, + "epoch": 0.3547358808732822, + "step": 7915 + }, + { + "loss": 14.1251, + "grad_norm": 2.1356041431427, + "learning_rate": 0.0005, + "epoch": 0.3549599717645477, + "step": 7920 + }, + { + "loss": 14.033, + "grad_norm": 2.004192590713501, + "learning_rate": 0.0005, + "epoch": 0.3551840626558132, + "step": 7925 + }, + { + "loss": 14.0399, + "grad_norm": 2.1297152042388916, + "learning_rate": 0.0005, + "epoch": 0.3554081535470787, + "step": 7930 + }, + { + "loss": 14.1168, + "grad_norm": 2.082568407058716, + "learning_rate": 0.0005, + "epoch": 0.3556322444383442, + "step": 7935 + }, + { + "loss": 14.1034, + "grad_norm": 2.28765869140625, + "learning_rate": 0.0005, + "epoch": 0.3558563353296097, + "step": 7940 + }, + { + "loss": 14.1522, + "grad_norm": 2.0880069732666016, + "learning_rate": 0.0005, + "epoch": 0.3560804262208752, + "step": 7945 + }, + { + "loss": 14.1474, + "grad_norm": 1.9852226972579956, + "learning_rate": 0.0005, + "epoch": 0.3563045171121407, + "step": 7950 + }, + { + "loss": 14.148, + "grad_norm": 1.974709391593933, + "learning_rate": 0.0005, + "epoch": 0.3565286080034062, + "step": 7955 + }, + { + "loss": 14.0528, + "grad_norm": 1.8595918416976929, + "learning_rate": 0.0005, + "epoch": 0.3567526988946717, + "step": 7960 + }, + { + "loss": 13.9739, + "grad_norm": 1.963915467262268, + "learning_rate": 0.0005, + "epoch": 0.35697678978593717, + "step": 7965 + }, + { + "loss": 14.066, + "grad_norm": 1.8649743795394897, + "learning_rate": 0.0005, + "epoch": 0.3572008806772027, + "step": 7970 + }, + { + "loss": 14.0075, + "grad_norm": 1.9949859380722046, + "learning_rate": 0.0005, + "epoch": 0.3574249715684682, + "step": 7975 + }, + { + "loss": 13.947, + "grad_norm": 1.871898889541626, + "learning_rate": 0.0005, + "epoch": 0.35764906245973366, + "step": 7980 + }, + { + "loss": 14.067, + "grad_norm": 1.872648000717163, + "learning_rate": 0.0005, + "epoch": 0.3578731533509992, + "step": 7985 + }, + { + "loss": 14.1521, + "grad_norm": 1.8796306848526, + "learning_rate": 0.0005, + "epoch": 0.3580972442422647, + "step": 7990 + }, + { + "loss": 14.0508, + "grad_norm": 2.0444676876068115, + "learning_rate": 0.0005, + "epoch": 0.35832133513353015, + "step": 7995 + }, + { + "loss": 14.0414, + "grad_norm": 1.9715629816055298, + "learning_rate": 0.0005, + "epoch": 0.35854542602479567, + "step": 8000 + }, + { + "eval_loss": 1.755042314529419, + "eval_runtime": 18.6527, + "eval_samples_per_second": 878.373, + "eval_steps_per_second": 7.881, + "epoch": 0.35854542602479567, + "step": 8000 + }, + { + "loss": 14.0417, + "grad_norm": 1.8838669061660767, + "learning_rate": 0.0005, + "epoch": 0.3587695169160612, + "step": 8005 + }, + { + "loss": 14.1389, + "grad_norm": 2.0093634128570557, + "learning_rate": 0.0005, + "epoch": 0.35899360780732664, + "step": 8010 + }, + { + "loss": 14.1017, + "grad_norm": 1.9981526136398315, + "learning_rate": 0.0005, + "epoch": 0.35921769869859216, + "step": 8015 + }, + { + "loss": 14.2044, + "grad_norm": 1.8038954734802246, + "learning_rate": 0.0005, + "epoch": 0.3594417895898576, + "step": 8020 + }, + { + "loss": 14.0573, + "grad_norm": 1.8292555809020996, + "learning_rate": 0.0005, + "epoch": 0.35966588048112313, + "step": 8025 + }, + { + "loss": 14.118, + "grad_norm": 2.0507237911224365, + "learning_rate": 0.0005, + "epoch": 0.35988997137238865, + "step": 8030 + }, + { + "loss": 14.1929, + "grad_norm": 1.936213493347168, + "learning_rate": 0.0005, + "epoch": 0.3601140622636541, + "step": 8035 + }, + { + "loss": 14.0351, + "grad_norm": 1.9231542348861694, + "learning_rate": 0.0005, + "epoch": 0.3603381531549196, + "step": 8040 + }, + { + "loss": 13.9999, + "grad_norm": 1.9181580543518066, + "learning_rate": 0.0005, + "epoch": 0.36056224404618514, + "step": 8045 + }, + { + "loss": 14.1313, + "grad_norm": 1.8799428939819336, + "learning_rate": 0.0005, + "epoch": 0.3607863349374506, + "step": 8050 + }, + { + "loss": 14.1121, + "grad_norm": 1.8789302110671997, + "learning_rate": 0.0005, + "epoch": 0.3610104258287161, + "step": 8055 + }, + { + "loss": 14.1483, + "grad_norm": 1.9037946462631226, + "learning_rate": 0.0005, + "epoch": 0.36123451671998164, + "step": 8060 + }, + { + "loss": 14.0798, + "grad_norm": 1.95029878616333, + "learning_rate": 0.0005, + "epoch": 0.3614586076112471, + "step": 8065 + }, + { + "loss": 14.0892, + "grad_norm": 1.9673045873641968, + "learning_rate": 0.0005, + "epoch": 0.3616826985025126, + "step": 8070 + }, + { + "loss": 14.0908, + "grad_norm": 1.9095479249954224, + "learning_rate": 0.0005, + "epoch": 0.3619067893937781, + "step": 8075 + }, + { + "loss": 14.1214, + "grad_norm": 1.9915626049041748, + "learning_rate": 0.0005, + "epoch": 0.3621308802850436, + "step": 8080 + }, + { + "loss": 14.2213, + "grad_norm": 1.9557825326919556, + "learning_rate": 0.0005, + "epoch": 0.3623549711763091, + "step": 8085 + }, + { + "loss": 14.0364, + "grad_norm": 2.075934886932373, + "learning_rate": 0.0005, + "epoch": 0.3625790620675746, + "step": 8090 + }, + { + "loss": 14.077, + "grad_norm": 1.8182425498962402, + "learning_rate": 0.0005, + "epoch": 0.3628031529588401, + "step": 8095 + }, + { + "loss": 14.0508, + "grad_norm": 2.0258545875549316, + "learning_rate": 0.0005, + "epoch": 0.3630272438501056, + "step": 8100 + }, + { + "loss": 14.0646, + "grad_norm": 2.0093271732330322, + "learning_rate": 0.0005, + "epoch": 0.3632513347413711, + "step": 8105 + }, + { + "loss": 14.105, + "grad_norm": 1.894214391708374, + "learning_rate": 0.0005, + "epoch": 0.3634754256326366, + "step": 8110 + }, + { + "loss": 14.1646, + "grad_norm": 2.042280912399292, + "learning_rate": 0.0005, + "epoch": 0.3636995165239021, + "step": 8115 + }, + { + "loss": 14.0829, + "grad_norm": 1.8640018701553345, + "learning_rate": 0.0005, + "epoch": 0.3639236074151676, + "step": 8120 + }, + { + "loss": 14.1016, + "grad_norm": 1.757461428642273, + "learning_rate": 0.0005, + "epoch": 0.36414769830643307, + "step": 8125 + }, + { + "loss": 14.0598, + "grad_norm": 1.9780429601669312, + "learning_rate": 0.0005, + "epoch": 0.3643717891976986, + "step": 8130 + }, + { + "loss": 14.1351, + "grad_norm": 1.7367233037948608, + "learning_rate": 0.0005, + "epoch": 0.3645958800889641, + "step": 8135 + }, + { + "loss": 14.0752, + "grad_norm": 1.8880066871643066, + "learning_rate": 0.0005, + "epoch": 0.36481997098022956, + "step": 8140 + }, + { + "loss": 14.0078, + "grad_norm": 1.8871039152145386, + "learning_rate": 0.0005, + "epoch": 0.3650440618714951, + "step": 8145 + }, + { + "loss": 14.1339, + "grad_norm": 1.9108654260635376, + "learning_rate": 0.0005, + "epoch": 0.3652681527627606, + "step": 8150 + }, + { + "loss": 14.112, + "grad_norm": 1.7433931827545166, + "learning_rate": 0.0005, + "epoch": 0.36549224365402605, + "step": 8155 + }, + { + "loss": 14.0514, + "grad_norm": 1.8739866018295288, + "learning_rate": 0.0005, + "epoch": 0.36571633454529157, + "step": 8160 + }, + { + "loss": 14.0728, + "grad_norm": 1.8400698900222778, + "learning_rate": 0.0005, + "epoch": 0.3659404254365571, + "step": 8165 + }, + { + "loss": 14.0749, + "grad_norm": 1.9645445346832275, + "learning_rate": 0.0005, + "epoch": 0.36616451632782254, + "step": 8170 + }, + { + "loss": 14.0766, + "grad_norm": 1.7623934745788574, + "learning_rate": 0.0005, + "epoch": 0.36638860721908806, + "step": 8175 + }, + { + "loss": 14.0931, + "grad_norm": 1.822167992591858, + "learning_rate": 0.0005, + "epoch": 0.3666126981103536, + "step": 8180 + }, + { + "loss": 14.112, + "grad_norm": 1.8237991333007812, + "learning_rate": 0.0005, + "epoch": 0.36683678900161903, + "step": 8185 + }, + { + "loss": 14.047, + "grad_norm": 1.8531397581100464, + "learning_rate": 0.0005, + "epoch": 0.36706087989288455, + "step": 8190 + }, + { + "loss": 14.0997, + "grad_norm": 1.9457165002822876, + "learning_rate": 0.0005, + "epoch": 0.36728497078415007, + "step": 8195 + }, + { + "loss": 14.0801, + "grad_norm": 2.0252914428710938, + "learning_rate": 0.0005, + "epoch": 0.3675090616754155, + "step": 8200 + }, + { + "loss": 14.06, + "grad_norm": 1.9659444093704224, + "learning_rate": 0.0005, + "epoch": 0.36773315256668104, + "step": 8205 + }, + { + "loss": 14.033, + "grad_norm": 1.795413613319397, + "learning_rate": 0.0005, + "epoch": 0.36795724345794656, + "step": 8210 + }, + { + "loss": 14.0703, + "grad_norm": 1.954249382019043, + "learning_rate": 0.0005, + "epoch": 0.368181334349212, + "step": 8215 + }, + { + "loss": 14.1219, + "grad_norm": 1.9318137168884277, + "learning_rate": 0.0005, + "epoch": 0.36840542524047754, + "step": 8220 + }, + { + "loss": 14.1956, + "grad_norm": 1.9770511388778687, + "learning_rate": 0.0005, + "epoch": 0.36862951613174305, + "step": 8225 + }, + { + "loss": 13.991, + "grad_norm": 2.029613494873047, + "learning_rate": 0.0005, + "epoch": 0.3688536070230085, + "step": 8230 + }, + { + "loss": 14.076, + "grad_norm": 1.9258540868759155, + "learning_rate": 0.0005, + "epoch": 0.369077697914274, + "step": 8235 + }, + { + "loss": 14.0669, + "grad_norm": 1.8921314477920532, + "learning_rate": 0.0005, + "epoch": 0.36930178880553954, + "step": 8240 + }, + { + "loss": 14.0538, + "grad_norm": 2.094562530517578, + "learning_rate": 0.0005, + "epoch": 0.369525879696805, + "step": 8245 + }, + { + "loss": 14.0573, + "grad_norm": 2.0521252155303955, + "learning_rate": 0.0005, + "epoch": 0.3697499705880705, + "step": 8250 + }, + { + "loss": 14.1545, + "grad_norm": 1.9510191679000854, + "learning_rate": 0.0005, + "epoch": 0.36997406147933604, + "step": 8255 + }, + { + "loss": 14.1197, + "grad_norm": 2.046823501586914, + "learning_rate": 0.0005, + "epoch": 0.3701981523706015, + "step": 8260 + }, + { + "loss": 14.1504, + "grad_norm": 1.987096905708313, + "learning_rate": 0.0005, + "epoch": 0.370422243261867, + "step": 8265 + }, + { + "loss": 14.0392, + "grad_norm": 1.924657940864563, + "learning_rate": 0.0005, + "epoch": 0.37064633415313253, + "step": 8270 + }, + { + "loss": 14.0724, + "grad_norm": 2.036562204360962, + "learning_rate": 0.0005, + "epoch": 0.370870425044398, + "step": 8275 + }, + { + "loss": 14.1124, + "grad_norm": 1.9594347476959229, + "learning_rate": 0.0005, + "epoch": 0.3710945159356635, + "step": 8280 + }, + { + "loss": 14.0489, + "grad_norm": 1.9957181215286255, + "learning_rate": 0.0005, + "epoch": 0.371318606826929, + "step": 8285 + }, + { + "loss": 14.1009, + "grad_norm": 2.141080617904663, + "learning_rate": 0.0005, + "epoch": 0.3715426977181945, + "step": 8290 + }, + { + "loss": 14.1201, + "grad_norm": 1.8952361345291138, + "learning_rate": 0.0005, + "epoch": 0.37176678860946, + "step": 8295 + }, + { + "loss": 14.0303, + "grad_norm": 2.0000150203704834, + "learning_rate": 0.0005, + "epoch": 0.3719908795007255, + "step": 8300 + }, + { + "loss": 14.1823, + "grad_norm": 1.8380866050720215, + "learning_rate": 0.0005, + "epoch": 0.372214970391991, + "step": 8305 + }, + { + "loss": 14.117, + "grad_norm": 1.7979339361190796, + "learning_rate": 0.0005, + "epoch": 0.3724390612832565, + "step": 8310 + }, + { + "loss": 14.0296, + "grad_norm": 2.126523494720459, + "learning_rate": 0.0005, + "epoch": 0.372663152174522, + "step": 8315 + }, + { + "loss": 13.9201, + "grad_norm": 1.9141422510147095, + "learning_rate": 0.0005, + "epoch": 0.37288724306578747, + "step": 8320 + }, + { + "loss": 14.0944, + "grad_norm": 1.9338246583938599, + "learning_rate": 0.0005, + "epoch": 0.373111333957053, + "step": 8325 + }, + { + "loss": 14.0784, + "grad_norm": 1.7644654512405396, + "learning_rate": 0.0005, + "epoch": 0.3733354248483185, + "step": 8330 + }, + { + "loss": 14.0946, + "grad_norm": 1.7906842231750488, + "learning_rate": 0.0005, + "epoch": 0.37355951573958396, + "step": 8335 + }, + { + "loss": 14.0335, + "grad_norm": 1.8195035457611084, + "learning_rate": 0.0005, + "epoch": 0.3737836066308495, + "step": 8340 + }, + { + "loss": 13.9956, + "grad_norm": 1.8171489238739014, + "learning_rate": 0.0005, + "epoch": 0.374007697522115, + "step": 8345 + }, + { + "loss": 14.0929, + "grad_norm": 1.9822993278503418, + "learning_rate": 0.0005, + "epoch": 0.37423178841338045, + "step": 8350 + }, + { + "loss": 14.075, + "grad_norm": 2.0512478351593018, + "learning_rate": 0.0005, + "epoch": 0.37445587930464597, + "step": 8355 + }, + { + "loss": 14.1075, + "grad_norm": 2.137077808380127, + "learning_rate": 0.0005, + "epoch": 0.3746799701959115, + "step": 8360 + }, + { + "loss": 13.9692, + "grad_norm": 1.8836326599121094, + "learning_rate": 0.0005, + "epoch": 0.37490406108717694, + "step": 8365 + }, + { + "loss": 14.0401, + "grad_norm": 1.8308993577957153, + "learning_rate": 0.0005, + "epoch": 0.37512815197844246, + "step": 8370 + }, + { + "loss": 14.0071, + "grad_norm": 1.8756014108657837, + "learning_rate": 0.0005, + "epoch": 0.375352242869708, + "step": 8375 + }, + { + "loss": 14.057, + "grad_norm": 1.8783321380615234, + "learning_rate": 0.0005, + "epoch": 0.37557633376097344, + "step": 8380 + }, + { + "loss": 14.1134, + "grad_norm": 1.9342875480651855, + "learning_rate": 0.0005, + "epoch": 0.37580042465223895, + "step": 8385 + }, + { + "loss": 14.155, + "grad_norm": 1.8861297369003296, + "learning_rate": 0.0005, + "epoch": 0.37602451554350447, + "step": 8390 + }, + { + "loss": 14.0238, + "grad_norm": 1.832170844078064, + "learning_rate": 0.0005, + "epoch": 0.3762486064347699, + "step": 8395 + }, + { + "loss": 14.0867, + "grad_norm": 2.1398568153381348, + "learning_rate": 0.0005, + "epoch": 0.37647269732603544, + "step": 8400 + }, + { + "loss": 14.1837, + "grad_norm": 1.9946504831314087, + "learning_rate": 0.0005, + "epoch": 0.37669678821730096, + "step": 8405 + }, + { + "loss": 13.9595, + "grad_norm": 1.8542578220367432, + "learning_rate": 0.0005, + "epoch": 0.3769208791085664, + "step": 8410 + }, + { + "loss": 14.067, + "grad_norm": 2.0427167415618896, + "learning_rate": 0.0005, + "epoch": 0.37714496999983194, + "step": 8415 + }, + { + "loss": 14.0449, + "grad_norm": 1.8909755945205688, + "learning_rate": 0.0005, + "epoch": 0.37736906089109745, + "step": 8420 + }, + { + "loss": 14.0803, + "grad_norm": 1.9262720346450806, + "learning_rate": 0.0005, + "epoch": 0.3775931517823629, + "step": 8425 + }, + { + "loss": 14.0376, + "grad_norm": 1.8407044410705566, + "learning_rate": 0.0005, + "epoch": 0.37781724267362843, + "step": 8430 + }, + { + "loss": 14.1077, + "grad_norm": 1.8292977809906006, + "learning_rate": 0.0005, + "epoch": 0.37804133356489394, + "step": 8435 + }, + { + "loss": 14.2441, + "grad_norm": 2.043469190597534, + "learning_rate": 0.0005, + "epoch": 0.3782654244561594, + "step": 8440 + }, + { + "loss": 14.019, + "grad_norm": 1.8498077392578125, + "learning_rate": 0.0005, + "epoch": 0.3784895153474249, + "step": 8445 + }, + { + "loss": 14.0124, + "grad_norm": 1.902140736579895, + "learning_rate": 0.0005, + "epoch": 0.37871360623869044, + "step": 8450 + }, + { + "loss": 14.0932, + "grad_norm": 2.1382274627685547, + "learning_rate": 0.0005, + "epoch": 0.3789376971299559, + "step": 8455 + }, + { + "loss": 14.0854, + "grad_norm": 2.017334461212158, + "learning_rate": 0.0005, + "epoch": 0.3791617880212214, + "step": 8460 + }, + { + "loss": 14.1117, + "grad_norm": 1.8728015422821045, + "learning_rate": 0.0005, + "epoch": 0.37938587891248693, + "step": 8465 + }, + { + "loss": 14.0292, + "grad_norm": 1.930253028869629, + "learning_rate": 0.0005, + "epoch": 0.3796099698037524, + "step": 8470 + }, + { + "loss": 14.0795, + "grad_norm": 2.0670037269592285, + "learning_rate": 0.0005, + "epoch": 0.3798340606950179, + "step": 8475 + }, + { + "loss": 14.0816, + "grad_norm": 1.9624706506729126, + "learning_rate": 0.0005, + "epoch": 0.3800581515862834, + "step": 8480 + }, + { + "loss": 14.116, + "grad_norm": 1.935685157775879, + "learning_rate": 0.0005, + "epoch": 0.3802822424775489, + "step": 8485 + }, + { + "loss": 14.1128, + "grad_norm": 1.997086763381958, + "learning_rate": 0.0005, + "epoch": 0.3805063333688144, + "step": 8490 + }, + { + "loss": 14.0283, + "grad_norm": 2.0466487407684326, + "learning_rate": 0.0005, + "epoch": 0.3807304242600799, + "step": 8495 + }, + { + "loss": 14.1002, + "grad_norm": 2.157344102859497, + "learning_rate": 0.0005, + "epoch": 0.3809545151513454, + "step": 8500 + }, + { + "eval_loss": 1.7569822072982788, + "eval_runtime": 18.6681, + "eval_samples_per_second": 877.645, + "eval_steps_per_second": 7.874, + "epoch": 0.3809545151513454, + "step": 8500 + }, + { + "loss": 14.0763, + "grad_norm": 2.2164812088012695, + "learning_rate": 0.0005, + "epoch": 0.3811786060426109, + "step": 8505 + }, + { + "loss": 14.1342, + "grad_norm": 2.0805397033691406, + "learning_rate": 0.0005, + "epoch": 0.3814026969338764, + "step": 8510 + }, + { + "loss": 14.1283, + "grad_norm": 1.8117724657058716, + "learning_rate": 0.0005, + "epoch": 0.38162678782514187, + "step": 8515 + }, + { + "loss": 14.0734, + "grad_norm": 1.8892253637313843, + "learning_rate": 0.0005, + "epoch": 0.3818508787164074, + "step": 8520 + }, + { + "loss": 14.0782, + "grad_norm": 2.018918752670288, + "learning_rate": 0.0005, + "epoch": 0.3820749696076729, + "step": 8525 + }, + { + "loss": 14.1138, + "grad_norm": 2.0088467597961426, + "learning_rate": 0.0005, + "epoch": 0.38229906049893836, + "step": 8530 + }, + { + "loss": 14.1419, + "grad_norm": 2.0010855197906494, + "learning_rate": 0.0005, + "epoch": 0.3825231513902039, + "step": 8535 + }, + { + "loss": 14.0559, + "grad_norm": 1.973644495010376, + "learning_rate": 0.0005, + "epoch": 0.3827472422814694, + "step": 8540 + }, + { + "loss": 13.9804, + "grad_norm": 1.8364356756210327, + "learning_rate": 0.0005, + "epoch": 0.38297133317273485, + "step": 8545 + }, + { + "loss": 14.0609, + "grad_norm": 1.7789885997772217, + "learning_rate": 0.0005, + "epoch": 0.38319542406400037, + "step": 8550 + }, + { + "loss": 14.1007, + "grad_norm": 1.9528905153274536, + "learning_rate": 0.0005, + "epoch": 0.3834195149552659, + "step": 8555 + }, + { + "loss": 14.0495, + "grad_norm": 1.8014353513717651, + "learning_rate": 0.0005, + "epoch": 0.38364360584653134, + "step": 8560 + }, + { + "loss": 14.1449, + "grad_norm": 1.844429612159729, + "learning_rate": 0.0005, + "epoch": 0.38386769673779686, + "step": 8565 + }, + { + "loss": 14.0979, + "grad_norm": 2.125936508178711, + "learning_rate": 0.0005, + "epoch": 0.3840917876290623, + "step": 8570 + }, + { + "loss": 14.0512, + "grad_norm": 2.0436089038848877, + "learning_rate": 0.0005, + "epoch": 0.38431587852032784, + "step": 8575 + }, + { + "loss": 14.0902, + "grad_norm": 2.1031296253204346, + "learning_rate": 0.0005, + "epoch": 0.38453996941159335, + "step": 8580 + }, + { + "loss": 14.0436, + "grad_norm": 1.9743539094924927, + "learning_rate": 0.0005, + "epoch": 0.3847640603028588, + "step": 8585 + }, + { + "loss": 14.0761, + "grad_norm": 2.10516357421875, + "learning_rate": 0.0005, + "epoch": 0.38498815119412433, + "step": 8590 + }, + { + "loss": 14.0811, + "grad_norm": 2.1132824420928955, + "learning_rate": 0.0005, + "epoch": 0.38521224208538984, + "step": 8595 + }, + { + "loss": 14.0995, + "grad_norm": 2.0395777225494385, + "learning_rate": 0.0005, + "epoch": 0.3854363329766553, + "step": 8600 + }, + { + "loss": 14.0018, + "grad_norm": 1.8657382726669312, + "learning_rate": 0.0005, + "epoch": 0.3856604238679208, + "step": 8605 + }, + { + "loss": 14.0678, + "grad_norm": 1.8442227840423584, + "learning_rate": 0.0005, + "epoch": 0.38588451475918634, + "step": 8610 + }, + { + "loss": 14.1595, + "grad_norm": 2.0256452560424805, + "learning_rate": 0.0005, + "epoch": 0.3861086056504518, + "step": 8615 + }, + { + "loss": 14.1808, + "grad_norm": 2.03298282623291, + "learning_rate": 0.0005, + "epoch": 0.3863326965417173, + "step": 8620 + }, + { + "loss": 14.0694, + "grad_norm": 1.8715609312057495, + "learning_rate": 0.0005, + "epoch": 0.38655678743298283, + "step": 8625 + }, + { + "loss": 14.106, + "grad_norm": 1.9478174448013306, + "learning_rate": 0.0005, + "epoch": 0.3867808783242483, + "step": 8630 + }, + { + "loss": 13.9328, + "grad_norm": 1.817594051361084, + "learning_rate": 0.0005, + "epoch": 0.3870049692155138, + "step": 8635 + }, + { + "loss": 14.019, + "grad_norm": 1.7071633338928223, + "learning_rate": 0.0005, + "epoch": 0.3872290601067793, + "step": 8640 + }, + { + "loss": 14.0565, + "grad_norm": 1.9565168619155884, + "learning_rate": 0.0005, + "epoch": 0.3874531509980448, + "step": 8645 + }, + { + "loss": 14.1427, + "grad_norm": 1.9424870014190674, + "learning_rate": 0.0005, + "epoch": 0.3876772418893103, + "step": 8650 + }, + { + "loss": 14.1189, + "grad_norm": 1.878967046737671, + "learning_rate": 0.0005, + "epoch": 0.3879013327805758, + "step": 8655 + }, + { + "loss": 14.1579, + "grad_norm": 1.932654619216919, + "learning_rate": 0.0005, + "epoch": 0.3881254236718413, + "step": 8660 + }, + { + "loss": 14.1557, + "grad_norm": 1.8046934604644775, + "learning_rate": 0.0005, + "epoch": 0.3883495145631068, + "step": 8665 + }, + { + "loss": 14.0229, + "grad_norm": 1.834702968597412, + "learning_rate": 0.0005, + "epoch": 0.3885736054543723, + "step": 8670 + }, + { + "loss": 14.1103, + "grad_norm": 1.7599685192108154, + "learning_rate": 0.0005, + "epoch": 0.38879769634563777, + "step": 8675 + }, + { + "loss": 14.1212, + "grad_norm": 1.8775702714920044, + "learning_rate": 0.0005, + "epoch": 0.3890217872369033, + "step": 8680 + }, + { + "loss": 14.1246, + "grad_norm": 2.1239051818847656, + "learning_rate": 0.0005, + "epoch": 0.3892458781281688, + "step": 8685 + }, + { + "loss": 13.9983, + "grad_norm": 2.047067880630493, + "learning_rate": 0.0005, + "epoch": 0.38946996901943426, + "step": 8690 + }, + { + "loss": 14.1257, + "grad_norm": 1.868910789489746, + "learning_rate": 0.0005, + "epoch": 0.3896940599106998, + "step": 8695 + }, + { + "loss": 14.1012, + "grad_norm": 2.0451931953430176, + "learning_rate": 0.0005, + "epoch": 0.3899181508019653, + "step": 8700 + }, + { + "loss": 14.0864, + "grad_norm": 1.8716967105865479, + "learning_rate": 0.0005, + "epoch": 0.39014224169323075, + "step": 8705 + }, + { + "loss": 14.1215, + "grad_norm": 1.8944684267044067, + "learning_rate": 0.0005, + "epoch": 0.39036633258449627, + "step": 8710 + }, + { + "loss": 14.0179, + "grad_norm": 1.885197639465332, + "learning_rate": 0.0005, + "epoch": 0.3905904234757618, + "step": 8715 + }, + { + "loss": 14.1015, + "grad_norm": 2.0148558616638184, + "learning_rate": 0.0005, + "epoch": 0.39081451436702724, + "step": 8720 + }, + { + "loss": 14.1754, + "grad_norm": 1.9229111671447754, + "learning_rate": 0.0005, + "epoch": 0.39103860525829276, + "step": 8725 + }, + { + "loss": 14.0637, + "grad_norm": 1.8236292600631714, + "learning_rate": 0.0005, + "epoch": 0.3912626961495583, + "step": 8730 + }, + { + "loss": 14.0855, + "grad_norm": 1.950639247894287, + "learning_rate": 0.0005, + "epoch": 0.39148678704082374, + "step": 8735 + }, + { + "loss": 14.0255, + "grad_norm": 1.8944975137710571, + "learning_rate": 0.0005, + "epoch": 0.39171087793208925, + "step": 8740 + }, + { + "loss": 14.1443, + "grad_norm": 2.0181682109832764, + "learning_rate": 0.0005, + "epoch": 0.39193496882335477, + "step": 8745 + }, + { + "loss": 13.984, + "grad_norm": 1.8844550848007202, + "learning_rate": 0.0005, + "epoch": 0.39215905971462023, + "step": 8750 + }, + { + "loss": 14.0865, + "grad_norm": 2.0160844326019287, + "learning_rate": 0.0005, + "epoch": 0.39238315060588574, + "step": 8755 + }, + { + "loss": 14.0881, + "grad_norm": 1.9414035081863403, + "learning_rate": 0.0005, + "epoch": 0.39260724149715126, + "step": 8760 + }, + { + "loss": 14.0639, + "grad_norm": 1.839568853378296, + "learning_rate": 0.0005, + "epoch": 0.3928313323884167, + "step": 8765 + }, + { + "loss": 14.1536, + "grad_norm": 1.899423599243164, + "learning_rate": 0.0005, + "epoch": 0.39305542327968224, + "step": 8770 + }, + { + "loss": 14.1575, + "grad_norm": 1.8649468421936035, + "learning_rate": 0.0005, + "epoch": 0.39327951417094775, + "step": 8775 + }, + { + "loss": 14.0814, + "grad_norm": 1.937558889389038, + "learning_rate": 0.0005, + "epoch": 0.3935036050622132, + "step": 8780 + }, + { + "loss": 14.0491, + "grad_norm": 1.889802098274231, + "learning_rate": 0.0005, + "epoch": 0.39372769595347873, + "step": 8785 + }, + { + "loss": 14.1103, + "grad_norm": 2.0124385356903076, + "learning_rate": 0.0005, + "epoch": 0.39395178684474425, + "step": 8790 + }, + { + "loss": 14.109, + "grad_norm": 2.07157564163208, + "learning_rate": 0.0005, + "epoch": 0.3941758777360097, + "step": 8795 + }, + { + "loss": 14.1463, + "grad_norm": 1.7840614318847656, + "learning_rate": 0.0005, + "epoch": 0.3943999686272752, + "step": 8800 + }, + { + "loss": 14.082, + "grad_norm": 1.8262500762939453, + "learning_rate": 0.0005, + "epoch": 0.39462405951854074, + "step": 8805 + }, + { + "loss": 14.1194, + "grad_norm": 1.9340875148773193, + "learning_rate": 0.0005, + "epoch": 0.3948481504098062, + "step": 8810 + }, + { + "loss": 14.0318, + "grad_norm": 1.982276201248169, + "learning_rate": 0.0005, + "epoch": 0.3950722413010717, + "step": 8815 + }, + { + "loss": 13.9853, + "grad_norm": 1.7948544025421143, + "learning_rate": 0.0005, + "epoch": 0.39529633219233723, + "step": 8820 + }, + { + "loss": 14.0622, + "grad_norm": 1.7773853540420532, + "learning_rate": 0.0005, + "epoch": 0.3955204230836027, + "step": 8825 + }, + { + "loss": 14.1185, + "grad_norm": 1.7478654384613037, + "learning_rate": 0.0005, + "epoch": 0.3957445139748682, + "step": 8830 + }, + { + "loss": 14.0453, + "grad_norm": 2.0046229362487793, + "learning_rate": 0.0005, + "epoch": 0.3959686048661337, + "step": 8835 + }, + { + "loss": 14.196, + "grad_norm": 1.9458394050598145, + "learning_rate": 0.0005, + "epoch": 0.3961926957573992, + "step": 8840 + }, + { + "loss": 14.0311, + "grad_norm": 1.9534013271331787, + "learning_rate": 0.0005, + "epoch": 0.3964167866486647, + "step": 8845 + }, + { + "loss": 14.1093, + "grad_norm": 1.789678692817688, + "learning_rate": 0.0005, + "epoch": 0.3966408775399302, + "step": 8850 + }, + { + "loss": 14.0338, + "grad_norm": 1.9533801078796387, + "learning_rate": 0.0005, + "epoch": 0.3968649684311957, + "step": 8855 + }, + { + "loss": 13.9558, + "grad_norm": 2.187878370285034, + "learning_rate": 0.0005, + "epoch": 0.3970890593224612, + "step": 8860 + }, + { + "loss": 14.0059, + "grad_norm": 1.9042341709136963, + "learning_rate": 0.0005, + "epoch": 0.3973131502137267, + "step": 8865 + }, + { + "loss": 14.1696, + "grad_norm": 2.0896317958831787, + "learning_rate": 0.0005, + "epoch": 0.39753724110499217, + "step": 8870 + }, + { + "loss": 14.1228, + "grad_norm": 1.904725193977356, + "learning_rate": 0.0005, + "epoch": 0.3977613319962577, + "step": 8875 + }, + { + "loss": 14.0367, + "grad_norm": 1.8315083980560303, + "learning_rate": 0.0005, + "epoch": 0.3979854228875232, + "step": 8880 + }, + { + "loss": 14.1059, + "grad_norm": 1.8545987606048584, + "learning_rate": 0.0005, + "epoch": 0.39820951377878866, + "step": 8885 + }, + { + "loss": 14.0085, + "grad_norm": 2.000612258911133, + "learning_rate": 0.0005, + "epoch": 0.3984336046700542, + "step": 8890 + }, + { + "loss": 14.0234, + "grad_norm": 1.8360822200775146, + "learning_rate": 0.0005, + "epoch": 0.3986576955613197, + "step": 8895 + }, + { + "loss": 14.0266, + "grad_norm": 1.7959775924682617, + "learning_rate": 0.0005, + "epoch": 0.39888178645258515, + "step": 8900 + }, + { + "loss": 14.179, + "grad_norm": 2.0182433128356934, + "learning_rate": 0.0005, + "epoch": 0.39910587734385067, + "step": 8905 + }, + { + "loss": 14.0248, + "grad_norm": 2.0366687774658203, + "learning_rate": 0.0005, + "epoch": 0.3993299682351162, + "step": 8910 + }, + { + "loss": 14.0431, + "grad_norm": 2.001441717147827, + "learning_rate": 0.0005, + "epoch": 0.39955405912638164, + "step": 8915 + }, + { + "loss": 14.222, + "grad_norm": 1.8706127405166626, + "learning_rate": 0.0005, + "epoch": 0.39977815001764716, + "step": 8920 + }, + { + "loss": 14.1706, + "grad_norm": 1.9963351488113403, + "learning_rate": 0.0005, + "epoch": 0.4000022409089127, + "step": 8925 + }, + { + "loss": 14.1615, + "grad_norm": 2.0632803440093994, + "learning_rate": 0.0005, + "epoch": 0.40022633180017814, + "step": 8930 + }, + { + "loss": 14.0615, + "grad_norm": 2.052077293395996, + "learning_rate": 0.0005, + "epoch": 0.40045042269144365, + "step": 8935 + }, + { + "loss": 14.0403, + "grad_norm": 1.9180212020874023, + "learning_rate": 0.0005, + "epoch": 0.40067451358270917, + "step": 8940 + }, + { + "loss": 14.0491, + "grad_norm": 1.9582902193069458, + "learning_rate": 0.0005, + "epoch": 0.40089860447397463, + "step": 8945 + }, + { + "loss": 14.1251, + "grad_norm": 1.6565606594085693, + "learning_rate": 0.0005, + "epoch": 0.40112269536524015, + "step": 8950 + }, + { + "loss": 14.09, + "grad_norm": 2.0036303997039795, + "learning_rate": 0.0005, + "epoch": 0.40134678625650566, + "step": 8955 + }, + { + "loss": 14.128, + "grad_norm": 1.8896582126617432, + "learning_rate": 0.0005, + "epoch": 0.4015708771477711, + "step": 8960 + }, + { + "loss": 14.0465, + "grad_norm": 1.9904286861419678, + "learning_rate": 0.0005, + "epoch": 0.40179496803903664, + "step": 8965 + }, + { + "loss": 14.0307, + "grad_norm": 1.9393749237060547, + "learning_rate": 0.0005, + "epoch": 0.40201905893030215, + "step": 8970 + }, + { + "loss": 14.0348, + "grad_norm": 1.9569603204727173, + "learning_rate": 0.0005, + "epoch": 0.4022431498215676, + "step": 8975 + }, + { + "loss": 14.1068, + "grad_norm": 1.8881086111068726, + "learning_rate": 0.0005, + "epoch": 0.40246724071283313, + "step": 8980 + }, + { + "loss": 13.9595, + "grad_norm": 1.8988546133041382, + "learning_rate": 0.0005, + "epoch": 0.40269133160409865, + "step": 8985 + }, + { + "loss": 14.0135, + "grad_norm": 1.9805768728256226, + "learning_rate": 0.0005, + "epoch": 0.4029154224953641, + "step": 8990 + }, + { + "loss": 14.0296, + "grad_norm": 1.8838372230529785, + "learning_rate": 0.0005, + "epoch": 0.4031395133866296, + "step": 8995 + }, + { + "loss": 14.1413, + "grad_norm": 1.8862353563308716, + "learning_rate": 0.0005, + "epoch": 0.40336360427789514, + "step": 9000 + }, + { + "eval_loss": 1.750436782836914, + "eval_runtime": 18.7449, + "eval_samples_per_second": 874.051, + "eval_steps_per_second": 7.842, + "epoch": 0.40336360427789514, + "step": 9000 + }, + { + "loss": 14.0521, + "grad_norm": 1.953079104423523, + "learning_rate": 0.0005, + "epoch": 0.4035876951691606, + "step": 9005 + }, + { + "loss": 14.1428, + "grad_norm": 1.8866537809371948, + "learning_rate": 0.0005, + "epoch": 0.4038117860604261, + "step": 9010 + }, + { + "loss": 14.0127, + "grad_norm": 2.0234899520874023, + "learning_rate": 0.0005, + "epoch": 0.40403587695169163, + "step": 9015 + }, + { + "loss": 14.0549, + "grad_norm": 1.7747480869293213, + "learning_rate": 0.0005, + "epoch": 0.4042599678429571, + "step": 9020 + }, + { + "loss": 13.9821, + "grad_norm": 2.022836208343506, + "learning_rate": 0.0005, + "epoch": 0.4044840587342226, + "step": 9025 + }, + { + "loss": 14.0746, + "grad_norm": 1.9047553539276123, + "learning_rate": 0.0005, + "epoch": 0.4047081496254881, + "step": 9030 + }, + { + "loss": 13.99, + "grad_norm": 1.864039421081543, + "learning_rate": 0.0005, + "epoch": 0.4049322405167536, + "step": 9035 + }, + { + "loss": 14.0763, + "grad_norm": 2.018603563308716, + "learning_rate": 0.0005, + "epoch": 0.4051563314080191, + "step": 9040 + }, + { + "loss": 14.058, + "grad_norm": 2.015758514404297, + "learning_rate": 0.0005, + "epoch": 0.4053804222992846, + "step": 9045 + }, + { + "loss": 14.0219, + "grad_norm": 2.26421856880188, + "learning_rate": 0.0005, + "epoch": 0.4056045131905501, + "step": 9050 + }, + { + "loss": 14.0153, + "grad_norm": 1.898167610168457, + "learning_rate": 0.0005, + "epoch": 0.4058286040818156, + "step": 9055 + }, + { + "loss": 14.1529, + "grad_norm": 1.7895253896713257, + "learning_rate": 0.0005, + "epoch": 0.4060526949730811, + "step": 9060 + }, + { + "loss": 14.077, + "grad_norm": 1.8098982572555542, + "learning_rate": 0.0005, + "epoch": 0.40627678586434657, + "step": 9065 + }, + { + "loss": 14.0183, + "grad_norm": 1.9741629362106323, + "learning_rate": 0.0005, + "epoch": 0.4065008767556121, + "step": 9070 + }, + { + "loss": 14.076, + "grad_norm": 1.8350106477737427, + "learning_rate": 0.0005, + "epoch": 0.4067249676468776, + "step": 9075 + }, + { + "loss": 14.0936, + "grad_norm": 1.9996545314788818, + "learning_rate": 0.0005, + "epoch": 0.40694905853814306, + "step": 9080 + }, + { + "loss": 14.126, + "grad_norm": 1.8608310222625732, + "learning_rate": 0.0005, + "epoch": 0.4071731494294086, + "step": 9085 + }, + { + "loss": 14.0375, + "grad_norm": 1.8878345489501953, + "learning_rate": 0.0005, + "epoch": 0.4073972403206741, + "step": 9090 + }, + { + "loss": 14.0464, + "grad_norm": 1.8385180234909058, + "learning_rate": 0.0005, + "epoch": 0.40762133121193955, + "step": 9095 + }, + { + "loss": 14.1311, + "grad_norm": 1.998307228088379, + "learning_rate": 0.0005, + "epoch": 0.40784542210320507, + "step": 9100 + }, + { + "loss": 14.1412, + "grad_norm": 1.788956642150879, + "learning_rate": 0.0005, + "epoch": 0.4080695129944706, + "step": 9105 + }, + { + "loss": 14.0221, + "grad_norm": 1.7810660600662231, + "learning_rate": 0.0005, + "epoch": 0.40829360388573605, + "step": 9110 + }, + { + "loss": 14.0332, + "grad_norm": 1.9653055667877197, + "learning_rate": 0.0005, + "epoch": 0.40851769477700156, + "step": 9115 + }, + { + "loss": 14.0218, + "grad_norm": 1.846150279045105, + "learning_rate": 0.0005, + "epoch": 0.408741785668267, + "step": 9120 + }, + { + "loss": 14.065, + "grad_norm": 1.925832748413086, + "learning_rate": 0.0005, + "epoch": 0.40896587655953254, + "step": 9125 + }, + { + "loss": 14.0653, + "grad_norm": 1.9436774253845215, + "learning_rate": 0.0005, + "epoch": 0.40918996745079805, + "step": 9130 + }, + { + "loss": 14.2366, + "grad_norm": 1.8719289302825928, + "learning_rate": 0.0005, + "epoch": 0.4094140583420635, + "step": 9135 + }, + { + "loss": 14.0781, + "grad_norm": 2.0129549503326416, + "learning_rate": 0.0005, + "epoch": 0.40963814923332903, + "step": 9140 + }, + { + "loss": 14.0747, + "grad_norm": 2.2035460472106934, + "learning_rate": 0.0005, + "epoch": 0.40986224012459455, + "step": 9145 + }, + { + "loss": 14.1699, + "grad_norm": 2.0379064083099365, + "learning_rate": 0.0005, + "epoch": 0.41008633101586, + "step": 9150 + }, + { + "loss": 13.9654, + "grad_norm": 1.9979286193847656, + "learning_rate": 0.0005, + "epoch": 0.4103104219071255, + "step": 9155 + }, + { + "loss": 13.9914, + "grad_norm": 1.9642233848571777, + "learning_rate": 0.0005, + "epoch": 0.41053451279839104, + "step": 9160 + }, + { + "loss": 13.9992, + "grad_norm": 2.0654077529907227, + "learning_rate": 0.0005, + "epoch": 0.4107586036896565, + "step": 9165 + }, + { + "loss": 14.094, + "grad_norm": 1.8470113277435303, + "learning_rate": 0.0005, + "epoch": 0.410982694580922, + "step": 9170 + }, + { + "loss": 14.1311, + "grad_norm": 1.7958095073699951, + "learning_rate": 0.0005, + "epoch": 0.41120678547218753, + "step": 9175 + }, + { + "loss": 14.0738, + "grad_norm": 1.732292890548706, + "learning_rate": 0.0005, + "epoch": 0.411430876363453, + "step": 9180 + }, + { + "loss": 14.0696, + "grad_norm": 1.9240189790725708, + "learning_rate": 0.0005, + "epoch": 0.4116549672547185, + "step": 9185 + }, + { + "loss": 14.0463, + "grad_norm": 1.9035232067108154, + "learning_rate": 0.0005, + "epoch": 0.411879058145984, + "step": 9190 + }, + { + "loss": 14.1022, + "grad_norm": 1.982646107673645, + "learning_rate": 0.0005, + "epoch": 0.4121031490372495, + "step": 9195 + }, + { + "loss": 13.9918, + "grad_norm": 1.9125901460647583, + "learning_rate": 0.0005, + "epoch": 0.412327239928515, + "step": 9200 + }, + { + "loss": 14.0336, + "grad_norm": 1.8618894815444946, + "learning_rate": 0.0005, + "epoch": 0.4125513308197805, + "step": 9205 + }, + { + "loss": 13.986, + "grad_norm": 2.1027495861053467, + "learning_rate": 0.0005, + "epoch": 0.412775421711046, + "step": 9210 + }, + { + "loss": 14.0505, + "grad_norm": 2.065297842025757, + "learning_rate": 0.0005, + "epoch": 0.4129995126023115, + "step": 9215 + }, + { + "loss": 14.0786, + "grad_norm": 1.935354471206665, + "learning_rate": 0.0005, + "epoch": 0.413223603493577, + "step": 9220 + }, + { + "loss": 14.1429, + "grad_norm": 2.1198596954345703, + "learning_rate": 0.0005, + "epoch": 0.41344769438484247, + "step": 9225 + }, + { + "loss": 14.1127, + "grad_norm": 2.406881093978882, + "learning_rate": 0.0005, + "epoch": 0.413671785276108, + "step": 9230 + }, + { + "loss": 14.0365, + "grad_norm": 2.0867960453033447, + "learning_rate": 0.0005, + "epoch": 0.4138958761673735, + "step": 9235 + }, + { + "loss": 14.0288, + "grad_norm": 1.940619945526123, + "learning_rate": 0.0005, + "epoch": 0.41411996705863896, + "step": 9240 + }, + { + "loss": 14.0551, + "grad_norm": 1.8351531028747559, + "learning_rate": 0.0005, + "epoch": 0.4143440579499045, + "step": 9245 + }, + { + "loss": 14.0954, + "grad_norm": 1.847739815711975, + "learning_rate": 0.0005, + "epoch": 0.41456814884117, + "step": 9250 + }, + { + "loss": 14.061, + "grad_norm": 2.023423910140991, + "learning_rate": 0.0005, + "epoch": 0.41479223973243545, + "step": 9255 + }, + { + "loss": 13.9158, + "grad_norm": 1.890186071395874, + "learning_rate": 0.0005, + "epoch": 0.41501633062370097, + "step": 9260 + }, + { + "loss": 14.1103, + "grad_norm": 1.8148173093795776, + "learning_rate": 0.0005, + "epoch": 0.4152404215149665, + "step": 9265 + }, + { + "loss": 14.0353, + "grad_norm": 1.978704571723938, + "learning_rate": 0.0005, + "epoch": 0.41546451240623195, + "step": 9270 + }, + { + "loss": 14.0829, + "grad_norm": 1.8345826864242554, + "learning_rate": 0.0005, + "epoch": 0.41568860329749746, + "step": 9275 + }, + { + "loss": 13.973, + "grad_norm": 1.8378230333328247, + "learning_rate": 0.0005, + "epoch": 0.415912694188763, + "step": 9280 + }, + { + "loss": 14.1264, + "grad_norm": 1.8736780881881714, + "learning_rate": 0.0005, + "epoch": 0.41613678508002844, + "step": 9285 + }, + { + "loss": 14.1078, + "grad_norm": 1.7875055074691772, + "learning_rate": 0.0005, + "epoch": 0.41636087597129395, + "step": 9290 + }, + { + "loss": 14.1918, + "grad_norm": 1.9262399673461914, + "learning_rate": 0.0005, + "epoch": 0.41658496686255947, + "step": 9295 + }, + { + "loss": 14.0808, + "grad_norm": 1.9088530540466309, + "learning_rate": 0.0005, + "epoch": 0.41680905775382493, + "step": 9300 + }, + { + "loss": 14.1056, + "grad_norm": 1.8706014156341553, + "learning_rate": 0.0005, + "epoch": 0.41703314864509045, + "step": 9305 + }, + { + "loss": 14.1663, + "grad_norm": 1.8975495100021362, + "learning_rate": 0.0005, + "epoch": 0.41725723953635596, + "step": 9310 + }, + { + "loss": 13.9967, + "grad_norm": 1.8865870237350464, + "learning_rate": 0.0005, + "epoch": 0.4174813304276214, + "step": 9315 + }, + { + "loss": 14.0276, + "grad_norm": 1.9450527429580688, + "learning_rate": 0.0005, + "epoch": 0.41770542131888694, + "step": 9320 + }, + { + "loss": 13.9744, + "grad_norm": 1.8535137176513672, + "learning_rate": 0.0005, + "epoch": 0.41792951221015245, + "step": 9325 + }, + { + "loss": 14.0797, + "grad_norm": 1.8707098960876465, + "learning_rate": 0.0005, + "epoch": 0.4181536031014179, + "step": 9330 + }, + { + "loss": 13.9541, + "grad_norm": 1.8629024028778076, + "learning_rate": 0.0005, + "epoch": 0.41837769399268343, + "step": 9335 + }, + { + "loss": 14.0776, + "grad_norm": 2.0101118087768555, + "learning_rate": 0.0005, + "epoch": 0.41860178488394895, + "step": 9340 + }, + { + "loss": 14.0206, + "grad_norm": 1.778620958328247, + "learning_rate": 0.0005, + "epoch": 0.4188258757752144, + "step": 9345 + }, + { + "loss": 14.0152, + "grad_norm": 1.9611141681671143, + "learning_rate": 0.0005, + "epoch": 0.4190499666664799, + "step": 9350 + }, + { + "loss": 13.9832, + "grad_norm": 1.933802604675293, + "learning_rate": 0.0005, + "epoch": 0.41927405755774544, + "step": 9355 + }, + { + "loss": 14.1467, + "grad_norm": 2.054326057434082, + "learning_rate": 0.0005, + "epoch": 0.4194981484490109, + "step": 9360 + }, + { + "loss": 13.9849, + "grad_norm": 1.8363392353057861, + "learning_rate": 0.0005, + "epoch": 0.4197222393402764, + "step": 9365 + }, + { + "loss": 14.1202, + "grad_norm": 1.9941134452819824, + "learning_rate": 0.0005, + "epoch": 0.41994633023154193, + "step": 9370 + }, + { + "loss": 14.1037, + "grad_norm": 2.0817792415618896, + "learning_rate": 0.0005, + "epoch": 0.4201704211228074, + "step": 9375 + }, + { + "loss": 14.0734, + "grad_norm": 1.8130356073379517, + "learning_rate": 0.0005, + "epoch": 0.4203945120140729, + "step": 9380 + }, + { + "loss": 14.036, + "grad_norm": 1.88438880443573, + "learning_rate": 0.0005, + "epoch": 0.4206186029053384, + "step": 9385 + }, + { + "loss": 14.0426, + "grad_norm": 1.95289146900177, + "learning_rate": 0.0005, + "epoch": 0.4208426937966039, + "step": 9390 + }, + { + "loss": 14.0527, + "grad_norm": 2.1133406162261963, + "learning_rate": 0.0005, + "epoch": 0.4210667846878694, + "step": 9395 + }, + { + "loss": 14.0653, + "grad_norm": 2.0192480087280273, + "learning_rate": 0.0005, + "epoch": 0.4212908755791349, + "step": 9400 + }, + { + "loss": 14.0586, + "grad_norm": 2.0362696647644043, + "learning_rate": 0.0005, + "epoch": 0.4215149664704004, + "step": 9405 + }, + { + "loss": 14.0897, + "grad_norm": 2.0646374225616455, + "learning_rate": 0.0005, + "epoch": 0.4217390573616659, + "step": 9410 + }, + { + "loss": 13.9908, + "grad_norm": 1.804474949836731, + "learning_rate": 0.0005, + "epoch": 0.4219631482529314, + "step": 9415 + }, + { + "loss": 13.9978, + "grad_norm": 1.795154333114624, + "learning_rate": 0.0005, + "epoch": 0.42218723914419687, + "step": 9420 + }, + { + "loss": 13.9795, + "grad_norm": 1.8122735023498535, + "learning_rate": 0.0005, + "epoch": 0.4224113300354624, + "step": 9425 + }, + { + "loss": 14.1127, + "grad_norm": 2.0129716396331787, + "learning_rate": 0.0005, + "epoch": 0.4226354209267279, + "step": 9430 + }, + { + "loss": 14.0821, + "grad_norm": 1.7567360401153564, + "learning_rate": 0.0005, + "epoch": 0.42285951181799336, + "step": 9435 + }, + { + "loss": 14.0475, + "grad_norm": 1.889049768447876, + "learning_rate": 0.0005, + "epoch": 0.4230836027092589, + "step": 9440 + }, + { + "loss": 14.0047, + "grad_norm": 1.8952374458312988, + "learning_rate": 0.0005, + "epoch": 0.4233076936005244, + "step": 9445 + }, + { + "loss": 14.0252, + "grad_norm": 2.077031373977661, + "learning_rate": 0.0005, + "epoch": 0.42353178449178985, + "step": 9450 + }, + { + "loss": 14.0336, + "grad_norm": 1.902687907218933, + "learning_rate": 0.0005, + "epoch": 0.42375587538305537, + "step": 9455 + }, + { + "loss": 14.0308, + "grad_norm": 1.8955004215240479, + "learning_rate": 0.0005, + "epoch": 0.4239799662743209, + "step": 9460 + }, + { + "loss": 14.1439, + "grad_norm": 1.7904077768325806, + "learning_rate": 0.0005, + "epoch": 0.42420405716558635, + "step": 9465 + }, + { + "loss": 14.0121, + "grad_norm": 1.9405328035354614, + "learning_rate": 0.0005, + "epoch": 0.42442814805685186, + "step": 9470 + }, + { + "loss": 14.0604, + "grad_norm": 2.001901388168335, + "learning_rate": 0.0005, + "epoch": 0.4246522389481174, + "step": 9475 + }, + { + "loss": 13.8987, + "grad_norm": 2.1401023864746094, + "learning_rate": 0.0005, + "epoch": 0.42487632983938284, + "step": 9480 + }, + { + "loss": 14.0958, + "grad_norm": 1.8659871816635132, + "learning_rate": 0.0005, + "epoch": 0.42510042073064835, + "step": 9485 + }, + { + "loss": 14.0518, + "grad_norm": 1.9418147802352905, + "learning_rate": 0.0005, + "epoch": 0.42532451162191387, + "step": 9490 + }, + { + "loss": 14.0263, + "grad_norm": 1.788070559501648, + "learning_rate": 0.0005, + "epoch": 0.42554860251317933, + "step": 9495 + }, + { + "loss": 14.0373, + "grad_norm": 1.8509469032287598, + "learning_rate": 0.0005, + "epoch": 0.42577269340444485, + "step": 9500 + }, + { + "eval_loss": 1.7514612674713135, + "eval_runtime": 18.448, + "eval_samples_per_second": 888.118, + "eval_steps_per_second": 7.968, + "epoch": 0.42577269340444485, + "step": 9500 + }, + { + "loss": 13.9943, + "grad_norm": 1.886093020439148, + "learning_rate": 0.0005, + "epoch": 0.42599678429571036, + "step": 9505 + }, + { + "loss": 14.1202, + "grad_norm": 2.0397934913635254, + "learning_rate": 0.0005, + "epoch": 0.4262208751869758, + "step": 9510 + }, + { + "loss": 14.0062, + "grad_norm": 2.0059173107147217, + "learning_rate": 0.0005, + "epoch": 0.42644496607824134, + "step": 9515 + }, + { + "loss": 14.1346, + "grad_norm": 2.0011186599731445, + "learning_rate": 0.0005, + "epoch": 0.42666905696950685, + "step": 9520 + }, + { + "loss": 14.1812, + "grad_norm": 1.9647142887115479, + "learning_rate": 0.0005, + "epoch": 0.4268931478607723, + "step": 9525 + }, + { + "loss": 14.2527, + "grad_norm": 2.0474014282226562, + "learning_rate": 0.0005, + "epoch": 0.42711723875203783, + "step": 9530 + }, + { + "loss": 14.0878, + "grad_norm": 2.0448601245880127, + "learning_rate": 0.0005, + "epoch": 0.42734132964330335, + "step": 9535 + }, + { + "loss": 14.0419, + "grad_norm": 1.9019169807434082, + "learning_rate": 0.0005, + "epoch": 0.4275654205345688, + "step": 9540 + }, + { + "loss": 14.0307, + "grad_norm": 1.9999176263809204, + "learning_rate": 0.0005, + "epoch": 0.4277895114258343, + "step": 9545 + }, + { + "loss": 14.1149, + "grad_norm": 2.0039052963256836, + "learning_rate": 0.0005, + "epoch": 0.42801360231709984, + "step": 9550 + }, + { + "loss": 14.0169, + "grad_norm": 1.9922994375228882, + "learning_rate": 0.0005, + "epoch": 0.4282376932083653, + "step": 9555 + }, + { + "loss": 14.1576, + "grad_norm": 2.1537740230560303, + "learning_rate": 0.0005, + "epoch": 0.4284617840996308, + "step": 9560 + }, + { + "loss": 14.0213, + "grad_norm": 1.8866078853607178, + "learning_rate": 0.0005, + "epoch": 0.42868587499089633, + "step": 9565 + }, + { + "loss": 14.1185, + "grad_norm": 1.8932602405548096, + "learning_rate": 0.0005, + "epoch": 0.4289099658821618, + "step": 9570 + }, + { + "loss": 14.0622, + "grad_norm": 1.8032782077789307, + "learning_rate": 0.0005, + "epoch": 0.4291340567734273, + "step": 9575 + }, + { + "loss": 14.0255, + "grad_norm": 1.8393731117248535, + "learning_rate": 0.0005, + "epoch": 0.4293581476646928, + "step": 9580 + }, + { + "loss": 14.048, + "grad_norm": 1.914337396621704, + "learning_rate": 0.0005, + "epoch": 0.4295822385559583, + "step": 9585 + }, + { + "loss": 14.0042, + "grad_norm": 1.8856425285339355, + "learning_rate": 0.0005, + "epoch": 0.4298063294472238, + "step": 9590 + }, + { + "loss": 14.1046, + "grad_norm": 1.7746888399124146, + "learning_rate": 0.0005, + "epoch": 0.4300304203384893, + "step": 9595 + }, + { + "loss": 14.0323, + "grad_norm": 1.9397006034851074, + "learning_rate": 0.0005, + "epoch": 0.4302545112297548, + "step": 9600 + }, + { + "loss": 14.0858, + "grad_norm": 1.9266762733459473, + "learning_rate": 0.0005, + "epoch": 0.4304786021210203, + "step": 9605 + }, + { + "loss": 14.0142, + "grad_norm": 1.9198224544525146, + "learning_rate": 0.0005, + "epoch": 0.4307026930122858, + "step": 9610 + }, + { + "loss": 14.0901, + "grad_norm": 1.9672712087631226, + "learning_rate": 0.0005, + "epoch": 0.43092678390355127, + "step": 9615 + }, + { + "loss": 13.9971, + "grad_norm": 1.9564368724822998, + "learning_rate": 0.0005, + "epoch": 0.4311508747948168, + "step": 9620 + }, + { + "loss": 14.0168, + "grad_norm": 1.7687296867370605, + "learning_rate": 0.0005, + "epoch": 0.4313749656860823, + "step": 9625 + }, + { + "loss": 14.0594, + "grad_norm": 1.8282158374786377, + "learning_rate": 0.0005, + "epoch": 0.43159905657734776, + "step": 9630 + }, + { + "loss": 13.99, + "grad_norm": 1.986424207687378, + "learning_rate": 0.0005, + "epoch": 0.4318231474686133, + "step": 9635 + }, + { + "loss": 14.0392, + "grad_norm": 1.755864143371582, + "learning_rate": 0.0005, + "epoch": 0.4320472383598788, + "step": 9640 + }, + { + "loss": 14.1816, + "grad_norm": 1.9651696681976318, + "learning_rate": 0.0005, + "epoch": 0.43227132925114425, + "step": 9645 + }, + { + "loss": 14.1242, + "grad_norm": 1.9259588718414307, + "learning_rate": 0.0005, + "epoch": 0.43249542014240977, + "step": 9650 + }, + { + "loss": 14.0321, + "grad_norm": 2.0099666118621826, + "learning_rate": 0.0005, + "epoch": 0.4327195110336753, + "step": 9655 + }, + { + "loss": 14.1886, + "grad_norm": 2.332737922668457, + "learning_rate": 0.0005, + "epoch": 0.43294360192494075, + "step": 9660 + }, + { + "loss": 14.0657, + "grad_norm": 2.220191240310669, + "learning_rate": 0.0005, + "epoch": 0.43316769281620626, + "step": 9665 + }, + { + "loss": 14.046, + "grad_norm": 1.9909117221832275, + "learning_rate": 0.0005, + "epoch": 0.4333917837074718, + "step": 9670 + }, + { + "loss": 14.0047, + "grad_norm": 1.9637538194656372, + "learning_rate": 0.0005, + "epoch": 0.43361587459873724, + "step": 9675 + }, + { + "loss": 14.104, + "grad_norm": 2.0495548248291016, + "learning_rate": 0.0005, + "epoch": 0.43383996549000275, + "step": 9680 + }, + { + "loss": 14.0572, + "grad_norm": 2.065443277359009, + "learning_rate": 0.0005, + "epoch": 0.4340640563812682, + "step": 9685 + }, + { + "loss": 13.9782, + "grad_norm": 1.896341323852539, + "learning_rate": 0.0005, + "epoch": 0.43428814727253373, + "step": 9690 + }, + { + "loss": 14.0158, + "grad_norm": 2.0682294368743896, + "learning_rate": 0.0005, + "epoch": 0.43451223816379925, + "step": 9695 + }, + { + "loss": 14.0686, + "grad_norm": 2.1541178226470947, + "learning_rate": 0.0005, + "epoch": 0.4347363290550647, + "step": 9700 + }, + { + "loss": 14.111, + "grad_norm": 2.1676082611083984, + "learning_rate": 0.0005, + "epoch": 0.4349604199463302, + "step": 9705 + }, + { + "loss": 14.0887, + "grad_norm": 2.198476552963257, + "learning_rate": 0.0005, + "epoch": 0.43518451083759574, + "step": 9710 + }, + { + "loss": 14.0078, + "grad_norm": 1.7735761404037476, + "learning_rate": 0.0005, + "epoch": 0.4354086017288612, + "step": 9715 + }, + { + "loss": 14.0178, + "grad_norm": 2.0045340061187744, + "learning_rate": 0.0005, + "epoch": 0.4356326926201267, + "step": 9720 + }, + { + "loss": 14.0358, + "grad_norm": 1.8827794790267944, + "learning_rate": 0.0005, + "epoch": 0.43585678351139223, + "step": 9725 + }, + { + "loss": 14.0842, + "grad_norm": 1.7653541564941406, + "learning_rate": 0.0005, + "epoch": 0.4360808744026577, + "step": 9730 + }, + { + "loss": 14.129, + "grad_norm": 1.8109867572784424, + "learning_rate": 0.0005, + "epoch": 0.4363049652939232, + "step": 9735 + }, + { + "loss": 14.0635, + "grad_norm": 1.8981984853744507, + "learning_rate": 0.0005, + "epoch": 0.4365290561851887, + "step": 9740 + }, + { + "loss": 13.9951, + "grad_norm": 1.8760943412780762, + "learning_rate": 0.0005, + "epoch": 0.4367531470764542, + "step": 9745 + }, + { + "loss": 14.0403, + "grad_norm": 1.8407485485076904, + "learning_rate": 0.0005, + "epoch": 0.4369772379677197, + "step": 9750 + }, + { + "loss": 13.9382, + "grad_norm": 1.9359339475631714, + "learning_rate": 0.0005, + "epoch": 0.4372013288589852, + "step": 9755 + }, + { + "loss": 14.0912, + "grad_norm": 1.8769389390945435, + "learning_rate": 0.0005, + "epoch": 0.4374254197502507, + "step": 9760 + }, + { + "loss": 13.9935, + "grad_norm": 1.6882269382476807, + "learning_rate": 0.0005, + "epoch": 0.4376495106415162, + "step": 9765 + }, + { + "loss": 14.0368, + "grad_norm": 1.8080883026123047, + "learning_rate": 0.0005, + "epoch": 0.4378736015327817, + "step": 9770 + }, + { + "loss": 13.9275, + "grad_norm": 1.9042779207229614, + "learning_rate": 0.0005, + "epoch": 0.43809769242404717, + "step": 9775 + }, + { + "loss": 14.1232, + "grad_norm": 1.9694958925247192, + "learning_rate": 0.0005, + "epoch": 0.4383217833153127, + "step": 9780 + }, + { + "loss": 14.0069, + "grad_norm": 1.853636384010315, + "learning_rate": 0.0005, + "epoch": 0.4385458742065782, + "step": 9785 + }, + { + "loss": 14.0662, + "grad_norm": 1.9663853645324707, + "learning_rate": 0.0005, + "epoch": 0.43876996509784366, + "step": 9790 + }, + { + "loss": 14.0328, + "grad_norm": 2.089175224304199, + "learning_rate": 0.0005, + "epoch": 0.4389940559891092, + "step": 9795 + }, + { + "loss": 14.0414, + "grad_norm": 1.8818589448928833, + "learning_rate": 0.0005, + "epoch": 0.4392181468803747, + "step": 9800 + }, + { + "loss": 14.0658, + "grad_norm": 1.8281285762786865, + "learning_rate": 0.0005, + "epoch": 0.43944223777164015, + "step": 9805 + }, + { + "loss": 14.1294, + "grad_norm": 2.1331112384796143, + "learning_rate": 0.0005, + "epoch": 0.43966632866290567, + "step": 9810 + }, + { + "loss": 14.1253, + "grad_norm": 2.412768602371216, + "learning_rate": 0.0005, + "epoch": 0.4398904195541712, + "step": 9815 + }, + { + "loss": 14.0673, + "grad_norm": 1.8944461345672607, + "learning_rate": 0.0005, + "epoch": 0.44011451044543665, + "step": 9820 + }, + { + "loss": 14.0721, + "grad_norm": 1.772596836090088, + "learning_rate": 0.0005, + "epoch": 0.44033860133670216, + "step": 9825 + }, + { + "loss": 13.9508, + "grad_norm": 1.7824463844299316, + "learning_rate": 0.0005, + "epoch": 0.4405626922279677, + "step": 9830 + }, + { + "loss": 14.0619, + "grad_norm": 1.9199808835983276, + "learning_rate": 0.0005, + "epoch": 0.44078678311923314, + "step": 9835 + }, + { + "loss": 13.9435, + "grad_norm": 1.7367388010025024, + "learning_rate": 0.0005, + "epoch": 0.44101087401049865, + "step": 9840 + }, + { + "loss": 14.0238, + "grad_norm": 1.9348160028457642, + "learning_rate": 0.0005, + "epoch": 0.44123496490176417, + "step": 9845 + }, + { + "loss": 14.1016, + "grad_norm": 1.9653925895690918, + "learning_rate": 0.0005, + "epoch": 0.44145905579302963, + "step": 9850 + }, + { + "loss": 14.0995, + "grad_norm": 1.7807902097702026, + "learning_rate": 0.0005, + "epoch": 0.44168314668429515, + "step": 9855 + }, + { + "loss": 14.0309, + "grad_norm": 1.9768279790878296, + "learning_rate": 0.0005, + "epoch": 0.44190723757556066, + "step": 9860 + }, + { + "loss": 14.026, + "grad_norm": 1.8459382057189941, + "learning_rate": 0.0005, + "epoch": 0.4421313284668261, + "step": 9865 + }, + { + "loss": 13.9491, + "grad_norm": 1.765913963317871, + "learning_rate": 0.0005, + "epoch": 0.44235541935809164, + "step": 9870 + }, + { + "loss": 14.0261, + "grad_norm": 1.9289934635162354, + "learning_rate": 0.0005, + "epoch": 0.44257951024935716, + "step": 9875 + }, + { + "loss": 14.0231, + "grad_norm": 2.0702064037323, + "learning_rate": 0.0005, + "epoch": 0.4428036011406226, + "step": 9880 + }, + { + "loss": 14.0428, + "grad_norm": 1.9474433660507202, + "learning_rate": 0.0005, + "epoch": 0.44302769203188813, + "step": 9885 + }, + { + "loss": 14.0147, + "grad_norm": 1.930172085762024, + "learning_rate": 0.0005, + "epoch": 0.44325178292315365, + "step": 9890 + }, + { + "loss": 14.0694, + "grad_norm": 2.0727486610412598, + "learning_rate": 0.0005, + "epoch": 0.4434758738144191, + "step": 9895 + }, + { + "loss": 14.1825, + "grad_norm": 1.8144179582595825, + "learning_rate": 0.0005, + "epoch": 0.4436999647056846, + "step": 9900 + }, + { + "loss": 13.9797, + "grad_norm": 1.8270831108093262, + "learning_rate": 0.0005, + "epoch": 0.44392405559695014, + "step": 9905 + }, + { + "loss": 14.0102, + "grad_norm": 1.9958640336990356, + "learning_rate": 0.0005, + "epoch": 0.4441481464882156, + "step": 9910 + }, + { + "loss": 14.0815, + "grad_norm": 1.8873703479766846, + "learning_rate": 0.0005, + "epoch": 0.4443722373794811, + "step": 9915 + }, + { + "loss": 13.9492, + "grad_norm": 2.3722341060638428, + "learning_rate": 0.0005, + "epoch": 0.44459632827074663, + "step": 9920 + }, + { + "loss": 14.0492, + "grad_norm": 1.8521634340286255, + "learning_rate": 0.0005, + "epoch": 0.4448204191620121, + "step": 9925 + }, + { + "loss": 14.1128, + "grad_norm": 1.9003937244415283, + "learning_rate": 0.0005, + "epoch": 0.4450445100532776, + "step": 9930 + }, + { + "loss": 14.0259, + "grad_norm": 1.7331500053405762, + "learning_rate": 0.0005, + "epoch": 0.4452686009445431, + "step": 9935 + }, + { + "loss": 13.979, + "grad_norm": 1.9712514877319336, + "learning_rate": 0.0005, + "epoch": 0.4454926918358086, + "step": 9940 + }, + { + "loss": 14.0766, + "grad_norm": 1.9684165716171265, + "learning_rate": 0.0005, + "epoch": 0.4457167827270741, + "step": 9945 + }, + { + "loss": 14.0004, + "grad_norm": 1.961761474609375, + "learning_rate": 0.0005, + "epoch": 0.4459408736183396, + "step": 9950 + }, + { + "loss": 14.1178, + "grad_norm": 2.1732451915740967, + "learning_rate": 0.0005, + "epoch": 0.4461649645096051, + "step": 9955 + }, + { + "loss": 14.0609, + "grad_norm": 1.9633890390396118, + "learning_rate": 0.0005, + "epoch": 0.4463890554008706, + "step": 9960 + }, + { + "loss": 14.0079, + "grad_norm": 1.9380450248718262, + "learning_rate": 0.0005, + "epoch": 0.4466131462921361, + "step": 9965 + }, + { + "loss": 14.1057, + "grad_norm": 2.074018955230713, + "learning_rate": 0.0005, + "epoch": 0.44683723718340157, + "step": 9970 + }, + { + "loss": 14.1039, + "grad_norm": 1.8502141237258911, + "learning_rate": 0.0005, + "epoch": 0.4470613280746671, + "step": 9975 + }, + { + "loss": 14.0487, + "grad_norm": 2.16658616065979, + "learning_rate": 0.0005, + "epoch": 0.4472854189659326, + "step": 9980 + }, + { + "loss": 13.9841, + "grad_norm": 1.9512484073638916, + "learning_rate": 0.0005, + "epoch": 0.44750950985719806, + "step": 9985 + }, + { + "loss": 14.0446, + "grad_norm": 1.728674292564392, + "learning_rate": 0.0005, + "epoch": 0.4477336007484636, + "step": 9990 + }, + { + "loss": 13.9403, + "grad_norm": 1.8797376155853271, + "learning_rate": 0.0005, + "epoch": 0.4479576916397291, + "step": 9995 + }, + { + "loss": 13.9627, + "grad_norm": 1.7768317461013794, + "learning_rate": 0.0005, + "epoch": 0.44818178253099455, + "step": 10000 + }, + { + "eval_loss": 1.7535099983215332, + "eval_runtime": 18.6103, + "eval_samples_per_second": 880.37, + "eval_steps_per_second": 7.899, + "epoch": 0.44818178253099455, + "step": 10000 + }, + { + "loss": 14.0345, + "grad_norm": 1.8116319179534912, + "learning_rate": 0.0005, + "epoch": 0.44840587342226007, + "step": 10005 + }, + { + "loss": 13.9768, + "grad_norm": 1.941550612449646, + "learning_rate": 0.0005, + "epoch": 0.4486299643135256, + "step": 10010 + }, + { + "loss": 13.9213, + "grad_norm": 2.029679298400879, + "learning_rate": 0.0005, + "epoch": 0.44885405520479105, + "step": 10015 + }, + { + "loss": 14.0488, + "grad_norm": 1.9416868686676025, + "learning_rate": 0.0005, + "epoch": 0.44907814609605656, + "step": 10020 + }, + { + "loss": 13.9741, + "grad_norm": 1.7495111227035522, + "learning_rate": 0.0005, + "epoch": 0.4493022369873221, + "step": 10025 + }, + { + "loss": 14.0472, + "grad_norm": 1.8708152770996094, + "learning_rate": 0.0005, + "epoch": 0.44952632787858754, + "step": 10030 + }, + { + "loss": 14.0744, + "grad_norm": 1.8809341192245483, + "learning_rate": 0.0005, + "epoch": 0.44975041876985306, + "step": 10035 + }, + { + "loss": 14.1143, + "grad_norm": 1.8372942209243774, + "learning_rate": 0.0005, + "epoch": 0.44997450966111857, + "step": 10040 + }, + { + "loss": 13.9523, + "grad_norm": 1.7713371515274048, + "learning_rate": 0.0005, + "epoch": 0.45019860055238403, + "step": 10045 + }, + { + "loss": 14.0756, + "grad_norm": 1.8108537197113037, + "learning_rate": 0.0005, + "epoch": 0.45042269144364955, + "step": 10050 + }, + { + "loss": 13.9517, + "grad_norm": 1.839207410812378, + "learning_rate": 0.0005, + "epoch": 0.45064678233491506, + "step": 10055 + }, + { + "loss": 14.0217, + "grad_norm": 2.060572385787964, + "learning_rate": 0.0005, + "epoch": 0.4508708732261805, + "step": 10060 + }, + { + "loss": 14.0306, + "grad_norm": 1.7858555316925049, + "learning_rate": 0.0005, + "epoch": 0.45109496411744604, + "step": 10065 + }, + { + "loss": 14.0025, + "grad_norm": 1.8673533201217651, + "learning_rate": 0.0005, + "epoch": 0.45131905500871156, + "step": 10070 + }, + { + "loss": 14.0783, + "grad_norm": 2.0727386474609375, + "learning_rate": 0.0005, + "epoch": 0.451543145899977, + "step": 10075 + }, + { + "loss": 14.042, + "grad_norm": 1.91311514377594, + "learning_rate": 0.0005, + "epoch": 0.45176723679124253, + "step": 10080 + }, + { + "loss": 14.0383, + "grad_norm": 1.808239221572876, + "learning_rate": 0.0005, + "epoch": 0.45199132768250805, + "step": 10085 + }, + { + "loss": 14.0227, + "grad_norm": 1.7587231397628784, + "learning_rate": 0.0005, + "epoch": 0.4522154185737735, + "step": 10090 + }, + { + "loss": 14.0389, + "grad_norm": 1.8921979665756226, + "learning_rate": 0.0005, + "epoch": 0.452439509465039, + "step": 10095 + }, + { + "loss": 14.0514, + "grad_norm": 4.310143947601318, + "learning_rate": 0.0005, + "epoch": 0.45266360035630454, + "step": 10100 + }, + { + "loss": 14.6969, + "grad_norm": 64.53416442871094, + "learning_rate": 0.0005, + "epoch": 0.45288769124757, + "step": 10105 + }, + { + "loss": 14.4036, + "grad_norm": 2.269564628601074, + "learning_rate": 0.0005, + "epoch": 0.4531117821388355, + "step": 10110 + }, + { + "loss": 14.1395, + "grad_norm": 29.50452995300293, + "learning_rate": 0.0005, + "epoch": 0.45333587303010103, + "step": 10115 + }, + { + "loss": 14.1476, + "grad_norm": 2.0817313194274902, + "learning_rate": 0.0005, + "epoch": 0.4535599639213665, + "step": 10120 + }, + { + "loss": 13.938, + "grad_norm": 2.068413257598877, + "learning_rate": 0.0005, + "epoch": 0.453784054812632, + "step": 10125 + }, + { + "loss": 14.0095, + "grad_norm": 1.9635816812515259, + "learning_rate": 0.0005, + "epoch": 0.4540081457038975, + "step": 10130 + }, + { + "loss": 13.9837, + "grad_norm": 1.942551612854004, + "learning_rate": 0.0005, + "epoch": 0.454232236595163, + "step": 10135 + }, + { + "loss": 14.0523, + "grad_norm": 84.11277770996094, + "learning_rate": 0.0005, + "epoch": 0.4544563274864285, + "step": 10140 + }, + { + "loss": 13.989, + "grad_norm": 2.137803792953491, + "learning_rate": 0.0005, + "epoch": 0.454680418377694, + "step": 10145 + }, + { + "loss": 14.1064, + "grad_norm": 2.055637836456299, + "learning_rate": 0.0005, + "epoch": 0.4549045092689595, + "step": 10150 + }, + { + "loss": 14.0542, + "grad_norm": 1.9040488004684448, + "learning_rate": 0.0005, + "epoch": 0.455128600160225, + "step": 10155 + }, + { + "loss": 14.0577, + "grad_norm": 10.268532752990723, + "learning_rate": 0.0005, + "epoch": 0.4553526910514905, + "step": 10160 + }, + { + "loss": 14.1002, + "grad_norm": 1.9174935817718506, + "learning_rate": 0.0005, + "epoch": 0.45557678194275597, + "step": 10165 + }, + { + "loss": 14.0308, + "grad_norm": 1.8861088752746582, + "learning_rate": 0.0005, + "epoch": 0.4558008728340215, + "step": 10170 + }, + { + "loss": 14.0439, + "grad_norm": 2.4005439281463623, + "learning_rate": 0.0005, + "epoch": 0.456024963725287, + "step": 10175 + }, + { + "loss": 14.038, + "grad_norm": 1.8772950172424316, + "learning_rate": 0.0005, + "epoch": 0.45624905461655246, + "step": 10180 + }, + { + "loss": 13.9871, + "grad_norm": 1.927895426750183, + "learning_rate": 0.0005, + "epoch": 0.456473145507818, + "step": 10185 + }, + { + "loss": 13.98, + "grad_norm": 1.8765352964401245, + "learning_rate": 0.0005, + "epoch": 0.4566972363990835, + "step": 10190 + }, + { + "loss": 14.1258, + "grad_norm": 1.8601654767990112, + "learning_rate": 0.0005, + "epoch": 0.45692132729034896, + "step": 10195 + }, + { + "loss": 13.9915, + "grad_norm": 1.9006295204162598, + "learning_rate": 0.0005, + "epoch": 0.45714541818161447, + "step": 10200 + }, + { + "loss": 14.0611, + "grad_norm": 1.9740972518920898, + "learning_rate": 0.0005, + "epoch": 0.45736950907288, + "step": 10205 + }, + { + "loss": 14.055, + "grad_norm": 1.8645578622817993, + "learning_rate": 0.0005, + "epoch": 0.45759359996414545, + "step": 10210 + }, + { + "loss": 14.0846, + "grad_norm": 1.9724942445755005, + "learning_rate": 0.0005, + "epoch": 0.45781769085541096, + "step": 10215 + }, + { + "loss": 14.0726, + "grad_norm": 2.019284725189209, + "learning_rate": 0.0005, + "epoch": 0.4580417817466765, + "step": 10220 + }, + { + "loss": 14.0259, + "grad_norm": 2.0942375659942627, + "learning_rate": 0.0005, + "epoch": 0.45826587263794194, + "step": 10225 + }, + { + "loss": 14.0304, + "grad_norm": 2.0027451515197754, + "learning_rate": 0.0005, + "epoch": 0.45848996352920746, + "step": 10230 + }, + { + "loss": 13.9557, + "grad_norm": 1.8633819818496704, + "learning_rate": 0.0005, + "epoch": 0.4587140544204729, + "step": 10235 + }, + { + "loss": 14.113, + "grad_norm": 1.8201923370361328, + "learning_rate": 0.0005, + "epoch": 0.45893814531173843, + "step": 10240 + }, + { + "loss": 14.0944, + "grad_norm": 1.6903871297836304, + "learning_rate": 0.0005, + "epoch": 0.45916223620300395, + "step": 10245 + }, + { + "loss": 14.0911, + "grad_norm": 2.0811848640441895, + "learning_rate": 0.0005, + "epoch": 0.4593863270942694, + "step": 10250 + }, + { + "loss": 14.0192, + "grad_norm": 2.03033447265625, + "learning_rate": 0.0005, + "epoch": 0.4596104179855349, + "step": 10255 + }, + { + "loss": 14.0824, + "grad_norm": 2.0053560733795166, + "learning_rate": 0.0005, + "epoch": 0.45983450887680044, + "step": 10260 + }, + { + "loss": 14.1325, + "grad_norm": 1.9983246326446533, + "learning_rate": 0.0005, + "epoch": 0.4600585997680659, + "step": 10265 + }, + { + "loss": 14.0628, + "grad_norm": 1.8627090454101562, + "learning_rate": 0.0005, + "epoch": 0.4602826906593314, + "step": 10270 + }, + { + "loss": 14.0698, + "grad_norm": 1.819238305091858, + "learning_rate": 0.0005, + "epoch": 0.46050678155059693, + "step": 10275 + }, + { + "loss": 14.087, + "grad_norm": 1.8443467617034912, + "learning_rate": 0.0005, + "epoch": 0.4607308724418624, + "step": 10280 + }, + { + "loss": 14.0054, + "grad_norm": 1.9102563858032227, + "learning_rate": 0.0005, + "epoch": 0.4609549633331279, + "step": 10285 + }, + { + "loss": 13.9632, + "grad_norm": 1.9677178859710693, + "learning_rate": 0.0005, + "epoch": 0.4611790542243934, + "step": 10290 + }, + { + "loss": 14.105, + "grad_norm": 1.943468451499939, + "learning_rate": 0.0005, + "epoch": 0.4614031451156589, + "step": 10295 + }, + { + "loss": 14.0248, + "grad_norm": 1.8622480630874634, + "learning_rate": 0.0005, + "epoch": 0.4616272360069244, + "step": 10300 + }, + { + "loss": 14.0241, + "grad_norm": 1.9953628778457642, + "learning_rate": 0.0005, + "epoch": 0.4618513268981899, + "step": 10305 + }, + { + "loss": 14.1536, + "grad_norm": 2.2701895236968994, + "learning_rate": 0.0005, + "epoch": 0.4620754177894554, + "step": 10310 + }, + { + "loss": 14.0628, + "grad_norm": 2.028715133666992, + "learning_rate": 0.0005, + "epoch": 0.4622995086807209, + "step": 10315 + }, + { + "loss": 14.0113, + "grad_norm": 1.8300288915634155, + "learning_rate": 0.0005, + "epoch": 0.4625235995719864, + "step": 10320 + }, + { + "loss": 14.1851, + "grad_norm": 1.9454532861709595, + "learning_rate": 0.0005, + "epoch": 0.46274769046325187, + "step": 10325 + }, + { + "loss": 14.098, + "grad_norm": 2.0312724113464355, + "learning_rate": 0.0005, + "epoch": 0.4629717813545174, + "step": 10330 + }, + { + "loss": 14.0318, + "grad_norm": 1.91213858127594, + "learning_rate": 0.0005, + "epoch": 0.4631958722457829, + "step": 10335 + }, + { + "loss": 14.0608, + "grad_norm": 1.9075995683670044, + "learning_rate": 0.0005, + "epoch": 0.46341996313704836, + "step": 10340 + }, + { + "loss": 13.9737, + "grad_norm": 1.7976598739624023, + "learning_rate": 0.0005, + "epoch": 0.4636440540283139, + "step": 10345 + }, + { + "loss": 14.0832, + "grad_norm": 1.8574029207229614, + "learning_rate": 0.0005, + "epoch": 0.4638681449195794, + "step": 10350 + }, + { + "loss": 13.9781, + "grad_norm": 1.830602765083313, + "learning_rate": 0.0005, + "epoch": 0.46409223581084486, + "step": 10355 + }, + { + "loss": 14.0166, + "grad_norm": 1.8759225606918335, + "learning_rate": 0.0005, + "epoch": 0.46431632670211037, + "step": 10360 + }, + { + "loss": 14.0674, + "grad_norm": 2.1599314212799072, + "learning_rate": 0.0005, + "epoch": 0.4645404175933759, + "step": 10365 + }, + { + "loss": 14.0889, + "grad_norm": 2.089174270629883, + "learning_rate": 0.0005, + "epoch": 0.46476450848464135, + "step": 10370 + }, + { + "loss": 13.9817, + "grad_norm": 1.879372239112854, + "learning_rate": 0.0005, + "epoch": 0.46498859937590686, + "step": 10375 + }, + { + "loss": 13.9842, + "grad_norm": 1.8365589380264282, + "learning_rate": 0.0005, + "epoch": 0.4652126902671724, + "step": 10380 + }, + { + "loss": 14.1293, + "grad_norm": 1.8881949186325073, + "learning_rate": 0.0005, + "epoch": 0.46543678115843784, + "step": 10385 + }, + { + "loss": 14.0341, + "grad_norm": 1.8966625928878784, + "learning_rate": 0.0005, + "epoch": 0.46566087204970336, + "step": 10390 + }, + { + "loss": 14.1341, + "grad_norm": 1.953307032585144, + "learning_rate": 0.0005, + "epoch": 0.46588496294096887, + "step": 10395 + }, + { + "loss": 14.0498, + "grad_norm": 1.8317679166793823, + "learning_rate": 0.0005, + "epoch": 0.46610905383223433, + "step": 10400 + }, + { + "loss": 14.0098, + "grad_norm": 1.9041969776153564, + "learning_rate": 0.0005, + "epoch": 0.46633314472349985, + "step": 10405 + }, + { + "loss": 14.0836, + "grad_norm": 2.216858386993408, + "learning_rate": 0.0005, + "epoch": 0.46655723561476536, + "step": 10410 + }, + { + "loss": 14.1337, + "grad_norm": 1.7740803956985474, + "learning_rate": 0.0005, + "epoch": 0.4667813265060308, + "step": 10415 + }, + { + "loss": 14.1292, + "grad_norm": 1.901526927947998, + "learning_rate": 0.0005, + "epoch": 0.46700541739729634, + "step": 10420 + }, + { + "loss": 14.0508, + "grad_norm": 2.048635244369507, + "learning_rate": 0.0005, + "epoch": 0.46722950828856186, + "step": 10425 + }, + { + "loss": 14.1055, + "grad_norm": 1.9151564836502075, + "learning_rate": 0.0005, + "epoch": 0.4674535991798273, + "step": 10430 + }, + { + "loss": 14.0799, + "grad_norm": 1.8743534088134766, + "learning_rate": 0.0005, + "epoch": 0.46767769007109283, + "step": 10435 + }, + { + "loss": 14.0249, + "grad_norm": 2.037383794784546, + "learning_rate": 0.0005, + "epoch": 0.46790178096235835, + "step": 10440 + }, + { + "loss": 14.1092, + "grad_norm": 1.9318444728851318, + "learning_rate": 0.0005, + "epoch": 0.4681258718536238, + "step": 10445 + }, + { + "loss": 14.1141, + "grad_norm": 1.904248595237732, + "learning_rate": 0.0005, + "epoch": 0.4683499627448893, + "step": 10450 + }, + { + "loss": 13.9967, + "grad_norm": 1.9173495769500732, + "learning_rate": 0.0005, + "epoch": 0.46857405363615484, + "step": 10455 + }, + { + "loss": 14.0076, + "grad_norm": 2.2218124866485596, + "learning_rate": 0.0005, + "epoch": 0.4687981445274203, + "step": 10460 + }, + { + "loss": 14.0625, + "grad_norm": 2.1360721588134766, + "learning_rate": 0.0005, + "epoch": 0.4690222354186858, + "step": 10465 + }, + { + "loss": 14.1023, + "grad_norm": 1.7907522916793823, + "learning_rate": 0.0005, + "epoch": 0.46924632630995133, + "step": 10470 + }, + { + "loss": 14.0633, + "grad_norm": 1.9924049377441406, + "learning_rate": 0.0005, + "epoch": 0.4694704172012168, + "step": 10475 + }, + { + "loss": 14.0495, + "grad_norm": 1.9566833972930908, + "learning_rate": 0.0005, + "epoch": 0.4696945080924823, + "step": 10480 + }, + { + "loss": 14.0408, + "grad_norm": 2.0456693172454834, + "learning_rate": 0.0005, + "epoch": 0.4699185989837478, + "step": 10485 + }, + { + "loss": 14.0525, + "grad_norm": 1.915246844291687, + "learning_rate": 0.0005, + "epoch": 0.4701426898750133, + "step": 10490 + }, + { + "loss": 14.0103, + "grad_norm": 1.8662196397781372, + "learning_rate": 0.0005, + "epoch": 0.4703667807662788, + "step": 10495 + }, + { + "loss": 14.0615, + "grad_norm": 2.1604061126708984, + "learning_rate": 0.0005, + "epoch": 0.4705908716575443, + "step": 10500 + }, + { + "eval_loss": 1.7521030902862549, + "eval_runtime": 18.4061, + "eval_samples_per_second": 890.139, + "eval_steps_per_second": 7.986, + "epoch": 0.4705908716575443, + "step": 10500 + }, + { + "loss": 14.1009, + "grad_norm": 2.012334108352661, + "learning_rate": 0.0005, + "epoch": 0.4708149625488098, + "step": 10505 + }, + { + "loss": 14.0878, + "grad_norm": 2.1435182094573975, + "learning_rate": 0.0005, + "epoch": 0.4710390534400753, + "step": 10510 + }, + { + "loss": 13.9449, + "grad_norm": 2.0218379497528076, + "learning_rate": 0.0005, + "epoch": 0.4712631443313408, + "step": 10515 + }, + { + "loss": 14.1722, + "grad_norm": 1.9891680479049683, + "learning_rate": 0.0005, + "epoch": 0.47148723522260627, + "step": 10520 + }, + { + "loss": 13.9856, + "grad_norm": 1.8381295204162598, + "learning_rate": 0.0005, + "epoch": 0.4717113261138718, + "step": 10525 + }, + { + "loss": 14.0003, + "grad_norm": 1.849729299545288, + "learning_rate": 0.0005, + "epoch": 0.4719354170051373, + "step": 10530 + }, + { + "loss": 13.9588, + "grad_norm": 1.8582322597503662, + "learning_rate": 0.0005, + "epoch": 0.47215950789640276, + "step": 10535 + }, + { + "loss": 13.9515, + "grad_norm": 1.9138835668563843, + "learning_rate": 0.0005, + "epoch": 0.4723835987876683, + "step": 10540 + }, + { + "loss": 14.0562, + "grad_norm": 1.8926070928573608, + "learning_rate": 0.0005, + "epoch": 0.4726076896789338, + "step": 10545 + }, + { + "loss": 13.957, + "grad_norm": 1.9087103605270386, + "learning_rate": 0.0005, + "epoch": 0.47283178057019926, + "step": 10550 + }, + { + "loss": 13.9708, + "grad_norm": 2.0126662254333496, + "learning_rate": 0.0005, + "epoch": 0.47305587146146477, + "step": 10555 + }, + { + "loss": 13.9551, + "grad_norm": 1.951674461364746, + "learning_rate": 0.0005, + "epoch": 0.4732799623527303, + "step": 10560 + }, + { + "loss": 14.1009, + "grad_norm": 1.8163164854049683, + "learning_rate": 0.0005, + "epoch": 0.47350405324399575, + "step": 10565 + }, + { + "loss": 14.1147, + "grad_norm": 2.0408079624176025, + "learning_rate": 0.0005, + "epoch": 0.47372814413526126, + "step": 10570 + }, + { + "loss": 14.0755, + "grad_norm": 2.0262444019317627, + "learning_rate": 0.0005, + "epoch": 0.4739522350265268, + "step": 10575 + }, + { + "loss": 14.1181, + "grad_norm": 1.8418506383895874, + "learning_rate": 0.0005, + "epoch": 0.47417632591779224, + "step": 10580 + }, + { + "loss": 13.9996, + "grad_norm": 1.8340831995010376, + "learning_rate": 0.0005, + "epoch": 0.47440041680905776, + "step": 10585 + }, + { + "loss": 14.0614, + "grad_norm": 1.894936203956604, + "learning_rate": 0.0005, + "epoch": 0.4746245077003233, + "step": 10590 + }, + { + "loss": 13.9815, + "grad_norm": 1.7638943195343018, + "learning_rate": 0.0005, + "epoch": 0.47484859859158873, + "step": 10595 + }, + { + "loss": 14.115, + "grad_norm": 2.0073249340057373, + "learning_rate": 0.0005, + "epoch": 0.47507268948285425, + "step": 10600 + }, + { + "loss": 14.0861, + "grad_norm": 1.953412413597107, + "learning_rate": 0.0005, + "epoch": 0.47529678037411976, + "step": 10605 + }, + { + "loss": 13.9858, + "grad_norm": 1.7932475805282593, + "learning_rate": 0.0005, + "epoch": 0.4755208712653852, + "step": 10610 + }, + { + "loss": 13.9297, + "grad_norm": 1.9420661926269531, + "learning_rate": 0.0005, + "epoch": 0.47574496215665074, + "step": 10615 + }, + { + "loss": 14.0705, + "grad_norm": 1.8492196798324585, + "learning_rate": 0.0005, + "epoch": 0.47596905304791626, + "step": 10620 + }, + { + "loss": 14.06, + "grad_norm": 1.8225876092910767, + "learning_rate": 0.0005, + "epoch": 0.4761931439391817, + "step": 10625 + }, + { + "loss": 14.0748, + "grad_norm": 1.7609186172485352, + "learning_rate": 0.0005, + "epoch": 0.47641723483044723, + "step": 10630 + }, + { + "loss": 13.9309, + "grad_norm": 1.9755157232284546, + "learning_rate": 0.0005, + "epoch": 0.47664132572171275, + "step": 10635 + }, + { + "loss": 14.0347, + "grad_norm": 1.896437644958496, + "learning_rate": 0.0005, + "epoch": 0.4768654166129782, + "step": 10640 + }, + { + "loss": 14.029, + "grad_norm": 1.8733117580413818, + "learning_rate": 0.0005, + "epoch": 0.4770895075042437, + "step": 10645 + }, + { + "loss": 14.0766, + "grad_norm": 2.0224342346191406, + "learning_rate": 0.0005, + "epoch": 0.47731359839550924, + "step": 10650 + }, + { + "loss": 13.886, + "grad_norm": 1.963172197341919, + "learning_rate": 0.0005, + "epoch": 0.4775376892867747, + "step": 10655 + }, + { + "loss": 14.0283, + "grad_norm": 1.968991994857788, + "learning_rate": 0.0005, + "epoch": 0.4777617801780402, + "step": 10660 + }, + { + "loss": 14.0249, + "grad_norm": 1.9693257808685303, + "learning_rate": 0.0005, + "epoch": 0.47798587106930573, + "step": 10665 + }, + { + "loss": 13.9854, + "grad_norm": 2.002882242202759, + "learning_rate": 0.0005, + "epoch": 0.4782099619605712, + "step": 10670 + }, + { + "loss": 14.0813, + "grad_norm": 1.8276903629302979, + "learning_rate": 0.0005, + "epoch": 0.4784340528518367, + "step": 10675 + }, + { + "loss": 14.0866, + "grad_norm": 1.9171091318130493, + "learning_rate": 0.0005, + "epoch": 0.4786581437431022, + "step": 10680 + }, + { + "loss": 14.0089, + "grad_norm": 1.8800194263458252, + "learning_rate": 0.0005, + "epoch": 0.4788822346343677, + "step": 10685 + }, + { + "loss": 13.999, + "grad_norm": 1.8443156480789185, + "learning_rate": 0.0005, + "epoch": 0.4791063255256332, + "step": 10690 + }, + { + "loss": 14.0472, + "grad_norm": 1.7933176755905151, + "learning_rate": 0.0005, + "epoch": 0.4793304164168987, + "step": 10695 + }, + { + "loss": 14.0846, + "grad_norm": 2.079235315322876, + "learning_rate": 0.0005, + "epoch": 0.4795545073081642, + "step": 10700 + }, + { + "loss": 14.0334, + "grad_norm": 1.8366498947143555, + "learning_rate": 0.0005, + "epoch": 0.4797785981994297, + "step": 10705 + }, + { + "loss": 14.0646, + "grad_norm": 1.997281789779663, + "learning_rate": 0.0005, + "epoch": 0.4800026890906952, + "step": 10710 + }, + { + "loss": 13.9765, + "grad_norm": 2.008910655975342, + "learning_rate": 0.0005, + "epoch": 0.48022677998196067, + "step": 10715 + }, + { + "loss": 14.0477, + "grad_norm": 1.8822402954101562, + "learning_rate": 0.0005, + "epoch": 0.4804508708732262, + "step": 10720 + }, + { + "loss": 14.0811, + "grad_norm": 2.069028854370117, + "learning_rate": 0.0005, + "epoch": 0.4806749617644917, + "step": 10725 + }, + { + "loss": 13.9759, + "grad_norm": 2.019068479537964, + "learning_rate": 0.0005, + "epoch": 0.48089905265575716, + "step": 10730 + }, + { + "loss": 14.0791, + "grad_norm": 1.9277989864349365, + "learning_rate": 0.0005, + "epoch": 0.4811231435470227, + "step": 10735 + }, + { + "loss": 14.0211, + "grad_norm": 2.0322346687316895, + "learning_rate": 0.0005, + "epoch": 0.4813472344382882, + "step": 10740 + }, + { + "loss": 14.1959, + "grad_norm": 2.0111358165740967, + "learning_rate": 0.0005, + "epoch": 0.48157132532955366, + "step": 10745 + }, + { + "loss": 14.0045, + "grad_norm": 2.2163989543914795, + "learning_rate": 0.0005, + "epoch": 0.4817954162208192, + "step": 10750 + }, + { + "loss": 13.9323, + "grad_norm": 2.0738158226013184, + "learning_rate": 0.0005, + "epoch": 0.4820195071120847, + "step": 10755 + }, + { + "loss": 14.0603, + "grad_norm": 1.9725066423416138, + "learning_rate": 0.0005, + "epoch": 0.48224359800335015, + "step": 10760 + }, + { + "loss": 14.0725, + "grad_norm": 2.052971601486206, + "learning_rate": 0.0005, + "epoch": 0.48246768889461566, + "step": 10765 + }, + { + "loss": 14.1246, + "grad_norm": 2.0623252391815186, + "learning_rate": 0.0005, + "epoch": 0.4826917797858812, + "step": 10770 + }, + { + "loss": 14.0091, + "grad_norm": 1.806399941444397, + "learning_rate": 0.0005, + "epoch": 0.48291587067714664, + "step": 10775 + }, + { + "loss": 14.011, + "grad_norm": 1.7548433542251587, + "learning_rate": 0.0005, + "epoch": 0.48313996156841216, + "step": 10780 + }, + { + "loss": 14.0414, + "grad_norm": 1.7872982025146484, + "learning_rate": 0.0005, + "epoch": 0.4833640524596776, + "step": 10785 + }, + { + "loss": 14.0229, + "grad_norm": 1.8104687929153442, + "learning_rate": 0.0005, + "epoch": 0.48358814335094313, + "step": 10790 + }, + { + "loss": 13.9876, + "grad_norm": 1.7846254110336304, + "learning_rate": 0.0005, + "epoch": 0.48381223424220865, + "step": 10795 + }, + { + "loss": 14.0102, + "grad_norm": 1.9332680702209473, + "learning_rate": 0.0005, + "epoch": 0.4840363251334741, + "step": 10800 + }, + { + "loss": 14.1051, + "grad_norm": 1.975170612335205, + "learning_rate": 0.0005, + "epoch": 0.4842604160247396, + "step": 10805 + }, + { + "loss": 13.994, + "grad_norm": 1.882921576499939, + "learning_rate": 0.0005, + "epoch": 0.48448450691600514, + "step": 10810 + }, + { + "loss": 14.0343, + "grad_norm": 1.848868727684021, + "learning_rate": 0.0005, + "epoch": 0.4847085978072706, + "step": 10815 + }, + { + "loss": 13.9719, + "grad_norm": 1.8909270763397217, + "learning_rate": 0.0005, + "epoch": 0.4849326886985361, + "step": 10820 + }, + { + "loss": 14.0679, + "grad_norm": 1.6264833211898804, + "learning_rate": 0.0005, + "epoch": 0.48515677958980163, + "step": 10825 + }, + { + "loss": 14.073, + "grad_norm": 1.9281812906265259, + "learning_rate": 0.0005, + "epoch": 0.4853808704810671, + "step": 10830 + }, + { + "loss": 14.03, + "grad_norm": 2.016878128051758, + "learning_rate": 0.0005, + "epoch": 0.4856049613723326, + "step": 10835 + }, + { + "loss": 14.0051, + "grad_norm": 1.7301148176193237, + "learning_rate": 0.0005, + "epoch": 0.4858290522635981, + "step": 10840 + }, + { + "loss": 14.0598, + "grad_norm": 1.7245323657989502, + "learning_rate": 0.0005, + "epoch": 0.4860531431548636, + "step": 10845 + }, + { + "loss": 14.0583, + "grad_norm": 1.7455264329910278, + "learning_rate": 0.0005, + "epoch": 0.4862772340461291, + "step": 10850 + }, + { + "loss": 14.0273, + "grad_norm": 1.8355683088302612, + "learning_rate": 0.0005, + "epoch": 0.4865013249373946, + "step": 10855 + }, + { + "loss": 13.9864, + "grad_norm": 1.8264118432998657, + "learning_rate": 0.0005, + "epoch": 0.4867254158286601, + "step": 10860 + }, + { + "loss": 14.049, + "grad_norm": 2.033604145050049, + "learning_rate": 0.0005, + "epoch": 0.4869495067199256, + "step": 10865 + }, + { + "loss": 14.0507, + "grad_norm": 2.197317600250244, + "learning_rate": 0.0005, + "epoch": 0.4871735976111911, + "step": 10870 + }, + { + "loss": 13.9427, + "grad_norm": 1.9294264316558838, + "learning_rate": 0.0005, + "epoch": 0.48739768850245657, + "step": 10875 + }, + { + "loss": 14.109, + "grad_norm": 1.965006709098816, + "learning_rate": 0.0005, + "epoch": 0.4876217793937221, + "step": 10880 + }, + { + "loss": 14.1184, + "grad_norm": 1.8745315074920654, + "learning_rate": 0.0005, + "epoch": 0.4878458702849876, + "step": 10885 + }, + { + "loss": 14.0252, + "grad_norm": 1.86996328830719, + "learning_rate": 0.0005, + "epoch": 0.48806996117625306, + "step": 10890 + }, + { + "loss": 14.0483, + "grad_norm": 1.8305915594100952, + "learning_rate": 0.0005, + "epoch": 0.4882940520675186, + "step": 10895 + }, + { + "loss": 13.9539, + "grad_norm": 1.8650505542755127, + "learning_rate": 0.0005, + "epoch": 0.4885181429587841, + "step": 10900 + }, + { + "loss": 14.1072, + "grad_norm": 1.9065357446670532, + "learning_rate": 0.0005, + "epoch": 0.48874223385004956, + "step": 10905 + }, + { + "loss": 14.0088, + "grad_norm": 1.9280742406845093, + "learning_rate": 0.0005, + "epoch": 0.4889663247413151, + "step": 10910 + }, + { + "loss": 14.0887, + "grad_norm": 1.852734923362732, + "learning_rate": 0.0005, + "epoch": 0.4891904156325806, + "step": 10915 + }, + { + "loss": 14.0088, + "grad_norm": 1.8719218969345093, + "learning_rate": 0.0005, + "epoch": 0.48941450652384605, + "step": 10920 + }, + { + "loss": 14.0579, + "grad_norm": 1.9777988195419312, + "learning_rate": 0.0005, + "epoch": 0.48963859741511156, + "step": 10925 + }, + { + "loss": 13.9555, + "grad_norm": 1.9106028079986572, + "learning_rate": 0.0005, + "epoch": 0.4898626883063771, + "step": 10930 + }, + { + "loss": 13.9221, + "grad_norm": 1.9049543142318726, + "learning_rate": 0.0005, + "epoch": 0.49008677919764254, + "step": 10935 + }, + { + "loss": 14.0693, + "grad_norm": 1.785038948059082, + "learning_rate": 0.0005, + "epoch": 0.49031087008890806, + "step": 10940 + }, + { + "loss": 13.9304, + "grad_norm": 1.9151285886764526, + "learning_rate": 0.0005, + "epoch": 0.4905349609801736, + "step": 10945 + }, + { + "loss": 13.9709, + "grad_norm": 1.9431225061416626, + "learning_rate": 0.0005, + "epoch": 0.49075905187143903, + "step": 10950 + }, + { + "loss": 14.0511, + "grad_norm": 1.8583786487579346, + "learning_rate": 0.0005, + "epoch": 0.49098314276270455, + "step": 10955 + }, + { + "loss": 14.0531, + "grad_norm": 1.806581974029541, + "learning_rate": 0.0005, + "epoch": 0.49120723365397007, + "step": 10960 + }, + { + "loss": 14.0427, + "grad_norm": 1.8097478151321411, + "learning_rate": 0.0005, + "epoch": 0.4914313245452355, + "step": 10965 + }, + { + "loss": 13.9989, + "grad_norm": 1.8856887817382812, + "learning_rate": 0.0005, + "epoch": 0.49165541543650104, + "step": 10970 + }, + { + "loss": 14.0249, + "grad_norm": 1.7090424299240112, + "learning_rate": 0.0005, + "epoch": 0.49187950632776656, + "step": 10975 + }, + { + "loss": 13.9924, + "grad_norm": 1.8393125534057617, + "learning_rate": 0.0005, + "epoch": 0.492103597219032, + "step": 10980 + }, + { + "loss": 14.094, + "grad_norm": 1.9828705787658691, + "learning_rate": 0.0005, + "epoch": 0.49232768811029753, + "step": 10985 + }, + { + "loss": 13.9907, + "grad_norm": 1.8064554929733276, + "learning_rate": 0.0005, + "epoch": 0.49255177900156305, + "step": 10990 + }, + { + "loss": 14.0491, + "grad_norm": 1.8088335990905762, + "learning_rate": 0.0005, + "epoch": 0.4927758698928285, + "step": 10995 + }, + { + "loss": 14.003, + "grad_norm": 1.7703953981399536, + "learning_rate": 0.0005, + "epoch": 0.492999960784094, + "step": 11000 + }, + { + "eval_loss": 1.7476418018341064, + "eval_runtime": 18.8201, + "eval_samples_per_second": 870.558, + "eval_steps_per_second": 7.811, + "epoch": 0.492999960784094, + "step": 11000 + }, + { + "loss": 13.9324, + "grad_norm": 1.9696674346923828, + "learning_rate": 0.0005, + "epoch": 0.49322405167535954, + "step": 11005 + }, + { + "loss": 13.99, + "grad_norm": 1.7030773162841797, + "learning_rate": 0.0005, + "epoch": 0.493448142566625, + "step": 11010 + }, + { + "loss": 13.991, + "grad_norm": 1.8188631534576416, + "learning_rate": 0.0005, + "epoch": 0.4936722334578905, + "step": 11015 + }, + { + "loss": 14.0508, + "grad_norm": 1.8433138132095337, + "learning_rate": 0.0005, + "epoch": 0.49389632434915604, + "step": 11020 + }, + { + "loss": 13.9965, + "grad_norm": 1.8021408319473267, + "learning_rate": 0.0005, + "epoch": 0.4941204152404215, + "step": 11025 + }, + { + "loss": 14.0115, + "grad_norm": 2.0757226943969727, + "learning_rate": 0.0005, + "epoch": 0.494344506131687, + "step": 11030 + }, + { + "loss": 14.0115, + "grad_norm": 1.926236629486084, + "learning_rate": 0.0005, + "epoch": 0.4945685970229525, + "step": 11035 + }, + { + "loss": 14.1292, + "grad_norm": 1.8639720678329468, + "learning_rate": 0.0005, + "epoch": 0.494792687914218, + "step": 11040 + }, + { + "loss": 14.1444, + "grad_norm": 1.8459327220916748, + "learning_rate": 0.0005, + "epoch": 0.4950167788054835, + "step": 11045 + }, + { + "loss": 13.9817, + "grad_norm": 1.8979179859161377, + "learning_rate": 0.0005, + "epoch": 0.495240869696749, + "step": 11050 + }, + { + "loss": 13.9854, + "grad_norm": 1.7826581001281738, + "learning_rate": 0.0005, + "epoch": 0.4954649605880145, + "step": 11055 + }, + { + "loss": 13.9973, + "grad_norm": 1.806075930595398, + "learning_rate": 0.0005, + "epoch": 0.49568905147928, + "step": 11060 + }, + { + "loss": 14.0814, + "grad_norm": 1.7833302021026611, + "learning_rate": 0.0005, + "epoch": 0.4959131423705455, + "step": 11065 + }, + { + "loss": 14.0437, + "grad_norm": 1.7272964715957642, + "learning_rate": 0.0005, + "epoch": 0.496137233261811, + "step": 11070 + }, + { + "loss": 14.0318, + "grad_norm": 1.983668565750122, + "learning_rate": 0.0005, + "epoch": 0.4963613241530765, + "step": 11075 + }, + { + "loss": 13.9409, + "grad_norm": 1.959157943725586, + "learning_rate": 0.0005, + "epoch": 0.496585415044342, + "step": 11080 + }, + { + "loss": 14.0618, + "grad_norm": 2.06475567817688, + "learning_rate": 0.0005, + "epoch": 0.49680950593560746, + "step": 11085 + }, + { + "loss": 14.0494, + "grad_norm": 2.235586643218994, + "learning_rate": 0.0005, + "epoch": 0.497033596826873, + "step": 11090 + }, + { + "loss": 13.9587, + "grad_norm": 2.183415412902832, + "learning_rate": 0.0005, + "epoch": 0.4972576877181385, + "step": 11095 + }, + { + "loss": 13.8766, + "grad_norm": 1.8275450468063354, + "learning_rate": 0.0005, + "epoch": 0.49748177860940396, + "step": 11100 + }, + { + "loss": 14.0381, + "grad_norm": 1.713709831237793, + "learning_rate": 0.0005, + "epoch": 0.4977058695006695, + "step": 11105 + }, + { + "loss": 14.0614, + "grad_norm": 1.8228458166122437, + "learning_rate": 0.0005, + "epoch": 0.497929960391935, + "step": 11110 + }, + { + "loss": 14.1712, + "grad_norm": 2.0068023204803467, + "learning_rate": 0.0005, + "epoch": 0.49815405128320045, + "step": 11115 + }, + { + "loss": 14.012, + "grad_norm": 1.9167864322662354, + "learning_rate": 0.0005, + "epoch": 0.49837814217446597, + "step": 11120 + }, + { + "loss": 14.0244, + "grad_norm": 1.7285057306289673, + "learning_rate": 0.0005, + "epoch": 0.4986022330657315, + "step": 11125 + }, + { + "loss": 14.0137, + "grad_norm": 1.9569995403289795, + "learning_rate": 0.0005, + "epoch": 0.49882632395699694, + "step": 11130 + }, + { + "loss": 14.0519, + "grad_norm": 1.9410467147827148, + "learning_rate": 0.0005, + "epoch": 0.49905041484826246, + "step": 11135 + }, + { + "loss": 14.0137, + "grad_norm": 1.8874870538711548, + "learning_rate": 0.0005, + "epoch": 0.499274505739528, + "step": 11140 + }, + { + "loss": 13.9772, + "grad_norm": 1.7927515506744385, + "learning_rate": 0.0005, + "epoch": 0.49949859663079343, + "step": 11145 + }, + { + "loss": 14.0682, + "grad_norm": 1.7767921686172485, + "learning_rate": 0.0005, + "epoch": 0.49972268752205895, + "step": 11150 + }, + { + "loss": 14.0448, + "grad_norm": 1.9172736406326294, + "learning_rate": 0.0005, + "epoch": 0.49994677841332447, + "step": 11155 + }, + { + "loss": 14.024, + "grad_norm": 1.8923771381378174, + "learning_rate": 0.0005, + "epoch": 0.50017086930459, + "step": 11160 + }, + { + "loss": 14.0349, + "grad_norm": 1.8693808317184448, + "learning_rate": 0.0005, + "epoch": 0.5003949601958554, + "step": 11165 + }, + { + "loss": 14.1001, + "grad_norm": 1.885910987854004, + "learning_rate": 0.0005, + "epoch": 0.5006190510871209, + "step": 11170 + }, + { + "loss": 14.0163, + "grad_norm": 1.7689731121063232, + "learning_rate": 0.0005, + "epoch": 0.5008431419783864, + "step": 11175 + }, + { + "loss": 14.1057, + "grad_norm": 1.9417201280593872, + "learning_rate": 0.0005, + "epoch": 0.5010672328696519, + "step": 11180 + }, + { + "loss": 13.9506, + "grad_norm": 1.7507840394973755, + "learning_rate": 0.0005, + "epoch": 0.5012913237609175, + "step": 11185 + }, + { + "loss": 14.1209, + "grad_norm": 1.9742436408996582, + "learning_rate": 0.0005, + "epoch": 0.501515414652183, + "step": 11190 + }, + { + "loss": 14.116, + "grad_norm": 2.121680736541748, + "learning_rate": 0.0005, + "epoch": 0.5017395055434484, + "step": 11195 + }, + { + "loss": 14.0399, + "grad_norm": 2.2621843814849854, + "learning_rate": 0.0005, + "epoch": 0.5019635964347139, + "step": 11200 + }, + { + "loss": 14.0067, + "grad_norm": 2.0274646282196045, + "learning_rate": 0.0005, + "epoch": 0.5021876873259794, + "step": 11205 + }, + { + "loss": 13.9607, + "grad_norm": 1.958794355392456, + "learning_rate": 0.0005, + "epoch": 0.5024117782172449, + "step": 11210 + }, + { + "loss": 14.0712, + "grad_norm": 1.909808874130249, + "learning_rate": 0.0005, + "epoch": 0.5026358691085104, + "step": 11215 + }, + { + "loss": 14.0126, + "grad_norm": 1.8823260068893433, + "learning_rate": 0.0005, + "epoch": 0.502859959999776, + "step": 11220 + }, + { + "loss": 14.1237, + "grad_norm": 1.9062442779541016, + "learning_rate": 0.0005, + "epoch": 0.5030840508910414, + "step": 11225 + }, + { + "loss": 14.0226, + "grad_norm": 2.011108875274658, + "learning_rate": 0.0005, + "epoch": 0.5033081417823069, + "step": 11230 + }, + { + "loss": 13.9678, + "grad_norm": 1.982003927230835, + "learning_rate": 0.0005, + "epoch": 0.5035322326735724, + "step": 11235 + }, + { + "loss": 14.0562, + "grad_norm": 2.152905225753784, + "learning_rate": 0.0005, + "epoch": 0.5037563235648379, + "step": 11240 + }, + { + "loss": 14.0348, + "grad_norm": 1.9049335718154907, + "learning_rate": 0.0005, + "epoch": 0.5039804144561034, + "step": 11245 + }, + { + "loss": 14.095, + "grad_norm": 1.7218148708343506, + "learning_rate": 0.0005, + "epoch": 0.5042045053473689, + "step": 11250 + }, + { + "loss": 14.1642, + "grad_norm": 1.7122304439544678, + "learning_rate": 0.0005, + "epoch": 0.5044285962386343, + "step": 11255 + }, + { + "loss": 14.1851, + "grad_norm": 1.7507672309875488, + "learning_rate": 0.0005, + "epoch": 0.5046526871298999, + "step": 11260 + }, + { + "loss": 14.0357, + "grad_norm": 1.7015080451965332, + "learning_rate": 0.0005, + "epoch": 0.5048767780211654, + "step": 11265 + }, + { + "loss": 13.9302, + "grad_norm": 1.9490917921066284, + "learning_rate": 0.0005, + "epoch": 0.5051008689124309, + "step": 11270 + }, + { + "loss": 14.1446, + "grad_norm": 2.090062379837036, + "learning_rate": 0.0005, + "epoch": 0.5053249598036964, + "step": 11275 + }, + { + "loss": 14.0398, + "grad_norm": 2.121561288833618, + "learning_rate": 0.0005, + "epoch": 0.5055490506949619, + "step": 11280 + }, + { + "loss": 14.0797, + "grad_norm": 2.3040390014648438, + "learning_rate": 0.0005, + "epoch": 0.5057731415862273, + "step": 11285 + }, + { + "loss": 13.9851, + "grad_norm": 2.0667917728424072, + "learning_rate": 0.0005, + "epoch": 0.5059972324774928, + "step": 11290 + }, + { + "loss": 13.9989, + "grad_norm": 1.7419085502624512, + "learning_rate": 0.0005, + "epoch": 0.5062213233687584, + "step": 11295 + }, + { + "loss": 14.0088, + "grad_norm": 1.9543952941894531, + "learning_rate": 0.0005, + "epoch": 0.5064454142600239, + "step": 11300 + }, + { + "loss": 13.9647, + "grad_norm": 1.8733443021774292, + "learning_rate": 0.0005, + "epoch": 0.5066695051512894, + "step": 11305 + }, + { + "loss": 13.911, + "grad_norm": 1.7971140146255493, + "learning_rate": 0.0005, + "epoch": 0.5068935960425549, + "step": 11310 + }, + { + "loss": 14.0001, + "grad_norm": 1.9756284952163696, + "learning_rate": 0.0005, + "epoch": 0.5071176869338203, + "step": 11315 + }, + { + "loss": 13.9781, + "grad_norm": 1.8535903692245483, + "learning_rate": 0.0005, + "epoch": 0.5073417778250858, + "step": 11320 + }, + { + "loss": 14.1203, + "grad_norm": 1.8332593441009521, + "learning_rate": 0.0005, + "epoch": 0.5075658687163513, + "step": 11325 + }, + { + "loss": 14.0118, + "grad_norm": 2.1774582862854004, + "learning_rate": 0.0005, + "epoch": 0.5077899596076169, + "step": 11330 + }, + { + "loss": 14.049, + "grad_norm": 2.049069881439209, + "learning_rate": 0.0005, + "epoch": 0.5080140504988824, + "step": 11335 + }, + { + "loss": 14.1635, + "grad_norm": 2.0563483238220215, + "learning_rate": 0.0005, + "epoch": 0.5082381413901479, + "step": 11340 + }, + { + "loss": 14.012, + "grad_norm": 1.9674501419067383, + "learning_rate": 0.0005, + "epoch": 0.5084622322814133, + "step": 11345 + }, + { + "loss": 13.9944, + "grad_norm": 2.106797218322754, + "learning_rate": 0.0005, + "epoch": 0.5086863231726788, + "step": 11350 + }, + { + "loss": 14.0308, + "grad_norm": 1.7767261266708374, + "learning_rate": 0.0005, + "epoch": 0.5089104140639443, + "step": 11355 + }, + { + "loss": 14.122, + "grad_norm": 1.788737177848816, + "learning_rate": 0.0005, + "epoch": 0.5091345049552098, + "step": 11360 + }, + { + "loss": 14.06, + "grad_norm": 1.8600432872772217, + "learning_rate": 0.0005, + "epoch": 0.5093585958464754, + "step": 11365 + }, + { + "loss": 14.0932, + "grad_norm": 1.8921267986297607, + "learning_rate": 0.0005, + "epoch": 0.5095826867377409, + "step": 11370 + }, + { + "loss": 13.9985, + "grad_norm": 1.851744532585144, + "learning_rate": 0.0005, + "epoch": 0.5098067776290063, + "step": 11375 + }, + { + "loss": 14.0089, + "grad_norm": 1.879716396331787, + "learning_rate": 0.0005, + "epoch": 0.5100308685202718, + "step": 11380 + }, + { + "loss": 14.16, + "grad_norm": 2.2035632133483887, + "learning_rate": 0.0005, + "epoch": 0.5102549594115373, + "step": 11385 + }, + { + "loss": 14.0877, + "grad_norm": 2.0263664722442627, + "learning_rate": 0.0005, + "epoch": 0.5104790503028028, + "step": 11390 + }, + { + "loss": 14.0885, + "grad_norm": 1.9041746854782104, + "learning_rate": 0.0005, + "epoch": 0.5107031411940683, + "step": 11395 + }, + { + "loss": 14.042, + "grad_norm": 1.671951413154602, + "learning_rate": 0.0005, + "epoch": 0.5109272320853339, + "step": 11400 + }, + { + "loss": 13.9809, + "grad_norm": 1.9111623764038086, + "learning_rate": 0.0005, + "epoch": 0.5111513229765993, + "step": 11405 + }, + { + "loss": 14.1244, + "grad_norm": 1.9198267459869385, + "learning_rate": 0.0005, + "epoch": 0.5113754138678648, + "step": 11410 + }, + { + "loss": 13.9623, + "grad_norm": 1.9086799621582031, + "learning_rate": 0.0005, + "epoch": 0.5115995047591303, + "step": 11415 + }, + { + "loss": 14.0608, + "grad_norm": 1.8582472801208496, + "learning_rate": 0.0005, + "epoch": 0.5118235956503958, + "step": 11420 + }, + { + "loss": 14.0039, + "grad_norm": 2.1490015983581543, + "learning_rate": 0.0005, + "epoch": 0.5120476865416613, + "step": 11425 + }, + { + "loss": 14.0627, + "grad_norm": 1.8919426202774048, + "learning_rate": 0.0005, + "epoch": 0.5122717774329268, + "step": 11430 + }, + { + "loss": 13.9811, + "grad_norm": 1.8844960927963257, + "learning_rate": 0.0005, + "epoch": 0.5124958683241922, + "step": 11435 + }, + { + "loss": 13.9331, + "grad_norm": 1.825016975402832, + "learning_rate": 0.0005, + "epoch": 0.5127199592154578, + "step": 11440 + }, + { + "loss": 13.9853, + "grad_norm": 1.7920079231262207, + "learning_rate": 0.0005, + "epoch": 0.5129440501067233, + "step": 11445 + }, + { + "loss": 13.9268, + "grad_norm": 1.7441658973693848, + "learning_rate": 0.0005, + "epoch": 0.5131681409979888, + "step": 11450 + }, + { + "loss": 14.1053, + "grad_norm": 1.8774532079696655, + "learning_rate": 0.0005, + "epoch": 0.5133922318892543, + "step": 11455 + }, + { + "loss": 14.0352, + "grad_norm": 1.9215784072875977, + "learning_rate": 0.0005, + "epoch": 0.5136163227805198, + "step": 11460 + }, + { + "loss": 14.113, + "grad_norm": 1.8764196634292603, + "learning_rate": 0.0005, + "epoch": 0.5138404136717852, + "step": 11465 + }, + { + "loss": 14.0281, + "grad_norm": 1.7915804386138916, + "learning_rate": 0.0005, + "epoch": 0.5140645045630508, + "step": 11470 + }, + { + "loss": 14.0374, + "grad_norm": 1.8837881088256836, + "learning_rate": 0.0005, + "epoch": 0.5142885954543163, + "step": 11475 + }, + { + "loss": 14.0387, + "grad_norm": 1.8066962957382202, + "learning_rate": 0.0005, + "epoch": 0.5145126863455818, + "step": 11480 + }, + { + "loss": 14.0537, + "grad_norm": 1.827216625213623, + "learning_rate": 0.0005, + "epoch": 0.5147367772368473, + "step": 11485 + }, + { + "loss": 14.0311, + "grad_norm": 1.8255128860473633, + "learning_rate": 0.0005, + "epoch": 0.5149608681281128, + "step": 11490 + }, + { + "loss": 13.9886, + "grad_norm": 1.8707716464996338, + "learning_rate": 0.0005, + "epoch": 0.5151849590193782, + "step": 11495 + }, + { + "loss": 14.1195, + "grad_norm": 2.0188121795654297, + "learning_rate": 0.0005, + "epoch": 0.5154090499106437, + "step": 11500 + }, + { + "eval_loss": 1.7547571659088135, + "eval_runtime": 18.5348, + "eval_samples_per_second": 883.96, + "eval_steps_per_second": 7.931, + "epoch": 0.5154090499106437, + "step": 11500 + }, + { + "loss": 14.0556, + "grad_norm": 1.9607020616531372, + "learning_rate": 0.0005, + "epoch": 0.5156331408019093, + "step": 11505 + }, + { + "loss": 14.0172, + "grad_norm": 1.7551989555358887, + "learning_rate": 0.0005, + "epoch": 0.5158572316931748, + "step": 11510 + }, + { + "loss": 14.0558, + "grad_norm": 1.7272204160690308, + "learning_rate": 0.0005, + "epoch": 0.5160813225844403, + "step": 11515 + }, + { + "loss": 14.035, + "grad_norm": 1.8389956951141357, + "learning_rate": 0.0005, + "epoch": 0.5163054134757058, + "step": 11520 + }, + { + "loss": 13.9865, + "grad_norm": 1.8806477785110474, + "learning_rate": 0.0005, + "epoch": 0.5165295043669712, + "step": 11525 + }, + { + "loss": 14.0794, + "grad_norm": 1.872300386428833, + "learning_rate": 0.0005, + "epoch": 0.5167535952582367, + "step": 11530 + }, + { + "loss": 14.0977, + "grad_norm": 1.9794795513153076, + "learning_rate": 0.0005, + "epoch": 0.5169776861495022, + "step": 11535 + }, + { + "loss": 13.9651, + "grad_norm": 1.8795273303985596, + "learning_rate": 0.0005, + "epoch": 0.5172017770407678, + "step": 11540 + }, + { + "loss": 14.0109, + "grad_norm": 1.9061027765274048, + "learning_rate": 0.0005, + "epoch": 0.5174258679320333, + "step": 11545 + }, + { + "loss": 14.0307, + "grad_norm": 1.7679489850997925, + "learning_rate": 0.0005, + "epoch": 0.5176499588232988, + "step": 11550 + }, + { + "loss": 14.0569, + "grad_norm": 1.9940497875213623, + "learning_rate": 0.0005, + "epoch": 0.5178740497145642, + "step": 11555 + }, + { + "loss": 13.9736, + "grad_norm": 1.8977187871932983, + "learning_rate": 0.0005, + "epoch": 0.5180981406058297, + "step": 11560 + }, + { + "loss": 14.0435, + "grad_norm": 1.9633815288543701, + "learning_rate": 0.0005, + "epoch": 0.5183222314970952, + "step": 11565 + }, + { + "loss": 14.0761, + "grad_norm": 1.9254745244979858, + "learning_rate": 0.0005, + "epoch": 0.5185463223883607, + "step": 11570 + }, + { + "loss": 13.9783, + "grad_norm": 1.8136787414550781, + "learning_rate": 0.0005, + "epoch": 0.5187704132796263, + "step": 11575 + }, + { + "loss": 14.0783, + "grad_norm": 1.8030261993408203, + "learning_rate": 0.0005, + "epoch": 0.5189945041708918, + "step": 11580 + }, + { + "loss": 13.9301, + "grad_norm": 1.7944817543029785, + "learning_rate": 0.0005, + "epoch": 0.5192185950621572, + "step": 11585 + }, + { + "loss": 14.003, + "grad_norm": 1.730994701385498, + "learning_rate": 0.0005, + "epoch": 0.5194426859534227, + "step": 11590 + }, + { + "loss": 13.936, + "grad_norm": 1.8809159994125366, + "learning_rate": 0.0005, + "epoch": 0.5196667768446882, + "step": 11595 + }, + { + "loss": 14.13, + "grad_norm": 1.805044174194336, + "learning_rate": 0.0005, + "epoch": 0.5198908677359537, + "step": 11600 + }, + { + "loss": 14.0864, + "grad_norm": 2.2456016540527344, + "learning_rate": 0.0005, + "epoch": 0.5201149586272192, + "step": 11605 + }, + { + "loss": 14.0473, + "grad_norm": 2.230912446975708, + "learning_rate": 0.0005, + "epoch": 0.5203390495184846, + "step": 11610 + }, + { + "loss": 14.0396, + "grad_norm": 2.18398380279541, + "learning_rate": 0.0005, + "epoch": 0.5205631404097502, + "step": 11615 + }, + { + "loss": 14.0842, + "grad_norm": 1.7159156799316406, + "learning_rate": 0.0005, + "epoch": 0.5207872313010157, + "step": 11620 + }, + { + "loss": 14.0063, + "grad_norm": 1.8257774114608765, + "learning_rate": 0.0005, + "epoch": 0.5210113221922812, + "step": 11625 + }, + { + "loss": 14.0822, + "grad_norm": 1.8890337944030762, + "learning_rate": 0.0005, + "epoch": 0.5212354130835467, + "step": 11630 + }, + { + "loss": 14.042, + "grad_norm": 1.9716954231262207, + "learning_rate": 0.0005, + "epoch": 0.5214595039748122, + "step": 11635 + }, + { + "loss": 14.055, + "grad_norm": 1.842434287071228, + "learning_rate": 0.0005, + "epoch": 0.5216835948660776, + "step": 11640 + }, + { + "loss": 13.9468, + "grad_norm": 1.8771275281906128, + "learning_rate": 0.0005, + "epoch": 0.5219076857573431, + "step": 11645 + }, + { + "loss": 14.0647, + "grad_norm": 1.7564702033996582, + "learning_rate": 0.0005, + "epoch": 0.5221317766486087, + "step": 11650 + }, + { + "loss": 14.0589, + "grad_norm": 1.9361367225646973, + "learning_rate": 0.0005, + "epoch": 0.5223558675398742, + "step": 11655 + }, + { + "loss": 14.0295, + "grad_norm": 1.8580527305603027, + "learning_rate": 0.0005, + "epoch": 0.5225799584311397, + "step": 11660 + }, + { + "loss": 14.1143, + "grad_norm": 1.9104259014129639, + "learning_rate": 0.0005, + "epoch": 0.5228040493224052, + "step": 11665 + }, + { + "loss": 14.1412, + "grad_norm": 1.8740001916885376, + "learning_rate": 0.0005, + "epoch": 0.5230281402136706, + "step": 11670 + }, + { + "loss": 13.9737, + "grad_norm": 1.9475232362747192, + "learning_rate": 0.0005, + "epoch": 0.5232522311049361, + "step": 11675 + }, + { + "loss": 14.0187, + "grad_norm": 1.9687209129333496, + "learning_rate": 0.0005, + "epoch": 0.5234763219962016, + "step": 11680 + }, + { + "loss": 14.0587, + "grad_norm": 1.7069392204284668, + "learning_rate": 0.0005, + "epoch": 0.5237004128874672, + "step": 11685 + }, + { + "loss": 14.1287, + "grad_norm": 1.883009672164917, + "learning_rate": 0.0005, + "epoch": 0.5239245037787327, + "step": 11690 + }, + { + "loss": 13.9363, + "grad_norm": 1.826299786567688, + "learning_rate": 0.0005, + "epoch": 0.5241485946699982, + "step": 11695 + }, + { + "loss": 14.1063, + "grad_norm": 3.701735019683838, + "learning_rate": 0.0005, + "epoch": 0.5243726855612636, + "step": 11700 + }, + { + "loss": 14.1886, + "grad_norm": 9.759357452392578, + "learning_rate": 0.0005, + "epoch": 0.5245967764525291, + "step": 11705 + }, + { + "loss": 14.0431, + "grad_norm": 3.50545334815979, + "learning_rate": 0.0005, + "epoch": 0.5248208673437946, + "step": 11710 + }, + { + "loss": 14.2454, + "grad_norm": 13.252138137817383, + "learning_rate": 0.0005, + "epoch": 0.5250449582350601, + "step": 11715 + }, + { + "loss": 14.1642, + "grad_norm": 3.8615827560424805, + "learning_rate": 0.0005, + "epoch": 0.5252690491263257, + "step": 11720 + }, + { + "loss": 14.1872, + "grad_norm": 3.313523530960083, + "learning_rate": 0.0005, + "epoch": 0.5254931400175912, + "step": 11725 + }, + { + "loss": 14.1133, + "grad_norm": 1.8248555660247803, + "learning_rate": 0.0005, + "epoch": 0.5257172309088566, + "step": 11730 + }, + { + "loss": 13.9341, + "grad_norm": 1.7867991924285889, + "learning_rate": 0.0005, + "epoch": 0.5259413218001221, + "step": 11735 + }, + { + "loss": 14.0033, + "grad_norm": 1.8857210874557495, + "learning_rate": 0.0005, + "epoch": 0.5261654126913876, + "step": 11740 + }, + { + "loss": 13.9379, + "grad_norm": 1.817320466041565, + "learning_rate": 0.0005, + "epoch": 0.5263895035826531, + "step": 11745 + }, + { + "loss": 14.0188, + "grad_norm": 1.8590130805969238, + "learning_rate": 0.0005, + "epoch": 0.5266135944739186, + "step": 11750 + }, + { + "loss": 13.9976, + "grad_norm": 1.9085701704025269, + "learning_rate": 0.0005, + "epoch": 0.5268376853651842, + "step": 11755 + }, + { + "loss": 14.0711, + "grad_norm": 2.036893367767334, + "learning_rate": 0.0005, + "epoch": 0.5270617762564496, + "step": 11760 + }, + { + "loss": 13.9832, + "grad_norm": 1.9735054969787598, + "learning_rate": 0.0005, + "epoch": 0.5272858671477151, + "step": 11765 + }, + { + "loss": 14.0891, + "grad_norm": 1.9286948442459106, + "learning_rate": 0.0005, + "epoch": 0.5275099580389806, + "step": 11770 + }, + { + "loss": 14.0208, + "grad_norm": 1.8106321096420288, + "learning_rate": 0.0005, + "epoch": 0.5277340489302461, + "step": 11775 + }, + { + "loss": 14.1278, + "grad_norm": 1.8342370986938477, + "learning_rate": 0.0005, + "epoch": 0.5279581398215116, + "step": 11780 + }, + { + "loss": 13.9797, + "grad_norm": 1.8223285675048828, + "learning_rate": 0.0005, + "epoch": 0.5281822307127771, + "step": 11785 + }, + { + "loss": 13.9417, + "grad_norm": 1.8281211853027344, + "learning_rate": 0.0005, + "epoch": 0.5284063216040426, + "step": 11790 + }, + { + "loss": 14.0578, + "grad_norm": 2.035158395767212, + "learning_rate": 0.0005, + "epoch": 0.5286304124953081, + "step": 11795 + }, + { + "loss": 14.0402, + "grad_norm": 1.8742728233337402, + "learning_rate": 0.0005, + "epoch": 0.5288545033865736, + "step": 11800 + }, + { + "loss": 13.9907, + "grad_norm": 2.1572251319885254, + "learning_rate": 0.0005, + "epoch": 0.5290785942778391, + "step": 11805 + }, + { + "loss": 14.012, + "grad_norm": 1.802402377128601, + "learning_rate": 0.0005, + "epoch": 0.5293026851691046, + "step": 11810 + }, + { + "loss": 14.0929, + "grad_norm": 1.780339241027832, + "learning_rate": 0.0005, + "epoch": 0.5295267760603701, + "step": 11815 + }, + { + "loss": 14.1459, + "grad_norm": 1.8276687860488892, + "learning_rate": 0.0005, + "epoch": 0.5297508669516355, + "step": 11820 + }, + { + "loss": 13.9739, + "grad_norm": 1.9510287046432495, + "learning_rate": 0.0005, + "epoch": 0.529974957842901, + "step": 11825 + }, + { + "loss": 14.044, + "grad_norm": 1.7390533685684204, + "learning_rate": 0.0005, + "epoch": 0.5301990487341666, + "step": 11830 + }, + { + "loss": 14.0421, + "grad_norm": 1.803524374961853, + "learning_rate": 0.0005, + "epoch": 0.5304231396254321, + "step": 11835 + }, + { + "loss": 13.8787, + "grad_norm": 2.0901153087615967, + "learning_rate": 0.0005, + "epoch": 0.5306472305166976, + "step": 11840 + }, + { + "loss": 14.0694, + "grad_norm": 1.9154999256134033, + "learning_rate": 0.0005, + "epoch": 0.5308713214079631, + "step": 11845 + }, + { + "loss": 13.9105, + "grad_norm": 1.770522117614746, + "learning_rate": 0.0005, + "epoch": 0.5310954122992285, + "step": 11850 + }, + { + "loss": 14.1578, + "grad_norm": 2.0752506256103516, + "learning_rate": 0.0005, + "epoch": 0.531319503190494, + "step": 11855 + }, + { + "loss": 14.0751, + "grad_norm": 2.0580227375030518, + "learning_rate": 0.0005, + "epoch": 0.5315435940817596, + "step": 11860 + }, + { + "loss": 14.0805, + "grad_norm": 1.8285835981369019, + "learning_rate": 0.0005, + "epoch": 0.5317676849730251, + "step": 11865 + }, + { + "loss": 14.0998, + "grad_norm": 1.7315351963043213, + "learning_rate": 0.0005, + "epoch": 0.5319917758642906, + "step": 11870 + }, + { + "loss": 14.052, + "grad_norm": 1.9043768644332886, + "learning_rate": 0.0005, + "epoch": 0.5322158667555561, + "step": 11875 + }, + { + "loss": 13.9807, + "grad_norm": 1.7757564783096313, + "learning_rate": 0.0005, + "epoch": 0.5324399576468215, + "step": 11880 + }, + { + "loss": 14.0045, + "grad_norm": 1.8369570970535278, + "learning_rate": 0.0005, + "epoch": 0.532664048538087, + "step": 11885 + }, + { + "loss": 14.0585, + "grad_norm": 1.9545503854751587, + "learning_rate": 0.0005, + "epoch": 0.5328881394293525, + "step": 11890 + }, + { + "loss": 14.0333, + "grad_norm": 2.004823923110962, + "learning_rate": 0.0005, + "epoch": 0.533112230320618, + "step": 11895 + }, + { + "loss": 14.0833, + "grad_norm": 2.157543659210205, + "learning_rate": 0.0005, + "epoch": 0.5333363212118836, + "step": 11900 + }, + { + "loss": 14.0551, + "grad_norm": 2.024017810821533, + "learning_rate": 0.0005, + "epoch": 0.5335604121031491, + "step": 11905 + }, + { + "loss": 14.0604, + "grad_norm": 1.8759499788284302, + "learning_rate": 0.0005, + "epoch": 0.5337845029944145, + "step": 11910 + }, + { + "loss": 13.9394, + "grad_norm": 1.8008873462677002, + "learning_rate": 0.0005, + "epoch": 0.53400859388568, + "step": 11915 + }, + { + "loss": 13.9449, + "grad_norm": 1.9857897758483887, + "learning_rate": 0.0005, + "epoch": 0.5342326847769455, + "step": 11920 + }, + { + "loss": 14.0237, + "grad_norm": 1.904971957206726, + "learning_rate": 0.0005, + "epoch": 0.534456775668211, + "step": 11925 + }, + { + "loss": 14.024, + "grad_norm": 1.889944314956665, + "learning_rate": 0.0005, + "epoch": 0.5346808665594766, + "step": 11930 + }, + { + "loss": 14.0295, + "grad_norm": 1.821826696395874, + "learning_rate": 0.0005, + "epoch": 0.5349049574507421, + "step": 11935 + }, + { + "loss": 14.0456, + "grad_norm": 1.8112114667892456, + "learning_rate": 0.0005, + "epoch": 0.5351290483420075, + "step": 11940 + }, + { + "loss": 14.0316, + "grad_norm": 1.7718886137008667, + "learning_rate": 0.0005, + "epoch": 0.535353139233273, + "step": 11945 + }, + { + "loss": 14.0575, + "grad_norm": 1.8424601554870605, + "learning_rate": 0.0005, + "epoch": 0.5355772301245385, + "step": 11950 + }, + { + "loss": 13.9971, + "grad_norm": 1.7665033340454102, + "learning_rate": 0.0005, + "epoch": 0.535801321015804, + "step": 11955 + }, + { + "loss": 13.8843, + "grad_norm": 1.8982579708099365, + "learning_rate": 0.0005, + "epoch": 0.5360254119070695, + "step": 11960 + }, + { + "loss": 14.069, + "grad_norm": 1.9286915063858032, + "learning_rate": 0.0005, + "epoch": 0.536249502798335, + "step": 11965 + }, + { + "loss": 14.0386, + "grad_norm": 1.8651976585388184, + "learning_rate": 0.0005, + "epoch": 0.5364735936896005, + "step": 11970 + }, + { + "loss": 14.0344, + "grad_norm": 1.8424943685531616, + "learning_rate": 0.0005, + "epoch": 0.536697684580866, + "step": 11975 + }, + { + "loss": 14.0157, + "grad_norm": 1.9398298263549805, + "learning_rate": 0.0005, + "epoch": 0.5369217754721315, + "step": 11980 + }, + { + "loss": 14.0076, + "grad_norm": 1.9768520593643188, + "learning_rate": 0.0005, + "epoch": 0.537145866363397, + "step": 11985 + }, + { + "loss": 14.0315, + "grad_norm": 1.8067823648452759, + "learning_rate": 0.0005, + "epoch": 0.5373699572546625, + "step": 11990 + }, + { + "loss": 13.976, + "grad_norm": 1.9012209177017212, + "learning_rate": 0.0005, + "epoch": 0.537594048145928, + "step": 11995 + }, + { + "loss": 13.9989, + "grad_norm": 1.8578073978424072, + "learning_rate": 0.0005, + "epoch": 0.5378181390371934, + "step": 12000 + }, + { + "eval_loss": 1.751168131828308, + "eval_runtime": 18.48, + "eval_samples_per_second": 886.579, + "eval_steps_per_second": 7.955, + "epoch": 0.5378181390371934, + "step": 12000 + }, + { + "loss": 13.9335, + "grad_norm": 1.706904649734497, + "learning_rate": 0.0005, + "epoch": 0.538042229928459, + "step": 12005 + }, + { + "loss": 14.1102, + "grad_norm": 1.930979609489441, + "learning_rate": 0.0005, + "epoch": 0.5382663208197245, + "step": 12010 + }, + { + "loss": 14.0288, + "grad_norm": 1.8886891603469849, + "learning_rate": 0.0005, + "epoch": 0.53849041171099, + "step": 12015 + }, + { + "loss": 14.0218, + "grad_norm": 1.8221163749694824, + "learning_rate": 0.0005, + "epoch": 0.5387145026022555, + "step": 12020 + }, + { + "loss": 14.122, + "grad_norm": 1.8023242950439453, + "learning_rate": 0.0005, + "epoch": 0.538938593493521, + "step": 12025 + }, + { + "loss": 14.0216, + "grad_norm": 1.832963466644287, + "learning_rate": 0.0005, + "epoch": 0.5391626843847864, + "step": 12030 + }, + { + "loss": 14.113, + "grad_norm": 2.0387954711914062, + "learning_rate": 0.0005, + "epoch": 0.5393867752760519, + "step": 12035 + }, + { + "loss": 14.0552, + "grad_norm": 1.916006326675415, + "learning_rate": 0.0005, + "epoch": 0.5396108661673175, + "step": 12040 + }, + { + "loss": 14.0393, + "grad_norm": 2.2164087295532227, + "learning_rate": 0.0005, + "epoch": 0.539834957058583, + "step": 12045 + }, + { + "loss": 13.9528, + "grad_norm": 2.0238280296325684, + "learning_rate": 0.0005, + "epoch": 0.5400590479498485, + "step": 12050 + }, + { + "loss": 14.0354, + "grad_norm": 2.0497937202453613, + "learning_rate": 0.0005, + "epoch": 0.540283138841114, + "step": 12055 + }, + { + "loss": 14.0743, + "grad_norm": 1.9421876668930054, + "learning_rate": 0.0005, + "epoch": 0.5405072297323794, + "step": 12060 + }, + { + "loss": 14.0372, + "grad_norm": 1.9722645282745361, + "learning_rate": 0.0005, + "epoch": 0.5407313206236449, + "step": 12065 + }, + { + "loss": 14.0676, + "grad_norm": 1.959842562675476, + "learning_rate": 0.0005, + "epoch": 0.5409554115149104, + "step": 12070 + }, + { + "loss": 14.0179, + "grad_norm": 1.9294716119766235, + "learning_rate": 0.0005, + "epoch": 0.541179502406176, + "step": 12075 + }, + { + "loss": 14.0412, + "grad_norm": 1.8364676237106323, + "learning_rate": 0.0005, + "epoch": 0.5414035932974415, + "step": 12080 + }, + { + "loss": 14.0165, + "grad_norm": 1.904807209968567, + "learning_rate": 0.0005, + "epoch": 0.541627684188707, + "step": 12085 + }, + { + "loss": 13.9646, + "grad_norm": 1.7190061807632446, + "learning_rate": 0.0005, + "epoch": 0.5418517750799724, + "step": 12090 + }, + { + "loss": 13.9799, + "grad_norm": 1.7632275819778442, + "learning_rate": 0.0005, + "epoch": 0.5420758659712379, + "step": 12095 + }, + { + "loss": 14.0236, + "grad_norm": 1.878212332725525, + "learning_rate": 0.0005, + "epoch": 0.5422999568625034, + "step": 12100 + }, + { + "loss": 14.0286, + "grad_norm": 1.9682413339614868, + "learning_rate": 0.0005, + "epoch": 0.542524047753769, + "step": 12105 + }, + { + "loss": 14.0145, + "grad_norm": 1.8260167837142944, + "learning_rate": 0.0005, + "epoch": 0.5427481386450345, + "step": 12110 + }, + { + "loss": 14.0664, + "grad_norm": 1.9433921575546265, + "learning_rate": 0.0005, + "epoch": 0.5429722295363, + "step": 12115 + }, + { + "loss": 14.0621, + "grad_norm": 1.9075546264648438, + "learning_rate": 0.0005, + "epoch": 0.5431963204275654, + "step": 12120 + }, + { + "loss": 14.01, + "grad_norm": 1.8555830717086792, + "learning_rate": 0.0005, + "epoch": 0.5434204113188309, + "step": 12125 + }, + { + "loss": 14.1147, + "grad_norm": 1.71388578414917, + "learning_rate": 0.0005, + "epoch": 0.5436445022100964, + "step": 12130 + }, + { + "loss": 14.0163, + "grad_norm": 1.8536070585250854, + "learning_rate": 0.0005, + "epoch": 0.5438685931013619, + "step": 12135 + }, + { + "loss": 13.968, + "grad_norm": 1.7627605199813843, + "learning_rate": 0.0005, + "epoch": 0.5440926839926274, + "step": 12140 + }, + { + "loss": 13.9629, + "grad_norm": 2.0195271968841553, + "learning_rate": 0.0005, + "epoch": 0.544316774883893, + "step": 12145 + }, + { + "loss": 14.0757, + "grad_norm": 1.9609084129333496, + "learning_rate": 0.0005, + "epoch": 0.5445408657751584, + "step": 12150 + }, + { + "loss": 14.0866, + "grad_norm": 1.967761516571045, + "learning_rate": 0.0005, + "epoch": 0.5447649566664239, + "step": 12155 + }, + { + "loss": 14.0538, + "grad_norm": 1.920175552368164, + "learning_rate": 0.0005, + "epoch": 0.5449890475576894, + "step": 12160 + }, + { + "loss": 14.0651, + "grad_norm": 1.8449351787567139, + "learning_rate": 0.0005, + "epoch": 0.5452131384489549, + "step": 12165 + }, + { + "loss": 13.9519, + "grad_norm": 1.8652801513671875, + "learning_rate": 0.0005, + "epoch": 0.5454372293402204, + "step": 12170 + }, + { + "loss": 14.1119, + "grad_norm": 1.987726092338562, + "learning_rate": 0.0005, + "epoch": 0.5456613202314858, + "step": 12175 + }, + { + "loss": 13.9927, + "grad_norm": 1.8179360628128052, + "learning_rate": 0.0005, + "epoch": 0.5458854111227514, + "step": 12180 + }, + { + "loss": 14.0653, + "grad_norm": 1.8283140659332275, + "learning_rate": 0.0005, + "epoch": 0.5461095020140169, + "step": 12185 + }, + { + "loss": 14.0795, + "grad_norm": 1.810027003288269, + "learning_rate": 0.0005, + "epoch": 0.5463335929052824, + "step": 12190 + }, + { + "loss": 13.9509, + "grad_norm": 1.8754284381866455, + "learning_rate": 0.0005, + "epoch": 0.5465576837965479, + "step": 12195 + }, + { + "loss": 13.9429, + "grad_norm": 1.7635506391525269, + "learning_rate": 0.0005, + "epoch": 0.5467817746878134, + "step": 12200 + }, + { + "loss": 13.9315, + "grad_norm": 1.898576259613037, + "learning_rate": 0.0005, + "epoch": 0.5470058655790788, + "step": 12205 + }, + { + "loss": 14.1612, + "grad_norm": 2.1952714920043945, + "learning_rate": 0.0005, + "epoch": 0.5472299564703443, + "step": 12210 + }, + { + "loss": 14.0364, + "grad_norm": 2.0558507442474365, + "learning_rate": 0.0005, + "epoch": 0.5474540473616099, + "step": 12215 + }, + { + "loss": 14.0311, + "grad_norm": 2.1676862239837646, + "learning_rate": 0.0005, + "epoch": 0.5476781382528754, + "step": 12220 + }, + { + "loss": 14.0522, + "grad_norm": 2.0368027687072754, + "learning_rate": 0.0005, + "epoch": 0.5479022291441409, + "step": 12225 + }, + { + "loss": 14.0133, + "grad_norm": 2.299630641937256, + "learning_rate": 0.0005, + "epoch": 0.5481263200354064, + "step": 12230 + }, + { + "loss": 14.0252, + "grad_norm": 2.0062880516052246, + "learning_rate": 0.0005, + "epoch": 0.5483504109266718, + "step": 12235 + }, + { + "loss": 13.9892, + "grad_norm": 1.8271337747573853, + "learning_rate": 0.0005, + "epoch": 0.5485745018179373, + "step": 12240 + }, + { + "loss": 13.8925, + "grad_norm": 1.8837248086929321, + "learning_rate": 0.0005, + "epoch": 0.5487985927092028, + "step": 12245 + }, + { + "loss": 14.0329, + "grad_norm": 1.819088339805603, + "learning_rate": 0.0005, + "epoch": 0.5490226836004684, + "step": 12250 + }, + { + "loss": 14.0457, + "grad_norm": 2.0993480682373047, + "learning_rate": 0.0005, + "epoch": 0.5492467744917339, + "step": 12255 + }, + { + "loss": 14.05, + "grad_norm": 2.045747995376587, + "learning_rate": 0.0005, + "epoch": 0.5494708653829994, + "step": 12260 + }, + { + "loss": 13.9964, + "grad_norm": 1.8817187547683716, + "learning_rate": 0.0005, + "epoch": 0.5496949562742648, + "step": 12265 + }, + { + "loss": 13.9463, + "grad_norm": 1.8474345207214355, + "learning_rate": 0.0005, + "epoch": 0.5499190471655303, + "step": 12270 + }, + { + "loss": 14.1182, + "grad_norm": 1.8616013526916504, + "learning_rate": 0.0005, + "epoch": 0.5501431380567958, + "step": 12275 + }, + { + "loss": 13.9633, + "grad_norm": 1.86726713180542, + "learning_rate": 0.0005, + "epoch": 0.5503672289480613, + "step": 12280 + }, + { + "loss": 14.1048, + "grad_norm": 1.8453407287597656, + "learning_rate": 0.0005, + "epoch": 0.5505913198393269, + "step": 12285 + }, + { + "loss": 14.087, + "grad_norm": 1.8242368698120117, + "learning_rate": 0.0005, + "epoch": 0.5508154107305924, + "step": 12290 + }, + { + "loss": 13.8923, + "grad_norm": 1.7882949113845825, + "learning_rate": 0.0005, + "epoch": 0.5510395016218578, + "step": 12295 + }, + { + "loss": 14.0861, + "grad_norm": 1.9016592502593994, + "learning_rate": 0.0005, + "epoch": 0.5512635925131233, + "step": 12300 + }, + { + "loss": 13.9075, + "grad_norm": 1.852830171585083, + "learning_rate": 0.0005, + "epoch": 0.5514876834043888, + "step": 12305 + }, + { + "loss": 14.0973, + "grad_norm": 1.9363442659378052, + "learning_rate": 0.0005, + "epoch": 0.5517117742956543, + "step": 12310 + }, + { + "loss": 14.1186, + "grad_norm": 1.948805570602417, + "learning_rate": 0.0005, + "epoch": 0.5519358651869198, + "step": 12315 + }, + { + "loss": 13.9078, + "grad_norm": 1.9615259170532227, + "learning_rate": 0.0005, + "epoch": 0.5521599560781854, + "step": 12320 + }, + { + "loss": 14.0245, + "grad_norm": 2.001514196395874, + "learning_rate": 0.0005, + "epoch": 0.5523840469694508, + "step": 12325 + }, + { + "loss": 14.0718, + "grad_norm": 1.716589331626892, + "learning_rate": 0.0005, + "epoch": 0.5526081378607163, + "step": 12330 + }, + { + "loss": 14.0069, + "grad_norm": 1.803367257118225, + "learning_rate": 0.0005, + "epoch": 0.5528322287519818, + "step": 12335 + }, + { + "loss": 13.9906, + "grad_norm": 1.8775421380996704, + "learning_rate": 0.0005, + "epoch": 0.5530563196432473, + "step": 12340 + }, + { + "loss": 14.0023, + "grad_norm": 1.7530593872070312, + "learning_rate": 0.0005, + "epoch": 0.5532804105345128, + "step": 12345 + }, + { + "loss": 14.0228, + "grad_norm": 1.8927412033081055, + "learning_rate": 0.0005, + "epoch": 0.5535045014257783, + "step": 12350 + }, + { + "loss": 14.0209, + "grad_norm": 1.9034690856933594, + "learning_rate": 0.0005, + "epoch": 0.5537285923170437, + "step": 12355 + }, + { + "loss": 14.0166, + "grad_norm": 1.8328973054885864, + "learning_rate": 0.0005, + "epoch": 0.5539526832083093, + "step": 12360 + }, + { + "loss": 13.9525, + "grad_norm": 1.8188306093215942, + "learning_rate": 0.0005, + "epoch": 0.5541767740995748, + "step": 12365 + }, + { + "loss": 14.0075, + "grad_norm": 2.0496511459350586, + "learning_rate": 0.0005, + "epoch": 0.5544008649908403, + "step": 12370 + }, + { + "loss": 14.0544, + "grad_norm": 1.826499581336975, + "learning_rate": 0.0005, + "epoch": 0.5546249558821058, + "step": 12375 + }, + { + "loss": 14.0, + "grad_norm": 1.9936883449554443, + "learning_rate": 0.0005, + "epoch": 0.5548490467733713, + "step": 12380 + }, + { + "loss": 14.0692, + "grad_norm": 1.9251879453659058, + "learning_rate": 0.0005, + "epoch": 0.5550731376646367, + "step": 12385 + }, + { + "loss": 13.9408, + "grad_norm": 1.8239604234695435, + "learning_rate": 0.0005, + "epoch": 0.5552972285559022, + "step": 12390 + }, + { + "loss": 14.0647, + "grad_norm": 1.909005880355835, + "learning_rate": 0.0005, + "epoch": 0.5555213194471678, + "step": 12395 + }, + { + "loss": 14.0004, + "grad_norm": 1.7871017456054688, + "learning_rate": 0.0005, + "epoch": 0.5557454103384333, + "step": 12400 + }, + { + "loss": 14.1342, + "grad_norm": 2.068019151687622, + "learning_rate": 0.0005, + "epoch": 0.5559695012296988, + "step": 12405 + }, + { + "loss": 14.0307, + "grad_norm": 1.9286525249481201, + "learning_rate": 0.0005, + "epoch": 0.5561935921209643, + "step": 12410 + }, + { + "loss": 14.0578, + "grad_norm": 2.0839881896972656, + "learning_rate": 0.0005, + "epoch": 0.5564176830122297, + "step": 12415 + }, + { + "loss": 13.97, + "grad_norm": 1.8886492252349854, + "learning_rate": 0.0005, + "epoch": 0.5566417739034952, + "step": 12420 + }, + { + "loss": 14.0677, + "grad_norm": 1.867583155632019, + "learning_rate": 0.0005, + "epoch": 0.5568658647947607, + "step": 12425 + }, + { + "loss": 13.9616, + "grad_norm": 2.073392629623413, + "learning_rate": 0.0005, + "epoch": 0.5570899556860263, + "step": 12430 + }, + { + "loss": 14.0997, + "grad_norm": 1.8922902345657349, + "learning_rate": 0.0005, + "epoch": 0.5573140465772918, + "step": 12435 + }, + { + "loss": 13.9662, + "grad_norm": 1.8526886701583862, + "learning_rate": 0.0005, + "epoch": 0.5575381374685573, + "step": 12440 + }, + { + "loss": 14.1219, + "grad_norm": 1.8532624244689941, + "learning_rate": 0.0005, + "epoch": 0.5577622283598227, + "step": 12445 + }, + { + "loss": 13.9798, + "grad_norm": 1.8773828744888306, + "learning_rate": 0.0005, + "epoch": 0.5579863192510882, + "step": 12450 + }, + { + "loss": 14.035, + "grad_norm": 1.787448525428772, + "learning_rate": 0.0005, + "epoch": 0.5582104101423537, + "step": 12455 + }, + { + "loss": 14.0232, + "grad_norm": 1.8511303663253784, + "learning_rate": 0.0005, + "epoch": 0.5584345010336192, + "step": 12460 + }, + { + "loss": 13.9491, + "grad_norm": 1.7718552350997925, + "learning_rate": 0.0005, + "epoch": 0.5586585919248848, + "step": 12465 + }, + { + "loss": 14.0324, + "grad_norm": 1.919750690460205, + "learning_rate": 0.0005, + "epoch": 0.5588826828161503, + "step": 12470 + }, + { + "loss": 14.0058, + "grad_norm": 1.8664422035217285, + "learning_rate": 0.0005, + "epoch": 0.5591067737074157, + "step": 12475 + }, + { + "loss": 13.9433, + "grad_norm": 2.035127878189087, + "learning_rate": 0.0005, + "epoch": 0.5593308645986812, + "step": 12480 + }, + { + "loss": 14.0902, + "grad_norm": 1.8490769863128662, + "learning_rate": 0.0005, + "epoch": 0.5595549554899467, + "step": 12485 + }, + { + "loss": 14.0749, + "grad_norm": 1.7316131591796875, + "learning_rate": 0.0005, + "epoch": 0.5597790463812122, + "step": 12490 + }, + { + "loss": 13.9822, + "grad_norm": 1.8705759048461914, + "learning_rate": 0.0005, + "epoch": 0.5600031372724777, + "step": 12495 + }, + { + "loss": 14.1043, + "grad_norm": 1.9594308137893677, + "learning_rate": 0.0005, + "epoch": 0.5602272281637433, + "step": 12500 + }, + { + "eval_loss": 1.751859426498413, + "eval_runtime": 18.223, + "eval_samples_per_second": 899.084, + "eval_steps_per_second": 8.067, + "epoch": 0.5602272281637433, + "step": 12500 + }, + { + "loss": 14.0527, + "grad_norm": 1.8572828769683838, + "learning_rate": 0.0005, + "epoch": 0.5604513190550087, + "step": 12505 + }, + { + "loss": 14.0881, + "grad_norm": 1.770039677619934, + "learning_rate": 0.0005, + "epoch": 0.5606754099462742, + "step": 12510 + }, + { + "loss": 14.0535, + "grad_norm": 1.951517939567566, + "learning_rate": 0.0005, + "epoch": 0.5608995008375397, + "step": 12515 + }, + { + "loss": 14.1481, + "grad_norm": 1.7481681108474731, + "learning_rate": 0.0005, + "epoch": 0.5611235917288052, + "step": 12520 + }, + { + "loss": 13.9009, + "grad_norm": 1.9862737655639648, + "learning_rate": 0.0005, + "epoch": 0.5613476826200707, + "step": 12525 + }, + { + "loss": 13.9339, + "grad_norm": 2.1382763385772705, + "learning_rate": 0.0005, + "epoch": 0.5615717735113362, + "step": 12530 + }, + { + "loss": 13.9717, + "grad_norm": 1.9665172100067139, + "learning_rate": 0.0005, + "epoch": 0.5617958644026017, + "step": 12535 + }, + { + "loss": 14.0528, + "grad_norm": 1.9153531789779663, + "learning_rate": 0.0005, + "epoch": 0.5620199552938672, + "step": 12540 + }, + { + "loss": 13.888, + "grad_norm": 1.8589550256729126, + "learning_rate": 0.0005, + "epoch": 0.5622440461851327, + "step": 12545 + }, + { + "loss": 13.9988, + "grad_norm": 1.7779061794281006, + "learning_rate": 0.0005, + "epoch": 0.5624681370763982, + "step": 12550 + }, + { + "loss": 13.9705, + "grad_norm": 1.8984662294387817, + "learning_rate": 0.0005, + "epoch": 0.5626922279676637, + "step": 12555 + }, + { + "loss": 14.0459, + "grad_norm": 2.0568203926086426, + "learning_rate": 0.0005, + "epoch": 0.5629163188589292, + "step": 12560 + }, + { + "loss": 14.074, + "grad_norm": 2.0921597480773926, + "learning_rate": 0.0005, + "epoch": 0.5631404097501946, + "step": 12565 + }, + { + "loss": 14.0376, + "grad_norm": 1.914757251739502, + "learning_rate": 0.0005, + "epoch": 0.5633645006414602, + "step": 12570 + }, + { + "loss": 14.0212, + "grad_norm": 2.220177173614502, + "learning_rate": 0.0005, + "epoch": 0.5635885915327257, + "step": 12575 + }, + { + "loss": 13.929, + "grad_norm": 1.9564307928085327, + "learning_rate": 0.0005, + "epoch": 0.5638126824239912, + "step": 12580 + }, + { + "loss": 14.1402, + "grad_norm": 2.0993082523345947, + "learning_rate": 0.0005, + "epoch": 0.5640367733152567, + "step": 12585 + }, + { + "loss": 14.0034, + "grad_norm": 2.193251609802246, + "learning_rate": 0.0005, + "epoch": 0.5642608642065222, + "step": 12590 + }, + { + "loss": 14.013, + "grad_norm": 1.8814992904663086, + "learning_rate": 0.0005, + "epoch": 0.5644849550977876, + "step": 12595 + }, + { + "loss": 14.0452, + "grad_norm": 1.926684021949768, + "learning_rate": 0.0005, + "epoch": 0.5647090459890531, + "step": 12600 + }, + { + "loss": 13.9817, + "grad_norm": 1.8195796012878418, + "learning_rate": 0.0005, + "epoch": 0.5649331368803187, + "step": 12605 + }, + { + "loss": 14.0128, + "grad_norm": 1.8257880210876465, + "learning_rate": 0.0005, + "epoch": 0.5651572277715842, + "step": 12610 + }, + { + "loss": 13.8912, + "grad_norm": 1.9664603471755981, + "learning_rate": 0.0005, + "epoch": 0.5653813186628497, + "step": 12615 + }, + { + "loss": 14.0476, + "grad_norm": 1.8703941106796265, + "learning_rate": 0.0005, + "epoch": 0.5656054095541152, + "step": 12620 + }, + { + "loss": 13.9546, + "grad_norm": 1.8966342210769653, + "learning_rate": 0.0005, + "epoch": 0.5658295004453806, + "step": 12625 + }, + { + "loss": 14.0268, + "grad_norm": 1.9538925886154175, + "learning_rate": 0.0005, + "epoch": 0.5660535913366461, + "step": 12630 + }, + { + "loss": 13.9529, + "grad_norm": 1.9661394357681274, + "learning_rate": 0.0005, + "epoch": 0.5662776822279116, + "step": 12635 + }, + { + "loss": 14.0026, + "grad_norm": 1.824487566947937, + "learning_rate": 0.0005, + "epoch": 0.5665017731191772, + "step": 12640 + }, + { + "loss": 13.9676, + "grad_norm": 1.8401505947113037, + "learning_rate": 0.0005, + "epoch": 0.5667258640104427, + "step": 12645 + }, + { + "loss": 13.9838, + "grad_norm": 1.9098906517028809, + "learning_rate": 0.0005, + "epoch": 0.5669499549017082, + "step": 12650 + }, + { + "loss": 13.946, + "grad_norm": 1.9357730150222778, + "learning_rate": 0.0005, + "epoch": 0.5671740457929736, + "step": 12655 + }, + { + "loss": 13.9252, + "grad_norm": 1.9102228879928589, + "learning_rate": 0.0005, + "epoch": 0.5673981366842391, + "step": 12660 + }, + { + "loss": 14.0503, + "grad_norm": 1.755408525466919, + "learning_rate": 0.0005, + "epoch": 0.5676222275755046, + "step": 12665 + }, + { + "loss": 13.9132, + "grad_norm": 1.8137500286102295, + "learning_rate": 0.0005, + "epoch": 0.5678463184667701, + "step": 12670 + }, + { + "loss": 14.0714, + "grad_norm": 1.842359185218811, + "learning_rate": 0.0005, + "epoch": 0.5680704093580357, + "step": 12675 + }, + { + "loss": 13.9779, + "grad_norm": 1.7982245683670044, + "learning_rate": 0.0005, + "epoch": 0.5682945002493012, + "step": 12680 + }, + { + "loss": 14.1149, + "grad_norm": 1.9560648202896118, + "learning_rate": 0.0005, + "epoch": 0.5685185911405666, + "step": 12685 + }, + { + "loss": 13.9683, + "grad_norm": 1.90084707736969, + "learning_rate": 0.0005, + "epoch": 0.5687426820318321, + "step": 12690 + }, + { + "loss": 14.0096, + "grad_norm": 1.897828459739685, + "learning_rate": 0.0005, + "epoch": 0.5689667729230976, + "step": 12695 + }, + { + "loss": 14.0001, + "grad_norm": 1.7941429615020752, + "learning_rate": 0.0005, + "epoch": 0.5691908638143631, + "step": 12700 + }, + { + "loss": 14.0167, + "grad_norm": 1.9507100582122803, + "learning_rate": 0.0005, + "epoch": 0.5694149547056286, + "step": 12705 + }, + { + "loss": 14.0317, + "grad_norm": 1.8001422882080078, + "learning_rate": 0.0005, + "epoch": 0.5696390455968942, + "step": 12710 + }, + { + "loss": 14.0206, + "grad_norm": 1.9080332517623901, + "learning_rate": 0.0005, + "epoch": 0.5698631364881596, + "step": 12715 + }, + { + "loss": 14.0638, + "grad_norm": 1.915358304977417, + "learning_rate": 0.0005, + "epoch": 0.5700872273794251, + "step": 12720 + }, + { + "loss": 14.0261, + "grad_norm": 1.8281325101852417, + "learning_rate": 0.0005, + "epoch": 0.5703113182706906, + "step": 12725 + }, + { + "loss": 14.0883, + "grad_norm": 2.0495681762695312, + "learning_rate": 0.0005, + "epoch": 0.5705354091619561, + "step": 12730 + }, + { + "loss": 14.0626, + "grad_norm": 1.7946007251739502, + "learning_rate": 0.0005, + "epoch": 0.5707595000532216, + "step": 12735 + }, + { + "loss": 14.0204, + "grad_norm": 1.8077770471572876, + "learning_rate": 0.0005, + "epoch": 0.570983590944487, + "step": 12740 + }, + { + "loss": 13.9629, + "grad_norm": 1.7210853099822998, + "learning_rate": 0.0005, + "epoch": 0.5712076818357525, + "step": 12745 + }, + { + "loss": 13.9486, + "grad_norm": 1.8925403356552124, + "learning_rate": 0.0005, + "epoch": 0.5714317727270181, + "step": 12750 + }, + { + "loss": 13.8341, + "grad_norm": 1.7555067539215088, + "learning_rate": 0.0005, + "epoch": 0.5716558636182836, + "step": 12755 + }, + { + "loss": 13.9755, + "grad_norm": 1.6934796571731567, + "learning_rate": 0.0005, + "epoch": 0.5718799545095491, + "step": 12760 + }, + { + "loss": 14.0371, + "grad_norm": 1.8694788217544556, + "learning_rate": 0.0005, + "epoch": 0.5721040454008146, + "step": 12765 + }, + { + "loss": 14.0642, + "grad_norm": 1.8269613981246948, + "learning_rate": 0.0005, + "epoch": 0.57232813629208, + "step": 12770 + }, + { + "loss": 13.9915, + "grad_norm": 2.02119517326355, + "learning_rate": 0.0005, + "epoch": 0.5725522271833455, + "step": 12775 + }, + { + "loss": 13.8738, + "grad_norm": 1.83871328830719, + "learning_rate": 0.0005, + "epoch": 0.572776318074611, + "step": 12780 + }, + { + "loss": 14.0301, + "grad_norm": 1.9396083354949951, + "learning_rate": 0.0005, + "epoch": 0.5730004089658766, + "step": 12785 + }, + { + "loss": 14.1057, + "grad_norm": 1.9239839315414429, + "learning_rate": 0.0005, + "epoch": 0.5732244998571421, + "step": 12790 + }, + { + "loss": 14.0808, + "grad_norm": 1.8804762363433838, + "learning_rate": 0.0005, + "epoch": 0.5734485907484076, + "step": 12795 + }, + { + "loss": 14.083, + "grad_norm": 1.7791146039962769, + "learning_rate": 0.0005, + "epoch": 0.573672681639673, + "step": 12800 + }, + { + "loss": 13.977, + "grad_norm": 1.9019142389297485, + "learning_rate": 0.0005, + "epoch": 0.5738967725309385, + "step": 12805 + }, + { + "loss": 13.9416, + "grad_norm": 1.6902880668640137, + "learning_rate": 0.0005, + "epoch": 0.574120863422204, + "step": 12810 + }, + { + "loss": 13.9604, + "grad_norm": 1.79051673412323, + "learning_rate": 0.0005, + "epoch": 0.5743449543134695, + "step": 12815 + }, + { + "loss": 14.0087, + "grad_norm": 2.142242431640625, + "learning_rate": 0.0005, + "epoch": 0.5745690452047351, + "step": 12820 + }, + { + "loss": 14.189, + "grad_norm": 2.034118890762329, + "learning_rate": 0.0005, + "epoch": 0.5747931360960006, + "step": 12825 + }, + { + "loss": 14.0089, + "grad_norm": 1.8687961101531982, + "learning_rate": 0.0005, + "epoch": 0.575017226987266, + "step": 12830 + }, + { + "loss": 13.9777, + "grad_norm": 2.0446767807006836, + "learning_rate": 0.0005, + "epoch": 0.5752413178785315, + "step": 12835 + }, + { + "loss": 13.9677, + "grad_norm": 1.864727258682251, + "learning_rate": 0.0005, + "epoch": 0.575465408769797, + "step": 12840 + }, + { + "loss": 13.9624, + "grad_norm": 1.9575270414352417, + "learning_rate": 0.0005, + "epoch": 0.5756894996610625, + "step": 12845 + }, + { + "loss": 14.0536, + "grad_norm": 1.8159977197647095, + "learning_rate": 0.0005, + "epoch": 0.575913590552328, + "step": 12850 + }, + { + "loss": 13.9992, + "grad_norm": 1.8523368835449219, + "learning_rate": 0.0005, + "epoch": 0.5761376814435936, + "step": 12855 + }, + { + "loss": 13.9665, + "grad_norm": 1.796653151512146, + "learning_rate": 0.0005, + "epoch": 0.576361772334859, + "step": 12860 + }, + { + "loss": 14.0113, + "grad_norm": 2.0082266330718994, + "learning_rate": 0.0005, + "epoch": 0.5765858632261245, + "step": 12865 + }, + { + "loss": 14.0491, + "grad_norm": 1.9820599555969238, + "learning_rate": 0.0005, + "epoch": 0.57680995411739, + "step": 12870 + }, + { + "loss": 14.0307, + "grad_norm": 1.8778185844421387, + "learning_rate": 0.0005, + "epoch": 0.5770340450086555, + "step": 12875 + }, + { + "loss": 13.9243, + "grad_norm": 1.9270992279052734, + "learning_rate": 0.0005, + "epoch": 0.577258135899921, + "step": 12880 + }, + { + "loss": 14.1251, + "grad_norm": 1.9180136919021606, + "learning_rate": 0.0005, + "epoch": 0.5774822267911865, + "step": 12885 + }, + { + "loss": 13.9821, + "grad_norm": 1.7854799032211304, + "learning_rate": 0.0005, + "epoch": 0.577706317682452, + "step": 12890 + }, + { + "loss": 14.0954, + "grad_norm": 1.7735947370529175, + "learning_rate": 0.0005, + "epoch": 0.5779304085737175, + "step": 12895 + }, + { + "loss": 13.9303, + "grad_norm": 2.0620439052581787, + "learning_rate": 0.0005, + "epoch": 0.578154499464983, + "step": 12900 + }, + { + "loss": 14.1653, + "grad_norm": 2.0477583408355713, + "learning_rate": 0.0005, + "epoch": 0.5783785903562485, + "step": 12905 + }, + { + "loss": 13.9222, + "grad_norm": 2.063283681869507, + "learning_rate": 0.0005, + "epoch": 0.578602681247514, + "step": 12910 + }, + { + "loss": 13.963, + "grad_norm": 1.889586091041565, + "learning_rate": 0.0005, + "epoch": 0.5788267721387795, + "step": 12915 + }, + { + "loss": 14.0448, + "grad_norm": 1.8375580310821533, + "learning_rate": 0.0005, + "epoch": 0.5790508630300449, + "step": 12920 + }, + { + "loss": 13.9869, + "grad_norm": 1.8818038702011108, + "learning_rate": 0.0005, + "epoch": 0.5792749539213105, + "step": 12925 + }, + { + "loss": 13.9813, + "grad_norm": 1.8376609086990356, + "learning_rate": 0.0005, + "epoch": 0.579499044812576, + "step": 12930 + }, + { + "loss": 13.9736, + "grad_norm": 1.9093542098999023, + "learning_rate": 0.0005, + "epoch": 0.5797231357038415, + "step": 12935 + }, + { + "loss": 13.9723, + "grad_norm": 2.288410186767578, + "learning_rate": 0.0005, + "epoch": 0.579947226595107, + "step": 12940 + }, + { + "loss": 14.0652, + "grad_norm": 1.796730875968933, + "learning_rate": 0.0005, + "epoch": 0.5801713174863725, + "step": 12945 + }, + { + "loss": 13.9637, + "grad_norm": 1.9128367900848389, + "learning_rate": 0.0005, + "epoch": 0.5803954083776379, + "step": 12950 + }, + { + "loss": 13.9768, + "grad_norm": 1.967447280883789, + "learning_rate": 0.0005, + "epoch": 0.5806194992689034, + "step": 12955 + }, + { + "loss": 14.0629, + "grad_norm": 1.8460129499435425, + "learning_rate": 0.0005, + "epoch": 0.580843590160169, + "step": 12960 + }, + { + "loss": 14.0869, + "grad_norm": 2.1665172576904297, + "learning_rate": 0.0005, + "epoch": 0.5810676810514345, + "step": 12965 + }, + { + "loss": 14.0342, + "grad_norm": 1.8314615488052368, + "learning_rate": 0.0005, + "epoch": 0.5812917719427, + "step": 12970 + }, + { + "loss": 13.8923, + "grad_norm": 1.8446085453033447, + "learning_rate": 0.0005, + "epoch": 0.5815158628339655, + "step": 12975 + }, + { + "loss": 13.9526, + "grad_norm": 1.7681913375854492, + "learning_rate": 0.0005, + "epoch": 0.5817399537252309, + "step": 12980 + }, + { + "loss": 14.0395, + "grad_norm": 1.8649137020111084, + "learning_rate": 0.0005, + "epoch": 0.5819640446164964, + "step": 12985 + }, + { + "loss": 14.0145, + "grad_norm": 1.7946351766586304, + "learning_rate": 0.0005, + "epoch": 0.5821881355077619, + "step": 12990 + }, + { + "loss": 14.1076, + "grad_norm": 1.9252318143844604, + "learning_rate": 0.0005, + "epoch": 0.5824122263990275, + "step": 12995 + }, + { + "loss": 13.9776, + "grad_norm": 1.826115608215332, + "learning_rate": 0.0005, + "epoch": 0.582636317290293, + "step": 13000 + }, + { + "eval_loss": 1.7454519271850586, + "eval_runtime": 18.6945, + "eval_samples_per_second": 876.406, + "eval_steps_per_second": 7.863, + "epoch": 0.582636317290293, + "step": 13000 + }, + { + "loss": 13.9629, + "grad_norm": 1.9778342247009277, + "learning_rate": 0.0005, + "epoch": 0.5828604081815585, + "step": 13005 + }, + { + "loss": 14.0724, + "grad_norm": 1.7861636877059937, + "learning_rate": 0.0005, + "epoch": 0.5830844990728239, + "step": 13010 + }, + { + "loss": 13.9045, + "grad_norm": 1.8089948892593384, + "learning_rate": 0.0005, + "epoch": 0.5833085899640894, + "step": 13015 + }, + { + "loss": 14.0613, + "grad_norm": 1.9273558855056763, + "learning_rate": 0.0005, + "epoch": 0.5835326808553549, + "step": 13020 + }, + { + "loss": 13.9659, + "grad_norm": 1.7711695432662964, + "learning_rate": 0.0005, + "epoch": 0.5837567717466204, + "step": 13025 + }, + { + "loss": 14.0277, + "grad_norm": 1.847944974899292, + "learning_rate": 0.0005, + "epoch": 0.583980862637886, + "step": 13030 + }, + { + "loss": 13.8823, + "grad_norm": 1.7399357557296753, + "learning_rate": 0.0005, + "epoch": 0.5842049535291515, + "step": 13035 + }, + { + "loss": 13.988, + "grad_norm": 1.7723686695098877, + "learning_rate": 0.0005, + "epoch": 0.5844290444204169, + "step": 13040 + }, + { + "loss": 13.9147, + "grad_norm": 1.7638523578643799, + "learning_rate": 0.0005, + "epoch": 0.5846531353116824, + "step": 13045 + }, + { + "loss": 14.0222, + "grad_norm": 1.7969489097595215, + "learning_rate": 0.0005, + "epoch": 0.5848772262029479, + "step": 13050 + }, + { + "loss": 14.0576, + "grad_norm": 1.8615041971206665, + "learning_rate": 0.0005, + "epoch": 0.5851013170942134, + "step": 13055 + }, + { + "loss": 13.9736, + "grad_norm": 1.8189938068389893, + "learning_rate": 0.0005, + "epoch": 0.5853254079854789, + "step": 13060 + }, + { + "loss": 13.9696, + "grad_norm": 1.829389214515686, + "learning_rate": 0.0005, + "epoch": 0.5855494988767445, + "step": 13065 + }, + { + "loss": 14.008, + "grad_norm": 1.8700202703475952, + "learning_rate": 0.0005, + "epoch": 0.5857735897680099, + "step": 13070 + }, + { + "loss": 14.0049, + "grad_norm": 1.7231286764144897, + "learning_rate": 0.0005, + "epoch": 0.5859976806592754, + "step": 13075 + }, + { + "loss": 13.9735, + "grad_norm": 2.0502021312713623, + "learning_rate": 0.0005, + "epoch": 0.5862217715505409, + "step": 13080 + }, + { + "loss": 14.0124, + "grad_norm": 2.1990959644317627, + "learning_rate": 0.0005, + "epoch": 0.5864458624418064, + "step": 13085 + }, + { + "loss": 14.1032, + "grad_norm": 1.902991771697998, + "learning_rate": 0.0005, + "epoch": 0.5866699533330719, + "step": 13090 + }, + { + "loss": 13.9135, + "grad_norm": 1.738110065460205, + "learning_rate": 0.0005, + "epoch": 0.5868940442243374, + "step": 13095 + }, + { + "loss": 14.0366, + "grad_norm": 1.736744999885559, + "learning_rate": 0.0005, + "epoch": 0.5871181351156028, + "step": 13100 + }, + { + "loss": 14.0341, + "grad_norm": 1.728366732597351, + "learning_rate": 0.0005, + "epoch": 0.5873422260068684, + "step": 13105 + }, + { + "loss": 14.0086, + "grad_norm": 1.9115058183670044, + "learning_rate": 0.0005, + "epoch": 0.5875663168981339, + "step": 13110 + }, + { + "loss": 13.9843, + "grad_norm": 1.8320019245147705, + "learning_rate": 0.0005, + "epoch": 0.5877904077893994, + "step": 13115 + }, + { + "loss": 13.9944, + "grad_norm": 1.8176792860031128, + "learning_rate": 0.0005, + "epoch": 0.5880144986806649, + "step": 13120 + }, + { + "loss": 14.0356, + "grad_norm": 1.802194356918335, + "learning_rate": 0.0005, + "epoch": 0.5882385895719304, + "step": 13125 + }, + { + "loss": 13.9914, + "grad_norm": 1.7907404899597168, + "learning_rate": 0.0005, + "epoch": 0.5884626804631958, + "step": 13130 + }, + { + "loss": 13.9473, + "grad_norm": 1.9506714344024658, + "learning_rate": 0.0005, + "epoch": 0.5886867713544613, + "step": 13135 + }, + { + "loss": 13.9552, + "grad_norm": 1.9168850183486938, + "learning_rate": 0.0005, + "epoch": 0.5889108622457269, + "step": 13140 + }, + { + "loss": 14.1582, + "grad_norm": 1.8277353048324585, + "learning_rate": 0.0005, + "epoch": 0.5891349531369924, + "step": 13145 + }, + { + "loss": 14.0282, + "grad_norm": 1.944557785987854, + "learning_rate": 0.0005, + "epoch": 0.5893590440282579, + "step": 13150 + }, + { + "loss": 14.017, + "grad_norm": 1.851028323173523, + "learning_rate": 0.0005, + "epoch": 0.5895831349195234, + "step": 13155 + }, + { + "loss": 14.046, + "grad_norm": 1.9107221364974976, + "learning_rate": 0.0005, + "epoch": 0.5898072258107888, + "step": 13160 + }, + { + "loss": 13.96, + "grad_norm": 1.908125638961792, + "learning_rate": 0.0005, + "epoch": 0.5900313167020543, + "step": 13165 + }, + { + "loss": 14.0432, + "grad_norm": 1.729802131652832, + "learning_rate": 0.0005, + "epoch": 0.5902554075933198, + "step": 13170 + }, + { + "loss": 13.8967, + "grad_norm": 1.9202781915664673, + "learning_rate": 0.0005, + "epoch": 0.5904794984845854, + "step": 13175 + }, + { + "loss": 13.9397, + "grad_norm": 1.7435154914855957, + "learning_rate": 0.0005, + "epoch": 0.5907035893758509, + "step": 13180 + }, + { + "loss": 13.9726, + "grad_norm": 1.8927922248840332, + "learning_rate": 0.0005, + "epoch": 0.5909276802671164, + "step": 13185 + }, + { + "loss": 13.9359, + "grad_norm": 1.952462911605835, + "learning_rate": 0.0005, + "epoch": 0.5911517711583818, + "step": 13190 + }, + { + "loss": 13.9552, + "grad_norm": 1.815459132194519, + "learning_rate": 0.0005, + "epoch": 0.5913758620496473, + "step": 13195 + }, + { + "loss": 13.9568, + "grad_norm": 1.8877557516098022, + "learning_rate": 0.0005, + "epoch": 0.5915999529409128, + "step": 13200 + }, + { + "loss": 13.9949, + "grad_norm": 1.7916780710220337, + "learning_rate": 0.0005, + "epoch": 0.5918240438321783, + "step": 13205 + }, + { + "loss": 14.0393, + "grad_norm": 1.8304307460784912, + "learning_rate": 0.0005, + "epoch": 0.5920481347234439, + "step": 13210 + }, + { + "loss": 14.0511, + "grad_norm": 1.842038869857788, + "learning_rate": 0.0005, + "epoch": 0.5922722256147094, + "step": 13215 + }, + { + "loss": 13.9725, + "grad_norm": 1.7629725933074951, + "learning_rate": 0.0005, + "epoch": 0.5924963165059748, + "step": 13220 + }, + { + "loss": 14.0556, + "grad_norm": 1.8662147521972656, + "learning_rate": 0.0005, + "epoch": 0.5927204073972403, + "step": 13225 + }, + { + "loss": 14.1007, + "grad_norm": 1.7180095911026, + "learning_rate": 0.0005, + "epoch": 0.5929444982885058, + "step": 13230 + }, + { + "loss": 14.0402, + "grad_norm": 1.8103516101837158, + "learning_rate": 0.0005, + "epoch": 0.5931685891797713, + "step": 13235 + }, + { + "loss": 13.9343, + "grad_norm": 1.8062591552734375, + "learning_rate": 0.0005, + "epoch": 0.5933926800710368, + "step": 13240 + }, + { + "loss": 14.0915, + "grad_norm": 1.8626552820205688, + "learning_rate": 0.0005, + "epoch": 0.5936167709623024, + "step": 13245 + }, + { + "loss": 14.0781, + "grad_norm": 1.9887112379074097, + "learning_rate": 0.0005, + "epoch": 0.5938408618535678, + "step": 13250 + }, + { + "loss": 13.9957, + "grad_norm": 1.8482990264892578, + "learning_rate": 0.0005, + "epoch": 0.5940649527448333, + "step": 13255 + }, + { + "loss": 14.0905, + "grad_norm": 1.8576593399047852, + "learning_rate": 0.0005, + "epoch": 0.5942890436360988, + "step": 13260 + }, + { + "loss": 14.0453, + "grad_norm": 1.986657738685608, + "learning_rate": 0.0005, + "epoch": 0.5945131345273643, + "step": 13265 + }, + { + "loss": 13.964, + "grad_norm": 1.9155220985412598, + "learning_rate": 0.0005, + "epoch": 0.5947372254186298, + "step": 13270 + }, + { + "loss": 14.0482, + "grad_norm": 1.9993773698806763, + "learning_rate": 0.0005, + "epoch": 0.5949613163098952, + "step": 13275 + }, + { + "loss": 13.9583, + "grad_norm": 1.8450108766555786, + "learning_rate": 0.0005, + "epoch": 0.5951854072011608, + "step": 13280 + }, + { + "loss": 13.9949, + "grad_norm": 1.7589229345321655, + "learning_rate": 0.0005, + "epoch": 0.5954094980924263, + "step": 13285 + }, + { + "loss": 13.9284, + "grad_norm": 1.9227980375289917, + "learning_rate": 0.0005, + "epoch": 0.5956335889836918, + "step": 13290 + }, + { + "loss": 14.0239, + "grad_norm": 1.8273468017578125, + "learning_rate": 0.0005, + "epoch": 0.5958576798749573, + "step": 13295 + }, + { + "loss": 13.975, + "grad_norm": 1.8974274396896362, + "learning_rate": 0.0005, + "epoch": 0.5960817707662228, + "step": 13300 + }, + { + "loss": 13.9155, + "grad_norm": 1.7669709920883179, + "learning_rate": 0.0005, + "epoch": 0.5963058616574882, + "step": 13305 + }, + { + "loss": 13.9947, + "grad_norm": 1.719914436340332, + "learning_rate": 0.0005, + "epoch": 0.5965299525487537, + "step": 13310 + }, + { + "loss": 13.9589, + "grad_norm": 1.8317246437072754, + "learning_rate": 0.0005, + "epoch": 0.5967540434400193, + "step": 13315 + }, + { + "loss": 14.0374, + "grad_norm": 1.9633820056915283, + "learning_rate": 0.0005, + "epoch": 0.5969781343312848, + "step": 13320 + }, + { + "loss": 14.0707, + "grad_norm": 1.788787841796875, + "learning_rate": 0.0005, + "epoch": 0.5972022252225503, + "step": 13325 + }, + { + "loss": 13.9841, + "grad_norm": 1.7529159784317017, + "learning_rate": 0.0005, + "epoch": 0.5974263161138158, + "step": 13330 + }, + { + "loss": 13.9827, + "grad_norm": 1.8291631937026978, + "learning_rate": 0.0005, + "epoch": 0.5976504070050812, + "step": 13335 + }, + { + "loss": 13.9554, + "grad_norm": 1.8712011575698853, + "learning_rate": 0.0005, + "epoch": 0.5978744978963467, + "step": 13340 + }, + { + "loss": 14.015, + "grad_norm": 2.0833263397216797, + "learning_rate": 0.0005, + "epoch": 0.5980985887876122, + "step": 13345 + }, + { + "loss": 14.1187, + "grad_norm": 1.7845981121063232, + "learning_rate": 0.0005, + "epoch": 0.5983226796788778, + "step": 13350 + }, + { + "loss": 13.9722, + "grad_norm": 1.9722967147827148, + "learning_rate": 0.0005, + "epoch": 0.5985467705701433, + "step": 13355 + }, + { + "loss": 14.0666, + "grad_norm": 1.8057105541229248, + "learning_rate": 0.0005, + "epoch": 0.5987708614614088, + "step": 13360 + }, + { + "loss": 13.9814, + "grad_norm": 1.8602194786071777, + "learning_rate": 0.0005, + "epoch": 0.5989949523526742, + "step": 13365 + }, + { + "loss": 13.988, + "grad_norm": 1.906585931777954, + "learning_rate": 0.0005, + "epoch": 0.5992190432439397, + "step": 13370 + }, + { + "loss": 14.0361, + "grad_norm": 1.843865156173706, + "learning_rate": 0.0005, + "epoch": 0.5994431341352052, + "step": 13375 + }, + { + "loss": 14.067, + "grad_norm": 1.943974256515503, + "learning_rate": 0.0005, + "epoch": 0.5996672250264707, + "step": 13380 + }, + { + "loss": 14.0264, + "grad_norm": 1.87297785282135, + "learning_rate": 0.0005, + "epoch": 0.5998913159177363, + "step": 13385 + }, + { + "loss": 13.963, + "grad_norm": 1.9725892543792725, + "learning_rate": 0.0005, + "epoch": 0.6001154068090018, + "step": 13390 + }, + { + "loss": 14.0057, + "grad_norm": 1.7576072216033936, + "learning_rate": 0.0005, + "epoch": 0.6003394977002672, + "step": 13395 + }, + { + "loss": 14.0163, + "grad_norm": 1.7891968488693237, + "learning_rate": 0.0005, + "epoch": 0.6005635885915327, + "step": 13400 + }, + { + "loss": 14.0749, + "grad_norm": 2.0627856254577637, + "learning_rate": 0.0005, + "epoch": 0.6007876794827982, + "step": 13405 + }, + { + "loss": 13.9621, + "grad_norm": 2.045072317123413, + "learning_rate": 0.0005, + "epoch": 0.6010117703740637, + "step": 13410 + }, + { + "loss": 14.0363, + "grad_norm": 1.9276081323623657, + "learning_rate": 0.0005, + "epoch": 0.6012358612653292, + "step": 13415 + }, + { + "loss": 14.0716, + "grad_norm": 1.9165892601013184, + "learning_rate": 0.0005, + "epoch": 0.6014599521565948, + "step": 13420 + }, + { + "loss": 14.0879, + "grad_norm": 1.9438608884811401, + "learning_rate": 0.0005, + "epoch": 0.6016840430478602, + "step": 13425 + }, + { + "loss": 13.9669, + "grad_norm": 1.997025966644287, + "learning_rate": 0.0005, + "epoch": 0.6019081339391257, + "step": 13430 + }, + { + "loss": 13.9591, + "grad_norm": 2.0243873596191406, + "learning_rate": 0.0005, + "epoch": 0.6021322248303912, + "step": 13435 + }, + { + "loss": 13.9436, + "grad_norm": 2.027860641479492, + "learning_rate": 0.0005, + "epoch": 0.6023563157216567, + "step": 13440 + }, + { + "loss": 13.7903, + "grad_norm": 1.785510540008545, + "learning_rate": 0.0005, + "epoch": 0.6025804066129222, + "step": 13445 + }, + { + "loss": 14.0328, + "grad_norm": 1.863256812095642, + "learning_rate": 0.0005, + "epoch": 0.6028044975041877, + "step": 13450 + }, + { + "loss": 13.9454, + "grad_norm": 1.8799083232879639, + "learning_rate": 0.0005, + "epoch": 0.6030285883954531, + "step": 13455 + }, + { + "loss": 13.9203, + "grad_norm": 1.7613584995269775, + "learning_rate": 0.0005, + "epoch": 0.6032526792867187, + "step": 13460 + }, + { + "loss": 13.993, + "grad_norm": 1.7297017574310303, + "learning_rate": 0.0005, + "epoch": 0.6034767701779842, + "step": 13465 + }, + { + "loss": 13.976, + "grad_norm": 1.9386945962905884, + "learning_rate": 0.0005, + "epoch": 0.6037008610692497, + "step": 13470 + }, + { + "loss": 13.8954, + "grad_norm": 1.883724570274353, + "learning_rate": 0.0005, + "epoch": 0.6039249519605152, + "step": 13475 + }, + { + "loss": 13.9777, + "grad_norm": 1.9041953086853027, + "learning_rate": 0.0005, + "epoch": 0.6041490428517807, + "step": 13480 + }, + { + "loss": 14.0164, + "grad_norm": 1.890601396560669, + "learning_rate": 0.0005, + "epoch": 0.6043731337430461, + "step": 13485 + }, + { + "loss": 13.9782, + "grad_norm": 1.8261315822601318, + "learning_rate": 0.0005, + "epoch": 0.6045972246343116, + "step": 13490 + }, + { + "loss": 14.0556, + "grad_norm": 1.8594465255737305, + "learning_rate": 0.0005, + "epoch": 0.6048213155255772, + "step": 13495 + }, + { + "loss": 13.9321, + "grad_norm": 1.9538732767105103, + "learning_rate": 0.0005, + "epoch": 0.6050454064168427, + "step": 13500 + }, + { + "eval_loss": 1.7396843433380127, + "eval_runtime": 18.3572, + "eval_samples_per_second": 892.513, + "eval_steps_per_second": 8.008, + "epoch": 0.6050454064168427, + "step": 13500 + }, + { + "loss": 14.0172, + "grad_norm": 1.9603897333145142, + "learning_rate": 0.0005, + "epoch": 0.6052694973081082, + "step": 13505 + }, + { + "loss": 13.9596, + "grad_norm": 1.8882741928100586, + "learning_rate": 0.0005, + "epoch": 0.6054935881993737, + "step": 13510 + }, + { + "loss": 13.9479, + "grad_norm": 1.9250017404556274, + "learning_rate": 0.0005, + "epoch": 0.6057176790906391, + "step": 13515 + }, + { + "loss": 13.9714, + "grad_norm": 1.8301010131835938, + "learning_rate": 0.0005, + "epoch": 0.6059417699819046, + "step": 13520 + }, + { + "loss": 13.9986, + "grad_norm": 1.8147294521331787, + "learning_rate": 0.0005, + "epoch": 0.6061658608731701, + "step": 13525 + }, + { + "loss": 13.9523, + "grad_norm": 1.8113244771957397, + "learning_rate": 0.0005, + "epoch": 0.6063899517644357, + "step": 13530 + }, + { + "loss": 13.9375, + "grad_norm": 1.8791850805282593, + "learning_rate": 0.0005, + "epoch": 0.6066140426557012, + "step": 13535 + }, + { + "loss": 13.9489, + "grad_norm": 1.732534646987915, + "learning_rate": 0.0005, + "epoch": 0.6068381335469667, + "step": 13540 + }, + { + "loss": 13.8999, + "grad_norm": 1.9004950523376465, + "learning_rate": 0.0005, + "epoch": 0.6070622244382321, + "step": 13545 + }, + { + "loss": 14.0194, + "grad_norm": 1.957031011581421, + "learning_rate": 0.0005, + "epoch": 0.6072863153294976, + "step": 13550 + }, + { + "loss": 14.0431, + "grad_norm": 1.8446530103683472, + "learning_rate": 0.0005, + "epoch": 0.6075104062207631, + "step": 13555 + }, + { + "loss": 13.9807, + "grad_norm": 2.006579875946045, + "learning_rate": 0.0005, + "epoch": 0.6077344971120286, + "step": 13560 + }, + { + "loss": 14.0643, + "grad_norm": 1.957889199256897, + "learning_rate": 0.0005, + "epoch": 0.6079585880032942, + "step": 13565 + }, + { + "loss": 14.0157, + "grad_norm": 1.978819489479065, + "learning_rate": 0.0005, + "epoch": 0.6081826788945597, + "step": 13570 + }, + { + "loss": 14.0688, + "grad_norm": 1.799669623374939, + "learning_rate": 0.0005, + "epoch": 0.6084067697858251, + "step": 13575 + }, + { + "loss": 14.0564, + "grad_norm": 1.74842369556427, + "learning_rate": 0.0005, + "epoch": 0.6086308606770906, + "step": 13580 + }, + { + "loss": 13.9357, + "grad_norm": 1.9134894609451294, + "learning_rate": 0.0005, + "epoch": 0.6088549515683561, + "step": 13585 + }, + { + "loss": 13.9969, + "grad_norm": 1.7899885177612305, + "learning_rate": 0.0005, + "epoch": 0.6090790424596216, + "step": 13590 + }, + { + "loss": 14.1657, + "grad_norm": 1.8893680572509766, + "learning_rate": 0.0005, + "epoch": 0.6093031333508871, + "step": 13595 + }, + { + "loss": 14.0536, + "grad_norm": 1.9131115674972534, + "learning_rate": 0.0005, + "epoch": 0.6095272242421527, + "step": 13600 + }, + { + "loss": 13.9266, + "grad_norm": 1.7662969827651978, + "learning_rate": 0.0005, + "epoch": 0.6097513151334181, + "step": 13605 + }, + { + "loss": 13.9748, + "grad_norm": 1.6508381366729736, + "learning_rate": 0.0005, + "epoch": 0.6099754060246836, + "step": 13610 + }, + { + "loss": 14.0392, + "grad_norm": 1.7770224809646606, + "learning_rate": 0.0005, + "epoch": 0.6101994969159491, + "step": 13615 + }, + { + "loss": 13.9233, + "grad_norm": 1.875481367111206, + "learning_rate": 0.0005, + "epoch": 0.6104235878072146, + "step": 13620 + }, + { + "loss": 13.9507, + "grad_norm": 1.7938653230667114, + "learning_rate": 0.0005, + "epoch": 0.6106476786984801, + "step": 13625 + }, + { + "loss": 13.9316, + "grad_norm": 1.8137539625167847, + "learning_rate": 0.0005, + "epoch": 0.6108717695897457, + "step": 13630 + }, + { + "loss": 13.9899, + "grad_norm": 1.780452847480774, + "learning_rate": 0.0005, + "epoch": 0.611095860481011, + "step": 13635 + }, + { + "loss": 14.0376, + "grad_norm": 1.930036187171936, + "learning_rate": 0.0005, + "epoch": 0.6113199513722766, + "step": 13640 + }, + { + "loss": 13.9412, + "grad_norm": 1.8009343147277832, + "learning_rate": 0.0005, + "epoch": 0.6115440422635421, + "step": 13645 + }, + { + "loss": 14.0176, + "grad_norm": 1.777569055557251, + "learning_rate": 0.0005, + "epoch": 0.6117681331548076, + "step": 13650 + }, + { + "loss": 13.9528, + "grad_norm": 1.8455289602279663, + "learning_rate": 0.0005, + "epoch": 0.6119922240460731, + "step": 13655 + }, + { + "loss": 13.9233, + "grad_norm": 1.8569191694259644, + "learning_rate": 0.0005, + "epoch": 0.6122163149373386, + "step": 13660 + }, + { + "loss": 14.0166, + "grad_norm": 1.8818286657333374, + "learning_rate": 0.0005, + "epoch": 0.612440405828604, + "step": 13665 + }, + { + "loss": 14.0259, + "grad_norm": 1.8745702505111694, + "learning_rate": 0.0005, + "epoch": 0.6126644967198696, + "step": 13670 + }, + { + "loss": 13.8863, + "grad_norm": 1.8713172674179077, + "learning_rate": 0.0005, + "epoch": 0.6128885876111351, + "step": 13675 + }, + { + "loss": 14.083, + "grad_norm": 1.9167075157165527, + "learning_rate": 0.0005, + "epoch": 0.6131126785024006, + "step": 13680 + }, + { + "loss": 13.9466, + "grad_norm": 1.9366717338562012, + "learning_rate": 0.0005, + "epoch": 0.6133367693936661, + "step": 13685 + }, + { + "loss": 14.0146, + "grad_norm": 2.0661909580230713, + "learning_rate": 0.0005, + "epoch": 0.6135608602849316, + "step": 13690 + }, + { + "loss": 14.0099, + "grad_norm": 2.0465962886810303, + "learning_rate": 0.0005, + "epoch": 0.613784951176197, + "step": 13695 + }, + { + "loss": 13.9691, + "grad_norm": 2.050764560699463, + "learning_rate": 0.0005, + "epoch": 0.6140090420674625, + "step": 13700 + }, + { + "loss": 13.962, + "grad_norm": 2.0589582920074463, + "learning_rate": 0.0005, + "epoch": 0.6142331329587281, + "step": 13705 + }, + { + "loss": 13.9126, + "grad_norm": 1.9636064767837524, + "learning_rate": 0.0005, + "epoch": 0.6144572238499936, + "step": 13710 + }, + { + "loss": 13.894, + "grad_norm": 1.9355812072753906, + "learning_rate": 0.0005, + "epoch": 0.6146813147412591, + "step": 13715 + }, + { + "loss": 13.935, + "grad_norm": 1.7583673000335693, + "learning_rate": 0.0005, + "epoch": 0.6149054056325246, + "step": 13720 + }, + { + "loss": 14.0261, + "grad_norm": 1.8463855981826782, + "learning_rate": 0.0005, + "epoch": 0.61512949652379, + "step": 13725 + }, + { + "loss": 14.0147, + "grad_norm": 1.794784665107727, + "learning_rate": 0.0005, + "epoch": 0.6153535874150555, + "step": 13730 + }, + { + "loss": 14.0213, + "grad_norm": 1.9001660346984863, + "learning_rate": 0.0005, + "epoch": 0.615577678306321, + "step": 13735 + }, + { + "loss": 13.9914, + "grad_norm": 1.800702691078186, + "learning_rate": 0.0005, + "epoch": 0.6158017691975866, + "step": 13740 + }, + { + "loss": 14.0857, + "grad_norm": 1.8461076021194458, + "learning_rate": 0.0005, + "epoch": 0.6160258600888521, + "step": 13745 + }, + { + "loss": 13.9006, + "grad_norm": 1.7148480415344238, + "learning_rate": 0.0005, + "epoch": 0.6162499509801176, + "step": 13750 + }, + { + "loss": 13.9633, + "grad_norm": 1.772823452949524, + "learning_rate": 0.0005, + "epoch": 0.616474041871383, + "step": 13755 + }, + { + "loss": 13.9953, + "grad_norm": 1.8532960414886475, + "learning_rate": 0.0005, + "epoch": 0.6166981327626485, + "step": 13760 + }, + { + "loss": 13.9837, + "grad_norm": 1.9373810291290283, + "learning_rate": 0.0005, + "epoch": 0.616922223653914, + "step": 13765 + }, + { + "loss": 14.0104, + "grad_norm": 1.789506435394287, + "learning_rate": 0.0005, + "epoch": 0.6171463145451795, + "step": 13770 + }, + { + "loss": 13.9058, + "grad_norm": 1.9801504611968994, + "learning_rate": 0.0005, + "epoch": 0.6173704054364451, + "step": 13775 + }, + { + "loss": 14.1375, + "grad_norm": 2.04109263420105, + "learning_rate": 0.0005, + "epoch": 0.6175944963277106, + "step": 13780 + }, + { + "loss": 13.953, + "grad_norm": 1.7348535060882568, + "learning_rate": 0.0005, + "epoch": 0.617818587218976, + "step": 13785 + }, + { + "loss": 14.066, + "grad_norm": 1.89817214012146, + "learning_rate": 0.0005, + "epoch": 0.6180426781102415, + "step": 13790 + }, + { + "loss": 13.9058, + "grad_norm": 2.1307947635650635, + "learning_rate": 0.0005, + "epoch": 0.618266769001507, + "step": 13795 + }, + { + "loss": 13.9577, + "grad_norm": 2.0067522525787354, + "learning_rate": 0.0005, + "epoch": 0.6184908598927725, + "step": 13800 + }, + { + "loss": 13.9795, + "grad_norm": 1.8235889673233032, + "learning_rate": 0.0005, + "epoch": 0.618714950784038, + "step": 13805 + }, + { + "loss": 13.9652, + "grad_norm": 1.71876060962677, + "learning_rate": 0.0005, + "epoch": 0.6189390416753036, + "step": 13810 + }, + { + "loss": 13.9877, + "grad_norm": 1.8106869459152222, + "learning_rate": 0.0005, + "epoch": 0.619163132566569, + "step": 13815 + }, + { + "loss": 13.9324, + "grad_norm": 1.8171489238739014, + "learning_rate": 0.0005, + "epoch": 0.6193872234578345, + "step": 13820 + }, + { + "loss": 14.0892, + "grad_norm": 1.7866055965423584, + "learning_rate": 0.0005, + "epoch": 0.6196113143491, + "step": 13825 + }, + { + "loss": 14.0149, + "grad_norm": 1.8658759593963623, + "learning_rate": 0.0005, + "epoch": 0.6198354052403655, + "step": 13830 + }, + { + "loss": 14.0664, + "grad_norm": 2.031806468963623, + "learning_rate": 0.0005, + "epoch": 0.620059496131631, + "step": 13835 + }, + { + "loss": 14.1421, + "grad_norm": 1.8138434886932373, + "learning_rate": 0.0005, + "epoch": 0.6202835870228964, + "step": 13840 + }, + { + "loss": 14.0283, + "grad_norm": 1.8643549680709839, + "learning_rate": 0.0005, + "epoch": 0.620507677914162, + "step": 13845 + }, + { + "loss": 13.8748, + "grad_norm": 2.06784987449646, + "learning_rate": 0.0005, + "epoch": 0.6207317688054275, + "step": 13850 + }, + { + "loss": 13.9531, + "grad_norm": 1.9382271766662598, + "learning_rate": 0.0005, + "epoch": 0.620955859696693, + "step": 13855 + }, + { + "loss": 14.0412, + "grad_norm": 1.8744487762451172, + "learning_rate": 0.0005, + "epoch": 0.6211799505879585, + "step": 13860 + }, + { + "loss": 13.8106, + "grad_norm": 1.8505882024765015, + "learning_rate": 0.0005, + "epoch": 0.621404041479224, + "step": 13865 + }, + { + "loss": 13.9749, + "grad_norm": 1.9018405675888062, + "learning_rate": 0.0005, + "epoch": 0.6216281323704894, + "step": 13870 + }, + { + "loss": 13.958, + "grad_norm": 1.9398044347763062, + "learning_rate": 0.0005, + "epoch": 0.6218522232617549, + "step": 13875 + }, + { + "loss": 13.9535, + "grad_norm": 1.8147361278533936, + "learning_rate": 0.0005, + "epoch": 0.6220763141530204, + "step": 13880 + }, + { + "loss": 13.911, + "grad_norm": 1.730082631111145, + "learning_rate": 0.0005, + "epoch": 0.622300405044286, + "step": 13885 + }, + { + "loss": 13.961, + "grad_norm": 2.0570127964019775, + "learning_rate": 0.0005, + "epoch": 0.6225244959355515, + "step": 13890 + }, + { + "loss": 14.05, + "grad_norm": 1.9781049489974976, + "learning_rate": 0.0005, + "epoch": 0.622748586826817, + "step": 13895 + }, + { + "loss": 14.0566, + "grad_norm": 1.8938955068588257, + "learning_rate": 0.0005, + "epoch": 0.6229726777180824, + "step": 13900 + }, + { + "loss": 14.0239, + "grad_norm": 1.8804343938827515, + "learning_rate": 0.0005, + "epoch": 0.6231967686093479, + "step": 13905 + }, + { + "loss": 14.009, + "grad_norm": 1.9425048828125, + "learning_rate": 0.0005, + "epoch": 0.6234208595006134, + "step": 13910 + }, + { + "loss": 14.0123, + "grad_norm": 1.753278136253357, + "learning_rate": 0.0005, + "epoch": 0.623644950391879, + "step": 13915 + }, + { + "loss": 13.9572, + "grad_norm": 1.8765408992767334, + "learning_rate": 0.0005, + "epoch": 0.6238690412831445, + "step": 13920 + }, + { + "loss": 14.0642, + "grad_norm": 1.867653250694275, + "learning_rate": 0.0005, + "epoch": 0.62409313217441, + "step": 13925 + }, + { + "loss": 13.9205, + "grad_norm": 1.8997493982315063, + "learning_rate": 0.0005, + "epoch": 0.6243172230656754, + "step": 13930 + }, + { + "loss": 13.9988, + "grad_norm": 1.8226754665374756, + "learning_rate": 0.0005, + "epoch": 0.6245413139569409, + "step": 13935 + }, + { + "loss": 13.9536, + "grad_norm": 1.774327039718628, + "learning_rate": 0.0005, + "epoch": 0.6247654048482064, + "step": 13940 + }, + { + "loss": 13.958, + "grad_norm": 2.0624618530273438, + "learning_rate": 0.0005, + "epoch": 0.6249894957394719, + "step": 13945 + }, + { + "loss": 14.0178, + "grad_norm": 2.0096333026885986, + "learning_rate": 0.0005, + "epoch": 0.6252135866307375, + "step": 13950 + }, + { + "loss": 13.9385, + "grad_norm": 1.836005449295044, + "learning_rate": 0.0005, + "epoch": 0.625437677522003, + "step": 13955 + }, + { + "loss": 13.9841, + "grad_norm": 1.7480518817901611, + "learning_rate": 0.0005, + "epoch": 0.6256617684132684, + "step": 13960 + }, + { + "loss": 14.0596, + "grad_norm": 1.9773201942443848, + "learning_rate": 0.0005, + "epoch": 0.6258858593045339, + "step": 13965 + }, + { + "loss": 13.9208, + "grad_norm": 1.8041434288024902, + "learning_rate": 0.0005, + "epoch": 0.6261099501957994, + "step": 13970 + }, + { + "loss": 14.0366, + "grad_norm": 1.7311369180679321, + "learning_rate": 0.0005, + "epoch": 0.6263340410870649, + "step": 13975 + }, + { + "loss": 13.996, + "grad_norm": 1.7201324701309204, + "learning_rate": 0.0005, + "epoch": 0.6265581319783304, + "step": 13980 + }, + { + "loss": 14.0056, + "grad_norm": 1.814854621887207, + "learning_rate": 0.0005, + "epoch": 0.626782222869596, + "step": 13985 + }, + { + "loss": 13.9742, + "grad_norm": 1.9238992929458618, + "learning_rate": 0.0005, + "epoch": 0.6270063137608614, + "step": 13990 + }, + { + "loss": 13.9946, + "grad_norm": 1.7817552089691162, + "learning_rate": 0.0005, + "epoch": 0.6272304046521269, + "step": 13995 + }, + { + "loss": 13.9743, + "grad_norm": 1.9954774379730225, + "learning_rate": 0.0005, + "epoch": 0.6274544955433924, + "step": 14000 + }, + { + "eval_loss": 1.7412445545196533, + "eval_runtime": 18.6224, + "eval_samples_per_second": 879.8, + "eval_steps_per_second": 7.894, + "epoch": 0.6274544955433924, + "step": 14000 + }, + { + "loss": 14.0041, + "grad_norm": 1.7871489524841309, + "learning_rate": 0.0005, + "epoch": 0.6276785864346579, + "step": 14005 + }, + { + "loss": 14.0718, + "grad_norm": 1.705079436302185, + "learning_rate": 0.0005, + "epoch": 0.6279026773259234, + "step": 14010 + }, + { + "loss": 13.9973, + "grad_norm": 1.8414729833602905, + "learning_rate": 0.0005, + "epoch": 0.6281267682171889, + "step": 14015 + }, + { + "loss": 14.0205, + "grad_norm": 1.7663222551345825, + "learning_rate": 0.0005, + "epoch": 0.6283508591084543, + "step": 14020 + }, + { + "loss": 14.0191, + "grad_norm": 1.9664376974105835, + "learning_rate": 0.0005, + "epoch": 0.6285749499997199, + "step": 14025 + }, + { + "loss": 13.9239, + "grad_norm": 1.706502079963684, + "learning_rate": 0.0005, + "epoch": 0.6287990408909854, + "step": 14030 + }, + { + "loss": 14.0895, + "grad_norm": 1.858054757118225, + "learning_rate": 0.0005, + "epoch": 0.6290231317822509, + "step": 14035 + }, + { + "loss": 14.0412, + "grad_norm": 1.9600058794021606, + "learning_rate": 0.0005, + "epoch": 0.6292472226735164, + "step": 14040 + }, + { + "loss": 13.9395, + "grad_norm": 2.096877098083496, + "learning_rate": 0.0005, + "epoch": 0.6294713135647819, + "step": 14045 + }, + { + "loss": 13.9903, + "grad_norm": 1.7399251461029053, + "learning_rate": 0.0005, + "epoch": 0.6296954044560473, + "step": 14050 + }, + { + "loss": 14.0175, + "grad_norm": 1.7890634536743164, + "learning_rate": 0.0005, + "epoch": 0.6299194953473128, + "step": 14055 + }, + { + "loss": 13.9969, + "grad_norm": 1.7637101411819458, + "learning_rate": 0.0005, + "epoch": 0.6301435862385784, + "step": 14060 + }, + { + "loss": 13.9807, + "grad_norm": 1.8002638816833496, + "learning_rate": 0.0005, + "epoch": 0.6303676771298439, + "step": 14065 + }, + { + "loss": 14.1129, + "grad_norm": 1.9056965112686157, + "learning_rate": 0.0005, + "epoch": 0.6305917680211094, + "step": 14070 + }, + { + "loss": 13.936, + "grad_norm": 1.9890260696411133, + "learning_rate": 0.0005, + "epoch": 0.6308158589123749, + "step": 14075 + }, + { + "loss": 14.0304, + "grad_norm": 1.753678321838379, + "learning_rate": 0.0005, + "epoch": 0.6310399498036403, + "step": 14080 + }, + { + "loss": 13.9602, + "grad_norm": 1.7128273248672485, + "learning_rate": 0.0005, + "epoch": 0.6312640406949058, + "step": 14085 + }, + { + "loss": 13.9127, + "grad_norm": 1.706416130065918, + "learning_rate": 0.0005, + "epoch": 0.6314881315861713, + "step": 14090 + }, + { + "loss": 13.8752, + "grad_norm": 1.8593871593475342, + "learning_rate": 0.0005, + "epoch": 0.6317122224774369, + "step": 14095 + }, + { + "loss": 14.0063, + "grad_norm": 2.070998430252075, + "learning_rate": 0.0005, + "epoch": 0.6319363133687024, + "step": 14100 + }, + { + "loss": 14.0104, + "grad_norm": 1.9629713296890259, + "learning_rate": 0.0005, + "epoch": 0.6321604042599679, + "step": 14105 + }, + { + "loss": 13.929, + "grad_norm": 1.7538399696350098, + "learning_rate": 0.0005, + "epoch": 0.6323844951512333, + "step": 14110 + }, + { + "loss": 14.0087, + "grad_norm": 1.8609179258346558, + "learning_rate": 0.0005, + "epoch": 0.6326085860424988, + "step": 14115 + }, + { + "loss": 13.9808, + "grad_norm": 1.7343677282333374, + "learning_rate": 0.0005, + "epoch": 0.6328326769337643, + "step": 14120 + }, + { + "loss": 13.999, + "grad_norm": 1.8937780857086182, + "learning_rate": 0.0005, + "epoch": 0.6330567678250298, + "step": 14125 + }, + { + "loss": 13.9897, + "grad_norm": 1.700460433959961, + "learning_rate": 0.0005, + "epoch": 0.6332808587162954, + "step": 14130 + }, + { + "loss": 13.915, + "grad_norm": 1.805824875831604, + "learning_rate": 0.0005, + "epoch": 0.6335049496075609, + "step": 14135 + }, + { + "loss": 14.0701, + "grad_norm": 1.755516767501831, + "learning_rate": 0.0005, + "epoch": 0.6337290404988263, + "step": 14140 + }, + { + "loss": 13.9003, + "grad_norm": 1.7965532541275024, + "learning_rate": 0.0005, + "epoch": 0.6339531313900918, + "step": 14145 + }, + { + "loss": 13.886, + "grad_norm": 1.8935853242874146, + "learning_rate": 0.0005, + "epoch": 0.6341772222813573, + "step": 14150 + }, + { + "loss": 13.956, + "grad_norm": 1.919097900390625, + "learning_rate": 0.0005, + "epoch": 0.6344013131726228, + "step": 14155 + }, + { + "loss": 14.0817, + "grad_norm": 2.0242080688476562, + "learning_rate": 0.0005, + "epoch": 0.6346254040638883, + "step": 14160 + }, + { + "loss": 14.0245, + "grad_norm": 1.946304440498352, + "learning_rate": 0.0005, + "epoch": 0.6348494949551539, + "step": 14165 + }, + { + "loss": 13.8935, + "grad_norm": 1.9550292491912842, + "learning_rate": 0.0005, + "epoch": 0.6350735858464193, + "step": 14170 + }, + { + "loss": 14.0761, + "grad_norm": 2.2620325088500977, + "learning_rate": 0.0005, + "epoch": 0.6352976767376848, + "step": 14175 + }, + { + "loss": 14.0249, + "grad_norm": 2.0270233154296875, + "learning_rate": 0.0005, + "epoch": 0.6355217676289503, + "step": 14180 + }, + { + "loss": 14.0, + "grad_norm": 2.0550220012664795, + "learning_rate": 0.0005, + "epoch": 0.6357458585202158, + "step": 14185 + }, + { + "loss": 14.0812, + "grad_norm": 2.0032031536102295, + "learning_rate": 0.0005, + "epoch": 0.6359699494114813, + "step": 14190 + }, + { + "loss": 13.9631, + "grad_norm": 1.7279571294784546, + "learning_rate": 0.0005, + "epoch": 0.6361940403027468, + "step": 14195 + }, + { + "loss": 13.9304, + "grad_norm": 1.76564359664917, + "learning_rate": 0.0005, + "epoch": 0.6364181311940122, + "step": 14200 + }, + { + "loss": 13.9507, + "grad_norm": 1.8951811790466309, + "learning_rate": 0.0005, + "epoch": 0.6366422220852778, + "step": 14205 + }, + { + "loss": 13.9794, + "grad_norm": 1.7524734735488892, + "learning_rate": 0.0005, + "epoch": 0.6368663129765433, + "step": 14210 + }, + { + "loss": 14.092, + "grad_norm": 1.8108766078948975, + "learning_rate": 0.0005, + "epoch": 0.6370904038678088, + "step": 14215 + }, + { + "loss": 14.0535, + "grad_norm": 1.8466496467590332, + "learning_rate": 0.0005, + "epoch": 0.6373144947590743, + "step": 14220 + }, + { + "loss": 14.0813, + "grad_norm": 1.866255760192871, + "learning_rate": 0.0005, + "epoch": 0.6375385856503398, + "step": 14225 + }, + { + "loss": 13.8828, + "grad_norm": 1.8671491146087646, + "learning_rate": 0.0005, + "epoch": 0.6377626765416052, + "step": 14230 + }, + { + "loss": 13.8839, + "grad_norm": 1.6825112104415894, + "learning_rate": 0.0005, + "epoch": 0.6379867674328707, + "step": 14235 + }, + { + "loss": 14.0688, + "grad_norm": 1.8906571865081787, + "learning_rate": 0.0005, + "epoch": 0.6382108583241363, + "step": 14240 + }, + { + "loss": 14.0267, + "grad_norm": 1.99757719039917, + "learning_rate": 0.0005, + "epoch": 0.6384349492154018, + "step": 14245 + }, + { + "loss": 13.9307, + "grad_norm": 1.889869213104248, + "learning_rate": 0.0005, + "epoch": 0.6386590401066673, + "step": 14250 + }, + { + "loss": 13.9939, + "grad_norm": 1.7477375268936157, + "learning_rate": 0.0005, + "epoch": 0.6388831309979328, + "step": 14255 + }, + { + "loss": 13.9191, + "grad_norm": 1.7579962015151978, + "learning_rate": 0.0005, + "epoch": 0.6391072218891982, + "step": 14260 + }, + { + "loss": 14.0355, + "grad_norm": 1.838754653930664, + "learning_rate": 0.0005, + "epoch": 0.6393313127804637, + "step": 14265 + }, + { + "loss": 13.9155, + "grad_norm": 1.835952877998352, + "learning_rate": 0.0005, + "epoch": 0.6395554036717293, + "step": 14270 + }, + { + "loss": 13.92, + "grad_norm": 1.85426926612854, + "learning_rate": 0.0005, + "epoch": 0.6397794945629948, + "step": 14275 + }, + { + "loss": 14.0431, + "grad_norm": 1.7805200815200806, + "learning_rate": 0.0005, + "epoch": 0.6400035854542603, + "step": 14280 + }, + { + "loss": 13.9615, + "grad_norm": 1.7967098951339722, + "learning_rate": 0.0005, + "epoch": 0.6402276763455258, + "step": 14285 + }, + { + "loss": 13.9886, + "grad_norm": 1.6179386377334595, + "learning_rate": 0.0005, + "epoch": 0.6404517672367912, + "step": 14290 + }, + { + "loss": 14.0352, + "grad_norm": 1.9071606397628784, + "learning_rate": 0.0005, + "epoch": 0.6406758581280567, + "step": 14295 + }, + { + "loss": 13.9369, + "grad_norm": 1.928659439086914, + "learning_rate": 0.0005, + "epoch": 0.6408999490193222, + "step": 14300 + }, + { + "loss": 14.0594, + "grad_norm": 1.7425745725631714, + "learning_rate": 0.0005, + "epoch": 0.6411240399105878, + "step": 14305 + }, + { + "loss": 13.9505, + "grad_norm": 1.9480642080307007, + "learning_rate": 0.0005, + "epoch": 0.6413481308018533, + "step": 14310 + }, + { + "loss": 13.9178, + "grad_norm": 1.7811546325683594, + "learning_rate": 0.0005, + "epoch": 0.6415722216931188, + "step": 14315 + }, + { + "loss": 13.9264, + "grad_norm": 1.7841154336929321, + "learning_rate": 0.0005, + "epoch": 0.6417963125843842, + "step": 14320 + }, + { + "loss": 13.963, + "grad_norm": 1.872770071029663, + "learning_rate": 0.0005, + "epoch": 0.6420204034756497, + "step": 14325 + }, + { + "loss": 14.0144, + "grad_norm": 2.028611660003662, + "learning_rate": 0.0005, + "epoch": 0.6422444943669152, + "step": 14330 + }, + { + "loss": 13.9069, + "grad_norm": 1.810651421546936, + "learning_rate": 0.0005, + "epoch": 0.6424685852581807, + "step": 14335 + }, + { + "loss": 13.9686, + "grad_norm": 1.918397307395935, + "learning_rate": 0.0005, + "epoch": 0.6426926761494463, + "step": 14340 + }, + { + "loss": 14.0204, + "grad_norm": 1.9416550397872925, + "learning_rate": 0.0005, + "epoch": 0.6429167670407118, + "step": 14345 + }, + { + "loss": 13.9854, + "grad_norm": 1.8357208967208862, + "learning_rate": 0.0005, + "epoch": 0.6431408579319772, + "step": 14350 + }, + { + "loss": 13.9148, + "grad_norm": 1.8995227813720703, + "learning_rate": 0.0005, + "epoch": 0.6433649488232427, + "step": 14355 + }, + { + "loss": 14.0029, + "grad_norm": 1.7630736827850342, + "learning_rate": 0.0005, + "epoch": 0.6435890397145082, + "step": 14360 + }, + { + "loss": 14.0908, + "grad_norm": 1.9035780429840088, + "learning_rate": 0.0005, + "epoch": 0.6438131306057737, + "step": 14365 + }, + { + "loss": 14.0767, + "grad_norm": 2.125985860824585, + "learning_rate": 0.0005, + "epoch": 0.6440372214970392, + "step": 14370 + }, + { + "loss": 14.0037, + "grad_norm": 2.025540351867676, + "learning_rate": 0.0005, + "epoch": 0.6442613123883046, + "step": 14375 + }, + { + "loss": 14.1204, + "grad_norm": 1.9417158365249634, + "learning_rate": 0.0005, + "epoch": 0.6444854032795702, + "step": 14380 + }, + { + "loss": 13.9539, + "grad_norm": 1.753354549407959, + "learning_rate": 0.0005, + "epoch": 0.6447094941708357, + "step": 14385 + }, + { + "loss": 13.9972, + "grad_norm": 1.866416573524475, + "learning_rate": 0.0005, + "epoch": 0.6449335850621012, + "step": 14390 + }, + { + "loss": 13.9683, + "grad_norm": 1.8368868827819824, + "learning_rate": 0.0005, + "epoch": 0.6451576759533667, + "step": 14395 + }, + { + "loss": 13.8554, + "grad_norm": 1.7871224880218506, + "learning_rate": 0.0005, + "epoch": 0.6453817668446322, + "step": 14400 + }, + { + "loss": 13.8737, + "grad_norm": 1.802868127822876, + "learning_rate": 0.0005, + "epoch": 0.6456058577358976, + "step": 14405 + }, + { + "loss": 13.911, + "grad_norm": 1.8559720516204834, + "learning_rate": 0.0005, + "epoch": 0.6458299486271631, + "step": 14410 + }, + { + "loss": 13.9026, + "grad_norm": 2.0018739700317383, + "learning_rate": 0.0005, + "epoch": 0.6460540395184287, + "step": 14415 + }, + { + "loss": 14.0628, + "grad_norm": 1.8371291160583496, + "learning_rate": 0.0005, + "epoch": 0.6462781304096942, + "step": 14420 + }, + { + "loss": 14.0768, + "grad_norm": 2.005053758621216, + "learning_rate": 0.0005, + "epoch": 0.6465022213009597, + "step": 14425 + }, + { + "loss": 13.9509, + "grad_norm": 1.8181270360946655, + "learning_rate": 0.0005, + "epoch": 0.6467263121922252, + "step": 14430 + }, + { + "loss": 13.9489, + "grad_norm": 1.8325514793395996, + "learning_rate": 0.0005, + "epoch": 0.6469504030834906, + "step": 14435 + }, + { + "loss": 13.9991, + "grad_norm": 1.8346320390701294, + "learning_rate": 0.0005, + "epoch": 0.6471744939747561, + "step": 14440 + }, + { + "loss": 14.0045, + "grad_norm": 1.7492088079452515, + "learning_rate": 0.0005, + "epoch": 0.6473985848660216, + "step": 14445 + }, + { + "loss": 14.0071, + "grad_norm": 1.8489519357681274, + "learning_rate": 0.0005, + "epoch": 0.6476226757572872, + "step": 14450 + }, + { + "loss": 13.874, + "grad_norm": 1.8009483814239502, + "learning_rate": 0.0005, + "epoch": 0.6478467666485527, + "step": 14455 + }, + { + "loss": 13.8814, + "grad_norm": 1.7434006929397583, + "learning_rate": 0.0005, + "epoch": 0.6480708575398182, + "step": 14460 + }, + { + "loss": 13.9751, + "grad_norm": 1.910570502281189, + "learning_rate": 0.0005, + "epoch": 0.6482949484310836, + "step": 14465 + }, + { + "loss": 14.0008, + "grad_norm": 1.9025352001190186, + "learning_rate": 0.0005, + "epoch": 0.6485190393223491, + "step": 14470 + }, + { + "loss": 13.9886, + "grad_norm": 1.918157696723938, + "learning_rate": 0.0005, + "epoch": 0.6487431302136146, + "step": 14475 + }, + { + "loss": 13.9403, + "grad_norm": 1.9863786697387695, + "learning_rate": 0.0005, + "epoch": 0.6489672211048801, + "step": 14480 + }, + { + "loss": 13.9518, + "grad_norm": 1.8702291250228882, + "learning_rate": 0.0005, + "epoch": 0.6491913119961457, + "step": 14485 + }, + { + "loss": 14.0877, + "grad_norm": 2.1532609462738037, + "learning_rate": 0.0005, + "epoch": 0.6494154028874112, + "step": 14490 + }, + { + "loss": 13.9791, + "grad_norm": 1.9106634855270386, + "learning_rate": 0.0005, + "epoch": 0.6496394937786766, + "step": 14495 + }, + { + "loss": 13.9435, + "grad_norm": 1.7779203653335571, + "learning_rate": 0.0005, + "epoch": 0.6498635846699421, + "step": 14500 + }, + { + "eval_loss": 1.741438627243042, + "eval_runtime": 18.8225, + "eval_samples_per_second": 870.449, + "eval_steps_per_second": 7.81, + "epoch": 0.6498635846699421, + "step": 14500 + }, + { + "loss": 13.8422, + "grad_norm": 1.825476884841919, + "learning_rate": 0.0005, + "epoch": 0.6500876755612076, + "step": 14505 + }, + { + "loss": 13.8998, + "grad_norm": 2.1719396114349365, + "learning_rate": 0.0005, + "epoch": 0.6503117664524731, + "step": 14510 + }, + { + "loss": 13.9174, + "grad_norm": 1.8220202922821045, + "learning_rate": 0.0005, + "epoch": 0.6505358573437386, + "step": 14515 + }, + { + "loss": 13.8703, + "grad_norm": 1.6797312498092651, + "learning_rate": 0.0005, + "epoch": 0.6507599482350042, + "step": 14520 + }, + { + "loss": 13.9191, + "grad_norm": 1.8802516460418701, + "learning_rate": 0.0005, + "epoch": 0.6509840391262696, + "step": 14525 + }, + { + "loss": 13.9308, + "grad_norm": 1.7543238401412964, + "learning_rate": 0.0005, + "epoch": 0.6512081300175351, + "step": 14530 + }, + { + "loss": 13.9549, + "grad_norm": 1.8070513010025024, + "learning_rate": 0.0005, + "epoch": 0.6514322209088006, + "step": 14535 + }, + { + "loss": 13.8917, + "grad_norm": 1.7234477996826172, + "learning_rate": 0.0005, + "epoch": 0.6516563118000661, + "step": 14540 + }, + { + "loss": 14.0706, + "grad_norm": 1.919089674949646, + "learning_rate": 0.0005, + "epoch": 0.6518804026913316, + "step": 14545 + }, + { + "loss": 13.9334, + "grad_norm": 1.6644461154937744, + "learning_rate": 0.0005, + "epoch": 0.6521044935825971, + "step": 14550 + }, + { + "loss": 13.9985, + "grad_norm": 1.8210339546203613, + "learning_rate": 0.0005, + "epoch": 0.6523285844738625, + "step": 14555 + }, + { + "loss": 14.1057, + "grad_norm": 1.849721074104309, + "learning_rate": 0.0005, + "epoch": 0.6525526753651281, + "step": 14560 + }, + { + "loss": 13.9989, + "grad_norm": 1.818102478981018, + "learning_rate": 0.0005, + "epoch": 0.6527767662563936, + "step": 14565 + }, + { + "loss": 13.8813, + "grad_norm": 1.669271469116211, + "learning_rate": 0.0005, + "epoch": 0.6530008571476591, + "step": 14570 + }, + { + "loss": 14.047, + "grad_norm": 1.731544852256775, + "learning_rate": 0.0005, + "epoch": 0.6532249480389246, + "step": 14575 + }, + { + "loss": 13.9806, + "grad_norm": 1.7254154682159424, + "learning_rate": 0.0005, + "epoch": 0.6534490389301901, + "step": 14580 + }, + { + "loss": 13.9701, + "grad_norm": 1.8537356853485107, + "learning_rate": 0.0005, + "epoch": 0.6536731298214555, + "step": 14585 + }, + { + "loss": 13.9786, + "grad_norm": 1.8968428373336792, + "learning_rate": 0.0005, + "epoch": 0.653897220712721, + "step": 14590 + }, + { + "loss": 13.9947, + "grad_norm": 1.9824936389923096, + "learning_rate": 0.0005, + "epoch": 0.6541213116039866, + "step": 14595 + }, + { + "loss": 13.9541, + "grad_norm": 2.01820969581604, + "learning_rate": 0.0005, + "epoch": 0.6543454024952521, + "step": 14600 + }, + { + "loss": 13.8774, + "grad_norm": 1.703083872795105, + "learning_rate": 0.0005, + "epoch": 0.6545694933865176, + "step": 14605 + }, + { + "loss": 13.8502, + "grad_norm": 1.8082938194274902, + "learning_rate": 0.0005, + "epoch": 0.6547935842777831, + "step": 14610 + }, + { + "loss": 13.9328, + "grad_norm": 1.763091802597046, + "learning_rate": 0.0005, + "epoch": 0.6550176751690485, + "step": 14615 + }, + { + "loss": 14.0332, + "grad_norm": 2.1249189376831055, + "learning_rate": 0.0005, + "epoch": 0.655241766060314, + "step": 14620 + }, + { + "loss": 13.955, + "grad_norm": 1.8449853658676147, + "learning_rate": 0.0005, + "epoch": 0.6554658569515796, + "step": 14625 + }, + { + "loss": 14.0941, + "grad_norm": 1.8855788707733154, + "learning_rate": 0.0005, + "epoch": 0.6556899478428451, + "step": 14630 + }, + { + "loss": 13.9477, + "grad_norm": 1.754029631614685, + "learning_rate": 0.0005, + "epoch": 0.6559140387341106, + "step": 14635 + }, + { + "loss": 14.0136, + "grad_norm": 1.8334318399429321, + "learning_rate": 0.0005, + "epoch": 0.6561381296253761, + "step": 14640 + }, + { + "loss": 14.1017, + "grad_norm": 1.7702960968017578, + "learning_rate": 0.0005, + "epoch": 0.6563622205166415, + "step": 14645 + }, + { + "loss": 13.9838, + "grad_norm": 2.070499897003174, + "learning_rate": 0.0005, + "epoch": 0.656586311407907, + "step": 14650 + }, + { + "loss": 14.0847, + "grad_norm": 2.111660957336426, + "learning_rate": 0.0005, + "epoch": 0.6568104022991725, + "step": 14655 + }, + { + "loss": 13.9565, + "grad_norm": 1.7753303050994873, + "learning_rate": 0.0005, + "epoch": 0.657034493190438, + "step": 14660 + }, + { + "loss": 14.1332, + "grad_norm": 1.8608185052871704, + "learning_rate": 0.0005, + "epoch": 0.6572585840817036, + "step": 14665 + }, + { + "loss": 13.9903, + "grad_norm": 1.9424304962158203, + "learning_rate": 0.0005, + "epoch": 0.6574826749729691, + "step": 14670 + }, + { + "loss": 14.0011, + "grad_norm": 1.7471083402633667, + "learning_rate": 0.0005, + "epoch": 0.6577067658642345, + "step": 14675 + }, + { + "loss": 13.9287, + "grad_norm": 1.8507441282272339, + "learning_rate": 0.0005, + "epoch": 0.6579308567555, + "step": 14680 + }, + { + "loss": 13.9519, + "grad_norm": 1.78011953830719, + "learning_rate": 0.0005, + "epoch": 0.6581549476467655, + "step": 14685 + }, + { + "loss": 13.9735, + "grad_norm": 1.7394940853118896, + "learning_rate": 0.0005, + "epoch": 0.658379038538031, + "step": 14690 + }, + { + "loss": 13.9678, + "grad_norm": 1.8042924404144287, + "learning_rate": 0.0005, + "epoch": 0.6586031294292966, + "step": 14695 + }, + { + "loss": 14.0196, + "grad_norm": 1.7354576587677002, + "learning_rate": 0.0005, + "epoch": 0.6588272203205621, + "step": 14700 + }, + { + "loss": 13.9881, + "grad_norm": 1.9211896657943726, + "learning_rate": 0.0005, + "epoch": 0.6590513112118275, + "step": 14705 + }, + { + "loss": 14.0377, + "grad_norm": 2.0172202587127686, + "learning_rate": 0.0005, + "epoch": 0.659275402103093, + "step": 14710 + }, + { + "loss": 13.8951, + "grad_norm": 1.9813729524612427, + "learning_rate": 0.0005, + "epoch": 0.6594994929943585, + "step": 14715 + }, + { + "loss": 13.9769, + "grad_norm": 1.8304388523101807, + "learning_rate": 0.0005, + "epoch": 0.659723583885624, + "step": 14720 + }, + { + "loss": 14.1043, + "grad_norm": 1.95379638671875, + "learning_rate": 0.0005, + "epoch": 0.6599476747768895, + "step": 14725 + }, + { + "loss": 13.8718, + "grad_norm": 1.9209054708480835, + "learning_rate": 0.0005, + "epoch": 0.660171765668155, + "step": 14730 + }, + { + "loss": 13.9663, + "grad_norm": 2.017179250717163, + "learning_rate": 0.0005, + "epoch": 0.6603958565594205, + "step": 14735 + }, + { + "loss": 13.9567, + "grad_norm": 1.9263821840286255, + "learning_rate": 0.0005, + "epoch": 0.660619947450686, + "step": 14740 + }, + { + "loss": 14.017, + "grad_norm": 1.9511022567749023, + "learning_rate": 0.0005, + "epoch": 0.6608440383419515, + "step": 14745 + }, + { + "loss": 13.9642, + "grad_norm": 2.0967555046081543, + "learning_rate": 0.0005, + "epoch": 0.661068129233217, + "step": 14750 + }, + { + "loss": 13.9685, + "grad_norm": 1.841191053390503, + "learning_rate": 0.0005, + "epoch": 0.6612922201244825, + "step": 14755 + }, + { + "loss": 13.9074, + "grad_norm": 1.949305534362793, + "learning_rate": 0.0005, + "epoch": 0.661516311015748, + "step": 14760 + }, + { + "loss": 13.8287, + "grad_norm": 2.0074777603149414, + "learning_rate": 0.0005, + "epoch": 0.6617404019070134, + "step": 14765 + }, + { + "loss": 14.0528, + "grad_norm": 2.0784366130828857, + "learning_rate": 0.0005, + "epoch": 0.661964492798279, + "step": 14770 + }, + { + "loss": 13.9003, + "grad_norm": 1.922762155532837, + "learning_rate": 0.0005, + "epoch": 0.6621885836895445, + "step": 14775 + }, + { + "loss": 13.9591, + "grad_norm": 2.1148183345794678, + "learning_rate": 0.0005, + "epoch": 0.66241267458081, + "step": 14780 + }, + { + "loss": 13.9369, + "grad_norm": 1.88979172706604, + "learning_rate": 0.0005, + "epoch": 0.6626367654720755, + "step": 14785 + }, + { + "loss": 13.9334, + "grad_norm": 2.0521533489227295, + "learning_rate": 0.0005, + "epoch": 0.662860856363341, + "step": 14790 + }, + { + "loss": 13.8078, + "grad_norm": 1.8560912609100342, + "learning_rate": 0.0005, + "epoch": 0.6630849472546064, + "step": 14795 + }, + { + "loss": 13.992, + "grad_norm": 1.895782232284546, + "learning_rate": 0.0005, + "epoch": 0.6633090381458719, + "step": 14800 + }, + { + "loss": 13.915, + "grad_norm": 1.9879924058914185, + "learning_rate": 0.0005, + "epoch": 0.6635331290371375, + "step": 14805 + }, + { + "loss": 14.0203, + "grad_norm": 2.0573923587799072, + "learning_rate": 0.0005, + "epoch": 0.663757219928403, + "step": 14810 + }, + { + "loss": 13.9983, + "grad_norm": 1.8449429273605347, + "learning_rate": 0.0005, + "epoch": 0.6639813108196685, + "step": 14815 + }, + { + "loss": 13.9999, + "grad_norm": 1.8708304166793823, + "learning_rate": 0.0005, + "epoch": 0.664205401710934, + "step": 14820 + }, + { + "loss": 13.9573, + "grad_norm": 1.8508751392364502, + "learning_rate": 0.0005, + "epoch": 0.6644294926021994, + "step": 14825 + }, + { + "loss": 14.1518, + "grad_norm": 1.816766381263733, + "learning_rate": 0.0005, + "epoch": 0.6646535834934649, + "step": 14830 + }, + { + "loss": 13.9542, + "grad_norm": 1.728379487991333, + "learning_rate": 0.0005, + "epoch": 0.6648776743847304, + "step": 14835 + }, + { + "loss": 13.9146, + "grad_norm": 1.8660993576049805, + "learning_rate": 0.0005, + "epoch": 0.665101765275996, + "step": 14840 + }, + { + "loss": 13.866, + "grad_norm": 1.7663236856460571, + "learning_rate": 0.0005, + "epoch": 0.6653258561672615, + "step": 14845 + }, + { + "loss": 14.0455, + "grad_norm": 1.6998153924942017, + "learning_rate": 0.0005, + "epoch": 0.665549947058527, + "step": 14850 + }, + { + "loss": 14.0245, + "grad_norm": 1.9014002084732056, + "learning_rate": 0.0005, + "epoch": 0.6657740379497924, + "step": 14855 + }, + { + "loss": 14.0883, + "grad_norm": 2.1450681686401367, + "learning_rate": 0.0005, + "epoch": 0.6659981288410579, + "step": 14860 + }, + { + "loss": 14.0052, + "grad_norm": 2.028628349304199, + "learning_rate": 0.0005, + "epoch": 0.6662222197323234, + "step": 14865 + }, + { + "loss": 13.9657, + "grad_norm": 1.8996095657348633, + "learning_rate": 0.0005, + "epoch": 0.6664463106235889, + "step": 14870 + }, + { + "loss": 13.99, + "grad_norm": 1.8172988891601562, + "learning_rate": 0.0005, + "epoch": 0.6666704015148545, + "step": 14875 + }, + { + "loss": 13.9786, + "grad_norm": 1.9849042892456055, + "learning_rate": 0.0005, + "epoch": 0.66689449240612, + "step": 14880 + }, + { + "loss": 13.9666, + "grad_norm": 1.7792553901672363, + "learning_rate": 0.0005, + "epoch": 0.6671185832973854, + "step": 14885 + }, + { + "loss": 14.0166, + "grad_norm": 1.7328038215637207, + "learning_rate": 0.0005, + "epoch": 0.6673426741886509, + "step": 14890 + }, + { + "loss": 13.9913, + "grad_norm": 1.7132660150527954, + "learning_rate": 0.0005, + "epoch": 0.6675667650799164, + "step": 14895 + }, + { + "loss": 13.9751, + "grad_norm": 1.872312307357788, + "learning_rate": 0.0005, + "epoch": 0.6677908559711819, + "step": 14900 + }, + { + "loss": 13.8644, + "grad_norm": 1.7622418403625488, + "learning_rate": 0.0005, + "epoch": 0.6680149468624474, + "step": 14905 + }, + { + "loss": 14.0471, + "grad_norm": 1.7850992679595947, + "learning_rate": 0.0005, + "epoch": 0.668239037753713, + "step": 14910 + }, + { + "loss": 13.9102, + "grad_norm": 1.8782780170440674, + "learning_rate": 0.0005, + "epoch": 0.6684631286449784, + "step": 14915 + }, + { + "loss": 13.9136, + "grad_norm": 2.188697099685669, + "learning_rate": 0.0005, + "epoch": 0.6686872195362439, + "step": 14920 + }, + { + "loss": 13.9138, + "grad_norm": 1.8725215196609497, + "learning_rate": 0.0005, + "epoch": 0.6689113104275094, + "step": 14925 + }, + { + "loss": 13.9785, + "grad_norm": 1.9379011392593384, + "learning_rate": 0.0005, + "epoch": 0.6691354013187749, + "step": 14930 + }, + { + "loss": 13.9834, + "grad_norm": 1.845558762550354, + "learning_rate": 0.0005, + "epoch": 0.6693594922100404, + "step": 14935 + }, + { + "loss": 13.8606, + "grad_norm": 1.8981108665466309, + "learning_rate": 0.0005, + "epoch": 0.6695835831013058, + "step": 14940 + }, + { + "loss": 13.9827, + "grad_norm": 2.040807008743286, + "learning_rate": 0.0005, + "epoch": 0.6698076739925714, + "step": 14945 + }, + { + "loss": 14.0103, + "grad_norm": 1.7667853832244873, + "learning_rate": 0.0005, + "epoch": 0.6700317648838369, + "step": 14950 + }, + { + "loss": 13.9359, + "grad_norm": 1.7468235492706299, + "learning_rate": 0.0005, + "epoch": 0.6702558557751024, + "step": 14955 + }, + { + "loss": 14.0798, + "grad_norm": 1.8681683540344238, + "learning_rate": 0.0005, + "epoch": 0.6704799466663679, + "step": 14960 + }, + { + "loss": 13.9911, + "grad_norm": 1.8676295280456543, + "learning_rate": 0.0005, + "epoch": 0.6707040375576334, + "step": 14965 + }, + { + "loss": 13.9214, + "grad_norm": 2.245338201522827, + "learning_rate": 0.0005, + "epoch": 0.6709281284488988, + "step": 14970 + }, + { + "loss": 13.954, + "grad_norm": 1.9026623964309692, + "learning_rate": 0.0005, + "epoch": 0.6711522193401643, + "step": 14975 + }, + { + "loss": 13.9763, + "grad_norm": 1.7272088527679443, + "learning_rate": 0.0005, + "epoch": 0.6713763102314299, + "step": 14980 + }, + { + "loss": 14.0194, + "grad_norm": 1.7873562574386597, + "learning_rate": 0.0005, + "epoch": 0.6716004011226954, + "step": 14985 + }, + { + "loss": 13.8973, + "grad_norm": 1.8993101119995117, + "learning_rate": 0.0005, + "epoch": 0.6718244920139609, + "step": 14990 + }, + { + "loss": 13.932, + "grad_norm": 1.8094158172607422, + "learning_rate": 0.0005, + "epoch": 0.6720485829052264, + "step": 14995 + }, + { + "loss": 13.9708, + "grad_norm": 2.0334019660949707, + "learning_rate": 0.0005, + "epoch": 0.6722726737964918, + "step": 15000 + }, + { + "eval_loss": 1.7388238906860352, + "eval_runtime": 18.2399, + "eval_samples_per_second": 898.252, + "eval_steps_per_second": 8.059, + "epoch": 0.6722726737964918, + "step": 15000 + }, + { + "loss": 13.9549, + "grad_norm": 1.8386287689208984, + "learning_rate": 0.0005, + "epoch": 0.6724967646877573, + "step": 15005 + }, + { + "loss": 13.8927, + "grad_norm": 1.9849205017089844, + "learning_rate": 0.0005, + "epoch": 0.6727208555790228, + "step": 15010 + }, + { + "loss": 13.9941, + "grad_norm": 1.9122897386550903, + "learning_rate": 0.0005, + "epoch": 0.6729449464702884, + "step": 15015 + }, + { + "loss": 13.9255, + "grad_norm": 1.7986644506454468, + "learning_rate": 0.0005, + "epoch": 0.6731690373615539, + "step": 15020 + }, + { + "loss": 13.9692, + "grad_norm": 1.9500049352645874, + "learning_rate": 0.0005, + "epoch": 0.6733931282528194, + "step": 15025 + }, + { + "loss": 13.8283, + "grad_norm": 1.7760783433914185, + "learning_rate": 0.0005, + "epoch": 0.6736172191440848, + "step": 15030 + }, + { + "loss": 13.8723, + "grad_norm": 1.832217812538147, + "learning_rate": 0.0005, + "epoch": 0.6738413100353503, + "step": 15035 + }, + { + "loss": 14.027, + "grad_norm": 1.9263299703598022, + "learning_rate": 0.0005, + "epoch": 0.6740654009266158, + "step": 15040 + }, + { + "loss": 13.9448, + "grad_norm": 1.7831065654754639, + "learning_rate": 0.0005, + "epoch": 0.6742894918178813, + "step": 15045 + }, + { + "loss": 13.9431, + "grad_norm": 1.9155333042144775, + "learning_rate": 0.0005, + "epoch": 0.6745135827091469, + "step": 15050 + }, + { + "loss": 13.943, + "grad_norm": 1.8451348543167114, + "learning_rate": 0.0005, + "epoch": 0.6747376736004124, + "step": 15055 + }, + { + "loss": 13.9246, + "grad_norm": 1.8482345342636108, + "learning_rate": 0.0005, + "epoch": 0.6749617644916778, + "step": 15060 + }, + { + "loss": 13.9124, + "grad_norm": 1.8485702276229858, + "learning_rate": 0.0005, + "epoch": 0.6751858553829433, + "step": 15065 + }, + { + "loss": 13.9741, + "grad_norm": 1.697227120399475, + "learning_rate": 0.0005, + "epoch": 0.6754099462742088, + "step": 15070 + }, + { + "loss": 13.973, + "grad_norm": 1.7353583574295044, + "learning_rate": 0.0005, + "epoch": 0.6756340371654743, + "step": 15075 + }, + { + "loss": 13.9429, + "grad_norm": 2.0220677852630615, + "learning_rate": 0.0005, + "epoch": 0.6758581280567398, + "step": 15080 + }, + { + "loss": 13.9565, + "grad_norm": 1.9071723222732544, + "learning_rate": 0.0005, + "epoch": 0.6760822189480054, + "step": 15085 + }, + { + "loss": 13.9441, + "grad_norm": 1.8981002569198608, + "learning_rate": 0.0005, + "epoch": 0.6763063098392708, + "step": 15090 + }, + { + "loss": 13.8322, + "grad_norm": 1.9765799045562744, + "learning_rate": 0.0005, + "epoch": 0.6765304007305363, + "step": 15095 + }, + { + "loss": 13.9005, + "grad_norm": 1.9476999044418335, + "learning_rate": 0.0005, + "epoch": 0.6767544916218018, + "step": 15100 + }, + { + "loss": 14.0017, + "grad_norm": 2.263842821121216, + "learning_rate": 0.0005, + "epoch": 0.6769785825130673, + "step": 15105 + }, + { + "loss": 13.8683, + "grad_norm": 1.9773280620574951, + "learning_rate": 0.0005, + "epoch": 0.6772026734043328, + "step": 15110 + }, + { + "loss": 13.9226, + "grad_norm": 1.9092910289764404, + "learning_rate": 0.0005, + "epoch": 0.6774267642955983, + "step": 15115 + }, + { + "loss": 13.8926, + "grad_norm": 2.052339553833008, + "learning_rate": 0.0005, + "epoch": 0.6776508551868637, + "step": 15120 + }, + { + "loss": 13.9082, + "grad_norm": 1.8536814451217651, + "learning_rate": 0.0005, + "epoch": 0.6778749460781293, + "step": 15125 + }, + { + "loss": 13.9144, + "grad_norm": 1.7902272939682007, + "learning_rate": 0.0005, + "epoch": 0.6780990369693948, + "step": 15130 + }, + { + "loss": 13.9273, + "grad_norm": 1.7612346410751343, + "learning_rate": 0.0005, + "epoch": 0.6783231278606603, + "step": 15135 + }, + { + "loss": 13.8805, + "grad_norm": 1.8486686944961548, + "learning_rate": 0.0005, + "epoch": 0.6785472187519258, + "step": 15140 + }, + { + "loss": 13.9629, + "grad_norm": 1.8244805335998535, + "learning_rate": 0.0005, + "epoch": 0.6787713096431913, + "step": 15145 + }, + { + "loss": 13.9529, + "grad_norm": 1.757832407951355, + "learning_rate": 0.0005, + "epoch": 0.6789954005344567, + "step": 15150 + }, + { + "loss": 14.0222, + "grad_norm": 1.7775987386703491, + "learning_rate": 0.0005, + "epoch": 0.6792194914257222, + "step": 15155 + }, + { + "loss": 13.9338, + "grad_norm": 1.7991831302642822, + "learning_rate": 0.0005, + "epoch": 0.6794435823169878, + "step": 15160 + }, + { + "loss": 13.9246, + "grad_norm": 1.7518317699432373, + "learning_rate": 0.0005, + "epoch": 0.6796676732082533, + "step": 15165 + }, + { + "loss": 14.0307, + "grad_norm": 1.7775969505310059, + "learning_rate": 0.0005, + "epoch": 0.6798917640995188, + "step": 15170 + }, + { + "loss": 14.0334, + "grad_norm": 1.9538028240203857, + "learning_rate": 0.0005, + "epoch": 0.6801158549907843, + "step": 15175 + }, + { + "loss": 14.0499, + "grad_norm": 1.7270337343215942, + "learning_rate": 0.0005, + "epoch": 0.6803399458820497, + "step": 15180 + }, + { + "loss": 13.8981, + "grad_norm": 1.7424579858779907, + "learning_rate": 0.0005, + "epoch": 0.6805640367733152, + "step": 15185 + }, + { + "loss": 14.0244, + "grad_norm": 1.9071272611618042, + "learning_rate": 0.0005, + "epoch": 0.6807881276645807, + "step": 15190 + }, + { + "loss": 14.0035, + "grad_norm": 1.7010276317596436, + "learning_rate": 0.0005, + "epoch": 0.6810122185558463, + "step": 15195 + }, + { + "loss": 13.9618, + "grad_norm": 1.6710230112075806, + "learning_rate": 0.0005, + "epoch": 0.6812363094471118, + "step": 15200 + }, + { + "loss": 14.0453, + "grad_norm": 1.7517222166061401, + "learning_rate": 0.0005, + "epoch": 0.6814604003383773, + "step": 15205 + }, + { + "loss": 13.961, + "grad_norm": 1.8641791343688965, + "learning_rate": 0.0005, + "epoch": 0.6816844912296427, + "step": 15210 + }, + { + "loss": 14.0737, + "grad_norm": 1.7744903564453125, + "learning_rate": 0.0005, + "epoch": 0.6819085821209082, + "step": 15215 + }, + { + "loss": 13.9684, + "grad_norm": 1.6295077800750732, + "learning_rate": 0.0005, + "epoch": 0.6821326730121737, + "step": 15220 + }, + { + "loss": 13.8636, + "grad_norm": 1.8711214065551758, + "learning_rate": 0.0005, + "epoch": 0.6823567639034392, + "step": 15225 + }, + { + "loss": 13.9437, + "grad_norm": 1.8667914867401123, + "learning_rate": 0.0005, + "epoch": 0.6825808547947048, + "step": 15230 + }, + { + "loss": 14.0297, + "grad_norm": 1.7881425619125366, + "learning_rate": 0.0005, + "epoch": 0.6828049456859703, + "step": 15235 + }, + { + "loss": 13.9598, + "grad_norm": 2.000901460647583, + "learning_rate": 0.0005, + "epoch": 0.6830290365772357, + "step": 15240 + }, + { + "loss": 13.9195, + "grad_norm": 1.8856955766677856, + "learning_rate": 0.0005, + "epoch": 0.6832531274685012, + "step": 15245 + }, + { + "loss": 13.9155, + "grad_norm": 1.8447706699371338, + "learning_rate": 0.0005, + "epoch": 0.6834772183597667, + "step": 15250 + }, + { + "loss": 13.8523, + "grad_norm": 1.8459632396697998, + "learning_rate": 0.0005, + "epoch": 0.6837013092510322, + "step": 15255 + }, + { + "loss": 13.8948, + "grad_norm": 1.915228009223938, + "learning_rate": 0.0005, + "epoch": 0.6839254001422977, + "step": 15260 + }, + { + "loss": 13.9318, + "grad_norm": 1.9591858386993408, + "learning_rate": 0.0005, + "epoch": 0.6841494910335633, + "step": 15265 + }, + { + "loss": 13.9371, + "grad_norm": 1.8608108758926392, + "learning_rate": 0.0005, + "epoch": 0.6843735819248287, + "step": 15270 + }, + { + "loss": 14.0048, + "grad_norm": 1.806567907333374, + "learning_rate": 0.0005, + "epoch": 0.6845976728160942, + "step": 15275 + }, + { + "loss": 13.989, + "grad_norm": 1.7369107007980347, + "learning_rate": 0.0005, + "epoch": 0.6848217637073597, + "step": 15280 + }, + { + "loss": 13.9489, + "grad_norm": 1.8193225860595703, + "learning_rate": 0.0005, + "epoch": 0.6850458545986252, + "step": 15285 + }, + { + "loss": 13.8948, + "grad_norm": 1.8266242742538452, + "learning_rate": 0.0005, + "epoch": 0.6852699454898907, + "step": 15290 + }, + { + "loss": 14.0153, + "grad_norm": 1.8363949060440063, + "learning_rate": 0.0005, + "epoch": 0.6854940363811562, + "step": 15295 + }, + { + "loss": 14.0074, + "grad_norm": 1.840710997581482, + "learning_rate": 0.0005, + "epoch": 0.6857181272724217, + "step": 15300 + }, + { + "loss": 14.0341, + "grad_norm": 1.8140329122543335, + "learning_rate": 0.0005, + "epoch": 0.6859422181636872, + "step": 15305 + }, + { + "loss": 14.004, + "grad_norm": 1.7864294052124023, + "learning_rate": 0.0005, + "epoch": 0.6861663090549527, + "step": 15310 + }, + { + "loss": 13.959, + "grad_norm": 1.866920828819275, + "learning_rate": 0.0005, + "epoch": 0.6863903999462182, + "step": 15315 + }, + { + "loss": 13.96, + "grad_norm": 1.7539829015731812, + "learning_rate": 0.0005, + "epoch": 0.6866144908374837, + "step": 15320 + }, + { + "loss": 13.9228, + "grad_norm": 1.7904281616210938, + "learning_rate": 0.0005, + "epoch": 0.6868385817287492, + "step": 15325 + }, + { + "loss": 14.0567, + "grad_norm": 1.8457413911819458, + "learning_rate": 0.0005, + "epoch": 0.6870626726200146, + "step": 15330 + }, + { + "loss": 13.9857, + "grad_norm": 1.89803946018219, + "learning_rate": 0.0005, + "epoch": 0.6872867635112802, + "step": 15335 + }, + { + "loss": 13.8866, + "grad_norm": 1.962873101234436, + "learning_rate": 0.0005, + "epoch": 0.6875108544025457, + "step": 15340 + }, + { + "loss": 13.9703, + "grad_norm": 1.8302043676376343, + "learning_rate": 0.0005, + "epoch": 0.6877349452938112, + "step": 15345 + }, + { + "loss": 13.9963, + "grad_norm": 1.7163232564926147, + "learning_rate": 0.0005, + "epoch": 0.6879590361850767, + "step": 15350 + }, + { + "loss": 14.0319, + "grad_norm": 1.7698335647583008, + "learning_rate": 0.0005, + "epoch": 0.6881831270763422, + "step": 15355 + }, + { + "loss": 13.8204, + "grad_norm": 1.7495076656341553, + "learning_rate": 0.0005, + "epoch": 0.6884072179676076, + "step": 15360 + }, + { + "loss": 14.0892, + "grad_norm": 2.0041182041168213, + "learning_rate": 0.0005, + "epoch": 0.6886313088588731, + "step": 15365 + }, + { + "loss": 13.9974, + "grad_norm": 1.8604499101638794, + "learning_rate": 0.0005, + "epoch": 0.6888553997501387, + "step": 15370 + }, + { + "loss": 14.0546, + "grad_norm": 1.8129802942276, + "learning_rate": 0.0005, + "epoch": 0.6890794906414042, + "step": 15375 + }, + { + "loss": 13.9537, + "grad_norm": 1.8278909921646118, + "learning_rate": 0.0005, + "epoch": 0.6893035815326697, + "step": 15380 + }, + { + "loss": 13.9457, + "grad_norm": 1.8413242101669312, + "learning_rate": 0.0005, + "epoch": 0.6895276724239352, + "step": 15385 + }, + { + "loss": 13.9907, + "grad_norm": 1.9248651266098022, + "learning_rate": 0.0005, + "epoch": 0.6897517633152006, + "step": 15390 + }, + { + "loss": 13.8831, + "grad_norm": 1.6882672309875488, + "learning_rate": 0.0005, + "epoch": 0.6899758542064661, + "step": 15395 + }, + { + "loss": 13.9649, + "grad_norm": 2.0407419204711914, + "learning_rate": 0.0005, + "epoch": 0.6901999450977316, + "step": 15400 + }, + { + "loss": 14.0037, + "grad_norm": 1.9546421766281128, + "learning_rate": 0.0005, + "epoch": 0.6904240359889972, + "step": 15405 + }, + { + "loss": 14.0474, + "grad_norm": 1.8498766422271729, + "learning_rate": 0.0005, + "epoch": 0.6906481268802627, + "step": 15410 + }, + { + "loss": 14.0266, + "grad_norm": 1.8743306398391724, + "learning_rate": 0.0005, + "epoch": 0.6908722177715282, + "step": 15415 + }, + { + "loss": 13.9242, + "grad_norm": 1.8721219301223755, + "learning_rate": 0.0005, + "epoch": 0.6910963086627936, + "step": 15420 + }, + { + "loss": 13.9808, + "grad_norm": 1.8055014610290527, + "learning_rate": 0.0005, + "epoch": 0.6913203995540591, + "step": 15425 + }, + { + "loss": 13.934, + "grad_norm": 1.7605301141738892, + "learning_rate": 0.0005, + "epoch": 0.6915444904453246, + "step": 15430 + }, + { + "loss": 13.8644, + "grad_norm": 1.6423132419586182, + "learning_rate": 0.0005, + "epoch": 0.6917685813365901, + "step": 15435 + }, + { + "loss": 13.9572, + "grad_norm": 1.7309364080429077, + "learning_rate": 0.0005, + "epoch": 0.6919926722278557, + "step": 15440 + }, + { + "loss": 14.0399, + "grad_norm": 1.644025206565857, + "learning_rate": 0.0005, + "epoch": 0.6922167631191212, + "step": 15445 + }, + { + "loss": 13.9544, + "grad_norm": 1.9230233430862427, + "learning_rate": 0.0005, + "epoch": 0.6924408540103866, + "step": 15450 + }, + { + "loss": 13.9615, + "grad_norm": 1.8161972761154175, + "learning_rate": 0.0005, + "epoch": 0.6926649449016521, + "step": 15455 + }, + { + "loss": 13.9694, + "grad_norm": 1.834704041481018, + "learning_rate": 0.0005, + "epoch": 0.6928890357929176, + "step": 15460 + }, + { + "loss": 13.8995, + "grad_norm": 1.961165428161621, + "learning_rate": 0.0005, + "epoch": 0.6931131266841831, + "step": 15465 + }, + { + "loss": 13.841, + "grad_norm": 1.9340486526489258, + "learning_rate": 0.0005, + "epoch": 0.6933372175754486, + "step": 15470 + }, + { + "loss": 14.0106, + "grad_norm": 1.840572476387024, + "learning_rate": 0.0005, + "epoch": 0.693561308466714, + "step": 15475 + }, + { + "loss": 14.0718, + "grad_norm": 1.8550306558609009, + "learning_rate": 0.0005, + "epoch": 0.6937853993579796, + "step": 15480 + }, + { + "loss": 13.9573, + "grad_norm": 1.7712377309799194, + "learning_rate": 0.0005, + "epoch": 0.6940094902492451, + "step": 15485 + }, + { + "loss": 13.8744, + "grad_norm": 1.6746519804000854, + "learning_rate": 0.0005, + "epoch": 0.6942335811405106, + "step": 15490 + }, + { + "loss": 13.9816, + "grad_norm": 1.7693535089492798, + "learning_rate": 0.0005, + "epoch": 0.6944576720317761, + "step": 15495 + }, + { + "loss": 13.9546, + "grad_norm": 1.792589783668518, + "learning_rate": 0.0005, + "epoch": 0.6946817629230416, + "step": 15500 + }, + { + "eval_loss": 1.7409549951553345, + "eval_runtime": 19.0449, + "eval_samples_per_second": 860.281, + "eval_steps_per_second": 7.719, + "epoch": 0.6946817629230416, + "step": 15500 + }, + { + "loss": 13.8749, + "grad_norm": 1.7078382968902588, + "learning_rate": 0.0005, + "epoch": 0.694905853814307, + "step": 15505 + }, + { + "loss": 13.949, + "grad_norm": 1.8692890405654907, + "learning_rate": 0.0005, + "epoch": 0.6951299447055725, + "step": 15510 + }, + { + "loss": 13.9288, + "grad_norm": 1.9514987468719482, + "learning_rate": 0.0005, + "epoch": 0.6953540355968381, + "step": 15515 + }, + { + "loss": 13.9926, + "grad_norm": 1.945313811302185, + "learning_rate": 0.0005, + "epoch": 0.6955781264881036, + "step": 15520 + }, + { + "loss": 13.9282, + "grad_norm": 1.845374584197998, + "learning_rate": 0.0005, + "epoch": 0.6958022173793691, + "step": 15525 + }, + { + "loss": 14.0666, + "grad_norm": 1.8912990093231201, + "learning_rate": 0.0005, + "epoch": 0.6960263082706346, + "step": 15530 + }, + { + "loss": 13.9056, + "grad_norm": 1.8223357200622559, + "learning_rate": 0.0005, + "epoch": 0.6962503991619, + "step": 15535 + }, + { + "loss": 13.9665, + "grad_norm": 1.7922083139419556, + "learning_rate": 0.0005, + "epoch": 0.6964744900531655, + "step": 15540 + }, + { + "loss": 13.9223, + "grad_norm": 1.9126029014587402, + "learning_rate": 0.0005, + "epoch": 0.696698580944431, + "step": 15545 + }, + { + "loss": 14.0502, + "grad_norm": 1.927013874053955, + "learning_rate": 0.0005, + "epoch": 0.6969226718356966, + "step": 15550 + }, + { + "loss": 13.9175, + "grad_norm": 1.7995493412017822, + "learning_rate": 0.0005, + "epoch": 0.6971467627269621, + "step": 15555 + }, + { + "loss": 14.0, + "grad_norm": 1.9804531335830688, + "learning_rate": 0.0005, + "epoch": 0.6973708536182276, + "step": 15560 + }, + { + "loss": 13.945, + "grad_norm": 2.095614194869995, + "learning_rate": 0.0005, + "epoch": 0.697594944509493, + "step": 15565 + }, + { + "loss": 13.946, + "grad_norm": 1.8693947792053223, + "learning_rate": 0.0005, + "epoch": 0.6978190354007585, + "step": 15570 + }, + { + "loss": 13.9907, + "grad_norm": 1.7160202264785767, + "learning_rate": 0.0005, + "epoch": 0.698043126292024, + "step": 15575 + }, + { + "loss": 13.9814, + "grad_norm": 1.9428715705871582, + "learning_rate": 0.0005, + "epoch": 0.6982672171832895, + "step": 15580 + }, + { + "loss": 13.9458, + "grad_norm": 1.8247159719467163, + "learning_rate": 0.0005, + "epoch": 0.6984913080745551, + "step": 15585 + }, + { + "loss": 13.894, + "grad_norm": 1.742426872253418, + "learning_rate": 0.0005, + "epoch": 0.6987153989658206, + "step": 15590 + }, + { + "loss": 13.9269, + "grad_norm": 1.9104456901550293, + "learning_rate": 0.0005, + "epoch": 0.698939489857086, + "step": 15595 + }, + { + "loss": 13.9191, + "grad_norm": 1.842275619506836, + "learning_rate": 0.0005, + "epoch": 0.6991635807483515, + "step": 15600 + }, + { + "loss": 14.024, + "grad_norm": 1.793081283569336, + "learning_rate": 0.0005, + "epoch": 0.699387671639617, + "step": 15605 + }, + { + "loss": 13.8893, + "grad_norm": 1.7978259325027466, + "learning_rate": 0.0005, + "epoch": 0.6996117625308825, + "step": 15610 + }, + { + "loss": 13.9192, + "grad_norm": 2.2107441425323486, + "learning_rate": 0.0005, + "epoch": 0.699835853422148, + "step": 15615 + }, + { + "loss": 14.043, + "grad_norm": 2.238145112991333, + "learning_rate": 0.0005, + "epoch": 0.7000599443134136, + "step": 15620 + }, + { + "loss": 13.9411, + "grad_norm": 2.0489559173583984, + "learning_rate": 0.0005, + "epoch": 0.700284035204679, + "step": 15625 + }, + { + "loss": 13.8079, + "grad_norm": 1.8659436702728271, + "learning_rate": 0.0005, + "epoch": 0.7005081260959445, + "step": 15630 + }, + { + "loss": 13.9392, + "grad_norm": 1.7955938577651978, + "learning_rate": 0.0005, + "epoch": 0.70073221698721, + "step": 15635 + }, + { + "loss": 13.9242, + "grad_norm": 1.8826559782028198, + "learning_rate": 0.0005, + "epoch": 0.7009563078784755, + "step": 15640 + }, + { + "loss": 13.9201, + "grad_norm": 1.858927607536316, + "learning_rate": 0.0005, + "epoch": 0.701180398769741, + "step": 15645 + }, + { + "loss": 13.9107, + "grad_norm": 1.839073896408081, + "learning_rate": 0.0005, + "epoch": 0.7014044896610065, + "step": 15650 + }, + { + "loss": 13.9317, + "grad_norm": 1.8229670524597168, + "learning_rate": 0.0005, + "epoch": 0.701628580552272, + "step": 15655 + }, + { + "loss": 13.91, + "grad_norm": 1.8194196224212646, + "learning_rate": 0.0005, + "epoch": 0.7018526714435375, + "step": 15660 + }, + { + "loss": 14.1022, + "grad_norm": 1.8115154504776, + "learning_rate": 0.0005, + "epoch": 0.702076762334803, + "step": 15665 + }, + { + "loss": 13.9124, + "grad_norm": 1.8422259092330933, + "learning_rate": 0.0005, + "epoch": 0.7023008532260685, + "step": 15670 + }, + { + "loss": 13.8756, + "grad_norm": 1.6191073656082153, + "learning_rate": 0.0005, + "epoch": 0.702524944117334, + "step": 15675 + }, + { + "loss": 13.9934, + "grad_norm": 1.692289113998413, + "learning_rate": 0.0005, + "epoch": 0.7027490350085995, + "step": 15680 + }, + { + "loss": 13.9409, + "grad_norm": 2.0738937854766846, + "learning_rate": 0.0005, + "epoch": 0.7029731258998649, + "step": 15685 + }, + { + "loss": 13.8589, + "grad_norm": 2.1270904541015625, + "learning_rate": 0.0005, + "epoch": 0.7031972167911305, + "step": 15690 + }, + { + "loss": 13.9472, + "grad_norm": 1.8903199434280396, + "learning_rate": 0.0005, + "epoch": 0.703421307682396, + "step": 15695 + }, + { + "loss": 13.9934, + "grad_norm": 1.8520416021347046, + "learning_rate": 0.0005, + "epoch": 0.7036453985736615, + "step": 15700 + }, + { + "loss": 13.8661, + "grad_norm": 1.8308141231536865, + "learning_rate": 0.0005, + "epoch": 0.703869489464927, + "step": 15705 + }, + { + "loss": 13.9935, + "grad_norm": 1.8230431079864502, + "learning_rate": 0.0005, + "epoch": 0.7040935803561925, + "step": 15710 + }, + { + "loss": 13.8179, + "grad_norm": 1.8655309677124023, + "learning_rate": 0.0005, + "epoch": 0.7043176712474579, + "step": 15715 + }, + { + "loss": 14.0309, + "grad_norm": 1.8119747638702393, + "learning_rate": 0.0005, + "epoch": 0.7045417621387234, + "step": 15720 + }, + { + "loss": 13.982, + "grad_norm": 1.8590638637542725, + "learning_rate": 0.0005, + "epoch": 0.704765853029989, + "step": 15725 + }, + { + "loss": 13.8865, + "grad_norm": 1.9631258249282837, + "learning_rate": 0.0005, + "epoch": 0.7049899439212545, + "step": 15730 + }, + { + "loss": 13.8423, + "grad_norm": 1.8070425987243652, + "learning_rate": 0.0005, + "epoch": 0.70521403481252, + "step": 15735 + }, + { + "loss": 13.9572, + "grad_norm": 1.8490524291992188, + "learning_rate": 0.0005, + "epoch": 0.7054381257037855, + "step": 15740 + }, + { + "loss": 13.9006, + "grad_norm": 1.8480288982391357, + "learning_rate": 0.0005, + "epoch": 0.7056622165950509, + "step": 15745 + }, + { + "loss": 13.9108, + "grad_norm": 1.937630295753479, + "learning_rate": 0.0005, + "epoch": 0.7058863074863164, + "step": 15750 + }, + { + "loss": 14.0033, + "grad_norm": 1.6791224479675293, + "learning_rate": 0.0005, + "epoch": 0.7061103983775819, + "step": 15755 + }, + { + "loss": 13.8934, + "grad_norm": 1.8505795001983643, + "learning_rate": 0.0005, + "epoch": 0.7063344892688475, + "step": 15760 + }, + { + "loss": 13.8907, + "grad_norm": 1.9696643352508545, + "learning_rate": 0.0005, + "epoch": 0.706558580160113, + "step": 15765 + }, + { + "loss": 13.9201, + "grad_norm": 1.9013051986694336, + "learning_rate": 0.0005, + "epoch": 0.7067826710513785, + "step": 15770 + }, + { + "loss": 13.9411, + "grad_norm": 2.011505365371704, + "learning_rate": 0.0005, + "epoch": 0.7070067619426439, + "step": 15775 + }, + { + "loss": 14.0902, + "grad_norm": 1.9483027458190918, + "learning_rate": 0.0005, + "epoch": 0.7072308528339094, + "step": 15780 + }, + { + "loss": 13.9785, + "grad_norm": 1.9703984260559082, + "learning_rate": 0.0005, + "epoch": 0.7074549437251749, + "step": 15785 + }, + { + "loss": 13.9829, + "grad_norm": 1.9221055507659912, + "learning_rate": 0.0005, + "epoch": 0.7076790346164404, + "step": 15790 + }, + { + "loss": 13.9331, + "grad_norm": 1.96420156955719, + "learning_rate": 0.0005, + "epoch": 0.707903125507706, + "step": 15795 + }, + { + "loss": 13.9016, + "grad_norm": 1.7140847444534302, + "learning_rate": 0.0005, + "epoch": 0.7081272163989715, + "step": 15800 + }, + { + "loss": 13.9334, + "grad_norm": 1.9515072107315063, + "learning_rate": 0.0005, + "epoch": 0.7083513072902369, + "step": 15805 + }, + { + "loss": 13.9258, + "grad_norm": 1.709166407585144, + "learning_rate": 0.0005, + "epoch": 0.7085753981815024, + "step": 15810 + }, + { + "loss": 13.9812, + "grad_norm": 1.7489274740219116, + "learning_rate": 0.0005, + "epoch": 0.7087994890727679, + "step": 15815 + }, + { + "loss": 13.9408, + "grad_norm": 2.133345127105713, + "learning_rate": 0.0005, + "epoch": 0.7090235799640334, + "step": 15820 + }, + { + "loss": 13.9426, + "grad_norm": 1.8625730276107788, + "learning_rate": 0.0005, + "epoch": 0.7092476708552989, + "step": 15825 + }, + { + "loss": 13.9732, + "grad_norm": 1.6733115911483765, + "learning_rate": 0.0005, + "epoch": 0.7094717617465645, + "step": 15830 + }, + { + "loss": 14.0664, + "grad_norm": 1.8962249755859375, + "learning_rate": 0.0005, + "epoch": 0.7096958526378299, + "step": 15835 + }, + { + "loss": 14.0338, + "grad_norm": 1.839948058128357, + "learning_rate": 0.0005, + "epoch": 0.7099199435290954, + "step": 15840 + }, + { + "loss": 13.9143, + "grad_norm": 1.7848726511001587, + "learning_rate": 0.0005, + "epoch": 0.7101440344203609, + "step": 15845 + }, + { + "loss": 13.8563, + "grad_norm": 1.9577068090438843, + "learning_rate": 0.0005, + "epoch": 0.7103681253116264, + "step": 15850 + }, + { + "loss": 13.9217, + "grad_norm": 1.9215444326400757, + "learning_rate": 0.0005, + "epoch": 0.7105922162028919, + "step": 15855 + }, + { + "loss": 13.9757, + "grad_norm": 1.9216139316558838, + "learning_rate": 0.0005, + "epoch": 0.7108163070941574, + "step": 15860 + }, + { + "loss": 13.9598, + "grad_norm": 1.9376696348190308, + "learning_rate": 0.0005, + "epoch": 0.7110403979854228, + "step": 15865 + }, + { + "loss": 13.9578, + "grad_norm": 1.8964476585388184, + "learning_rate": 0.0005, + "epoch": 0.7112644888766884, + "step": 15870 + }, + { + "loss": 13.9611, + "grad_norm": 1.7792725563049316, + "learning_rate": 0.0005, + "epoch": 0.7114885797679539, + "step": 15875 + }, + { + "loss": 14.0214, + "grad_norm": 1.8994535207748413, + "learning_rate": 0.0005, + "epoch": 0.7117126706592194, + "step": 15880 + }, + { + "loss": 13.9837, + "grad_norm": 1.8821009397506714, + "learning_rate": 0.0005, + "epoch": 0.7119367615504849, + "step": 15885 + }, + { + "loss": 14.0434, + "grad_norm": 1.858879804611206, + "learning_rate": 0.0005, + "epoch": 0.7121608524417504, + "step": 15890 + }, + { + "loss": 13.9523, + "grad_norm": 1.7259469032287598, + "learning_rate": 0.0005, + "epoch": 0.7123849433330158, + "step": 15895 + }, + { + "loss": 13.8842, + "grad_norm": 1.9253323078155518, + "learning_rate": 0.0005, + "epoch": 0.7126090342242813, + "step": 15900 + }, + { + "loss": 13.9879, + "grad_norm": 1.9537336826324463, + "learning_rate": 0.0005, + "epoch": 0.7128331251155469, + "step": 15905 + }, + { + "loss": 13.9387, + "grad_norm": 1.8602211475372314, + "learning_rate": 0.0005, + "epoch": 0.7130572160068124, + "step": 15910 + }, + { + "loss": 13.8637, + "grad_norm": 1.9752931594848633, + "learning_rate": 0.0005, + "epoch": 0.7132813068980779, + "step": 15915 + }, + { + "loss": 13.7979, + "grad_norm": 1.6928125619888306, + "learning_rate": 0.0005, + "epoch": 0.7135053977893434, + "step": 15920 + }, + { + "loss": 13.9197, + "grad_norm": 2.161198854446411, + "learning_rate": 0.0005, + "epoch": 0.7137294886806088, + "step": 15925 + }, + { + "loss": 13.993, + "grad_norm": 1.9235631227493286, + "learning_rate": 0.0005, + "epoch": 0.7139535795718743, + "step": 15930 + }, + { + "loss": 13.8041, + "grad_norm": 1.714410424232483, + "learning_rate": 0.0005, + "epoch": 0.7141776704631398, + "step": 15935 + }, + { + "loss": 13.9278, + "grad_norm": 1.8958978652954102, + "learning_rate": 0.0005, + "epoch": 0.7144017613544054, + "step": 15940 + }, + { + "loss": 13.934, + "grad_norm": 1.819340705871582, + "learning_rate": 0.0005, + "epoch": 0.7146258522456709, + "step": 15945 + }, + { + "loss": 13.8556, + "grad_norm": 1.6928635835647583, + "learning_rate": 0.0005, + "epoch": 0.7148499431369364, + "step": 15950 + }, + { + "loss": 13.913, + "grad_norm": 1.6891690492630005, + "learning_rate": 0.0005, + "epoch": 0.7150740340282018, + "step": 15955 + }, + { + "loss": 13.967, + "grad_norm": 1.8896440267562866, + "learning_rate": 0.0005, + "epoch": 0.7152981249194673, + "step": 15960 + }, + { + "loss": 13.9234, + "grad_norm": 1.8856749534606934, + "learning_rate": 0.0005, + "epoch": 0.7155222158107328, + "step": 15965 + }, + { + "loss": 14.0063, + "grad_norm": 1.8018680810928345, + "learning_rate": 0.0005, + "epoch": 0.7157463067019983, + "step": 15970 + }, + { + "loss": 13.8262, + "grad_norm": 1.8745312690734863, + "learning_rate": 0.0005, + "epoch": 0.7159703975932639, + "step": 15975 + }, + { + "loss": 13.9973, + "grad_norm": 1.877651572227478, + "learning_rate": 0.0005, + "epoch": 0.7161944884845294, + "step": 15980 + }, + { + "loss": 13.9225, + "grad_norm": 1.7573840618133545, + "learning_rate": 0.0005, + "epoch": 0.7164185793757948, + "step": 15985 + }, + { + "loss": 13.9996, + "grad_norm": 1.7413743734359741, + "learning_rate": 0.0005, + "epoch": 0.7166426702670603, + "step": 15990 + }, + { + "loss": 13.9276, + "grad_norm": 1.8496475219726562, + "learning_rate": 0.0005, + "epoch": 0.7168667611583258, + "step": 15995 + }, + { + "loss": 14.0066, + "grad_norm": 1.8488664627075195, + "learning_rate": 0.0005, + "epoch": 0.7170908520495913, + "step": 16000 + }, + { + "eval_loss": 1.7406035661697388, + "eval_runtime": 18.3143, + "eval_samples_per_second": 894.602, + "eval_steps_per_second": 8.027, + "epoch": 0.7170908520495913, + "step": 16000 + }, + { + "loss": 14.0059, + "grad_norm": 1.8590061664581299, + "learning_rate": 0.0005, + "epoch": 0.7173149429408568, + "step": 16005 + }, + { + "loss": 13.9813, + "grad_norm": 2.056410074234009, + "learning_rate": 0.0005, + "epoch": 0.7175390338321224, + "step": 16010 + }, + { + "loss": 13.9623, + "grad_norm": 1.8297349214553833, + "learning_rate": 0.0005, + "epoch": 0.7177631247233878, + "step": 16015 + }, + { + "loss": 14.0261, + "grad_norm": 1.8574327230453491, + "learning_rate": 0.0005, + "epoch": 0.7179872156146533, + "step": 16020 + }, + { + "loss": 13.9457, + "grad_norm": 1.8076415061950684, + "learning_rate": 0.0005, + "epoch": 0.7182113065059188, + "step": 16025 + }, + { + "loss": 13.8557, + "grad_norm": 1.8541926145553589, + "learning_rate": 0.0005, + "epoch": 0.7184353973971843, + "step": 16030 + }, + { + "loss": 13.9438, + "grad_norm": 1.8825889825820923, + "learning_rate": 0.0005, + "epoch": 0.7186594882884498, + "step": 16035 + }, + { + "loss": 13.891, + "grad_norm": 1.7605642080307007, + "learning_rate": 0.0005, + "epoch": 0.7188835791797152, + "step": 16040 + }, + { + "loss": 13.9985, + "grad_norm": 1.8549124002456665, + "learning_rate": 0.0005, + "epoch": 0.7191076700709808, + "step": 16045 + }, + { + "loss": 14.0044, + "grad_norm": 1.8616878986358643, + "learning_rate": 0.0005, + "epoch": 0.7193317609622463, + "step": 16050 + }, + { + "loss": 13.986, + "grad_norm": 2.120058059692383, + "learning_rate": 0.0005, + "epoch": 0.7195558518535118, + "step": 16055 + }, + { + "loss": 13.9578, + "grad_norm": 1.7615188360214233, + "learning_rate": 0.0005, + "epoch": 0.7197799427447773, + "step": 16060 + }, + { + "loss": 13.9921, + "grad_norm": 1.9902580976486206, + "learning_rate": 0.0005, + "epoch": 0.7200040336360428, + "step": 16065 + }, + { + "loss": 13.8947, + "grad_norm": 1.999809980392456, + "learning_rate": 0.0005, + "epoch": 0.7202281245273082, + "step": 16070 + }, + { + "loss": 13.9399, + "grad_norm": 1.9101618528366089, + "learning_rate": 0.0005, + "epoch": 0.7204522154185737, + "step": 16075 + }, + { + "loss": 13.9123, + "grad_norm": 1.882951259613037, + "learning_rate": 0.0005, + "epoch": 0.7206763063098393, + "step": 16080 + }, + { + "loss": 13.8909, + "grad_norm": 1.6697813272476196, + "learning_rate": 0.0005, + "epoch": 0.7209003972011048, + "step": 16085 + }, + { + "loss": 13.9208, + "grad_norm": 1.7573597431182861, + "learning_rate": 0.0005, + "epoch": 0.7211244880923703, + "step": 16090 + }, + { + "loss": 13.9518, + "grad_norm": 1.8712494373321533, + "learning_rate": 0.0005, + "epoch": 0.7213485789836358, + "step": 16095 + }, + { + "loss": 14.035, + "grad_norm": 1.8593941926956177, + "learning_rate": 0.0005, + "epoch": 0.7215726698749012, + "step": 16100 + }, + { + "loss": 13.9139, + "grad_norm": 1.979432463645935, + "learning_rate": 0.0005, + "epoch": 0.7217967607661667, + "step": 16105 + }, + { + "loss": 13.9773, + "grad_norm": 1.9317654371261597, + "learning_rate": 0.0005, + "epoch": 0.7220208516574322, + "step": 16110 + }, + { + "loss": 13.9748, + "grad_norm": 1.9029157161712646, + "learning_rate": 0.0005, + "epoch": 0.7222449425486978, + "step": 16115 + }, + { + "loss": 13.975, + "grad_norm": 1.7725335359573364, + "learning_rate": 0.0005, + "epoch": 0.7224690334399633, + "step": 16120 + }, + { + "loss": 13.9674, + "grad_norm": 1.7740458250045776, + "learning_rate": 0.0005, + "epoch": 0.7226931243312288, + "step": 16125 + }, + { + "loss": 13.892, + "grad_norm": 1.7745329141616821, + "learning_rate": 0.0005, + "epoch": 0.7229172152224942, + "step": 16130 + }, + { + "loss": 13.9921, + "grad_norm": 1.7700062990188599, + "learning_rate": 0.0005, + "epoch": 0.7231413061137597, + "step": 16135 + }, + { + "loss": 14.0545, + "grad_norm": 1.8467191457748413, + "learning_rate": 0.0005, + "epoch": 0.7233653970050252, + "step": 16140 + }, + { + "loss": 13.9745, + "grad_norm": 1.7570888996124268, + "learning_rate": 0.0005, + "epoch": 0.7235894878962907, + "step": 16145 + }, + { + "loss": 13.9612, + "grad_norm": 1.7153857946395874, + "learning_rate": 0.0005, + "epoch": 0.7238135787875563, + "step": 16150 + }, + { + "loss": 13.9667, + "grad_norm": 1.918713092803955, + "learning_rate": 0.0005, + "epoch": 0.7240376696788218, + "step": 16155 + }, + { + "loss": 14.0746, + "grad_norm": 1.8288605213165283, + "learning_rate": 0.0005, + "epoch": 0.7242617605700872, + "step": 16160 + }, + { + "loss": 13.9114, + "grad_norm": 1.7235784530639648, + "learning_rate": 0.0005, + "epoch": 0.7244858514613527, + "step": 16165 + }, + { + "loss": 13.9968, + "grad_norm": 1.6746882200241089, + "learning_rate": 0.0005, + "epoch": 0.7247099423526182, + "step": 16170 + }, + { + "loss": 13.9439, + "grad_norm": 1.731046438217163, + "learning_rate": 0.0005, + "epoch": 0.7249340332438837, + "step": 16175 + }, + { + "loss": 13.9765, + "grad_norm": 1.830639123916626, + "learning_rate": 0.0005, + "epoch": 0.7251581241351492, + "step": 16180 + }, + { + "loss": 13.9256, + "grad_norm": 2.0223536491394043, + "learning_rate": 0.0005, + "epoch": 0.7253822150264148, + "step": 16185 + }, + { + "loss": 13.891, + "grad_norm": 1.7806121110916138, + "learning_rate": 0.0005, + "epoch": 0.7256063059176802, + "step": 16190 + }, + { + "loss": 14.0454, + "grad_norm": 1.6691431999206543, + "learning_rate": 0.0005, + "epoch": 0.7258303968089457, + "step": 16195 + }, + { + "loss": 13.8613, + "grad_norm": 1.7030301094055176, + "learning_rate": 0.0005, + "epoch": 0.7260544877002112, + "step": 16200 + }, + { + "loss": 13.869, + "grad_norm": 1.8289581537246704, + "learning_rate": 0.0005, + "epoch": 0.7262785785914767, + "step": 16205 + }, + { + "loss": 13.9307, + "grad_norm": 1.9004987478256226, + "learning_rate": 0.0005, + "epoch": 0.7265026694827422, + "step": 16210 + }, + { + "loss": 13.9059, + "grad_norm": 1.8068387508392334, + "learning_rate": 0.0005, + "epoch": 0.7267267603740077, + "step": 16215 + }, + { + "loss": 13.8845, + "grad_norm": 1.8290519714355469, + "learning_rate": 0.0005, + "epoch": 0.7269508512652731, + "step": 16220 + }, + { + "loss": 13.9848, + "grad_norm": 1.8770573139190674, + "learning_rate": 0.0005, + "epoch": 0.7271749421565387, + "step": 16225 + }, + { + "loss": 13.9106, + "grad_norm": 1.7803281545639038, + "learning_rate": 0.0005, + "epoch": 0.7273990330478042, + "step": 16230 + }, + { + "loss": 13.9031, + "grad_norm": 1.7252570390701294, + "learning_rate": 0.0005, + "epoch": 0.7276231239390697, + "step": 16235 + }, + { + "loss": 13.882, + "grad_norm": 1.8499126434326172, + "learning_rate": 0.0005, + "epoch": 0.7278472148303352, + "step": 16240 + }, + { + "loss": 13.9478, + "grad_norm": 1.8788185119628906, + "learning_rate": 0.0005, + "epoch": 0.7280713057216007, + "step": 16245 + }, + { + "loss": 13.9407, + "grad_norm": 1.9770047664642334, + "learning_rate": 0.0005, + "epoch": 0.7282953966128661, + "step": 16250 + }, + { + "loss": 13.9473, + "grad_norm": 1.9661725759506226, + "learning_rate": 0.0005, + "epoch": 0.7285194875041316, + "step": 16255 + }, + { + "loss": 13.9523, + "grad_norm": 1.7707750797271729, + "learning_rate": 0.0005, + "epoch": 0.7287435783953972, + "step": 16260 + }, + { + "loss": 13.9735, + "grad_norm": 1.7866833209991455, + "learning_rate": 0.0005, + "epoch": 0.7289676692866627, + "step": 16265 + }, + { + "loss": 13.9603, + "grad_norm": 1.8298813104629517, + "learning_rate": 0.0005, + "epoch": 0.7291917601779282, + "step": 16270 + }, + { + "loss": 13.9271, + "grad_norm": 1.8025341033935547, + "learning_rate": 0.0005, + "epoch": 0.7294158510691937, + "step": 16275 + }, + { + "loss": 13.9085, + "grad_norm": 1.7539923191070557, + "learning_rate": 0.0005, + "epoch": 0.7296399419604591, + "step": 16280 + }, + { + "loss": 13.9229, + "grad_norm": 1.9807707071304321, + "learning_rate": 0.0005, + "epoch": 0.7298640328517246, + "step": 16285 + }, + { + "loss": 13.9094, + "grad_norm": 1.8446540832519531, + "learning_rate": 0.0005, + "epoch": 0.7300881237429901, + "step": 16290 + }, + { + "loss": 13.9222, + "grad_norm": 1.7621265649795532, + "learning_rate": 0.0005, + "epoch": 0.7303122146342557, + "step": 16295 + }, + { + "loss": 13.9597, + "grad_norm": 1.9226858615875244, + "learning_rate": 0.0005, + "epoch": 0.7305363055255212, + "step": 16300 + }, + { + "loss": 13.8855, + "grad_norm": 1.8753238916397095, + "learning_rate": 0.0005, + "epoch": 0.7307603964167867, + "step": 16305 + }, + { + "loss": 13.9531, + "grad_norm": 1.7832648754119873, + "learning_rate": 0.0005, + "epoch": 0.7309844873080521, + "step": 16310 + }, + { + "loss": 13.9558, + "grad_norm": 1.9437285661697388, + "learning_rate": 0.0005, + "epoch": 0.7312085781993176, + "step": 16315 + }, + { + "loss": 13.9162, + "grad_norm": 1.9407806396484375, + "learning_rate": 0.0005, + "epoch": 0.7314326690905831, + "step": 16320 + }, + { + "loss": 13.9994, + "grad_norm": 1.8932803869247437, + "learning_rate": 0.0005, + "epoch": 0.7316567599818486, + "step": 16325 + }, + { + "loss": 14.0211, + "grad_norm": 1.7515097856521606, + "learning_rate": 0.0005, + "epoch": 0.7318808508731142, + "step": 16330 + }, + { + "loss": 13.9126, + "grad_norm": 1.8044368028640747, + "learning_rate": 0.0005, + "epoch": 0.7321049417643797, + "step": 16335 + }, + { + "loss": 14.0328, + "grad_norm": 1.7515829801559448, + "learning_rate": 0.0005, + "epoch": 0.7323290326556451, + "step": 16340 + }, + { + "loss": 13.9308, + "grad_norm": 1.720150351524353, + "learning_rate": 0.0005, + "epoch": 0.7325531235469106, + "step": 16345 + }, + { + "loss": 14.0479, + "grad_norm": 1.8786776065826416, + "learning_rate": 0.0005, + "epoch": 0.7327772144381761, + "step": 16350 + }, + { + "loss": 14.0188, + "grad_norm": 1.9431655406951904, + "learning_rate": 0.0005, + "epoch": 0.7330013053294416, + "step": 16355 + }, + { + "loss": 13.9009, + "grad_norm": 1.9714356660842896, + "learning_rate": 0.0005, + "epoch": 0.7332253962207071, + "step": 16360 + }, + { + "loss": 13.8668, + "grad_norm": 1.8676221370697021, + "learning_rate": 0.0005, + "epoch": 0.7334494871119727, + "step": 16365 + }, + { + "loss": 14.0674, + "grad_norm": 1.862158179283142, + "learning_rate": 0.0005, + "epoch": 0.7336735780032381, + "step": 16370 + }, + { + "loss": 13.8538, + "grad_norm": 1.813812255859375, + "learning_rate": 0.0005, + "epoch": 0.7338976688945036, + "step": 16375 + }, + { + "loss": 14.0629, + "grad_norm": 2.1444220542907715, + "learning_rate": 0.0005, + "epoch": 0.7341217597857691, + "step": 16380 + }, + { + "loss": 13.9987, + "grad_norm": 1.824510097503662, + "learning_rate": 0.0005, + "epoch": 0.7343458506770346, + "step": 16385 + }, + { + "loss": 13.9475, + "grad_norm": 1.9032855033874512, + "learning_rate": 0.0005, + "epoch": 0.7345699415683001, + "step": 16390 + }, + { + "loss": 13.8946, + "grad_norm": 1.7234930992126465, + "learning_rate": 0.0005, + "epoch": 0.7347940324595656, + "step": 16395 + }, + { + "loss": 13.9531, + "grad_norm": 1.8535950183868408, + "learning_rate": 0.0005, + "epoch": 0.735018123350831, + "step": 16400 + }, + { + "loss": 13.935, + "grad_norm": 1.7756191492080688, + "learning_rate": 0.0005, + "epoch": 0.7352422142420966, + "step": 16405 + }, + { + "loss": 14.0069, + "grad_norm": 1.6882721185684204, + "learning_rate": 0.0005, + "epoch": 0.7354663051333621, + "step": 16410 + }, + { + "loss": 13.9706, + "grad_norm": 1.888295292854309, + "learning_rate": 0.0005, + "epoch": 0.7356903960246276, + "step": 16415 + }, + { + "loss": 14.0556, + "grad_norm": 1.7977948188781738, + "learning_rate": 0.0005, + "epoch": 0.7359144869158931, + "step": 16420 + }, + { + "loss": 13.952, + "grad_norm": 1.8391417264938354, + "learning_rate": 0.0005, + "epoch": 0.7361385778071586, + "step": 16425 + }, + { + "loss": 13.911, + "grad_norm": 1.754473328590393, + "learning_rate": 0.0005, + "epoch": 0.736362668698424, + "step": 16430 + }, + { + "loss": 13.9549, + "grad_norm": 1.7115532159805298, + "learning_rate": 0.0005, + "epoch": 0.7365867595896896, + "step": 16435 + }, + { + "loss": 13.9322, + "grad_norm": 1.6821171045303345, + "learning_rate": 0.0005, + "epoch": 0.7368108504809551, + "step": 16440 + }, + { + "loss": 13.9352, + "grad_norm": 1.6488306522369385, + "learning_rate": 0.0005, + "epoch": 0.7370349413722206, + "step": 16445 + }, + { + "loss": 13.8723, + "grad_norm": 1.7560571432113647, + "learning_rate": 0.0005, + "epoch": 0.7372590322634861, + "step": 16450 + }, + { + "loss": 13.9244, + "grad_norm": 1.8154276609420776, + "learning_rate": 0.0005, + "epoch": 0.7374831231547516, + "step": 16455 + }, + { + "loss": 14.1038, + "grad_norm": 2.00632905960083, + "learning_rate": 0.0005, + "epoch": 0.737707214046017, + "step": 16460 + }, + { + "loss": 13.986, + "grad_norm": 1.9674841165542603, + "learning_rate": 0.0005, + "epoch": 0.7379313049372825, + "step": 16465 + }, + { + "loss": 13.9672, + "grad_norm": 1.891155481338501, + "learning_rate": 0.0005, + "epoch": 0.738155395828548, + "step": 16470 + }, + { + "loss": 14.0534, + "grad_norm": 1.940896987915039, + "learning_rate": 0.0005, + "epoch": 0.7383794867198136, + "step": 16475 + }, + { + "loss": 13.9107, + "grad_norm": 1.842924952507019, + "learning_rate": 0.0005, + "epoch": 0.7386035776110791, + "step": 16480 + }, + { + "loss": 13.9201, + "grad_norm": 1.7718983888626099, + "learning_rate": 0.0005, + "epoch": 0.7388276685023446, + "step": 16485 + }, + { + "loss": 13.8871, + "grad_norm": 1.7668657302856445, + "learning_rate": 0.0005, + "epoch": 0.73905175939361, + "step": 16490 + }, + { + "loss": 14.0596, + "grad_norm": 1.8025968074798584, + "learning_rate": 0.0005, + "epoch": 0.7392758502848755, + "step": 16495 + }, + { + "loss": 13.7597, + "grad_norm": 1.8243893384933472, + "learning_rate": 0.0005, + "epoch": 0.739499941176141, + "step": 16500 + }, + { + "eval_loss": 1.7385435104370117, + "eval_runtime": 18.6914, + "eval_samples_per_second": 876.555, + "eval_steps_per_second": 7.865, + "epoch": 0.739499941176141, + "step": 16500 + }, + { + "loss": 13.9621, + "grad_norm": 1.63633394241333, + "learning_rate": 0.0005, + "epoch": 0.7397240320674066, + "step": 16505 + }, + { + "loss": 13.7676, + "grad_norm": 1.9245598316192627, + "learning_rate": 0.0005, + "epoch": 0.7399481229586721, + "step": 16510 + }, + { + "loss": 13.9617, + "grad_norm": 1.7750827074050903, + "learning_rate": 0.0005, + "epoch": 0.7401722138499376, + "step": 16515 + }, + { + "loss": 13.9345, + "grad_norm": 1.7867636680603027, + "learning_rate": 0.0005, + "epoch": 0.740396304741203, + "step": 16520 + }, + { + "loss": 13.9694, + "grad_norm": 1.8457869291305542, + "learning_rate": 0.0005, + "epoch": 0.7406203956324685, + "step": 16525 + }, + { + "loss": 13.8827, + "grad_norm": 1.8203812837600708, + "learning_rate": 0.0005, + "epoch": 0.740844486523734, + "step": 16530 + }, + { + "loss": 13.9653, + "grad_norm": 1.8373428583145142, + "learning_rate": 0.0005, + "epoch": 0.7410685774149995, + "step": 16535 + }, + { + "loss": 13.8845, + "grad_norm": 1.7521898746490479, + "learning_rate": 0.0005, + "epoch": 0.7412926683062651, + "step": 16540 + }, + { + "loss": 13.97, + "grad_norm": 1.7927263975143433, + "learning_rate": 0.0005, + "epoch": 0.7415167591975306, + "step": 16545 + }, + { + "loss": 13.986, + "grad_norm": 2.0001494884490967, + "learning_rate": 0.0005, + "epoch": 0.741740850088796, + "step": 16550 + }, + { + "loss": 13.9242, + "grad_norm": 1.772683024406433, + "learning_rate": 0.0005, + "epoch": 0.7419649409800615, + "step": 16555 + }, + { + "loss": 13.9403, + "grad_norm": 1.6826038360595703, + "learning_rate": 0.0005, + "epoch": 0.742189031871327, + "step": 16560 + }, + { + "loss": 13.92, + "grad_norm": 1.8097511529922485, + "learning_rate": 0.0005, + "epoch": 0.7424131227625925, + "step": 16565 + }, + { + "loss": 13.9673, + "grad_norm": 1.8825819492340088, + "learning_rate": 0.0005, + "epoch": 0.742637213653858, + "step": 16570 + }, + { + "loss": 13.8282, + "grad_norm": 1.8934838771820068, + "learning_rate": 0.0005, + "epoch": 0.7428613045451236, + "step": 16575 + }, + { + "loss": 13.9764, + "grad_norm": 1.666270136833191, + "learning_rate": 0.0005, + "epoch": 0.743085395436389, + "step": 16580 + }, + { + "loss": 13.9107, + "grad_norm": 1.7280218601226807, + "learning_rate": 0.0005, + "epoch": 0.7433094863276545, + "step": 16585 + }, + { + "loss": 13.8924, + "grad_norm": 1.6558258533477783, + "learning_rate": 0.0005, + "epoch": 0.74353357721892, + "step": 16590 + }, + { + "loss": 13.8897, + "grad_norm": 1.7129403352737427, + "learning_rate": 0.0005, + "epoch": 0.7437576681101855, + "step": 16595 + }, + { + "loss": 13.8267, + "grad_norm": 1.8275370597839355, + "learning_rate": 0.0005, + "epoch": 0.743981759001451, + "step": 16600 + }, + { + "loss": 13.8312, + "grad_norm": 1.9228730201721191, + "learning_rate": 0.0005, + "epoch": 0.7442058498927164, + "step": 16605 + }, + { + "loss": 14.0149, + "grad_norm": 1.7469290494918823, + "learning_rate": 0.0005, + "epoch": 0.744429940783982, + "step": 16610 + }, + { + "loss": 13.8832, + "grad_norm": 1.7500646114349365, + "learning_rate": 0.0005, + "epoch": 0.7446540316752475, + "step": 16615 + }, + { + "loss": 13.9315, + "grad_norm": 1.7939316034317017, + "learning_rate": 0.0005, + "epoch": 0.744878122566513, + "step": 16620 + }, + { + "loss": 13.82, + "grad_norm": 1.9013673067092896, + "learning_rate": 0.0005, + "epoch": 0.7451022134577785, + "step": 16625 + }, + { + "loss": 13.8628, + "grad_norm": 1.7872364521026611, + "learning_rate": 0.0005, + "epoch": 0.745326304349044, + "step": 16630 + }, + { + "loss": 13.9992, + "grad_norm": 1.8590223789215088, + "learning_rate": 0.0005, + "epoch": 0.7455503952403094, + "step": 16635 + }, + { + "loss": 14.049, + "grad_norm": 1.8022938966751099, + "learning_rate": 0.0005, + "epoch": 0.7457744861315749, + "step": 16640 + }, + { + "loss": 13.9455, + "grad_norm": 1.7616809606552124, + "learning_rate": 0.0005, + "epoch": 0.7459985770228404, + "step": 16645 + }, + { + "loss": 13.8782, + "grad_norm": 1.9642635583877563, + "learning_rate": 0.0005, + "epoch": 0.746222667914106, + "step": 16650 + }, + { + "loss": 13.9376, + "grad_norm": 1.945994257926941, + "learning_rate": 0.0005, + "epoch": 0.7464467588053715, + "step": 16655 + }, + { + "loss": 13.9022, + "grad_norm": 1.802782416343689, + "learning_rate": 0.0005, + "epoch": 0.746670849696637, + "step": 16660 + }, + { + "loss": 14.0735, + "grad_norm": 1.7638579607009888, + "learning_rate": 0.0005, + "epoch": 0.7468949405879024, + "step": 16665 + }, + { + "loss": 13.8999, + "grad_norm": 1.7529107332229614, + "learning_rate": 0.0005, + "epoch": 0.7471190314791679, + "step": 16670 + }, + { + "loss": 13.9178, + "grad_norm": 1.919114112854004, + "learning_rate": 0.0005, + "epoch": 0.7473431223704334, + "step": 16675 + }, + { + "loss": 13.8848, + "grad_norm": 1.8599441051483154, + "learning_rate": 0.0005, + "epoch": 0.747567213261699, + "step": 16680 + }, + { + "loss": 13.9828, + "grad_norm": 1.8337881565093994, + "learning_rate": 0.0005, + "epoch": 0.7477913041529645, + "step": 16685 + }, + { + "loss": 13.9717, + "grad_norm": 1.919255018234253, + "learning_rate": 0.0005, + "epoch": 0.74801539504423, + "step": 16690 + }, + { + "loss": 13.8769, + "grad_norm": 1.7353612184524536, + "learning_rate": 0.0005, + "epoch": 0.7482394859354954, + "step": 16695 + }, + { + "loss": 13.9762, + "grad_norm": 1.7704685926437378, + "learning_rate": 0.0005, + "epoch": 0.7484635768267609, + "step": 16700 + }, + { + "loss": 13.9186, + "grad_norm": 1.7047014236450195, + "learning_rate": 0.0005, + "epoch": 0.7486876677180264, + "step": 16705 + }, + { + "loss": 13.8664, + "grad_norm": 1.841894268989563, + "learning_rate": 0.0005, + "epoch": 0.7489117586092919, + "step": 16710 + }, + { + "loss": 13.9868, + "grad_norm": 1.6897482872009277, + "learning_rate": 0.0005, + "epoch": 0.7491358495005574, + "step": 16715 + }, + { + "loss": 14.0523, + "grad_norm": 1.8365100622177124, + "learning_rate": 0.0005, + "epoch": 0.749359940391823, + "step": 16720 + }, + { + "loss": 13.9071, + "grad_norm": 1.8162260055541992, + "learning_rate": 0.0005, + "epoch": 0.7495840312830884, + "step": 16725 + }, + { + "loss": 13.9705, + "grad_norm": 1.6817643642425537, + "learning_rate": 0.0005, + "epoch": 0.7498081221743539, + "step": 16730 + }, + { + "loss": 13.9116, + "grad_norm": 1.930167317390442, + "learning_rate": 0.0005, + "epoch": 0.7500322130656194, + "step": 16735 + }, + { + "loss": 13.936, + "grad_norm": 1.8676118850708008, + "learning_rate": 0.0005, + "epoch": 0.7502563039568849, + "step": 16740 + }, + { + "loss": 13.9527, + "grad_norm": 1.9254913330078125, + "learning_rate": 0.0005, + "epoch": 0.7504803948481504, + "step": 16745 + }, + { + "loss": 14.0472, + "grad_norm": 1.6709167957305908, + "learning_rate": 0.0005, + "epoch": 0.750704485739416, + "step": 16750 + }, + { + "loss": 13.794, + "grad_norm": 1.8618227243423462, + "learning_rate": 0.0005, + "epoch": 0.7509285766306814, + "step": 16755 + }, + { + "loss": 13.8492, + "grad_norm": 1.8963994979858398, + "learning_rate": 0.0005, + "epoch": 0.7511526675219469, + "step": 16760 + }, + { + "loss": 13.9819, + "grad_norm": 1.883829951286316, + "learning_rate": 0.0005, + "epoch": 0.7513767584132124, + "step": 16765 + }, + { + "loss": 13.9072, + "grad_norm": 1.777325987815857, + "learning_rate": 0.0005, + "epoch": 0.7516008493044779, + "step": 16770 + }, + { + "loss": 13.9403, + "grad_norm": 1.7898032665252686, + "learning_rate": 0.0005, + "epoch": 0.7518249401957434, + "step": 16775 + }, + { + "loss": 13.9503, + "grad_norm": 1.8185434341430664, + "learning_rate": 0.0005, + "epoch": 0.7520490310870089, + "step": 16780 + }, + { + "loss": 13.9264, + "grad_norm": 1.8035706281661987, + "learning_rate": 0.0005, + "epoch": 0.7522731219782743, + "step": 16785 + }, + { + "loss": 14.0012, + "grad_norm": 1.9327406883239746, + "learning_rate": 0.0005, + "epoch": 0.7524972128695399, + "step": 16790 + }, + { + "loss": 13.9255, + "grad_norm": 1.8507956266403198, + "learning_rate": 0.0005, + "epoch": 0.7527213037608054, + "step": 16795 + }, + { + "loss": 13.7956, + "grad_norm": 1.8498870134353638, + "learning_rate": 0.0005, + "epoch": 0.7529453946520709, + "step": 16800 + }, + { + "loss": 13.9411, + "grad_norm": 1.7851393222808838, + "learning_rate": 0.0005, + "epoch": 0.7531694855433364, + "step": 16805 + }, + { + "loss": 13.9635, + "grad_norm": 1.8075121641159058, + "learning_rate": 0.0005, + "epoch": 0.7533935764346019, + "step": 16810 + }, + { + "loss": 13.882, + "grad_norm": 1.9553426504135132, + "learning_rate": 0.0005, + "epoch": 0.7536176673258673, + "step": 16815 + }, + { + "loss": 13.8641, + "grad_norm": 1.8933708667755127, + "learning_rate": 0.0005, + "epoch": 0.7538417582171328, + "step": 16820 + }, + { + "loss": 13.9657, + "grad_norm": 1.7268893718719482, + "learning_rate": 0.0005, + "epoch": 0.7540658491083984, + "step": 16825 + }, + { + "loss": 13.9142, + "grad_norm": 1.8427122831344604, + "learning_rate": 0.0005, + "epoch": 0.7542899399996639, + "step": 16830 + }, + { + "loss": 13.9643, + "grad_norm": 1.8097573518753052, + "learning_rate": 0.0005, + "epoch": 0.7545140308909294, + "step": 16835 + }, + { + "loss": 13.9884, + "grad_norm": 1.8019248247146606, + "learning_rate": 0.0005, + "epoch": 0.7547381217821949, + "step": 16840 + }, + { + "loss": 13.9075, + "grad_norm": 1.658731460571289, + "learning_rate": 0.0005, + "epoch": 0.7549622126734603, + "step": 16845 + }, + { + "loss": 13.9988, + "grad_norm": 1.731905460357666, + "learning_rate": 0.0005, + "epoch": 0.7551863035647258, + "step": 16850 + }, + { + "loss": 14.0389, + "grad_norm": 1.7171862125396729, + "learning_rate": 0.0005, + "epoch": 0.7554103944559913, + "step": 16855 + }, + { + "loss": 13.9988, + "grad_norm": 1.8597036600112915, + "learning_rate": 0.0005, + "epoch": 0.7556344853472569, + "step": 16860 + }, + { + "loss": 14.0282, + "grad_norm": 1.7419289350509644, + "learning_rate": 0.0005, + "epoch": 0.7558585762385224, + "step": 16865 + }, + { + "loss": 13.8449, + "grad_norm": 1.7694988250732422, + "learning_rate": 0.0005, + "epoch": 0.7560826671297879, + "step": 16870 + }, + { + "loss": 13.9721, + "grad_norm": 1.701286792755127, + "learning_rate": 0.0005, + "epoch": 0.7563067580210533, + "step": 16875 + }, + { + "loss": 13.9108, + "grad_norm": 2.0243606567382812, + "learning_rate": 0.0005, + "epoch": 0.7565308489123188, + "step": 16880 + }, + { + "loss": 13.9166, + "grad_norm": 1.786948800086975, + "learning_rate": 0.0005, + "epoch": 0.7567549398035843, + "step": 16885 + }, + { + "loss": 13.9138, + "grad_norm": 1.8174067735671997, + "learning_rate": 0.0005, + "epoch": 0.7569790306948498, + "step": 16890 + }, + { + "loss": 13.8914, + "grad_norm": 1.8659418821334839, + "learning_rate": 0.0005, + "epoch": 0.7572031215861154, + "step": 16895 + }, + { + "loss": 13.9287, + "grad_norm": 1.84630286693573, + "learning_rate": 0.0005, + "epoch": 0.7574272124773809, + "step": 16900 + }, + { + "loss": 13.9427, + "grad_norm": 1.8225772380828857, + "learning_rate": 0.0005, + "epoch": 0.7576513033686463, + "step": 16905 + }, + { + "loss": 13.9092, + "grad_norm": 1.7180335521697998, + "learning_rate": 0.0005, + "epoch": 0.7578753942599118, + "step": 16910 + }, + { + "loss": 14.0041, + "grad_norm": 1.6755090951919556, + "learning_rate": 0.0005, + "epoch": 0.7580994851511773, + "step": 16915 + }, + { + "loss": 13.8323, + "grad_norm": 1.6758276224136353, + "learning_rate": 0.0005, + "epoch": 0.7583235760424428, + "step": 16920 + }, + { + "loss": 13.9482, + "grad_norm": 1.7528473138809204, + "learning_rate": 0.0005, + "epoch": 0.7585476669337083, + "step": 16925 + }, + { + "loss": 13.9798, + "grad_norm": 1.7393977642059326, + "learning_rate": 0.0005, + "epoch": 0.7587717578249739, + "step": 16930 + }, + { + "loss": 13.8582, + "grad_norm": 1.8464726209640503, + "learning_rate": 0.0005, + "epoch": 0.7589958487162393, + "step": 16935 + }, + { + "loss": 14.0705, + "grad_norm": 1.881400465965271, + "learning_rate": 0.0005, + "epoch": 0.7592199396075048, + "step": 16940 + }, + { + "loss": 13.9858, + "grad_norm": 1.8284975290298462, + "learning_rate": 0.0005, + "epoch": 0.7594440304987703, + "step": 16945 + }, + { + "loss": 13.8379, + "grad_norm": 1.7476427555084229, + "learning_rate": 0.0005, + "epoch": 0.7596681213900358, + "step": 16950 + }, + { + "loss": 13.8957, + "grad_norm": 1.874758243560791, + "learning_rate": 0.0005, + "epoch": 0.7598922122813013, + "step": 16955 + }, + { + "loss": 13.9538, + "grad_norm": 2.0369491577148438, + "learning_rate": 0.0005, + "epoch": 0.7601163031725668, + "step": 16960 + }, + { + "loss": 13.9057, + "grad_norm": 1.8507211208343506, + "learning_rate": 0.0005, + "epoch": 0.7603403940638322, + "step": 16965 + }, + { + "loss": 13.9471, + "grad_norm": 2.078411340713501, + "learning_rate": 0.0005, + "epoch": 0.7605644849550978, + "step": 16970 + }, + { + "loss": 13.8973, + "grad_norm": 1.8963191509246826, + "learning_rate": 0.0005, + "epoch": 0.7607885758463633, + "step": 16975 + }, + { + "loss": 14.0286, + "grad_norm": 1.8882653713226318, + "learning_rate": 0.0005, + "epoch": 0.7610126667376288, + "step": 16980 + }, + { + "loss": 13.9606, + "grad_norm": 1.7255523204803467, + "learning_rate": 0.0005, + "epoch": 0.7612367576288943, + "step": 16985 + }, + { + "loss": 13.8637, + "grad_norm": 1.7835386991500854, + "learning_rate": 0.0005, + "epoch": 0.7614608485201598, + "step": 16990 + }, + { + "loss": 13.9112, + "grad_norm": 1.9392805099487305, + "learning_rate": 0.0005, + "epoch": 0.7616849394114252, + "step": 16995 + }, + { + "loss": 13.905, + "grad_norm": 2.0097835063934326, + "learning_rate": 0.0005, + "epoch": 0.7619090303026907, + "step": 17000 + }, + { + "eval_loss": 1.7384229898452759, + "eval_runtime": 18.5124, + "eval_samples_per_second": 885.026, + "eval_steps_per_second": 7.941, + "epoch": 0.7619090303026907, + "step": 17000 + }, + { + "loss": 13.8823, + "grad_norm": 2.398235321044922, + "learning_rate": 0.0005, + "epoch": 0.7621331211939563, + "step": 17005 + }, + { + "loss": 14.0402, + "grad_norm": 2.1302218437194824, + "learning_rate": 0.0005, + "epoch": 0.7623572120852218, + "step": 17010 + }, + { + "loss": 13.9419, + "grad_norm": 1.7162015438079834, + "learning_rate": 0.0005, + "epoch": 0.7625813029764873, + "step": 17015 + }, + { + "loss": 13.8423, + "grad_norm": 2.0047333240509033, + "learning_rate": 0.0005, + "epoch": 0.7628053938677528, + "step": 17020 + }, + { + "loss": 13.797, + "grad_norm": 1.8566488027572632, + "learning_rate": 0.0005, + "epoch": 0.7630294847590182, + "step": 17025 + }, + { + "loss": 13.8469, + "grad_norm": 1.8598730564117432, + "learning_rate": 0.0005, + "epoch": 0.7632535756502837, + "step": 17030 + }, + { + "loss": 13.9452, + "grad_norm": 1.9671058654785156, + "learning_rate": 0.0005, + "epoch": 0.7634776665415492, + "step": 17035 + }, + { + "loss": 13.9941, + "grad_norm": 1.9251089096069336, + "learning_rate": 0.0005, + "epoch": 0.7637017574328148, + "step": 17040 + }, + { + "loss": 13.914, + "grad_norm": 1.6617097854614258, + "learning_rate": 0.0005, + "epoch": 0.7639258483240803, + "step": 17045 + }, + { + "loss": 13.9591, + "grad_norm": 1.8101284503936768, + "learning_rate": 0.0005, + "epoch": 0.7641499392153458, + "step": 17050 + }, + { + "loss": 13.7848, + "grad_norm": 1.8096923828125, + "learning_rate": 0.0005, + "epoch": 0.7643740301066112, + "step": 17055 + }, + { + "loss": 13.9079, + "grad_norm": 1.896112084388733, + "learning_rate": 0.0005, + "epoch": 0.7645981209978767, + "step": 17060 + }, + { + "loss": 14.0023, + "grad_norm": 1.9171850681304932, + "learning_rate": 0.0005, + "epoch": 0.7648222118891422, + "step": 17065 + }, + { + "loss": 13.8576, + "grad_norm": 1.7272579669952393, + "learning_rate": 0.0005, + "epoch": 0.7650463027804077, + "step": 17070 + }, + { + "loss": 13.9216, + "grad_norm": 1.7884331941604614, + "learning_rate": 0.0005, + "epoch": 0.7652703936716733, + "step": 17075 + }, + { + "loss": 13.905, + "grad_norm": 1.927427887916565, + "learning_rate": 0.0005, + "epoch": 0.7654944845629388, + "step": 17080 + }, + { + "loss": 13.8944, + "grad_norm": 1.9965101480484009, + "learning_rate": 0.0005, + "epoch": 0.7657185754542042, + "step": 17085 + }, + { + "loss": 14.0026, + "grad_norm": 1.8458534479141235, + "learning_rate": 0.0005, + "epoch": 0.7659426663454697, + "step": 17090 + }, + { + "loss": 13.9527, + "grad_norm": 1.8137216567993164, + "learning_rate": 0.0005, + "epoch": 0.7661667572367352, + "step": 17095 + }, + { + "loss": 13.8867, + "grad_norm": 1.7597182989120483, + "learning_rate": 0.0005, + "epoch": 0.7663908481280007, + "step": 17100 + }, + { + "loss": 13.9618, + "grad_norm": 1.724302053451538, + "learning_rate": 0.0005, + "epoch": 0.7666149390192663, + "step": 17105 + }, + { + "loss": 13.8788, + "grad_norm": 1.7006725072860718, + "learning_rate": 0.0005, + "epoch": 0.7668390299105318, + "step": 17110 + }, + { + "loss": 13.9112, + "grad_norm": 1.6335428953170776, + "learning_rate": 0.0005, + "epoch": 0.7670631208017972, + "step": 17115 + }, + { + "loss": 13.9204, + "grad_norm": 1.7712724208831787, + "learning_rate": 0.0005, + "epoch": 0.7672872116930627, + "step": 17120 + }, + { + "loss": 13.9966, + "grad_norm": 1.7821861505508423, + "learning_rate": 0.0005, + "epoch": 0.7675113025843282, + "step": 17125 + }, + { + "loss": 13.8706, + "grad_norm": 1.9097272157669067, + "learning_rate": 0.0005, + "epoch": 0.7677353934755937, + "step": 17130 + }, + { + "loss": 13.8536, + "grad_norm": 2.0328292846679688, + "learning_rate": 0.0005, + "epoch": 0.7679594843668592, + "step": 17135 + }, + { + "loss": 13.9957, + "grad_norm": 2.003213882446289, + "learning_rate": 0.0005, + "epoch": 0.7681835752581246, + "step": 17140 + }, + { + "loss": 13.8826, + "grad_norm": 2.1363508701324463, + "learning_rate": 0.0005, + "epoch": 0.7684076661493902, + "step": 17145 + }, + { + "loss": 13.9027, + "grad_norm": 1.8297216892242432, + "learning_rate": 0.0005, + "epoch": 0.7686317570406557, + "step": 17150 + }, + { + "loss": 13.9279, + "grad_norm": 1.645095705986023, + "learning_rate": 0.0005, + "epoch": 0.7688558479319212, + "step": 17155 + }, + { + "loss": 13.8988, + "grad_norm": 1.7365893125534058, + "learning_rate": 0.0005, + "epoch": 0.7690799388231867, + "step": 17160 + }, + { + "loss": 14.1075, + "grad_norm": 1.8787424564361572, + "learning_rate": 0.0005, + "epoch": 0.7693040297144522, + "step": 17165 + }, + { + "loss": 13.8715, + "grad_norm": 1.850673794746399, + "learning_rate": 0.0005, + "epoch": 0.7695281206057176, + "step": 17170 + }, + { + "loss": 13.9795, + "grad_norm": 2.0193114280700684, + "learning_rate": 0.0005, + "epoch": 0.7697522114969831, + "step": 17175 + }, + { + "loss": 13.957, + "grad_norm": 1.9191477298736572, + "learning_rate": 0.0005, + "epoch": 0.7699763023882487, + "step": 17180 + }, + { + "loss": 13.9135, + "grad_norm": 1.8473769426345825, + "learning_rate": 0.0005, + "epoch": 0.7702003932795142, + "step": 17185 + }, + { + "loss": 13.9033, + "grad_norm": 1.8285760879516602, + "learning_rate": 0.0005, + "epoch": 0.7704244841707797, + "step": 17190 + }, + { + "loss": 13.8782, + "grad_norm": 1.740413784980774, + "learning_rate": 0.0005, + "epoch": 0.7706485750620452, + "step": 17195 + }, + { + "loss": 13.9807, + "grad_norm": 1.9219098091125488, + "learning_rate": 0.0005, + "epoch": 0.7708726659533106, + "step": 17200 + }, + { + "loss": 13.8142, + "grad_norm": 1.693150520324707, + "learning_rate": 0.0005, + "epoch": 0.7710967568445761, + "step": 17205 + }, + { + "loss": 13.9932, + "grad_norm": 1.8358036279678345, + "learning_rate": 0.0005, + "epoch": 0.7713208477358416, + "step": 17210 + }, + { + "loss": 14.0151, + "grad_norm": 2.059954881668091, + "learning_rate": 0.0005, + "epoch": 0.7715449386271072, + "step": 17215 + }, + { + "loss": 13.9524, + "grad_norm": 1.993303656578064, + "learning_rate": 0.0005, + "epoch": 0.7717690295183727, + "step": 17220 + }, + { + "loss": 13.928, + "grad_norm": 1.7009844779968262, + "learning_rate": 0.0005, + "epoch": 0.7719931204096382, + "step": 17225 + }, + { + "loss": 13.8545, + "grad_norm": 1.731141448020935, + "learning_rate": 0.0005, + "epoch": 0.7722172113009036, + "step": 17230 + }, + { + "loss": 13.9505, + "grad_norm": 1.7808451652526855, + "learning_rate": 0.0005, + "epoch": 0.7724413021921691, + "step": 17235 + }, + { + "loss": 13.8956, + "grad_norm": 1.6263585090637207, + "learning_rate": 0.0005, + "epoch": 0.7726653930834346, + "step": 17240 + }, + { + "loss": 14.0226, + "grad_norm": 1.7615787982940674, + "learning_rate": 0.0005, + "epoch": 0.7728894839747001, + "step": 17245 + }, + { + "loss": 13.9709, + "grad_norm": 1.766060471534729, + "learning_rate": 0.0005, + "epoch": 0.7731135748659657, + "step": 17250 + }, + { + "loss": 14.0055, + "grad_norm": 1.658046841621399, + "learning_rate": 0.0005, + "epoch": 0.7733376657572312, + "step": 17255 + }, + { + "loss": 13.8568, + "grad_norm": 1.9029816389083862, + "learning_rate": 0.0005, + "epoch": 0.7735617566484966, + "step": 17260 + }, + { + "loss": 13.8713, + "grad_norm": 1.806174397468567, + "learning_rate": 0.0005, + "epoch": 0.7737858475397621, + "step": 17265 + }, + { + "loss": 13.9703, + "grad_norm": 1.8879802227020264, + "learning_rate": 0.0005, + "epoch": 0.7740099384310276, + "step": 17270 + }, + { + "loss": 13.9304, + "grad_norm": 1.9935083389282227, + "learning_rate": 0.0005, + "epoch": 0.7742340293222931, + "step": 17275 + }, + { + "loss": 14.0471, + "grad_norm": 1.9430160522460938, + "learning_rate": 0.0005, + "epoch": 0.7744581202135586, + "step": 17280 + }, + { + "loss": 13.863, + "grad_norm": 1.7574583292007446, + "learning_rate": 0.0005, + "epoch": 0.7746822111048242, + "step": 17285 + }, + { + "loss": 13.8583, + "grad_norm": 1.865325927734375, + "learning_rate": 0.0005, + "epoch": 0.7749063019960896, + "step": 17290 + }, + { + "loss": 13.8901, + "grad_norm": 1.9842028617858887, + "learning_rate": 0.0005, + "epoch": 0.7751303928873551, + "step": 17295 + }, + { + "loss": 13.9087, + "grad_norm": 1.7361700534820557, + "learning_rate": 0.0005, + "epoch": 0.7753544837786206, + "step": 17300 + }, + { + "loss": 14.0314, + "grad_norm": 1.8161813020706177, + "learning_rate": 0.0005, + "epoch": 0.7755785746698861, + "step": 17305 + }, + { + "loss": 13.9428, + "grad_norm": 1.755927324295044, + "learning_rate": 0.0005, + "epoch": 0.7758026655611516, + "step": 17310 + }, + { + "loss": 13.8968, + "grad_norm": 1.747763991355896, + "learning_rate": 0.0005, + "epoch": 0.7760267564524171, + "step": 17315 + }, + { + "loss": 14.0306, + "grad_norm": 1.6892719268798828, + "learning_rate": 0.0005, + "epoch": 0.7762508473436825, + "step": 17320 + }, + { + "loss": 13.9098, + "grad_norm": 1.8000086545944214, + "learning_rate": 0.0005, + "epoch": 0.7764749382349481, + "step": 17325 + }, + { + "loss": 14.0461, + "grad_norm": 1.8460227251052856, + "learning_rate": 0.0005, + "epoch": 0.7766990291262136, + "step": 17330 + }, + { + "loss": 13.926, + "grad_norm": 1.7183973789215088, + "learning_rate": 0.0005, + "epoch": 0.7769231200174791, + "step": 17335 + }, + { + "loss": 13.9128, + "grad_norm": 1.7747199535369873, + "learning_rate": 0.0005, + "epoch": 0.7771472109087446, + "step": 17340 + }, + { + "loss": 13.9532, + "grad_norm": 1.8093260526657104, + "learning_rate": 0.0005, + "epoch": 0.7773713018000101, + "step": 17345 + }, + { + "loss": 13.8194, + "grad_norm": 1.9444884061813354, + "learning_rate": 0.0005, + "epoch": 0.7775953926912755, + "step": 17350 + }, + { + "loss": 13.908, + "grad_norm": 1.7580726146697998, + "learning_rate": 0.0005, + "epoch": 0.777819483582541, + "step": 17355 + }, + { + "loss": 13.9547, + "grad_norm": 1.7068579196929932, + "learning_rate": 0.0005, + "epoch": 0.7780435744738066, + "step": 17360 + }, + { + "loss": 13.8771, + "grad_norm": 1.8285844326019287, + "learning_rate": 0.0005, + "epoch": 0.7782676653650721, + "step": 17365 + }, + { + "loss": 13.9304, + "grad_norm": 1.7167176008224487, + "learning_rate": 0.0005, + "epoch": 0.7784917562563376, + "step": 17370 + }, + { + "loss": 13.9736, + "grad_norm": 1.820793867111206, + "learning_rate": 0.0005, + "epoch": 0.7787158471476031, + "step": 17375 + }, + { + "loss": 13.9138, + "grad_norm": 1.8576176166534424, + "learning_rate": 0.0005, + "epoch": 0.7789399380388685, + "step": 17380 + }, + { + "loss": 13.9551, + "grad_norm": 1.7626501321792603, + "learning_rate": 0.0005, + "epoch": 0.779164028930134, + "step": 17385 + }, + { + "loss": 13.9133, + "grad_norm": 1.788894534111023, + "learning_rate": 0.0005, + "epoch": 0.7793881198213995, + "step": 17390 + }, + { + "loss": 13.8893, + "grad_norm": 1.8483351469039917, + "learning_rate": 0.0005, + "epoch": 0.7796122107126651, + "step": 17395 + }, + { + "loss": 13.9881, + "grad_norm": 1.9322800636291504, + "learning_rate": 0.0005, + "epoch": 0.7798363016039306, + "step": 17400 + }, + { + "loss": 13.8728, + "grad_norm": 1.780640721321106, + "learning_rate": 0.0005, + "epoch": 0.7800603924951961, + "step": 17405 + }, + { + "loss": 13.857, + "grad_norm": 1.921074390411377, + "learning_rate": 0.0005, + "epoch": 0.7802844833864615, + "step": 17410 + }, + { + "loss": 13.9098, + "grad_norm": 1.8547496795654297, + "learning_rate": 0.0005, + "epoch": 0.780508574277727, + "step": 17415 + }, + { + "loss": 13.8675, + "grad_norm": 1.8414533138275146, + "learning_rate": 0.0005, + "epoch": 0.7807326651689925, + "step": 17420 + }, + { + "loss": 13.8619, + "grad_norm": 1.734214186668396, + "learning_rate": 0.0005, + "epoch": 0.780956756060258, + "step": 17425 + }, + { + "loss": 13.8024, + "grad_norm": 1.8890767097473145, + "learning_rate": 0.0005, + "epoch": 0.7811808469515236, + "step": 17430 + }, + { + "loss": 13.9844, + "grad_norm": 1.832506537437439, + "learning_rate": 0.0005, + "epoch": 0.7814049378427891, + "step": 17435 + }, + { + "loss": 13.9294, + "grad_norm": 1.803244709968567, + "learning_rate": 0.0005, + "epoch": 0.7816290287340545, + "step": 17440 + }, + { + "loss": 13.956, + "grad_norm": 1.704032063484192, + "learning_rate": 0.0005, + "epoch": 0.78185311962532, + "step": 17445 + }, + { + "loss": 13.9254, + "grad_norm": 1.7548668384552002, + "learning_rate": 0.0005, + "epoch": 0.7820772105165855, + "step": 17450 + }, + { + "loss": 14.0788, + "grad_norm": 1.7440739870071411, + "learning_rate": 0.0005, + "epoch": 0.782301301407851, + "step": 17455 + }, + { + "loss": 13.8141, + "grad_norm": 1.7532907724380493, + "learning_rate": 0.0005, + "epoch": 0.7825253922991166, + "step": 17460 + }, + { + "loss": 13.8557, + "grad_norm": 1.7621829509735107, + "learning_rate": 0.0005, + "epoch": 0.7827494831903821, + "step": 17465 + }, + { + "loss": 13.9685, + "grad_norm": 1.7075783014297485, + "learning_rate": 0.0005, + "epoch": 0.7829735740816475, + "step": 17470 + }, + { + "loss": 13.8901, + "grad_norm": 1.8711745738983154, + "learning_rate": 0.0005, + "epoch": 0.783197664972913, + "step": 17475 + }, + { + "loss": 13.9876, + "grad_norm": 2.0267229080200195, + "learning_rate": 0.0005, + "epoch": 0.7834217558641785, + "step": 17480 + }, + { + "loss": 13.9416, + "grad_norm": 1.8759691715240479, + "learning_rate": 0.0005, + "epoch": 0.783645846755444, + "step": 17485 + }, + { + "loss": 13.9271, + "grad_norm": 1.8488215208053589, + "learning_rate": 0.0005, + "epoch": 0.7838699376467095, + "step": 17490 + }, + { + "loss": 13.9616, + "grad_norm": 1.79073166847229, + "learning_rate": 0.0005, + "epoch": 0.784094028537975, + "step": 17495 + }, + { + "loss": 14.0167, + "grad_norm": 1.8655067682266235, + "learning_rate": 0.0005, + "epoch": 0.7843181194292405, + "step": 17500 + }, + { + "eval_loss": 1.742244005203247, + "eval_runtime": 18.6673, + "eval_samples_per_second": 877.685, + "eval_steps_per_second": 7.875, + "epoch": 0.7843181194292405, + "step": 17500 + }, + { + "loss": 13.8936, + "grad_norm": 1.7196091413497925, + "learning_rate": 0.0005, + "epoch": 0.784542210320506, + "step": 17505 + }, + { + "loss": 13.9612, + "grad_norm": 1.8762168884277344, + "learning_rate": 0.0005, + "epoch": 0.7847663012117715, + "step": 17510 + }, + { + "loss": 13.9842, + "grad_norm": 1.8332551717758179, + "learning_rate": 0.0005, + "epoch": 0.784990392103037, + "step": 17515 + }, + { + "loss": 14.0338, + "grad_norm": 1.7468173503875732, + "learning_rate": 0.0005, + "epoch": 0.7852144829943025, + "step": 17520 + }, + { + "loss": 13.8849, + "grad_norm": 1.7667865753173828, + "learning_rate": 0.0005, + "epoch": 0.785438573885568, + "step": 17525 + }, + { + "loss": 13.9859, + "grad_norm": 1.714019775390625, + "learning_rate": 0.0005, + "epoch": 0.7856626647768334, + "step": 17530 + }, + { + "loss": 14.0067, + "grad_norm": 1.8588460683822632, + "learning_rate": 0.0005, + "epoch": 0.785886755668099, + "step": 17535 + }, + { + "loss": 14.0311, + "grad_norm": 1.6515707969665527, + "learning_rate": 0.0005, + "epoch": 0.7861108465593645, + "step": 17540 + }, + { + "loss": 13.863, + "grad_norm": 1.7797318696975708, + "learning_rate": 0.0005, + "epoch": 0.78633493745063, + "step": 17545 + }, + { + "loss": 14.0213, + "grad_norm": 1.6977412700653076, + "learning_rate": 0.0005, + "epoch": 0.7865590283418955, + "step": 17550 + }, + { + "loss": 13.8579, + "grad_norm": 1.8260390758514404, + "learning_rate": 0.0005, + "epoch": 0.786783119233161, + "step": 17555 + }, + { + "loss": 13.9899, + "grad_norm": 1.8567028045654297, + "learning_rate": 0.0005, + "epoch": 0.7870072101244264, + "step": 17560 + }, + { + "loss": 14.0152, + "grad_norm": 1.9553159475326538, + "learning_rate": 0.0005, + "epoch": 0.7872313010156919, + "step": 17565 + }, + { + "loss": 14.0009, + "grad_norm": 1.808526873588562, + "learning_rate": 0.0005, + "epoch": 0.7874553919069575, + "step": 17570 + }, + { + "loss": 13.8205, + "grad_norm": 1.8745321035385132, + "learning_rate": 0.0005, + "epoch": 0.787679482798223, + "step": 17575 + }, + { + "loss": 13.8825, + "grad_norm": 1.6873557567596436, + "learning_rate": 0.0005, + "epoch": 0.7879035736894885, + "step": 17580 + }, + { + "loss": 14.0002, + "grad_norm": 1.663845181465149, + "learning_rate": 0.0005, + "epoch": 0.788127664580754, + "step": 17585 + }, + { + "loss": 13.9907, + "grad_norm": 1.7362158298492432, + "learning_rate": 0.0005, + "epoch": 0.7883517554720194, + "step": 17590 + }, + { + "loss": 13.9699, + "grad_norm": 1.9989452362060547, + "learning_rate": 0.0005, + "epoch": 0.7885758463632849, + "step": 17595 + }, + { + "loss": 13.9033, + "grad_norm": 1.7774451971054077, + "learning_rate": 0.0005, + "epoch": 0.7887999372545504, + "step": 17600 + }, + { + "loss": 13.9771, + "grad_norm": 1.8537983894348145, + "learning_rate": 0.0005, + "epoch": 0.789024028145816, + "step": 17605 + }, + { + "loss": 13.9926, + "grad_norm": 1.902446985244751, + "learning_rate": 0.0005, + "epoch": 0.7892481190370815, + "step": 17610 + }, + { + "loss": 14.0141, + "grad_norm": 1.8638439178466797, + "learning_rate": 0.0005, + "epoch": 0.789472209928347, + "step": 17615 + }, + { + "loss": 13.9426, + "grad_norm": 1.8101930618286133, + "learning_rate": 0.0005, + "epoch": 0.7896963008196124, + "step": 17620 + }, + { + "loss": 13.9717, + "grad_norm": 1.8008956909179688, + "learning_rate": 0.0005, + "epoch": 0.7899203917108779, + "step": 17625 + }, + { + "loss": 13.8962, + "grad_norm": 1.8648358583450317, + "learning_rate": 0.0005, + "epoch": 0.7901444826021434, + "step": 17630 + }, + { + "loss": 13.8996, + "grad_norm": 1.699042558670044, + "learning_rate": 0.0005, + "epoch": 0.7903685734934089, + "step": 17635 + }, + { + "loss": 13.8396, + "grad_norm": 1.7029451131820679, + "learning_rate": 0.0005, + "epoch": 0.7905926643846745, + "step": 17640 + }, + { + "loss": 13.9211, + "grad_norm": 1.6374444961547852, + "learning_rate": 0.0005, + "epoch": 0.79081675527594, + "step": 17645 + }, + { + "loss": 13.98, + "grad_norm": 1.8231736421585083, + "learning_rate": 0.0005, + "epoch": 0.7910408461672054, + "step": 17650 + }, + { + "loss": 13.8947, + "grad_norm": 1.7066539525985718, + "learning_rate": 0.0005, + "epoch": 0.7912649370584709, + "step": 17655 + }, + { + "loss": 13.9631, + "grad_norm": 1.9213404655456543, + "learning_rate": 0.0005, + "epoch": 0.7914890279497364, + "step": 17660 + }, + { + "loss": 13.8136, + "grad_norm": 1.8781342506408691, + "learning_rate": 0.0005, + "epoch": 0.7917131188410019, + "step": 17665 + }, + { + "loss": 13.9319, + "grad_norm": 1.76526939868927, + "learning_rate": 0.0005, + "epoch": 0.7919372097322674, + "step": 17670 + }, + { + "loss": 14.0505, + "grad_norm": 1.7963687181472778, + "learning_rate": 0.0005, + "epoch": 0.792161300623533, + "step": 17675 + }, + { + "loss": 13.9135, + "grad_norm": 1.9999289512634277, + "learning_rate": 0.0005, + "epoch": 0.7923853915147984, + "step": 17680 + }, + { + "loss": 13.9473, + "grad_norm": 1.8966845273971558, + "learning_rate": 0.0005, + "epoch": 0.7926094824060639, + "step": 17685 + }, + { + "loss": 13.9459, + "grad_norm": 1.9211297035217285, + "learning_rate": 0.0005, + "epoch": 0.7928335732973294, + "step": 17690 + }, + { + "loss": 13.8553, + "grad_norm": 1.8830465078353882, + "learning_rate": 0.0005, + "epoch": 0.7930576641885949, + "step": 17695 + }, + { + "loss": 13.9025, + "grad_norm": 1.9547890424728394, + "learning_rate": 0.0005, + "epoch": 0.7932817550798604, + "step": 17700 + }, + { + "loss": 14.1072, + "grad_norm": 1.8501747846603394, + "learning_rate": 0.0005, + "epoch": 0.7935058459711258, + "step": 17705 + }, + { + "loss": 13.7782, + "grad_norm": 1.9531677961349487, + "learning_rate": 0.0005, + "epoch": 0.7937299368623913, + "step": 17710 + }, + { + "loss": 13.9874, + "grad_norm": 2.038963794708252, + "learning_rate": 0.0005, + "epoch": 0.7939540277536569, + "step": 17715 + }, + { + "loss": 13.9117, + "grad_norm": 1.886931300163269, + "learning_rate": 0.0005, + "epoch": 0.7941781186449224, + "step": 17720 + }, + { + "loss": 13.9347, + "grad_norm": 1.8401106595993042, + "learning_rate": 0.0005, + "epoch": 0.7944022095361879, + "step": 17725 + }, + { + "loss": 13.9397, + "grad_norm": 1.7104369401931763, + "learning_rate": 0.0005, + "epoch": 0.7946263004274534, + "step": 17730 + }, + { + "loss": 13.8901, + "grad_norm": 1.6022361516952515, + "learning_rate": 0.0005, + "epoch": 0.7948503913187188, + "step": 17735 + }, + { + "loss": 13.9179, + "grad_norm": 1.8352298736572266, + "learning_rate": 0.0005, + "epoch": 0.7950744822099843, + "step": 17740 + }, + { + "loss": 13.973, + "grad_norm": 1.8135077953338623, + "learning_rate": 0.0005, + "epoch": 0.7952985731012499, + "step": 17745 + }, + { + "loss": 14.0032, + "grad_norm": 1.7439593076705933, + "learning_rate": 0.0005, + "epoch": 0.7955226639925154, + "step": 17750 + }, + { + "loss": 13.9162, + "grad_norm": 1.909914255142212, + "learning_rate": 0.0005, + "epoch": 0.7957467548837809, + "step": 17755 + }, + { + "loss": 13.9659, + "grad_norm": 1.9050699472427368, + "learning_rate": 0.0005, + "epoch": 0.7959708457750464, + "step": 17760 + }, + { + "loss": 14.0256, + "grad_norm": 1.806579828262329, + "learning_rate": 0.0005, + "epoch": 0.7961949366663118, + "step": 17765 + }, + { + "loss": 13.9308, + "grad_norm": 1.8770440816879272, + "learning_rate": 0.0005, + "epoch": 0.7964190275575773, + "step": 17770 + }, + { + "loss": 13.8989, + "grad_norm": 2.0780200958251953, + "learning_rate": 0.0005, + "epoch": 0.7966431184488428, + "step": 17775 + }, + { + "loss": 13.9108, + "grad_norm": 1.651223063468933, + "learning_rate": 0.0005, + "epoch": 0.7968672093401084, + "step": 17780 + }, + { + "loss": 13.8166, + "grad_norm": 1.7695311307907104, + "learning_rate": 0.0005, + "epoch": 0.7970913002313739, + "step": 17785 + }, + { + "loss": 13.8646, + "grad_norm": 1.7559232711791992, + "learning_rate": 0.0005, + "epoch": 0.7973153911226394, + "step": 17790 + }, + { + "loss": 13.8857, + "grad_norm": 1.7012014389038086, + "learning_rate": 0.0005, + "epoch": 0.7975394820139048, + "step": 17795 + }, + { + "loss": 13.9441, + "grad_norm": 1.9856010675430298, + "learning_rate": 0.0005, + "epoch": 0.7977635729051703, + "step": 17800 + }, + { + "loss": 13.9702, + "grad_norm": 1.7729270458221436, + "learning_rate": 0.0005, + "epoch": 0.7979876637964358, + "step": 17805 + }, + { + "loss": 13.9417, + "grad_norm": 1.7350739240646362, + "learning_rate": 0.0005, + "epoch": 0.7982117546877013, + "step": 17810 + }, + { + "loss": 13.8052, + "grad_norm": 1.6648602485656738, + "learning_rate": 0.0005, + "epoch": 0.7984358455789669, + "step": 17815 + }, + { + "loss": 14.0659, + "grad_norm": 1.7132405042648315, + "learning_rate": 0.0005, + "epoch": 0.7986599364702324, + "step": 17820 + }, + { + "loss": 13.947, + "grad_norm": 1.819995403289795, + "learning_rate": 0.0005, + "epoch": 0.7988840273614978, + "step": 17825 + }, + { + "loss": 13.8886, + "grad_norm": 1.8971381187438965, + "learning_rate": 0.0005, + "epoch": 0.7991081182527633, + "step": 17830 + }, + { + "loss": 13.8627, + "grad_norm": 1.7079675197601318, + "learning_rate": 0.0005, + "epoch": 0.7993322091440288, + "step": 17835 + }, + { + "loss": 13.9065, + "grad_norm": 1.9513356685638428, + "learning_rate": 0.0005, + "epoch": 0.7995563000352943, + "step": 17840 + }, + { + "loss": 13.9798, + "grad_norm": 2.0933167934417725, + "learning_rate": 0.0005, + "epoch": 0.7997803909265598, + "step": 17845 + }, + { + "loss": 13.9302, + "grad_norm": 2.068735122680664, + "learning_rate": 0.0005, + "epoch": 0.8000044818178254, + "step": 17850 + }, + { + "loss": 13.9432, + "grad_norm": 1.9295698404312134, + "learning_rate": 0.0005, + "epoch": 0.8002285727090908, + "step": 17855 + }, + { + "loss": 13.9133, + "grad_norm": 2.0122389793395996, + "learning_rate": 0.0005, + "epoch": 0.8004526636003563, + "step": 17860 + }, + { + "loss": 13.8614, + "grad_norm": 1.9103975296020508, + "learning_rate": 0.0005, + "epoch": 0.8006767544916218, + "step": 17865 + }, + { + "loss": 13.8453, + "grad_norm": 1.851839542388916, + "learning_rate": 0.0005, + "epoch": 0.8009008453828873, + "step": 17870 + }, + { + "loss": 13.8546, + "grad_norm": 1.8850983381271362, + "learning_rate": 0.0005, + "epoch": 0.8011249362741528, + "step": 17875 + }, + { + "loss": 13.9192, + "grad_norm": 1.7939026355743408, + "learning_rate": 0.0005, + "epoch": 0.8013490271654183, + "step": 17880 + }, + { + "loss": 14.0107, + "grad_norm": 1.7058433294296265, + "learning_rate": 0.0005, + "epoch": 0.8015731180566837, + "step": 17885 + }, + { + "loss": 13.9066, + "grad_norm": 1.8725535869598389, + "learning_rate": 0.0005, + "epoch": 0.8017972089479493, + "step": 17890 + }, + { + "loss": 13.8958, + "grad_norm": 1.797574520111084, + "learning_rate": 0.0005, + "epoch": 0.8020212998392148, + "step": 17895 + }, + { + "loss": 13.8328, + "grad_norm": 1.7859054803848267, + "learning_rate": 0.0005, + "epoch": 0.8022453907304803, + "step": 17900 + }, + { + "loss": 13.8831, + "grad_norm": 1.7225736379623413, + "learning_rate": 0.0005, + "epoch": 0.8024694816217458, + "step": 17905 + }, + { + "loss": 13.9417, + "grad_norm": 1.9291692972183228, + "learning_rate": 0.0005, + "epoch": 0.8026935725130113, + "step": 17910 + }, + { + "loss": 13.8104, + "grad_norm": 1.838352918624878, + "learning_rate": 0.0005, + "epoch": 0.8029176634042767, + "step": 17915 + }, + { + "loss": 13.9082, + "grad_norm": 1.8938319683074951, + "learning_rate": 0.0005, + "epoch": 0.8031417542955422, + "step": 17920 + }, + { + "loss": 13.95, + "grad_norm": 1.8627904653549194, + "learning_rate": 0.0005, + "epoch": 0.8033658451868078, + "step": 17925 + }, + { + "loss": 13.9393, + "grad_norm": 1.7981688976287842, + "learning_rate": 0.0005, + "epoch": 0.8035899360780733, + "step": 17930 + }, + { + "loss": 13.8658, + "grad_norm": 1.8402200937271118, + "learning_rate": 0.0005, + "epoch": 0.8038140269693388, + "step": 17935 + }, + { + "loss": 13.9451, + "grad_norm": 1.9201061725616455, + "learning_rate": 0.0005, + "epoch": 0.8040381178606043, + "step": 17940 + }, + { + "loss": 13.9314, + "grad_norm": 1.8516732454299927, + "learning_rate": 0.0005, + "epoch": 0.8042622087518697, + "step": 17945 + }, + { + "loss": 13.8518, + "grad_norm": 1.7574687004089355, + "learning_rate": 0.0005, + "epoch": 0.8044862996431352, + "step": 17950 + }, + { + "loss": 13.8738, + "grad_norm": 1.7064701318740845, + "learning_rate": 0.0005, + "epoch": 0.8047103905344007, + "step": 17955 + }, + { + "loss": 13.9399, + "grad_norm": 1.7522039413452148, + "learning_rate": 0.0005, + "epoch": 0.8049344814256663, + "step": 17960 + }, + { + "loss": 14.0378, + "grad_norm": 1.72758150100708, + "learning_rate": 0.0005, + "epoch": 0.8051585723169318, + "step": 17965 + }, + { + "loss": 13.9735, + "grad_norm": 1.7682212591171265, + "learning_rate": 0.0005, + "epoch": 0.8053826632081973, + "step": 17970 + }, + { + "loss": 13.9341, + "grad_norm": 1.6668306589126587, + "learning_rate": 0.0005, + "epoch": 0.8056067540994627, + "step": 17975 + }, + { + "loss": 13.892, + "grad_norm": 1.7351326942443848, + "learning_rate": 0.0005, + "epoch": 0.8058308449907282, + "step": 17980 + }, + { + "loss": 14.0166, + "grad_norm": 1.6612002849578857, + "learning_rate": 0.0005, + "epoch": 0.8060549358819937, + "step": 17985 + }, + { + "loss": 13.9435, + "grad_norm": 1.7721000909805298, + "learning_rate": 0.0005, + "epoch": 0.8062790267732592, + "step": 17990 + }, + { + "loss": 14.0486, + "grad_norm": 1.7399013042449951, + "learning_rate": 0.0005, + "epoch": 0.8065031176645248, + "step": 17995 + }, + { + "loss": 13.9363, + "grad_norm": 1.867493987083435, + "learning_rate": 0.0005, + "epoch": 0.8067272085557903, + "step": 18000 + }, + { + "eval_loss": 1.7346055507659912, + "eval_runtime": 18.7863, + "eval_samples_per_second": 872.126, + "eval_steps_per_second": 7.825, + "epoch": 0.8067272085557903, + "step": 18000 + }, + { + "loss": 13.9715, + "grad_norm": 1.8982458114624023, + "learning_rate": 0.0005, + "epoch": 0.8069512994470557, + "step": 18005 + }, + { + "loss": 13.9355, + "grad_norm": 2.0637786388397217, + "learning_rate": 0.0005, + "epoch": 0.8071753903383212, + "step": 18010 + }, + { + "loss": 13.9808, + "grad_norm": 1.6983004808425903, + "learning_rate": 0.0005, + "epoch": 0.8073994812295867, + "step": 18015 + }, + { + "loss": 13.8864, + "grad_norm": 1.8731592893600464, + "learning_rate": 0.0005, + "epoch": 0.8076235721208522, + "step": 18020 + }, + { + "loss": 13.831, + "grad_norm": 1.7289857864379883, + "learning_rate": 0.0005, + "epoch": 0.8078476630121177, + "step": 18025 + }, + { + "loss": 13.9294, + "grad_norm": 1.6939040422439575, + "learning_rate": 0.0005, + "epoch": 0.8080717539033833, + "step": 18030 + }, + { + "loss": 13.9463, + "grad_norm": 1.7209194898605347, + "learning_rate": 0.0005, + "epoch": 0.8082958447946487, + "step": 18035 + }, + { + "loss": 14.0243, + "grad_norm": 1.854943037033081, + "learning_rate": 0.0005, + "epoch": 0.8085199356859142, + "step": 18040 + }, + { + "loss": 13.9918, + "grad_norm": 1.75515878200531, + "learning_rate": 0.0005, + "epoch": 0.8087440265771797, + "step": 18045 + }, + { + "loss": 13.9171, + "grad_norm": 1.871372103691101, + "learning_rate": 0.0005, + "epoch": 0.8089681174684452, + "step": 18050 + }, + { + "loss": 13.9423, + "grad_norm": 1.8944830894470215, + "learning_rate": 0.0005, + "epoch": 0.8091922083597107, + "step": 18055 + }, + { + "loss": 13.8909, + "grad_norm": 1.8145884275436401, + "learning_rate": 0.0005, + "epoch": 0.8094162992509762, + "step": 18060 + }, + { + "loss": 13.9074, + "grad_norm": 2.0930395126342773, + "learning_rate": 0.0005, + "epoch": 0.8096403901422417, + "step": 18065 + }, + { + "loss": 13.8805, + "grad_norm": 1.8583229780197144, + "learning_rate": 0.0005, + "epoch": 0.8098644810335072, + "step": 18070 + }, + { + "loss": 13.9727, + "grad_norm": 1.7828443050384521, + "learning_rate": 0.0005, + "epoch": 0.8100885719247727, + "step": 18075 + }, + { + "loss": 13.9119, + "grad_norm": 1.7656959295272827, + "learning_rate": 0.0005, + "epoch": 0.8103126628160382, + "step": 18080 + }, + { + "loss": 13.8061, + "grad_norm": 1.8191685676574707, + "learning_rate": 0.0005, + "epoch": 0.8105367537073037, + "step": 18085 + }, + { + "loss": 13.9294, + "grad_norm": 1.8345978260040283, + "learning_rate": 0.0005, + "epoch": 0.8107608445985692, + "step": 18090 + }, + { + "loss": 13.9271, + "grad_norm": 1.7740345001220703, + "learning_rate": 0.0005, + "epoch": 0.8109849354898346, + "step": 18095 + }, + { + "loss": 13.9762, + "grad_norm": 1.8106119632720947, + "learning_rate": 0.0005, + "epoch": 0.8112090263811002, + "step": 18100 + }, + { + "loss": 13.9898, + "grad_norm": 1.719824194908142, + "learning_rate": 0.0005, + "epoch": 0.8114331172723657, + "step": 18105 + }, + { + "loss": 13.9451, + "grad_norm": 1.7734304666519165, + "learning_rate": 0.0005, + "epoch": 0.8116572081636312, + "step": 18110 + }, + { + "loss": 13.9099, + "grad_norm": 1.7129086256027222, + "learning_rate": 0.0005, + "epoch": 0.8118812990548967, + "step": 18115 + }, + { + "loss": 13.8419, + "grad_norm": 1.9217442274093628, + "learning_rate": 0.0005, + "epoch": 0.8121053899461622, + "step": 18120 + }, + { + "loss": 13.8746, + "grad_norm": 2.0497589111328125, + "learning_rate": 0.0005, + "epoch": 0.8123294808374276, + "step": 18125 + }, + { + "loss": 13.8953, + "grad_norm": 2.071010112762451, + "learning_rate": 0.0005, + "epoch": 0.8125535717286931, + "step": 18130 + }, + { + "loss": 13.9024, + "grad_norm": 1.7323050498962402, + "learning_rate": 0.0005, + "epoch": 0.8127776626199587, + "step": 18135 + }, + { + "loss": 13.7554, + "grad_norm": 1.8137826919555664, + "learning_rate": 0.0005, + "epoch": 0.8130017535112242, + "step": 18140 + }, + { + "loss": 13.9382, + "grad_norm": 1.8626395463943481, + "learning_rate": 0.0005, + "epoch": 0.8132258444024897, + "step": 18145 + }, + { + "loss": 13.9762, + "grad_norm": 1.7347149848937988, + "learning_rate": 0.0005, + "epoch": 0.8134499352937552, + "step": 18150 + }, + { + "loss": 13.9163, + "grad_norm": 1.817475438117981, + "learning_rate": 0.0005, + "epoch": 0.8136740261850206, + "step": 18155 + }, + { + "loss": 13.8897, + "grad_norm": 1.7329870462417603, + "learning_rate": 0.0005, + "epoch": 0.8138981170762861, + "step": 18160 + }, + { + "loss": 13.9202, + "grad_norm": 1.714704155921936, + "learning_rate": 0.0005, + "epoch": 0.8141222079675516, + "step": 18165 + }, + { + "loss": 13.974, + "grad_norm": 1.8908694982528687, + "learning_rate": 0.0005, + "epoch": 0.8143462988588172, + "step": 18170 + }, + { + "loss": 14.0251, + "grad_norm": 1.8410272598266602, + "learning_rate": 0.0005, + "epoch": 0.8145703897500827, + "step": 18175 + }, + { + "loss": 13.8982, + "grad_norm": 1.8369451761245728, + "learning_rate": 0.0005, + "epoch": 0.8147944806413482, + "step": 18180 + }, + { + "loss": 13.865, + "grad_norm": 1.8976902961730957, + "learning_rate": 0.0005, + "epoch": 0.8150185715326136, + "step": 18185 + }, + { + "loss": 13.9005, + "grad_norm": 1.7548612356185913, + "learning_rate": 0.0005, + "epoch": 0.8152426624238791, + "step": 18190 + }, + { + "loss": 13.8124, + "grad_norm": 1.7536848783493042, + "learning_rate": 0.0005, + "epoch": 0.8154667533151446, + "step": 18195 + }, + { + "loss": 13.9581, + "grad_norm": 1.722154140472412, + "learning_rate": 0.0005, + "epoch": 0.8156908442064101, + "step": 18200 + }, + { + "loss": 13.8407, + "grad_norm": 1.8374119997024536, + "learning_rate": 0.0005, + "epoch": 0.8159149350976757, + "step": 18205 + }, + { + "loss": 13.8782, + "grad_norm": 1.992284893989563, + "learning_rate": 0.0005, + "epoch": 0.8161390259889412, + "step": 18210 + }, + { + "loss": 13.8929, + "grad_norm": 1.832295298576355, + "learning_rate": 0.0005, + "epoch": 0.8163631168802066, + "step": 18215 + }, + { + "loss": 13.9147, + "grad_norm": 1.9362643957138062, + "learning_rate": 0.0005, + "epoch": 0.8165872077714721, + "step": 18220 + }, + { + "loss": 13.9553, + "grad_norm": 1.7259230613708496, + "learning_rate": 0.0005, + "epoch": 0.8168112986627376, + "step": 18225 + }, + { + "loss": 13.9219, + "grad_norm": 1.7937911748886108, + "learning_rate": 0.0005, + "epoch": 0.8170353895540031, + "step": 18230 + }, + { + "loss": 13.9057, + "grad_norm": 1.9405204057693481, + "learning_rate": 0.0005, + "epoch": 0.8172594804452686, + "step": 18235 + }, + { + "loss": 13.7556, + "grad_norm": 1.8398072719573975, + "learning_rate": 0.0005, + "epoch": 0.817483571336534, + "step": 18240 + }, + { + "loss": 13.9854, + "grad_norm": 1.8530958890914917, + "learning_rate": 0.0005, + "epoch": 0.8177076622277996, + "step": 18245 + }, + { + "loss": 14.0051, + "grad_norm": 1.9418649673461914, + "learning_rate": 0.0005, + "epoch": 0.8179317531190651, + "step": 18250 + }, + { + "loss": 13.8789, + "grad_norm": 1.7378404140472412, + "learning_rate": 0.0005, + "epoch": 0.8181558440103306, + "step": 18255 + }, + { + "loss": 13.952, + "grad_norm": 1.8054472208023071, + "learning_rate": 0.0005, + "epoch": 0.8183799349015961, + "step": 18260 + }, + { + "loss": 13.8416, + "grad_norm": 1.6757985353469849, + "learning_rate": 0.0005, + "epoch": 0.8186040257928616, + "step": 18265 + }, + { + "loss": 13.8685, + "grad_norm": 1.7990907430648804, + "learning_rate": 0.0005, + "epoch": 0.818828116684127, + "step": 18270 + }, + { + "loss": 13.9584, + "grad_norm": 1.8647428750991821, + "learning_rate": 0.0005, + "epoch": 0.8190522075753925, + "step": 18275 + }, + { + "loss": 13.8945, + "grad_norm": 1.6959542036056519, + "learning_rate": 0.0005, + "epoch": 0.8192762984666581, + "step": 18280 + }, + { + "loss": 13.9194, + "grad_norm": 1.8789013624191284, + "learning_rate": 0.0005, + "epoch": 0.8195003893579236, + "step": 18285 + }, + { + "loss": 13.9844, + "grad_norm": 1.7351081371307373, + "learning_rate": 0.0005, + "epoch": 0.8197244802491891, + "step": 18290 + }, + { + "loss": 13.8088, + "grad_norm": 1.7942126989364624, + "learning_rate": 0.0005, + "epoch": 0.8199485711404546, + "step": 18295 + }, + { + "loss": 13.8635, + "grad_norm": 1.7644922733306885, + "learning_rate": 0.0005, + "epoch": 0.82017266203172, + "step": 18300 + }, + { + "loss": 13.9958, + "grad_norm": 1.9505988359451294, + "learning_rate": 0.0005, + "epoch": 0.8203967529229855, + "step": 18305 + }, + { + "loss": 13.8548, + "grad_norm": 1.931020975112915, + "learning_rate": 0.0005, + "epoch": 0.820620843814251, + "step": 18310 + }, + { + "loss": 13.9201, + "grad_norm": 1.907620906829834, + "learning_rate": 0.0005, + "epoch": 0.8208449347055166, + "step": 18315 + }, + { + "loss": 13.9423, + "grad_norm": 1.9632011651992798, + "learning_rate": 0.0005, + "epoch": 0.8210690255967821, + "step": 18320 + }, + { + "loss": 13.9251, + "grad_norm": 1.6815967559814453, + "learning_rate": 0.0005, + "epoch": 0.8212931164880476, + "step": 18325 + }, + { + "loss": 13.8988, + "grad_norm": 1.7095072269439697, + "learning_rate": 0.0005, + "epoch": 0.821517207379313, + "step": 18330 + }, + { + "loss": 13.9442, + "grad_norm": 1.7073733806610107, + "learning_rate": 0.0005, + "epoch": 0.8217412982705785, + "step": 18335 + }, + { + "loss": 13.9667, + "grad_norm": 1.9493305683135986, + "learning_rate": 0.0005, + "epoch": 0.821965389161844, + "step": 18340 + }, + { + "loss": 13.8833, + "grad_norm": 1.689464807510376, + "learning_rate": 0.0005, + "epoch": 0.8221894800531095, + "step": 18345 + }, + { + "loss": 13.8829, + "grad_norm": 1.9471354484558105, + "learning_rate": 0.0005, + "epoch": 0.8224135709443751, + "step": 18350 + }, + { + "loss": 13.8999, + "grad_norm": 1.8226585388183594, + "learning_rate": 0.0005, + "epoch": 0.8226376618356406, + "step": 18355 + }, + { + "loss": 13.9831, + "grad_norm": 1.8304983377456665, + "learning_rate": 0.0005, + "epoch": 0.822861752726906, + "step": 18360 + }, + { + "loss": 14.0138, + "grad_norm": 1.7459100484848022, + "learning_rate": 0.0005, + "epoch": 0.8230858436181715, + "step": 18365 + }, + { + "loss": 13.9309, + "grad_norm": 1.6721384525299072, + "learning_rate": 0.0005, + "epoch": 0.823309934509437, + "step": 18370 + }, + { + "loss": 13.9495, + "grad_norm": 1.7700474262237549, + "learning_rate": 0.0005, + "epoch": 0.8235340254007025, + "step": 18375 + }, + { + "loss": 13.9657, + "grad_norm": 1.7054543495178223, + "learning_rate": 0.0005, + "epoch": 0.823758116291968, + "step": 18380 + }, + { + "loss": 14.0339, + "grad_norm": 1.7842026948928833, + "learning_rate": 0.0005, + "epoch": 0.8239822071832336, + "step": 18385 + }, + { + "loss": 13.9015, + "grad_norm": 1.9943057298660278, + "learning_rate": 0.0005, + "epoch": 0.824206298074499, + "step": 18390 + }, + { + "loss": 13.9177, + "grad_norm": 2.0690407752990723, + "learning_rate": 0.0005, + "epoch": 0.8244303889657645, + "step": 18395 + }, + { + "loss": 13.9146, + "grad_norm": 1.7490520477294922, + "learning_rate": 0.0005, + "epoch": 0.82465447985703, + "step": 18400 + }, + { + "loss": 13.8496, + "grad_norm": 1.9926691055297852, + "learning_rate": 0.0005, + "epoch": 0.8248785707482955, + "step": 18405 + }, + { + "loss": 13.9622, + "grad_norm": 1.9529262781143188, + "learning_rate": 0.0005, + "epoch": 0.825102661639561, + "step": 18410 + }, + { + "loss": 13.901, + "grad_norm": 1.7856429815292358, + "learning_rate": 0.0005, + "epoch": 0.8253267525308265, + "step": 18415 + }, + { + "loss": 13.8567, + "grad_norm": 1.7407591342926025, + "learning_rate": 0.0005, + "epoch": 0.825550843422092, + "step": 18420 + }, + { + "loss": 13.8574, + "grad_norm": 1.813948631286621, + "learning_rate": 0.0005, + "epoch": 0.8257749343133575, + "step": 18425 + }, + { + "loss": 13.9195, + "grad_norm": 1.7237603664398193, + "learning_rate": 0.0005, + "epoch": 0.825999025204623, + "step": 18430 + }, + { + "loss": 13.8594, + "grad_norm": 1.8184058666229248, + "learning_rate": 0.0005, + "epoch": 0.8262231160958885, + "step": 18435 + }, + { + "loss": 13.9363, + "grad_norm": 1.6922754049301147, + "learning_rate": 0.0005, + "epoch": 0.826447206987154, + "step": 18440 + }, + { + "loss": 13.9343, + "grad_norm": 1.7327067852020264, + "learning_rate": 0.0005, + "epoch": 0.8266712978784195, + "step": 18445 + }, + { + "loss": 13.8157, + "grad_norm": 1.8410130739212036, + "learning_rate": 0.0005, + "epoch": 0.8268953887696849, + "step": 18450 + }, + { + "loss": 13.8803, + "grad_norm": 1.9221383333206177, + "learning_rate": 0.0005, + "epoch": 0.8271194796609505, + "step": 18455 + }, + { + "loss": 13.9386, + "grad_norm": 1.9344953298568726, + "learning_rate": 0.0005, + "epoch": 0.827343570552216, + "step": 18460 + }, + { + "loss": 13.9093, + "grad_norm": 2.010633707046509, + "learning_rate": 0.0005, + "epoch": 0.8275676614434815, + "step": 18465 + }, + { + "loss": 13.865, + "grad_norm": 1.995612621307373, + "learning_rate": 0.0005, + "epoch": 0.827791752334747, + "step": 18470 + }, + { + "loss": 13.8581, + "grad_norm": 1.7131673097610474, + "learning_rate": 0.0005, + "epoch": 0.8280158432260125, + "step": 18475 + }, + { + "loss": 13.9404, + "grad_norm": 1.925093173980713, + "learning_rate": 0.0005, + "epoch": 0.8282399341172779, + "step": 18480 + }, + { + "loss": 13.927, + "grad_norm": 1.9550200700759888, + "learning_rate": 0.0005, + "epoch": 0.8284640250085434, + "step": 18485 + }, + { + "loss": 13.9704, + "grad_norm": 1.8336318731307983, + "learning_rate": 0.0005, + "epoch": 0.828688115899809, + "step": 18490 + }, + { + "loss": 13.8936, + "grad_norm": 1.737156867980957, + "learning_rate": 0.0005, + "epoch": 0.8289122067910745, + "step": 18495 + }, + { + "loss": 13.8499, + "grad_norm": 1.6031452417373657, + "learning_rate": 0.0005, + "epoch": 0.82913629768234, + "step": 18500 + }, + { + "eval_loss": 1.7327139377593994, + "eval_runtime": 18.9496, + "eval_samples_per_second": 864.608, + "eval_steps_per_second": 7.757, + "epoch": 0.82913629768234, + "step": 18500 + }, + { + "loss": 13.9124, + "grad_norm": 1.787706732749939, + "learning_rate": 0.0005, + "epoch": 0.8293603885736055, + "step": 18505 + }, + { + "loss": 13.8967, + "grad_norm": 1.7352012395858765, + "learning_rate": 0.0005, + "epoch": 0.8295844794648709, + "step": 18510 + }, + { + "loss": 13.9343, + "grad_norm": 1.8264018297195435, + "learning_rate": 0.0005, + "epoch": 0.8298085703561364, + "step": 18515 + }, + { + "loss": 13.9116, + "grad_norm": 1.9214221239089966, + "learning_rate": 0.0005, + "epoch": 0.8300326612474019, + "step": 18520 + }, + { + "loss": 13.8691, + "grad_norm": 1.6791073083877563, + "learning_rate": 0.0005, + "epoch": 0.8302567521386675, + "step": 18525 + }, + { + "loss": 13.9978, + "grad_norm": 1.7033040523529053, + "learning_rate": 0.0005, + "epoch": 0.830480843029933, + "step": 18530 + }, + { + "loss": 13.9676, + "grad_norm": 1.7302173376083374, + "learning_rate": 0.0005, + "epoch": 0.8307049339211985, + "step": 18535 + }, + { + "loss": 13.8587, + "grad_norm": 1.6527957916259766, + "learning_rate": 0.0005, + "epoch": 0.8309290248124639, + "step": 18540 + }, + { + "loss": 14.0066, + "grad_norm": 1.7439963817596436, + "learning_rate": 0.0005, + "epoch": 0.8311531157037294, + "step": 18545 + }, + { + "loss": 13.8333, + "grad_norm": 1.8020864725112915, + "learning_rate": 0.0005, + "epoch": 0.8313772065949949, + "step": 18550 + }, + { + "loss": 13.9397, + "grad_norm": 1.6987053155899048, + "learning_rate": 0.0005, + "epoch": 0.8316012974862604, + "step": 18555 + }, + { + "loss": 13.9715, + "grad_norm": 1.8397094011306763, + "learning_rate": 0.0005, + "epoch": 0.831825388377526, + "step": 18560 + }, + { + "loss": 13.8542, + "grad_norm": 1.8940035104751587, + "learning_rate": 0.0005, + "epoch": 0.8320494792687915, + "step": 18565 + }, + { + "loss": 13.7612, + "grad_norm": 1.703979730606079, + "learning_rate": 0.0005, + "epoch": 0.8322735701600569, + "step": 18570 + }, + { + "loss": 13.8202, + "grad_norm": 1.646026372909546, + "learning_rate": 0.0005, + "epoch": 0.8324976610513224, + "step": 18575 + }, + { + "loss": 13.9224, + "grad_norm": 1.7458852529525757, + "learning_rate": 0.0005, + "epoch": 0.8327217519425879, + "step": 18580 + }, + { + "loss": 13.9089, + "grad_norm": 1.803983449935913, + "learning_rate": 0.0005, + "epoch": 0.8329458428338534, + "step": 18585 + }, + { + "loss": 13.8724, + "grad_norm": 1.8974820375442505, + "learning_rate": 0.0005, + "epoch": 0.8331699337251189, + "step": 18590 + }, + { + "loss": 13.8951, + "grad_norm": 1.6343988180160522, + "learning_rate": 0.0005, + "epoch": 0.8333940246163845, + "step": 18595 + }, + { + "loss": 13.9771, + "grad_norm": 1.8503625392913818, + "learning_rate": 0.0005, + "epoch": 0.8336181155076499, + "step": 18600 + }, + { + "loss": 13.9672, + "grad_norm": 1.7192838191986084, + "learning_rate": 0.0005, + "epoch": 0.8338422063989154, + "step": 18605 + }, + { + "loss": 13.8696, + "grad_norm": 1.7600250244140625, + "learning_rate": 0.0005, + "epoch": 0.8340662972901809, + "step": 18610 + }, + { + "loss": 13.811, + "grad_norm": 1.8360655307769775, + "learning_rate": 0.0005, + "epoch": 0.8342903881814464, + "step": 18615 + }, + { + "loss": 13.8613, + "grad_norm": 1.7966500520706177, + "learning_rate": 0.0005, + "epoch": 0.8345144790727119, + "step": 18620 + }, + { + "loss": 13.9652, + "grad_norm": 1.9188029766082764, + "learning_rate": 0.0005, + "epoch": 0.8347385699639774, + "step": 18625 + }, + { + "loss": 13.7856, + "grad_norm": 1.8910226821899414, + "learning_rate": 0.0005, + "epoch": 0.8349626608552428, + "step": 18630 + }, + { + "loss": 13.8772, + "grad_norm": 1.678817629814148, + "learning_rate": 0.0005, + "epoch": 0.8351867517465084, + "step": 18635 + }, + { + "loss": 13.9314, + "grad_norm": 1.654451608657837, + "learning_rate": 0.0005, + "epoch": 0.8354108426377739, + "step": 18640 + }, + { + "loss": 13.9197, + "grad_norm": 1.702844262123108, + "learning_rate": 0.0005, + "epoch": 0.8356349335290394, + "step": 18645 + }, + { + "loss": 13.8353, + "grad_norm": 1.6565039157867432, + "learning_rate": 0.0005, + "epoch": 0.8358590244203049, + "step": 18650 + }, + { + "loss": 13.8804, + "grad_norm": 1.90591561794281, + "learning_rate": 0.0005, + "epoch": 0.8360831153115704, + "step": 18655 + }, + { + "loss": 13.9052, + "grad_norm": 2.0706241130828857, + "learning_rate": 0.0005, + "epoch": 0.8363072062028358, + "step": 18660 + }, + { + "loss": 13.9134, + "grad_norm": 1.9571928977966309, + "learning_rate": 0.0005, + "epoch": 0.8365312970941013, + "step": 18665 + }, + { + "loss": 13.9123, + "grad_norm": 2.055427074432373, + "learning_rate": 0.0005, + "epoch": 0.8367553879853669, + "step": 18670 + }, + { + "loss": 13.9184, + "grad_norm": 1.6851314306259155, + "learning_rate": 0.0005, + "epoch": 0.8369794788766324, + "step": 18675 + }, + { + "loss": 13.8479, + "grad_norm": 1.6859577894210815, + "learning_rate": 0.0005, + "epoch": 0.8372035697678979, + "step": 18680 + }, + { + "loss": 13.8261, + "grad_norm": 1.812279224395752, + "learning_rate": 0.0005, + "epoch": 0.8374276606591634, + "step": 18685 + }, + { + "loss": 13.9274, + "grad_norm": 1.7130305767059326, + "learning_rate": 0.0005, + "epoch": 0.8376517515504288, + "step": 18690 + }, + { + "loss": 13.8953, + "grad_norm": 1.7281595468521118, + "learning_rate": 0.0005, + "epoch": 0.8378758424416943, + "step": 18695 + }, + { + "loss": 13.8708, + "grad_norm": 1.7609429359436035, + "learning_rate": 0.0005, + "epoch": 0.8380999333329598, + "step": 18700 + }, + { + "loss": 13.9123, + "grad_norm": 1.6971237659454346, + "learning_rate": 0.0005, + "epoch": 0.8383240242242254, + "step": 18705 + }, + { + "loss": 13.8957, + "grad_norm": 1.8378429412841797, + "learning_rate": 0.0005, + "epoch": 0.8385481151154909, + "step": 18710 + }, + { + "loss": 13.8924, + "grad_norm": 1.6270524263381958, + "learning_rate": 0.0005, + "epoch": 0.8387722060067564, + "step": 18715 + }, + { + "loss": 13.9143, + "grad_norm": 1.8351507186889648, + "learning_rate": 0.0005, + "epoch": 0.8389962968980218, + "step": 18720 + }, + { + "loss": 13.9358, + "grad_norm": 1.8356674909591675, + "learning_rate": 0.0005, + "epoch": 0.8392203877892873, + "step": 18725 + }, + { + "loss": 13.8947, + "grad_norm": 1.7781685590744019, + "learning_rate": 0.0005, + "epoch": 0.8394444786805528, + "step": 18730 + }, + { + "loss": 13.9504, + "grad_norm": 1.6525814533233643, + "learning_rate": 0.0005, + "epoch": 0.8396685695718183, + "step": 18735 + }, + { + "loss": 13.9959, + "grad_norm": 1.7032777070999146, + "learning_rate": 0.0005, + "epoch": 0.8398926604630839, + "step": 18740 + }, + { + "loss": 13.9913, + "grad_norm": 1.6849335432052612, + "learning_rate": 0.0005, + "epoch": 0.8401167513543494, + "step": 18745 + }, + { + "loss": 13.9061, + "grad_norm": 1.7574923038482666, + "learning_rate": 0.0005, + "epoch": 0.8403408422456148, + "step": 18750 + }, + { + "loss": 13.9459, + "grad_norm": 1.8186347484588623, + "learning_rate": 0.0005, + "epoch": 0.8405649331368803, + "step": 18755 + }, + { + "loss": 13.8599, + "grad_norm": 1.8368334770202637, + "learning_rate": 0.0005, + "epoch": 0.8407890240281458, + "step": 18760 + }, + { + "loss": 13.9558, + "grad_norm": 1.780190348625183, + "learning_rate": 0.0005, + "epoch": 0.8410131149194113, + "step": 18765 + }, + { + "loss": 13.8318, + "grad_norm": 1.6295452117919922, + "learning_rate": 0.0005, + "epoch": 0.8412372058106768, + "step": 18770 + }, + { + "loss": 13.9218, + "grad_norm": 1.6828653812408447, + "learning_rate": 0.0005, + "epoch": 0.8414612967019424, + "step": 18775 + }, + { + "loss": 13.9613, + "grad_norm": 1.9260400533676147, + "learning_rate": 0.0005, + "epoch": 0.8416853875932078, + "step": 18780 + }, + { + "loss": 13.943, + "grad_norm": 1.6809277534484863, + "learning_rate": 0.0005, + "epoch": 0.8419094784844733, + "step": 18785 + }, + { + "loss": 13.984, + "grad_norm": 1.7654415369033813, + "learning_rate": 0.0005, + "epoch": 0.8421335693757388, + "step": 18790 + }, + { + "loss": 13.8248, + "grad_norm": 1.678613305091858, + "learning_rate": 0.0005, + "epoch": 0.8423576602670043, + "step": 18795 + }, + { + "loss": 13.9117, + "grad_norm": 1.7469117641448975, + "learning_rate": 0.0005, + "epoch": 0.8425817511582698, + "step": 18800 + }, + { + "loss": 13.9719, + "grad_norm": 1.945115566253662, + "learning_rate": 0.0005, + "epoch": 0.8428058420495352, + "step": 18805 + }, + { + "loss": 13.8406, + "grad_norm": 1.745692253112793, + "learning_rate": 0.0005, + "epoch": 0.8430299329408008, + "step": 18810 + }, + { + "loss": 13.9183, + "grad_norm": 1.9122262001037598, + "learning_rate": 0.0005, + "epoch": 0.8432540238320663, + "step": 18815 + }, + { + "loss": 13.9377, + "grad_norm": 1.7762372493743896, + "learning_rate": 0.0005, + "epoch": 0.8434781147233318, + "step": 18820 + }, + { + "loss": 13.8317, + "grad_norm": 1.7507115602493286, + "learning_rate": 0.0005, + "epoch": 0.8437022056145973, + "step": 18825 + }, + { + "loss": 13.9714, + "grad_norm": 1.6686385869979858, + "learning_rate": 0.0005, + "epoch": 0.8439262965058628, + "step": 18830 + }, + { + "loss": 13.8847, + "grad_norm": 1.852190375328064, + "learning_rate": 0.0005, + "epoch": 0.8441503873971282, + "step": 18835 + }, + { + "loss": 13.8739, + "grad_norm": 1.7109217643737793, + "learning_rate": 0.0005, + "epoch": 0.8443744782883937, + "step": 18840 + }, + { + "loss": 13.9082, + "grad_norm": 1.9363154172897339, + "learning_rate": 0.0005, + "epoch": 0.8445985691796593, + "step": 18845 + }, + { + "loss": 13.8858, + "grad_norm": 1.8679718971252441, + "learning_rate": 0.0005, + "epoch": 0.8448226600709248, + "step": 18850 + }, + { + "loss": 13.8217, + "grad_norm": 2.1826541423797607, + "learning_rate": 0.0005, + "epoch": 0.8450467509621903, + "step": 18855 + }, + { + "loss": 13.9356, + "grad_norm": 1.6823848485946655, + "learning_rate": 0.0005, + "epoch": 0.8452708418534558, + "step": 18860 + }, + { + "loss": 13.8828, + "grad_norm": 1.6551467180252075, + "learning_rate": 0.0005, + "epoch": 0.8454949327447212, + "step": 18865 + }, + { + "loss": 13.9097, + "grad_norm": 1.7100468873977661, + "learning_rate": 0.0005, + "epoch": 0.8457190236359867, + "step": 18870 + }, + { + "loss": 13.987, + "grad_norm": 1.6265673637390137, + "learning_rate": 0.0005, + "epoch": 0.8459431145272522, + "step": 18875 + }, + { + "loss": 13.9429, + "grad_norm": 1.8793754577636719, + "learning_rate": 0.0005, + "epoch": 0.8461672054185178, + "step": 18880 + }, + { + "loss": 13.897, + "grad_norm": 1.7657238245010376, + "learning_rate": 0.0005, + "epoch": 0.8463912963097833, + "step": 18885 + }, + { + "loss": 13.9154, + "grad_norm": 1.8032546043395996, + "learning_rate": 0.0005, + "epoch": 0.8466153872010488, + "step": 18890 + }, + { + "loss": 14.0057, + "grad_norm": 1.7182867527008057, + "learning_rate": 0.0005, + "epoch": 0.8468394780923142, + "step": 18895 + }, + { + "loss": 13.8333, + "grad_norm": 1.7613600492477417, + "learning_rate": 0.0005, + "epoch": 0.8470635689835797, + "step": 18900 + }, + { + "loss": 13.9154, + "grad_norm": 1.8472468852996826, + "learning_rate": 0.0005, + "epoch": 0.8472876598748452, + "step": 18905 + }, + { + "loss": 13.8747, + "grad_norm": 1.6938554048538208, + "learning_rate": 0.0005, + "epoch": 0.8475117507661107, + "step": 18910 + }, + { + "loss": 13.9094, + "grad_norm": 1.8218588829040527, + "learning_rate": 0.0005, + "epoch": 0.8477358416573763, + "step": 18915 + }, + { + "loss": 13.91, + "grad_norm": 1.8205779790878296, + "learning_rate": 0.0005, + "epoch": 0.8479599325486418, + "step": 18920 + }, + { + "loss": 14.0384, + "grad_norm": 1.8513416051864624, + "learning_rate": 0.0005, + "epoch": 0.8481840234399072, + "step": 18925 + }, + { + "loss": 13.8546, + "grad_norm": 1.8248463869094849, + "learning_rate": 0.0005, + "epoch": 0.8484081143311727, + "step": 18930 + }, + { + "loss": 13.9754, + "grad_norm": 2.1561028957366943, + "learning_rate": 0.0005, + "epoch": 0.8486322052224382, + "step": 18935 + }, + { + "loss": 13.9083, + "grad_norm": 1.9676803350448608, + "learning_rate": 0.0005, + "epoch": 0.8488562961137037, + "step": 18940 + }, + { + "loss": 13.9145, + "grad_norm": 1.7778428792953491, + "learning_rate": 0.0005, + "epoch": 0.8490803870049692, + "step": 18945 + }, + { + "loss": 13.9913, + "grad_norm": 1.7351329326629639, + "learning_rate": 0.0005, + "epoch": 0.8493044778962348, + "step": 18950 + }, + { + "loss": 13.84, + "grad_norm": 1.831153154373169, + "learning_rate": 0.0005, + "epoch": 0.8495285687875002, + "step": 18955 + }, + { + "loss": 13.8671, + "grad_norm": 1.6546090841293335, + "learning_rate": 0.0005, + "epoch": 0.8497526596787657, + "step": 18960 + }, + { + "loss": 13.9135, + "grad_norm": 1.8007551431655884, + "learning_rate": 0.0005, + "epoch": 0.8499767505700312, + "step": 18965 + }, + { + "loss": 13.9361, + "grad_norm": 2.103647232055664, + "learning_rate": 0.0005, + "epoch": 0.8502008414612967, + "step": 18970 + }, + { + "loss": 13.8892, + "grad_norm": 1.7419698238372803, + "learning_rate": 0.0005, + "epoch": 0.8504249323525622, + "step": 18975 + }, + { + "loss": 13.9764, + "grad_norm": 1.784730076789856, + "learning_rate": 0.0005, + "epoch": 0.8506490232438277, + "step": 18980 + }, + { + "loss": 13.9872, + "grad_norm": 1.8239227533340454, + "learning_rate": 0.0005, + "epoch": 0.8508731141350931, + "step": 18985 + }, + { + "loss": 13.8051, + "grad_norm": 1.6747732162475586, + "learning_rate": 0.0005, + "epoch": 0.8510972050263587, + "step": 18990 + }, + { + "loss": 13.8853, + "grad_norm": 1.812728762626648, + "learning_rate": 0.0005, + "epoch": 0.8513212959176242, + "step": 18995 + }, + { + "loss": 13.9022, + "grad_norm": 1.6415209770202637, + "learning_rate": 0.0005, + "epoch": 0.8515453868088897, + "step": 19000 + }, + { + "eval_loss": 1.7338833808898926, + "eval_runtime": 18.642, + "eval_samples_per_second": 878.875, + "eval_steps_per_second": 7.885, + "epoch": 0.8515453868088897, + "step": 19000 + }, + { + "loss": 13.9902, + "grad_norm": 1.8604235649108887, + "learning_rate": 0.0005, + "epoch": 0.8517694777001552, + "step": 19005 + }, + { + "loss": 13.9708, + "grad_norm": 2.1418986320495605, + "learning_rate": 0.0005, + "epoch": 0.8519935685914207, + "step": 19010 + }, + { + "loss": 13.8404, + "grad_norm": 1.7601839303970337, + "learning_rate": 0.0005, + "epoch": 0.8522176594826861, + "step": 19015 + }, + { + "loss": 13.8326, + "grad_norm": 1.786251187324524, + "learning_rate": 0.0005, + "epoch": 0.8524417503739516, + "step": 19020 + }, + { + "loss": 13.9209, + "grad_norm": 1.8217843770980835, + "learning_rate": 0.0005, + "epoch": 0.8526658412652172, + "step": 19025 + }, + { + "loss": 13.8807, + "grad_norm": 1.7725396156311035, + "learning_rate": 0.0005, + "epoch": 0.8528899321564827, + "step": 19030 + }, + { + "loss": 13.9871, + "grad_norm": 1.8573862314224243, + "learning_rate": 0.0005, + "epoch": 0.8531140230477482, + "step": 19035 + }, + { + "loss": 13.8264, + "grad_norm": 1.7490694522857666, + "learning_rate": 0.0005, + "epoch": 0.8533381139390137, + "step": 19040 + }, + { + "loss": 13.8547, + "grad_norm": 1.8021537065505981, + "learning_rate": 0.0005, + "epoch": 0.8535622048302791, + "step": 19045 + }, + { + "loss": 13.8538, + "grad_norm": 2.0671534538269043, + "learning_rate": 0.0005, + "epoch": 0.8537862957215446, + "step": 19050 + }, + { + "loss": 13.9328, + "grad_norm": 1.8159527778625488, + "learning_rate": 0.0005, + "epoch": 0.8540103866128101, + "step": 19055 + }, + { + "loss": 13.8714, + "grad_norm": 1.8175792694091797, + "learning_rate": 0.0005, + "epoch": 0.8542344775040757, + "step": 19060 + }, + { + "loss": 13.8578, + "grad_norm": 1.8844847679138184, + "learning_rate": 0.0005, + "epoch": 0.8544585683953412, + "step": 19065 + }, + { + "loss": 13.9442, + "grad_norm": 1.9673024415969849, + "learning_rate": 0.0005, + "epoch": 0.8546826592866067, + "step": 19070 + }, + { + "loss": 13.871, + "grad_norm": 1.8300015926361084, + "learning_rate": 0.0005, + "epoch": 0.8549067501778721, + "step": 19075 + }, + { + "loss": 13.913, + "grad_norm": 1.7731380462646484, + "learning_rate": 0.0005, + "epoch": 0.8551308410691376, + "step": 19080 + }, + { + "loss": 14.018, + "grad_norm": 1.9178194999694824, + "learning_rate": 0.0005, + "epoch": 0.8553549319604031, + "step": 19085 + }, + { + "loss": 13.7574, + "grad_norm": 1.7870759963989258, + "learning_rate": 0.0005, + "epoch": 0.8555790228516686, + "step": 19090 + }, + { + "loss": 13.7963, + "grad_norm": 1.7498259544372559, + "learning_rate": 0.0005, + "epoch": 0.8558031137429342, + "step": 19095 + }, + { + "loss": 13.8313, + "grad_norm": 1.940000295639038, + "learning_rate": 0.0005, + "epoch": 0.8560272046341997, + "step": 19100 + }, + { + "loss": 13.858, + "grad_norm": 2.03861403465271, + "learning_rate": 0.0005, + "epoch": 0.8562512955254651, + "step": 19105 + }, + { + "loss": 13.9348, + "grad_norm": 1.7815747261047363, + "learning_rate": 0.0005, + "epoch": 0.8564753864167306, + "step": 19110 + }, + { + "loss": 13.8962, + "grad_norm": 1.8152567148208618, + "learning_rate": 0.0005, + "epoch": 0.8566994773079961, + "step": 19115 + }, + { + "loss": 13.8382, + "grad_norm": 1.733628511428833, + "learning_rate": 0.0005, + "epoch": 0.8569235681992616, + "step": 19120 + }, + { + "loss": 13.7807, + "grad_norm": 1.7825106382369995, + "learning_rate": 0.0005, + "epoch": 0.8571476590905271, + "step": 19125 + }, + { + "loss": 13.9753, + "grad_norm": 1.7949985265731812, + "learning_rate": 0.0005, + "epoch": 0.8573717499817927, + "step": 19130 + }, + { + "loss": 13.9514, + "grad_norm": 1.7800675630569458, + "learning_rate": 0.0005, + "epoch": 0.8575958408730581, + "step": 19135 + }, + { + "loss": 13.8505, + "grad_norm": 1.756445288658142, + "learning_rate": 0.0005, + "epoch": 0.8578199317643236, + "step": 19140 + }, + { + "loss": 13.9917, + "grad_norm": 1.7357224225997925, + "learning_rate": 0.0005, + "epoch": 0.8580440226555891, + "step": 19145 + }, + { + "loss": 13.9566, + "grad_norm": 1.7281850576400757, + "learning_rate": 0.0005, + "epoch": 0.8582681135468546, + "step": 19150 + }, + { + "loss": 13.9108, + "grad_norm": 1.8354952335357666, + "learning_rate": 0.0005, + "epoch": 0.8584922044381201, + "step": 19155 + }, + { + "loss": 13.8971, + "grad_norm": 1.9671531915664673, + "learning_rate": 0.0005, + "epoch": 0.8587162953293856, + "step": 19160 + }, + { + "loss": 13.9922, + "grad_norm": 1.797380805015564, + "learning_rate": 0.0005, + "epoch": 0.858940386220651, + "step": 19165 + }, + { + "loss": 13.8633, + "grad_norm": 1.87890625, + "learning_rate": 0.0005, + "epoch": 0.8591644771119166, + "step": 19170 + }, + { + "loss": 14.0032, + "grad_norm": 1.718583345413208, + "learning_rate": 0.0005, + "epoch": 0.8593885680031821, + "step": 19175 + }, + { + "loss": 13.8237, + "grad_norm": 1.7219116687774658, + "learning_rate": 0.0005, + "epoch": 0.8596126588944476, + "step": 19180 + }, + { + "loss": 13.8988, + "grad_norm": 1.7475054264068604, + "learning_rate": 0.0005, + "epoch": 0.8598367497857131, + "step": 19185 + }, + { + "loss": 13.8977, + "grad_norm": 1.7914988994598389, + "learning_rate": 0.0005, + "epoch": 0.8600608406769786, + "step": 19190 + }, + { + "loss": 13.982, + "grad_norm": 1.9456027746200562, + "learning_rate": 0.0005, + "epoch": 0.860284931568244, + "step": 19195 + }, + { + "loss": 13.9596, + "grad_norm": 1.8214281797409058, + "learning_rate": 0.0005, + "epoch": 0.8605090224595096, + "step": 19200 + }, + { + "loss": 13.9707, + "grad_norm": 1.917678713798523, + "learning_rate": 0.0005, + "epoch": 0.8607331133507751, + "step": 19205 + }, + { + "loss": 13.9555, + "grad_norm": 1.9066157341003418, + "learning_rate": 0.0005, + "epoch": 0.8609572042420406, + "step": 19210 + }, + { + "loss": 13.8718, + "grad_norm": 2.0217795372009277, + "learning_rate": 0.0005, + "epoch": 0.8611812951333061, + "step": 19215 + }, + { + "loss": 13.9169, + "grad_norm": 1.7404314279556274, + "learning_rate": 0.0005, + "epoch": 0.8614053860245716, + "step": 19220 + }, + { + "loss": 13.9322, + "grad_norm": 1.8170093297958374, + "learning_rate": 0.0005, + "epoch": 0.861629476915837, + "step": 19225 + }, + { + "loss": 13.8852, + "grad_norm": 1.9152315855026245, + "learning_rate": 0.0005, + "epoch": 0.8618535678071025, + "step": 19230 + }, + { + "loss": 13.9438, + "grad_norm": 1.834559679031372, + "learning_rate": 0.0005, + "epoch": 0.862077658698368, + "step": 19235 + }, + { + "loss": 13.9153, + "grad_norm": 1.789071798324585, + "learning_rate": 0.0005, + "epoch": 0.8623017495896336, + "step": 19240 + }, + { + "loss": 13.9149, + "grad_norm": 1.931227684020996, + "learning_rate": 0.0005, + "epoch": 0.8625258404808991, + "step": 19245 + }, + { + "loss": 13.8789, + "grad_norm": 1.8586195707321167, + "learning_rate": 0.0005, + "epoch": 0.8627499313721646, + "step": 19250 + }, + { + "loss": 13.9125, + "grad_norm": 1.851481556892395, + "learning_rate": 0.0005, + "epoch": 0.86297402226343, + "step": 19255 + }, + { + "loss": 13.9039, + "grad_norm": 1.7920666933059692, + "learning_rate": 0.0005, + "epoch": 0.8631981131546955, + "step": 19260 + }, + { + "loss": 13.8492, + "grad_norm": 1.775800108909607, + "learning_rate": 0.0005, + "epoch": 0.863422204045961, + "step": 19265 + }, + { + "loss": 13.9419, + "grad_norm": 2.010636329650879, + "learning_rate": 0.0005, + "epoch": 0.8636462949372266, + "step": 19270 + }, + { + "loss": 13.8676, + "grad_norm": 1.778244972229004, + "learning_rate": 0.0005, + "epoch": 0.8638703858284921, + "step": 19275 + }, + { + "loss": 13.8888, + "grad_norm": 1.7896620035171509, + "learning_rate": 0.0005, + "epoch": 0.8640944767197576, + "step": 19280 + }, + { + "loss": 13.7832, + "grad_norm": 1.8602795600891113, + "learning_rate": 0.0005, + "epoch": 0.864318567611023, + "step": 19285 + }, + { + "loss": 13.9504, + "grad_norm": 1.7610522508621216, + "learning_rate": 0.0005, + "epoch": 0.8645426585022885, + "step": 19290 + }, + { + "loss": 13.8595, + "grad_norm": 1.7807914018630981, + "learning_rate": 0.0005, + "epoch": 0.864766749393554, + "step": 19295 + }, + { + "loss": 13.8986, + "grad_norm": 1.550917148590088, + "learning_rate": 0.0005, + "epoch": 0.8649908402848195, + "step": 19300 + }, + { + "loss": 13.9378, + "grad_norm": 1.7979971170425415, + "learning_rate": 0.0005, + "epoch": 0.8652149311760851, + "step": 19305 + }, + { + "loss": 13.936, + "grad_norm": 1.7553660869598389, + "learning_rate": 0.0005, + "epoch": 0.8654390220673506, + "step": 19310 + }, + { + "loss": 13.7785, + "grad_norm": 1.725150465965271, + "learning_rate": 0.0005, + "epoch": 0.865663112958616, + "step": 19315 + }, + { + "loss": 13.8794, + "grad_norm": 2.143718957901001, + "learning_rate": 0.0005, + "epoch": 0.8658872038498815, + "step": 19320 + }, + { + "loss": 13.8788, + "grad_norm": 1.7977149486541748, + "learning_rate": 0.0005, + "epoch": 0.866111294741147, + "step": 19325 + }, + { + "loss": 13.8802, + "grad_norm": 1.9429805278778076, + "learning_rate": 0.0005, + "epoch": 0.8663353856324125, + "step": 19330 + }, + { + "loss": 13.831, + "grad_norm": 1.764933705329895, + "learning_rate": 0.0005, + "epoch": 0.866559476523678, + "step": 19335 + }, + { + "loss": 13.8147, + "grad_norm": 1.8815680742263794, + "learning_rate": 0.0005, + "epoch": 0.8667835674149436, + "step": 19340 + }, + { + "loss": 13.9175, + "grad_norm": 1.7224299907684326, + "learning_rate": 0.0005, + "epoch": 0.867007658306209, + "step": 19345 + }, + { + "loss": 13.86, + "grad_norm": 1.8962249755859375, + "learning_rate": 0.0005, + "epoch": 0.8672317491974745, + "step": 19350 + }, + { + "loss": 13.9039, + "grad_norm": 1.7022427320480347, + "learning_rate": 0.0005, + "epoch": 0.86745584008874, + "step": 19355 + }, + { + "loss": 13.886, + "grad_norm": 1.8437137603759766, + "learning_rate": 0.0005, + "epoch": 0.8676799309800055, + "step": 19360 + }, + { + "loss": 13.9262, + "grad_norm": 1.783321499824524, + "learning_rate": 0.0005, + "epoch": 0.867904021871271, + "step": 19365 + }, + { + "loss": 13.883, + "grad_norm": 1.7349745035171509, + "learning_rate": 0.0005, + "epoch": 0.8681281127625364, + "step": 19370 + }, + { + "loss": 13.8675, + "grad_norm": 1.6686931848526, + "learning_rate": 0.0005, + "epoch": 0.868352203653802, + "step": 19375 + }, + { + "loss": 13.7891, + "grad_norm": 1.6508697271347046, + "learning_rate": 0.0005, + "epoch": 0.8685762945450675, + "step": 19380 + }, + { + "loss": 13.9467, + "grad_norm": 1.7480441331863403, + "learning_rate": 0.0005, + "epoch": 0.868800385436333, + "step": 19385 + }, + { + "loss": 13.9946, + "grad_norm": 1.7600085735321045, + "learning_rate": 0.0005, + "epoch": 0.8690244763275985, + "step": 19390 + }, + { + "loss": 13.8532, + "grad_norm": 1.7637922763824463, + "learning_rate": 0.0005, + "epoch": 0.869248567218864, + "step": 19395 + }, + { + "loss": 13.7949, + "grad_norm": 1.7422178983688354, + "learning_rate": 0.0005, + "epoch": 0.8694726581101294, + "step": 19400 + }, + { + "loss": 13.8172, + "grad_norm": 1.7967084646224976, + "learning_rate": 0.0005, + "epoch": 0.8696967490013949, + "step": 19405 + }, + { + "loss": 13.7484, + "grad_norm": 1.632609486579895, + "learning_rate": 0.0005, + "epoch": 0.8699208398926604, + "step": 19410 + }, + { + "loss": 13.9417, + "grad_norm": 1.8991572856903076, + "learning_rate": 0.0005, + "epoch": 0.870144930783926, + "step": 19415 + }, + { + "loss": 13.9278, + "grad_norm": 1.9792591333389282, + "learning_rate": 0.0005, + "epoch": 0.8703690216751915, + "step": 19420 + }, + { + "loss": 13.9294, + "grad_norm": 1.761389970779419, + "learning_rate": 0.0005, + "epoch": 0.870593112566457, + "step": 19425 + }, + { + "loss": 13.9177, + "grad_norm": 1.7680258750915527, + "learning_rate": 0.0005, + "epoch": 0.8708172034577224, + "step": 19430 + }, + { + "loss": 13.9728, + "grad_norm": 2.0201072692871094, + "learning_rate": 0.0005, + "epoch": 0.8710412943489879, + "step": 19435 + }, + { + "loss": 13.8466, + "grad_norm": 1.7837570905685425, + "learning_rate": 0.0005, + "epoch": 0.8712653852402534, + "step": 19440 + }, + { + "loss": 13.8657, + "grad_norm": 1.7108758687973022, + "learning_rate": 0.0005, + "epoch": 0.871489476131519, + "step": 19445 + }, + { + "loss": 13.916, + "grad_norm": 1.9229105710983276, + "learning_rate": 0.0005, + "epoch": 0.8717135670227845, + "step": 19450 + }, + { + "loss": 13.8466, + "grad_norm": 1.9333982467651367, + "learning_rate": 0.0005, + "epoch": 0.87193765791405, + "step": 19455 + }, + { + "loss": 13.8918, + "grad_norm": 1.7065330743789673, + "learning_rate": 0.0005, + "epoch": 0.8721617488053154, + "step": 19460 + }, + { + "loss": 13.809, + "grad_norm": 2.0091190338134766, + "learning_rate": 0.0005, + "epoch": 0.8723858396965809, + "step": 19465 + }, + { + "loss": 13.9354, + "grad_norm": 1.8350383043289185, + "learning_rate": 0.0005, + "epoch": 0.8726099305878464, + "step": 19470 + }, + { + "loss": 13.8716, + "grad_norm": 1.9058568477630615, + "learning_rate": 0.0005, + "epoch": 0.8728340214791119, + "step": 19475 + }, + { + "loss": 13.8056, + "grad_norm": 1.6378625631332397, + "learning_rate": 0.0005, + "epoch": 0.8730581123703774, + "step": 19480 + }, + { + "loss": 13.8535, + "grad_norm": 1.866127610206604, + "learning_rate": 0.0005, + "epoch": 0.873282203261643, + "step": 19485 + }, + { + "loss": 13.9903, + "grad_norm": 1.8506743907928467, + "learning_rate": 0.0005, + "epoch": 0.8735062941529084, + "step": 19490 + }, + { + "loss": 13.9016, + "grad_norm": 1.6948919296264648, + "learning_rate": 0.0005, + "epoch": 0.8737303850441739, + "step": 19495 + }, + { + "loss": 13.9009, + "grad_norm": 1.6821815967559814, + "learning_rate": 0.0005, + "epoch": 0.8739544759354394, + "step": 19500 + }, + { + "eval_loss": 1.7357780933380127, + "eval_runtime": 18.3496, + "eval_samples_per_second": 892.88, + "eval_steps_per_second": 8.011, + "epoch": 0.8739544759354394, + "step": 19500 + }, + { + "loss": 13.8466, + "grad_norm": 1.8314194679260254, + "learning_rate": 0.0005, + "epoch": 0.8741785668267049, + "step": 19505 + }, + { + "loss": 13.8573, + "grad_norm": 1.7998930215835571, + "learning_rate": 0.0005, + "epoch": 0.8744026577179704, + "step": 19510 + }, + { + "loss": 13.8579, + "grad_norm": 1.8054667711257935, + "learning_rate": 0.0005, + "epoch": 0.874626748609236, + "step": 19515 + }, + { + "loss": 13.8764, + "grad_norm": 1.7818348407745361, + "learning_rate": 0.0005, + "epoch": 0.8748508395005014, + "step": 19520 + }, + { + "loss": 13.887, + "grad_norm": 1.737918734550476, + "learning_rate": 0.0005, + "epoch": 0.8750749303917669, + "step": 19525 + }, + { + "loss": 13.9379, + "grad_norm": 1.8601570129394531, + "learning_rate": 0.0005, + "epoch": 0.8752990212830324, + "step": 19530 + }, + { + "loss": 13.8531, + "grad_norm": 1.8514798879623413, + "learning_rate": 0.0005, + "epoch": 0.8755231121742979, + "step": 19535 + }, + { + "loss": 13.9075, + "grad_norm": 1.7769105434417725, + "learning_rate": 0.0005, + "epoch": 0.8757472030655634, + "step": 19540 + }, + { + "loss": 13.867, + "grad_norm": 1.6772818565368652, + "learning_rate": 0.0005, + "epoch": 0.8759712939568289, + "step": 19545 + }, + { + "loss": 13.9573, + "grad_norm": 1.7507994174957275, + "learning_rate": 0.0005, + "epoch": 0.8761953848480943, + "step": 19550 + }, + { + "loss": 13.8608, + "grad_norm": 1.710281491279602, + "learning_rate": 0.0005, + "epoch": 0.8764194757393599, + "step": 19555 + }, + { + "loss": 13.9944, + "grad_norm": 1.6837866306304932, + "learning_rate": 0.0005, + "epoch": 0.8766435666306254, + "step": 19560 + }, + { + "loss": 13.8669, + "grad_norm": 1.816420078277588, + "learning_rate": 0.0005, + "epoch": 0.8768676575218909, + "step": 19565 + }, + { + "loss": 13.8127, + "grad_norm": 1.7850531339645386, + "learning_rate": 0.0005, + "epoch": 0.8770917484131564, + "step": 19570 + }, + { + "loss": 13.9502, + "grad_norm": 2.1283504962921143, + "learning_rate": 0.0005, + "epoch": 0.8773158393044219, + "step": 19575 + }, + { + "loss": 13.859, + "grad_norm": 1.8324092626571655, + "learning_rate": 0.0005, + "epoch": 0.8775399301956873, + "step": 19580 + }, + { + "loss": 13.8817, + "grad_norm": 1.8633873462677002, + "learning_rate": 0.0005, + "epoch": 0.8777640210869528, + "step": 19585 + }, + { + "loss": 13.9691, + "grad_norm": 1.7599881887435913, + "learning_rate": 0.0005, + "epoch": 0.8779881119782184, + "step": 19590 + }, + { + "loss": 13.8084, + "grad_norm": 1.763671636581421, + "learning_rate": 0.0005, + "epoch": 0.8782122028694839, + "step": 19595 + }, + { + "loss": 13.9648, + "grad_norm": 1.6990044116973877, + "learning_rate": 0.0005, + "epoch": 0.8784362937607494, + "step": 19600 + }, + { + "loss": 13.8874, + "grad_norm": 1.7792152166366577, + "learning_rate": 0.0005, + "epoch": 0.8786603846520149, + "step": 19605 + }, + { + "loss": 13.9478, + "grad_norm": 2.006032705307007, + "learning_rate": 0.0005, + "epoch": 0.8788844755432803, + "step": 19610 + }, + { + "loss": 13.8684, + "grad_norm": 1.5835766792297363, + "learning_rate": 0.0005, + "epoch": 0.8791085664345458, + "step": 19615 + }, + { + "loss": 13.977, + "grad_norm": 1.6947407722473145, + "learning_rate": 0.0005, + "epoch": 0.8793326573258113, + "step": 19620 + }, + { + "loss": 13.8611, + "grad_norm": 1.629675269126892, + "learning_rate": 0.0005, + "epoch": 0.8795567482170769, + "step": 19625 + }, + { + "loss": 13.9137, + "grad_norm": 1.6017608642578125, + "learning_rate": 0.0005, + "epoch": 0.8797808391083424, + "step": 19630 + }, + { + "loss": 13.8787, + "grad_norm": 1.7836179733276367, + "learning_rate": 0.0005, + "epoch": 0.8800049299996079, + "step": 19635 + }, + { + "loss": 13.8423, + "grad_norm": 1.722022294998169, + "learning_rate": 0.0005, + "epoch": 0.8802290208908733, + "step": 19640 + }, + { + "loss": 13.9003, + "grad_norm": 1.904645562171936, + "learning_rate": 0.0005, + "epoch": 0.8804531117821388, + "step": 19645 + }, + { + "loss": 13.8872, + "grad_norm": 1.8378710746765137, + "learning_rate": 0.0005, + "epoch": 0.8806772026734043, + "step": 19650 + }, + { + "loss": 13.876, + "grad_norm": 1.9562255144119263, + "learning_rate": 0.0005, + "epoch": 0.8809012935646698, + "step": 19655 + }, + { + "loss": 13.957, + "grad_norm": 1.8990522623062134, + "learning_rate": 0.0005, + "epoch": 0.8811253844559354, + "step": 19660 + }, + { + "loss": 13.8762, + "grad_norm": 1.7711070775985718, + "learning_rate": 0.0005, + "epoch": 0.8813494753472009, + "step": 19665 + }, + { + "loss": 13.7974, + "grad_norm": 1.7412478923797607, + "learning_rate": 0.0005, + "epoch": 0.8815735662384663, + "step": 19670 + }, + { + "loss": 13.894, + "grad_norm": 1.8522121906280518, + "learning_rate": 0.0005, + "epoch": 0.8817976571297318, + "step": 19675 + }, + { + "loss": 13.8729, + "grad_norm": 1.6800007820129395, + "learning_rate": 0.0005, + "epoch": 0.8820217480209973, + "step": 19680 + }, + { + "loss": 13.8689, + "grad_norm": 1.7280434370040894, + "learning_rate": 0.0005, + "epoch": 0.8822458389122628, + "step": 19685 + }, + { + "loss": 13.8875, + "grad_norm": 1.8460952043533325, + "learning_rate": 0.0005, + "epoch": 0.8824699298035283, + "step": 19690 + }, + { + "loss": 14.0027, + "grad_norm": 1.9587526321411133, + "learning_rate": 0.0005, + "epoch": 0.8826940206947939, + "step": 19695 + }, + { + "loss": 13.7925, + "grad_norm": 1.8540096282958984, + "learning_rate": 0.0005, + "epoch": 0.8829181115860593, + "step": 19700 + }, + { + "loss": 13.8576, + "grad_norm": 1.955402135848999, + "learning_rate": 0.0005, + "epoch": 0.8831422024773248, + "step": 19705 + }, + { + "loss": 13.8973, + "grad_norm": 1.7868638038635254, + "learning_rate": 0.0005, + "epoch": 0.8833662933685903, + "step": 19710 + }, + { + "loss": 13.9396, + "grad_norm": 1.7307571172714233, + "learning_rate": 0.0005, + "epoch": 0.8835903842598558, + "step": 19715 + }, + { + "loss": 13.8281, + "grad_norm": 1.8804699182510376, + "learning_rate": 0.0005, + "epoch": 0.8838144751511213, + "step": 19720 + }, + { + "loss": 13.8287, + "grad_norm": 1.7727694511413574, + "learning_rate": 0.0005, + "epoch": 0.8840385660423868, + "step": 19725 + }, + { + "loss": 13.9586, + "grad_norm": 1.6995322704315186, + "learning_rate": 0.0005, + "epoch": 0.8842626569336522, + "step": 19730 + }, + { + "loss": 13.9179, + "grad_norm": 1.7488749027252197, + "learning_rate": 0.0005, + "epoch": 0.8844867478249178, + "step": 19735 + }, + { + "loss": 13.8479, + "grad_norm": 1.7972700595855713, + "learning_rate": 0.0005, + "epoch": 0.8847108387161833, + "step": 19740 + }, + { + "loss": 13.7467, + "grad_norm": 1.8058732748031616, + "learning_rate": 0.0005, + "epoch": 0.8849349296074488, + "step": 19745 + }, + { + "loss": 13.8238, + "grad_norm": 1.9566853046417236, + "learning_rate": 0.0005, + "epoch": 0.8851590204987143, + "step": 19750 + }, + { + "loss": 13.8789, + "grad_norm": 1.8107908964157104, + "learning_rate": 0.0005, + "epoch": 0.8853831113899798, + "step": 19755 + }, + { + "loss": 13.9075, + "grad_norm": 1.7433689832687378, + "learning_rate": 0.0005, + "epoch": 0.8856072022812452, + "step": 19760 + }, + { + "loss": 13.8934, + "grad_norm": 1.682373285293579, + "learning_rate": 0.0005, + "epoch": 0.8858312931725107, + "step": 19765 + }, + { + "loss": 13.8291, + "grad_norm": 1.7980401515960693, + "learning_rate": 0.0005, + "epoch": 0.8860553840637763, + "step": 19770 + }, + { + "loss": 13.8356, + "grad_norm": 1.777256965637207, + "learning_rate": 0.0005, + "epoch": 0.8862794749550418, + "step": 19775 + }, + { + "loss": 13.9096, + "grad_norm": 1.7960213422775269, + "learning_rate": 0.0005, + "epoch": 0.8865035658463073, + "step": 19780 + }, + { + "loss": 13.9526, + "grad_norm": 1.7837116718292236, + "learning_rate": 0.0005, + "epoch": 0.8867276567375728, + "step": 19785 + }, + { + "loss": 13.8384, + "grad_norm": 1.8213714361190796, + "learning_rate": 0.0005, + "epoch": 0.8869517476288382, + "step": 19790 + }, + { + "loss": 13.8886, + "grad_norm": 1.6740883588790894, + "learning_rate": 0.0005, + "epoch": 0.8871758385201037, + "step": 19795 + }, + { + "loss": 13.8911, + "grad_norm": 1.9716994762420654, + "learning_rate": 0.0005, + "epoch": 0.8873999294113692, + "step": 19800 + }, + { + "loss": 13.963, + "grad_norm": 1.802721381187439, + "learning_rate": 0.0005, + "epoch": 0.8876240203026348, + "step": 19805 + }, + { + "loss": 14.0038, + "grad_norm": 2.034996509552002, + "learning_rate": 0.0005, + "epoch": 0.8878481111939003, + "step": 19810 + }, + { + "loss": 13.7372, + "grad_norm": 1.8164931535720825, + "learning_rate": 0.0005, + "epoch": 0.8880722020851658, + "step": 19815 + }, + { + "loss": 13.9128, + "grad_norm": 1.8093912601470947, + "learning_rate": 0.0005, + "epoch": 0.8882962929764312, + "step": 19820 + }, + { + "loss": 13.8272, + "grad_norm": 1.724300503730774, + "learning_rate": 0.0005, + "epoch": 0.8885203838676967, + "step": 19825 + }, + { + "loss": 13.9372, + "grad_norm": 1.9092543125152588, + "learning_rate": 0.0005, + "epoch": 0.8887444747589622, + "step": 19830 + }, + { + "loss": 13.867, + "grad_norm": 1.949688196182251, + "learning_rate": 0.0005, + "epoch": 0.8889685656502277, + "step": 19835 + }, + { + "loss": 13.9037, + "grad_norm": 1.7329602241516113, + "learning_rate": 0.0005, + "epoch": 0.8891926565414933, + "step": 19840 + }, + { + "loss": 13.8886, + "grad_norm": 1.8665833473205566, + "learning_rate": 0.0005, + "epoch": 0.8894167474327588, + "step": 19845 + }, + { + "loss": 13.8636, + "grad_norm": 1.697461724281311, + "learning_rate": 0.0005, + "epoch": 0.8896408383240242, + "step": 19850 + }, + { + "loss": 13.8087, + "grad_norm": 1.8883639574050903, + "learning_rate": 0.0005, + "epoch": 0.8898649292152897, + "step": 19855 + }, + { + "loss": 13.9671, + "grad_norm": 1.902814269065857, + "learning_rate": 0.0005, + "epoch": 0.8900890201065552, + "step": 19860 + }, + { + "loss": 13.8051, + "grad_norm": 1.8618606328964233, + "learning_rate": 0.0005, + "epoch": 0.8903131109978207, + "step": 19865 + }, + { + "loss": 13.9258, + "grad_norm": 1.9716463088989258, + "learning_rate": 0.0005, + "epoch": 0.8905372018890862, + "step": 19870 + }, + { + "loss": 13.8961, + "grad_norm": 1.9195139408111572, + "learning_rate": 0.0005, + "epoch": 0.8907612927803518, + "step": 19875 + }, + { + "loss": 13.8617, + "grad_norm": 2.006978750228882, + "learning_rate": 0.0005, + "epoch": 0.8909853836716172, + "step": 19880 + }, + { + "loss": 13.8008, + "grad_norm": 1.9325141906738281, + "learning_rate": 0.0005, + "epoch": 0.8912094745628827, + "step": 19885 + }, + { + "loss": 13.9499, + "grad_norm": 2.0363314151763916, + "learning_rate": 0.0005, + "epoch": 0.8914335654541482, + "step": 19890 + }, + { + "loss": 13.9364, + "grad_norm": 1.7569103240966797, + "learning_rate": 0.0005, + "epoch": 0.8916576563454137, + "step": 19895 + }, + { + "loss": 13.8901, + "grad_norm": 1.6593209505081177, + "learning_rate": 0.0005, + "epoch": 0.8918817472366792, + "step": 19900 + }, + { + "loss": 13.879, + "grad_norm": 1.7446542978286743, + "learning_rate": 0.0005, + "epoch": 0.8921058381279446, + "step": 19905 + }, + { + "loss": 13.8354, + "grad_norm": 1.8931866884231567, + "learning_rate": 0.0005, + "epoch": 0.8923299290192102, + "step": 19910 + }, + { + "loss": 13.7994, + "grad_norm": 1.8957358598709106, + "learning_rate": 0.0005, + "epoch": 0.8925540199104757, + "step": 19915 + }, + { + "loss": 13.7895, + "grad_norm": 1.6988424062728882, + "learning_rate": 0.0005, + "epoch": 0.8927781108017412, + "step": 19920 + }, + { + "loss": 13.8562, + "grad_norm": 1.9120550155639648, + "learning_rate": 0.0005, + "epoch": 0.8930022016930067, + "step": 19925 + }, + { + "loss": 13.9294, + "grad_norm": 1.809348702430725, + "learning_rate": 0.0005, + "epoch": 0.8932262925842722, + "step": 19930 + }, + { + "loss": 13.8896, + "grad_norm": 1.9009032249450684, + "learning_rate": 0.0005, + "epoch": 0.8934503834755376, + "step": 19935 + }, + { + "loss": 13.8517, + "grad_norm": 1.8128573894500732, + "learning_rate": 0.0005, + "epoch": 0.8936744743668031, + "step": 19940 + }, + { + "loss": 13.8585, + "grad_norm": 1.8379244804382324, + "learning_rate": 0.0005, + "epoch": 0.8938985652580687, + "step": 19945 + }, + { + "loss": 13.8022, + "grad_norm": 1.6590509414672852, + "learning_rate": 0.0005, + "epoch": 0.8941226561493342, + "step": 19950 + }, + { + "loss": 13.8558, + "grad_norm": 1.6452158689498901, + "learning_rate": 0.0005, + "epoch": 0.8943467470405997, + "step": 19955 + }, + { + "loss": 13.914, + "grad_norm": 1.834581971168518, + "learning_rate": 0.0005, + "epoch": 0.8945708379318652, + "step": 19960 + }, + { + "loss": 13.9026, + "grad_norm": 1.8982086181640625, + "learning_rate": 0.0005, + "epoch": 0.8947949288231306, + "step": 19965 + }, + { + "loss": 13.9778, + "grad_norm": 1.9215788841247559, + "learning_rate": 0.0005, + "epoch": 0.8950190197143961, + "step": 19970 + }, + { + "loss": 13.8827, + "grad_norm": 1.979630470275879, + "learning_rate": 0.0005, + "epoch": 0.8952431106056616, + "step": 19975 + }, + { + "loss": 13.9766, + "grad_norm": 1.9976118803024292, + "learning_rate": 0.0005, + "epoch": 0.8954672014969272, + "step": 19980 + }, + { + "loss": 13.8556, + "grad_norm": 1.787476658821106, + "learning_rate": 0.0005, + "epoch": 0.8956912923881927, + "step": 19985 + }, + { + "loss": 13.8811, + "grad_norm": 1.778098464012146, + "learning_rate": 0.0005, + "epoch": 0.8959153832794582, + "step": 19990 + }, + { + "loss": 13.9134, + "grad_norm": 2.0431041717529297, + "learning_rate": 0.0005, + "epoch": 0.8961394741707236, + "step": 19995 + }, + { + "loss": 13.8592, + "grad_norm": 1.6326873302459717, + "learning_rate": 0.0005, + "epoch": 0.8963635650619891, + "step": 20000 + }, + { + "eval_loss": 1.7306761741638184, + "eval_runtime": 18.5601, + "eval_samples_per_second": 882.752, + "eval_steps_per_second": 7.92, + "epoch": 0.8963635650619891, + "step": 20000 + }, + { + "loss": 13.8888, + "grad_norm": 1.5741008520126343, + "learning_rate": 0.0005, + "epoch": 0.8965876559532546, + "step": 20005 + }, + { + "loss": 13.813, + "grad_norm": 1.871200442314148, + "learning_rate": 0.0005, + "epoch": 0.8968117468445201, + "step": 20010 + }, + { + "loss": 13.879, + "grad_norm": 2.068232536315918, + "learning_rate": 0.0005, + "epoch": 0.8970358377357857, + "step": 20015 + }, + { + "loss": 14.0274, + "grad_norm": 1.8000730276107788, + "learning_rate": 0.0005, + "epoch": 0.8972599286270512, + "step": 20020 + }, + { + "loss": 13.8916, + "grad_norm": 1.913434624671936, + "learning_rate": 0.0005, + "epoch": 0.8974840195183166, + "step": 20025 + }, + { + "loss": 13.8162, + "grad_norm": 1.6789735555648804, + "learning_rate": 0.0005, + "epoch": 0.8977081104095821, + "step": 20030 + }, + { + "loss": 13.8043, + "grad_norm": 1.673842191696167, + "learning_rate": 0.0005, + "epoch": 0.8979322013008476, + "step": 20035 + }, + { + "loss": 13.9092, + "grad_norm": 1.6753332614898682, + "learning_rate": 0.0005, + "epoch": 0.8981562921921131, + "step": 20040 + }, + { + "loss": 13.8253, + "grad_norm": 1.6415342092514038, + "learning_rate": 0.0005, + "epoch": 0.8983803830833786, + "step": 20045 + }, + { + "loss": 13.8676, + "grad_norm": 1.6949872970581055, + "learning_rate": 0.0005, + "epoch": 0.8986044739746442, + "step": 20050 + }, + { + "loss": 13.8147, + "grad_norm": 1.6152931451797485, + "learning_rate": 0.0005, + "epoch": 0.8988285648659096, + "step": 20055 + }, + { + "loss": 13.9099, + "grad_norm": 1.5995293855667114, + "learning_rate": 0.0005, + "epoch": 0.8990526557571751, + "step": 20060 + }, + { + "loss": 13.9379, + "grad_norm": 1.7555081844329834, + "learning_rate": 0.0005, + "epoch": 0.8992767466484406, + "step": 20065 + }, + { + "loss": 13.8903, + "grad_norm": 1.837789535522461, + "learning_rate": 0.0005, + "epoch": 0.8995008375397061, + "step": 20070 + }, + { + "loss": 13.9749, + "grad_norm": 1.7679381370544434, + "learning_rate": 0.0005, + "epoch": 0.8997249284309716, + "step": 20075 + }, + { + "loss": 13.992, + "grad_norm": 1.783341884613037, + "learning_rate": 0.0005, + "epoch": 0.8999490193222371, + "step": 20080 + }, + { + "loss": 13.8361, + "grad_norm": 1.754961371421814, + "learning_rate": 0.0004999977712403221, + "epoch": 0.9001731102135025, + "step": 20085 + }, + { + "loss": 13.9087, + "grad_norm": 1.792695164680481, + "learning_rate": 0.000499984151186201, + "epoch": 0.9003972011047681, + "step": 20090 + }, + { + "loss": 13.8374, + "grad_norm": 1.7169972658157349, + "learning_rate": 0.0004999581499515344, + "epoch": 0.9006212919960336, + "step": 20095 + }, + { + "loss": 13.9435, + "grad_norm": 1.7526806592941284, + "learning_rate": 0.0004999197688241076, + "epoch": 0.9008453828872991, + "step": 20100 + }, + { + "loss": 13.8647, + "grad_norm": 1.7318717241287231, + "learning_rate": 0.0004998690097048561, + "epoch": 0.9010694737785646, + "step": 20105 + }, + { + "loss": 13.9183, + "grad_norm": 1.895723581314087, + "learning_rate": 0.0004998058751077704, + "epoch": 0.9012935646698301, + "step": 20110 + }, + { + "loss": 13.9082, + "grad_norm": 1.8175562620162964, + "learning_rate": 0.0004997303681597721, + "epoch": 0.9015176555610955, + "step": 20115 + }, + { + "loss": 13.8523, + "grad_norm": 1.7849726676940918, + "learning_rate": 0.000499642492600559, + "epoch": 0.901741746452361, + "step": 20120 + }, + { + "loss": 13.8472, + "grad_norm": 1.8542882204055786, + "learning_rate": 0.0004995422527824195, + "epoch": 0.9019658373436266, + "step": 20125 + }, + { + "loss": 13.9467, + "grad_norm": 1.7311253547668457, + "learning_rate": 0.0004994296536700177, + "epoch": 0.9021899282348921, + "step": 20130 + }, + { + "loss": 13.9977, + "grad_norm": 1.6828018426895142, + "learning_rate": 0.0004993047008401468, + "epoch": 0.9024140191261576, + "step": 20135 + }, + { + "loss": 13.8359, + "grad_norm": 1.7294998168945312, + "learning_rate": 0.0004991674004814531, + "epoch": 0.9026381100174231, + "step": 20140 + }, + { + "loss": 13.9296, + "grad_norm": 1.7701478004455566, + "learning_rate": 0.0004990177593941303, + "epoch": 0.9028622009086885, + "step": 20145 + }, + { + "loss": 13.8692, + "grad_norm": 1.8345767259597778, + "learning_rate": 0.000498855784989581, + "epoch": 0.903086291799954, + "step": 20150 + }, + { + "loss": 13.8737, + "grad_norm": 1.6233254671096802, + "learning_rate": 0.0004986814852900517, + "epoch": 0.9033103826912195, + "step": 20155 + }, + { + "loss": 13.8891, + "grad_norm": 1.671451449394226, + "learning_rate": 0.0004984948689282333, + "epoch": 0.9035344735824851, + "step": 20160 + }, + { + "loss": 13.8339, + "grad_norm": 1.6069334745407104, + "learning_rate": 0.0004982959451468356, + "epoch": 0.9037585644737506, + "step": 20165 + }, + { + "loss": 13.898, + "grad_norm": 1.6981641054153442, + "learning_rate": 0.0004980847237981281, + "epoch": 0.9039826553650161, + "step": 20170 + }, + { + "loss": 13.842, + "grad_norm": 1.7139631509780884, + "learning_rate": 0.0004978612153434526, + "epoch": 0.9042067462562815, + "step": 20175 + }, + { + "loss": 13.9396, + "grad_norm": 1.597270131111145, + "learning_rate": 0.000497625430852705, + "epoch": 0.904430837147547, + "step": 20180 + }, + { + "loss": 13.8921, + "grad_norm": 1.760495901107788, + "learning_rate": 0.000497377382003787, + "epoch": 0.9046549280388125, + "step": 20185 + }, + { + "loss": 13.8138, + "grad_norm": 1.900946021080017, + "learning_rate": 0.0004971170810820279, + "epoch": 0.904879018930078, + "step": 20190 + }, + { + "loss": 13.9918, + "grad_norm": 1.8058969974517822, + "learning_rate": 0.0004968445409795756, + "epoch": 0.9051031098213436, + "step": 20195 + }, + { + "loss": 13.8749, + "grad_norm": 1.6940749883651733, + "learning_rate": 0.0004965597751947589, + "epoch": 0.9053272007126091, + "step": 20200 + }, + { + "loss": 13.9152, + "grad_norm": 1.9464482069015503, + "learning_rate": 0.0004962627978314181, + "epoch": 0.9055512916038745, + "step": 20205 + }, + { + "loss": 13.9305, + "grad_norm": 1.7196178436279297, + "learning_rate": 0.0004959536235982073, + "epoch": 0.90577538249514, + "step": 20210 + }, + { + "loss": 13.9398, + "grad_norm": 1.7908014059066772, + "learning_rate": 0.000495632267807865, + "epoch": 0.9059994733864055, + "step": 20215 + }, + { + "loss": 13.9284, + "grad_norm": 1.6106112003326416, + "learning_rate": 0.0004952987463764568, + "epoch": 0.906223564277671, + "step": 20220 + }, + { + "loss": 13.8344, + "grad_norm": 1.6919456720352173, + "learning_rate": 0.0004949530758225857, + "epoch": 0.9064476551689366, + "step": 20225 + }, + { + "loss": 13.9635, + "grad_norm": 1.7031246423721313, + "learning_rate": 0.0004945952732665755, + "epoch": 0.9066717460602021, + "step": 20230 + }, + { + "loss": 13.7976, + "grad_norm": 1.6084965467453003, + "learning_rate": 0.0004942253564296218, + "epoch": 0.9068958369514675, + "step": 20235 + }, + { + "loss": 13.8327, + "grad_norm": 1.846540093421936, + "learning_rate": 0.0004938433436329145, + "epoch": 0.907119927842733, + "step": 20240 + }, + { + "loss": 13.8758, + "grad_norm": 1.7569811344146729, + "learning_rate": 0.0004934492537967308, + "epoch": 0.9073440187339985, + "step": 20245 + }, + { + "loss": 13.9523, + "grad_norm": 1.7285062074661255, + "learning_rate": 0.0004930431064394977, + "epoch": 0.907568109625264, + "step": 20250 + }, + { + "loss": 13.9325, + "grad_norm": 1.881080985069275, + "learning_rate": 0.0004926249216768255, + "epoch": 0.9077922005165295, + "step": 20255 + }, + { + "loss": 13.9285, + "grad_norm": 1.8552517890930176, + "learning_rate": 0.0004921947202205112, + "epoch": 0.908016291407795, + "step": 20260 + }, + { + "loss": 13.9245, + "grad_norm": 1.9606412649154663, + "learning_rate": 0.0004917525233775137, + "epoch": 0.9082403822990605, + "step": 20265 + }, + { + "loss": 13.8919, + "grad_norm": 1.70395827293396, + "learning_rate": 0.0004912983530488966, + "epoch": 0.908464473190326, + "step": 20270 + }, + { + "loss": 13.8468, + "grad_norm": 1.7387700080871582, + "learning_rate": 0.0004908322317287456, + "epoch": 0.9086885640815915, + "step": 20275 + }, + { + "loss": 13.8912, + "grad_norm": 1.8163692951202393, + "learning_rate": 0.0004903541825030532, + "epoch": 0.908912654972857, + "step": 20280 + }, + { + "loss": 13.8181, + "grad_norm": 1.732064127922058, + "learning_rate": 0.0004898642290485751, + "epoch": 0.9091367458641225, + "step": 20285 + }, + { + "loss": 13.8886, + "grad_norm": 1.7527867555618286, + "learning_rate": 0.0004893623956316589, + "epoch": 0.909360836755388, + "step": 20290 + }, + { + "loss": 14.0081, + "grad_norm": 1.914961338043213, + "learning_rate": 0.0004888487071070405, + "epoch": 0.9095849276466534, + "step": 20295 + }, + { + "loss": 13.8787, + "grad_norm": 1.7557860612869263, + "learning_rate": 0.0004883231889166143, + "epoch": 0.909809018537919, + "step": 20300 + }, + { + "loss": 13.8582, + "grad_norm": 1.670168399810791, + "learning_rate": 0.00048778586708817277, + "epoch": 0.9100331094291845, + "step": 20305 + }, + { + "loss": 13.8668, + "grad_norm": 1.6830042600631714, + "learning_rate": 0.00048723676823411727, + "epoch": 0.91025720032045, + "step": 20310 + }, + { + "loss": 13.8074, + "grad_norm": 1.8048717975616455, + "learning_rate": 0.00048667591955014013, + "epoch": 0.9104812912117155, + "step": 20315 + }, + { + "loss": 13.9116, + "grad_norm": 1.7091537714004517, + "learning_rate": 0.0004861033488138774, + "epoch": 0.910705382102981, + "step": 20320 + }, + { + "loss": 13.9951, + "grad_norm": 1.7124671936035156, + "learning_rate": 0.00048551908438353375, + "epoch": 0.9109294729942464, + "step": 20325 + }, + { + "loss": 13.8583, + "grad_norm": 1.6607993841171265, + "learning_rate": 0.0004849231551964771, + "epoch": 0.9111535638855119, + "step": 20330 + }, + { + "loss": 13.8799, + "grad_norm": 1.7443516254425049, + "learning_rate": 0.00048431559076780607, + "epoch": 0.9113776547767775, + "step": 20335 + }, + { + "loss": 13.8942, + "grad_norm": 1.738348126411438, + "learning_rate": 0.0004836964211888878, + "epoch": 0.911601745668043, + "step": 20340 + }, + { + "loss": 13.791, + "grad_norm": 1.6687777042388916, + "learning_rate": 0.0004830656771258677, + "epoch": 0.9118258365593085, + "step": 20345 + }, + { + "loss": 13.852, + "grad_norm": 1.673667550086975, + "learning_rate": 0.00048242338981815085, + "epoch": 0.912049927450574, + "step": 20350 + }, + { + "loss": 13.8364, + "grad_norm": 1.7567943334579468, + "learning_rate": 0.00048176959107685435, + "epoch": 0.9122740183418394, + "step": 20355 + }, + { + "loss": 13.8419, + "grad_norm": 1.7680989503860474, + "learning_rate": 0.000481104313283232, + "epoch": 0.9124981092331049, + "step": 20360 + }, + { + "loss": 13.7604, + "grad_norm": 1.6283504962921143, + "learning_rate": 0.0004804275893870704, + "epoch": 0.9127222001243704, + "step": 20365 + }, + { + "loss": 13.8656, + "grad_norm": 1.7220295667648315, + "learning_rate": 0.00047973945290505766, + "epoch": 0.912946291015636, + "step": 20370 + }, + { + "loss": 13.8799, + "grad_norm": 1.7215722799301147, + "learning_rate": 0.00047903993791912226, + "epoch": 0.9131703819069015, + "step": 20375 + }, + { + "loss": 13.8281, + "grad_norm": 1.8215144872665405, + "learning_rate": 0.000478329079074746, + "epoch": 0.913394472798167, + "step": 20380 + }, + { + "loss": 13.8491, + "grad_norm": 1.7994507551193237, + "learning_rate": 0.0004776069115792475, + "epoch": 0.9136185636894324, + "step": 20385 + }, + { + "loss": 13.8475, + "grad_norm": 1.7810310125350952, + "learning_rate": 0.000476873471200039, + "epoch": 0.9138426545806979, + "step": 20390 + }, + { + "loss": 13.8628, + "grad_norm": 1.8750712871551514, + "learning_rate": 0.00047612879426285425, + "epoch": 0.9140667454719634, + "step": 20395 + }, + { + "loss": 14.005, + "grad_norm": 1.9540858268737793, + "learning_rate": 0.00047537291764995006, + "epoch": 0.9142908363632289, + "step": 20400 + }, + { + "loss": 13.766, + "grad_norm": 1.9318475723266602, + "learning_rate": 0.0004746058787982788, + "epoch": 0.9145149272544945, + "step": 20405 + }, + { + "loss": 13.8293, + "grad_norm": 1.6670743227005005, + "learning_rate": 0.00047382771569763485, + "epoch": 0.91473901814576, + "step": 20410 + }, + { + "loss": 13.9367, + "grad_norm": 1.6447280645370483, + "learning_rate": 0.000473038466888773, + "epoch": 0.9149631090370254, + "step": 20415 + }, + { + "loss": 13.784, + "grad_norm": 1.9621039628982544, + "learning_rate": 0.0004722381714614994, + "epoch": 0.9151871999282909, + "step": 20420 + }, + { + "loss": 13.9344, + "grad_norm": 1.5999281406402588, + "learning_rate": 0.00047142686905273537, + "epoch": 0.9154112908195564, + "step": 20425 + }, + { + "loss": 13.8596, + "grad_norm": 1.761801838874817, + "learning_rate": 0.0004706045998445548, + "epoch": 0.9156353817108219, + "step": 20430 + }, + { + "loss": 13.7922, + "grad_norm": 1.7019723653793335, + "learning_rate": 0.0004697714045621935, + "epoch": 0.9158594726020874, + "step": 20435 + }, + { + "loss": 13.7741, + "grad_norm": 1.755383014678955, + "learning_rate": 0.0004689273244720325, + "epoch": 0.916083563493353, + "step": 20440 + }, + { + "loss": 13.9154, + "grad_norm": 1.742811918258667, + "learning_rate": 0.000468072401379554, + "epoch": 0.9163076543846184, + "step": 20445 + }, + { + "loss": 13.8688, + "grad_norm": 1.6975293159484863, + "learning_rate": 0.000467206677627271, + "epoch": 0.9165317452758839, + "step": 20450 + }, + { + "loss": 13.817, + "grad_norm": 1.7448455095291138, + "learning_rate": 0.00046633019609262997, + "epoch": 0.9167558361671494, + "step": 20455 + }, + { + "loss": 13.8818, + "grad_norm": 1.705538034439087, + "learning_rate": 0.00046544300018588745, + "epoch": 0.9169799270584149, + "step": 20460 + }, + { + "loss": 13.8366, + "grad_norm": 1.7605187892913818, + "learning_rate": 0.00046454513384795986, + "epoch": 0.9172040179496804, + "step": 20465 + }, + { + "loss": 13.932, + "grad_norm": 1.904154896736145, + "learning_rate": 0.0004636366415482474, + "epoch": 0.9174281088409458, + "step": 20470 + }, + { + "loss": 13.7937, + "grad_norm": 1.7890197038650513, + "learning_rate": 0.00046271756828243117, + "epoch": 0.9176521997322113, + "step": 20475 + }, + { + "loss": 13.8559, + "grad_norm": 1.8883311748504639, + "learning_rate": 0.0004617879595702452, + "epoch": 0.9178762906234769, + "step": 20480 + }, + { + "loss": 14.0243, + "grad_norm": 1.725429654121399, + "learning_rate": 0.00046084786145322143, + "epoch": 0.9181003815147424, + "step": 20485 + }, + { + "loss": 13.8569, + "grad_norm": 1.7402434349060059, + "learning_rate": 0.00045989732049240976, + "epoch": 0.9183244724060079, + "step": 20490 + }, + { + "loss": 13.85, + "grad_norm": 1.7885849475860596, + "learning_rate": 0.0004589363837660716, + "epoch": 0.9185485632972734, + "step": 20495 + }, + { + "loss": 13.8368, + "grad_norm": 1.859695315361023, + "learning_rate": 0.0004579650988673487, + "epoch": 0.9187726541885388, + "step": 20500 + }, + { + "eval_loss": 1.7257344722747803, + "eval_runtime": 18.4835, + "eval_samples_per_second": 886.41, + "eval_steps_per_second": 7.953, + "epoch": 0.9187726541885388, + "step": 20500 + }, + { + "loss": 13.8213, + "grad_norm": 1.6608607769012451, + "learning_rate": 0.0004569835139019054, + "epoch": 0.9189967450798043, + "step": 20505 + }, + { + "loss": 13.9715, + "grad_norm": 1.6903189420700073, + "learning_rate": 0.0004559916774855464, + "epoch": 0.9192208359710698, + "step": 20510 + }, + { + "loss": 13.8817, + "grad_norm": 1.82123601436615, + "learning_rate": 0.0004549896387418089, + "epoch": 0.9194449268623354, + "step": 20515 + }, + { + "loss": 13.7962, + "grad_norm": 1.8259592056274414, + "learning_rate": 0.0004539774472995296, + "epoch": 0.9196690177536009, + "step": 20520 + }, + { + "loss": 13.783, + "grad_norm": 1.7011497020721436, + "learning_rate": 0.0004529551532903865, + "epoch": 0.9198931086448664, + "step": 20525 + }, + { + "loss": 13.9082, + "grad_norm": 1.7713634967803955, + "learning_rate": 0.00045192280734641623, + "epoch": 0.9201171995361318, + "step": 20530 + }, + { + "loss": 13.8889, + "grad_norm": 1.750898003578186, + "learning_rate": 0.00045088046059750634, + "epoch": 0.9203412904273973, + "step": 20535 + }, + { + "loss": 13.7703, + "grad_norm": 1.7162286043167114, + "learning_rate": 0.0004498281646688627, + "epoch": 0.9205653813186628, + "step": 20540 + }, + { + "loss": 13.9167, + "grad_norm": 1.7430411577224731, + "learning_rate": 0.00044876597167845276, + "epoch": 0.9207894722099284, + "step": 20545 + }, + { + "loss": 13.921, + "grad_norm": 1.7012724876403809, + "learning_rate": 0.0004476939342344246, + "epoch": 0.9210135631011939, + "step": 20550 + }, + { + "loss": 13.935, + "grad_norm": 1.8371593952178955, + "learning_rate": 0.00044661210543250077, + "epoch": 0.9212376539924594, + "step": 20555 + }, + { + "loss": 13.8992, + "grad_norm": 2.012096881866455, + "learning_rate": 0.00044552053885334875, + "epoch": 0.9214617448837248, + "step": 20560 + }, + { + "loss": 13.7958, + "grad_norm": 1.8178032636642456, + "learning_rate": 0.0004444192885599276, + "epoch": 0.9216858357749903, + "step": 20565 + }, + { + "loss": 13.7693, + "grad_norm": 1.6493083238601685, + "learning_rate": 0.00044330840909480984, + "epoch": 0.9219099266662558, + "step": 20570 + }, + { + "loss": 13.8159, + "grad_norm": 1.667523980140686, + "learning_rate": 0.0004421879554774803, + "epoch": 0.9221340175575213, + "step": 20575 + }, + { + "loss": 13.8814, + "grad_norm": 1.6737457513809204, + "learning_rate": 0.0004410579832016112, + "epoch": 0.9223581084487869, + "step": 20580 + }, + { + "loss": 13.8307, + "grad_norm": 1.687098741531372, + "learning_rate": 0.0004399185482323134, + "epoch": 0.9225821993400524, + "step": 20585 + }, + { + "loss": 13.722, + "grad_norm": 1.6777094602584839, + "learning_rate": 0.00043876970700336496, + "epoch": 0.9228062902313178, + "step": 20590 + }, + { + "loss": 13.8917, + "grad_norm": 1.747768521308899, + "learning_rate": 0.00043761151641441565, + "epoch": 0.9230303811225833, + "step": 20595 + }, + { + "loss": 13.8852, + "grad_norm": 1.6435922384262085, + "learning_rate": 0.00043644403382816913, + "epoch": 0.9232544720138488, + "step": 20600 + }, + { + "loss": 13.8821, + "grad_norm": 1.6521140336990356, + "learning_rate": 0.00043526731706754196, + "epoch": 0.9234785629051143, + "step": 20605 + }, + { + "loss": 13.8213, + "grad_norm": 1.8001853227615356, + "learning_rate": 0.0004340814244127993, + "epoch": 0.9237026537963798, + "step": 20610 + }, + { + "loss": 13.8501, + "grad_norm": 1.6893656253814697, + "learning_rate": 0.00043288641459866915, + "epoch": 0.9239267446876454, + "step": 20615 + }, + { + "loss": 13.7712, + "grad_norm": 1.6274510622024536, + "learning_rate": 0.00043168234681143246, + "epoch": 0.9241508355789108, + "step": 20620 + }, + { + "loss": 13.8701, + "grad_norm": 1.8652204275131226, + "learning_rate": 0.0004304692806859927, + "epoch": 0.9243749264701763, + "step": 20625 + }, + { + "loss": 13.8061, + "grad_norm": 1.9463926553726196, + "learning_rate": 0.00042924727630292125, + "epoch": 0.9245990173614418, + "step": 20630 + }, + { + "loss": 13.8314, + "grad_norm": 1.8704006671905518, + "learning_rate": 0.0004280163941854828, + "epoch": 0.9248231082527073, + "step": 20635 + }, + { + "loss": 13.7852, + "grad_norm": 1.7580029964447021, + "learning_rate": 0.00042677669529663686, + "epoch": 0.9250471991439728, + "step": 20640 + }, + { + "loss": 13.8812, + "grad_norm": 1.6548473834991455, + "learning_rate": 0.00042552824103601916, + "epoch": 0.9252712900352383, + "step": 20645 + }, + { + "loss": 13.7267, + "grad_norm": 1.6016569137573242, + "learning_rate": 0.0004242710932368998, + "epoch": 0.9254953809265037, + "step": 20650 + }, + { + "loss": 13.7553, + "grad_norm": 1.7345774173736572, + "learning_rate": 0.0004230053141631216, + "epoch": 0.9257194718177693, + "step": 20655 + }, + { + "loss": 13.8659, + "grad_norm": 1.749651551246643, + "learning_rate": 0.00042173096650601594, + "epoch": 0.9259435627090348, + "step": 20660 + }, + { + "loss": 13.7555, + "grad_norm": 1.6441956758499146, + "learning_rate": 0.0004204481133812977, + "epoch": 0.9261676536003003, + "step": 20665 + }, + { + "loss": 13.908, + "grad_norm": 1.6786205768585205, + "learning_rate": 0.00041915681832593936, + "epoch": 0.9263917444915658, + "step": 20670 + }, + { + "loss": 13.8288, + "grad_norm": 1.71892249584198, + "learning_rate": 0.00041785714529502427, + "epoch": 0.9266158353828313, + "step": 20675 + }, + { + "loss": 13.7958, + "grad_norm": 1.9180169105529785, + "learning_rate": 0.000416549158658579, + "epoch": 0.9268399262740967, + "step": 20680 + }, + { + "loss": 13.8891, + "grad_norm": 1.7008872032165527, + "learning_rate": 0.0004152329231983852, + "epoch": 0.9270640171653622, + "step": 20685 + }, + { + "loss": 13.8643, + "grad_norm": 1.6335214376449585, + "learning_rate": 0.0004139085041047711, + "epoch": 0.9272881080566278, + "step": 20690 + }, + { + "loss": 13.6942, + "grad_norm": 1.766997218132019, + "learning_rate": 0.00041257596697338286, + "epoch": 0.9275121989478933, + "step": 20695 + }, + { + "loss": 13.7377, + "grad_norm": 1.6384992599487305, + "learning_rate": 0.00041123537780193554, + "epoch": 0.9277362898391588, + "step": 20700 + }, + { + "loss": 13.8826, + "grad_norm": 1.7075258493423462, + "learning_rate": 0.0004098868029869447, + "epoch": 0.9279603807304243, + "step": 20705 + }, + { + "loss": 13.7023, + "grad_norm": 1.6997811794281006, + "learning_rate": 0.00040853030932043775, + "epoch": 0.9281844716216897, + "step": 20710 + }, + { + "loss": 13.7944, + "grad_norm": 1.677700161933899, + "learning_rate": 0.0004071659639866457, + "epoch": 0.9284085625129552, + "step": 20715 + }, + { + "loss": 13.8049, + "grad_norm": 1.6065220832824707, + "learning_rate": 0.0004057938345586761, + "epoch": 0.9286326534042207, + "step": 20720 + }, + { + "loss": 13.842, + "grad_norm": 1.6548614501953125, + "learning_rate": 0.0004044139889951659, + "epoch": 0.9288567442954863, + "step": 20725 + }, + { + "loss": 13.7809, + "grad_norm": 1.7440521717071533, + "learning_rate": 0.00040302649563691575, + "epoch": 0.9290808351867518, + "step": 20730 + }, + { + "loss": 13.8484, + "grad_norm": 1.6194367408752441, + "learning_rate": 0.00040163142320350523, + "epoch": 0.9293049260780173, + "step": 20735 + }, + { + "loss": 13.8731, + "grad_norm": 1.6101369857788086, + "learning_rate": 0.0004002288407898893, + "epoch": 0.9295290169692827, + "step": 20740 + }, + { + "loss": 13.8792, + "grad_norm": 1.6936492919921875, + "learning_rate": 0.0003988188178629763, + "epoch": 0.9297531078605482, + "step": 20745 + }, + { + "loss": 13.7914, + "grad_norm": 1.6488006114959717, + "learning_rate": 0.00039740142425818715, + "epoch": 0.9299771987518137, + "step": 20750 + }, + { + "loss": 13.8232, + "grad_norm": 1.5743663311004639, + "learning_rate": 0.0003959767301759967, + "epoch": 0.9302012896430792, + "step": 20755 + }, + { + "loss": 13.7971, + "grad_norm": 1.6571708917617798, + "learning_rate": 0.00039454480617845676, + "epoch": 0.9304253805343448, + "step": 20760 + }, + { + "loss": 13.8978, + "grad_norm": 1.8308016061782837, + "learning_rate": 0.0003931057231857017, + "epoch": 0.9306494714256103, + "step": 20765 + }, + { + "loss": 13.8654, + "grad_norm": 1.6224706172943115, + "learning_rate": 0.0003916595524724353, + "epoch": 0.9308735623168757, + "step": 20770 + }, + { + "loss": 13.8414, + "grad_norm": 1.542962670326233, + "learning_rate": 0.00039020636566440114, + "epoch": 0.9310976532081412, + "step": 20775 + }, + { + "loss": 13.8781, + "grad_norm": 1.9705100059509277, + "learning_rate": 0.0003887462347348349, + "epoch": 0.9313217440994067, + "step": 20780 + }, + { + "loss": 13.7762, + "grad_norm": 1.6651804447174072, + "learning_rate": 0.00038727923200089975, + "epoch": 0.9315458349906722, + "step": 20785 + }, + { + "loss": 13.7654, + "grad_norm": 1.8163079023361206, + "learning_rate": 0.0003858054301201047, + "epoch": 0.9317699258819377, + "step": 20790 + }, + { + "loss": 13.8023, + "grad_norm": 1.91973078250885, + "learning_rate": 0.000384324902086706, + "epoch": 0.9319940167732033, + "step": 20795 + }, + { + "loss": 13.8323, + "grad_norm": 1.7480084896087646, + "learning_rate": 0.0003828377212280917, + "epoch": 0.9322181076644687, + "step": 20800 + }, + { + "loss": 13.7816, + "grad_norm": 1.6364456415176392, + "learning_rate": 0.0003813439612011501, + "epoch": 0.9324421985557342, + "step": 20805 + }, + { + "loss": 13.6706, + "grad_norm": 1.5766396522521973, + "learning_rate": 0.0003798436959886219, + "epoch": 0.9326662894469997, + "step": 20810 + }, + { + "loss": 13.7765, + "grad_norm": 1.6076284646987915, + "learning_rate": 0.00037833699989543544, + "epoch": 0.9328903803382652, + "step": 20815 + }, + { + "loss": 13.8702, + "grad_norm": 1.6237750053405762, + "learning_rate": 0.00037682394754502685, + "epoch": 0.9331144712295307, + "step": 20820 + }, + { + "loss": 13.822, + "grad_norm": 1.6188440322875977, + "learning_rate": 0.0003753046138756442, + "epoch": 0.9333385621207962, + "step": 20825 + }, + { + "loss": 13.8648, + "grad_norm": 1.74038827419281, + "learning_rate": 0.0003737790741366358, + "epoch": 0.9335626530120616, + "step": 20830 + }, + { + "loss": 13.7871, + "grad_norm": 1.7488499879837036, + "learning_rate": 0.0003722474038847235, + "epoch": 0.9337867439033272, + "step": 20835 + }, + { + "loss": 13.7595, + "grad_norm": 1.6546235084533691, + "learning_rate": 0.0003707096789802599, + "epoch": 0.9340108347945927, + "step": 20840 + }, + { + "loss": 13.8413, + "grad_norm": 1.6696339845657349, + "learning_rate": 0.00036916597558347215, + "epoch": 0.9342349256858582, + "step": 20845 + }, + { + "loss": 13.7561, + "grad_norm": 1.7855514287948608, + "learning_rate": 0.00036761637015068893, + "epoch": 0.9344590165771237, + "step": 20850 + }, + { + "loss": 13.7537, + "grad_norm": 1.6255345344543457, + "learning_rate": 0.0003660609394305543, + "epoch": 0.9346831074683892, + "step": 20855 + }, + { + "loss": 13.9035, + "grad_norm": 1.6804163455963135, + "learning_rate": 0.00036449976046022643, + "epoch": 0.9349071983596546, + "step": 20860 + }, + { + "loss": 13.8235, + "grad_norm": 1.679900884628296, + "learning_rate": 0.00036293291056156175, + "epoch": 0.9351312892509202, + "step": 20865 + }, + { + "loss": 13.819, + "grad_norm": 1.7183867692947388, + "learning_rate": 0.00036136046733728613, + "epoch": 0.9353553801421857, + "step": 20870 + }, + { + "loss": 13.8095, + "grad_norm": 1.6135212182998657, + "learning_rate": 0.00035978250866715034, + "epoch": 0.9355794710334512, + "step": 20875 + }, + { + "loss": 13.7488, + "grad_norm": 1.6178854703903198, + "learning_rate": 0.00035819911270407374, + "epoch": 0.9358035619247167, + "step": 20880 + }, + { + "loss": 13.8237, + "grad_norm": 1.6501045227050781, + "learning_rate": 0.0003566103578702731, + "epoch": 0.9360276528159822, + "step": 20885 + }, + { + "loss": 13.6911, + "grad_norm": 1.5672601461410522, + "learning_rate": 0.00035501632285337873, + "epoch": 0.9362517437072476, + "step": 20890 + }, + { + "loss": 13.835, + "grad_norm": 1.591254711151123, + "learning_rate": 0.00035341708660253685, + "epoch": 0.9364758345985131, + "step": 20895 + }, + { + "loss": 13.8382, + "grad_norm": 1.7729851007461548, + "learning_rate": 0.00035181272832449984, + "epoch": 0.9366999254897787, + "step": 20900 + }, + { + "loss": 13.6892, + "grad_norm": 1.6097650527954102, + "learning_rate": 0.0003502033274797031, + "epoch": 0.9369240163810442, + "step": 20905 + }, + { + "loss": 13.7056, + "grad_norm": 1.6839957237243652, + "learning_rate": 0.00034858896377832965, + "epoch": 0.9371481072723097, + "step": 20910 + }, + { + "loss": 13.7331, + "grad_norm": 1.6045210361480713, + "learning_rate": 0.00034696971717636217, + "epoch": 0.9373721981635752, + "step": 20915 + }, + { + "loss": 13.681, + "grad_norm": 1.7205440998077393, + "learning_rate": 0.0003453456678716227, + "epoch": 0.9375962890548406, + "step": 20920 + }, + { + "loss": 13.7979, + "grad_norm": 1.5592955350875854, + "learning_rate": 0.0003437168962998014, + "epoch": 0.9378203799461061, + "step": 20925 + }, + { + "loss": 13.7619, + "grad_norm": 1.6273744106292725, + "learning_rate": 0.00034208348313047185, + "epoch": 0.9380444708373716, + "step": 20930 + }, + { + "loss": 13.8481, + "grad_norm": 1.618740200996399, + "learning_rate": 0.0003404455092630959, + "epoch": 0.9382685617286372, + "step": 20935 + }, + { + "loss": 13.7558, + "grad_norm": 1.6477937698364258, + "learning_rate": 0.00033880305582301764, + "epoch": 0.9384926526199027, + "step": 20940 + }, + { + "loss": 13.867, + "grad_norm": 1.6027733087539673, + "learning_rate": 0.0003371562041574439, + "epoch": 0.9387167435111682, + "step": 20945 + }, + { + "loss": 13.8095, + "grad_norm": 1.5962384939193726, + "learning_rate": 0.0003355050358314172, + "epoch": 0.9389408344024336, + "step": 20950 + }, + { + "loss": 13.8599, + "grad_norm": 1.7121936082839966, + "learning_rate": 0.0003338496326237743, + "epoch": 0.9391649252936991, + "step": 20955 + }, + { + "loss": 13.8941, + "grad_norm": 1.6914751529693604, + "learning_rate": 0.0003321900765230969, + "epoch": 0.9393890161849646, + "step": 20960 + }, + { + "loss": 13.8342, + "grad_norm": 1.5547442436218262, + "learning_rate": 0.00033052644972365056, + "epoch": 0.9396131070762301, + "step": 20965 + }, + { + "loss": 13.8076, + "grad_norm": 1.5029220581054688, + "learning_rate": 0.0003288588346213139, + "epoch": 0.9398371979674957, + "step": 20970 + }, + { + "loss": 13.8405, + "grad_norm": 1.5579488277435303, + "learning_rate": 0.00032718731380949754, + "epoch": 0.9400612888587612, + "step": 20975 + }, + { + "loss": 13.7782, + "grad_norm": 1.647713303565979, + "learning_rate": 0.0003255119700750535, + "epoch": 0.9402853797500266, + "step": 20980 + }, + { + "loss": 13.7087, + "grad_norm": 1.569794774055481, + "learning_rate": 0.0003238328863941753, + "epoch": 0.9405094706412921, + "step": 20985 + }, + { + "loss": 13.7996, + "grad_norm": 1.584929347038269, + "learning_rate": 0.0003221501459282877, + "epoch": 0.9407335615325576, + "step": 20990 + }, + { + "loss": 13.8525, + "grad_norm": 1.613590955734253, + "learning_rate": 0.0003204638320199282, + "epoch": 0.9409576524238231, + "step": 20995 + }, + { + "loss": 13.6583, + "grad_norm": 1.6018569469451904, + "learning_rate": 0.0003187740281886195, + "epoch": 0.9411817433150886, + "step": 21000 + }, + { + "eval_loss": 1.7149229049682617, + "eval_runtime": 18.6557, + "eval_samples_per_second": 878.231, + "eval_steps_per_second": 7.88, + "epoch": 0.9411817433150886, + "step": 21000 + }, + { + "loss": 13.7715, + "grad_norm": 1.6226269006729126, + "learning_rate": 0.0003170808181267326, + "epoch": 0.941405834206354, + "step": 21005 + }, + { + "loss": 13.742, + "grad_norm": 1.617499589920044, + "learning_rate": 0.0003153842856953417, + "epoch": 0.9416299250976196, + "step": 21010 + }, + { + "loss": 13.7314, + "grad_norm": 1.5880169868469238, + "learning_rate": 0.000313684514920071, + "epoch": 0.9418540159888851, + "step": 21015 + }, + { + "loss": 13.7776, + "grad_norm": 1.879854679107666, + "learning_rate": 0.0003119815899869329, + "epoch": 0.9420781068801506, + "step": 21020 + }, + { + "loss": 13.7681, + "grad_norm": 1.631612777709961, + "learning_rate": 0.0003102755952381586, + "epoch": 0.9423021977714161, + "step": 21025 + }, + { + "loss": 13.8383, + "grad_norm": 1.8044886589050293, + "learning_rate": 0.00030856661516802055, + "epoch": 0.9425262886626816, + "step": 21030 + }, + { + "loss": 13.7984, + "grad_norm": 1.5461212396621704, + "learning_rate": 0.0003068547344186478, + "epoch": 0.942750379553947, + "step": 21035 + }, + { + "loss": 13.6981, + "grad_norm": 1.5587557554244995, + "learning_rate": 0.00030514003777583397, + "epoch": 0.9429744704452125, + "step": 21040 + }, + { + "loss": 13.6394, + "grad_norm": 1.6465073823928833, + "learning_rate": 0.0003034226101648377, + "epoch": 0.9431985613364781, + "step": 21045 + }, + { + "loss": 13.7656, + "grad_norm": 1.6942849159240723, + "learning_rate": 0.00030170253664617687, + "epoch": 0.9434226522277436, + "step": 21050 + }, + { + "loss": 13.7423, + "grad_norm": 1.5614516735076904, + "learning_rate": 0.0002999799024114151, + "epoch": 0.9436467431190091, + "step": 21055 + }, + { + "loss": 13.7156, + "grad_norm": 1.4851198196411133, + "learning_rate": 0.0002982547927789434, + "epoch": 0.9438708340102746, + "step": 21060 + }, + { + "loss": 13.7704, + "grad_norm": 1.475362777709961, + "learning_rate": 0.00029652729318975333, + "epoch": 0.94409492490154, + "step": 21065 + }, + { + "loss": 13.7994, + "grad_norm": 1.5233135223388672, + "learning_rate": 0.00029479748920320634, + "epoch": 0.9443190157928055, + "step": 21070 + }, + { + "loss": 13.5917, + "grad_norm": 1.5046474933624268, + "learning_rate": 0.0002930654664927955, + "epoch": 0.944543106684071, + "step": 21075 + }, + { + "loss": 13.5741, + "grad_norm": 1.5400524139404297, + "learning_rate": 0.00029133131084190265, + "epoch": 0.9447671975753366, + "step": 21080 + }, + { + "loss": 13.6754, + "grad_norm": 1.6073517799377441, + "learning_rate": 0.0002895951081395496, + "epoch": 0.9449912884666021, + "step": 21085 + }, + { + "loss": 13.6572, + "grad_norm": 1.5722095966339111, + "learning_rate": 0.0002878569443761442, + "epoch": 0.9452153793578676, + "step": 21090 + }, + { + "loss": 13.7381, + "grad_norm": 1.5152595043182373, + "learning_rate": 0.00028611690563922144, + "epoch": 0.945439470249133, + "step": 21095 + }, + { + "loss": 13.7101, + "grad_norm": 1.5220506191253662, + "learning_rate": 0.0002843750781091798, + "epoch": 0.9456635611403985, + "step": 21100 + }, + { + "loss": 13.7555, + "grad_norm": 1.5469439029693604, + "learning_rate": 0.000282631548055013, + "epoch": 0.945887652031664, + "step": 21105 + }, + { + "loss": 13.7246, + "grad_norm": 1.5397659540176392, + "learning_rate": 0.0002808864018300367, + "epoch": 0.9461117429229295, + "step": 21110 + }, + { + "loss": 13.6633, + "grad_norm": 1.6063482761383057, + "learning_rate": 0.00027913972586761246, + "epoch": 0.9463358338141951, + "step": 21115 + }, + { + "loss": 13.749, + "grad_norm": 1.5963630676269531, + "learning_rate": 0.00027739160667686634, + "epoch": 0.9465599247054606, + "step": 21120 + }, + { + "loss": 13.6779, + "grad_norm": 1.4886605739593506, + "learning_rate": 0.00027564213083840433, + "epoch": 0.946784015596726, + "step": 21125 + }, + { + "loss": 13.7391, + "grad_norm": 1.5576682090759277, + "learning_rate": 0.0002738913850000246, + "epoch": 0.9470081064879915, + "step": 21130 + }, + { + "loss": 13.6428, + "grad_norm": 1.5561497211456299, + "learning_rate": 0.00027213945587242506, + "epoch": 0.947232197379257, + "step": 21135 + }, + { + "loss": 13.7662, + "grad_norm": 1.5810502767562866, + "learning_rate": 0.0002703864302249102, + "epoch": 0.9474562882705225, + "step": 21140 + }, + { + "loss": 13.682, + "grad_norm": 1.6845345497131348, + "learning_rate": 0.0002686323948810921, + "epoch": 0.947680379161788, + "step": 21145 + }, + { + "loss": 13.6574, + "grad_norm": 1.6456736326217651, + "learning_rate": 0.0002668774367145913, + "epoch": 0.9479044700530536, + "step": 21150 + }, + { + "loss": 13.6922, + "grad_norm": 1.7522460222244263, + "learning_rate": 0.00026512164264473387, + "epoch": 0.948128560944319, + "step": 21155 + }, + { + "loss": 13.7328, + "grad_norm": 1.5039067268371582, + "learning_rate": 0.0002633650996322461, + "epoch": 0.9483526518355845, + "step": 21160 + }, + { + "loss": 13.7239, + "grad_norm": 1.5195696353912354, + "learning_rate": 0.00026160789467494786, + "epoch": 0.94857674272685, + "step": 21165 + }, + { + "loss": 13.6447, + "grad_norm": 1.589791178703308, + "learning_rate": 0.0002598501148034439, + "epoch": 0.9488008336181155, + "step": 21170 + }, + { + "loss": 13.667, + "grad_norm": 1.6418699026107788, + "learning_rate": 0.00025809184707681316, + "epoch": 0.949024924509381, + "step": 21175 + }, + { + "loss": 13.6324, + "grad_norm": 1.6431262493133545, + "learning_rate": 0.000256333178578297, + "epoch": 0.9492490154006465, + "step": 21180 + }, + { + "loss": 13.6902, + "grad_norm": 1.6390429735183716, + "learning_rate": 0.00025457419641098614, + "epoch": 0.949473106291912, + "step": 21185 + }, + { + "loss": 13.7573, + "grad_norm": 1.4962824583053589, + "learning_rate": 0.0002528149876935065, + "epoch": 0.9496971971831775, + "step": 21190 + }, + { + "loss": 13.599, + "grad_norm": 1.4873199462890625, + "learning_rate": 0.0002510556395557048, + "epoch": 0.949921288074443, + "step": 21195 + }, + { + "loss": 13.6939, + "grad_norm": 1.5821828842163086, + "learning_rate": 0.0002492962391343329, + "epoch": 0.9501453789657085, + "step": 21200 + }, + { + "loss": 13.7601, + "grad_norm": 1.5797697305679321, + "learning_rate": 0.00024753687356873213, + "epoch": 0.950369469856974, + "step": 21205 + }, + { + "loss": 13.6709, + "grad_norm": 1.4244056940078735, + "learning_rate": 0.0002457776299965173, + "epoch": 0.9505935607482395, + "step": 21210 + }, + { + "loss": 13.7313, + "grad_norm": 1.5052160024642944, + "learning_rate": 0.00024401859554926125, + "epoch": 0.9508176516395049, + "step": 21215 + }, + { + "loss": 13.7406, + "grad_norm": 1.6649264097213745, + "learning_rate": 0.0002422598573481797, + "epoch": 0.9510417425307705, + "step": 21220 + }, + { + "loss": 13.7893, + "grad_norm": 1.5528390407562256, + "learning_rate": 0.0002405015024998152, + "epoch": 0.951265833422036, + "step": 21225 + }, + { + "loss": 13.6475, + "grad_norm": 1.5336627960205078, + "learning_rate": 0.0002387436180917243, + "epoch": 0.9514899243133015, + "step": 21230 + }, + { + "loss": 13.6648, + "grad_norm": 1.5527150630950928, + "learning_rate": 0.00023698629118816335, + "epoch": 0.951714015204567, + "step": 21235 + }, + { + "loss": 13.7438, + "grad_norm": 1.579766035079956, + "learning_rate": 0.0002352296088257767, + "epoch": 0.9519381060958325, + "step": 21240 + }, + { + "loss": 13.7382, + "grad_norm": 1.4811670780181885, + "learning_rate": 0.00023347365800928602, + "epoch": 0.9521621969870979, + "step": 21245 + }, + { + "loss": 13.7377, + "grad_norm": 1.6110016107559204, + "learning_rate": 0.00023171852570718097, + "epoch": 0.9523862878783634, + "step": 21250 + }, + { + "loss": 13.6928, + "grad_norm": 1.587537169456482, + "learning_rate": 0.00022996429884741227, + "epoch": 0.952610378769629, + "step": 21255 + }, + { + "loss": 13.5688, + "grad_norm": 1.54538893699646, + "learning_rate": 0.00022821106431308543, + "epoch": 0.9528344696608945, + "step": 21260 + }, + { + "loss": 13.6446, + "grad_norm": 1.3947179317474365, + "learning_rate": 0.00022645890893815878, + "epoch": 0.95305856055216, + "step": 21265 + }, + { + "loss": 13.7477, + "grad_norm": 1.6194840669631958, + "learning_rate": 0.000224707919503142, + "epoch": 0.9532826514434255, + "step": 21270 + }, + { + "loss": 13.6585, + "grad_norm": 1.5126726627349854, + "learning_rate": 0.00022295818273079798, + "epoch": 0.9535067423346909, + "step": 21275 + }, + { + "loss": 13.5939, + "grad_norm": 1.5080811977386475, + "learning_rate": 0.00022120978528184833, + "epoch": 0.9537308332259564, + "step": 21280 + }, + { + "loss": 13.6283, + "grad_norm": 1.4491814374923706, + "learning_rate": 0.00021946281375068058, + "epoch": 0.9539549241172219, + "step": 21285 + }, + { + "loss": 13.6394, + "grad_norm": 1.4810166358947754, + "learning_rate": 0.0002177173546610597, + "epoch": 0.9541790150084875, + "step": 21290 + }, + { + "loss": 13.7695, + "grad_norm": 1.5009487867355347, + "learning_rate": 0.00021597349446184248, + "epoch": 0.954403105899753, + "step": 21295 + }, + { + "loss": 13.6825, + "grad_norm": 1.4756357669830322, + "learning_rate": 0.00021423131952269653, + "epoch": 0.9546271967910185, + "step": 21300 + }, + { + "loss": 13.7127, + "grad_norm": 1.5060594081878662, + "learning_rate": 0.00021249091612982155, + "epoch": 0.9548512876822839, + "step": 21305 + }, + { + "loss": 13.7083, + "grad_norm": 1.4578135013580322, + "learning_rate": 0.00021075237048167678, + "epoch": 0.9550753785735494, + "step": 21310 + }, + { + "loss": 13.6741, + "grad_norm": 1.4751098155975342, + "learning_rate": 0.00020901576868471126, + "epoch": 0.9552994694648149, + "step": 21315 + }, + { + "loss": 13.738, + "grad_norm": 1.4860180616378784, + "learning_rate": 0.00020728119674909894, + "epoch": 0.9555235603560804, + "step": 21320 + }, + { + "loss": 13.6476, + "grad_norm": 1.6425741910934448, + "learning_rate": 0.0002055487405844795, + "epoch": 0.955747651247346, + "step": 21325 + }, + { + "loss": 13.8511, + "grad_norm": 1.4640700817108154, + "learning_rate": 0.00020381848599570275, + "epoch": 0.9559717421386115, + "step": 21330 + }, + { + "loss": 13.6041, + "grad_norm": 1.4632081985473633, + "learning_rate": 0.0002020905186785791, + "epoch": 0.9561958330298769, + "step": 21335 + }, + { + "loss": 13.4984, + "grad_norm": 1.47840416431427, + "learning_rate": 0.0002003649242156355, + "epoch": 0.9564199239211424, + "step": 21340 + }, + { + "loss": 13.7759, + "grad_norm": 1.5127596855163574, + "learning_rate": 0.0001986417880718764, + "epoch": 0.9566440148124079, + "step": 21345 + }, + { + "loss": 13.725, + "grad_norm": 1.437754511833191, + "learning_rate": 0.00019692119559055102, + "epoch": 0.9568681057036734, + "step": 21350 + }, + { + "loss": 13.6686, + "grad_norm": 1.4675992727279663, + "learning_rate": 0.00019520323198892622, + "epoch": 0.9570921965949389, + "step": 21355 + }, + { + "loss": 13.7135, + "grad_norm": 1.4326221942901611, + "learning_rate": 0.00019348798235406628, + "epoch": 0.9573162874862045, + "step": 21360 + }, + { + "loss": 13.7331, + "grad_norm": 1.481142282485962, + "learning_rate": 0.0001917755316386185, + "epoch": 0.9575403783774699, + "step": 21365 + }, + { + "loss": 13.6136, + "grad_norm": 1.4526582956314087, + "learning_rate": 0.00019006596465660547, + "epoch": 0.9577644692687354, + "step": 21370 + }, + { + "loss": 13.6919, + "grad_norm": 1.4179489612579346, + "learning_rate": 0.00018835936607922483, + "epoch": 0.9579885601600009, + "step": 21375 + }, + { + "loss": 13.6267, + "grad_norm": 1.5630279779434204, + "learning_rate": 0.0001866558204306556, + "epoch": 0.9582126510512664, + "step": 21380 + }, + { + "loss": 13.6937, + "grad_norm": 1.4708082675933838, + "learning_rate": 0.00018495541208387128, + "epoch": 0.9584367419425319, + "step": 21385 + }, + { + "loss": 13.6937, + "grad_norm": 1.444834589958191, + "learning_rate": 0.00018325822525646208, + "epoch": 0.9586608328337974, + "step": 21390 + }, + { + "loss": 13.5621, + "grad_norm": 1.498726487159729, + "learning_rate": 0.000181564344006463, + "epoch": 0.9588849237250628, + "step": 21395 + }, + { + "loss": 13.8514, + "grad_norm": 1.4610778093338013, + "learning_rate": 0.0001798738522281907, + "epoch": 0.9591090146163284, + "step": 21400 + }, + { + "loss": 13.6962, + "grad_norm": 1.419760823249817, + "learning_rate": 0.00017818683364808884, + "epoch": 0.9593331055075939, + "step": 21405 + }, + { + "loss": 13.6112, + "grad_norm": 1.4531667232513428, + "learning_rate": 0.00017650337182058086, + "epoch": 0.9595571963988594, + "step": 21410 + }, + { + "loss": 13.6452, + "grad_norm": 1.4697725772857666, + "learning_rate": 0.00017482355012393176, + "epoch": 0.9597812872901249, + "step": 21415 + }, + { + "loss": 13.72, + "grad_norm": 1.4820373058319092, + "learning_rate": 0.0001731474517561188, + "epoch": 0.9600053781813904, + "step": 21420 + }, + { + "loss": 13.5874, + "grad_norm": 1.436105728149414, + "learning_rate": 0.00017147515973071076, + "epoch": 0.9602294690726558, + "step": 21425 + }, + { + "loss": 13.6086, + "grad_norm": 1.4315346479415894, + "learning_rate": 0.00016980675687275614, + "epoch": 0.9604535599639213, + "step": 21430 + }, + { + "loss": 13.669, + "grad_norm": 1.3922876119613647, + "learning_rate": 0.00016814232581468158, + "epoch": 0.9606776508551869, + "step": 21435 + }, + { + "loss": 13.4939, + "grad_norm": 1.5503712892532349, + "learning_rate": 0.00016648194899219885, + "epoch": 0.9609017417464524, + "step": 21440 + }, + { + "loss": 13.5681, + "grad_norm": 1.4173859357833862, + "learning_rate": 0.0001648257086402221, + "epoch": 0.9611258326377179, + "step": 21445 + }, + { + "loss": 13.5037, + "grad_norm": 1.3971552848815918, + "learning_rate": 0.00016317368678879496, + "epoch": 0.9613499235289834, + "step": 21450 + }, + { + "loss": 13.6957, + "grad_norm": 1.4970334768295288, + "learning_rate": 0.00016152596525902764, + "epoch": 0.9615740144202488, + "step": 21455 + }, + { + "loss": 13.5936, + "grad_norm": 1.4297453165054321, + "learning_rate": 0.0001598826256590449, + "epoch": 0.9617981053115143, + "step": 21460 + }, + { + "loss": 13.5697, + "grad_norm": 1.440354585647583, + "learning_rate": 0.0001582437493799434, + "epoch": 0.9620221962027798, + "step": 21465 + }, + { + "loss": 13.6957, + "grad_norm": 1.4379942417144775, + "learning_rate": 0.0001566094175917616, + "epoch": 0.9622462870940454, + "step": 21470 + }, + { + "loss": 13.5226, + "grad_norm": 1.4286212921142578, + "learning_rate": 0.00015497971123945873, + "epoch": 0.9624703779853109, + "step": 21475 + }, + { + "loss": 13.6297, + "grad_norm": 1.476987361907959, + "learning_rate": 0.00015335471103890603, + "epoch": 0.9626944688765764, + "step": 21480 + }, + { + "loss": 13.5253, + "grad_norm": 1.4980093240737915, + "learning_rate": 0.00015173449747288932, + "epoch": 0.9629185597678418, + "step": 21485 + }, + { + "loss": 13.5963, + "grad_norm": 1.3949865102767944, + "learning_rate": 0.00015011915078712252, + "epoch": 0.9631426506591073, + "step": 21490 + }, + { + "loss": 13.6885, + "grad_norm": 1.4428666830062866, + "learning_rate": 0.00014850875098627324, + "epoch": 0.9633667415503728, + "step": 21495 + }, + { + "loss": 13.586, + "grad_norm": 1.4490755796432495, + "learning_rate": 0.00014690337783000075, + "epoch": 0.9635908324416383, + "step": 21500 + }, + { + "eval_loss": 1.6970657110214233, + "eval_runtime": 18.8072, + "eval_samples_per_second": 871.155, + "eval_steps_per_second": 7.816, + "epoch": 0.9635908324416383, + "step": 21500 + }, + { + "loss": 13.6201, + "grad_norm": 1.3927302360534668, + "learning_rate": 0.00014530311082900526, + "epoch": 0.9638149233329039, + "step": 21505 + }, + { + "loss": 13.587, + "grad_norm": 1.371099829673767, + "learning_rate": 0.0001437080292410899, + "epoch": 0.9640390142241694, + "step": 21510 + }, + { + "loss": 13.7055, + "grad_norm": 1.3734136819839478, + "learning_rate": 0.00014211821206723535, + "epoch": 0.9642631051154348, + "step": 21515 + }, + { + "loss": 13.5536, + "grad_norm": 1.366150140762329, + "learning_rate": 0.00014053373804768742, + "epoch": 0.9644871960067003, + "step": 21520 + }, + { + "loss": 13.6378, + "grad_norm": 1.3889002799987793, + "learning_rate": 0.00013895468565805656, + "epoch": 0.9647112868979658, + "step": 21525 + }, + { + "loss": 13.5321, + "grad_norm": 1.3824708461761475, + "learning_rate": 0.00013738113310543176, + "epoch": 0.9649353777892313, + "step": 21530 + }, + { + "loss": 13.5276, + "grad_norm": 1.4974249601364136, + "learning_rate": 0.00013581315832450662, + "epoch": 0.9651594686804968, + "step": 21535 + }, + { + "loss": 13.5834, + "grad_norm": 1.4725018739700317, + "learning_rate": 0.00013425083897371983, + "epoch": 0.9653835595717624, + "step": 21540 + }, + { + "loss": 13.6833, + "grad_norm": 1.446146845817566, + "learning_rate": 0.00013269425243140853, + "epoch": 0.9656076504630278, + "step": 21545 + }, + { + "loss": 13.6362, + "grad_norm": 1.3499586582183838, + "learning_rate": 0.0001311434757919762, + "epoch": 0.9658317413542933, + "step": 21550 + }, + { + "loss": 13.6207, + "grad_norm": 1.40580153465271, + "learning_rate": 0.00012959858586207435, + "epoch": 0.9660558322455588, + "step": 21555 + }, + { + "loss": 13.5173, + "grad_norm": 1.4460829496383667, + "learning_rate": 0.00012805965915679807, + "epoch": 0.9662799231368243, + "step": 21560 + }, + { + "loss": 13.638, + "grad_norm": 1.4111204147338867, + "learning_rate": 0.0001265267718958971, + "epoch": 0.9665040140280898, + "step": 21565 + }, + { + "loss": 13.7012, + "grad_norm": 1.3578362464904785, + "learning_rate": 0.00012500000000000006, + "epoch": 0.9667281049193552, + "step": 21570 + }, + { + "loss": 13.5422, + "grad_norm": 1.5003434419631958, + "learning_rate": 0.00012347941908685464, + "epoch": 0.9669521958106208, + "step": 21575 + }, + { + "loss": 13.5665, + "grad_norm": 1.3598604202270508, + "learning_rate": 0.00012196510446758268, + "epoch": 0.9671762867018863, + "step": 21580 + }, + { + "loss": 13.6541, + "grad_norm": 1.4082996845245361, + "learning_rate": 0.0001204571311429496, + "epoch": 0.9674003775931518, + "step": 21585 + }, + { + "loss": 13.5797, + "grad_norm": 1.3907629251480103, + "learning_rate": 0.00011895557379965005, + "epoch": 0.9676244684844173, + "step": 21590 + }, + { + "loss": 13.5672, + "grad_norm": 1.365025281906128, + "learning_rate": 0.00011746050680660903, + "epoch": 0.9678485593756828, + "step": 21595 + }, + { + "loss": 13.5871, + "grad_norm": 1.3568958044052124, + "learning_rate": 0.00011597200421129844, + "epoch": 0.9680726502669482, + "step": 21600 + }, + { + "loss": 13.5817, + "grad_norm": 1.4446165561676025, + "learning_rate": 0.00011449013973606907, + "epoch": 0.9682967411582137, + "step": 21605 + }, + { + "loss": 13.4769, + "grad_norm": 1.3863264322280884, + "learning_rate": 0.00011301498677450037, + "epoch": 0.9685208320494793, + "step": 21610 + }, + { + "loss": 13.5077, + "grad_norm": 1.3644050359725952, + "learning_rate": 0.00011154661838776472, + "epoch": 0.9687449229407448, + "step": 21615 + }, + { + "loss": 13.5713, + "grad_norm": 1.4149904251098633, + "learning_rate": 0.00011008510730100893, + "epoch": 0.9689690138320103, + "step": 21620 + }, + { + "loss": 13.5241, + "grad_norm": 1.3770707845687866, + "learning_rate": 0.0001086305258997523, + "epoch": 0.9691931047232758, + "step": 21625 + }, + { + "loss": 13.5229, + "grad_norm": 1.3707841634750366, + "learning_rate": 0.00010718294622630187, + "epoch": 0.9694171956145412, + "step": 21630 + }, + { + "loss": 13.4982, + "grad_norm": 1.2850505113601685, + "learning_rate": 0.00010574243997618415, + "epoch": 0.9696412865058067, + "step": 21635 + }, + { + "loss": 13.6666, + "grad_norm": 1.3031147718429565, + "learning_rate": 0.00010430907849459354, + "epoch": 0.9698653773970722, + "step": 21640 + }, + { + "loss": 13.599, + "grad_norm": 1.4308277368545532, + "learning_rate": 0.0001028829327728599, + "epoch": 0.9700894682883378, + "step": 21645 + }, + { + "loss": 13.5511, + "grad_norm": 1.3806493282318115, + "learning_rate": 0.00010146407344493186, + "epoch": 0.9703135591796033, + "step": 21650 + }, + { + "loss": 13.5986, + "grad_norm": 1.3425934314727783, + "learning_rate": 0.0001000525707838783, + "epoch": 0.9705376500708688, + "step": 21655 + }, + { + "loss": 13.6673, + "grad_norm": 1.3432285785675049, + "learning_rate": 9.864849469840822e-05, + "epoch": 0.9707617409621342, + "step": 21660 + }, + { + "loss": 13.6672, + "grad_norm": 1.3797509670257568, + "learning_rate": 9.725191472940837e-05, + "epoch": 0.9709858318533997, + "step": 21665 + }, + { + "loss": 13.6013, + "grad_norm": 1.3327072858810425, + "learning_rate": 9.586290004649867e-05, + "epoch": 0.9712099227446652, + "step": 21670 + }, + { + "loss": 13.6346, + "grad_norm": 1.3804453611373901, + "learning_rate": 9.448151944460656e-05, + "epoch": 0.9714340136359307, + "step": 21675 + }, + { + "loss": 13.5767, + "grad_norm": 1.3617254495620728, + "learning_rate": 9.31078413405601e-05, + "epoch": 0.9716581045271963, + "step": 21680 + }, + { + "loss": 13.5679, + "grad_norm": 1.3983831405639648, + "learning_rate": 9.174193376969866e-05, + "epoch": 0.9718821954184618, + "step": 21685 + }, + { + "loss": 13.5782, + "grad_norm": 1.3388102054595947, + "learning_rate": 9.038386438250415e-05, + "epoch": 0.9721062863097272, + "step": 21690 + }, + { + "loss": 13.5006, + "grad_norm": 1.3537145853042603, + "learning_rate": 8.903370044124967e-05, + "epoch": 0.9723303772009927, + "step": 21695 + }, + { + "loss": 13.5512, + "grad_norm": 1.3179186582565308, + "learning_rate": 8.769150881666851e-05, + "epoch": 0.9725544680922582, + "step": 21700 + }, + { + "loss": 13.5385, + "grad_norm": 1.3044551610946655, + "learning_rate": 8.635735598464243e-05, + "epoch": 0.9727785589835237, + "step": 21705 + }, + { + "loss": 13.5015, + "grad_norm": 1.3320732116699219, + "learning_rate": 8.503130802290862e-05, + "epoch": 0.9730026498747892, + "step": 21710 + }, + { + "loss": 13.457, + "grad_norm": 1.3160337209701538, + "learning_rate": 8.371343060778771e-05, + "epoch": 0.9732267407660548, + "step": 21715 + }, + { + "loss": 13.5789, + "grad_norm": 1.3159329891204834, + "learning_rate": 8.240378901093035e-05, + "epoch": 0.9734508316573202, + "step": 21720 + }, + { + "loss": 13.55, + "grad_norm": 1.3328814506530762, + "learning_rate": 8.110244809608495e-05, + "epoch": 0.9736749225485857, + "step": 21725 + }, + { + "loss": 13.47, + "grad_norm": 1.2326339483261108, + "learning_rate": 7.980947231588471e-05, + "epoch": 0.9738990134398512, + "step": 21730 + }, + { + "loss": 13.4876, + "grad_norm": 1.370176911354065, + "learning_rate": 7.852492570865557e-05, + "epoch": 0.9741231043311167, + "step": 21735 + }, + { + "loss": 13.4974, + "grad_norm": 1.4055640697479248, + "learning_rate": 7.724887189524485e-05, + "epoch": 0.9743471952223822, + "step": 21740 + }, + { + "loss": 13.4864, + "grad_norm": 1.3312071561813354, + "learning_rate": 7.598137407586958e-05, + "epoch": 0.9745712861136477, + "step": 21745 + }, + { + "loss": 13.5479, + "grad_norm": 1.2774869203567505, + "learning_rate": 7.472249502698686e-05, + "epoch": 0.9747953770049131, + "step": 21750 + }, + { + "loss": 13.5515, + "grad_norm": 1.2742916345596313, + "learning_rate": 7.347229709818453e-05, + "epoch": 0.9750194678961787, + "step": 21755 + }, + { + "loss": 13.6203, + "grad_norm": 1.309590220451355, + "learning_rate": 7.223084220909332e-05, + "epoch": 0.9752435587874442, + "step": 21760 + }, + { + "loss": 13.5797, + "grad_norm": 1.290747046470642, + "learning_rate": 7.099819184631928e-05, + "epoch": 0.9754676496787097, + "step": 21765 + }, + { + "loss": 13.5127, + "grad_norm": 1.37006676197052, + "learning_rate": 6.977440706039972e-05, + "epoch": 0.9756917405699752, + "step": 21770 + }, + { + "loss": 13.4896, + "grad_norm": 1.3133093118667603, + "learning_rate": 6.85595484627787e-05, + "epoch": 0.9759158314612407, + "step": 21775 + }, + { + "loss": 13.4632, + "grad_norm": 1.3016916513442993, + "learning_rate": 6.735367622280513e-05, + "epoch": 0.9761399223525061, + "step": 21780 + }, + { + "loss": 13.5153, + "grad_norm": 1.2847900390625, + "learning_rate": 6.615685006475284e-05, + "epoch": 0.9763640132437716, + "step": 21785 + }, + { + "loss": 13.5218, + "grad_norm": 1.3632951974868774, + "learning_rate": 6.496912926486279e-05, + "epoch": 0.9765881041350372, + "step": 21790 + }, + { + "loss": 13.5126, + "grad_norm": 1.2633512020111084, + "learning_rate": 6.379057264840679e-05, + "epoch": 0.9768121950263027, + "step": 21795 + }, + { + "loss": 13.5233, + "grad_norm": 1.3291096687316895, + "learning_rate": 6.262123858677426e-05, + "epoch": 0.9770362859175682, + "step": 21800 + }, + { + "loss": 13.5546, + "grad_norm": 1.2681336402893066, + "learning_rate": 6.146118499458131e-05, + "epoch": 0.9772603768088337, + "step": 21805 + }, + { + "loss": 13.5603, + "grad_norm": 1.3324332237243652, + "learning_rate": 6.0310469326802285e-05, + "epoch": 0.9774844677000991, + "step": 21810 + }, + { + "loss": 13.6537, + "grad_norm": 1.3532867431640625, + "learning_rate": 5.916914857592387e-05, + "epoch": 0.9777085585913646, + "step": 21815 + }, + { + "loss": 13.5717, + "grad_norm": 1.341113805770874, + "learning_rate": 5.803727926912269e-05, + "epoch": 0.9779326494826301, + "step": 21820 + }, + { + "loss": 13.5134, + "grad_norm": 1.4140442609786987, + "learning_rate": 5.691491746546573e-05, + "epoch": 0.9781567403738957, + "step": 21825 + }, + { + "loss": 13.5485, + "grad_norm": 1.2625435590744019, + "learning_rate": 5.580211875313346e-05, + "epoch": 0.9783808312651612, + "step": 21830 + }, + { + "loss": 13.4533, + "grad_norm": 1.3162455558776855, + "learning_rate": 5.469893824666686e-05, + "epoch": 0.9786049221564267, + "step": 21835 + }, + { + "loss": 13.4871, + "grad_norm": 1.3901221752166748, + "learning_rate": 5.3605430584238e-05, + "epoch": 0.9788290130476921, + "step": 21840 + }, + { + "loss": 13.5566, + "grad_norm": 1.3070961236953735, + "learning_rate": 5.252164992494338e-05, + "epoch": 0.9790531039389576, + "step": 21845 + }, + { + "loss": 13.5652, + "grad_norm": 1.3653221130371094, + "learning_rate": 5.1447649946122e-05, + "epoch": 0.9792771948302231, + "step": 21850 + }, + { + "loss": 13.4793, + "grad_norm": 1.3583656549453735, + "learning_rate": 5.038348384069663e-05, + "epoch": 0.9795012857214886, + "step": 21855 + }, + { + "loss": 13.5538, + "grad_norm": 1.2869338989257812, + "learning_rate": 4.9329204314539186e-05, + "epoch": 0.9797253766127542, + "step": 21860 + }, + { + "loss": 13.5843, + "grad_norm": 1.2691680192947388, + "learning_rate": 4.828486358386072e-05, + "epoch": 0.9799494675040197, + "step": 21865 + }, + { + "loss": 13.5242, + "grad_norm": 1.2550824880599976, + "learning_rate": 4.725051337262476e-05, + "epoch": 0.9801735583952851, + "step": 21870 + }, + { + "loss": 13.4907, + "grad_norm": 1.3409019708633423, + "learning_rate": 4.6226204909985777e-05, + "epoch": 0.9803976492865506, + "step": 21875 + }, + { + "loss": 13.5526, + "grad_norm": 1.2815920114517212, + "learning_rate": 4.521198892775202e-05, + "epoch": 0.9806217401778161, + "step": 21880 + }, + { + "loss": 13.4688, + "grad_norm": 1.2765311002731323, + "learning_rate": 4.420791565787288e-05, + "epoch": 0.9808458310690816, + "step": 21885 + }, + { + "loss": 13.4585, + "grad_norm": 1.2942756414413452, + "learning_rate": 4.3214034829950396e-05, + "epoch": 0.9810699219603471, + "step": 21890 + }, + { + "loss": 13.6144, + "grad_norm": 1.3017674684524536, + "learning_rate": 4.223039566877729e-05, + "epoch": 0.9812940128516127, + "step": 21895 + }, + { + "loss": 13.4723, + "grad_norm": 1.2312167882919312, + "learning_rate": 4.125704689189819e-05, + "epoch": 0.9815181037428781, + "step": 21900 + }, + { + "loss": 13.4998, + "grad_norm": 1.3000839948654175, + "learning_rate": 4.0294036707196945e-05, + "epoch": 0.9817421946341436, + "step": 21905 + }, + { + "loss": 13.5832, + "grad_norm": 1.3279622793197632, + "learning_rate": 3.93414128105091e-05, + "epoch": 0.9819662855254091, + "step": 21910 + }, + { + "loss": 13.4335, + "grad_norm": 1.3625671863555908, + "learning_rate": 3.83992223832596e-05, + "epoch": 0.9821903764166746, + "step": 21915 + }, + { + "loss": 13.4777, + "grad_norm": 1.2160567045211792, + "learning_rate": 3.7467512090126e-05, + "epoch": 0.9824144673079401, + "step": 21920 + }, + { + "loss": 13.5191, + "grad_norm": 1.3054990768432617, + "learning_rate": 3.654632807672695e-05, + "epoch": 0.9826385581992056, + "step": 21925 + }, + { + "loss": 13.5742, + "grad_norm": 1.334370732307434, + "learning_rate": 3.563571596733722e-05, + "epoch": 0.982862649090471, + "step": 21930 + }, + { + "loss": 13.6078, + "grad_norm": 1.3172982931137085, + "learning_rate": 3.473572086262783e-05, + "epoch": 0.9830867399817366, + "step": 21935 + }, + { + "loss": 13.4361, + "grad_norm": 1.299933671951294, + "learning_rate": 3.3846387337432034e-05, + "epoch": 0.9833108308730021, + "step": 21940 + }, + { + "loss": 13.547, + "grad_norm": 1.253339409828186, + "learning_rate": 3.2967759438537886e-05, + "epoch": 0.9835349217642676, + "step": 21945 + }, + { + "loss": 13.5014, + "grad_norm": 1.2532811164855957, + "learning_rate": 3.209988068250688e-05, + "epoch": 0.9837590126555331, + "step": 21950 + }, + { + "loss": 13.457, + "grad_norm": 1.2518850564956665, + "learning_rate": 3.1242794053518234e-05, + "epoch": 0.9839831035467986, + "step": 21955 + }, + { + "loss": 13.5006, + "grad_norm": 1.3577061891555786, + "learning_rate": 3.0396542001240145e-05, + "epoch": 0.984207194438064, + "step": 21960 + }, + { + "loss": 13.5796, + "grad_norm": 1.237846851348877, + "learning_rate": 2.9561166438727638e-05, + "epoch": 0.9844312853293296, + "step": 21965 + }, + { + "loss": 13.5357, + "grad_norm": 1.1801531314849854, + "learning_rate": 2.8736708740346146e-05, + "epoch": 0.9846553762205951, + "step": 21970 + }, + { + "loss": 13.5288, + "grad_norm": 1.3670539855957031, + "learning_rate": 2.7923209739722955e-05, + "epoch": 0.9848794671118606, + "step": 21975 + }, + { + "loss": 13.5071, + "grad_norm": 1.3142650127410889, + "learning_rate": 2.7120709727724207e-05, + "epoch": 0.9851035580031261, + "step": 21980 + }, + { + "loss": 13.5824, + "grad_norm": 1.2690343856811523, + "learning_rate": 2.632924845045975e-05, + "epoch": 0.9853276488943916, + "step": 21985 + }, + { + "loss": 13.4953, + "grad_norm": 1.2118849754333496, + "learning_rate": 2.5548865107314605e-05, + "epoch": 0.985551739785657, + "step": 21990 + }, + { + "loss": 13.458, + "grad_norm": 1.2431721687316895, + "learning_rate": 2.4779598349007227e-05, + "epoch": 0.9857758306769225, + "step": 21995 + }, + { + "loss": 13.5144, + "grad_norm": 1.2333933115005493, + "learning_rate": 2.402148627567555e-05, + "epoch": 0.985999921568188, + "step": 22000 + }, + { + "eval_loss": 1.6814430952072144, + "eval_runtime": 18.632, + "eval_samples_per_second": 879.348, + "eval_steps_per_second": 7.89, + "epoch": 0.985999921568188, + "step": 22000 + }, + { + "loss": 13.4373, + "grad_norm": 1.2443017959594727, + "learning_rate": 2.3274566434989626e-05, + "epoch": 0.9862240124594536, + "step": 22005 + }, + { + "loss": 13.4767, + "grad_norm": 1.2459810972213745, + "learning_rate": 2.2538875820292348e-05, + "epoch": 0.9864481033507191, + "step": 22010 + }, + { + "loss": 13.4842, + "grad_norm": 1.2842656373977661, + "learning_rate": 2.181445086876696e-05, + "epoch": 0.9866721942419846, + "step": 22015 + }, + { + "loss": 13.5223, + "grad_norm": 1.3632595539093018, + "learning_rate": 2.1101327459632445e-05, + "epoch": 0.98689628513325, + "step": 22020 + }, + { + "loss": 13.4355, + "grad_norm": 1.2427505254745483, + "learning_rate": 2.0399540912366675e-05, + "epoch": 0.9871203760245155, + "step": 22025 + }, + { + "loss": 13.6738, + "grad_norm": 1.2594168186187744, + "learning_rate": 1.970912598495689e-05, + "epoch": 0.987344466915781, + "step": 22030 + }, + { + "loss": 13.555, + "grad_norm": 1.211691975593567, + "learning_rate": 1.9030116872178316e-05, + "epoch": 0.9875685578070466, + "step": 22035 + }, + { + "loss": 13.5905, + "grad_norm": 1.255252480506897, + "learning_rate": 1.8362547203900625e-05, + "epoch": 0.9877926486983121, + "step": 22040 + }, + { + "loss": 13.4202, + "grad_norm": 1.2320276498794556, + "learning_rate": 1.7706450043422308e-05, + "epoch": 0.9880167395895776, + "step": 22045 + }, + { + "loss": 13.5955, + "grad_norm": 1.2293109893798828, + "learning_rate": 1.7061857885832893e-05, + "epoch": 0.988240830480843, + "step": 22050 + }, + { + "loss": 13.4793, + "grad_norm": 1.2270162105560303, + "learning_rate": 1.6428802656403842e-05, + "epoch": 0.9884649213721085, + "step": 22055 + }, + { + "loss": 13.4329, + "grad_norm": 1.2630739212036133, + "learning_rate": 1.580731570900723e-05, + "epoch": 0.988689012263374, + "step": 22060 + }, + { + "loss": 13.395, + "grad_norm": 1.3208829164505005, + "learning_rate": 1.519742782456282e-05, + "epoch": 0.9889131031546395, + "step": 22065 + }, + { + "loss": 13.4516, + "grad_norm": 1.225806474685669, + "learning_rate": 1.4599169209513568e-05, + "epoch": 0.989137194045905, + "step": 22070 + }, + { + "loss": 13.4567, + "grad_norm": 1.2742187976837158, + "learning_rate": 1.4012569494329664e-05, + "epoch": 0.9893612849371706, + "step": 22075 + }, + { + "loss": 13.5909, + "grad_norm": 1.2377104759216309, + "learning_rate": 1.3437657732040782e-05, + "epoch": 0.989585375828436, + "step": 22080 + }, + { + "loss": 13.5011, + "grad_norm": 1.279120683670044, + "learning_rate": 1.287446239679746e-05, + "epoch": 0.9898094667197015, + "step": 22085 + }, + { + "loss": 13.4, + "grad_norm": 1.2107067108154297, + "learning_rate": 1.232301138246042e-05, + "epoch": 0.990033557610967, + "step": 22090 + }, + { + "loss": 13.4454, + "grad_norm": 1.2431056499481201, + "learning_rate": 1.1783332001219533e-05, + "epoch": 0.9902576485022325, + "step": 22095 + }, + { + "loss": 13.4354, + "grad_norm": 1.2113475799560547, + "learning_rate": 1.1255450982240679e-05, + "epoch": 0.990481739393498, + "step": 22100 + }, + { + "loss": 13.4946, + "grad_norm": 1.2396379709243774, + "learning_rate": 1.0739394470342057e-05, + "epoch": 0.9907058302847636, + "step": 22105 + }, + { + "loss": 13.4944, + "grad_norm": 1.2629179954528809, + "learning_rate": 1.0235188024699471e-05, + "epoch": 0.990929921176029, + "step": 22110 + }, + { + "loss": 13.4847, + "grad_norm": 1.1953667402267456, + "learning_rate": 9.742856617580147e-06, + "epoch": 0.9911540120672945, + "step": 22115 + }, + { + "loss": 13.4528, + "grad_norm": 1.1882737874984741, + "learning_rate": 9.262424633106115e-06, + "epoch": 0.99137810295856, + "step": 22120 + }, + { + "loss": 13.5766, + "grad_norm": 1.2557233572006226, + "learning_rate": 8.793915866046358e-06, + "epoch": 0.9916021938498255, + "step": 22125 + }, + { + "loss": 13.5409, + "grad_norm": 1.2498853206634521, + "learning_rate": 8.337353520638468e-06, + "epoch": 0.991826284741091, + "step": 22130 + }, + { + "loss": 13.4702, + "grad_norm": 1.2380563020706177, + "learning_rate": 7.892760209439298e-06, + "epoch": 0.9920503756323564, + "step": 22135 + }, + { + "loss": 13.5216, + "grad_norm": 1.3622403144836426, + "learning_rate": 7.460157952205032e-06, + "epoch": 0.992274466523622, + "step": 22140 + }, + { + "loss": 13.4822, + "grad_norm": 1.223172903060913, + "learning_rate": 7.039568174800504e-06, + "epoch": 0.9924985574148875, + "step": 22145 + }, + { + "loss": 13.5725, + "grad_norm": 1.1964086294174194, + "learning_rate": 6.631011708138207e-06, + "epoch": 0.992722648306153, + "step": 22150 + }, + { + "loss": 13.3952, + "grad_norm": 1.2155392169952393, + "learning_rate": 6.234508787146543e-06, + "epoch": 0.9929467391974185, + "step": 22155 + }, + { + "loss": 13.5441, + "grad_norm": 1.1746368408203125, + "learning_rate": 5.850079049767309e-06, + "epoch": 0.993170830088684, + "step": 22160 + }, + { + "loss": 13.4357, + "grad_norm": 1.1912814378738403, + "learning_rate": 5.477741535983572e-06, + "epoch": 0.9933949209799494, + "step": 22165 + }, + { + "loss": 13.4152, + "grad_norm": 1.2137175798416138, + "learning_rate": 5.117514686876379e-06, + "epoch": 0.9936190118712149, + "step": 22170 + }, + { + "loss": 13.6085, + "grad_norm": 1.1917462348937988, + "learning_rate": 4.769416343711364e-06, + "epoch": 0.9938431027624804, + "step": 22175 + }, + { + "loss": 13.5033, + "grad_norm": 1.1859139204025269, + "learning_rate": 4.433463747055194e-06, + "epoch": 0.994067193653746, + "step": 22180 + }, + { + "loss": 13.4355, + "grad_norm": 1.1885240077972412, + "learning_rate": 4.10967353592176e-06, + "epoch": 0.9942912845450115, + "step": 22185 + }, + { + "loss": 13.4492, + "grad_norm": 1.2175772190093994, + "learning_rate": 3.798061746947995e-06, + "epoch": 0.994515375436277, + "step": 22190 + }, + { + "loss": 13.491, + "grad_norm": 1.2592718601226807, + "learning_rate": 3.498643813599517e-06, + "epoch": 0.9947394663275424, + "step": 22195 + }, + { + "loss": 13.4639, + "grad_norm": 1.1634012460708618, + "learning_rate": 3.211434565406457e-06, + "epoch": 0.9949635572188079, + "step": 22200 + }, + { + "loss": 13.5012, + "grad_norm": 1.233546257019043, + "learning_rate": 2.9364482272288273e-06, + "epoch": 0.9951876481100734, + "step": 22205 + }, + { + "loss": 13.4361, + "grad_norm": 1.2499737739562988, + "learning_rate": 2.6736984185520286e-06, + "epoch": 0.995411739001339, + "step": 22210 + }, + { + "loss": 13.4777, + "grad_norm": 1.1770788431167603, + "learning_rate": 2.423198152812306e-06, + "epoch": 0.9956358298926045, + "step": 22215 + }, + { + "loss": 13.4065, + "grad_norm": 1.223405361175537, + "learning_rate": 2.1849598367522926e-06, + "epoch": 0.99585992078387, + "step": 22220 + }, + { + "loss": 13.5333, + "grad_norm": 1.2335143089294434, + "learning_rate": 1.958995269806446e-06, + "epoch": 0.9960840116751354, + "step": 22225 + }, + { + "loss": 13.3936, + "grad_norm": 1.1790462732315063, + "learning_rate": 1.7453156435165983e-06, + "epoch": 0.9963081025664009, + "step": 22230 + }, + { + "loss": 13.4622, + "grad_norm": 1.1784507036209106, + "learning_rate": 1.5439315409778443e-06, + "epoch": 0.9965321934576664, + "step": 22235 + }, + { + "loss": 13.4254, + "grad_norm": 1.2225220203399658, + "learning_rate": 1.3548529363142104e-06, + "epoch": 0.9967562843489319, + "step": 22240 + }, + { + "loss": 13.588, + "grad_norm": 1.2711127996444702, + "learning_rate": 1.1780891941847448e-06, + "epoch": 0.9969803752401974, + "step": 22245 + }, + { + "loss": 13.5485, + "grad_norm": 1.226955771446228, + "learning_rate": 1.0136490693196666e-06, + "epoch": 0.997204466131463, + "step": 22250 + }, + { + "loss": 13.5763, + "grad_norm": 1.2618262767791748, + "learning_rate": 8.615407060867663e-07, + "epoch": 0.9974285570227284, + "step": 22255 + }, + { + "loss": 13.5191, + "grad_norm": 1.189131498336792, + "learning_rate": 7.217716380881478e-07, + "epoch": 0.9976526479139939, + "step": 22260 + }, + { + "loss": 13.3736, + "grad_norm": 1.207396149635315, + "learning_rate": 5.943487877868303e-07, + "epoch": 0.9978767388052594, + "step": 22265 + }, + { + "loss": 13.518, + "grad_norm": 1.1892287731170654, + "learning_rate": 4.792784661642458e-07, + "epoch": 0.9981008296965249, + "step": 22270 + }, + { + "loss": 13.5047, + "grad_norm": 1.2454142570495605, + "learning_rate": 3.7656637240732206e-07, + "epoch": 0.9983249205877904, + "step": 22275 + }, + { + "loss": 13.506, + "grad_norm": 1.1761161088943481, + "learning_rate": 2.862175936265421e-07, + "epoch": 0.998549011479056, + "step": 22280 + }, + { + "loss": 13.5602, + "grad_norm": 1.1850390434265137, + "learning_rate": 2.0823660460370098e-07, + "epoch": 0.9987731023703214, + "step": 22285 + }, + { + "loss": 13.3766, + "grad_norm": 1.171899676322937, + "learning_rate": 1.426272675704998e-07, + "epoch": 0.9989971932615869, + "step": 22290 + }, + { + "loss": 13.3907, + "grad_norm": 1.218133568763733, + "learning_rate": 8.939283201708782e-08, + "epoch": 0.9992212841528524, + "step": 22295 + }, + { + "loss": 13.3937, + "grad_norm": 1.1850651502609253, + "learning_rate": 4.8535934531274137e-08, + "epoch": 0.9994453750441179, + "step": 22300 + }, + { + "loss": 13.5193, + "grad_norm": 1.1924799680709839, + "learning_rate": 2.0058598667854756e-08, + "epoch": 0.9996694659353834, + "step": 22305 + }, + { + "loss": 13.5643, + "grad_norm": 1.1612070798873901, + "learning_rate": 3.962234848359225e-09, + "epoch": 0.9998935568266489, + "step": 22310 + }, + { + "train_runtime": 132800.8636, + "train_samples_per_second": 301.079, + "train_steps_per_second": 0.168, + "total_flos": 1.364299850107899e+19, + "train_loss": 14.025886625746066, + "epoch": 0.9999831931831551, + "step": 22312 + }, + { + "eval_loss": 1.681868314743042, + "eval_runtime": 18.5175, + "eval_samples_per_second": 884.782, + "eval_steps_per_second": 7.938, + "epoch": 0.9999831931831551, + "step": 22312 + } + ], + "best_metric": null, + "best_model_checkpoint": null, + "is_local_process_zero": true, + "is_world_process_zero": true, + "is_hyper_param_search": false, + "trial_name": null, + "trial_params": null, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_training_stop": true, + "should_epoch_stop": false, + "should_save": true, + "should_evaluate": false, + "should_log": false + }, + "attributes": {} + } + } +} \ No newline at end of file