diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.0, + "epoch": 2.2450288646568315, "eval_steps": 500, - "global_step": 4677, + "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -24507,8245 +24507,6 @@ "learning_rate": 1.2609813584743948e-05, "loss": 0.1258, "step": 3500 - }, - { - "epoch": 2.2456703014753048, - "grad_norm": 1.343820571899414, - "learning_rate": 1.2599100064281125e-05, - "loss": 0.1414, - "step": 3501 - }, - { - "epoch": 2.246311738293778, - "grad_norm": 1.2883403301239014, - "learning_rate": 1.2588386543818298e-05, - "loss": 0.1443, - "step": 3502 - }, - { - "epoch": 2.2469531751122513, - "grad_norm": 0.9681822061538696, - "learning_rate": 1.2577673023355474e-05, - "loss": 0.1235, - "step": 3503 - }, - { - "epoch": 2.2475946119307246, - "grad_norm": 1.2376668453216553, - "learning_rate": 1.256695950289265e-05, - "loss": 0.1272, - "step": 3504 - }, - { - "epoch": 2.2482360487491984, - "grad_norm": 1.5728809833526611, - "learning_rate": 1.2556245982429825e-05, - "loss": 0.1517, - "step": 3505 - }, - { - "epoch": 2.2488774855676716, - "grad_norm": 1.0207231044769287, - "learning_rate": 1.2545532461967002e-05, - "loss": 0.1425, - "step": 3506 - }, - { - "epoch": 2.249518922386145, - "grad_norm": 0.9559540152549744, - "learning_rate": 1.2534818941504178e-05, - "loss": 0.1329, - "step": 3507 - }, - { - "epoch": 2.250160359204618, - "grad_norm": 1.3080105781555176, - "learning_rate": 1.2524105421041355e-05, - "loss": 0.1366, - "step": 3508 - }, - { - "epoch": 2.250801796023092, - "grad_norm": 1.148154854774475, - "learning_rate": 1.2513391900578531e-05, - "loss": 0.1362, - "step": 3509 - }, - { - "epoch": 2.2514432328415652, - "grad_norm": 1.1086626052856445, - "learning_rate": 1.2502678380115706e-05, - "loss": 0.1336, - "step": 3510 - }, - { - "epoch": 2.2520846696600385, - "grad_norm": 1.549695372581482, - "learning_rate": 1.2491964859652883e-05, - "loss": 0.1426, - "step": 3511 - }, - { - "epoch": 2.252726106478512, - "grad_norm": 2.051417827606201, - "learning_rate": 1.2481251339190059e-05, - "loss": 0.1676, - "step": 3512 - }, - { - "epoch": 2.253367543296985, - "grad_norm": 1.178470492362976, - "learning_rate": 1.2470537818727235e-05, - "loss": 0.1342, - "step": 3513 - }, - { - "epoch": 2.254008980115459, - "grad_norm": 1.3067129850387573, - "learning_rate": 1.245982429826441e-05, - "loss": 0.1423, - "step": 3514 - }, - { - "epoch": 2.254650416933932, - "grad_norm": 1.076694369316101, - "learning_rate": 1.2449110777801585e-05, - "loss": 0.1259, - "step": 3515 - }, - { - "epoch": 2.2552918537524054, - "grad_norm": 1.3969265222549438, - "learning_rate": 1.2438397257338761e-05, - "loss": 0.1428, - "step": 3516 - }, - { - "epoch": 2.2559332905708787, - "grad_norm": 0.977289080619812, - "learning_rate": 1.2427683736875938e-05, - "loss": 0.1158, - "step": 3517 - }, - { - "epoch": 2.256574727389352, - "grad_norm": 1.0767934322357178, - "learning_rate": 1.2416970216413114e-05, - "loss": 0.1246, - "step": 3518 - }, - { - "epoch": 2.2572161642078257, - "grad_norm": 1.3229104280471802, - "learning_rate": 1.2406256695950289e-05, - "loss": 0.1372, - "step": 3519 - }, - { - "epoch": 2.257857601026299, - "grad_norm": 1.210260272026062, - "learning_rate": 1.2395543175487466e-05, - "loss": 0.1295, - "step": 3520 - }, - { - "epoch": 2.2584990378447722, - "grad_norm": 0.9593712091445923, - "learning_rate": 1.2384829655024642e-05, - "loss": 0.1285, - "step": 3521 - }, - { - "epoch": 2.2591404746632455, - "grad_norm": 1.119267463684082, - "learning_rate": 1.2374116134561818e-05, - "loss": 0.1207, - "step": 3522 - }, - { - "epoch": 2.259781911481719, - "grad_norm": 1.514831781387329, - "learning_rate": 1.2363402614098995e-05, - "loss": 0.1287, - "step": 3523 - }, - { - "epoch": 2.2604233483001925, - "grad_norm": 1.20852530002594, - "learning_rate": 1.235268909363617e-05, - "loss": 0.1374, - "step": 3524 - }, - { - "epoch": 2.261064785118666, - "grad_norm": 1.2531942129135132, - "learning_rate": 1.2341975573173344e-05, - "loss": 0.1368, - "step": 3525 - }, - { - "epoch": 2.261706221937139, - "grad_norm": 1.2361570596694946, - "learning_rate": 1.2331262052710521e-05, - "loss": 0.1328, - "step": 3526 - }, - { - "epoch": 2.2623476587556124, - "grad_norm": 1.0108190774917603, - "learning_rate": 1.2320548532247697e-05, - "loss": 0.1208, - "step": 3527 - }, - { - "epoch": 2.262989095574086, - "grad_norm": 1.2126853466033936, - "learning_rate": 1.2309835011784874e-05, - "loss": 0.1354, - "step": 3528 - }, - { - "epoch": 2.2636305323925594, - "grad_norm": 1.8107131719589233, - "learning_rate": 1.2299121491322049e-05, - "loss": 0.1496, - "step": 3529 - }, - { - "epoch": 2.2642719692110327, - "grad_norm": 1.3203986883163452, - "learning_rate": 1.2288407970859225e-05, - "loss": 0.1335, - "step": 3530 - }, - { - "epoch": 2.264913406029506, - "grad_norm": 1.6531140804290771, - "learning_rate": 1.2277694450396401e-05, - "loss": 0.1413, - "step": 3531 - }, - { - "epoch": 2.2655548428479797, - "grad_norm": 1.592700719833374, - "learning_rate": 1.2266980929933576e-05, - "loss": 0.1415, - "step": 3532 - }, - { - "epoch": 2.266196279666453, - "grad_norm": 0.7768902778625488, - "learning_rate": 1.2256267409470753e-05, - "loss": 0.1223, - "step": 3533 - }, - { - "epoch": 2.2668377164849263, - "grad_norm": 1.6260148286819458, - "learning_rate": 1.2245553889007929e-05, - "loss": 0.1513, - "step": 3534 - }, - { - "epoch": 2.2674791533033996, - "grad_norm": 1.1796766519546509, - "learning_rate": 1.2234840368545104e-05, - "loss": 0.1347, - "step": 3535 - }, - { - "epoch": 2.268120590121873, - "grad_norm": 1.4753679037094116, - "learning_rate": 1.222412684808228e-05, - "loss": 0.1482, - "step": 3536 - }, - { - "epoch": 2.2687620269403466, - "grad_norm": 1.4252656698226929, - "learning_rate": 1.2213413327619457e-05, - "loss": 0.1495, - "step": 3537 - }, - { - "epoch": 2.26940346375882, - "grad_norm": 1.9172002077102661, - "learning_rate": 1.2202699807156633e-05, - "loss": 0.1511, - "step": 3538 - }, - { - "epoch": 2.270044900577293, - "grad_norm": 1.4449925422668457, - "learning_rate": 1.2191986286693808e-05, - "loss": 0.1488, - "step": 3539 - }, - { - "epoch": 2.2706863373957664, - "grad_norm": 1.6024101972579956, - "learning_rate": 1.2181272766230984e-05, - "loss": 0.1601, - "step": 3540 - }, - { - "epoch": 2.2713277742142397, - "grad_norm": 1.2116180658340454, - "learning_rate": 1.217055924576816e-05, - "loss": 0.127, - "step": 3541 - }, - { - "epoch": 2.2719692110327134, - "grad_norm": 0.9268909692764282, - "learning_rate": 1.2159845725305336e-05, - "loss": 0.1253, - "step": 3542 - }, - { - "epoch": 2.2726106478511867, - "grad_norm": 1.1599345207214355, - "learning_rate": 1.2149132204842512e-05, - "loss": 0.1356, - "step": 3543 - }, - { - "epoch": 2.27325208466966, - "grad_norm": 1.3968887329101562, - "learning_rate": 1.2138418684379687e-05, - "loss": 0.1435, - "step": 3544 - }, - { - "epoch": 2.2738935214881333, - "grad_norm": 1.0545059442520142, - "learning_rate": 1.2127705163916863e-05, - "loss": 0.1324, - "step": 3545 - }, - { - "epoch": 2.2745349583066066, - "grad_norm": 1.2550338506698608, - "learning_rate": 1.211699164345404e-05, - "loss": 0.1345, - "step": 3546 - }, - { - "epoch": 2.2751763951250803, - "grad_norm": 1.1496870517730713, - "learning_rate": 1.2106278122991216e-05, - "loss": 0.1374, - "step": 3547 - }, - { - "epoch": 2.2758178319435536, - "grad_norm": 1.2752941846847534, - "learning_rate": 1.2095564602528393e-05, - "loss": 0.1342, - "step": 3548 - }, - { - "epoch": 2.276459268762027, - "grad_norm": 1.6423327922821045, - "learning_rate": 1.2084851082065567e-05, - "loss": 0.1522, - "step": 3549 - }, - { - "epoch": 2.2771007055805, - "grad_norm": 1.5230259895324707, - "learning_rate": 1.2074137561602742e-05, - "loss": 0.1405, - "step": 3550 - }, - { - "epoch": 2.277742142398974, - "grad_norm": 0.9093480706214905, - "learning_rate": 1.2063424041139919e-05, - "loss": 0.1236, - "step": 3551 - }, - { - "epoch": 2.278383579217447, - "grad_norm": 1.3939036130905151, - "learning_rate": 1.2052710520677095e-05, - "loss": 0.133, - "step": 3552 - }, - { - "epoch": 2.2790250160359204, - "grad_norm": 1.21937096118927, - "learning_rate": 1.2041997000214272e-05, - "loss": 0.1292, - "step": 3553 - }, - { - "epoch": 2.2796664528543937, - "grad_norm": 1.049047589302063, - "learning_rate": 1.2031283479751446e-05, - "loss": 0.1219, - "step": 3554 - }, - { - "epoch": 2.2803078896728675, - "grad_norm": 1.0619561672210693, - "learning_rate": 1.2020569959288623e-05, - "loss": 0.1391, - "step": 3555 - }, - { - "epoch": 2.2809493264913407, - "grad_norm": 1.1172096729278564, - "learning_rate": 1.20098564388258e-05, - "loss": 0.1344, - "step": 3556 - }, - { - "epoch": 2.281590763309814, - "grad_norm": 1.267316222190857, - "learning_rate": 1.1999142918362976e-05, - "loss": 0.1366, - "step": 3557 - }, - { - "epoch": 2.2822322001282873, - "grad_norm": 1.1255512237548828, - "learning_rate": 1.198842939790015e-05, - "loss": 0.1321, - "step": 3558 - }, - { - "epoch": 2.2828736369467606, - "grad_norm": 0.9083313345909119, - "learning_rate": 1.1977715877437325e-05, - "loss": 0.1319, - "step": 3559 - }, - { - "epoch": 2.2835150737652343, - "grad_norm": 1.0389313697814941, - "learning_rate": 1.1967002356974502e-05, - "loss": 0.1309, - "step": 3560 - }, - { - "epoch": 2.2841565105837076, - "grad_norm": 1.0437548160552979, - "learning_rate": 1.1956288836511678e-05, - "loss": 0.1333, - "step": 3561 - }, - { - "epoch": 2.284797947402181, - "grad_norm": 1.436987042427063, - "learning_rate": 1.1945575316048855e-05, - "loss": 0.1385, - "step": 3562 - }, - { - "epoch": 2.285439384220654, - "grad_norm": 1.075002908706665, - "learning_rate": 1.1934861795586031e-05, - "loss": 0.1366, - "step": 3563 - }, - { - "epoch": 2.2860808210391275, - "grad_norm": 1.377728819847107, - "learning_rate": 1.1924148275123206e-05, - "loss": 0.1325, - "step": 3564 - }, - { - "epoch": 2.286722257857601, - "grad_norm": 1.2234628200531006, - "learning_rate": 1.1913434754660382e-05, - "loss": 0.1292, - "step": 3565 - }, - { - "epoch": 2.2873636946760745, - "grad_norm": 1.559983730316162, - "learning_rate": 1.1902721234197559e-05, - "loss": 0.1448, - "step": 3566 - }, - { - "epoch": 2.2880051314945478, - "grad_norm": 1.212354063987732, - "learning_rate": 1.1892007713734733e-05, - "loss": 0.1395, - "step": 3567 - }, - { - "epoch": 2.288646568313021, - "grad_norm": 1.7442474365234375, - "learning_rate": 1.188129419327191e-05, - "loss": 0.1707, - "step": 3568 - }, - { - "epoch": 2.2892880051314943, - "grad_norm": 1.8707833290100098, - "learning_rate": 1.1870580672809085e-05, - "loss": 0.1586, - "step": 3569 - }, - { - "epoch": 2.289929441949968, - "grad_norm": 1.0937561988830566, - "learning_rate": 1.1859867152346261e-05, - "loss": 0.1304, - "step": 3570 - }, - { - "epoch": 2.2905708787684413, - "grad_norm": 1.1823759078979492, - "learning_rate": 1.1849153631883438e-05, - "loss": 0.1386, - "step": 3571 - }, - { - "epoch": 2.2912123155869146, - "grad_norm": 1.8961848020553589, - "learning_rate": 1.1838440111420614e-05, - "loss": 0.1495, - "step": 3572 - }, - { - "epoch": 2.291853752405388, - "grad_norm": 1.870692491531372, - "learning_rate": 1.182772659095779e-05, - "loss": 0.149, - "step": 3573 - }, - { - "epoch": 2.2924951892238616, - "grad_norm": 1.0497947931289673, - "learning_rate": 1.1817013070494965e-05, - "loss": 0.1261, - "step": 3574 - }, - { - "epoch": 2.293136626042335, - "grad_norm": 0.9906545877456665, - "learning_rate": 1.180629955003214e-05, - "loss": 0.1263, - "step": 3575 - }, - { - "epoch": 2.293778062860808, - "grad_norm": 1.0821454524993896, - "learning_rate": 1.1795586029569316e-05, - "loss": 0.1288, - "step": 3576 - }, - { - "epoch": 2.2944194996792815, - "grad_norm": 1.1945127248764038, - "learning_rate": 1.1784872509106493e-05, - "loss": 0.1349, - "step": 3577 - }, - { - "epoch": 2.2950609364977548, - "grad_norm": 1.39793062210083, - "learning_rate": 1.177415898864367e-05, - "loss": 0.1387, - "step": 3578 - }, - { - "epoch": 2.2957023733162285, - "grad_norm": 1.370193600654602, - "learning_rate": 1.1763445468180844e-05, - "loss": 0.1408, - "step": 3579 - }, - { - "epoch": 2.296343810134702, - "grad_norm": 0.9681302309036255, - "learning_rate": 1.175273194771802e-05, - "loss": 0.1266, - "step": 3580 - }, - { - "epoch": 2.296985246953175, - "grad_norm": 0.9366461634635925, - "learning_rate": 1.1742018427255197e-05, - "loss": 0.125, - "step": 3581 - }, - { - "epoch": 2.2976266837716484, - "grad_norm": 1.2744871377944946, - "learning_rate": 1.1731304906792374e-05, - "loss": 0.1325, - "step": 3582 - }, - { - "epoch": 2.298268120590122, - "grad_norm": 1.5726466178894043, - "learning_rate": 1.1720591386329548e-05, - "loss": 0.1307, - "step": 3583 - }, - { - "epoch": 2.2989095574085954, - "grad_norm": 1.0854864120483398, - "learning_rate": 1.1709877865866723e-05, - "loss": 0.1412, - "step": 3584 - }, - { - "epoch": 2.2995509942270687, - "grad_norm": 0.8869479298591614, - "learning_rate": 1.16991643454039e-05, - "loss": 0.123, - "step": 3585 - }, - { - "epoch": 2.300192431045542, - "grad_norm": 1.316297173500061, - "learning_rate": 1.1688450824941076e-05, - "loss": 0.1299, - "step": 3586 - }, - { - "epoch": 2.3008338678640152, - "grad_norm": 1.1788965463638306, - "learning_rate": 1.1677737304478252e-05, - "loss": 0.1316, - "step": 3587 - }, - { - "epoch": 2.301475304682489, - "grad_norm": 0.9574903249740601, - "learning_rate": 1.1667023784015429e-05, - "loss": 0.1211, - "step": 3588 - }, - { - "epoch": 2.3021167415009622, - "grad_norm": 1.262912631034851, - "learning_rate": 1.1656310263552604e-05, - "loss": 0.137, - "step": 3589 - }, - { - "epoch": 2.3027581783194355, - "grad_norm": 1.4609860181808472, - "learning_rate": 1.164559674308978e-05, - "loss": 0.1401, - "step": 3590 - }, - { - "epoch": 2.303399615137909, - "grad_norm": 1.1162687540054321, - "learning_rate": 1.1634883222626957e-05, - "loss": 0.1205, - "step": 3591 - }, - { - "epoch": 2.304041051956382, - "grad_norm": 1.2020783424377441, - "learning_rate": 1.1624169702164131e-05, - "loss": 0.1367, - "step": 3592 - }, - { - "epoch": 2.304682488774856, - "grad_norm": 1.0615650415420532, - "learning_rate": 1.1613456181701308e-05, - "loss": 0.1262, - "step": 3593 - }, - { - "epoch": 2.305323925593329, - "grad_norm": 1.483933925628662, - "learning_rate": 1.1602742661238483e-05, - "loss": 0.1597, - "step": 3594 - }, - { - "epoch": 2.3059653624118024, - "grad_norm": 1.4642996788024902, - "learning_rate": 1.1592029140775659e-05, - "loss": 0.1248, - "step": 3595 - }, - { - "epoch": 2.3066067992302757, - "grad_norm": 1.1310981512069702, - "learning_rate": 1.1581315620312835e-05, - "loss": 0.1256, - "step": 3596 - }, - { - "epoch": 2.307248236048749, - "grad_norm": 1.2064452171325684, - "learning_rate": 1.1570602099850012e-05, - "loss": 0.1435, - "step": 3597 - }, - { - "epoch": 2.3078896728672227, - "grad_norm": 1.3887256383895874, - "learning_rate": 1.1559888579387188e-05, - "loss": 0.1371, - "step": 3598 - }, - { - "epoch": 2.308531109685696, - "grad_norm": 1.308091640472412, - "learning_rate": 1.1549175058924363e-05, - "loss": 0.1448, - "step": 3599 - }, - { - "epoch": 2.3091725465041693, - "grad_norm": 1.1406121253967285, - "learning_rate": 1.153846153846154e-05, - "loss": 0.1298, - "step": 3600 - }, - { - "epoch": 2.3098139833226425, - "grad_norm": 1.4288982152938843, - "learning_rate": 1.1527748017998714e-05, - "loss": 0.1399, - "step": 3601 - }, - { - "epoch": 2.3104554201411163, - "grad_norm": 1.2366622686386108, - "learning_rate": 1.151703449753589e-05, - "loss": 0.1381, - "step": 3602 - }, - { - "epoch": 2.3110968569595896, - "grad_norm": 1.2820683717727661, - "learning_rate": 1.1506320977073067e-05, - "loss": 0.1364, - "step": 3603 - }, - { - "epoch": 2.311738293778063, - "grad_norm": 0.8005712628364563, - "learning_rate": 1.1495607456610242e-05, - "loss": 0.1235, - "step": 3604 - }, - { - "epoch": 2.312379730596536, - "grad_norm": 1.7405678033828735, - "learning_rate": 1.1484893936147418e-05, - "loss": 0.1514, - "step": 3605 - }, - { - "epoch": 2.31302116741501, - "grad_norm": 1.4292676448822021, - "learning_rate": 1.1474180415684595e-05, - "loss": 0.1279, - "step": 3606 - }, - { - "epoch": 2.313662604233483, - "grad_norm": 1.0892951488494873, - "learning_rate": 1.1463466895221771e-05, - "loss": 0.126, - "step": 3607 - }, - { - "epoch": 2.3143040410519564, - "grad_norm": 1.4147040843963623, - "learning_rate": 1.1452753374758946e-05, - "loss": 0.1482, - "step": 3608 - }, - { - "epoch": 2.3149454778704297, - "grad_norm": 1.0950175523757935, - "learning_rate": 1.1442039854296121e-05, - "loss": 0.126, - "step": 3609 - }, - { - "epoch": 2.315586914688903, - "grad_norm": 1.4065486192703247, - "learning_rate": 1.1431326333833297e-05, - "loss": 0.1435, - "step": 3610 - }, - { - "epoch": 2.3162283515073767, - "grad_norm": 1.2238367795944214, - "learning_rate": 1.1420612813370474e-05, - "loss": 0.1405, - "step": 3611 - }, - { - "epoch": 2.31686978832585, - "grad_norm": 1.8444880247116089, - "learning_rate": 1.140989929290765e-05, - "loss": 0.1327, - "step": 3612 - }, - { - "epoch": 2.3175112251443233, - "grad_norm": 1.2983936071395874, - "learning_rate": 1.1399185772444827e-05, - "loss": 0.1372, - "step": 3613 - }, - { - "epoch": 2.3181526619627966, - "grad_norm": 1.221827507019043, - "learning_rate": 1.1388472251982001e-05, - "loss": 0.1353, - "step": 3614 - }, - { - "epoch": 2.31879409878127, - "grad_norm": 1.5909416675567627, - "learning_rate": 1.1377758731519178e-05, - "loss": 0.135, - "step": 3615 - }, - { - "epoch": 2.3194355355997436, - "grad_norm": 1.1128482818603516, - "learning_rate": 1.1367045211056354e-05, - "loss": 0.1271, - "step": 3616 - }, - { - "epoch": 2.320076972418217, - "grad_norm": 1.2014174461364746, - "learning_rate": 1.135633169059353e-05, - "loss": 0.1331, - "step": 3617 - }, - { - "epoch": 2.32071840923669, - "grad_norm": 1.3233158588409424, - "learning_rate": 1.1345618170130706e-05, - "loss": 0.1417, - "step": 3618 - }, - { - "epoch": 2.3213598460551634, - "grad_norm": 1.8434275388717651, - "learning_rate": 1.133490464966788e-05, - "loss": 0.1485, - "step": 3619 - }, - { - "epoch": 2.3220012828736367, - "grad_norm": 1.0675415992736816, - "learning_rate": 1.1324191129205057e-05, - "loss": 0.1226, - "step": 3620 - }, - { - "epoch": 2.3226427196921104, - "grad_norm": 1.847012996673584, - "learning_rate": 1.1313477608742233e-05, - "loss": 0.1575, - "step": 3621 - }, - { - "epoch": 2.3232841565105837, - "grad_norm": 1.2024664878845215, - "learning_rate": 1.130276408827941e-05, - "loss": 0.1291, - "step": 3622 - }, - { - "epoch": 2.323925593329057, - "grad_norm": 1.78941011428833, - "learning_rate": 1.1292050567816584e-05, - "loss": 0.144, - "step": 3623 - }, - { - "epoch": 2.3245670301475303, - "grad_norm": 1.0731127262115479, - "learning_rate": 1.1281337047353761e-05, - "loss": 0.1312, - "step": 3624 - }, - { - "epoch": 2.325208466966004, - "grad_norm": 1.1655268669128418, - "learning_rate": 1.1270623526890937e-05, - "loss": 0.1276, - "step": 3625 - }, - { - "epoch": 2.3258499037844773, - "grad_norm": 1.7216315269470215, - "learning_rate": 1.1259910006428114e-05, - "loss": 0.1432, - "step": 3626 - }, - { - "epoch": 2.3264913406029506, - "grad_norm": 1.358201026916504, - "learning_rate": 1.1249196485965289e-05, - "loss": 0.131, - "step": 3627 - }, - { - "epoch": 2.327132777421424, - "grad_norm": 1.405027151107788, - "learning_rate": 1.1238482965502465e-05, - "loss": 0.1318, - "step": 3628 - }, - { - "epoch": 2.3277742142398976, - "grad_norm": 1.713647484779358, - "learning_rate": 1.122776944503964e-05, - "loss": 0.1481, - "step": 3629 - }, - { - "epoch": 2.328415651058371, - "grad_norm": 1.5398471355438232, - "learning_rate": 1.1217055924576816e-05, - "loss": 0.1383, - "step": 3630 - }, - { - "epoch": 2.329057087876844, - "grad_norm": 1.6881943941116333, - "learning_rate": 1.1206342404113993e-05, - "loss": 0.136, - "step": 3631 - }, - { - "epoch": 2.3296985246953175, - "grad_norm": 1.251266360282898, - "learning_rate": 1.1195628883651169e-05, - "loss": 0.1249, - "step": 3632 - }, - { - "epoch": 2.3303399615137907, - "grad_norm": 1.0706398487091064, - "learning_rate": 1.1184915363188344e-05, - "loss": 0.1158, - "step": 3633 - }, - { - "epoch": 2.3309813983322645, - "grad_norm": 1.0507930517196655, - "learning_rate": 1.117420184272552e-05, - "loss": 0.1267, - "step": 3634 - }, - { - "epoch": 2.3316228351507378, - "grad_norm": 1.0079911947250366, - "learning_rate": 1.1163488322262695e-05, - "loss": 0.1251, - "step": 3635 - }, - { - "epoch": 2.332264271969211, - "grad_norm": 1.480043888092041, - "learning_rate": 1.1152774801799872e-05, - "loss": 0.1369, - "step": 3636 - }, - { - "epoch": 2.3329057087876843, - "grad_norm": 1.0095016956329346, - "learning_rate": 1.1142061281337048e-05, - "loss": 0.1211, - "step": 3637 - }, - { - "epoch": 2.3335471456061576, - "grad_norm": 1.3667168617248535, - "learning_rate": 1.1131347760874224e-05, - "loss": 0.1291, - "step": 3638 - }, - { - "epoch": 2.3341885824246313, - "grad_norm": 0.9312577843666077, - "learning_rate": 1.11206342404114e-05, - "loss": 0.1249, - "step": 3639 - }, - { - "epoch": 2.3348300192431046, - "grad_norm": 1.2562997341156006, - "learning_rate": 1.1109920719948576e-05, - "loss": 0.1272, - "step": 3640 - }, - { - "epoch": 2.335471456061578, - "grad_norm": 2.322368621826172, - "learning_rate": 1.1099207199485752e-05, - "loss": 0.1586, - "step": 3641 - }, - { - "epoch": 2.336112892880051, - "grad_norm": 1.1749436855316162, - "learning_rate": 1.1088493679022929e-05, - "loss": 0.1212, - "step": 3642 - }, - { - "epoch": 2.3367543296985245, - "grad_norm": 1.1550500392913818, - "learning_rate": 1.1077780158560103e-05, - "loss": 0.1303, - "step": 3643 - }, - { - "epoch": 2.337395766516998, - "grad_norm": 1.0861376523971558, - "learning_rate": 1.1067066638097278e-05, - "loss": 0.1305, - "step": 3644 - }, - { - "epoch": 2.3380372033354715, - "grad_norm": 1.7784297466278076, - "learning_rate": 1.1056353117634455e-05, - "loss": 0.1466, - "step": 3645 - }, - { - "epoch": 2.3386786401539448, - "grad_norm": 1.1985198259353638, - "learning_rate": 1.1045639597171631e-05, - "loss": 0.1322, - "step": 3646 - }, - { - "epoch": 2.339320076972418, - "grad_norm": 1.6327385902404785, - "learning_rate": 1.1034926076708807e-05, - "loss": 0.1467, - "step": 3647 - }, - { - "epoch": 2.339961513790892, - "grad_norm": 1.3304439783096313, - "learning_rate": 1.1024212556245982e-05, - "loss": 0.1263, - "step": 3648 - }, - { - "epoch": 2.340602950609365, - "grad_norm": 1.2341558933258057, - "learning_rate": 1.1013499035783159e-05, - "loss": 0.1292, - "step": 3649 - }, - { - "epoch": 2.3412443874278384, - "grad_norm": 1.2831017971038818, - "learning_rate": 1.1002785515320335e-05, - "loss": 0.1342, - "step": 3650 - }, - { - "epoch": 2.3418858242463116, - "grad_norm": 1.352304458618164, - "learning_rate": 1.0992071994857512e-05, - "loss": 0.1491, - "step": 3651 - }, - { - "epoch": 2.342527261064785, - "grad_norm": 1.0500259399414062, - "learning_rate": 1.0981358474394686e-05, - "loss": 0.1294, - "step": 3652 - }, - { - "epoch": 2.3431686978832587, - "grad_norm": 1.2565025091171265, - "learning_rate": 1.0970644953931863e-05, - "loss": 0.1387, - "step": 3653 - }, - { - "epoch": 2.343810134701732, - "grad_norm": 1.1134988069534302, - "learning_rate": 1.0959931433469038e-05, - "loss": 0.1264, - "step": 3654 - }, - { - "epoch": 2.3444515715202052, - "grad_norm": 1.5018539428710938, - "learning_rate": 1.0949217913006214e-05, - "loss": 0.1351, - "step": 3655 - }, - { - "epoch": 2.3450930083386785, - "grad_norm": 1.5818486213684082, - "learning_rate": 1.093850439254339e-05, - "loss": 0.1511, - "step": 3656 - }, - { - "epoch": 2.3457344451571522, - "grad_norm": 1.3301721811294556, - "learning_rate": 1.0927790872080567e-05, - "loss": 0.1434, - "step": 3657 - }, - { - "epoch": 2.3463758819756255, - "grad_norm": 0.9867799282073975, - "learning_rate": 1.0917077351617742e-05, - "loss": 0.1266, - "step": 3658 - }, - { - "epoch": 2.347017318794099, - "grad_norm": 1.3231115341186523, - "learning_rate": 1.0906363831154918e-05, - "loss": 0.1387, - "step": 3659 - }, - { - "epoch": 2.347658755612572, - "grad_norm": 1.102192997932434, - "learning_rate": 1.0895650310692095e-05, - "loss": 0.1353, - "step": 3660 - }, - { - "epoch": 2.3483001924310454, - "grad_norm": 1.153701663017273, - "learning_rate": 1.088493679022927e-05, - "loss": 0.1459, - "step": 3661 - }, - { - "epoch": 2.348941629249519, - "grad_norm": 1.0102808475494385, - "learning_rate": 1.0874223269766446e-05, - "loss": 0.1187, - "step": 3662 - }, - { - "epoch": 2.3495830660679924, - "grad_norm": 1.116145133972168, - "learning_rate": 1.086350974930362e-05, - "loss": 0.1257, - "step": 3663 - }, - { - "epoch": 2.3502245028864657, - "grad_norm": 1.658619999885559, - "learning_rate": 1.0852796228840797e-05, - "loss": 0.1672, - "step": 3664 - }, - { - "epoch": 2.350865939704939, - "grad_norm": 1.149686336517334, - "learning_rate": 1.0842082708377974e-05, - "loss": 0.1327, - "step": 3665 - }, - { - "epoch": 2.3515073765234122, - "grad_norm": 1.0274324417114258, - "learning_rate": 1.083136918791515e-05, - "loss": 0.1247, - "step": 3666 - }, - { - "epoch": 2.352148813341886, - "grad_norm": 1.2767846584320068, - "learning_rate": 1.0820655667452326e-05, - "loss": 0.1378, - "step": 3667 - }, - { - "epoch": 2.3527902501603593, - "grad_norm": 1.57766854763031, - "learning_rate": 1.0809942146989501e-05, - "loss": 0.1404, - "step": 3668 - }, - { - "epoch": 2.3534316869788325, - "grad_norm": 2.4606213569641113, - "learning_rate": 1.0799228626526676e-05, - "loss": 0.123, - "step": 3669 - }, - { - "epoch": 2.354073123797306, - "grad_norm": 1.4597201347351074, - "learning_rate": 1.0788515106063852e-05, - "loss": 0.1498, - "step": 3670 - }, - { - "epoch": 2.354714560615779, - "grad_norm": 1.6955223083496094, - "learning_rate": 1.0777801585601029e-05, - "loss": 0.154, - "step": 3671 - }, - { - "epoch": 2.355355997434253, - "grad_norm": 1.3732115030288696, - "learning_rate": 1.0767088065138205e-05, - "loss": 0.1399, - "step": 3672 - }, - { - "epoch": 2.355997434252726, - "grad_norm": 1.0354048013687134, - "learning_rate": 1.075637454467538e-05, - "loss": 0.1255, - "step": 3673 - }, - { - "epoch": 2.3566388710711994, - "grad_norm": 1.3246309757232666, - "learning_rate": 1.0745661024212557e-05, - "loss": 0.1366, - "step": 3674 - }, - { - "epoch": 2.3572803078896727, - "grad_norm": 1.4105384349822998, - "learning_rate": 1.0734947503749733e-05, - "loss": 0.1461, - "step": 3675 - }, - { - "epoch": 2.3579217447081464, - "grad_norm": 1.2340402603149414, - "learning_rate": 1.072423398328691e-05, - "loss": 0.1388, - "step": 3676 - }, - { - "epoch": 2.3585631815266197, - "grad_norm": 1.5958229303359985, - "learning_rate": 1.0713520462824086e-05, - "loss": 0.1408, - "step": 3677 - }, - { - "epoch": 2.359204618345093, - "grad_norm": 0.8171095252037048, - "learning_rate": 1.070280694236126e-05, - "loss": 0.1158, - "step": 3678 - }, - { - "epoch": 2.3598460551635663, - "grad_norm": 1.0669353008270264, - "learning_rate": 1.0692093421898435e-05, - "loss": 0.1247, - "step": 3679 - }, - { - "epoch": 2.36048749198204, - "grad_norm": 1.3983041048049927, - "learning_rate": 1.0681379901435612e-05, - "loss": 0.1435, - "step": 3680 - }, - { - "epoch": 2.3611289288005133, - "grad_norm": 1.143537163734436, - "learning_rate": 1.0670666380972788e-05, - "loss": 0.1281, - "step": 3681 - }, - { - "epoch": 2.3617703656189866, - "grad_norm": 1.4105442762374878, - "learning_rate": 1.0659952860509965e-05, - "loss": 0.1354, - "step": 3682 - }, - { - "epoch": 2.36241180243746, - "grad_norm": 0.916313886642456, - "learning_rate": 1.064923934004714e-05, - "loss": 0.1246, - "step": 3683 - }, - { - "epoch": 2.363053239255933, - "grad_norm": 1.3472392559051514, - "learning_rate": 1.0638525819584316e-05, - "loss": 0.1291, - "step": 3684 - }, - { - "epoch": 2.363694676074407, - "grad_norm": 1.2620917558670044, - "learning_rate": 1.0627812299121492e-05, - "loss": 0.1362, - "step": 3685 - }, - { - "epoch": 2.36433611289288, - "grad_norm": 1.0256260633468628, - "learning_rate": 1.0617098778658669e-05, - "loss": 0.128, - "step": 3686 - }, - { - "epoch": 2.3649775497113534, - "grad_norm": 1.101456880569458, - "learning_rate": 1.0606385258195844e-05, - "loss": 0.1247, - "step": 3687 - }, - { - "epoch": 2.3656189865298267, - "grad_norm": 0.9631907939910889, - "learning_rate": 1.0595671737733018e-05, - "loss": 0.1339, - "step": 3688 - }, - { - "epoch": 2.3662604233483, - "grad_norm": 1.311240792274475, - "learning_rate": 1.0584958217270195e-05, - "loss": 0.1241, - "step": 3689 - }, - { - "epoch": 2.3669018601667737, - "grad_norm": 1.604059100151062, - "learning_rate": 1.0574244696807371e-05, - "loss": 0.1468, - "step": 3690 - }, - { - "epoch": 2.367543296985247, - "grad_norm": 1.1396950483322144, - "learning_rate": 1.0563531176344548e-05, - "loss": 0.1276, - "step": 3691 - }, - { - "epoch": 2.3681847338037203, - "grad_norm": 1.4737842082977295, - "learning_rate": 1.0552817655881724e-05, - "loss": 0.1473, - "step": 3692 - }, - { - "epoch": 2.3688261706221936, - "grad_norm": 1.412882685661316, - "learning_rate": 1.0542104135418899e-05, - "loss": 0.1495, - "step": 3693 - }, - { - "epoch": 2.369467607440667, - "grad_norm": 1.1212289333343506, - "learning_rate": 1.0531390614956075e-05, - "loss": 0.1331, - "step": 3694 - }, - { - "epoch": 2.3701090442591406, - "grad_norm": 1.3977458477020264, - "learning_rate": 1.052067709449325e-05, - "loss": 0.1441, - "step": 3695 - }, - { - "epoch": 2.370750481077614, - "grad_norm": 1.0669503211975098, - "learning_rate": 1.0509963574030427e-05, - "loss": 0.1395, - "step": 3696 - }, - { - "epoch": 2.371391917896087, - "grad_norm": 1.1889338493347168, - "learning_rate": 1.0499250053567603e-05, - "loss": 0.1453, - "step": 3697 - }, - { - "epoch": 2.3720333547145604, - "grad_norm": 1.6164346933364868, - "learning_rate": 1.0488536533104778e-05, - "loss": 0.1544, - "step": 3698 - }, - { - "epoch": 2.372674791533034, - "grad_norm": 1.1862961053848267, - "learning_rate": 1.0477823012641954e-05, - "loss": 0.1327, - "step": 3699 - }, - { - "epoch": 2.3733162283515075, - "grad_norm": 1.3558863401412964, - "learning_rate": 1.046710949217913e-05, - "loss": 0.1354, - "step": 3700 - }, - { - "epoch": 2.3739576651699807, - "grad_norm": 1.2362253665924072, - "learning_rate": 1.0456395971716307e-05, - "loss": 0.1298, - "step": 3701 - }, - { - "epoch": 2.374599101988454, - "grad_norm": 1.4661738872528076, - "learning_rate": 1.0445682451253484e-05, - "loss": 0.1428, - "step": 3702 - }, - { - "epoch": 2.3752405388069278, - "grad_norm": 1.5483055114746094, - "learning_rate": 1.0434968930790658e-05, - "loss": 0.1287, - "step": 3703 - }, - { - "epoch": 2.375881975625401, - "grad_norm": 1.1320013999938965, - "learning_rate": 1.0424255410327833e-05, - "loss": 0.1272, - "step": 3704 - }, - { - "epoch": 2.3765234124438743, - "grad_norm": 1.301568865776062, - "learning_rate": 1.041354188986501e-05, - "loss": 0.1446, - "step": 3705 - }, - { - "epoch": 2.3771648492623476, - "grad_norm": 1.3751112222671509, - "learning_rate": 1.0402828369402186e-05, - "loss": 0.136, - "step": 3706 - }, - { - "epoch": 2.377806286080821, - "grad_norm": 1.185622215270996, - "learning_rate": 1.0392114848939363e-05, - "loss": 0.1269, - "step": 3707 - }, - { - "epoch": 2.3784477228992946, - "grad_norm": 1.781885027885437, - "learning_rate": 1.0381401328476537e-05, - "loss": 0.1521, - "step": 3708 - }, - { - "epoch": 2.379089159717768, - "grad_norm": 1.4246711730957031, - "learning_rate": 1.0370687808013714e-05, - "loss": 0.1335, - "step": 3709 - }, - { - "epoch": 2.379730596536241, - "grad_norm": 0.7511674761772156, - "learning_rate": 1.035997428755089e-05, - "loss": 0.1155, - "step": 3710 - }, - { - "epoch": 2.3803720333547145, - "grad_norm": 1.2522021532058716, - "learning_rate": 1.0349260767088067e-05, - "loss": 0.1366, - "step": 3711 - }, - { - "epoch": 2.3810134701731878, - "grad_norm": 1.3569440841674805, - "learning_rate": 1.0338547246625241e-05, - "loss": 0.1446, - "step": 3712 - }, - { - "epoch": 2.3816549069916615, - "grad_norm": 1.075325608253479, - "learning_rate": 1.0327833726162416e-05, - "loss": 0.1395, - "step": 3713 - }, - { - "epoch": 2.3822963438101348, - "grad_norm": 1.553470492362976, - "learning_rate": 1.0317120205699593e-05, - "loss": 0.137, - "step": 3714 - }, - { - "epoch": 2.382937780628608, - "grad_norm": 0.9834217429161072, - "learning_rate": 1.0306406685236769e-05, - "loss": 0.1391, - "step": 3715 - }, - { - "epoch": 2.3835792174470813, - "grad_norm": 1.2843098640441895, - "learning_rate": 1.0295693164773946e-05, - "loss": 0.1316, - "step": 3716 - }, - { - "epoch": 2.3842206542655546, - "grad_norm": 0.9934973120689392, - "learning_rate": 1.0284979644311122e-05, - "loss": 0.1311, - "step": 3717 - }, - { - "epoch": 2.3848620910840284, - "grad_norm": 1.2121882438659668, - "learning_rate": 1.0274266123848297e-05, - "loss": 0.1341, - "step": 3718 - }, - { - "epoch": 2.3855035279025016, - "grad_norm": 1.1913326978683472, - "learning_rate": 1.0263552603385473e-05, - "loss": 0.1349, - "step": 3719 - }, - { - "epoch": 2.386144964720975, - "grad_norm": 1.069502353668213, - "learning_rate": 1.025283908292265e-05, - "loss": 0.134, - "step": 3720 - }, - { - "epoch": 2.386786401539448, - "grad_norm": 1.1953293085098267, - "learning_rate": 1.0242125562459824e-05, - "loss": 0.1338, - "step": 3721 - }, - { - "epoch": 2.387427838357922, - "grad_norm": 1.655177354812622, - "learning_rate": 1.0231412041997001e-05, - "loss": 0.1455, - "step": 3722 - }, - { - "epoch": 2.3880692751763952, - "grad_norm": 1.027678370475769, - "learning_rate": 1.0220698521534176e-05, - "loss": 0.1374, - "step": 3723 - }, - { - "epoch": 2.3887107119948685, - "grad_norm": 0.9876590967178345, - "learning_rate": 1.0209985001071352e-05, - "loss": 0.1301, - "step": 3724 - }, - { - "epoch": 2.389352148813342, - "grad_norm": 1.4161688089370728, - "learning_rate": 1.0199271480608529e-05, - "loss": 0.1332, - "step": 3725 - }, - { - "epoch": 2.389993585631815, - "grad_norm": 0.8615648150444031, - "learning_rate": 1.0188557960145705e-05, - "loss": 0.1186, - "step": 3726 - }, - { - "epoch": 2.390635022450289, - "grad_norm": 1.078986406326294, - "learning_rate": 1.017784443968288e-05, - "loss": 0.1268, - "step": 3727 - }, - { - "epoch": 2.391276459268762, - "grad_norm": 1.8413230180740356, - "learning_rate": 1.0167130919220056e-05, - "loss": 0.1645, - "step": 3728 - }, - { - "epoch": 2.3919178960872354, - "grad_norm": 1.0729998350143433, - "learning_rate": 1.0156417398757233e-05, - "loss": 0.1386, - "step": 3729 - }, - { - "epoch": 2.3925593329057087, - "grad_norm": 1.2341793775558472, - "learning_rate": 1.0145703878294407e-05, - "loss": 0.1342, - "step": 3730 - }, - { - "epoch": 2.3932007697241824, - "grad_norm": 1.4468939304351807, - "learning_rate": 1.0134990357831584e-05, - "loss": 0.1537, - "step": 3731 - }, - { - "epoch": 2.3938422065426557, - "grad_norm": 1.2184746265411377, - "learning_rate": 1.012427683736876e-05, - "loss": 0.1349, - "step": 3732 - }, - { - "epoch": 2.394483643361129, - "grad_norm": 1.9532498121261597, - "learning_rate": 1.0113563316905935e-05, - "loss": 0.1559, - "step": 3733 - }, - { - "epoch": 2.3951250801796022, - "grad_norm": 0.7203018665313721, - "learning_rate": 1.0102849796443112e-05, - "loss": 0.1142, - "step": 3734 - }, - { - "epoch": 2.3957665169980755, - "grad_norm": 1.0794315338134766, - "learning_rate": 1.0092136275980288e-05, - "loss": 0.1342, - "step": 3735 - }, - { - "epoch": 2.3964079538165493, - "grad_norm": 1.2962615489959717, - "learning_rate": 1.0081422755517465e-05, - "loss": 0.1396, - "step": 3736 - }, - { - "epoch": 2.3970493906350225, - "grad_norm": 1.109777808189392, - "learning_rate": 1.007070923505464e-05, - "loss": 0.1274, - "step": 3737 - }, - { - "epoch": 2.397690827453496, - "grad_norm": 0.8863678574562073, - "learning_rate": 1.0059995714591814e-05, - "loss": 0.1269, - "step": 3738 - }, - { - "epoch": 2.398332264271969, - "grad_norm": 0.832064151763916, - "learning_rate": 1.004928219412899e-05, - "loss": 0.13, - "step": 3739 - }, - { - "epoch": 2.3989737010904424, - "grad_norm": 1.3067786693572998, - "learning_rate": 1.0038568673666167e-05, - "loss": 0.1429, - "step": 3740 - }, - { - "epoch": 2.399615137908916, - "grad_norm": 1.0223525762557983, - "learning_rate": 1.0027855153203343e-05, - "loss": 0.136, - "step": 3741 - }, - { - "epoch": 2.4002565747273894, - "grad_norm": 1.4113000631332397, - "learning_rate": 1.001714163274052e-05, - "loss": 0.1429, - "step": 3742 - }, - { - "epoch": 2.4008980115458627, - "grad_norm": 1.0364148616790771, - "learning_rate": 1.0006428112277695e-05, - "loss": 0.1261, - "step": 3743 - }, - { - "epoch": 2.401539448364336, - "grad_norm": 1.24801766872406, - "learning_rate": 9.995714591814871e-06, - "loss": 0.133, - "step": 3744 - }, - { - "epoch": 2.4021808851828093, - "grad_norm": 0.9743726253509521, - "learning_rate": 9.985001071352048e-06, - "loss": 0.1271, - "step": 3745 - }, - { - "epoch": 2.402822322001283, - "grad_norm": 1.2588368654251099, - "learning_rate": 9.974287550889224e-06, - "loss": 0.1267, - "step": 3746 - }, - { - "epoch": 2.4034637588197563, - "grad_norm": 1.2848823070526123, - "learning_rate": 9.963574030426399e-06, - "loss": 0.1394, - "step": 3747 - }, - { - "epoch": 2.4041051956382296, - "grad_norm": 1.5486409664154053, - "learning_rate": 9.952860509963573e-06, - "loss": 0.1449, - "step": 3748 - }, - { - "epoch": 2.404746632456703, - "grad_norm": 0.8463360667228699, - "learning_rate": 9.94214698950075e-06, - "loss": 0.1147, - "step": 3749 - }, - { - "epoch": 2.4053880692751766, - "grad_norm": 1.0676523447036743, - "learning_rate": 9.931433469037926e-06, - "loss": 0.1381, - "step": 3750 - }, - { - "epoch": 2.40602950609365, - "grad_norm": 1.1267610788345337, - "learning_rate": 9.920719948575103e-06, - "loss": 0.133, - "step": 3751 - }, - { - "epoch": 2.406670942912123, - "grad_norm": 0.9562112092971802, - "learning_rate": 9.910006428112278e-06, - "loss": 0.1251, - "step": 3752 - }, - { - "epoch": 2.4073123797305964, - "grad_norm": 1.3000015020370483, - "learning_rate": 9.899292907649454e-06, - "loss": 0.1418, - "step": 3753 - }, - { - "epoch": 2.40795381654907, - "grad_norm": 1.2391440868377686, - "learning_rate": 9.88857938718663e-06, - "loss": 0.1218, - "step": 3754 - }, - { - "epoch": 2.4085952533675434, - "grad_norm": 1.1504108905792236, - "learning_rate": 9.877865866723805e-06, - "loss": 0.134, - "step": 3755 - }, - { - "epoch": 2.4092366901860167, - "grad_norm": 1.398350477218628, - "learning_rate": 9.867152346260982e-06, - "loss": 0.1388, - "step": 3756 - }, - { - "epoch": 2.40987812700449, - "grad_norm": 2.002207040786743, - "learning_rate": 9.856438825798158e-06, - "loss": 0.1644, - "step": 3757 - }, - { - "epoch": 2.4105195638229633, - "grad_norm": 1.1892286539077759, - "learning_rate": 9.845725305335333e-06, - "loss": 0.1278, - "step": 3758 - }, - { - "epoch": 2.411161000641437, - "grad_norm": 1.4371258020401, - "learning_rate": 9.83501178487251e-06, - "loss": 0.1478, - "step": 3759 - }, - { - "epoch": 2.4118024374599103, - "grad_norm": 0.9835406541824341, - "learning_rate": 9.824298264409686e-06, - "loss": 0.1277, - "step": 3760 - }, - { - "epoch": 2.4124438742783836, - "grad_norm": 1.125024676322937, - "learning_rate": 9.813584743946862e-06, - "loss": 0.1296, - "step": 3761 - }, - { - "epoch": 2.413085311096857, - "grad_norm": 0.6303499341011047, - "learning_rate": 9.802871223484037e-06, - "loss": 0.1144, - "step": 3762 - }, - { - "epoch": 2.41372674791533, - "grad_norm": 1.6716876029968262, - "learning_rate": 9.792157703021214e-06, - "loss": 0.1396, - "step": 3763 - }, - { - "epoch": 2.414368184733804, - "grad_norm": 1.057917594909668, - "learning_rate": 9.781444182558388e-06, - "loss": 0.1259, - "step": 3764 - }, - { - "epoch": 2.415009621552277, - "grad_norm": 1.3194100856781006, - "learning_rate": 9.770730662095565e-06, - "loss": 0.1294, - "step": 3765 - }, - { - "epoch": 2.4156510583707504, - "grad_norm": 1.8764894008636475, - "learning_rate": 9.760017141632741e-06, - "loss": 0.1453, - "step": 3766 - }, - { - "epoch": 2.4162924951892237, - "grad_norm": 1.7342703342437744, - "learning_rate": 9.749303621169916e-06, - "loss": 0.1649, - "step": 3767 - }, - { - "epoch": 2.416933932007697, - "grad_norm": 1.286749243736267, - "learning_rate": 9.738590100707092e-06, - "loss": 0.1415, - "step": 3768 - }, - { - "epoch": 2.4175753688261707, - "grad_norm": 1.5871740579605103, - "learning_rate": 9.727876580244269e-06, - "loss": 0.1437, - "step": 3769 - }, - { - "epoch": 2.418216805644644, - "grad_norm": 1.3491371870040894, - "learning_rate": 9.717163059781445e-06, - "loss": 0.1303, - "step": 3770 - }, - { - "epoch": 2.4188582424631173, - "grad_norm": 0.8769645094871521, - "learning_rate": 9.706449539318622e-06, - "loss": 0.119, - "step": 3771 - }, - { - "epoch": 2.4194996792815906, - "grad_norm": 0.8869051933288574, - "learning_rate": 9.695736018855797e-06, - "loss": 0.1234, - "step": 3772 - }, - { - "epoch": 2.4201411161000643, - "grad_norm": 1.4969135522842407, - "learning_rate": 9.685022498392971e-06, - "loss": 0.1368, - "step": 3773 - }, - { - "epoch": 2.4207825529185376, - "grad_norm": 0.950097918510437, - "learning_rate": 9.674308977930148e-06, - "loss": 0.1239, - "step": 3774 - }, - { - "epoch": 2.421423989737011, - "grad_norm": 0.9254378080368042, - "learning_rate": 9.663595457467324e-06, - "loss": 0.1254, - "step": 3775 - }, - { - "epoch": 2.422065426555484, - "grad_norm": 2.0054590702056885, - "learning_rate": 9.6528819370045e-06, - "loss": 0.1704, - "step": 3776 - }, - { - "epoch": 2.422706863373958, - "grad_norm": 1.6668331623077393, - "learning_rate": 9.642168416541675e-06, - "loss": 0.1412, - "step": 3777 - }, - { - "epoch": 2.423348300192431, - "grad_norm": 1.296921730041504, - "learning_rate": 9.631454896078852e-06, - "loss": 0.1341, - "step": 3778 - }, - { - "epoch": 2.4239897370109045, - "grad_norm": 1.049747347831726, - "learning_rate": 9.620741375616028e-06, - "loss": 0.1205, - "step": 3779 - }, - { - "epoch": 2.4246311738293778, - "grad_norm": 1.14466392993927, - "learning_rate": 9.610027855153205e-06, - "loss": 0.1312, - "step": 3780 - }, - { - "epoch": 2.425272610647851, - "grad_norm": 1.2740994691848755, - "learning_rate": 9.59931433469038e-06, - "loss": 0.1412, - "step": 3781 - }, - { - "epoch": 2.4259140474663248, - "grad_norm": 0.9642508625984192, - "learning_rate": 9.588600814227556e-06, - "loss": 0.1246, - "step": 3782 - }, - { - "epoch": 2.426555484284798, - "grad_norm": 1.2745466232299805, - "learning_rate": 9.57788729376473e-06, - "loss": 0.1504, - "step": 3783 - }, - { - "epoch": 2.4271969211032713, - "grad_norm": 1.175035834312439, - "learning_rate": 9.567173773301907e-06, - "loss": 0.1365, - "step": 3784 - }, - { - "epoch": 2.4278383579217446, - "grad_norm": 1.2293000221252441, - "learning_rate": 9.556460252839084e-06, - "loss": 0.124, - "step": 3785 - }, - { - "epoch": 2.428479794740218, - "grad_norm": 1.013196349143982, - "learning_rate": 9.54574673237626e-06, - "loss": 0.1346, - "step": 3786 - }, - { - "epoch": 2.4291212315586916, - "grad_norm": 1.0760701894760132, - "learning_rate": 9.535033211913435e-06, - "loss": 0.132, - "step": 3787 - }, - { - "epoch": 2.429762668377165, - "grad_norm": 1.0303243398666382, - "learning_rate": 9.524319691450611e-06, - "loss": 0.1304, - "step": 3788 - }, - { - "epoch": 2.430404105195638, - "grad_norm": 1.2814167737960815, - "learning_rate": 9.513606170987788e-06, - "loss": 0.1351, - "step": 3789 - }, - { - "epoch": 2.4310455420141115, - "grad_norm": 1.0182573795318604, - "learning_rate": 9.502892650524963e-06, - "loss": 0.1279, - "step": 3790 - }, - { - "epoch": 2.4316869788325848, - "grad_norm": 1.0255162715911865, - "learning_rate": 9.492179130062139e-06, - "loss": 0.1315, - "step": 3791 - }, - { - "epoch": 2.4323284156510585, - "grad_norm": 1.1067639589309692, - "learning_rate": 9.481465609599314e-06, - "loss": 0.125, - "step": 3792 - }, - { - "epoch": 2.432969852469532, - "grad_norm": 1.4882168769836426, - "learning_rate": 9.47075208913649e-06, - "loss": 0.1432, - "step": 3793 - }, - { - "epoch": 2.433611289288005, - "grad_norm": 1.806928277015686, - "learning_rate": 9.460038568673667e-06, - "loss": 0.1541, - "step": 3794 - }, - { - "epoch": 2.4342527261064784, - "grad_norm": 1.133056879043579, - "learning_rate": 9.449325048210843e-06, - "loss": 0.128, - "step": 3795 - }, - { - "epoch": 2.4348941629249516, - "grad_norm": 1.0366827249526978, - "learning_rate": 9.43861152774802e-06, - "loss": 0.1291, - "step": 3796 - }, - { - "epoch": 2.4355355997434254, - "grad_norm": 1.4852190017700195, - "learning_rate": 9.427898007285194e-06, - "loss": 0.1536, - "step": 3797 - }, - { - "epoch": 2.4361770365618987, - "grad_norm": 1.1409419775009155, - "learning_rate": 9.417184486822369e-06, - "loss": 0.1349, - "step": 3798 - }, - { - "epoch": 2.436818473380372, - "grad_norm": 1.4161299467086792, - "learning_rate": 9.406470966359546e-06, - "loss": 0.1515, - "step": 3799 - }, - { - "epoch": 2.4374599101988452, - "grad_norm": 1.042159080505371, - "learning_rate": 9.395757445896722e-06, - "loss": 0.1271, - "step": 3800 - }, - { - "epoch": 2.438101347017319, - "grad_norm": 0.9858924746513367, - "learning_rate": 9.385043925433898e-06, - "loss": 0.1226, - "step": 3801 - }, - { - "epoch": 2.4387427838357922, - "grad_norm": 0.9225063920021057, - "learning_rate": 9.374330404971073e-06, - "loss": 0.1334, - "step": 3802 - }, - { - "epoch": 2.4393842206542655, - "grad_norm": 1.1906507015228271, - "learning_rate": 9.36361688450825e-06, - "loss": 0.1303, - "step": 3803 - }, - { - "epoch": 2.440025657472739, - "grad_norm": 1.10277259349823, - "learning_rate": 9.352903364045426e-06, - "loss": 0.1275, - "step": 3804 - }, - { - "epoch": 2.4406670942912125, - "grad_norm": 1.3499259948730469, - "learning_rate": 9.342189843582603e-06, - "loss": 0.1343, - "step": 3805 - }, - { - "epoch": 2.441308531109686, - "grad_norm": 1.4930280447006226, - "learning_rate": 9.331476323119779e-06, - "loss": 0.1515, - "step": 3806 - }, - { - "epoch": 2.441949967928159, - "grad_norm": 1.6167969703674316, - "learning_rate": 9.320762802656952e-06, - "loss": 0.1475, - "step": 3807 - }, - { - "epoch": 2.4425914047466324, - "grad_norm": 1.47078275680542, - "learning_rate": 9.310049282194129e-06, - "loss": 0.1504, - "step": 3808 - }, - { - "epoch": 2.4432328415651057, - "grad_norm": 1.3345204591751099, - "learning_rate": 9.299335761731305e-06, - "loss": 0.1399, - "step": 3809 - }, - { - "epoch": 2.4438742783835794, - "grad_norm": 1.3848689794540405, - "learning_rate": 9.288622241268481e-06, - "loss": 0.1266, - "step": 3810 - }, - { - "epoch": 2.4445157152020527, - "grad_norm": 1.2905991077423096, - "learning_rate": 9.277908720805658e-06, - "loss": 0.1423, - "step": 3811 - }, - { - "epoch": 2.445157152020526, - "grad_norm": 1.0526106357574463, - "learning_rate": 9.267195200342833e-06, - "loss": 0.1244, - "step": 3812 - }, - { - "epoch": 2.4457985888389993, - "grad_norm": 1.403782606124878, - "learning_rate": 9.25648167988001e-06, - "loss": 0.1367, - "step": 3813 - }, - { - "epoch": 2.4464400256574725, - "grad_norm": 1.467275619506836, - "learning_rate": 9.245768159417186e-06, - "loss": 0.1447, - "step": 3814 - }, - { - "epoch": 2.4470814624759463, - "grad_norm": 1.0239909887313843, - "learning_rate": 9.23505463895436e-06, - "loss": 0.1234, - "step": 3815 - }, - { - "epoch": 2.4477228992944196, - "grad_norm": 1.1119487285614014, - "learning_rate": 9.224341118491537e-06, - "loss": 0.1321, - "step": 3816 - }, - { - "epoch": 2.448364336112893, - "grad_norm": 1.1448465585708618, - "learning_rate": 9.213627598028712e-06, - "loss": 0.1333, - "step": 3817 - }, - { - "epoch": 2.449005772931366, - "grad_norm": 1.2403755187988281, - "learning_rate": 9.202914077565888e-06, - "loss": 0.1325, - "step": 3818 - }, - { - "epoch": 2.4496472097498394, - "grad_norm": 1.114500641822815, - "learning_rate": 9.192200557103064e-06, - "loss": 0.1348, - "step": 3819 - }, - { - "epoch": 2.450288646568313, - "grad_norm": 1.0486178398132324, - "learning_rate": 9.181487036640241e-06, - "loss": 0.1247, - "step": 3820 - }, - { - "epoch": 2.4509300833867864, - "grad_norm": 1.7872153520584106, - "learning_rate": 9.170773516177417e-06, - "loss": 0.1518, - "step": 3821 - }, - { - "epoch": 2.4515715202052597, - "grad_norm": 1.0040520429611206, - "learning_rate": 9.160059995714592e-06, - "loss": 0.1333, - "step": 3822 - }, - { - "epoch": 2.452212957023733, - "grad_norm": 1.1337661743164062, - "learning_rate": 9.149346475251769e-06, - "loss": 0.133, - "step": 3823 - }, - { - "epoch": 2.4528543938422067, - "grad_norm": 1.1643670797348022, - "learning_rate": 9.138632954788943e-06, - "loss": 0.1347, - "step": 3824 - }, - { - "epoch": 2.45349583066068, - "grad_norm": 1.0626801252365112, - "learning_rate": 9.12791943432612e-06, - "loss": 0.1244, - "step": 3825 - }, - { - "epoch": 2.4541372674791533, - "grad_norm": 1.1409783363342285, - "learning_rate": 9.117205913863296e-06, - "loss": 0.1297, - "step": 3826 - }, - { - "epoch": 2.4547787042976266, - "grad_norm": 0.9519407749176025, - "learning_rate": 9.106492393400471e-06, - "loss": 0.1242, - "step": 3827 - }, - { - "epoch": 2.4554201411161003, - "grad_norm": 1.2748175859451294, - "learning_rate": 9.095778872937648e-06, - "loss": 0.1367, - "step": 3828 - }, - { - "epoch": 2.4560615779345736, - "grad_norm": 1.2895904779434204, - "learning_rate": 9.085065352474824e-06, - "loss": 0.1333, - "step": 3829 - }, - { - "epoch": 2.456703014753047, - "grad_norm": 1.0519781112670898, - "learning_rate": 9.074351832012e-06, - "loss": 0.1232, - "step": 3830 - }, - { - "epoch": 2.45734445157152, - "grad_norm": 1.086019515991211, - "learning_rate": 9.063638311549177e-06, - "loss": 0.1233, - "step": 3831 - }, - { - "epoch": 2.4579858883899934, - "grad_norm": 0.8980653882026672, - "learning_rate": 9.05292479108635e-06, - "loss": 0.1224, - "step": 3832 - }, - { - "epoch": 2.458627325208467, - "grad_norm": 0.7466932535171509, - "learning_rate": 9.042211270623526e-06, - "loss": 0.1203, - "step": 3833 - }, - { - "epoch": 2.4592687620269404, - "grad_norm": 1.7349904775619507, - "learning_rate": 9.031497750160703e-06, - "loss": 0.1647, - "step": 3834 - }, - { - "epoch": 2.4599101988454137, - "grad_norm": 1.3825879096984863, - "learning_rate": 9.02078422969788e-06, - "loss": 0.1352, - "step": 3835 - }, - { - "epoch": 2.460551635663887, - "grad_norm": 1.491317868232727, - "learning_rate": 9.010070709235056e-06, - "loss": 0.1465, - "step": 3836 - }, - { - "epoch": 2.4611930724823603, - "grad_norm": 1.1302434206008911, - "learning_rate": 8.99935718877223e-06, - "loss": 0.1381, - "step": 3837 - }, - { - "epoch": 2.461834509300834, - "grad_norm": 1.7087042331695557, - "learning_rate": 8.988643668309407e-06, - "loss": 0.1439, - "step": 3838 - }, - { - "epoch": 2.4624759461193073, - "grad_norm": 1.1164153814315796, - "learning_rate": 8.977930147846583e-06, - "loss": 0.1248, - "step": 3839 - }, - { - "epoch": 2.4631173829377806, - "grad_norm": 1.3534058332443237, - "learning_rate": 8.96721662738376e-06, - "loss": 0.1483, - "step": 3840 - }, - { - "epoch": 2.463758819756254, - "grad_norm": 1.1007658243179321, - "learning_rate": 8.956503106920935e-06, - "loss": 0.1248, - "step": 3841 - }, - { - "epoch": 2.464400256574727, - "grad_norm": 1.752228021621704, - "learning_rate": 8.94578958645811e-06, - "loss": 0.1525, - "step": 3842 - }, - { - "epoch": 2.465041693393201, - "grad_norm": 1.5249707698822021, - "learning_rate": 8.935076065995286e-06, - "loss": 0.14, - "step": 3843 - }, - { - "epoch": 2.465683130211674, - "grad_norm": 1.3236639499664307, - "learning_rate": 8.924362545532462e-06, - "loss": 0.1377, - "step": 3844 - }, - { - "epoch": 2.4663245670301475, - "grad_norm": 1.4901996850967407, - "learning_rate": 8.913649025069639e-06, - "loss": 0.1454, - "step": 3845 - }, - { - "epoch": 2.4669660038486207, - "grad_norm": 0.9329215288162231, - "learning_rate": 8.902935504606815e-06, - "loss": 0.1247, - "step": 3846 - }, - { - "epoch": 2.4676074406670945, - "grad_norm": 1.11151123046875, - "learning_rate": 8.89222198414399e-06, - "loss": 0.1326, - "step": 3847 - }, - { - "epoch": 2.4682488774855678, - "grad_norm": 0.8491328358650208, - "learning_rate": 8.881508463681166e-06, - "loss": 0.1304, - "step": 3848 - }, - { - "epoch": 2.468890314304041, - "grad_norm": 1.358120322227478, - "learning_rate": 8.870794943218343e-06, - "loss": 0.1401, - "step": 3849 - }, - { - "epoch": 2.4695317511225143, - "grad_norm": 1.4210212230682373, - "learning_rate": 8.860081422755518e-06, - "loss": 0.1294, - "step": 3850 - }, - { - "epoch": 2.470173187940988, - "grad_norm": 1.276076316833496, - "learning_rate": 8.849367902292694e-06, - "loss": 0.134, - "step": 3851 - }, - { - "epoch": 2.4708146247594613, - "grad_norm": 1.1553936004638672, - "learning_rate": 8.838654381829869e-06, - "loss": 0.1251, - "step": 3852 - }, - { - "epoch": 2.4714560615779346, - "grad_norm": 1.0947214365005493, - "learning_rate": 8.827940861367045e-06, - "loss": 0.1275, - "step": 3853 - }, - { - "epoch": 2.472097498396408, - "grad_norm": 1.1562120914459229, - "learning_rate": 8.817227340904222e-06, - "loss": 0.1256, - "step": 3854 - }, - { - "epoch": 2.472738935214881, - "grad_norm": 1.2755987644195557, - "learning_rate": 8.806513820441398e-06, - "loss": 0.1397, - "step": 3855 - }, - { - "epoch": 2.473380372033355, - "grad_norm": 1.7009810209274292, - "learning_rate": 8.795800299978573e-06, - "loss": 0.156, - "step": 3856 - }, - { - "epoch": 2.474021808851828, - "grad_norm": 1.1020715236663818, - "learning_rate": 8.78508677951575e-06, - "loss": 0.1444, - "step": 3857 - }, - { - "epoch": 2.4746632456703015, - "grad_norm": 1.5878901481628418, - "learning_rate": 8.774373259052924e-06, - "loss": 0.1536, - "step": 3858 - }, - { - "epoch": 2.4753046824887748, - "grad_norm": 1.4250057935714722, - "learning_rate": 8.7636597385901e-06, - "loss": 0.1418, - "step": 3859 - }, - { - "epoch": 2.475946119307248, - "grad_norm": 1.2996611595153809, - "learning_rate": 8.752946218127277e-06, - "loss": 0.1311, - "step": 3860 - }, - { - "epoch": 2.476587556125722, - "grad_norm": 1.4827874898910522, - "learning_rate": 8.742232697664454e-06, - "loss": 0.1583, - "step": 3861 - }, - { - "epoch": 2.477228992944195, - "grad_norm": 1.141886830329895, - "learning_rate": 8.731519177201628e-06, - "loss": 0.1401, - "step": 3862 - }, - { - "epoch": 2.4778704297626684, - "grad_norm": 0.9514087438583374, - "learning_rate": 8.720805656738805e-06, - "loss": 0.1266, - "step": 3863 - }, - { - "epoch": 2.4785118665811416, - "grad_norm": 1.6655923128128052, - "learning_rate": 8.710092136275981e-06, - "loss": 0.1652, - "step": 3864 - }, - { - "epoch": 2.479153303399615, - "grad_norm": 1.0977610349655151, - "learning_rate": 8.699378615813158e-06, - "loss": 0.1356, - "step": 3865 - }, - { - "epoch": 2.4797947402180887, - "grad_norm": 0.7492948770523071, - "learning_rate": 8.688665095350332e-06, - "loss": 0.116, - "step": 3866 - }, - { - "epoch": 2.480436177036562, - "grad_norm": 0.8779626488685608, - "learning_rate": 8.677951574887507e-06, - "loss": 0.1298, - "step": 3867 - }, - { - "epoch": 2.4810776138550352, - "grad_norm": 1.071336269378662, - "learning_rate": 8.667238054424684e-06, - "loss": 0.1266, - "step": 3868 - }, - { - "epoch": 2.4817190506735085, - "grad_norm": 0.9596415758132935, - "learning_rate": 8.65652453396186e-06, - "loss": 0.125, - "step": 3869 - }, - { - "epoch": 2.482360487491982, - "grad_norm": 1.1053004264831543, - "learning_rate": 8.645811013499037e-06, - "loss": 0.1371, - "step": 3870 - }, - { - "epoch": 2.4830019243104555, - "grad_norm": 0.9578515887260437, - "learning_rate": 8.635097493036213e-06, - "loss": 0.123, - "step": 3871 - }, - { - "epoch": 2.483643361128929, - "grad_norm": 1.1731512546539307, - "learning_rate": 8.624383972573388e-06, - "loss": 0.1342, - "step": 3872 - }, - { - "epoch": 2.484284797947402, - "grad_norm": 1.3140978813171387, - "learning_rate": 8.613670452110564e-06, - "loss": 0.1506, - "step": 3873 - }, - { - "epoch": 2.4849262347658754, - "grad_norm": 1.049821138381958, - "learning_rate": 8.60295693164774e-06, - "loss": 0.1282, - "step": 3874 - }, - { - "epoch": 2.485567671584349, - "grad_norm": 1.3098130226135254, - "learning_rate": 8.592243411184915e-06, - "loss": 0.1416, - "step": 3875 - }, - { - "epoch": 2.4862091084028224, - "grad_norm": 1.2205060720443726, - "learning_rate": 8.581529890722092e-06, - "loss": 0.1315, - "step": 3876 - }, - { - "epoch": 2.4868505452212957, - "grad_norm": 1.0211533308029175, - "learning_rate": 8.570816370259267e-06, - "loss": 0.1303, - "step": 3877 - }, - { - "epoch": 2.487491982039769, - "grad_norm": 0.8421807289123535, - "learning_rate": 8.560102849796443e-06, - "loss": 0.1222, - "step": 3878 - }, - { - "epoch": 2.4881334188582427, - "grad_norm": 0.8054133057594299, - "learning_rate": 8.54938932933362e-06, - "loss": 0.1273, - "step": 3879 - }, - { - "epoch": 2.488774855676716, - "grad_norm": 1.5537437200546265, - "learning_rate": 8.538675808870796e-06, - "loss": 0.1475, - "step": 3880 - }, - { - "epoch": 2.4894162924951893, - "grad_norm": 1.2047125101089478, - "learning_rate": 8.52796228840797e-06, - "loss": 0.1367, - "step": 3881 - }, - { - "epoch": 2.4900577293136625, - "grad_norm": 1.3309502601623535, - "learning_rate": 8.517248767945147e-06, - "loss": 0.1462, - "step": 3882 - }, - { - "epoch": 2.490699166132136, - "grad_norm": 1.1705416440963745, - "learning_rate": 8.506535247482324e-06, - "loss": 0.1272, - "step": 3883 - }, - { - "epoch": 2.4913406029506096, - "grad_norm": 0.9443037509918213, - "learning_rate": 8.495821727019498e-06, - "loss": 0.1312, - "step": 3884 - }, - { - "epoch": 2.491982039769083, - "grad_norm": 1.6174778938293457, - "learning_rate": 8.485108206556675e-06, - "loss": 0.1314, - "step": 3885 - }, - { - "epoch": 2.492623476587556, - "grad_norm": 1.1851688623428345, - "learning_rate": 8.474394686093851e-06, - "loss": 0.1297, - "step": 3886 - }, - { - "epoch": 2.4932649134060294, - "grad_norm": 1.017868995666504, - "learning_rate": 8.463681165631026e-06, - "loss": 0.1333, - "step": 3887 - }, - { - "epoch": 2.4939063502245027, - "grad_norm": 0.9337804317474365, - "learning_rate": 8.452967645168203e-06, - "loss": 0.1339, - "step": 3888 - }, - { - "epoch": 2.4945477870429764, - "grad_norm": 1.1397653818130493, - "learning_rate": 8.442254124705379e-06, - "loss": 0.1323, - "step": 3889 - }, - { - "epoch": 2.4951892238614497, - "grad_norm": 1.064057469367981, - "learning_rate": 8.431540604242555e-06, - "loss": 0.127, - "step": 3890 - }, - { - "epoch": 2.495830660679923, - "grad_norm": 1.6722402572631836, - "learning_rate": 8.42082708377973e-06, - "loss": 0.1515, - "step": 3891 - }, - { - "epoch": 2.4964720974983963, - "grad_norm": 1.1660019159317017, - "learning_rate": 8.410113563316907e-06, - "loss": 0.1291, - "step": 3892 - }, - { - "epoch": 2.4971135343168696, - "grad_norm": 1.3746603727340698, - "learning_rate": 8.399400042854081e-06, - "loss": 0.1405, - "step": 3893 - }, - { - "epoch": 2.4977549711353433, - "grad_norm": 1.1725947856903076, - "learning_rate": 8.388686522391258e-06, - "loss": 0.1347, - "step": 3894 - }, - { - "epoch": 2.4983964079538166, - "grad_norm": 1.334681510925293, - "learning_rate": 8.377973001928434e-06, - "loss": 0.1356, - "step": 3895 - }, - { - "epoch": 2.49903784477229, - "grad_norm": 0.9971758127212524, - "learning_rate": 8.367259481465609e-06, - "loss": 0.1307, - "step": 3896 - }, - { - "epoch": 2.499679281590763, - "grad_norm": 1.3811979293823242, - "learning_rate": 8.356545961002786e-06, - "loss": 0.138, - "step": 3897 - }, - { - "epoch": 2.5003207184092364, - "grad_norm": 1.4015638828277588, - "learning_rate": 8.345832440539962e-06, - "loss": 0.1407, - "step": 3898 - }, - { - "epoch": 2.50096215522771, - "grad_norm": 1.17410409450531, - "learning_rate": 8.335118920077139e-06, - "loss": 0.1276, - "step": 3899 - }, - { - "epoch": 2.5016035920461834, - "grad_norm": 2.025970220565796, - "learning_rate": 8.324405399614315e-06, - "loss": 0.1331, - "step": 3900 - }, - { - "epoch": 2.5022450288646567, - "grad_norm": 1.2469795942306519, - "learning_rate": 8.31369187915149e-06, - "loss": 0.1332, - "step": 3901 - }, - { - "epoch": 2.5028864656831304, - "grad_norm": 1.0715481042861938, - "learning_rate": 8.302978358688664e-06, - "loss": 0.1334, - "step": 3902 - }, - { - "epoch": 2.5035279025016037, - "grad_norm": 1.1359175443649292, - "learning_rate": 8.292264838225841e-06, - "loss": 0.1234, - "step": 3903 - }, - { - "epoch": 2.504169339320077, - "grad_norm": 1.5466971397399902, - "learning_rate": 8.281551317763017e-06, - "loss": 0.1391, - "step": 3904 - }, - { - "epoch": 2.5048107761385503, - "grad_norm": 1.0910485982894897, - "learning_rate": 8.270837797300194e-06, - "loss": 0.1335, - "step": 3905 - }, - { - "epoch": 2.5054522129570236, - "grad_norm": 1.0876286029815674, - "learning_rate": 8.260124276837369e-06, - "loss": 0.126, - "step": 3906 - }, - { - "epoch": 2.5060936497754973, - "grad_norm": 1.5262633562088013, - "learning_rate": 8.249410756374545e-06, - "loss": 0.1414, - "step": 3907 - }, - { - "epoch": 2.5067350865939706, - "grad_norm": 1.2918380498886108, - "learning_rate": 8.238697235911722e-06, - "loss": 0.1372, - "step": 3908 - }, - { - "epoch": 2.507376523412444, - "grad_norm": 1.4275405406951904, - "learning_rate": 8.227983715448898e-06, - "loss": 0.126, - "step": 3909 - }, - { - "epoch": 2.508017960230917, - "grad_norm": 1.7106900215148926, - "learning_rate": 8.217270194986073e-06, - "loss": 0.1468, - "step": 3910 - }, - { - "epoch": 2.5086593970493904, - "grad_norm": 0.96149080991745, - "learning_rate": 8.206556674523247e-06, - "loss": 0.1284, - "step": 3911 - }, - { - "epoch": 2.509300833867864, - "grad_norm": 1.4688620567321777, - "learning_rate": 8.195843154060424e-06, - "loss": 0.1437, - "step": 3912 - }, - { - "epoch": 2.5099422706863375, - "grad_norm": 1.0769174098968506, - "learning_rate": 8.1851296335976e-06, - "loss": 0.1233, - "step": 3913 - }, - { - "epoch": 2.5105837075048107, - "grad_norm": 1.3451851606369019, - "learning_rate": 8.174416113134777e-06, - "loss": 0.128, - "step": 3914 - }, - { - "epoch": 2.511225144323284, - "grad_norm": 0.989099383354187, - "learning_rate": 8.163702592671953e-06, - "loss": 0.1225, - "step": 3915 - }, - { - "epoch": 2.5118665811417573, - "grad_norm": 1.529650330543518, - "learning_rate": 8.152989072209128e-06, - "loss": 0.1481, - "step": 3916 - }, - { - "epoch": 2.512508017960231, - "grad_norm": 1.1071330308914185, - "learning_rate": 8.142275551746305e-06, - "loss": 0.1282, - "step": 3917 - }, - { - "epoch": 2.5131494547787043, - "grad_norm": 1.1096141338348389, - "learning_rate": 8.13156203128348e-06, - "loss": 0.1264, - "step": 3918 - }, - { - "epoch": 2.5137908915971776, - "grad_norm": 1.0807422399520874, - "learning_rate": 8.120848510820656e-06, - "loss": 0.1306, - "step": 3919 - }, - { - "epoch": 2.514432328415651, - "grad_norm": 1.5315426588058472, - "learning_rate": 8.110134990357832e-06, - "loss": 0.1491, - "step": 3920 - }, - { - "epoch": 2.515073765234124, - "grad_norm": 1.344321370124817, - "learning_rate": 8.099421469895007e-06, - "loss": 0.1395, - "step": 3921 - }, - { - "epoch": 2.515715202052598, - "grad_norm": 1.9411039352416992, - "learning_rate": 8.088707949432183e-06, - "loss": 0.1518, - "step": 3922 - }, - { - "epoch": 2.516356638871071, - "grad_norm": 1.137710452079773, - "learning_rate": 8.07799442896936e-06, - "loss": 0.1302, - "step": 3923 - }, - { - "epoch": 2.5169980756895445, - "grad_norm": 1.1376893520355225, - "learning_rate": 8.067280908506536e-06, - "loss": 0.1312, - "step": 3924 - }, - { - "epoch": 2.517639512508018, - "grad_norm": 1.3700103759765625, - "learning_rate": 8.056567388043713e-06, - "loss": 0.1369, - "step": 3925 - }, - { - "epoch": 2.5182809493264915, - "grad_norm": 1.4358510971069336, - "learning_rate": 8.045853867580888e-06, - "loss": 0.1385, - "step": 3926 - }, - { - "epoch": 2.5189223861449648, - "grad_norm": 1.1824042797088623, - "learning_rate": 8.035140347118062e-06, - "loss": 0.132, - "step": 3927 - }, - { - "epoch": 2.519563822963438, - "grad_norm": 1.5600879192352295, - "learning_rate": 8.024426826655239e-06, - "loss": 0.1431, - "step": 3928 - }, - { - "epoch": 2.5202052597819113, - "grad_norm": 1.104215145111084, - "learning_rate": 8.013713306192415e-06, - "loss": 0.1289, - "step": 3929 - }, - { - "epoch": 2.520846696600385, - "grad_norm": 1.3125089406967163, - "learning_rate": 8.002999785729592e-06, - "loss": 0.1322, - "step": 3930 - }, - { - "epoch": 2.5214881334188584, - "grad_norm": 1.5612784624099731, - "learning_rate": 7.992286265266766e-06, - "loss": 0.1417, - "step": 3931 - }, - { - "epoch": 2.5221295702373316, - "grad_norm": 1.0133402347564697, - "learning_rate": 7.981572744803943e-06, - "loss": 0.1335, - "step": 3932 - }, - { - "epoch": 2.522771007055805, - "grad_norm": 1.286283016204834, - "learning_rate": 7.97085922434112e-06, - "loss": 0.1389, - "step": 3933 - }, - { - "epoch": 2.523412443874278, - "grad_norm": 1.1742719411849976, - "learning_rate": 7.960145703878296e-06, - "loss": 0.1389, - "step": 3934 - }, - { - "epoch": 2.524053880692752, - "grad_norm": 1.0691760778427124, - "learning_rate": 7.94943218341547e-06, - "loss": 0.1295, - "step": 3935 - }, - { - "epoch": 2.5246953175112252, - "grad_norm": 1.3479154109954834, - "learning_rate": 7.938718662952645e-06, - "loss": 0.1324, - "step": 3936 - }, - { - "epoch": 2.5253367543296985, - "grad_norm": 1.3951332569122314, - "learning_rate": 7.928005142489822e-06, - "loss": 0.143, - "step": 3937 - }, - { - "epoch": 2.525978191148172, - "grad_norm": 1.0698988437652588, - "learning_rate": 7.917291622026998e-06, - "loss": 0.1378, - "step": 3938 - }, - { - "epoch": 2.526619627966645, - "grad_norm": 1.1004525423049927, - "learning_rate": 7.906578101564175e-06, - "loss": 0.123, - "step": 3939 - }, - { - "epoch": 2.527261064785119, - "grad_norm": 1.9075822830200195, - "learning_rate": 7.895864581101351e-06, - "loss": 0.1552, - "step": 3940 - }, - { - "epoch": 2.527902501603592, - "grad_norm": 1.3467923402786255, - "learning_rate": 7.885151060638526e-06, - "loss": 0.1532, - "step": 3941 - }, - { - "epoch": 2.5285439384220654, - "grad_norm": 1.3073660135269165, - "learning_rate": 7.874437540175702e-06, - "loss": 0.1431, - "step": 3942 - }, - { - "epoch": 2.5291853752405387, - "grad_norm": 1.4872735738754272, - "learning_rate": 7.863724019712879e-06, - "loss": 0.1415, - "step": 3943 - }, - { - "epoch": 2.529826812059012, - "grad_norm": 1.2501579523086548, - "learning_rate": 7.853010499250054e-06, - "loss": 0.1292, - "step": 3944 - }, - { - "epoch": 2.5304682488774857, - "grad_norm": 1.0783801078796387, - "learning_rate": 7.84229697878723e-06, - "loss": 0.1349, - "step": 3945 - }, - { - "epoch": 2.531109685695959, - "grad_norm": 0.8477262258529663, - "learning_rate": 7.831583458324405e-06, - "loss": 0.123, - "step": 3946 - }, - { - "epoch": 2.5317511225144322, - "grad_norm": 0.8347704410552979, - "learning_rate": 7.820869937861581e-06, - "loss": 0.1218, - "step": 3947 - }, - { - "epoch": 2.532392559332906, - "grad_norm": 1.1006628274917603, - "learning_rate": 7.810156417398758e-06, - "loss": 0.1369, - "step": 3948 - }, - { - "epoch": 2.5330339961513793, - "grad_norm": 1.1378707885742188, - "learning_rate": 7.799442896935934e-06, - "loss": 0.125, - "step": 3949 - }, - { - "epoch": 2.5336754329698525, - "grad_norm": 1.091441035270691, - "learning_rate": 7.78872937647311e-06, - "loss": 0.1214, - "step": 3950 - }, - { - "epoch": 2.534316869788326, - "grad_norm": 1.1538841724395752, - "learning_rate": 7.778015856010285e-06, - "loss": 0.1289, - "step": 3951 - }, - { - "epoch": 2.534958306606799, - "grad_norm": 1.6826978921890259, - "learning_rate": 7.767302335547462e-06, - "loss": 0.1414, - "step": 3952 - }, - { - "epoch": 2.535599743425273, - "grad_norm": 0.9671229124069214, - "learning_rate": 7.756588815084637e-06, - "loss": 0.1293, - "step": 3953 - }, - { - "epoch": 2.536241180243746, - "grad_norm": 1.451821208000183, - "learning_rate": 7.745875294621813e-06, - "loss": 0.1399, - "step": 3954 - }, - { - "epoch": 2.5368826170622194, - "grad_norm": 1.4569607973098755, - "learning_rate": 7.73516177415899e-06, - "loss": 0.1372, - "step": 3955 - }, - { - "epoch": 2.5375240538806927, - "grad_norm": 0.9672550559043884, - "learning_rate": 7.724448253696164e-06, - "loss": 0.1346, - "step": 3956 - }, - { - "epoch": 2.538165490699166, - "grad_norm": 1.1573634147644043, - "learning_rate": 7.71373473323334e-06, - "loss": 0.1235, - "step": 3957 - }, - { - "epoch": 2.5388069275176397, - "grad_norm": 1.1627418994903564, - "learning_rate": 7.703021212770517e-06, - "loss": 0.1311, - "step": 3958 - }, - { - "epoch": 2.539448364336113, - "grad_norm": 1.5513839721679688, - "learning_rate": 7.692307692307694e-06, - "loss": 0.1524, - "step": 3959 - }, - { - "epoch": 2.5400898011545863, - "grad_norm": 1.062347412109375, - "learning_rate": 7.681594171844868e-06, - "loss": 0.1281, - "step": 3960 - }, - { - "epoch": 2.5407312379730596, - "grad_norm": 0.8512759804725647, - "learning_rate": 7.670880651382043e-06, - "loss": 0.1104, - "step": 3961 - }, - { - "epoch": 2.541372674791533, - "grad_norm": 1.1390151977539062, - "learning_rate": 7.66016713091922e-06, - "loss": 0.1277, - "step": 3962 - }, - { - "epoch": 2.5420141116100066, - "grad_norm": 1.1250452995300293, - "learning_rate": 7.649453610456396e-06, - "loss": 0.1298, - "step": 3963 - }, - { - "epoch": 2.54265554842848, - "grad_norm": 1.2017213106155396, - "learning_rate": 7.638740089993572e-06, - "loss": 0.1273, - "step": 3964 - }, - { - "epoch": 2.543296985246953, - "grad_norm": 1.2537440061569214, - "learning_rate": 7.628026569530748e-06, - "loss": 0.1303, - "step": 3965 - }, - { - "epoch": 2.5439384220654264, - "grad_norm": 1.0617213249206543, - "learning_rate": 7.6173130490679245e-06, - "loss": 0.123, - "step": 3966 - }, - { - "epoch": 2.5445798588838997, - "grad_norm": 2.0005295276641846, - "learning_rate": 7.6065995286051e-06, - "loss": 0.1516, - "step": 3967 - }, - { - "epoch": 2.5452212957023734, - "grad_norm": 1.9389142990112305, - "learning_rate": 7.595886008142277e-06, - "loss": 0.1696, - "step": 3968 - }, - { - "epoch": 2.5458627325208467, - "grad_norm": 1.1660394668579102, - "learning_rate": 7.585172487679452e-06, - "loss": 0.1192, - "step": 3969 - }, - { - "epoch": 2.54650416933932, - "grad_norm": 1.1110022068023682, - "learning_rate": 7.574458967216627e-06, - "loss": 0.1238, - "step": 3970 - }, - { - "epoch": 2.5471456061577937, - "grad_norm": 1.3959624767303467, - "learning_rate": 7.563745446753803e-06, - "loss": 0.1262, - "step": 3971 - }, - { - "epoch": 2.5477870429762666, - "grad_norm": 1.2868751287460327, - "learning_rate": 7.553031926290979e-06, - "loss": 0.145, - "step": 3972 - }, - { - "epoch": 2.5484284797947403, - "grad_norm": 1.1536738872528076, - "learning_rate": 7.5423184058281555e-06, - "loss": 0.1349, - "step": 3973 - }, - { - "epoch": 2.5490699166132136, - "grad_norm": 1.884139060974121, - "learning_rate": 7.531604885365331e-06, - "loss": 0.1565, - "step": 3974 - }, - { - "epoch": 2.549711353431687, - "grad_norm": 1.042380928993225, - "learning_rate": 7.5208913649025075e-06, - "loss": 0.1239, - "step": 3975 - }, - { - "epoch": 2.5503527902501606, - "grad_norm": 1.545585036277771, - "learning_rate": 7.510177844439684e-06, - "loss": 0.1417, - "step": 3976 - }, - { - "epoch": 2.550994227068634, - "grad_norm": 1.5875639915466309, - "learning_rate": 7.49946432397686e-06, - "loss": 0.1458, - "step": 3977 - }, - { - "epoch": 2.551635663887107, - "grad_norm": 1.387413501739502, - "learning_rate": 7.488750803514034e-06, - "loss": 0.1368, - "step": 3978 - }, - { - "epoch": 2.5522771007055804, - "grad_norm": 1.0635714530944824, - "learning_rate": 7.478037283051211e-06, - "loss": 0.1242, - "step": 3979 - }, - { - "epoch": 2.5529185375240537, - "grad_norm": 1.0144726037979126, - "learning_rate": 7.4673237625883864e-06, - "loss": 0.1288, - "step": 3980 - }, - { - "epoch": 2.5535599743425275, - "grad_norm": 1.145079255104065, - "learning_rate": 7.456610242125563e-06, - "loss": 0.1224, - "step": 3981 - }, - { - "epoch": 2.5542014111610007, - "grad_norm": 1.756293773651123, - "learning_rate": 7.4458967216627385e-06, - "loss": 0.1563, - "step": 3982 - }, - { - "epoch": 2.554842847979474, - "grad_norm": 1.3613256216049194, - "learning_rate": 7.435183201199915e-06, - "loss": 0.1388, - "step": 3983 - }, - { - "epoch": 2.5554842847979473, - "grad_norm": 1.2486767768859863, - "learning_rate": 7.4244696807370906e-06, - "loss": 0.1388, - "step": 3984 - }, - { - "epoch": 2.5561257216164206, - "grad_norm": 1.7350397109985352, - "learning_rate": 7.413756160274267e-06, - "loss": 0.1376, - "step": 3985 - }, - { - "epoch": 2.5567671584348943, - "grad_norm": 1.5033230781555176, - "learning_rate": 7.403042639811443e-06, - "loss": 0.1479, - "step": 3986 - }, - { - "epoch": 2.5574085952533676, - "grad_norm": 0.9029199481010437, - "learning_rate": 7.392329119348617e-06, - "loss": 0.1222, - "step": 3987 - }, - { - "epoch": 2.558050032071841, - "grad_norm": 1.2821909189224243, - "learning_rate": 7.381615598885794e-06, - "loss": 0.1336, - "step": 3988 - }, - { - "epoch": 2.558691468890314, - "grad_norm": 1.2690954208374023, - "learning_rate": 7.37090207842297e-06, - "loss": 0.1358, - "step": 3989 - }, - { - "epoch": 2.5593329057087875, - "grad_norm": 1.6500493288040161, - "learning_rate": 7.360188557960146e-06, - "loss": 0.1398, - "step": 3990 - }, - { - "epoch": 2.559974342527261, - "grad_norm": 1.1333727836608887, - "learning_rate": 7.349475037497322e-06, - "loss": 0.1387, - "step": 3991 - }, - { - "epoch": 2.5606157793457345, - "grad_norm": 1.1013612747192383, - "learning_rate": 7.338761517034498e-06, - "loss": 0.1265, - "step": 3992 - }, - { - "epoch": 2.5612572161642078, - "grad_norm": 1.3120348453521729, - "learning_rate": 7.328047996571674e-06, - "loss": 0.1316, - "step": 3993 - }, - { - "epoch": 2.561898652982681, - "grad_norm": 1.4235252141952515, - "learning_rate": 7.31733447610885e-06, - "loss": 0.142, - "step": 3994 - }, - { - "epoch": 2.5625400898011543, - "grad_norm": 1.5431063175201416, - "learning_rate": 7.306620955646025e-06, - "loss": 0.1321, - "step": 3995 - }, - { - "epoch": 2.563181526619628, - "grad_norm": 1.110455870628357, - "learning_rate": 7.295907435183201e-06, - "loss": 0.1289, - "step": 3996 - }, - { - "epoch": 2.5638229634381013, - "grad_norm": 1.4950686693191528, - "learning_rate": 7.285193914720377e-06, - "loss": 0.1489, - "step": 3997 - }, - { - "epoch": 2.5644644002565746, - "grad_norm": 1.5766870975494385, - "learning_rate": 7.274480394257553e-06, - "loss": 0.1504, - "step": 3998 - }, - { - "epoch": 2.5651058370750484, - "grad_norm": 1.0853345394134521, - "learning_rate": 7.263766873794729e-06, - "loss": 0.1277, - "step": 3999 - }, - { - "epoch": 2.5657472738935216, - "grad_norm": 1.3356324434280396, - "learning_rate": 7.253053353331905e-06, - "loss": 0.1375, - "step": 4000 - }, - { - "epoch": 2.566388710711995, - "grad_norm": 1.1967498064041138, - "learning_rate": 7.242339832869082e-06, - "loss": 0.1235, - "step": 4001 - }, - { - "epoch": 2.567030147530468, - "grad_norm": 0.9464269876480103, - "learning_rate": 7.231626312406257e-06, - "loss": 0.1196, - "step": 4002 - }, - { - "epoch": 2.5676715843489415, - "grad_norm": 0.958443820476532, - "learning_rate": 7.220912791943434e-06, - "loss": 0.1258, - "step": 4003 - }, - { - "epoch": 2.5683130211674152, - "grad_norm": 1.0238155126571655, - "learning_rate": 7.210199271480609e-06, - "loss": 0.1222, - "step": 4004 - }, - { - "epoch": 2.5689544579858885, - "grad_norm": 1.0712366104125977, - "learning_rate": 7.199485751017784e-06, - "loss": 0.1317, - "step": 4005 - }, - { - "epoch": 2.569595894804362, - "grad_norm": 1.5111356973648071, - "learning_rate": 7.188772230554961e-06, - "loss": 0.1283, - "step": 4006 - }, - { - "epoch": 2.570237331622835, - "grad_norm": 1.133782148361206, - "learning_rate": 7.178058710092136e-06, - "loss": 0.1375, - "step": 4007 - }, - { - "epoch": 2.5708787684413084, - "grad_norm": 0.7717103362083435, - "learning_rate": 7.167345189629313e-06, - "loss": 0.1149, - "step": 4008 - }, - { - "epoch": 2.571520205259782, - "grad_norm": 0.8871778249740601, - "learning_rate": 7.156631669166488e-06, - "loss": 0.1281, - "step": 4009 - }, - { - "epoch": 2.5721616420782554, - "grad_norm": 1.793400764465332, - "learning_rate": 7.145918148703665e-06, - "loss": 0.163, - "step": 4010 - }, - { - "epoch": 2.5728030788967287, - "grad_norm": 1.1307915449142456, - "learning_rate": 7.1352046282408404e-06, - "loss": 0.1387, - "step": 4011 - }, - { - "epoch": 2.573444515715202, - "grad_norm": 1.124754786491394, - "learning_rate": 7.124491107778017e-06, - "loss": 0.1322, - "step": 4012 - }, - { - "epoch": 2.5740859525336752, - "grad_norm": 1.3153398036956787, - "learning_rate": 7.113777587315192e-06, - "loss": 0.1404, - "step": 4013 - }, - { - "epoch": 2.574727389352149, - "grad_norm": 0.9840689301490784, - "learning_rate": 7.103064066852367e-06, - "loss": 0.1367, - "step": 4014 - }, - { - "epoch": 2.5753688261706222, - "grad_norm": 1.2003533840179443, - "learning_rate": 7.092350546389544e-06, - "loss": 0.1229, - "step": 4015 - }, - { - "epoch": 2.5760102629890955, - "grad_norm": 1.0220985412597656, - "learning_rate": 7.08163702592672e-06, - "loss": 0.1311, - "step": 4016 - }, - { - "epoch": 2.576651699807569, - "grad_norm": 1.0134532451629639, - "learning_rate": 7.070923505463896e-06, - "loss": 0.1241, - "step": 4017 - }, - { - "epoch": 2.577293136626042, - "grad_norm": 1.1087729930877686, - "learning_rate": 7.060209985001072e-06, - "loss": 0.1302, - "step": 4018 - }, - { - "epoch": 2.577934573444516, - "grad_norm": 1.1887857913970947, - "learning_rate": 7.049496464538248e-06, - "loss": 0.1354, - "step": 4019 - }, - { - "epoch": 2.578576010262989, - "grad_norm": 1.277670979499817, - "learning_rate": 7.038782944075424e-06, - "loss": 0.1323, - "step": 4020 - }, - { - "epoch": 2.5792174470814624, - "grad_norm": 1.826845645904541, - "learning_rate": 7.028069423612599e-06, - "loss": 0.1851, - "step": 4021 - }, - { - "epoch": 2.579858883899936, - "grad_norm": 0.995048463344574, - "learning_rate": 7.017355903149775e-06, - "loss": 0.1207, - "step": 4022 - }, - { - "epoch": 2.5805003207184094, - "grad_norm": 0.9294148087501526, - "learning_rate": 7.006642382686951e-06, - "loss": 0.1215, - "step": 4023 - }, - { - "epoch": 2.5811417575368827, - "grad_norm": 0.9707247018814087, - "learning_rate": 6.995928862224127e-06, - "loss": 0.1304, - "step": 4024 - }, - { - "epoch": 2.581783194355356, - "grad_norm": 1.0668182373046875, - "learning_rate": 6.985215341761303e-06, - "loss": 0.1256, - "step": 4025 - }, - { - "epoch": 2.5824246311738293, - "grad_norm": 1.0889570713043213, - "learning_rate": 6.974501821298479e-06, - "loss": 0.1423, - "step": 4026 - }, - { - "epoch": 2.583066067992303, - "grad_norm": 1.3727424144744873, - "learning_rate": 6.963788300835655e-06, - "loss": 0.1403, - "step": 4027 - }, - { - "epoch": 2.5837075048107763, - "grad_norm": 1.3306654691696167, - "learning_rate": 6.953074780372832e-06, - "loss": 0.1286, - "step": 4028 - }, - { - "epoch": 2.5843489416292496, - "grad_norm": 1.111578345298767, - "learning_rate": 6.942361259910007e-06, - "loss": 0.1374, - "step": 4029 - }, - { - "epoch": 2.584990378447723, - "grad_norm": 1.1168971061706543, - "learning_rate": 6.931647739447182e-06, - "loss": 0.1208, - "step": 4030 - }, - { - "epoch": 2.585631815266196, - "grad_norm": 1.292501449584961, - "learning_rate": 6.9209342189843585e-06, - "loss": 0.1301, - "step": 4031 - }, - { - "epoch": 2.58627325208467, - "grad_norm": 1.0618845224380493, - "learning_rate": 6.910220698521534e-06, - "loss": 0.1143, - "step": 4032 - }, - { - "epoch": 2.586914688903143, - "grad_norm": 1.2264597415924072, - "learning_rate": 6.8995071780587106e-06, - "loss": 0.1464, - "step": 4033 - }, - { - "epoch": 2.5875561257216164, - "grad_norm": 1.1346616744995117, - "learning_rate": 6.888793657595886e-06, - "loss": 0.1318, - "step": 4034 - }, - { - "epoch": 2.5881975625400897, - "grad_norm": 1.4762718677520752, - "learning_rate": 6.878080137133063e-06, - "loss": 0.1393, - "step": 4035 - }, - { - "epoch": 2.588838999358563, - "grad_norm": 1.1119352579116821, - "learning_rate": 6.867366616670238e-06, - "loss": 0.1276, - "step": 4036 - }, - { - "epoch": 2.5894804361770367, - "grad_norm": 1.310075044631958, - "learning_rate": 6.856653096207415e-06, - "loss": 0.1506, - "step": 4037 - }, - { - "epoch": 2.59012187299551, - "grad_norm": 1.6389662027359009, - "learning_rate": 6.8459395757445895e-06, - "loss": 0.1423, - "step": 4038 - }, - { - "epoch": 2.5907633098139833, - "grad_norm": 1.4276032447814941, - "learning_rate": 6.835226055281765e-06, - "loss": 0.1472, - "step": 4039 - }, - { - "epoch": 2.5914047466324566, - "grad_norm": 1.4099501371383667, - "learning_rate": 6.8245125348189415e-06, - "loss": 0.1405, - "step": 4040 - }, - { - "epoch": 2.59204618345093, - "grad_norm": 1.2115596532821655, - "learning_rate": 6.813799014356118e-06, - "loss": 0.1422, - "step": 4041 - }, - { - "epoch": 2.5926876202694036, - "grad_norm": 1.018636703491211, - "learning_rate": 6.8030854938932936e-06, - "loss": 0.1325, - "step": 4042 - }, - { - "epoch": 2.593329057087877, - "grad_norm": 0.8973970413208008, - "learning_rate": 6.79237197343047e-06, - "loss": 0.119, - "step": 4043 - }, - { - "epoch": 2.59397049390635, - "grad_norm": 1.4759984016418457, - "learning_rate": 6.781658452967646e-06, - "loss": 0.1403, - "step": 4044 - }, - { - "epoch": 2.594611930724824, - "grad_norm": 1.380722999572754, - "learning_rate": 6.770944932504822e-06, - "loss": 0.1453, - "step": 4045 - }, - { - "epoch": 2.5952533675432967, - "grad_norm": 1.4576919078826904, - "learning_rate": 6.760231412041998e-06, - "loss": 0.1378, - "step": 4046 - }, - { - "epoch": 2.5958948043617704, - "grad_norm": 1.3540863990783691, - "learning_rate": 6.7495178915791725e-06, - "loss": 0.1317, - "step": 4047 - }, - { - "epoch": 2.5965362411802437, - "grad_norm": 1.2750369310379028, - "learning_rate": 6.738804371116349e-06, - "loss": 0.1253, - "step": 4048 - }, - { - "epoch": 2.597177677998717, - "grad_norm": 1.23829984664917, - "learning_rate": 6.7280908506535245e-06, - "loss": 0.1345, - "step": 4049 - }, - { - "epoch": 2.5978191148171907, - "grad_norm": 1.3829083442687988, - "learning_rate": 6.717377330190701e-06, - "loss": 0.1333, - "step": 4050 - }, - { - "epoch": 2.598460551635664, - "grad_norm": 1.8275178670883179, - "learning_rate": 6.706663809727877e-06, - "loss": 0.1628, - "step": 4051 - }, - { - "epoch": 2.5991019884541373, - "grad_norm": 1.2905542850494385, - "learning_rate": 6.695950289265053e-06, - "loss": 0.1313, - "step": 4052 - }, - { - "epoch": 2.5997434252726106, - "grad_norm": 1.4603614807128906, - "learning_rate": 6.6852367688022295e-06, - "loss": 0.1357, - "step": 4053 - }, - { - "epoch": 2.600384862091084, - "grad_norm": 1.155962347984314, - "learning_rate": 6.674523248339405e-06, - "loss": 0.1241, - "step": 4054 - }, - { - "epoch": 2.6010262989095576, - "grad_norm": 1.2025399208068848, - "learning_rate": 6.6638097278765816e-06, - "loss": 0.1293, - "step": 4055 - }, - { - "epoch": 2.601667735728031, - "grad_norm": 0.9878100156784058, - "learning_rate": 6.653096207413756e-06, - "loss": 0.1237, - "step": 4056 - }, - { - "epoch": 2.602309172546504, - "grad_norm": 1.2571154832839966, - "learning_rate": 6.642382686950932e-06, - "loss": 0.1347, - "step": 4057 - }, - { - "epoch": 2.6029506093649775, - "grad_norm": 1.1804012060165405, - "learning_rate": 6.631669166488108e-06, - "loss": 0.1371, - "step": 4058 - }, - { - "epoch": 2.6035920461834507, - "grad_norm": 1.1166192293167114, - "learning_rate": 6.620955646025284e-06, - "loss": 0.123, - "step": 4059 - }, - { - "epoch": 2.6042334830019245, - "grad_norm": 0.9614681601524353, - "learning_rate": 6.6102421255624604e-06, - "loss": 0.122, - "step": 4060 - }, - { - "epoch": 2.6048749198203978, - "grad_norm": 0.9777079820632935, - "learning_rate": 6.599528605099636e-06, - "loss": 0.128, - "step": 4061 - }, - { - "epoch": 2.605516356638871, - "grad_norm": 0.8104792833328247, - "learning_rate": 6.5888150846368125e-06, - "loss": 0.1204, - "step": 4062 - }, - { - "epoch": 2.6061577934573443, - "grad_norm": 1.7859212160110474, - "learning_rate": 6.578101564173988e-06, - "loss": 0.158, - "step": 4063 - }, - { - "epoch": 2.6067992302758176, - "grad_norm": 1.0874110460281372, - "learning_rate": 6.567388043711163e-06, - "loss": 0.1218, - "step": 4064 - }, - { - "epoch": 2.6074406670942913, - "grad_norm": 1.3639417886734009, - "learning_rate": 6.556674523248339e-06, - "loss": 0.1373, - "step": 4065 - }, - { - "epoch": 2.6080821039127646, - "grad_norm": 1.6091362237930298, - "learning_rate": 6.545961002785515e-06, - "loss": 0.1345, - "step": 4066 - }, - { - "epoch": 2.608723540731238, - "grad_norm": 0.811979353427887, - "learning_rate": 6.535247482322691e-06, - "loss": 0.1266, - "step": 4067 - }, - { - "epoch": 2.609364977549711, - "grad_norm": 1.560934066772461, - "learning_rate": 6.524533961859868e-06, - "loss": 0.1253, - "step": 4068 - }, - { - "epoch": 2.6100064143681845, - "grad_norm": 0.9623285531997681, - "learning_rate": 6.5138204413970434e-06, - "loss": 0.1231, - "step": 4069 - }, - { - "epoch": 2.610647851186658, - "grad_norm": 1.1945290565490723, - "learning_rate": 6.50310692093422e-06, - "loss": 0.1296, - "step": 4070 - }, - { - "epoch": 2.6112892880051315, - "grad_norm": 1.3928170204162598, - "learning_rate": 6.4923934004713955e-06, - "loss": 0.1352, - "step": 4071 - }, - { - "epoch": 2.6119307248236048, - "grad_norm": 0.8489125967025757, - "learning_rate": 6.481679880008572e-06, - "loss": 0.1306, - "step": 4072 - }, - { - "epoch": 2.6125721616420785, - "grad_norm": 1.6212600469589233, - "learning_rate": 6.470966359545747e-06, - "loss": 0.156, - "step": 4073 - }, - { - "epoch": 2.613213598460552, - "grad_norm": 1.660519003868103, - "learning_rate": 6.460252839082922e-06, - "loss": 0.1469, - "step": 4074 - }, - { - "epoch": 2.613855035279025, - "grad_norm": 1.102065086364746, - "learning_rate": 6.449539318620099e-06, - "loss": 0.1274, - "step": 4075 - }, - { - "epoch": 2.6144964720974984, - "grad_norm": 1.271128535270691, - "learning_rate": 6.438825798157274e-06, - "loss": 0.1331, - "step": 4076 - }, - { - "epoch": 2.6151379089159716, - "grad_norm": 1.7861602306365967, - "learning_rate": 6.428112277694451e-06, - "loss": 0.1579, - "step": 4077 - }, - { - "epoch": 2.6157793457344454, - "grad_norm": 1.6415293216705322, - "learning_rate": 6.4173987572316265e-06, - "loss": 0.1515, - "step": 4078 - }, - { - "epoch": 2.6164207825529187, - "grad_norm": 1.377023696899414, - "learning_rate": 6.406685236768803e-06, - "loss": 0.1389, - "step": 4079 - }, - { - "epoch": 2.617062219371392, - "grad_norm": 1.3199048042297363, - "learning_rate": 6.395971716305979e-06, - "loss": 0.1293, - "step": 4080 - }, - { - "epoch": 2.6177036561898652, - "grad_norm": 1.3363102674484253, - "learning_rate": 6.385258195843154e-06, - "loss": 0.1337, - "step": 4081 - }, - { - "epoch": 2.6183450930083385, - "grad_norm": 1.1714617013931274, - "learning_rate": 6.37454467538033e-06, - "loss": 0.1208, - "step": 4082 - }, - { - "epoch": 2.6189865298268122, - "grad_norm": 1.302958607673645, - "learning_rate": 6.363831154917506e-06, - "loss": 0.1386, - "step": 4083 - }, - { - "epoch": 2.6196279666452855, - "grad_norm": 1.475156307220459, - "learning_rate": 6.353117634454682e-06, - "loss": 0.1525, - "step": 4084 - }, - { - "epoch": 2.620269403463759, - "grad_norm": 1.3617374897003174, - "learning_rate": 6.342404113991858e-06, - "loss": 0.1397, - "step": 4085 - }, - { - "epoch": 2.620910840282232, - "grad_norm": 1.5161609649658203, - "learning_rate": 6.331690593529034e-06, - "loss": 0.1467, - "step": 4086 - }, - { - "epoch": 2.6215522771007054, - "grad_norm": 0.8384643197059631, - "learning_rate": 6.32097707306621e-06, - "loss": 0.1179, - "step": 4087 - }, - { - "epoch": 2.622193713919179, - "grad_norm": 1.2657885551452637, - "learning_rate": 6.310263552603386e-06, - "loss": 0.1304, - "step": 4088 - }, - { - "epoch": 2.6228351507376524, - "grad_norm": 0.882183313369751, - "learning_rate": 6.299550032140562e-06, - "loss": 0.1195, - "step": 4089 - }, - { - "epoch": 2.6234765875561257, - "grad_norm": 1.8513755798339844, - "learning_rate": 6.288836511677737e-06, - "loss": 0.1567, - "step": 4090 - }, - { - "epoch": 2.624118024374599, - "grad_norm": 1.163177251815796, - "learning_rate": 6.278122991214913e-06, - "loss": 0.1317, - "step": 4091 - }, - { - "epoch": 2.6247594611930722, - "grad_norm": 0.9479789733886719, - "learning_rate": 6.267409470752089e-06, - "loss": 0.1227, - "step": 4092 - }, - { - "epoch": 2.625400898011546, - "grad_norm": 1.6329010725021362, - "learning_rate": 6.256695950289266e-06, - "loss": 0.1415, - "step": 4093 - }, - { - "epoch": 2.6260423348300193, - "grad_norm": 1.2005977630615234, - "learning_rate": 6.245982429826441e-06, - "loss": 0.125, - "step": 4094 - }, - { - "epoch": 2.6266837716484925, - "grad_norm": 1.129456877708435, - "learning_rate": 6.235268909363618e-06, - "loss": 0.1417, - "step": 4095 - }, - { - "epoch": 2.6273252084669663, - "grad_norm": 1.1442315578460693, - "learning_rate": 6.2245553889007925e-06, - "loss": 0.1376, - "step": 4096 - }, - { - "epoch": 2.627966645285439, - "grad_norm": 1.1272567510604858, - "learning_rate": 6.213841868437969e-06, - "loss": 0.1358, - "step": 4097 - }, - { - "epoch": 2.628608082103913, - "grad_norm": 1.1279352903366089, - "learning_rate": 6.2031283479751445e-06, - "loss": 0.1263, - "step": 4098 - }, - { - "epoch": 2.629249518922386, - "grad_norm": 0.9839051961898804, - "learning_rate": 6.192414827512321e-06, - "loss": 0.1249, - "step": 4099 - }, - { - "epoch": 2.6298909557408594, - "grad_norm": 1.295475721359253, - "learning_rate": 6.1817013070494974e-06, - "loss": 0.138, - "step": 4100 - }, - { - "epoch": 2.630532392559333, - "grad_norm": 1.2463798522949219, - "learning_rate": 6.170987786586672e-06, - "loss": 0.1348, - "step": 4101 - }, - { - "epoch": 2.6311738293778064, - "grad_norm": 1.1719963550567627, - "learning_rate": 6.160274266123849e-06, - "loss": 0.1296, - "step": 4102 - }, - { - "epoch": 2.6318152661962797, - "grad_norm": 1.611863374710083, - "learning_rate": 6.149560745661024e-06, - "loss": 0.1469, - "step": 4103 - }, - { - "epoch": 2.632456703014753, - "grad_norm": 1.1776220798492432, - "learning_rate": 6.138847225198201e-06, - "loss": 0.138, - "step": 4104 - }, - { - "epoch": 2.6330981398332263, - "grad_norm": 1.4355055093765259, - "learning_rate": 6.128133704735376e-06, - "loss": 0.1568, - "step": 4105 - }, - { - "epoch": 2.6337395766517, - "grad_norm": 1.260015606880188, - "learning_rate": 6.117420184272552e-06, - "loss": 0.1319, - "step": 4106 - }, - { - "epoch": 2.6343810134701733, - "grad_norm": 0.9545776844024658, - "learning_rate": 6.106706663809728e-06, - "loss": 0.1319, - "step": 4107 - }, - { - "epoch": 2.6350224502886466, - "grad_norm": 1.0864201784133911, - "learning_rate": 6.095993143346904e-06, - "loss": 0.1176, - "step": 4108 - }, - { - "epoch": 2.63566388710712, - "grad_norm": 1.1299591064453125, - "learning_rate": 6.08527962288408e-06, - "loss": 0.1356, - "step": 4109 - }, - { - "epoch": 2.636305323925593, - "grad_norm": 1.5231860876083374, - "learning_rate": 6.074566102421256e-06, - "loss": 0.1379, - "step": 4110 - }, - { - "epoch": 2.636946760744067, - "grad_norm": 1.0059795379638672, - "learning_rate": 6.063852581958432e-06, - "loss": 0.1315, - "step": 4111 - }, - { - "epoch": 2.63758819756254, - "grad_norm": 1.2856775522232056, - "learning_rate": 6.053139061495608e-06, - "loss": 0.133, - "step": 4112 - }, - { - "epoch": 2.6382296343810134, - "grad_norm": 0.8785386681556702, - "learning_rate": 6.042425541032784e-06, - "loss": 0.1233, - "step": 4113 - }, - { - "epoch": 2.6388710711994867, - "grad_norm": 1.1647133827209473, - "learning_rate": 6.031712020569959e-06, - "loss": 0.128, - "step": 4114 - }, - { - "epoch": 2.63951250801796, - "grad_norm": 1.4470633268356323, - "learning_rate": 6.020998500107136e-06, - "loss": 0.1421, - "step": 4115 - }, - { - "epoch": 2.6401539448364337, - "grad_norm": 1.1758543252944946, - "learning_rate": 6.010284979644311e-06, - "loss": 0.1382, - "step": 4116 - }, - { - "epoch": 2.640795381654907, - "grad_norm": 0.9988510012626648, - "learning_rate": 5.999571459181488e-06, - "loss": 0.1211, - "step": 4117 - }, - { - "epoch": 2.6414368184733803, - "grad_norm": 1.530503749847412, - "learning_rate": 5.988857938718663e-06, - "loss": 0.1412, - "step": 4118 - }, - { - "epoch": 2.642078255291854, - "grad_norm": 1.1362252235412598, - "learning_rate": 5.978144418255839e-06, - "loss": 0.1287, - "step": 4119 - }, - { - "epoch": 2.642719692110327, - "grad_norm": 1.1719805002212524, - "learning_rate": 5.9674308977930155e-06, - "loss": 0.1326, - "step": 4120 - }, - { - "epoch": 2.6433611289288006, - "grad_norm": 1.2522213459014893, - "learning_rate": 5.956717377330191e-06, - "loss": 0.1472, - "step": 4121 - }, - { - "epoch": 2.644002565747274, - "grad_norm": 1.4226981401443481, - "learning_rate": 5.946003856867367e-06, - "loss": 0.1404, - "step": 4122 - }, - { - "epoch": 2.644644002565747, - "grad_norm": 1.4543672800064087, - "learning_rate": 5.935290336404542e-06, - "loss": 0.1324, - "step": 4123 - }, - { - "epoch": 2.645285439384221, - "grad_norm": 1.3847362995147705, - "learning_rate": 5.924576815941719e-06, - "loss": 0.1474, - "step": 4124 - }, - { - "epoch": 2.645926876202694, - "grad_norm": 1.019129991531372, - "learning_rate": 5.913863295478895e-06, - "loss": 0.1221, - "step": 4125 - }, - { - "epoch": 2.6465683130211675, - "grad_norm": 1.522363543510437, - "learning_rate": 5.90314977501607e-06, - "loss": 0.1465, - "step": 4126 - }, - { - "epoch": 2.6472097498396407, - "grad_norm": 1.222892165184021, - "learning_rate": 5.8924362545532465e-06, - "loss": 0.1291, - "step": 4127 - }, - { - "epoch": 2.647851186658114, - "grad_norm": 1.1421548128128052, - "learning_rate": 5.881722734090422e-06, - "loss": 0.1231, - "step": 4128 - }, - { - "epoch": 2.6484926234765878, - "grad_norm": 1.2600369453430176, - "learning_rate": 5.8710092136275985e-06, - "loss": 0.1371, - "step": 4129 - }, - { - "epoch": 2.649134060295061, - "grad_norm": 1.126489520072937, - "learning_rate": 5.860295693164774e-06, - "loss": 0.1295, - "step": 4130 - }, - { - "epoch": 2.6497754971135343, - "grad_norm": 1.3427540063858032, - "learning_rate": 5.84958217270195e-06, - "loss": 0.1451, - "step": 4131 - }, - { - "epoch": 2.6504169339320076, - "grad_norm": 1.1895647048950195, - "learning_rate": 5.838868652239126e-06, - "loss": 0.1339, - "step": 4132 - }, - { - "epoch": 2.651058370750481, - "grad_norm": 1.0652419328689575, - "learning_rate": 5.828155131776302e-06, - "loss": 0.1317, - "step": 4133 - }, - { - "epoch": 2.6516998075689546, - "grad_norm": 1.3015679121017456, - "learning_rate": 5.817441611313478e-06, - "loss": 0.1374, - "step": 4134 - }, - { - "epoch": 2.652341244387428, - "grad_norm": 1.5426417589187622, - "learning_rate": 5.806728090850654e-06, - "loss": 0.1473, - "step": 4135 - }, - { - "epoch": 2.652982681205901, - "grad_norm": 1.074980616569519, - "learning_rate": 5.7960145703878295e-06, - "loss": 0.1325, - "step": 4136 - }, - { - "epoch": 2.6536241180243745, - "grad_norm": 1.6689388751983643, - "learning_rate": 5.785301049925006e-06, - "loss": 0.1533, - "step": 4137 - }, - { - "epoch": 2.6542655548428478, - "grad_norm": 1.255411982536316, - "learning_rate": 5.7745875294621815e-06, - "loss": 0.1382, - "step": 4138 - }, - { - "epoch": 2.6549069916613215, - "grad_norm": 1.1205023527145386, - "learning_rate": 5.763874008999357e-06, - "loss": 0.133, - "step": 4139 - }, - { - "epoch": 2.6555484284797948, - "grad_norm": 1.0363260507583618, - "learning_rate": 5.753160488536534e-06, - "loss": 0.1251, - "step": 4140 - }, - { - "epoch": 2.656189865298268, - "grad_norm": 1.4009459018707275, - "learning_rate": 5.742446968073709e-06, - "loss": 0.1285, - "step": 4141 - }, - { - "epoch": 2.6568313021167413, - "grad_norm": 1.1620614528656006, - "learning_rate": 5.731733447610886e-06, - "loss": 0.146, - "step": 4142 - }, - { - "epoch": 2.6574727389352146, - "grad_norm": 1.7865922451019287, - "learning_rate": 5.7210199271480604e-06, - "loss": 0.1426, - "step": 4143 - }, - { - "epoch": 2.6581141757536884, - "grad_norm": 0.8857876658439636, - "learning_rate": 5.710306406685237e-06, - "loss": 0.1229, - "step": 4144 - }, - { - "epoch": 2.6587556125721616, - "grad_norm": 1.069732427597046, - "learning_rate": 5.699592886222413e-06, - "loss": 0.1298, - "step": 4145 - }, - { - "epoch": 2.659397049390635, - "grad_norm": 0.8448216319084167, - "learning_rate": 5.688879365759589e-06, - "loss": 0.1186, - "step": 4146 - }, - { - "epoch": 2.6600384862091087, - "grad_norm": 1.5762423276901245, - "learning_rate": 5.678165845296765e-06, - "loss": 0.1374, - "step": 4147 - }, - { - "epoch": 2.660679923027582, - "grad_norm": 1.4288543462753296, - "learning_rate": 5.66745232483394e-06, - "loss": 0.1389, - "step": 4148 - }, - { - "epoch": 2.6613213598460552, - "grad_norm": 1.6838182210922241, - "learning_rate": 5.656738804371117e-06, - "loss": 0.1668, - "step": 4149 - }, - { - "epoch": 2.6619627966645285, - "grad_norm": 1.0502722263336182, - "learning_rate": 5.646025283908292e-06, - "loss": 0.12, - "step": 4150 - }, - { - "epoch": 2.662604233483002, - "grad_norm": 1.086418867111206, - "learning_rate": 5.635311763445469e-06, - "loss": 0.1262, - "step": 4151 - }, - { - "epoch": 2.6632456703014755, - "grad_norm": 1.5055456161499023, - "learning_rate": 5.624598242982644e-06, - "loss": 0.1399, - "step": 4152 - }, - { - "epoch": 2.663887107119949, - "grad_norm": 1.1075661182403564, - "learning_rate": 5.61388472251982e-06, - "loss": 0.1346, - "step": 4153 - }, - { - "epoch": 2.664528543938422, - "grad_norm": 0.8924465179443359, - "learning_rate": 5.603171202056996e-06, - "loss": 0.131, - "step": 4154 - }, - { - "epoch": 2.6651699807568954, - "grad_norm": 1.0835427045822144, - "learning_rate": 5.592457681594172e-06, - "loss": 0.1334, - "step": 4155 - }, - { - "epoch": 2.6658114175753687, - "grad_norm": 1.028824806213379, - "learning_rate": 5.5817441611313476e-06, - "loss": 0.1339, - "step": 4156 - }, - { - "epoch": 2.6664528543938424, - "grad_norm": 1.0903544425964355, - "learning_rate": 5.571030640668524e-06, - "loss": 0.1235, - "step": 4157 - }, - { - "epoch": 2.6670942912123157, - "grad_norm": 1.2484084367752075, - "learning_rate": 5.5603171202057e-06, - "loss": 0.1263, - "step": 4158 - }, - { - "epoch": 2.667735728030789, - "grad_norm": 1.2601149082183838, - "learning_rate": 5.549603599742876e-06, - "loss": 0.142, - "step": 4159 - }, - { - "epoch": 2.6683771648492622, - "grad_norm": 1.0408132076263428, - "learning_rate": 5.538890079280052e-06, - "loss": 0.1272, - "step": 4160 - }, - { - "epoch": 2.6690186016677355, - "grad_norm": 1.408660888671875, - "learning_rate": 5.528176558817227e-06, - "loss": 0.1331, - "step": 4161 - }, - { - "epoch": 2.6696600384862093, - "grad_norm": 1.1928631067276, - "learning_rate": 5.517463038354404e-06, - "loss": 0.1403, - "step": 4162 - }, - { - "epoch": 2.6703014753046825, - "grad_norm": 1.1784063577651978, - "learning_rate": 5.506749517891579e-06, - "loss": 0.1439, - "step": 4163 - }, - { - "epoch": 2.670942912123156, - "grad_norm": 1.6775654554367065, - "learning_rate": 5.496035997428756e-06, - "loss": 0.1477, - "step": 4164 - }, - { - "epoch": 2.671584348941629, - "grad_norm": 1.2728732824325562, - "learning_rate": 5.485322476965931e-06, - "loss": 0.143, - "step": 4165 - }, - { - "epoch": 2.6722257857601024, - "grad_norm": 0.9841781854629517, - "learning_rate": 5.474608956503107e-06, - "loss": 0.1285, - "step": 4166 - }, - { - "epoch": 2.672867222578576, - "grad_norm": 1.1160691976547241, - "learning_rate": 5.4638954360402835e-06, - "loss": 0.1386, - "step": 4167 - }, - { - "epoch": 2.6735086593970494, - "grad_norm": 1.1144472360610962, - "learning_rate": 5.453181915577459e-06, - "loss": 0.1376, - "step": 4168 - }, - { - "epoch": 2.6741500962155227, - "grad_norm": 1.416019320487976, - "learning_rate": 5.442468395114635e-06, - "loss": 0.1349, - "step": 4169 - }, - { - "epoch": 2.6747915330339964, - "grad_norm": 1.3004915714263916, - "learning_rate": 5.43175487465181e-06, - "loss": 0.1333, - "step": 4170 - }, - { - "epoch": 2.6754329698524693, - "grad_norm": 1.1656025648117065, - "learning_rate": 5.421041354188987e-06, - "loss": 0.1282, - "step": 4171 - }, - { - "epoch": 2.676074406670943, - "grad_norm": 0.8811773657798767, - "learning_rate": 5.410327833726163e-06, - "loss": 0.1184, - "step": 4172 - }, - { - "epoch": 2.6767158434894163, - "grad_norm": 1.7167236804962158, - "learning_rate": 5.399614313263338e-06, - "loss": 0.1576, - "step": 4173 - }, - { - "epoch": 2.6773572803078896, - "grad_norm": 1.0785207748413086, - "learning_rate": 5.388900792800514e-06, - "loss": 0.1207, - "step": 4174 - }, - { - "epoch": 2.6779987171263633, - "grad_norm": 1.3618459701538086, - "learning_rate": 5.37818727233769e-06, - "loss": 0.1433, - "step": 4175 - }, - { - "epoch": 2.6786401539448366, - "grad_norm": 1.2021962404251099, - "learning_rate": 5.3674737518748665e-06, - "loss": 0.1329, - "step": 4176 - }, - { - "epoch": 2.67928159076331, - "grad_norm": 1.600033164024353, - "learning_rate": 5.356760231412043e-06, - "loss": 0.1488, - "step": 4177 - }, - { - "epoch": 2.679923027581783, - "grad_norm": 1.1401537656784058, - "learning_rate": 5.346046710949218e-06, - "loss": 0.1343, - "step": 4178 - }, - { - "epoch": 2.6805644644002564, - "grad_norm": 0.7640076875686646, - "learning_rate": 5.335333190486394e-06, - "loss": 0.1218, - "step": 4179 - }, - { - "epoch": 2.68120590121873, - "grad_norm": 1.8116148710250854, - "learning_rate": 5.32461967002357e-06, - "loss": 0.1563, - "step": 4180 - }, - { - "epoch": 2.6818473380372034, - "grad_norm": 1.268418312072754, - "learning_rate": 5.313906149560746e-06, - "loss": 0.1431, - "step": 4181 - }, - { - "epoch": 2.6824887748556767, - "grad_norm": 0.9452699422836304, - "learning_rate": 5.303192629097922e-06, - "loss": 0.1316, - "step": 4182 - }, - { - "epoch": 2.68313021167415, - "grad_norm": 1.4100768566131592, - "learning_rate": 5.2924791086350974e-06, - "loss": 0.1438, - "step": 4183 - }, - { - "epoch": 2.6837716484926233, - "grad_norm": 1.6345206499099731, - "learning_rate": 5.281765588172274e-06, - "loss": 0.1632, - "step": 4184 - }, - { - "epoch": 2.684413085311097, - "grad_norm": 0.9739651679992676, - "learning_rate": 5.2710520677094495e-06, - "loss": 0.1218, - "step": 4185 - }, - { - "epoch": 2.6850545221295703, - "grad_norm": 1.3922995328903198, - "learning_rate": 5.260338547246625e-06, - "loss": 0.1438, - "step": 4186 - }, - { - "epoch": 2.6856959589480436, - "grad_norm": 1.0910342931747437, - "learning_rate": 5.2496250267838016e-06, - "loss": 0.1342, - "step": 4187 - }, - { - "epoch": 2.686337395766517, - "grad_norm": 1.1579029560089111, - "learning_rate": 5.238911506320977e-06, - "loss": 0.133, - "step": 4188 - }, - { - "epoch": 2.68697883258499, - "grad_norm": 1.2137489318847656, - "learning_rate": 5.228197985858154e-06, - "loss": 0.1295, - "step": 4189 - }, - { - "epoch": 2.687620269403464, - "grad_norm": 1.0507328510284424, - "learning_rate": 5.217484465395329e-06, - "loss": 0.1299, - "step": 4190 - }, - { - "epoch": 2.688261706221937, - "grad_norm": 1.4330322742462158, - "learning_rate": 5.206770944932505e-06, - "loss": 0.143, - "step": 4191 - }, - { - "epoch": 2.6889031430404104, - "grad_norm": 1.3504648208618164, - "learning_rate": 5.196057424469681e-06, - "loss": 0.1371, - "step": 4192 - }, - { - "epoch": 2.689544579858884, - "grad_norm": 1.0967155694961548, - "learning_rate": 5.185343904006857e-06, - "loss": 0.126, - "step": 4193 - }, - { - "epoch": 2.690186016677357, - "grad_norm": 1.30768620967865, - "learning_rate": 5.174630383544033e-06, - "loss": 0.1357, - "step": 4194 - }, - { - "epoch": 2.6908274534958307, - "grad_norm": 1.7128938436508179, - "learning_rate": 5.163916863081208e-06, - "loss": 0.1632, - "step": 4195 - }, - { - "epoch": 2.691468890314304, - "grad_norm": 1.4362348318099976, - "learning_rate": 5.1532033426183846e-06, - "loss": 0.1435, - "step": 4196 - }, - { - "epoch": 2.6921103271327773, - "grad_norm": 1.709822177886963, - "learning_rate": 5.142489822155561e-06, - "loss": 0.1635, - "step": 4197 - }, - { - "epoch": 2.692751763951251, - "grad_norm": 1.3370851278305054, - "learning_rate": 5.131776301692737e-06, - "loss": 0.1383, - "step": 4198 - }, - { - "epoch": 2.6933932007697243, - "grad_norm": 1.2141880989074707, - "learning_rate": 5.121062781229912e-06, - "loss": 0.1289, - "step": 4199 - }, - { - "epoch": 2.6940346375881976, - "grad_norm": 1.4725722074508667, - "learning_rate": 5.110349260767088e-06, - "loss": 0.121, - "step": 4200 - }, - { - "epoch": 2.694676074406671, - "grad_norm": 1.720637321472168, - "learning_rate": 5.099635740304264e-06, - "loss": 0.1454, - "step": 4201 - }, - { - "epoch": 2.695317511225144, - "grad_norm": 1.5568360090255737, - "learning_rate": 5.08892221984144e-06, - "loss": 0.1513, - "step": 4202 - }, - { - "epoch": 2.695958948043618, - "grad_norm": 1.9075015783309937, - "learning_rate": 5.078208699378616e-06, - "loss": 0.1712, - "step": 4203 - }, - { - "epoch": 2.696600384862091, - "grad_norm": 1.2837096452713013, - "learning_rate": 5.067495178915792e-06, - "loss": 0.1249, - "step": 4204 - }, - { - "epoch": 2.6972418216805645, - "grad_norm": 1.092902421951294, - "learning_rate": 5.0567816584529676e-06, - "loss": 0.1271, - "step": 4205 - }, - { - "epoch": 2.6978832584990378, - "grad_norm": 1.3000577688217163, - "learning_rate": 5.046068137990144e-06, - "loss": 0.1291, - "step": 4206 - }, - { - "epoch": 2.698524695317511, - "grad_norm": 1.5093960762023926, - "learning_rate": 5.03535461752732e-06, - "loss": 0.1435, - "step": 4207 - }, - { - "epoch": 2.6991661321359848, - "grad_norm": 0.7910041213035583, - "learning_rate": 5.024641097064495e-06, - "loss": 0.1163, - "step": 4208 - }, - { - "epoch": 2.699807568954458, - "grad_norm": 1.4094116687774658, - "learning_rate": 5.013927576601672e-06, - "loss": 0.1437, - "step": 4209 - }, - { - "epoch": 2.7004490057729313, - "grad_norm": 1.2681564092636108, - "learning_rate": 5.003214056138847e-06, - "loss": 0.122, - "step": 4210 - }, - { - "epoch": 2.7010904425914046, - "grad_norm": 0.9176029562950134, - "learning_rate": 4.992500535676024e-06, - "loss": 0.1312, - "step": 4211 - }, - { - "epoch": 2.701731879409878, - "grad_norm": 0.9874898195266724, - "learning_rate": 4.981787015213199e-06, - "loss": 0.1218, - "step": 4212 - }, - { - "epoch": 2.7023733162283516, - "grad_norm": 1.2025123834609985, - "learning_rate": 4.971073494750375e-06, - "loss": 0.1333, - "step": 4213 - }, - { - "epoch": 2.703014753046825, - "grad_norm": 1.2739183902740479, - "learning_rate": 4.9603599742875514e-06, - "loss": 0.1331, - "step": 4214 - }, - { - "epoch": 2.703656189865298, - "grad_norm": 1.3216733932495117, - "learning_rate": 4.949646453824727e-06, - "loss": 0.1245, - "step": 4215 - }, - { - "epoch": 2.7042976266837715, - "grad_norm": 0.9566259384155273, - "learning_rate": 4.938932933361903e-06, - "loss": 0.1211, - "step": 4216 - }, - { - "epoch": 2.704939063502245, - "grad_norm": 1.2412692308425903, - "learning_rate": 4.928219412899079e-06, - "loss": 0.1359, - "step": 4217 - }, - { - "epoch": 2.7055805003207185, - "grad_norm": 1.0969536304473877, - "learning_rate": 4.917505892436255e-06, - "loss": 0.1333, - "step": 4218 - }, - { - "epoch": 2.706221937139192, - "grad_norm": 0.8777954578399658, - "learning_rate": 4.906792371973431e-06, - "loss": 0.1262, - "step": 4219 - }, - { - "epoch": 2.706863373957665, - "grad_norm": 1.2526463270187378, - "learning_rate": 4.896078851510607e-06, - "loss": 0.1305, - "step": 4220 - }, - { - "epoch": 2.707504810776139, - "grad_norm": 1.906809687614441, - "learning_rate": 4.885365331047782e-06, - "loss": 0.1502, - "step": 4221 - }, - { - "epoch": 2.708146247594612, - "grad_norm": 1.1972119808197021, - "learning_rate": 4.874651810584958e-06, - "loss": 0.1306, - "step": 4222 - }, - { - "epoch": 2.7087876844130854, - "grad_norm": 1.1893153190612793, - "learning_rate": 4.8639382901221344e-06, - "loss": 0.1306, - "step": 4223 - }, - { - "epoch": 2.7094291212315587, - "grad_norm": 1.3448994159698486, - "learning_rate": 4.853224769659311e-06, - "loss": 0.1422, - "step": 4224 - }, - { - "epoch": 2.710070558050032, - "grad_norm": 1.011138916015625, - "learning_rate": 4.842511249196486e-06, - "loss": 0.1264, - "step": 4225 - }, - { - "epoch": 2.7107119948685057, - "grad_norm": 0.8733639121055603, - "learning_rate": 4.831797728733662e-06, - "loss": 0.12, - "step": 4226 - }, - { - "epoch": 2.711353431686979, - "grad_norm": 1.3832772970199585, - "learning_rate": 4.821084208270838e-06, - "loss": 0.133, - "step": 4227 - }, - { - "epoch": 2.7119948685054522, - "grad_norm": 0.7394426465034485, - "learning_rate": 4.810370687808014e-06, - "loss": 0.1193, - "step": 4228 - }, - { - "epoch": 2.7126363053239255, - "grad_norm": 0.9467939138412476, - "learning_rate": 4.79965716734519e-06, - "loss": 0.1159, - "step": 4229 - }, - { - "epoch": 2.713277742142399, - "grad_norm": 1.3000915050506592, - "learning_rate": 4.788943646882365e-06, - "loss": 0.1316, - "step": 4230 - }, - { - "epoch": 2.7139191789608725, - "grad_norm": 1.5660127401351929, - "learning_rate": 4.778230126419542e-06, - "loss": 0.1376, - "step": 4231 - }, - { - "epoch": 2.714560615779346, - "grad_norm": 1.1400048732757568, - "learning_rate": 4.7675166059567174e-06, - "loss": 0.1339, - "step": 4232 - }, - { - "epoch": 2.715202052597819, - "grad_norm": 1.1241282224655151, - "learning_rate": 4.756803085493894e-06, - "loss": 0.1383, - "step": 4233 - }, - { - "epoch": 2.7158434894162924, - "grad_norm": 1.1603556871414185, - "learning_rate": 4.7460895650310695e-06, - "loss": 0.1227, - "step": 4234 - }, - { - "epoch": 2.7164849262347657, - "grad_norm": 1.279358983039856, - "learning_rate": 4.735376044568245e-06, - "loss": 0.1361, - "step": 4235 - }, - { - "epoch": 2.7171263630532394, - "grad_norm": 1.0536788702011108, - "learning_rate": 4.7246625241054216e-06, - "loss": 0.1255, - "step": 4236 - }, - { - "epoch": 2.7177677998717127, - "grad_norm": 1.5214835405349731, - "learning_rate": 4.713949003642597e-06, - "loss": 0.129, - "step": 4237 - }, - { - "epoch": 2.718409236690186, - "grad_norm": 1.062433123588562, - "learning_rate": 4.703235483179773e-06, - "loss": 0.1355, - "step": 4238 - }, - { - "epoch": 2.7190506735086593, - "grad_norm": 1.085932970046997, - "learning_rate": 4.692521962716949e-06, - "loss": 0.1207, - "step": 4239 - }, - { - "epoch": 2.7196921103271325, - "grad_norm": 1.1517558097839355, - "learning_rate": 4.681808442254125e-06, - "loss": 0.1326, - "step": 4240 - }, - { - "epoch": 2.7203335471456063, - "grad_norm": 1.2358464002609253, - "learning_rate": 4.671094921791301e-06, - "loss": 0.1473, - "step": 4241 - }, - { - "epoch": 2.7209749839640796, - "grad_norm": 0.9967305064201355, - "learning_rate": 4.660381401328476e-06, - "loss": 0.1324, - "step": 4242 - }, - { - "epoch": 2.721616420782553, - "grad_norm": 0.9614265561103821, - "learning_rate": 4.6496678808656525e-06, - "loss": 0.1237, - "step": 4243 - }, - { - "epoch": 2.7222578576010266, - "grad_norm": 1.1527125835418701, - "learning_rate": 4.638954360402829e-06, - "loss": 0.1286, - "step": 4244 - }, - { - "epoch": 2.7228992944194994, - "grad_norm": 1.5239276885986328, - "learning_rate": 4.628240839940005e-06, - "loss": 0.1425, - "step": 4245 - }, - { - "epoch": 2.723540731237973, - "grad_norm": 1.1555993556976318, - "learning_rate": 4.61752731947718e-06, - "loss": 0.1308, - "step": 4246 - }, - { - "epoch": 2.7241821680564464, - "grad_norm": 1.5657410621643066, - "learning_rate": 4.606813799014356e-06, - "loss": 0.1306, - "step": 4247 - }, - { - "epoch": 2.7248236048749197, - "grad_norm": 1.368590235710144, - "learning_rate": 4.596100278551532e-06, - "loss": 0.1408, - "step": 4248 - }, - { - "epoch": 2.7254650416933934, - "grad_norm": 1.1355634927749634, - "learning_rate": 4.585386758088709e-06, - "loss": 0.1299, - "step": 4249 - }, - { - "epoch": 2.7261064785118667, - "grad_norm": 1.562677025794983, - "learning_rate": 4.574673237625884e-06, - "loss": 0.1467, - "step": 4250 - }, - { - "epoch": 2.72674791533034, - "grad_norm": 1.4175142049789429, - "learning_rate": 4.56395971716306e-06, - "loss": 0.1342, - "step": 4251 - }, - { - "epoch": 2.7273893521488133, - "grad_norm": 1.0742725133895874, - "learning_rate": 4.5532461967002355e-06, - "loss": 0.1326, - "step": 4252 - }, - { - "epoch": 2.7280307889672866, - "grad_norm": 0.8949272036552429, - "learning_rate": 4.542532676237412e-06, - "loss": 0.1225, - "step": 4253 - }, - { - "epoch": 2.7286722257857603, - "grad_norm": 1.1600220203399658, - "learning_rate": 4.5318191557745884e-06, - "loss": 0.1251, - "step": 4254 - }, - { - "epoch": 2.7293136626042336, - "grad_norm": 1.106108546257019, - "learning_rate": 4.521105635311763e-06, - "loss": 0.1277, - "step": 4255 - }, - { - "epoch": 2.729955099422707, - "grad_norm": 1.659186601638794, - "learning_rate": 4.51039211484894e-06, - "loss": 0.1518, - "step": 4256 - }, - { - "epoch": 2.73059653624118, - "grad_norm": 1.1915061473846436, - "learning_rate": 4.499678594386115e-06, - "loss": 0.1389, - "step": 4257 - }, - { - "epoch": 2.7312379730596534, - "grad_norm": 1.1862058639526367, - "learning_rate": 4.488965073923292e-06, - "loss": 0.1282, - "step": 4258 - }, - { - "epoch": 2.731879409878127, - "grad_norm": 1.2917028665542603, - "learning_rate": 4.478251553460467e-06, - "loss": 0.1384, - "step": 4259 - }, - { - "epoch": 2.7325208466966004, - "grad_norm": 1.5309444665908813, - "learning_rate": 4.467538032997643e-06, - "loss": 0.1308, - "step": 4260 - }, - { - "epoch": 2.7331622835150737, - "grad_norm": 1.1706370115280151, - "learning_rate": 4.456824512534819e-06, - "loss": 0.1307, - "step": 4261 - }, - { - "epoch": 2.733803720333547, - "grad_norm": 1.3465427160263062, - "learning_rate": 4.446110992071995e-06, - "loss": 0.1282, - "step": 4262 - }, - { - "epoch": 2.7344451571520203, - "grad_norm": 1.231058120727539, - "learning_rate": 4.4353974716091714e-06, - "loss": 0.128, - "step": 4263 - }, - { - "epoch": 2.735086593970494, - "grad_norm": 1.1562108993530273, - "learning_rate": 4.424683951146347e-06, - "loss": 0.1289, - "step": 4264 - }, - { - "epoch": 2.7357280307889673, - "grad_norm": 1.0437183380126953, - "learning_rate": 4.413970430683523e-06, - "loss": 0.1362, - "step": 4265 - }, - { - "epoch": 2.7363694676074406, - "grad_norm": 1.1280593872070312, - "learning_rate": 4.403256910220699e-06, - "loss": 0.1237, - "step": 4266 - }, - { - "epoch": 2.737010904425914, - "grad_norm": 1.3383618593215942, - "learning_rate": 4.392543389757875e-06, - "loss": 0.131, - "step": 4267 - }, - { - "epoch": 2.737652341244387, - "grad_norm": 1.2962918281555176, - "learning_rate": 4.38182986929505e-06, - "loss": 0.1349, - "step": 4268 - }, - { - "epoch": 2.738293778062861, - "grad_norm": 1.099725365638733, - "learning_rate": 4.371116348832227e-06, - "loss": 0.1276, - "step": 4269 - }, - { - "epoch": 2.738935214881334, - "grad_norm": 1.2785247564315796, - "learning_rate": 4.360402828369402e-06, - "loss": 0.139, - "step": 4270 - }, - { - "epoch": 2.7395766516998075, - "grad_norm": 1.2246930599212646, - "learning_rate": 4.349689307906579e-06, - "loss": 0.1285, - "step": 4271 - }, - { - "epoch": 2.740218088518281, - "grad_norm": 1.150823950767517, - "learning_rate": 4.338975787443754e-06, - "loss": 0.1307, - "step": 4272 - }, - { - "epoch": 2.7408595253367545, - "grad_norm": 1.4942538738250732, - "learning_rate": 4.32826226698093e-06, - "loss": 0.1335, - "step": 4273 - }, - { - "epoch": 2.7415009621552278, - "grad_norm": 0.9230483770370483, - "learning_rate": 4.3175487465181065e-06, - "loss": 0.1201, - "step": 4274 - }, - { - "epoch": 2.742142398973701, - "grad_norm": 0.8531761765480042, - "learning_rate": 4.306835226055282e-06, - "loss": 0.1183, - "step": 4275 - }, - { - "epoch": 2.7427838357921743, - "grad_norm": 1.6250059604644775, - "learning_rate": 4.296121705592458e-06, - "loss": 0.1577, - "step": 4276 - }, - { - "epoch": 2.743425272610648, - "grad_norm": 1.6235849857330322, - "learning_rate": 4.285408185129633e-06, - "loss": 0.144, - "step": 4277 - }, - { - "epoch": 2.7440667094291213, - "grad_norm": 1.3672842979431152, - "learning_rate": 4.27469466466681e-06, - "loss": 0.1494, - "step": 4278 - }, - { - "epoch": 2.7447081462475946, - "grad_norm": 1.4153058528900146, - "learning_rate": 4.263981144203985e-06, - "loss": 0.137, - "step": 4279 - }, - { - "epoch": 2.745349583066068, - "grad_norm": 1.3394471406936646, - "learning_rate": 4.253267623741162e-06, - "loss": 0.1399, - "step": 4280 - }, - { - "epoch": 2.745991019884541, - "grad_norm": 1.57339346408844, - "learning_rate": 4.2425541032783375e-06, - "loss": 0.1368, - "step": 4281 - }, - { - "epoch": 2.746632456703015, - "grad_norm": 1.1169493198394775, - "learning_rate": 4.231840582815513e-06, - "loss": 0.1293, - "step": 4282 - }, - { - "epoch": 2.747273893521488, - "grad_norm": 1.2639615535736084, - "learning_rate": 4.2211270623526895e-06, - "loss": 0.135, - "step": 4283 - }, - { - "epoch": 2.7479153303399615, - "grad_norm": 1.0980771780014038, - "learning_rate": 4.210413541889865e-06, - "loss": 0.1402, - "step": 4284 - }, - { - "epoch": 2.7485567671584348, - "grad_norm": 1.2380026578903198, - "learning_rate": 4.199700021427041e-06, - "loss": 0.134, - "step": 4285 - }, - { - "epoch": 2.749198203976908, - "grad_norm": 1.5089077949523926, - "learning_rate": 4.188986500964217e-06, - "loss": 0.1373, - "step": 4286 - }, - { - "epoch": 2.749839640795382, - "grad_norm": 1.0146492719650269, - "learning_rate": 4.178272980501393e-06, - "loss": 0.1231, - "step": 4287 - }, - { - "epoch": 2.750481077613855, - "grad_norm": 1.2834452390670776, - "learning_rate": 4.167559460038569e-06, - "loss": 0.1425, - "step": 4288 - }, - { - "epoch": 2.7511225144323284, - "grad_norm": 1.2542961835861206, - "learning_rate": 4.156845939575745e-06, - "loss": 0.1388, - "step": 4289 - }, - { - "epoch": 2.7517639512508016, - "grad_norm": 1.1534992456436157, - "learning_rate": 4.1461324191129205e-06, - "loss": 0.1325, - "step": 4290 - }, - { - "epoch": 2.752405388069275, - "grad_norm": 1.173378825187683, - "learning_rate": 4.135418898650097e-06, - "loss": 0.1362, - "step": 4291 - }, - { - "epoch": 2.7530468248877487, - "grad_norm": 1.2111116647720337, - "learning_rate": 4.1247053781872725e-06, - "loss": 0.141, - "step": 4292 - }, - { - "epoch": 2.753688261706222, - "grad_norm": 1.0912975072860718, - "learning_rate": 4.113991857724449e-06, - "loss": 0.1287, - "step": 4293 - }, - { - "epoch": 2.7543296985246952, - "grad_norm": 0.9688672423362732, - "learning_rate": 4.103278337261624e-06, - "loss": 0.1229, - "step": 4294 - }, - { - "epoch": 2.754971135343169, - "grad_norm": 1.4467251300811768, - "learning_rate": 4.0925648167988e-06, - "loss": 0.1319, - "step": 4295 - }, - { - "epoch": 2.7556125721616422, - "grad_norm": 1.2722941637039185, - "learning_rate": 4.081851296335977e-06, - "loss": 0.1281, - "step": 4296 - }, - { - "epoch": 2.7562540089801155, - "grad_norm": 0.9581276178359985, - "learning_rate": 4.071137775873152e-06, - "loss": 0.1317, - "step": 4297 - }, - { - "epoch": 2.756895445798589, - "grad_norm": 1.629819631576538, - "learning_rate": 4.060424255410328e-06, - "loss": 0.1571, - "step": 4298 - }, - { - "epoch": 2.757536882617062, - "grad_norm": 1.0112046003341675, - "learning_rate": 4.0497107349475035e-06, - "loss": 0.1314, - "step": 4299 - }, - { - "epoch": 2.758178319435536, - "grad_norm": 1.3808242082595825, - "learning_rate": 4.03899721448468e-06, - "loss": 0.1347, - "step": 4300 - }, - { - "epoch": 2.758819756254009, - "grad_norm": 1.3941524028778076, - "learning_rate": 4.028283694021856e-06, - "loss": 0.1548, - "step": 4301 - }, - { - "epoch": 2.7594611930724824, - "grad_norm": 1.3253757953643799, - "learning_rate": 4.017570173559031e-06, - "loss": 0.1347, - "step": 4302 - }, - { - "epoch": 2.7601026298909557, - "grad_norm": 1.4173271656036377, - "learning_rate": 4.006856653096208e-06, - "loss": 0.1397, - "step": 4303 - }, - { - "epoch": 2.760744066709429, - "grad_norm": 1.0807398557662964, - "learning_rate": 3.996143132633383e-06, - "loss": 0.135, - "step": 4304 - }, - { - "epoch": 2.7613855035279027, - "grad_norm": 1.1685914993286133, - "learning_rate": 3.98542961217056e-06, - "loss": 0.1248, - "step": 4305 - }, - { - "epoch": 2.762026940346376, - "grad_norm": 1.011714220046997, - "learning_rate": 3.974716091707735e-06, - "loss": 0.1221, - "step": 4306 - }, - { - "epoch": 2.7626683771648493, - "grad_norm": 0.946953535079956, - "learning_rate": 3.964002571244911e-06, - "loss": 0.1215, - "step": 4307 - }, - { - "epoch": 2.7633098139833225, - "grad_norm": 1.1117364168167114, - "learning_rate": 3.953289050782087e-06, - "loss": 0.1206, - "step": 4308 - }, - { - "epoch": 2.763951250801796, - "grad_norm": 0.8780766129493713, - "learning_rate": 3.942575530319263e-06, - "loss": 0.1252, - "step": 4309 - }, - { - "epoch": 2.7645926876202696, - "grad_norm": 0.8975713849067688, - "learning_rate": 3.931862009856439e-06, - "loss": 0.1138, - "step": 4310 - }, - { - "epoch": 2.765234124438743, - "grad_norm": 0.9827861189842224, - "learning_rate": 3.921148489393615e-06, - "loss": 0.1332, - "step": 4311 - }, - { - "epoch": 2.765875561257216, - "grad_norm": 0.9619331359863281, - "learning_rate": 3.910434968930791e-06, - "loss": 0.1235, - "step": 4312 - }, - { - "epoch": 2.7665169980756894, - "grad_norm": 1.5095974206924438, - "learning_rate": 3.899721448467967e-06, - "loss": 0.1383, - "step": 4313 - }, - { - "epoch": 2.7671584348941627, - "grad_norm": 1.0893372297286987, - "learning_rate": 3.889007928005143e-06, - "loss": 0.1278, - "step": 4314 - }, - { - "epoch": 2.7677998717126364, - "grad_norm": 0.8953987956047058, - "learning_rate": 3.878294407542318e-06, - "loss": 0.1334, - "step": 4315 - }, - { - "epoch": 2.7684413085311097, - "grad_norm": 1.0814604759216309, - "learning_rate": 3.867580887079495e-06, - "loss": 0.136, - "step": 4316 - }, - { - "epoch": 2.769082745349583, - "grad_norm": 1.4246385097503662, - "learning_rate": 3.85686736661667e-06, - "loss": 0.1448, - "step": 4317 - }, - { - "epoch": 2.7697241821680567, - "grad_norm": 1.128960132598877, - "learning_rate": 3.846153846153847e-06, - "loss": 0.1232, - "step": 4318 - }, - { - "epoch": 2.7703656189865296, - "grad_norm": 1.702210783958435, - "learning_rate": 3.8354403256910216e-06, - "loss": 0.1623, - "step": 4319 - }, - { - "epoch": 2.7710070558050033, - "grad_norm": 0.9334565997123718, - "learning_rate": 3.824726805228198e-06, - "loss": 0.1264, - "step": 4320 - }, - { - "epoch": 2.7716484926234766, - "grad_norm": 1.377364158630371, - "learning_rate": 3.814013284765374e-06, - "loss": 0.1437, - "step": 4321 - }, - { - "epoch": 2.77228992944195, - "grad_norm": 0.9271858930587769, - "learning_rate": 3.80329976430255e-06, - "loss": 0.1256, - "step": 4322 - }, - { - "epoch": 2.7729313662604236, - "grad_norm": 0.9876362681388855, - "learning_rate": 3.792586243839726e-06, - "loss": 0.1315, - "step": 4323 - }, - { - "epoch": 2.773572803078897, - "grad_norm": 0.7405959963798523, - "learning_rate": 3.7818727233769017e-06, - "loss": 0.1173, - "step": 4324 - }, - { - "epoch": 2.77421423989737, - "grad_norm": 1.4258713722229004, - "learning_rate": 3.7711592029140777e-06, - "loss": 0.1418, - "step": 4325 - }, - { - "epoch": 2.7748556767158434, - "grad_norm": 1.5111737251281738, - "learning_rate": 3.7604456824512538e-06, - "loss": 0.1447, - "step": 4326 - }, - { - "epoch": 2.7754971135343167, - "grad_norm": 1.7595278024673462, - "learning_rate": 3.74973216198843e-06, - "loss": 0.1398, - "step": 4327 - }, - { - "epoch": 2.7761385503527904, - "grad_norm": 1.1146472692489624, - "learning_rate": 3.7390186415256054e-06, - "loss": 0.1266, - "step": 4328 - }, - { - "epoch": 2.7767799871712637, - "grad_norm": 1.3222018480300903, - "learning_rate": 3.7283051210627814e-06, - "loss": 0.1396, - "step": 4329 - }, - { - "epoch": 2.777421423989737, - "grad_norm": 1.2048861980438232, - "learning_rate": 3.7175916005999575e-06, - "loss": 0.1256, - "step": 4330 - }, - { - "epoch": 2.7780628608082103, - "grad_norm": 1.1586028337478638, - "learning_rate": 3.7068780801371335e-06, - "loss": 0.1344, - "step": 4331 - }, - { - "epoch": 2.7787042976266836, - "grad_norm": 1.2817738056182861, - "learning_rate": 3.6961645596743087e-06, - "loss": 0.1284, - "step": 4332 - }, - { - "epoch": 2.7793457344451573, - "grad_norm": 0.8288444876670837, - "learning_rate": 3.685451039211485e-06, - "loss": 0.116, - "step": 4333 - }, - { - "epoch": 2.7799871712636306, - "grad_norm": 1.5701156854629517, - "learning_rate": 3.674737518748661e-06, - "loss": 0.1334, - "step": 4334 - }, - { - "epoch": 2.780628608082104, - "grad_norm": 1.364518404006958, - "learning_rate": 3.664023998285837e-06, - "loss": 0.139, - "step": 4335 - }, - { - "epoch": 2.781270044900577, - "grad_norm": 1.3057219982147217, - "learning_rate": 3.6533104778230124e-06, - "loss": 0.138, - "step": 4336 - }, - { - "epoch": 2.7819114817190504, - "grad_norm": 1.679723858833313, - "learning_rate": 3.6425969573601884e-06, - "loss": 0.1511, - "step": 4337 - }, - { - "epoch": 2.782552918537524, - "grad_norm": 1.0120819807052612, - "learning_rate": 3.6318834368973644e-06, - "loss": 0.1249, - "step": 4338 - }, - { - "epoch": 2.7831943553559975, - "grad_norm": 0.9880580306053162, - "learning_rate": 3.621169916434541e-06, - "loss": 0.1256, - "step": 4339 - }, - { - "epoch": 2.7838357921744707, - "grad_norm": 1.2408839464187622, - "learning_rate": 3.610456395971717e-06, - "loss": 0.1288, - "step": 4340 - }, - { - "epoch": 2.784477228992944, - "grad_norm": 1.2296210527420044, - "learning_rate": 3.599742875508892e-06, - "loss": 0.1342, - "step": 4341 - }, - { - "epoch": 2.7851186658114173, - "grad_norm": 1.144499659538269, - "learning_rate": 3.589029355046068e-06, - "loss": 0.1219, - "step": 4342 - }, - { - "epoch": 2.785760102629891, - "grad_norm": 0.8495026230812073, - "learning_rate": 3.578315834583244e-06, - "loss": 0.1212, - "step": 4343 - }, - { - "epoch": 2.7864015394483643, - "grad_norm": 1.2699843645095825, - "learning_rate": 3.5676023141204202e-06, - "loss": 0.1396, - "step": 4344 - }, - { - "epoch": 2.7870429762668376, - "grad_norm": 1.8092314004898071, - "learning_rate": 3.556888793657596e-06, - "loss": 0.1458, - "step": 4345 - }, - { - "epoch": 2.7876844130853113, - "grad_norm": 1.0445226430892944, - "learning_rate": 3.546175273194772e-06, - "loss": 0.1241, - "step": 4346 - }, - { - "epoch": 2.7883258499037846, - "grad_norm": 1.2564557790756226, - "learning_rate": 3.535461752731948e-06, - "loss": 0.1387, - "step": 4347 - }, - { - "epoch": 2.788967286722258, - "grad_norm": 0.9572578072547913, - "learning_rate": 3.524748232269124e-06, - "loss": 0.1157, - "step": 4348 - }, - { - "epoch": 2.789608723540731, - "grad_norm": 1.3354382514953613, - "learning_rate": 3.5140347118062995e-06, - "loss": 0.1427, - "step": 4349 - }, - { - "epoch": 2.7902501603592045, - "grad_norm": 0.9858716726303101, - "learning_rate": 3.5033211913434756e-06, - "loss": 0.1229, - "step": 4350 - }, - { - "epoch": 2.790891597177678, - "grad_norm": 1.4335763454437256, - "learning_rate": 3.4926076708806516e-06, - "loss": 0.1359, - "step": 4351 - }, - { - "epoch": 2.7915330339961515, - "grad_norm": 0.9576603174209595, - "learning_rate": 3.4818941504178276e-06, - "loss": 0.1231, - "step": 4352 - }, - { - "epoch": 2.7921744708146248, - "grad_norm": 1.7171066999435425, - "learning_rate": 3.4711806299550036e-06, - "loss": 0.1566, - "step": 4353 - }, - { - "epoch": 2.792815907633098, - "grad_norm": 1.3940448760986328, - "learning_rate": 3.4604671094921793e-06, - "loss": 0.1409, - "step": 4354 - }, - { - "epoch": 2.7934573444515713, - "grad_norm": 0.9646666646003723, - "learning_rate": 3.4497535890293553e-06, - "loss": 0.1194, - "step": 4355 - }, - { - "epoch": 2.794098781270045, - "grad_norm": 1.0787204504013062, - "learning_rate": 3.4390400685665313e-06, - "loss": 0.1233, - "step": 4356 - }, - { - "epoch": 2.7947402180885184, - "grad_norm": 1.34064781665802, - "learning_rate": 3.4283265481037073e-06, - "loss": 0.1455, - "step": 4357 - }, - { - "epoch": 2.7953816549069916, - "grad_norm": 1.0633896589279175, - "learning_rate": 3.4176130276408825e-06, - "loss": 0.1193, - "step": 4358 - }, - { - "epoch": 2.796023091725465, - "grad_norm": 1.7802568674087524, - "learning_rate": 3.406899507178059e-06, - "loss": 0.1432, - "step": 4359 - }, - { - "epoch": 2.796664528543938, - "grad_norm": 0.9137521982192993, - "learning_rate": 3.396185986715235e-06, - "loss": 0.1262, - "step": 4360 - }, - { - "epoch": 2.797305965362412, - "grad_norm": 1.1956359148025513, - "learning_rate": 3.385472466252411e-06, - "loss": 0.1327, - "step": 4361 - }, - { - "epoch": 2.7979474021808852, - "grad_norm": 1.4577420949935913, - "learning_rate": 3.3747589457895862e-06, - "loss": 0.1303, - "step": 4362 - }, - { - "epoch": 2.7985888389993585, - "grad_norm": 1.464218020439148, - "learning_rate": 3.3640454253267623e-06, - "loss": 0.1364, - "step": 4363 - }, - { - "epoch": 2.799230275817832, - "grad_norm": 1.4455844163894653, - "learning_rate": 3.3533319048639383e-06, - "loss": 0.1578, - "step": 4364 - }, - { - "epoch": 2.799871712636305, - "grad_norm": 1.9057462215423584, - "learning_rate": 3.3426183844011147e-06, - "loss": 0.1553, - "step": 4365 - }, - { - "epoch": 2.800513149454779, - "grad_norm": 1.400762915611267, - "learning_rate": 3.3319048639382908e-06, - "loss": 0.1408, - "step": 4366 - }, - { - "epoch": 2.801154586273252, - "grad_norm": 1.3658690452575684, - "learning_rate": 3.321191343475466e-06, - "loss": 0.1367, - "step": 4367 - }, - { - "epoch": 2.8017960230917254, - "grad_norm": 1.518271565437317, - "learning_rate": 3.310477823012642e-06, - "loss": 0.1219, - "step": 4368 - }, - { - "epoch": 2.802437459910199, - "grad_norm": 1.0993908643722534, - "learning_rate": 3.299764302549818e-06, - "loss": 0.1395, - "step": 4369 - }, - { - "epoch": 2.8030788967286724, - "grad_norm": 1.1354520320892334, - "learning_rate": 3.289050782086994e-06, - "loss": 0.1318, - "step": 4370 - }, - { - "epoch": 2.8037203335471457, - "grad_norm": 1.2776004076004028, - "learning_rate": 3.2783372616241697e-06, - "loss": 0.1435, - "step": 4371 - }, - { - "epoch": 2.804361770365619, - "grad_norm": 1.5173038244247437, - "learning_rate": 3.2676237411613457e-06, - "loss": 0.1496, - "step": 4372 - }, - { - "epoch": 2.8050032071840922, - "grad_norm": 1.7331264019012451, - "learning_rate": 3.2569102206985217e-06, - "loss": 0.1448, - "step": 4373 - }, - { - "epoch": 2.805644644002566, - "grad_norm": 1.2074569463729858, - "learning_rate": 3.2461967002356978e-06, - "loss": 0.1229, - "step": 4374 - }, - { - "epoch": 2.8062860808210393, - "grad_norm": 1.1955205202102661, - "learning_rate": 3.2354831797728734e-06, - "loss": 0.129, - "step": 4375 - }, - { - "epoch": 2.8069275176395125, - "grad_norm": 1.10960853099823, - "learning_rate": 3.2247696593100494e-06, - "loss": 0.124, - "step": 4376 - }, - { - "epoch": 2.807568954457986, - "grad_norm": 1.3659067153930664, - "learning_rate": 3.2140561388472254e-06, - "loss": 0.1518, - "step": 4377 - }, - { - "epoch": 2.808210391276459, - "grad_norm": 1.7306898832321167, - "learning_rate": 3.2033426183844015e-06, - "loss": 0.1353, - "step": 4378 - }, - { - "epoch": 2.808851828094933, - "grad_norm": 1.6284470558166504, - "learning_rate": 3.192629097921577e-06, - "loss": 0.1354, - "step": 4379 - }, - { - "epoch": 2.809493264913406, - "grad_norm": 1.3417491912841797, - "learning_rate": 3.181915577458753e-06, - "loss": 0.1444, - "step": 4380 - }, - { - "epoch": 2.8101347017318794, - "grad_norm": 1.149211049079895, - "learning_rate": 3.171202056995929e-06, - "loss": 0.1215, - "step": 4381 - }, - { - "epoch": 2.8107761385503527, - "grad_norm": 0.8365505337715149, - "learning_rate": 3.160488536533105e-06, - "loss": 0.1201, - "step": 4382 - }, - { - "epoch": 2.811417575368826, - "grad_norm": 1.7079298496246338, - "learning_rate": 3.149775016070281e-06, - "loss": 0.1545, - "step": 4383 - }, - { - "epoch": 2.8120590121872997, - "grad_norm": 1.0981484651565552, - "learning_rate": 3.1390614956074564e-06, - "loss": 0.1242, - "step": 4384 - }, - { - "epoch": 2.812700449005773, - "grad_norm": 0.9725930690765381, - "learning_rate": 3.128347975144633e-06, - "loss": 0.1291, - "step": 4385 - }, - { - "epoch": 2.8133418858242463, - "grad_norm": 1.2957351207733154, - "learning_rate": 3.117634454681809e-06, - "loss": 0.1327, - "step": 4386 - }, - { - "epoch": 2.8139833226427196, - "grad_norm": 1.292046308517456, - "learning_rate": 3.1069209342189845e-06, - "loss": 0.1418, - "step": 4387 - }, - { - "epoch": 2.814624759461193, - "grad_norm": 1.5387675762176514, - "learning_rate": 3.0962074137561605e-06, - "loss": 0.1403, - "step": 4388 - }, - { - "epoch": 2.8152661962796666, - "grad_norm": 1.248110055923462, - "learning_rate": 3.085493893293336e-06, - "loss": 0.1326, - "step": 4389 - }, - { - "epoch": 2.81590763309814, - "grad_norm": 1.7517679929733276, - "learning_rate": 3.074780372830512e-06, - "loss": 0.1378, - "step": 4390 - }, - { - "epoch": 2.816549069916613, - "grad_norm": 1.1825059652328491, - "learning_rate": 3.064066852367688e-06, - "loss": 0.1332, - "step": 4391 - }, - { - "epoch": 2.817190506735087, - "grad_norm": 0.925990879535675, - "learning_rate": 3.053353331904864e-06, - "loss": 0.1248, - "step": 4392 - }, - { - "epoch": 2.8178319435535597, - "grad_norm": 1.1957621574401855, - "learning_rate": 3.04263981144204e-06, - "loss": 0.1287, - "step": 4393 - }, - { - "epoch": 2.8184733803720334, - "grad_norm": 1.0111134052276611, - "learning_rate": 3.031926290979216e-06, - "loss": 0.1253, - "step": 4394 - }, - { - "epoch": 2.8191148171905067, - "grad_norm": 0.9883546829223633, - "learning_rate": 3.021212770516392e-06, - "loss": 0.1216, - "step": 4395 - }, - { - "epoch": 2.81975625400898, - "grad_norm": 0.9870839715003967, - "learning_rate": 3.010499250053568e-06, - "loss": 0.1268, - "step": 4396 - }, - { - "epoch": 2.8203976908274537, - "grad_norm": 1.503152847290039, - "learning_rate": 2.999785729590744e-06, - "loss": 0.1472, - "step": 4397 - }, - { - "epoch": 2.821039127645927, - "grad_norm": 1.0527710914611816, - "learning_rate": 2.9890722091279195e-06, - "loss": 0.1315, - "step": 4398 - }, - { - "epoch": 2.8216805644644003, - "grad_norm": 1.0890986919403076, - "learning_rate": 2.9783586886650956e-06, - "loss": 0.1237, - "step": 4399 - }, - { - "epoch": 2.8223220012828736, - "grad_norm": 2.1050868034362793, - "learning_rate": 2.967645168202271e-06, - "loss": 0.1739, - "step": 4400 - }, - { - "epoch": 2.822963438101347, - "grad_norm": 1.4314026832580566, - "learning_rate": 2.9569316477394476e-06, - "loss": 0.1343, - "step": 4401 - }, - { - "epoch": 2.8236048749198206, - "grad_norm": 1.0624914169311523, - "learning_rate": 2.9462181272766232e-06, - "loss": 0.1283, - "step": 4402 - }, - { - "epoch": 2.824246311738294, - "grad_norm": 1.351535677909851, - "learning_rate": 2.9355046068137993e-06, - "loss": 0.121, - "step": 4403 - }, - { - "epoch": 2.824887748556767, - "grad_norm": 1.1686617136001587, - "learning_rate": 2.924791086350975e-06, - "loss": 0.1381, - "step": 4404 - }, - { - "epoch": 2.8255291853752404, - "grad_norm": 1.2124900817871094, - "learning_rate": 2.914077565888151e-06, - "loss": 0.1207, - "step": 4405 - }, - { - "epoch": 2.8261706221937137, - "grad_norm": 1.2384693622589111, - "learning_rate": 2.903364045425327e-06, - "loss": 0.1242, - "step": 4406 - }, - { - "epoch": 2.8268120590121875, - "grad_norm": 0.968416690826416, - "learning_rate": 2.892650524962503e-06, - "loss": 0.1338, - "step": 4407 - }, - { - "epoch": 2.8274534958306607, - "grad_norm": 0.9625255465507507, - "learning_rate": 2.8819370044996786e-06, - "loss": 0.1246, - "step": 4408 - }, - { - "epoch": 2.828094932649134, - "grad_norm": 1.0401997566223145, - "learning_rate": 2.8712234840368546e-06, - "loss": 0.1265, - "step": 4409 - }, - { - "epoch": 2.8287363694676073, - "grad_norm": 1.299531102180481, - "learning_rate": 2.8605099635740302e-06, - "loss": 0.1378, - "step": 4410 - }, - { - "epoch": 2.8293778062860806, - "grad_norm": 1.5212783813476562, - "learning_rate": 2.8497964431112067e-06, - "loss": 0.1358, - "step": 4411 - }, - { - "epoch": 2.8300192431045543, - "grad_norm": 1.2595406770706177, - "learning_rate": 2.8390829226483827e-06, - "loss": 0.1348, - "step": 4412 - }, - { - "epoch": 2.8306606799230276, - "grad_norm": 0.98666912317276, - "learning_rate": 2.8283694021855583e-06, - "loss": 0.1264, - "step": 4413 - }, - { - "epoch": 2.831302116741501, - "grad_norm": 1.3711464405059814, - "learning_rate": 2.8176558817227343e-06, - "loss": 0.1241, - "step": 4414 - }, - { - "epoch": 2.831943553559974, - "grad_norm": 1.117918610572815, - "learning_rate": 2.80694236125991e-06, - "loss": 0.1262, - "step": 4415 - }, - { - "epoch": 2.8325849903784475, - "grad_norm": 1.1738147735595703, - "learning_rate": 2.796228840797086e-06, - "loss": 0.1243, - "step": 4416 - }, - { - "epoch": 2.833226427196921, - "grad_norm": 1.3092280626296997, - "learning_rate": 2.785515320334262e-06, - "loss": 0.1409, - "step": 4417 - }, - { - "epoch": 2.8338678640153945, - "grad_norm": 1.0338411331176758, - "learning_rate": 2.774801799871438e-06, - "loss": 0.1343, - "step": 4418 - }, - { - "epoch": 2.8345093008338678, - "grad_norm": 1.5222264528274536, - "learning_rate": 2.7640882794086136e-06, - "loss": 0.1592, - "step": 4419 - }, - { - "epoch": 2.8351507376523415, - "grad_norm": 1.7683537006378174, - "learning_rate": 2.7533747589457897e-06, - "loss": 0.142, - "step": 4420 - }, - { - "epoch": 2.8357921744708148, - "grad_norm": 0.9379843473434448, - "learning_rate": 2.7426612384829657e-06, - "loss": 0.1193, - "step": 4421 - }, - { - "epoch": 2.836433611289288, - "grad_norm": 1.3698190450668335, - "learning_rate": 2.7319477180201417e-06, - "loss": 0.1363, - "step": 4422 - }, - { - "epoch": 2.8370750481077613, - "grad_norm": 1.1028403043746948, - "learning_rate": 2.7212341975573173e-06, - "loss": 0.1313, - "step": 4423 - }, - { - "epoch": 2.8377164849262346, - "grad_norm": 1.1790122985839844, - "learning_rate": 2.7105206770944934e-06, - "loss": 0.1257, - "step": 4424 - }, - { - "epoch": 2.8383579217447084, - "grad_norm": 1.2996846437454224, - "learning_rate": 2.699807156631669e-06, - "loss": 0.1394, - "step": 4425 - }, - { - "epoch": 2.8389993585631816, - "grad_norm": 1.5263227224349976, - "learning_rate": 2.689093636168845e-06, - "loss": 0.1395, - "step": 4426 - }, - { - "epoch": 2.839640795381655, - "grad_norm": 0.8989038467407227, - "learning_rate": 2.6783801157060215e-06, - "loss": 0.1205, - "step": 4427 - }, - { - "epoch": 2.840282232200128, - "grad_norm": 1.5629348754882812, - "learning_rate": 2.667666595243197e-06, - "loss": 0.1472, - "step": 4428 - }, - { - "epoch": 2.8409236690186015, - "grad_norm": 0.9738770127296448, - "learning_rate": 2.656953074780373e-06, - "loss": 0.1278, - "step": 4429 - }, - { - "epoch": 2.8415651058370752, - "grad_norm": 1.032665491104126, - "learning_rate": 2.6462395543175487e-06, - "loss": 0.1332, - "step": 4430 - }, - { - "epoch": 2.8422065426555485, - "grad_norm": 1.5645434856414795, - "learning_rate": 2.6355260338547247e-06, - "loss": 0.1391, - "step": 4431 - }, - { - "epoch": 2.842847979474022, - "grad_norm": 1.5612114667892456, - "learning_rate": 2.6248125133919008e-06, - "loss": 0.1385, - "step": 4432 - }, - { - "epoch": 2.843489416292495, - "grad_norm": 1.0871127843856812, - "learning_rate": 2.614098992929077e-06, - "loss": 0.1246, - "step": 4433 - }, - { - "epoch": 2.8441308531109684, - "grad_norm": 1.2899723052978516, - "learning_rate": 2.6033854724662524e-06, - "loss": 0.1343, - "step": 4434 - }, - { - "epoch": 2.844772289929442, - "grad_norm": 1.6598643064498901, - "learning_rate": 2.5926719520034284e-06, - "loss": 0.1496, - "step": 4435 - }, - { - "epoch": 2.8454137267479154, - "grad_norm": 0.9087107181549072, - "learning_rate": 2.581958431540604e-06, - "loss": 0.1143, - "step": 4436 - }, - { - "epoch": 2.8460551635663887, - "grad_norm": 1.0045782327651978, - "learning_rate": 2.5712449110777805e-06, - "loss": 0.1207, - "step": 4437 - }, - { - "epoch": 2.846696600384862, - "grad_norm": 1.5333726406097412, - "learning_rate": 2.560531390614956e-06, - "loss": 0.14, - "step": 4438 - }, - { - "epoch": 2.8473380372033352, - "grad_norm": 1.4856467247009277, - "learning_rate": 2.549817870152132e-06, - "loss": 0.1365, - "step": 4439 - }, - { - "epoch": 2.847979474021809, - "grad_norm": 0.6957578063011169, - "learning_rate": 2.539104349689308e-06, - "loss": 0.1197, - "step": 4440 - }, - { - "epoch": 2.8486209108402822, - "grad_norm": 1.1104000806808472, - "learning_rate": 2.5283908292264838e-06, - "loss": 0.1343, - "step": 4441 - }, - { - "epoch": 2.8492623476587555, - "grad_norm": 1.0297331809997559, - "learning_rate": 2.51767730876366e-06, - "loss": 0.1199, - "step": 4442 - }, - { - "epoch": 2.8499037844772293, - "grad_norm": 1.1006883382797241, - "learning_rate": 2.506963788300836e-06, - "loss": 0.1275, - "step": 4443 - }, - { - "epoch": 2.8505452212957025, - "grad_norm": 1.105401635169983, - "learning_rate": 2.496250267838012e-06, - "loss": 0.1361, - "step": 4444 - }, - { - "epoch": 2.851186658114176, - "grad_norm": 0.9975040555000305, - "learning_rate": 2.4855367473751875e-06, - "loss": 0.1252, - "step": 4445 - }, - { - "epoch": 2.851828094932649, - "grad_norm": 1.0279446840286255, - "learning_rate": 2.4748232269123635e-06, - "loss": 0.1245, - "step": 4446 - }, - { - "epoch": 2.8524695317511224, - "grad_norm": 0.8490254878997803, - "learning_rate": 2.4641097064495395e-06, - "loss": 0.1221, - "step": 4447 - }, - { - "epoch": 2.853110968569596, - "grad_norm": 1.1841367483139038, - "learning_rate": 2.4533961859867156e-06, - "loss": 0.1337, - "step": 4448 - }, - { - "epoch": 2.8537524053880694, - "grad_norm": 1.5829566717147827, - "learning_rate": 2.442682665523891e-06, - "loss": 0.1384, - "step": 4449 - }, - { - "epoch": 2.8543938422065427, - "grad_norm": 1.5184950828552246, - "learning_rate": 2.4319691450610672e-06, - "loss": 0.143, - "step": 4450 - }, - { - "epoch": 2.855035279025016, - "grad_norm": 1.3436793088912964, - "learning_rate": 2.421255624598243e-06, - "loss": 0.1396, - "step": 4451 - }, - { - "epoch": 2.8556767158434893, - "grad_norm": 0.815739095211029, - "learning_rate": 2.410542104135419e-06, - "loss": 0.1208, - "step": 4452 - }, - { - "epoch": 2.856318152661963, - "grad_norm": 1.5976170301437378, - "learning_rate": 2.399828583672595e-06, - "loss": 0.1585, - "step": 4453 - }, - { - "epoch": 2.8569595894804363, - "grad_norm": 1.4598063230514526, - "learning_rate": 2.389115063209771e-06, - "loss": 0.1478, - "step": 4454 - }, - { - "epoch": 2.8576010262989096, - "grad_norm": 1.0840290784835815, - "learning_rate": 2.378401542746947e-06, - "loss": 0.1292, - "step": 4455 - }, - { - "epoch": 2.858242463117383, - "grad_norm": 1.2446163892745972, - "learning_rate": 2.3676880222841226e-06, - "loss": 0.127, - "step": 4456 - }, - { - "epoch": 2.858883899935856, - "grad_norm": 1.4926233291625977, - "learning_rate": 2.3569745018212986e-06, - "loss": 0.1431, - "step": 4457 - }, - { - "epoch": 2.85952533675433, - "grad_norm": 1.181796669960022, - "learning_rate": 2.3462609813584746e-06, - "loss": 0.1262, - "step": 4458 - }, - { - "epoch": 2.860166773572803, - "grad_norm": 1.0567635297775269, - "learning_rate": 2.3355474608956507e-06, - "loss": 0.1278, - "step": 4459 - }, - { - "epoch": 2.8608082103912764, - "grad_norm": 1.0530834197998047, - "learning_rate": 2.3248339404328263e-06, - "loss": 0.1236, - "step": 4460 - }, - { - "epoch": 2.8614496472097497, - "grad_norm": 1.3896763324737549, - "learning_rate": 2.3141204199700023e-06, - "loss": 0.1322, - "step": 4461 - }, - { - "epoch": 2.862091084028223, - "grad_norm": 1.418074131011963, - "learning_rate": 2.303406899507178e-06, - "loss": 0.1385, - "step": 4462 - }, - { - "epoch": 2.8627325208466967, - "grad_norm": 1.1563472747802734, - "learning_rate": 2.2926933790443544e-06, - "loss": 0.1275, - "step": 4463 - }, - { - "epoch": 2.86337395766517, - "grad_norm": 1.2014789581298828, - "learning_rate": 2.28197985858153e-06, - "loss": 0.1399, - "step": 4464 - }, - { - "epoch": 2.8640153944836433, - "grad_norm": 1.5486376285552979, - "learning_rate": 2.271266338118706e-06, - "loss": 0.1348, - "step": 4465 - }, - { - "epoch": 2.864656831302117, - "grad_norm": 0.9226186275482178, - "learning_rate": 2.2605528176558816e-06, - "loss": 0.1253, - "step": 4466 - }, - { - "epoch": 2.86529826812059, - "grad_norm": 1.4516050815582275, - "learning_rate": 2.2498392971930576e-06, - "loss": 0.1395, - "step": 4467 - }, - { - "epoch": 2.8659397049390636, - "grad_norm": 1.0023874044418335, - "learning_rate": 2.2391257767302337e-06, - "loss": 0.1281, - "step": 4468 - }, - { - "epoch": 2.866581141757537, - "grad_norm": 1.507888674736023, - "learning_rate": 2.2284122562674097e-06, - "loss": 0.1423, - "step": 4469 - }, - { - "epoch": 2.86722257857601, - "grad_norm": 0.8865677118301392, - "learning_rate": 2.2176987358045857e-06, - "loss": 0.1263, - "step": 4470 - }, - { - "epoch": 2.867864015394484, - "grad_norm": 1.4517685174942017, - "learning_rate": 2.2069852153417613e-06, - "loss": 0.1346, - "step": 4471 - }, - { - "epoch": 2.868505452212957, - "grad_norm": 1.46281898021698, - "learning_rate": 2.1962716948789374e-06, - "loss": 0.1312, - "step": 4472 - }, - { - "epoch": 2.8691468890314304, - "grad_norm": 1.453552484512329, - "learning_rate": 2.1855581744161134e-06, - "loss": 0.135, - "step": 4473 - }, - { - "epoch": 2.8697883258499037, - "grad_norm": 1.6297810077667236, - "learning_rate": 2.1748446539532894e-06, - "loss": 0.1621, - "step": 4474 - }, - { - "epoch": 2.870429762668377, - "grad_norm": 0.8455319404602051, - "learning_rate": 2.164131133490465e-06, - "loss": 0.1174, - "step": 4475 - }, - { - "epoch": 2.8710711994868507, - "grad_norm": 1.0756893157958984, - "learning_rate": 2.153417613027641e-06, - "loss": 0.1324, - "step": 4476 - }, - { - "epoch": 2.871712636305324, - "grad_norm": 0.9699214696884155, - "learning_rate": 2.1427040925648167e-06, - "loss": 0.1211, - "step": 4477 - }, - { - "epoch": 2.8723540731237973, - "grad_norm": 1.740543007850647, - "learning_rate": 2.1319905721019927e-06, - "loss": 0.1558, - "step": 4478 - }, - { - "epoch": 2.8729955099422706, - "grad_norm": 1.2637462615966797, - "learning_rate": 2.1212770516391687e-06, - "loss": 0.1284, - "step": 4479 - }, - { - "epoch": 2.873636946760744, - "grad_norm": 1.4595786333084106, - "learning_rate": 2.1105635311763448e-06, - "loss": 0.1431, - "step": 4480 - }, - { - "epoch": 2.8742783835792176, - "grad_norm": 1.495163917541504, - "learning_rate": 2.0998500107135204e-06, - "loss": 0.1358, - "step": 4481 - }, - { - "epoch": 2.874919820397691, - "grad_norm": 1.035021424293518, - "learning_rate": 2.0891364902506964e-06, - "loss": 0.1318, - "step": 4482 - }, - { - "epoch": 2.875561257216164, - "grad_norm": 1.0046144723892212, - "learning_rate": 2.0784229697878724e-06, - "loss": 0.1271, - "step": 4483 - }, - { - "epoch": 2.8762026940346375, - "grad_norm": 1.1880406141281128, - "learning_rate": 2.0677094493250485e-06, - "loss": 0.1233, - "step": 4484 - }, - { - "epoch": 2.8768441308531107, - "grad_norm": 1.2123830318450928, - "learning_rate": 2.0569959288622245e-06, - "loss": 0.1299, - "step": 4485 - }, - { - "epoch": 2.8774855676715845, - "grad_norm": 0.8656582832336426, - "learning_rate": 2.0462824083994e-06, - "loss": 0.1248, - "step": 4486 - }, - { - "epoch": 2.8781270044900578, - "grad_norm": 2.09670352935791, - "learning_rate": 2.035568887936576e-06, - "loss": 0.1656, - "step": 4487 - }, - { - "epoch": 2.878768441308531, - "grad_norm": 1.4183948040008545, - "learning_rate": 2.0248553674737517e-06, - "loss": 0.1421, - "step": 4488 - }, - { - "epoch": 2.8794098781270043, - "grad_norm": 1.818442463874817, - "learning_rate": 2.014141847010928e-06, - "loss": 0.1513, - "step": 4489 - }, - { - "epoch": 2.8800513149454776, - "grad_norm": 1.153100609779358, - "learning_rate": 2.003428326548104e-06, - "loss": 0.1252, - "step": 4490 - }, - { - "epoch": 2.8806927517639513, - "grad_norm": 1.3059747219085693, - "learning_rate": 1.99271480608528e-06, - "loss": 0.1353, - "step": 4491 - }, - { - "epoch": 2.8813341885824246, - "grad_norm": 1.4875307083129883, - "learning_rate": 1.9820012856224554e-06, - "loss": 0.1357, - "step": 4492 - }, - { - "epoch": 2.881975625400898, - "grad_norm": 0.9703956246376038, - "learning_rate": 1.9712877651596315e-06, - "loss": 0.1363, - "step": 4493 - }, - { - "epoch": 2.8826170622193716, - "grad_norm": 1.4519399404525757, - "learning_rate": 1.9605742446968075e-06, - "loss": 0.1422, - "step": 4494 - }, - { - "epoch": 2.883258499037845, - "grad_norm": 1.1731901168823242, - "learning_rate": 1.9498607242339835e-06, - "loss": 0.1266, - "step": 4495 - }, - { - "epoch": 2.883899935856318, - "grad_norm": 1.7622405290603638, - "learning_rate": 1.939147203771159e-06, - "loss": 0.1492, - "step": 4496 - }, - { - "epoch": 2.8845413726747915, - "grad_norm": 1.372599482536316, - "learning_rate": 1.928433683308335e-06, - "loss": 0.1362, - "step": 4497 - }, - { - "epoch": 2.8851828094932648, - "grad_norm": 1.032507300376892, - "learning_rate": 1.9177201628455108e-06, - "loss": 0.1289, - "step": 4498 - }, - { - "epoch": 2.8858242463117385, - "grad_norm": 1.435725450515747, - "learning_rate": 1.907006642382687e-06, - "loss": 0.1415, - "step": 4499 - }, - { - "epoch": 2.886465683130212, - "grad_norm": 1.67751944065094, - "learning_rate": 1.896293121919863e-06, - "loss": 0.1462, - "step": 4500 - }, - { - "epoch": 2.887107119948685, - "grad_norm": 1.1992768049240112, - "learning_rate": 1.8855796014570389e-06, - "loss": 0.1271, - "step": 4501 - }, - { - "epoch": 2.8877485567671584, - "grad_norm": 1.4545049667358398, - "learning_rate": 1.874866080994215e-06, - "loss": 0.1416, - "step": 4502 - }, - { - "epoch": 2.8883899935856316, - "grad_norm": 1.945939302444458, - "learning_rate": 1.8641525605313907e-06, - "loss": 0.1511, - "step": 4503 - }, - { - "epoch": 2.8890314304041054, - "grad_norm": 1.1056212186813354, - "learning_rate": 1.8534390400685668e-06, - "loss": 0.1273, - "step": 4504 - }, - { - "epoch": 2.8896728672225787, - "grad_norm": 0.897881805896759, - "learning_rate": 1.8427255196057426e-06, - "loss": 0.1238, - "step": 4505 - }, - { - "epoch": 2.890314304041052, - "grad_norm": 1.0898115634918213, - "learning_rate": 1.8320119991429186e-06, - "loss": 0.1406, - "step": 4506 - }, - { - "epoch": 2.8909557408595252, - "grad_norm": 1.4876495599746704, - "learning_rate": 1.8212984786800942e-06, - "loss": 0.1452, - "step": 4507 - }, - { - "epoch": 2.8915971776779985, - "grad_norm": 1.4291685819625854, - "learning_rate": 1.8105849582172705e-06, - "loss": 0.1386, - "step": 4508 - }, - { - "epoch": 2.8922386144964722, - "grad_norm": 0.8341783285140991, - "learning_rate": 1.799871437754446e-06, - "loss": 0.1192, - "step": 4509 - }, - { - "epoch": 2.8928800513149455, - "grad_norm": 1.4000965356826782, - "learning_rate": 1.789157917291622e-06, - "loss": 0.1371, - "step": 4510 - }, - { - "epoch": 2.893521488133419, - "grad_norm": 1.3946512937545776, - "learning_rate": 1.778444396828798e-06, - "loss": 0.1315, - "step": 4511 - }, - { - "epoch": 2.894162924951892, - "grad_norm": 1.45354163646698, - "learning_rate": 1.767730876365974e-06, - "loss": 0.1417, - "step": 4512 - }, - { - "epoch": 2.8948043617703654, - "grad_norm": 1.0212242603302002, - "learning_rate": 1.7570173559031498e-06, - "loss": 0.1316, - "step": 4513 - }, - { - "epoch": 2.895445798588839, - "grad_norm": 1.2535264492034912, - "learning_rate": 1.7463038354403258e-06, - "loss": 0.1277, - "step": 4514 - }, - { - "epoch": 2.8960872354073124, - "grad_norm": 1.3986079692840576, - "learning_rate": 1.7355903149775018e-06, - "loss": 0.1418, - "step": 4515 - }, - { - "epoch": 2.8967286722257857, - "grad_norm": 1.3020310401916504, - "learning_rate": 1.7248767945146776e-06, - "loss": 0.1364, - "step": 4516 - }, - { - "epoch": 2.8973701090442594, - "grad_norm": 1.2160974740982056, - "learning_rate": 1.7141632740518537e-06, - "loss": 0.1241, - "step": 4517 - }, - { - "epoch": 2.8980115458627327, - "grad_norm": 1.6396387815475464, - "learning_rate": 1.7034497535890295e-06, - "loss": 0.1526, - "step": 4518 - }, - { - "epoch": 2.898652982681206, - "grad_norm": 1.0801637172698975, - "learning_rate": 1.6927362331262055e-06, - "loss": 0.1246, - "step": 4519 - }, - { - "epoch": 2.8992944194996793, - "grad_norm": 1.176525592803955, - "learning_rate": 1.6820227126633811e-06, - "loss": 0.1292, - "step": 4520 - }, - { - "epoch": 2.8999358563181525, - "grad_norm": 1.0672718286514282, - "learning_rate": 1.6713091922005574e-06, - "loss": 0.1367, - "step": 4521 - }, - { - "epoch": 2.9005772931366263, - "grad_norm": 1.3511483669281006, - "learning_rate": 1.660595671737733e-06, - "loss": 0.1309, - "step": 4522 - }, - { - "epoch": 2.9012187299550996, - "grad_norm": 0.942084014415741, - "learning_rate": 1.649882151274909e-06, - "loss": 0.127, - "step": 4523 - }, - { - "epoch": 2.901860166773573, - "grad_norm": 1.1084758043289185, - "learning_rate": 1.6391686308120848e-06, - "loss": 0.1298, - "step": 4524 - }, - { - "epoch": 2.902501603592046, - "grad_norm": 1.5937352180480957, - "learning_rate": 1.6284551103492609e-06, - "loss": 0.1403, - "step": 4525 - }, - { - "epoch": 2.9031430404105194, - "grad_norm": 1.1207832098007202, - "learning_rate": 1.6177415898864367e-06, - "loss": 0.1222, - "step": 4526 - }, - { - "epoch": 2.903784477228993, - "grad_norm": 1.2837399244308472, - "learning_rate": 1.6070280694236127e-06, - "loss": 0.1284, - "step": 4527 - }, - { - "epoch": 2.9044259140474664, - "grad_norm": 1.553173542022705, - "learning_rate": 1.5963145489607885e-06, - "loss": 0.152, - "step": 4528 - }, - { - "epoch": 2.9050673508659397, - "grad_norm": 1.1273585557937622, - "learning_rate": 1.5856010284979646e-06, - "loss": 0.1247, - "step": 4529 - }, - { - "epoch": 2.905708787684413, - "grad_norm": 1.3256640434265137, - "learning_rate": 1.5748875080351406e-06, - "loss": 0.1323, - "step": 4530 - }, - { - "epoch": 2.9063502245028863, - "grad_norm": 1.3124734163284302, - "learning_rate": 1.5641739875723164e-06, - "loss": 0.1351, - "step": 4531 - }, - { - "epoch": 2.90699166132136, - "grad_norm": 1.2205262184143066, - "learning_rate": 1.5534604671094922e-06, - "loss": 0.1316, - "step": 4532 - }, - { - "epoch": 2.9076330981398333, - "grad_norm": 0.9711066484451294, - "learning_rate": 1.542746946646668e-06, - "loss": 0.1162, - "step": 4533 - }, - { - "epoch": 2.9082745349583066, - "grad_norm": 1.4648932218551636, - "learning_rate": 1.532033426183844e-06, - "loss": 0.1356, - "step": 4534 - }, - { - "epoch": 2.90891597177678, - "grad_norm": 1.306843638420105, - "learning_rate": 1.52131990572102e-06, - "loss": 0.1417, - "step": 4535 - }, - { - "epoch": 2.909557408595253, - "grad_norm": 1.4977298974990845, - "learning_rate": 1.510606385258196e-06, - "loss": 0.1404, - "step": 4536 - }, - { - "epoch": 2.910198845413727, - "grad_norm": 1.188600778579712, - "learning_rate": 1.499892864795372e-06, - "loss": 0.1199, - "step": 4537 - }, - { - "epoch": 2.9108402822322, - "grad_norm": 1.0094125270843506, - "learning_rate": 1.4891793443325478e-06, - "loss": 0.1203, - "step": 4538 - }, - { - "epoch": 2.9114817190506734, - "grad_norm": 1.1093536615371704, - "learning_rate": 1.4784658238697238e-06, - "loss": 0.1227, - "step": 4539 - }, - { - "epoch": 2.912123155869147, - "grad_norm": 1.4561131000518799, - "learning_rate": 1.4677523034068996e-06, - "loss": 0.1384, - "step": 4540 - }, - { - "epoch": 2.91276459268762, - "grad_norm": 1.3174751996994019, - "learning_rate": 1.4570387829440755e-06, - "loss": 0.1306, - "step": 4541 - }, - { - "epoch": 2.9134060295060937, - "grad_norm": 0.8180350065231323, - "learning_rate": 1.4463252624812515e-06, - "loss": 0.116, - "step": 4542 - }, - { - "epoch": 2.914047466324567, - "grad_norm": 0.9161573648452759, - "learning_rate": 1.4356117420184273e-06, - "loss": 0.1222, - "step": 4543 - }, - { - "epoch": 2.9146889031430403, - "grad_norm": 1.0703870058059692, - "learning_rate": 1.4248982215556033e-06, - "loss": 0.1204, - "step": 4544 - }, - { - "epoch": 2.915330339961514, - "grad_norm": 1.2637109756469727, - "learning_rate": 1.4141847010927792e-06, - "loss": 0.1518, - "step": 4545 - }, - { - "epoch": 2.9159717767799873, - "grad_norm": 1.2645198106765747, - "learning_rate": 1.403471180629955e-06, - "loss": 0.1299, - "step": 4546 - }, - { - "epoch": 2.9166132135984606, - "grad_norm": 0.9889361262321472, - "learning_rate": 1.392757660167131e-06, - "loss": 0.1318, - "step": 4547 - }, - { - "epoch": 2.917254650416934, - "grad_norm": 0.7724899649620056, - "learning_rate": 1.3820441397043068e-06, - "loss": 0.1191, - "step": 4548 - }, - { - "epoch": 2.917896087235407, - "grad_norm": 1.3524690866470337, - "learning_rate": 1.3713306192414829e-06, - "loss": 0.1334, - "step": 4549 - }, - { - "epoch": 2.918537524053881, - "grad_norm": 1.1170871257781982, - "learning_rate": 1.3606170987786587e-06, - "loss": 0.1357, - "step": 4550 - }, - { - "epoch": 2.919178960872354, - "grad_norm": 1.9386112689971924, - "learning_rate": 1.3499035783158345e-06, - "loss": 0.1552, - "step": 4551 - }, - { - "epoch": 2.9198203976908275, - "grad_norm": 1.3018057346343994, - "learning_rate": 1.3391900578530107e-06, - "loss": 0.1347, - "step": 4552 - }, - { - "epoch": 2.9204618345093007, - "grad_norm": 1.2639812231063843, - "learning_rate": 1.3284765373901866e-06, - "loss": 0.1321, - "step": 4553 - }, - { - "epoch": 2.921103271327774, - "grad_norm": 1.316188097000122, - "learning_rate": 1.3177630169273624e-06, - "loss": 0.1368, - "step": 4554 - }, - { - "epoch": 2.9217447081462478, - "grad_norm": 0.8152785301208496, - "learning_rate": 1.3070494964645384e-06, - "loss": 0.1168, - "step": 4555 - }, - { - "epoch": 2.922386144964721, - "grad_norm": 1.1541779041290283, - "learning_rate": 1.2963359760017142e-06, - "loss": 0.1269, - "step": 4556 - }, - { - "epoch": 2.9230275817831943, - "grad_norm": 1.3332538604736328, - "learning_rate": 1.2856224555388903e-06, - "loss": 0.1293, - "step": 4557 - }, - { - "epoch": 2.9236690186016676, - "grad_norm": 1.6458852291107178, - "learning_rate": 1.274908935076066e-06, - "loss": 0.1537, - "step": 4558 - }, - { - "epoch": 2.924310455420141, - "grad_norm": 1.3973242044448853, - "learning_rate": 1.2641954146132419e-06, - "loss": 0.1315, - "step": 4559 - }, - { - "epoch": 2.9249518922386146, - "grad_norm": 1.1725722551345825, - "learning_rate": 1.253481894150418e-06, - "loss": 0.1213, - "step": 4560 - }, - { - "epoch": 2.925593329057088, - "grad_norm": 1.1168417930603027, - "learning_rate": 1.2427683736875937e-06, - "loss": 0.1354, - "step": 4561 - }, - { - "epoch": 2.926234765875561, - "grad_norm": 1.4243903160095215, - "learning_rate": 1.2320548532247698e-06, - "loss": 0.1297, - "step": 4562 - }, - { - "epoch": 2.9268762026940345, - "grad_norm": 1.6212537288665771, - "learning_rate": 1.2213413327619456e-06, - "loss": 0.1516, - "step": 4563 - }, - { - "epoch": 2.9275176395125078, - "grad_norm": 1.21247398853302, - "learning_rate": 1.2106278122991214e-06, - "loss": 0.1417, - "step": 4564 - }, - { - "epoch": 2.9281590763309815, - "grad_norm": 0.926612377166748, - "learning_rate": 1.1999142918362974e-06, - "loss": 0.1185, - "step": 4565 - }, - { - "epoch": 2.9288005131494548, - "grad_norm": 1.1218516826629639, - "learning_rate": 1.1892007713734735e-06, - "loss": 0.1283, - "step": 4566 - }, - { - "epoch": 2.929441949967928, - "grad_norm": 0.985855758190155, - "learning_rate": 1.1784872509106493e-06, - "loss": 0.1363, - "step": 4567 - }, - { - "epoch": 2.930083386786402, - "grad_norm": 1.4871960878372192, - "learning_rate": 1.1677737304478253e-06, - "loss": 0.1452, - "step": 4568 - }, - { - "epoch": 2.930724823604875, - "grad_norm": 1.3312674760818481, - "learning_rate": 1.1570602099850011e-06, - "loss": 0.1331, - "step": 4569 - }, - { - "epoch": 2.9313662604233484, - "grad_norm": 1.087379813194275, - "learning_rate": 1.1463466895221772e-06, - "loss": 0.1295, - "step": 4570 - }, - { - "epoch": 2.9320076972418216, - "grad_norm": 1.267376184463501, - "learning_rate": 1.135633169059353e-06, - "loss": 0.1311, - "step": 4571 - }, - { - "epoch": 2.932649134060295, - "grad_norm": 1.1369432210922241, - "learning_rate": 1.1249196485965288e-06, - "loss": 0.1309, - "step": 4572 - }, - { - "epoch": 2.9332905708787687, - "grad_norm": 0.8058706521987915, - "learning_rate": 1.1142061281337048e-06, - "loss": 0.1169, - "step": 4573 - }, - { - "epoch": 2.933932007697242, - "grad_norm": 0.9941306114196777, - "learning_rate": 1.1034926076708807e-06, - "loss": 0.1247, - "step": 4574 - }, - { - "epoch": 2.9345734445157152, - "grad_norm": 1.203942894935608, - "learning_rate": 1.0927790872080567e-06, - "loss": 0.1307, - "step": 4575 - }, - { - "epoch": 2.9352148813341885, - "grad_norm": 1.2787325382232666, - "learning_rate": 1.0820655667452325e-06, - "loss": 0.1394, - "step": 4576 - }, - { - "epoch": 2.935856318152662, - "grad_norm": 1.3175487518310547, - "learning_rate": 1.0713520462824083e-06, - "loss": 0.1408, - "step": 4577 - }, - { - "epoch": 2.9364977549711355, - "grad_norm": 1.6416971683502197, - "learning_rate": 1.0606385258195844e-06, - "loss": 0.1491, - "step": 4578 - }, - { - "epoch": 2.937139191789609, - "grad_norm": 1.4588253498077393, - "learning_rate": 1.0499250053567602e-06, - "loss": 0.1397, - "step": 4579 - }, - { - "epoch": 2.937780628608082, - "grad_norm": 1.4185206890106201, - "learning_rate": 1.0392114848939362e-06, - "loss": 0.1478, - "step": 4580 - }, - { - "epoch": 2.9384220654265554, - "grad_norm": 0.8662010431289673, - "learning_rate": 1.0284979644311122e-06, - "loss": 0.1209, - "step": 4581 - }, - { - "epoch": 2.9390635022450287, - "grad_norm": 1.0764386653900146, - "learning_rate": 1.017784443968288e-06, - "loss": 0.1295, - "step": 4582 - }, - { - "epoch": 2.9397049390635024, - "grad_norm": 0.9011473655700684, - "learning_rate": 1.007070923505464e-06, - "loss": 0.1222, - "step": 4583 - }, - { - "epoch": 2.9403463758819757, - "grad_norm": 1.0840494632720947, - "learning_rate": 9.9635740304264e-07, - "loss": 0.1244, - "step": 4584 - }, - { - "epoch": 2.940987812700449, - "grad_norm": 1.3971607685089111, - "learning_rate": 9.856438825798157e-07, - "loss": 0.138, - "step": 4585 - }, - { - "epoch": 2.9416292495189222, - "grad_norm": 1.5322881937026978, - "learning_rate": 9.749303621169918e-07, - "loss": 0.1411, - "step": 4586 - }, - { - "epoch": 2.9422706863373955, - "grad_norm": 0.8241583108901978, - "learning_rate": 9.642168416541676e-07, - "loss": 0.1248, - "step": 4587 - }, - { - "epoch": 2.9429121231558693, - "grad_norm": 1.1293078660964966, - "learning_rate": 9.535033211913435e-07, - "loss": 0.1269, - "step": 4588 - }, - { - "epoch": 2.9435535599743425, - "grad_norm": 1.2237790822982788, - "learning_rate": 9.427898007285194e-07, - "loss": 0.1311, - "step": 4589 - }, - { - "epoch": 2.944194996792816, - "grad_norm": 1.6656171083450317, - "learning_rate": 9.320762802656954e-07, - "loss": 0.1475, - "step": 4590 - }, - { - "epoch": 2.9448364336112896, - "grad_norm": 1.152736783027649, - "learning_rate": 9.213627598028713e-07, - "loss": 0.1315, - "step": 4591 - }, - { - "epoch": 2.945477870429763, - "grad_norm": 1.4101641178131104, - "learning_rate": 9.106492393400471e-07, - "loss": 0.1347, - "step": 4592 - }, - { - "epoch": 2.946119307248236, - "grad_norm": 1.3187679052352905, - "learning_rate": 8.99935718877223e-07, - "loss": 0.134, - "step": 4593 - }, - { - "epoch": 2.9467607440667094, - "grad_norm": 1.4426344633102417, - "learning_rate": 8.89222198414399e-07, - "loss": 0.1389, - "step": 4594 - }, - { - "epoch": 2.9474021808851827, - "grad_norm": 2.054165840148926, - "learning_rate": 8.785086779515749e-07, - "loss": 0.126, - "step": 4595 - }, - { - "epoch": 2.9480436177036564, - "grad_norm": 1.2682515382766724, - "learning_rate": 8.677951574887509e-07, - "loss": 0.1363, - "step": 4596 - }, - { - "epoch": 2.9486850545221297, - "grad_norm": 1.1330899000167847, - "learning_rate": 8.570816370259268e-07, - "loss": 0.1332, - "step": 4597 - }, - { - "epoch": 2.949326491340603, - "grad_norm": 0.7926548719406128, - "learning_rate": 8.463681165631028e-07, - "loss": 0.1198, - "step": 4598 - }, - { - "epoch": 2.9499679281590763, - "grad_norm": 1.5341383218765259, - "learning_rate": 8.356545961002787e-07, - "loss": 0.1389, - "step": 4599 - }, - { - "epoch": 2.9506093649775496, - "grad_norm": 0.8094181418418884, - "learning_rate": 8.249410756374545e-07, - "loss": 0.12, - "step": 4600 - }, - { - "epoch": 2.9512508017960233, - "grad_norm": 1.3672996759414673, - "learning_rate": 8.142275551746304e-07, - "loss": 0.1344, - "step": 4601 - }, - { - "epoch": 2.9518922386144966, - "grad_norm": 1.168218970298767, - "learning_rate": 8.035140347118064e-07, - "loss": 0.1195, - "step": 4602 - }, - { - "epoch": 2.95253367543297, - "grad_norm": 1.42991042137146, - "learning_rate": 7.928005142489823e-07, - "loss": 0.1309, - "step": 4603 - }, - { - "epoch": 2.953175112251443, - "grad_norm": 1.8129701614379883, - "learning_rate": 7.820869937861582e-07, - "loss": 0.1486, - "step": 4604 - }, - { - "epoch": 2.9538165490699164, - "grad_norm": 1.2078971862792969, - "learning_rate": 7.71373473323334e-07, - "loss": 0.1278, - "step": 4605 - }, - { - "epoch": 2.95445798588839, - "grad_norm": 0.7906535267829895, - "learning_rate": 7.6065995286051e-07, - "loss": 0.1215, - "step": 4606 - }, - { - "epoch": 2.9550994227068634, - "grad_norm": 0.8223199248313904, - "learning_rate": 7.49946432397686e-07, - "loss": 0.1184, - "step": 4607 - }, - { - "epoch": 2.9557408595253367, - "grad_norm": 1.4037939310073853, - "learning_rate": 7.392329119348619e-07, - "loss": 0.1292, - "step": 4608 - }, - { - "epoch": 2.95638229634381, - "grad_norm": 1.1070348024368286, - "learning_rate": 7.285193914720377e-07, - "loss": 0.126, - "step": 4609 - }, - { - "epoch": 2.9570237331622833, - "grad_norm": 1.3634099960327148, - "learning_rate": 7.178058710092137e-07, - "loss": 0.1338, - "step": 4610 - }, - { - "epoch": 2.957665169980757, - "grad_norm": 1.1455059051513672, - "learning_rate": 7.070923505463896e-07, - "loss": 0.1359, - "step": 4611 - }, - { - "epoch": 2.9583066067992303, - "grad_norm": 0.9972572326660156, - "learning_rate": 6.963788300835655e-07, - "loss": 0.1245, - "step": 4612 - }, - { - "epoch": 2.9589480436177036, - "grad_norm": 0.8801628351211548, - "learning_rate": 6.856653096207414e-07, - "loss": 0.1197, - "step": 4613 - }, - { - "epoch": 2.9595894804361773, - "grad_norm": 1.141021728515625, - "learning_rate": 6.749517891579172e-07, - "loss": 0.1335, - "step": 4614 - }, - { - "epoch": 2.96023091725465, - "grad_norm": 1.2504584789276123, - "learning_rate": 6.642382686950933e-07, - "loss": 0.1383, - "step": 4615 - }, - { - "epoch": 2.960872354073124, - "grad_norm": 0.850008487701416, - "learning_rate": 6.535247482322692e-07, - "loss": 0.1152, - "step": 4616 - }, - { - "epoch": 2.961513790891597, - "grad_norm": 1.1559522151947021, - "learning_rate": 6.428112277694451e-07, - "loss": 0.1363, - "step": 4617 - }, - { - "epoch": 2.9621552277100704, - "grad_norm": 1.330955982208252, - "learning_rate": 6.320977073066209e-07, - "loss": 0.1325, - "step": 4618 - }, - { - "epoch": 2.962796664528544, - "grad_norm": 1.141345739364624, - "learning_rate": 6.213841868437969e-07, - "loss": 0.1262, - "step": 4619 - }, - { - "epoch": 2.9634381013470175, - "grad_norm": 1.0081840753555298, - "learning_rate": 6.106706663809728e-07, - "loss": 0.1217, - "step": 4620 - }, - { - "epoch": 2.9640795381654907, - "grad_norm": 1.5882848501205444, - "learning_rate": 5.999571459181487e-07, - "loss": 0.1376, - "step": 4621 - }, - { - "epoch": 2.964720974983964, - "grad_norm": 1.1704416275024414, - "learning_rate": 5.892436254553246e-07, - "loss": 0.1262, - "step": 4622 - }, - { - "epoch": 2.9653624118024373, - "grad_norm": 1.3872243165969849, - "learning_rate": 5.785301049925006e-07, - "loss": 0.1347, - "step": 4623 - }, - { - "epoch": 2.966003848620911, - "grad_norm": 1.112236738204956, - "learning_rate": 5.678165845296765e-07, - "loss": 0.142, - "step": 4624 - }, - { - "epoch": 2.9666452854393843, - "grad_norm": 1.2294718027114868, - "learning_rate": 5.571030640668524e-07, - "loss": 0.1334, - "step": 4625 - }, - { - "epoch": 2.9672867222578576, - "grad_norm": 1.2323991060256958, - "learning_rate": 5.463895436040283e-07, - "loss": 0.1196, - "step": 4626 - }, - { - "epoch": 2.967928159076331, - "grad_norm": 1.2373995780944824, - "learning_rate": 5.356760231412042e-07, - "loss": 0.1366, - "step": 4627 - }, - { - "epoch": 2.968569595894804, - "grad_norm": 1.033111333847046, - "learning_rate": 5.249625026783801e-07, - "loss": 0.1277, - "step": 4628 - }, - { - "epoch": 2.969211032713278, - "grad_norm": 1.2212116718292236, - "learning_rate": 5.142489822155561e-07, - "loss": 0.1288, - "step": 4629 - }, - { - "epoch": 2.969852469531751, - "grad_norm": 1.13083815574646, - "learning_rate": 5.03535461752732e-07, - "loss": 0.122, - "step": 4630 - }, - { - "epoch": 2.9704939063502245, - "grad_norm": 1.6866799592971802, - "learning_rate": 4.928219412899079e-07, - "loss": 0.1435, - "step": 4631 - }, - { - "epoch": 2.9711353431686978, - "grad_norm": 1.5242289304733276, - "learning_rate": 4.821084208270838e-07, - "loss": 0.1448, - "step": 4632 - }, - { - "epoch": 2.971776779987171, - "grad_norm": 1.2560980319976807, - "learning_rate": 4.713949003642597e-07, - "loss": 0.1296, - "step": 4633 - }, - { - "epoch": 2.9724182168056448, - "grad_norm": 1.2616114616394043, - "learning_rate": 4.6068137990143564e-07, - "loss": 0.1347, - "step": 4634 - }, - { - "epoch": 2.973059653624118, - "grad_norm": 0.9732151031494141, - "learning_rate": 4.499678594386115e-07, - "loss": 0.1262, - "step": 4635 - }, - { - "epoch": 2.9737010904425913, - "grad_norm": 1.4985333681106567, - "learning_rate": 4.3925433897578744e-07, - "loss": 0.1401, - "step": 4636 - }, - { - "epoch": 2.9743425272610646, - "grad_norm": 2.0245468616485596, - "learning_rate": 4.285408185129634e-07, - "loss": 0.1676, - "step": 4637 - }, - { - "epoch": 2.974983964079538, - "grad_norm": 1.099732518196106, - "learning_rate": 4.1782729805013934e-07, - "loss": 0.1341, - "step": 4638 - }, - { - "epoch": 2.9756254008980116, - "grad_norm": 1.2069478034973145, - "learning_rate": 4.071137775873152e-07, - "loss": 0.1343, - "step": 4639 - }, - { - "epoch": 2.976266837716485, - "grad_norm": 0.8709354400634766, - "learning_rate": 3.9640025712449114e-07, - "loss": 0.1199, - "step": 4640 - }, - { - "epoch": 2.976908274534958, - "grad_norm": 1.2462657690048218, - "learning_rate": 3.85686736661667e-07, - "loss": 0.1368, - "step": 4641 - }, - { - "epoch": 2.977549711353432, - "grad_norm": 1.350285291671753, - "learning_rate": 3.74973216198843e-07, - "loss": 0.1418, - "step": 4642 - }, - { - "epoch": 2.9781911481719052, - "grad_norm": 1.060428261756897, - "learning_rate": 3.6425969573601886e-07, - "loss": 0.1295, - "step": 4643 - }, - { - "epoch": 2.9788325849903785, - "grad_norm": 1.2878365516662598, - "learning_rate": 3.535461752731948e-07, - "loss": 0.1345, - "step": 4644 - }, - { - "epoch": 2.979474021808852, - "grad_norm": 1.0508179664611816, - "learning_rate": 3.428326548103707e-07, - "loss": 0.1266, - "step": 4645 - }, - { - "epoch": 2.980115458627325, - "grad_norm": 0.8315081596374512, - "learning_rate": 3.3211913434754664e-07, - "loss": 0.1231, - "step": 4646 - }, - { - "epoch": 2.980756895445799, - "grad_norm": 1.3258248567581177, - "learning_rate": 3.2140561388472256e-07, - "loss": 0.1291, - "step": 4647 - }, - { - "epoch": 2.981398332264272, - "grad_norm": 1.2107499837875366, - "learning_rate": 3.1069209342189844e-07, - "loss": 0.1396, - "step": 4648 - }, - { - "epoch": 2.9820397690827454, - "grad_norm": 1.0102523565292358, - "learning_rate": 2.9997857295907436e-07, - "loss": 0.1345, - "step": 4649 - }, - { - "epoch": 2.9826812059012187, - "grad_norm": 0.9950736165046692, - "learning_rate": 2.892650524962503e-07, - "loss": 0.1262, - "step": 4650 - }, - { - "epoch": 2.983322642719692, - "grad_norm": 1.413913607597351, - "learning_rate": 2.785515320334262e-07, - "loss": 0.1361, - "step": 4651 - }, - { - "epoch": 2.9839640795381657, - "grad_norm": 1.1332772970199585, - "learning_rate": 2.678380115706021e-07, - "loss": 0.1327, - "step": 4652 - }, - { - "epoch": 2.984605516356639, - "grad_norm": 1.748238205909729, - "learning_rate": 2.5712449110777806e-07, - "loss": 0.1196, - "step": 4653 - }, - { - "epoch": 2.9852469531751122, - "grad_norm": 0.8984864354133606, - "learning_rate": 2.4641097064495393e-07, - "loss": 0.1234, - "step": 4654 - }, - { - "epoch": 2.9858883899935855, - "grad_norm": 1.8696599006652832, - "learning_rate": 2.3569745018212986e-07, - "loss": 0.146, - "step": 4655 - }, - { - "epoch": 2.986529826812059, - "grad_norm": 1.1008124351501465, - "learning_rate": 2.2498392971930576e-07, - "loss": 0.1211, - "step": 4656 - }, - { - "epoch": 2.9871712636305325, - "grad_norm": 1.084455966949463, - "learning_rate": 2.142704092564817e-07, - "loss": 0.126, - "step": 4657 - }, - { - "epoch": 2.987812700449006, - "grad_norm": 1.0937457084655762, - "learning_rate": 2.035568887936576e-07, - "loss": 0.1276, - "step": 4658 - }, - { - "epoch": 2.988454137267479, - "grad_norm": 1.1676932573318481, - "learning_rate": 1.928433683308335e-07, - "loss": 0.138, - "step": 4659 - }, - { - "epoch": 2.9890955740859524, - "grad_norm": 0.9583231210708618, - "learning_rate": 1.8212984786800943e-07, - "loss": 0.1255, - "step": 4660 - }, - { - "epoch": 2.9897370109044257, - "grad_norm": 1.6818722486495972, - "learning_rate": 1.7141632740518536e-07, - "loss": 0.1452, - "step": 4661 - }, - { - "epoch": 2.9903784477228994, - "grad_norm": 0.8881853818893433, - "learning_rate": 1.6070280694236128e-07, - "loss": 0.1163, - "step": 4662 - }, - { - "epoch": 2.9910198845413727, - "grad_norm": 1.4531528949737549, - "learning_rate": 1.4998928647953718e-07, - "loss": 0.1409, - "step": 4663 - }, - { - "epoch": 2.991661321359846, - "grad_norm": 1.7485257387161255, - "learning_rate": 1.392757660167131e-07, - "loss": 0.1532, - "step": 4664 - }, - { - "epoch": 2.9923027581783197, - "grad_norm": 1.6074093580245972, - "learning_rate": 1.2856224555388903e-07, - "loss": 0.1405, - "step": 4665 - }, - { - "epoch": 2.9929441949967925, - "grad_norm": 1.1350222826004028, - "learning_rate": 1.1784872509106493e-07, - "loss": 0.1219, - "step": 4666 - }, - { - "epoch": 2.9935856318152663, - "grad_norm": 2.1866416931152344, - "learning_rate": 1.0713520462824085e-07, - "loss": 0.15, - "step": 4667 - }, - { - "epoch": 2.9942270686337396, - "grad_norm": 1.5415709018707275, - "learning_rate": 9.642168416541675e-08, - "loss": 0.1468, - "step": 4668 - }, - { - "epoch": 2.994868505452213, - "grad_norm": 1.54656183719635, - "learning_rate": 8.570816370259268e-08, - "loss": 0.1414, - "step": 4669 - }, - { - "epoch": 2.9955099422706866, - "grad_norm": 0.8762620687484741, - "learning_rate": 7.499464323976859e-08, - "loss": 0.124, - "step": 4670 - }, - { - "epoch": 2.99615137908916, - "grad_norm": 1.443428635597229, - "learning_rate": 6.428112277694452e-08, - "loss": 0.1338, - "step": 4671 - }, - { - "epoch": 2.996792815907633, - "grad_norm": 1.2783855199813843, - "learning_rate": 5.356760231412043e-08, - "loss": 0.1383, - "step": 4672 - }, - { - "epoch": 2.9974342527261064, - "grad_norm": 1.8705984354019165, - "learning_rate": 4.285408185129634e-08, - "loss": 0.1551, - "step": 4673 - }, - { - "epoch": 2.9980756895445797, - "grad_norm": 1.4174028635025024, - "learning_rate": 3.214056138847226e-08, - "loss": 0.1301, - "step": 4674 - }, - { - "epoch": 2.9987171263630534, - "grad_norm": 1.4996004104614258, - "learning_rate": 2.142704092564817e-08, - "loss": 0.1376, - "step": 4675 - }, - { - "epoch": 2.9993585631815267, - "grad_norm": 1.3305389881134033, - "learning_rate": 1.0713520462824085e-08, - "loss": 0.1246, - "step": 4676 - }, - { - "epoch": 3.0, - "grad_norm": 1.1184957027435303, - "learning_rate": 0.0, - "loss": 0.1355, - "step": 4677 } ], "logging_steps": 1, @@ -32760,12 +24521,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": true + "should_training_stop": false }, "attributes": {} } }, - "total_flos": 5.88504585424724e+17, + "total_flos": 4.4029500652272845e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null