{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 506,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003952569169960474,
"grad_norm": 16.020900110158806,
"learning_rate": 9.999903631006022e-06,
"loss": 1.0657,
"step": 1
},
{
"epoch": 0.007905138339920948,
"grad_norm": 9.896813279978835,
"learning_rate": 9.999614527738882e-06,
"loss": 0.8817,
"step": 2
},
{
"epoch": 0.011857707509881422,
"grad_norm": 4.710814039433313,
"learning_rate": 9.99913270134281e-06,
"loss": 0.6038,
"step": 3
},
{
"epoch": 0.015810276679841896,
"grad_norm": 3.213510727342975,
"learning_rate": 9.998458170391065e-06,
"loss": 0.5555,
"step": 4
},
{
"epoch": 0.019762845849802372,
"grad_norm": 3.399601233005852,
"learning_rate": 9.99759096088519e-06,
"loss": 0.4486,
"step": 5
},
{
"epoch": 0.023715415019762844,
"grad_norm": 3.2249204890399112,
"learning_rate": 9.996531106254027e-06,
"loss": 0.4616,
"step": 6
},
{
"epoch": 0.02766798418972332,
"grad_norm": 2.5076762498803196,
"learning_rate": 9.995278647352428e-06,
"loss": 0.4705,
"step": 7
},
{
"epoch": 0.03162055335968379,
"grad_norm": 2.865637105499682,
"learning_rate": 9.993833632459675e-06,
"loss": 0.4065,
"step": 8
},
{
"epoch": 0.03557312252964427,
"grad_norm": 2.182471312496633,
"learning_rate": 9.99219611727762e-06,
"loss": 0.3791,
"step": 9
},
{
"epoch": 0.039525691699604744,
"grad_norm": 1.8817059063630837,
"learning_rate": 9.990366164928538e-06,
"loss": 0.3968,
"step": 10
},
{
"epoch": 0.043478260869565216,
"grad_norm": 1.7437964277471723,
"learning_rate": 9.988343845952697e-06,
"loss": 0.3593,
"step": 11
},
{
"epoch": 0.04743083003952569,
"grad_norm": 1.718975131845364,
"learning_rate": 9.986129238305635e-06,
"loss": 0.3241,
"step": 12
},
{
"epoch": 0.05138339920948617,
"grad_norm": 1.7519764781357152,
"learning_rate": 9.983722427355157e-06,
"loss": 0.3514,
"step": 13
},
{
"epoch": 0.05533596837944664,
"grad_norm": 1.8585607479109172,
"learning_rate": 9.98112350587804e-06,
"loss": 0.3307,
"step": 14
},
{
"epoch": 0.05928853754940711,
"grad_norm": 1.648833246586736,
"learning_rate": 9.978332574056468e-06,
"loss": 0.3382,
"step": 15
},
{
"epoch": 0.06324110671936758,
"grad_norm": 1.5144376829290096,
"learning_rate": 9.975349739474156e-06,
"loss": 0.3025,
"step": 16
},
{
"epoch": 0.06719367588932806,
"grad_norm": 1.4722946130746835,
"learning_rate": 9.972175117112208e-06,
"loss": 0.3291,
"step": 17
},
{
"epoch": 0.07114624505928854,
"grad_norm": 1.5590402414570852,
"learning_rate": 9.968808829344692e-06,
"loss": 0.318,
"step": 18
},
{
"epoch": 0.07509881422924901,
"grad_norm": 1.529364514445946,
"learning_rate": 9.965251005933915e-06,
"loss": 0.3029,
"step": 19
},
{
"epoch": 0.07905138339920949,
"grad_norm": 1.5355343280276874,
"learning_rate": 9.961501784025423e-06,
"loss": 0.3075,
"step": 20
},
{
"epoch": 0.08300395256916997,
"grad_norm": 1.4284315641862246,
"learning_rate": 9.95756130814271e-06,
"loss": 0.2733,
"step": 21
},
{
"epoch": 0.08695652173913043,
"grad_norm": 1.529934818238827,
"learning_rate": 9.953429730181653e-06,
"loss": 0.3319,
"step": 22
},
{
"epoch": 0.09090909090909091,
"grad_norm": 1.5594666788219782,
"learning_rate": 9.949107209404664e-06,
"loss": 0.2998,
"step": 23
},
{
"epoch": 0.09486166007905138,
"grad_norm": 1.43783031647119,
"learning_rate": 9.94459391243453e-06,
"loss": 0.2856,
"step": 24
},
{
"epoch": 0.09881422924901186,
"grad_norm": 1.6552583911804946,
"learning_rate": 9.939890013248006e-06,
"loss": 0.3287,
"step": 25
},
{
"epoch": 0.10276679841897234,
"grad_norm": 1.4553289738630346,
"learning_rate": 9.934995693169104e-06,
"loss": 0.2726,
"step": 26
},
{
"epoch": 0.1067193675889328,
"grad_norm": 1.7057032170116595,
"learning_rate": 9.929911140862109e-06,
"loss": 0.3412,
"step": 27
},
{
"epoch": 0.11067193675889328,
"grad_norm": 1.4032529471627393,
"learning_rate": 9.924636552324296e-06,
"loss": 0.2484,
"step": 28
},
{
"epoch": 0.11462450592885376,
"grad_norm": 1.6007146129563619,
"learning_rate": 9.919172130878378e-06,
"loss": 0.3265,
"step": 29
},
{
"epoch": 0.11857707509881422,
"grad_norm": 1.4391496848114302,
"learning_rate": 9.913518087164678e-06,
"loss": 0.285,
"step": 30
},
{
"epoch": 0.1225296442687747,
"grad_norm": 1.4062720696044961,
"learning_rate": 9.907674639132995e-06,
"loss": 0.2548,
"step": 31
},
{
"epoch": 0.12648221343873517,
"grad_norm": 1.410271063639066,
"learning_rate": 9.901642012034214e-06,
"loss": 0.2508,
"step": 32
},
{
"epoch": 0.13043478260869565,
"grad_norm": 1.5527348593856665,
"learning_rate": 9.895420438411616e-06,
"loss": 0.2976,
"step": 33
},
{
"epoch": 0.13438735177865613,
"grad_norm": 1.321537975619614,
"learning_rate": 9.889010158091917e-06,
"loss": 0.2572,
"step": 34
},
{
"epoch": 0.1383399209486166,
"grad_norm": 1.421996806107464,
"learning_rate": 9.882411418176023e-06,
"loss": 0.2706,
"step": 35
},
{
"epoch": 0.1422924901185771,
"grad_norm": 1.3572216549078873,
"learning_rate": 9.875624473029508e-06,
"loss": 0.2545,
"step": 36
},
{
"epoch": 0.14624505928853754,
"grad_norm": 1.2631690700776463,
"learning_rate": 9.8686495842728e-06,
"loss": 0.2625,
"step": 37
},
{
"epoch": 0.15019762845849802,
"grad_norm": 1.2621066835028363,
"learning_rate": 9.861487020771103e-06,
"loss": 0.2591,
"step": 38
},
{
"epoch": 0.1541501976284585,
"grad_norm": 1.2949235488783764,
"learning_rate": 9.854137058624034e-06,
"loss": 0.2309,
"step": 39
},
{
"epoch": 0.15810276679841898,
"grad_norm": 1.3003116841403821,
"learning_rate": 9.846599981154975e-06,
"loss": 0.2326,
"step": 40
},
{
"epoch": 0.16205533596837945,
"grad_norm": 1.3768166039897276,
"learning_rate": 9.838876078900158e-06,
"loss": 0.2397,
"step": 41
},
{
"epoch": 0.16600790513833993,
"grad_norm": 1.3538215396932514,
"learning_rate": 9.830965649597455e-06,
"loss": 0.252,
"step": 42
},
{
"epoch": 0.16996047430830039,
"grad_norm": 1.4163162387664199,
"learning_rate": 9.822868998174914e-06,
"loss": 0.2448,
"step": 43
},
{
"epoch": 0.17391304347826086,
"grad_norm": 1.4257156730091618,
"learning_rate": 9.814586436738998e-06,
"loss": 0.2371,
"step": 44
},
{
"epoch": 0.17786561264822134,
"grad_norm": 1.4437391682465417,
"learning_rate": 9.806118284562547e-06,
"loss": 0.2907,
"step": 45
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.391249530608691,
"learning_rate": 9.797464868072489e-06,
"loss": 0.2683,
"step": 46
},
{
"epoch": 0.1857707509881423,
"grad_norm": 1.4016200934541518,
"learning_rate": 9.788626520837235e-06,
"loss": 0.2473,
"step": 47
},
{
"epoch": 0.18972332015810275,
"grad_norm": 1.279328106626107,
"learning_rate": 9.779603583553842e-06,
"loss": 0.2341,
"step": 48
},
{
"epoch": 0.19367588932806323,
"grad_norm": 1.3404807365252567,
"learning_rate": 9.770396404034863e-06,
"loss": 0.2675,
"step": 49
},
{
"epoch": 0.1976284584980237,
"grad_norm": 1.4585279577064512,
"learning_rate": 9.76100533719495e-06,
"loss": 0.2685,
"step": 50
},
{
"epoch": 0.2015810276679842,
"grad_norm": 1.4424775598833863,
"learning_rate": 9.75143074503717e-06,
"loss": 0.2311,
"step": 51
},
{
"epoch": 0.20553359683794467,
"grad_norm": 1.3955125763500507,
"learning_rate": 9.741672996639046e-06,
"loss": 0.2627,
"step": 52
},
{
"epoch": 0.20948616600790515,
"grad_norm": 1.4097242793234939,
"learning_rate": 9.731732468138338e-06,
"loss": 0.2512,
"step": 53
},
{
"epoch": 0.2134387351778656,
"grad_norm": 1.2779409261860988,
"learning_rate": 9.72160954271854e-06,
"loss": 0.248,
"step": 54
},
{
"epoch": 0.21739130434782608,
"grad_norm": 1.3115141733721187,
"learning_rate": 9.711304610594104e-06,
"loss": 0.2548,
"step": 55
},
{
"epoch": 0.22134387351778656,
"grad_norm": 1.3040054071646883,
"learning_rate": 9.700818068995407e-06,
"loss": 0.2115,
"step": 56
},
{
"epoch": 0.22529644268774704,
"grad_norm": 1.3228671690231326,
"learning_rate": 9.69015032215344e-06,
"loss": 0.2372,
"step": 57
},
{
"epoch": 0.22924901185770752,
"grad_norm": 1.2810295113715335,
"learning_rate": 9.679301781284209e-06,
"loss": 0.2586,
"step": 58
},
{
"epoch": 0.233201581027668,
"grad_norm": 1.2300547258655599,
"learning_rate": 9.668272864572904e-06,
"loss": 0.2122,
"step": 59
},
{
"epoch": 0.23715415019762845,
"grad_norm": 1.331764793327734,
"learning_rate": 9.65706399715777e-06,
"loss": 0.2294,
"step": 60
},
{
"epoch": 0.24110671936758893,
"grad_norm": 1.245135656410948,
"learning_rate": 9.645675611113715e-06,
"loss": 0.2234,
"step": 61
},
{
"epoch": 0.2450592885375494,
"grad_norm": 1.2821048117267706,
"learning_rate": 9.634108145435665e-06,
"loss": 0.2111,
"step": 62
},
{
"epoch": 0.2490118577075099,
"grad_norm": 1.3953990850010947,
"learning_rate": 9.62236204602163e-06,
"loss": 0.2423,
"step": 63
},
{
"epoch": 0.25296442687747034,
"grad_norm": 1.3305442952896998,
"learning_rate": 9.610437765655522e-06,
"loss": 0.2169,
"step": 64
},
{
"epoch": 0.25691699604743085,
"grad_norm": 1.3427264285897607,
"learning_rate": 9.598335763989703e-06,
"loss": 0.2387,
"step": 65
},
{
"epoch": 0.2608695652173913,
"grad_norm": 1.2462397253811373,
"learning_rate": 9.586056507527266e-06,
"loss": 0.2002,
"step": 66
},
{
"epoch": 0.2648221343873518,
"grad_norm": 1.4106601403861017,
"learning_rate": 9.573600469604044e-06,
"loss": 0.2345,
"step": 67
},
{
"epoch": 0.26877470355731226,
"grad_norm": 1.4558970695841007,
"learning_rate": 9.560968130370376e-06,
"loss": 0.257,
"step": 68
},
{
"epoch": 0.2727272727272727,
"grad_norm": 1.2869342015693788,
"learning_rate": 9.548159976772593e-06,
"loss": 0.2272,
"step": 69
},
{
"epoch": 0.2766798418972332,
"grad_norm": 1.2457392708337578,
"learning_rate": 9.535176502534242e-06,
"loss": 0.2011,
"step": 70
},
{
"epoch": 0.28063241106719367,
"grad_norm": 1.2710277485877495,
"learning_rate": 9.522018208137066e-06,
"loss": 0.2198,
"step": 71
},
{
"epoch": 0.2845849802371542,
"grad_norm": 1.4429390388818002,
"learning_rate": 9.508685600801704e-06,
"loss": 0.2277,
"step": 72
},
{
"epoch": 0.2885375494071146,
"grad_norm": 1.2389843654949855,
"learning_rate": 9.495179194468135e-06,
"loss": 0.2269,
"step": 73
},
{
"epoch": 0.2924901185770751,
"grad_norm": 1.2313451113400058,
"learning_rate": 9.481499509775878e-06,
"loss": 0.2229,
"step": 74
},
{
"epoch": 0.2964426877470356,
"grad_norm": 1.2005736870957262,
"learning_rate": 9.467647074043911e-06,
"loss": 0.2132,
"step": 75
},
{
"epoch": 0.30039525691699603,
"grad_norm": 1.3356666012024943,
"learning_rate": 9.453622421250353e-06,
"loss": 0.2261,
"step": 76
},
{
"epoch": 0.30434782608695654,
"grad_norm": 1.307750115235254,
"learning_rate": 9.439426092011877e-06,
"loss": 0.2186,
"step": 77
},
{
"epoch": 0.308300395256917,
"grad_norm": 1.3023411955103703,
"learning_rate": 9.42505863356287e-06,
"loss": 0.2241,
"step": 78
},
{
"epoch": 0.31225296442687744,
"grad_norm": 1.2340243980259775,
"learning_rate": 9.410520599734338e-06,
"loss": 0.2195,
"step": 79
},
{
"epoch": 0.31620553359683795,
"grad_norm": 1.2136232872238946,
"learning_rate": 9.395812550932559e-06,
"loss": 0.2105,
"step": 80
},
{
"epoch": 0.3201581027667984,
"grad_norm": 1.3027913426674749,
"learning_rate": 9.38093505411748e-06,
"loss": 0.2112,
"step": 81
},
{
"epoch": 0.3241106719367589,
"grad_norm": 1.2355979390019485,
"learning_rate": 9.365888682780862e-06,
"loss": 0.2041,
"step": 82
},
{
"epoch": 0.32806324110671936,
"grad_norm": 1.3255611172622812,
"learning_rate": 9.35067401692417e-06,
"loss": 0.2133,
"step": 83
},
{
"epoch": 0.33201581027667987,
"grad_norm": 1.2110730151497104,
"learning_rate": 9.335291643036221e-06,
"loss": 0.1937,
"step": 84
},
{
"epoch": 0.3359683794466403,
"grad_norm": 1.2302820012512985,
"learning_rate": 9.319742154070578e-06,
"loss": 0.2127,
"step": 85
},
{
"epoch": 0.33992094861660077,
"grad_norm": 1.396952411441183,
"learning_rate": 9.30402614942268e-06,
"loss": 0.2498,
"step": 86
},
{
"epoch": 0.3438735177865613,
"grad_norm": 1.4467617558469186,
"learning_rate": 9.288144234906753e-06,
"loss": 0.2582,
"step": 87
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.2724563579817645,
"learning_rate": 9.272097022732444e-06,
"loss": 0.2094,
"step": 88
},
{
"epoch": 0.35177865612648224,
"grad_norm": 1.374464706651341,
"learning_rate": 9.255885131481231e-06,
"loss": 0.2333,
"step": 89
},
{
"epoch": 0.3557312252964427,
"grad_norm": 1.1876568411175543,
"learning_rate": 9.239509186082574e-06,
"loss": 0.1893,
"step": 90
},
{
"epoch": 0.35968379446640314,
"grad_norm": 1.3268240601065355,
"learning_rate": 9.222969817789829e-06,
"loss": 0.2124,
"step": 91
},
{
"epoch": 0.36363636363636365,
"grad_norm": 1.2681330364031638,
"learning_rate": 9.206267664155906e-06,
"loss": 0.1973,
"step": 92
},
{
"epoch": 0.3675889328063241,
"grad_norm": 1.2255166053795126,
"learning_rate": 9.189403369008704e-06,
"loss": 0.224,
"step": 93
},
{
"epoch": 0.3715415019762846,
"grad_norm": 1.3042156326130103,
"learning_rate": 9.172377582426286e-06,
"loss": 0.2366,
"step": 94
},
{
"epoch": 0.37549407114624506,
"grad_norm": 1.2467295856925376,
"learning_rate": 9.155190960711822e-06,
"loss": 0.2307,
"step": 95
},
{
"epoch": 0.3794466403162055,
"grad_norm": 1.3438520348323209,
"learning_rate": 9.137844166368289e-06,
"loss": 0.2358,
"step": 96
},
{
"epoch": 0.383399209486166,
"grad_norm": 1.195811544532563,
"learning_rate": 9.120337868072933e-06,
"loss": 0.1998,
"step": 97
},
{
"epoch": 0.38735177865612647,
"grad_norm": 1.1483129999113106,
"learning_rate": 9.1026727406515e-06,
"loss": 0.1992,
"step": 98
},
{
"epoch": 0.391304347826087,
"grad_norm": 1.2903993706253556,
"learning_rate": 9.08484946505221e-06,
"loss": 0.2103,
"step": 99
},
{
"epoch": 0.3952569169960474,
"grad_norm": 1.3546933181924383,
"learning_rate": 9.066868728319522e-06,
"loss": 0.2431,
"step": 100
},
{
"epoch": 0.39920948616600793,
"grad_norm": 1.2376669346579956,
"learning_rate": 9.048731223567636e-06,
"loss": 0.2112,
"step": 101
},
{
"epoch": 0.4031620553359684,
"grad_norm": 1.1433249084583228,
"learning_rate": 9.03043764995379e-06,
"loss": 0.187,
"step": 102
},
{
"epoch": 0.40711462450592883,
"grad_norm": 1.2494316975005797,
"learning_rate": 9.011988712651295e-06,
"loss": 0.2254,
"step": 103
},
{
"epoch": 0.41106719367588934,
"grad_norm": 1.164269222531622,
"learning_rate": 8.993385122822364e-06,
"loss": 0.196,
"step": 104
},
{
"epoch": 0.4150197628458498,
"grad_norm": 1.2168120947010408,
"learning_rate": 8.974627597590693e-06,
"loss": 0.1871,
"step": 105
},
{
"epoch": 0.4189723320158103,
"grad_norm": 1.2068782855519822,
"learning_rate": 8.955716860013812e-06,
"loss": 0.2061,
"step": 106
},
{
"epoch": 0.42292490118577075,
"grad_norm": 1.217200175520014,
"learning_rate": 8.936653639055225e-06,
"loss": 0.2128,
"step": 107
},
{
"epoch": 0.4268774703557312,
"grad_norm": 1.1978052117951064,
"learning_rate": 8.917438669556307e-06,
"loss": 0.2154,
"step": 108
},
{
"epoch": 0.4308300395256917,
"grad_norm": 1.2353347282860097,
"learning_rate": 8.898072692207964e-06,
"loss": 0.2166,
"step": 109
},
{
"epoch": 0.43478260869565216,
"grad_norm": 1.2843635498058075,
"learning_rate": 8.8785564535221e-06,
"loss": 0.222,
"step": 110
},
{
"epoch": 0.43873517786561267,
"grad_norm": 1.2399015935864328,
"learning_rate": 8.85889070580283e-06,
"loss": 0.2106,
"step": 111
},
{
"epoch": 0.4426877470355731,
"grad_norm": 1.2553929591861368,
"learning_rate": 8.839076207117485e-06,
"loss": 0.2428,
"step": 112
},
{
"epoch": 0.44664031620553357,
"grad_norm": 1.344840565207265,
"learning_rate": 8.819113721267385e-06,
"loss": 0.231,
"step": 113
},
{
"epoch": 0.4505928853754941,
"grad_norm": 1.1925173566212948,
"learning_rate": 8.7990040177584e-06,
"loss": 0.1862,
"step": 114
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.2960852273499728,
"learning_rate": 8.778747871771293e-06,
"loss": 0.2015,
"step": 115
},
{
"epoch": 0.45849802371541504,
"grad_norm": 1.222878498466299,
"learning_rate": 8.758346064131824e-06,
"loss": 0.2153,
"step": 116
},
{
"epoch": 0.4624505928853755,
"grad_norm": 1.274954724070636,
"learning_rate": 8.737799381280667e-06,
"loss": 0.2027,
"step": 117
},
{
"epoch": 0.466403162055336,
"grad_norm": 1.387737355454323,
"learning_rate": 8.717108615243081e-06,
"loss": 0.2221,
"step": 118
},
{
"epoch": 0.47035573122529645,
"grad_norm": 1.3426117595660954,
"learning_rate": 8.696274563598395e-06,
"loss": 0.2255,
"step": 119
},
{
"epoch": 0.4743083003952569,
"grad_norm": 1.119139769942221,
"learning_rate": 8.675298029449241e-06,
"loss": 0.18,
"step": 120
},
{
"epoch": 0.4782608695652174,
"grad_norm": 1.2709986698183513,
"learning_rate": 8.65417982139062e-06,
"loss": 0.2204,
"step": 121
},
{
"epoch": 0.48221343873517786,
"grad_norm": 1.3155178470853532,
"learning_rate": 8.63292075347872e-06,
"loss": 0.2098,
"step": 122
},
{
"epoch": 0.48616600790513836,
"grad_norm": 1.1164093498148737,
"learning_rate": 8.611521645199532e-06,
"loss": 0.1699,
"step": 123
},
{
"epoch": 0.4901185770750988,
"grad_norm": 1.2232423322705634,
"learning_rate": 8.589983321437271e-06,
"loss": 0.2048,
"step": 124
},
{
"epoch": 0.49407114624505927,
"grad_norm": 1.19896747867504,
"learning_rate": 8.568306612442579e-06,
"loss": 0.1818,
"step": 125
},
{
"epoch": 0.4980237154150198,
"grad_norm": 1.1935132661062275,
"learning_rate": 8.546492353800504e-06,
"loss": 0.1905,
"step": 126
},
{
"epoch": 0.5019762845849802,
"grad_norm": 1.3129073388913475,
"learning_rate": 8.524541386398318e-06,
"loss": 0.2249,
"step": 127
},
{
"epoch": 0.5059288537549407,
"grad_norm": 1.128105255244932,
"learning_rate": 8.502454556393071e-06,
"loss": 0.1853,
"step": 128
},
{
"epoch": 0.5098814229249012,
"grad_norm": 1.275277474434624,
"learning_rate": 8.480232715179004e-06,
"loss": 0.2268,
"step": 129
},
{
"epoch": 0.5138339920948617,
"grad_norm": 1.199574317058949,
"learning_rate": 8.457876719354708e-06,
"loss": 0.2159,
"step": 130
},
{
"epoch": 0.5177865612648221,
"grad_norm": 1.333440075825727,
"learning_rate": 8.435387430690114e-06,
"loss": 0.2071,
"step": 131
},
{
"epoch": 0.5217391304347826,
"grad_norm": 1.2298826627346486,
"learning_rate": 8.412765716093273e-06,
"loss": 0.1978,
"step": 132
},
{
"epoch": 0.525691699604743,
"grad_norm": 1.1487447920512157,
"learning_rate": 8.390012447576931e-06,
"loss": 0.1681,
"step": 133
},
{
"epoch": 0.5296442687747036,
"grad_norm": 1.1422378679889824,
"learning_rate": 8.367128502224931e-06,
"loss": 0.1799,
"step": 134
},
{
"epoch": 0.5335968379446641,
"grad_norm": 1.1642741860659866,
"learning_rate": 8.344114762158391e-06,
"loss": 0.1945,
"step": 135
},
{
"epoch": 0.5375494071146245,
"grad_norm": 1.3124275732270296,
"learning_rate": 8.320972114501698e-06,
"loss": 0.1994,
"step": 136
},
{
"epoch": 0.541501976284585,
"grad_norm": 1.1914086199917326,
"learning_rate": 8.297701451348324e-06,
"loss": 0.1982,
"step": 137
},
{
"epoch": 0.5454545454545454,
"grad_norm": 1.251119288035372,
"learning_rate": 8.274303669726427e-06,
"loss": 0.1961,
"step": 138
},
{
"epoch": 0.549407114624506,
"grad_norm": 1.1421929686539185,
"learning_rate": 8.250779671564277e-06,
"loss": 0.205,
"step": 139
},
{
"epoch": 0.5533596837944664,
"grad_norm": 1.26120508447205,
"learning_rate": 8.22713036365549e-06,
"loss": 0.2056,
"step": 140
},
{
"epoch": 0.5573122529644269,
"grad_norm": 1.1982016744926482,
"learning_rate": 8.20335665762407e-06,
"loss": 0.2152,
"step": 141
},
{
"epoch": 0.5612648221343873,
"grad_norm": 1.2176314854159234,
"learning_rate": 8.179459469889269e-06,
"loss": 0.2154,
"step": 142
},
{
"epoch": 0.5652173913043478,
"grad_norm": 1.2473624074495893,
"learning_rate": 8.155439721630265e-06,
"loss": 0.2175,
"step": 143
},
{
"epoch": 0.5691699604743083,
"grad_norm": 1.1267383357984246,
"learning_rate": 8.131298338750648e-06,
"loss": 0.1892,
"step": 144
},
{
"epoch": 0.5731225296442688,
"grad_norm": 1.182358551787801,
"learning_rate": 8.10703625184273e-06,
"loss": 0.1972,
"step": 145
},
{
"epoch": 0.5770750988142292,
"grad_norm": 1.2568983709583992,
"learning_rate": 8.082654396151676e-06,
"loss": 0.2156,
"step": 146
},
{
"epoch": 0.5810276679841897,
"grad_norm": 1.3251313569923084,
"learning_rate": 8.058153711539444e-06,
"loss": 0.2089,
"step": 147
},
{
"epoch": 0.5849802371541502,
"grad_norm": 1.0726734543721594,
"learning_rate": 8.03353514244857e-06,
"loss": 0.1809,
"step": 148
},
{
"epoch": 0.5889328063241107,
"grad_norm": 1.217655834438009,
"learning_rate": 8.008799637865741e-06,
"loss": 0.2023,
"step": 149
},
{
"epoch": 0.5928853754940712,
"grad_norm": 1.0772407801697832,
"learning_rate": 7.983948151285242e-06,
"loss": 0.1796,
"step": 150
},
{
"epoch": 0.5968379446640316,
"grad_norm": 1.2413074180785202,
"learning_rate": 7.958981640672173e-06,
"loss": 0.1977,
"step": 151
},
{
"epoch": 0.6007905138339921,
"grad_norm": 1.0852808186316445,
"learning_rate": 7.933901068425539e-06,
"loss": 0.1731,
"step": 152
},
{
"epoch": 0.6047430830039525,
"grad_norm": 1.299528282255904,
"learning_rate": 7.908707401341146e-06,
"loss": 0.2216,
"step": 153
},
{
"epoch": 0.6086956521739131,
"grad_norm": 1.185556918204072,
"learning_rate": 7.883401610574338e-06,
"loss": 0.1908,
"step": 154
},
{
"epoch": 0.6126482213438735,
"grad_norm": 1.304948505750968,
"learning_rate": 7.857984671602547e-06,
"loss": 0.2008,
"step": 155
},
{
"epoch": 0.616600790513834,
"grad_norm": 1.1900402122684852,
"learning_rate": 7.832457564187715e-06,
"loss": 0.1706,
"step": 156
},
{
"epoch": 0.6205533596837944,
"grad_norm": 1.210872241558616,
"learning_rate": 7.806821272338504e-06,
"loss": 0.2054,
"step": 157
},
{
"epoch": 0.6245059288537549,
"grad_norm": 1.1791680850500765,
"learning_rate": 7.781076784272377e-06,
"loss": 0.1911,
"step": 158
},
{
"epoch": 0.6284584980237155,
"grad_norm": 1.2064915376432837,
"learning_rate": 7.755225092377498e-06,
"loss": 0.2123,
"step": 159
},
{
"epoch": 0.6324110671936759,
"grad_norm": 1.1788766410507692,
"learning_rate": 7.729267193174483e-06,
"loss": 0.2166,
"step": 160
},
{
"epoch": 0.6363636363636364,
"grad_norm": 1.2252698894797731,
"learning_rate": 7.703204087277989e-06,
"loss": 0.1919,
"step": 161
},
{
"epoch": 0.6403162055335968,
"grad_norm": 1.349649897365813,
"learning_rate": 7.67703677935813e-06,
"loss": 0.2178,
"step": 162
},
{
"epoch": 0.6442687747035574,
"grad_norm": 1.1345917440027748,
"learning_rate": 7.650766278101762e-06,
"loss": 0.197,
"step": 163
},
{
"epoch": 0.6482213438735178,
"grad_norm": 1.287989390837344,
"learning_rate": 7.624393596173598e-06,
"loss": 0.2381,
"step": 164
},
{
"epoch": 0.6521739130434783,
"grad_norm": 1.1299739332721368,
"learning_rate": 7.597919750177168e-06,
"loss": 0.1808,
"step": 165
},
{
"epoch": 0.6561264822134387,
"grad_norm": 1.2891189521599526,
"learning_rate": 7.5713457606156335e-06,
"loss": 0.219,
"step": 166
},
{
"epoch": 0.6600790513833992,
"grad_norm": 1.2643945396009584,
"learning_rate": 7.5446726518524505e-06,
"loss": 0.2049,
"step": 167
},
{
"epoch": 0.6640316205533597,
"grad_norm": 1.2293140206996043,
"learning_rate": 7.51790145207188e-06,
"loss": 0.2103,
"step": 168
},
{
"epoch": 0.6679841897233202,
"grad_norm": 1.178935960357507,
"learning_rate": 7.4910331932393634e-06,
"loss": 0.199,
"step": 169
},
{
"epoch": 0.6719367588932806,
"grad_norm": 1.2052463331102627,
"learning_rate": 7.464068911061726e-06,
"loss": 0.2096,
"step": 170
},
{
"epoch": 0.6758893280632411,
"grad_norm": 1.1862206001747997,
"learning_rate": 7.437009644947268e-06,
"loss": 0.1987,
"step": 171
},
{
"epoch": 0.6798418972332015,
"grad_norm": 1.256150663623612,
"learning_rate": 7.40985643796569e-06,
"loss": 0.2198,
"step": 172
},
{
"epoch": 0.6837944664031621,
"grad_norm": 1.1442878472683295,
"learning_rate": 7.382610336807887e-06,
"loss": 0.1735,
"step": 173
},
{
"epoch": 0.6877470355731226,
"grad_norm": 1.21934881533331,
"learning_rate": 7.355272391745605e-06,
"loss": 0.201,
"step": 174
},
{
"epoch": 0.691699604743083,
"grad_norm": 1.2085810780811668,
"learning_rate": 7.327843656590948e-06,
"loss": 0.1997,
"step": 175
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.1706763346917641,
"learning_rate": 7.300325188655762e-06,
"loss": 0.175,
"step": 176
},
{
"epoch": 0.6996047430830039,
"grad_norm": 1.3103080637398228,
"learning_rate": 7.2727180487108725e-06,
"loss": 0.2316,
"step": 177
},
{
"epoch": 0.7035573122529645,
"grad_norm": 1.2414175503064024,
"learning_rate": 7.245023300945203e-06,
"loss": 0.2086,
"step": 178
},
{
"epoch": 0.7075098814229249,
"grad_norm": 1.1028313396028102,
"learning_rate": 7.217242012924747e-06,
"loss": 0.1614,
"step": 179
},
{
"epoch": 0.7114624505928854,
"grad_norm": 1.2545708120381063,
"learning_rate": 7.189375255551413e-06,
"loss": 0.2129,
"step": 180
},
{
"epoch": 0.7154150197628458,
"grad_norm": 1.2208082871634096,
"learning_rate": 7.161424103021752e-06,
"loss": 0.186,
"step": 181
},
{
"epoch": 0.7193675889328063,
"grad_norm": 1.1856323959588346,
"learning_rate": 7.133389632785543e-06,
"loss": 0.18,
"step": 182
},
{
"epoch": 0.7233201581027668,
"grad_norm": 1.0491684795158043,
"learning_rate": 7.1052729255042645e-06,
"loss": 0.1738,
"step": 183
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.1602063012667307,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.1888,
"step": 184
},
{
"epoch": 0.7312252964426877,
"grad_norm": 1.168233348357676,
"learning_rate": 7.048797138260829e-06,
"loss": 0.1938,
"step": 185
},
{
"epoch": 0.7351778656126482,
"grad_norm": 1.0998419207012584,
"learning_rate": 7.020440235304593e-06,
"loss": 0.1541,
"step": 186
},
{
"epoch": 0.7391304347826086,
"grad_norm": 1.2094474362406284,
"learning_rate": 6.9920054492312086e-06,
"loss": 0.1863,
"step": 187
},
{
"epoch": 0.7430830039525692,
"grad_norm": 1.1862926984636035,
"learning_rate": 6.963493876133367e-06,
"loss": 0.1991,
"step": 188
},
{
"epoch": 0.7470355731225297,
"grad_norm": 1.309657217121461,
"learning_rate": 6.934906615063716e-06,
"loss": 0.2202,
"step": 189
},
{
"epoch": 0.7509881422924901,
"grad_norm": 1.1164109935733424,
"learning_rate": 6.90624476799249e-06,
"loss": 0.1755,
"step": 190
},
{
"epoch": 0.7549407114624506,
"grad_norm": 1.0725104657393945,
"learning_rate": 6.8775094397650375e-06,
"loss": 0.1669,
"step": 191
},
{
"epoch": 0.758893280632411,
"grad_norm": 1.2352816410215464,
"learning_rate": 6.8487017380592266e-06,
"loss": 0.1962,
"step": 192
},
{
"epoch": 0.7628458498023716,
"grad_norm": 1.145478161105048,
"learning_rate": 6.81982277334275e-06,
"loss": 0.1864,
"step": 193
},
{
"epoch": 0.766798418972332,
"grad_norm": 1.2512415214036399,
"learning_rate": 6.790873658830321e-06,
"loss": 0.1972,
"step": 194
},
{
"epoch": 0.7707509881422925,
"grad_norm": 1.2466180949277024,
"learning_rate": 6.761855510440752e-06,
"loss": 0.1817,
"step": 195
},
{
"epoch": 0.7747035573122529,
"grad_norm": 1.181804911384916,
"learning_rate": 6.732769446753954e-06,
"loss": 0.175,
"step": 196
},
{
"epoch": 0.7786561264822134,
"grad_norm": 1.2910997699602191,
"learning_rate": 6.703616588967804e-06,
"loss": 0.2146,
"step": 197
},
{
"epoch": 0.782608695652174,
"grad_norm": 1.2156034432115361,
"learning_rate": 6.674398060854931e-06,
"loss": 0.2065,
"step": 198
},
{
"epoch": 0.7865612648221344,
"grad_norm": 1.2324700294150777,
"learning_rate": 6.645114988719401e-06,
"loss": 0.218,
"step": 199
},
{
"epoch": 0.7905138339920948,
"grad_norm": 1.1788814940788603,
"learning_rate": 6.615768501353297e-06,
"loss": 0.1911,
"step": 200
},
{
"epoch": 0.7944664031620553,
"grad_norm": 1.239782727484998,
"learning_rate": 6.5863597299932e-06,
"loss": 0.2033,
"step": 201
},
{
"epoch": 0.7984189723320159,
"grad_norm": 1.2328166491787746,
"learning_rate": 6.5568898082765945e-06,
"loss": 0.1885,
"step": 202
},
{
"epoch": 0.8023715415019763,
"grad_norm": 1.2257569520585552,
"learning_rate": 6.527359872198166e-06,
"loss": 0.2087,
"step": 203
},
{
"epoch": 0.8063241106719368,
"grad_norm": 1.090893060994598,
"learning_rate": 6.497771060066008e-06,
"loss": 0.1673,
"step": 204
},
{
"epoch": 0.8102766798418972,
"grad_norm": 1.262115835491651,
"learning_rate": 6.468124512457743e-06,
"loss": 0.2173,
"step": 205
},
{
"epoch": 0.8142292490118577,
"grad_norm": 1.1404830620435313,
"learning_rate": 6.4384213721765565e-06,
"loss": 0.1821,
"step": 206
},
{
"epoch": 0.8181818181818182,
"grad_norm": 1.0789980915654802,
"learning_rate": 6.408662784207149e-06,
"loss": 0.1809,
"step": 207
},
{
"epoch": 0.8221343873517787,
"grad_norm": 1.1275632208098325,
"learning_rate": 6.378849895671594e-06,
"loss": 0.1807,
"step": 208
},
{
"epoch": 0.8260869565217391,
"grad_norm": 1.1869280328722958,
"learning_rate": 6.348983855785122e-06,
"loss": 0.193,
"step": 209
},
{
"epoch": 0.8300395256916996,
"grad_norm": 1.1369831660790293,
"learning_rate": 6.3190658158118205e-06,
"loss": 0.1736,
"step": 210
},
{
"epoch": 0.83399209486166,
"grad_norm": 1.099689140622585,
"learning_rate": 6.289096929020254e-06,
"loss": 0.1737,
"step": 211
},
{
"epoch": 0.8379446640316206,
"grad_norm": 1.1149596155598573,
"learning_rate": 6.25907835063901e-06,
"loss": 0.185,
"step": 212
},
{
"epoch": 0.841897233201581,
"grad_norm": 1.2181136196244295,
"learning_rate": 6.229011237812172e-06,
"loss": 0.2183,
"step": 213
},
{
"epoch": 0.8458498023715415,
"grad_norm": 1.1288870761802783,
"learning_rate": 6.1988967495547016e-06,
"loss": 0.1705,
"step": 214
},
{
"epoch": 0.849802371541502,
"grad_norm": 1.2184909621033004,
"learning_rate": 6.168736046707777e-06,
"loss": 0.2087,
"step": 215
},
{
"epoch": 0.8537549407114624,
"grad_norm": 1.0707635750160447,
"learning_rate": 6.138530291894033e-06,
"loss": 0.1599,
"step": 216
},
{
"epoch": 0.857707509881423,
"grad_norm": 1.15244187620798,
"learning_rate": 6.108280649472751e-06,
"loss": 0.1977,
"step": 217
},
{
"epoch": 0.8616600790513834,
"grad_norm": 1.140414407474285,
"learning_rate": 6.0779882854949745e-06,
"loss": 0.1916,
"step": 218
},
{
"epoch": 0.8656126482213439,
"grad_norm": 1.168184854563085,
"learning_rate": 6.047654367658563e-06,
"loss": 0.1877,
"step": 219
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.095382559762059,
"learning_rate": 6.0172800652631706e-06,
"loss": 0.1941,
"step": 220
},
{
"epoch": 0.8735177865612648,
"grad_norm": 1.170558717112789,
"learning_rate": 5.986866549165185e-06,
"loss": 0.1841,
"step": 221
},
{
"epoch": 0.8774703557312253,
"grad_norm": 1.1273692690940986,
"learning_rate": 5.9564149917325845e-06,
"loss": 0.1776,
"step": 222
},
{
"epoch": 0.8814229249011858,
"grad_norm": 1.1615247705497285,
"learning_rate": 5.925926566799754e-06,
"loss": 0.1923,
"step": 223
},
{
"epoch": 0.8853754940711462,
"grad_norm": 1.1310915029362631,
"learning_rate": 5.895402449622226e-06,
"loss": 0.1772,
"step": 224
},
{
"epoch": 0.8893280632411067,
"grad_norm": 1.168875863428828,
"learning_rate": 5.864843816831388e-06,
"loss": 0.1844,
"step": 225
},
{
"epoch": 0.8932806324110671,
"grad_norm": 1.1674668972919495,
"learning_rate": 5.8342518463891195e-06,
"loss": 0.1632,
"step": 226
},
{
"epoch": 0.8972332015810277,
"grad_norm": 1.2880944650610078,
"learning_rate": 5.803627717542386e-06,
"loss": 0.1961,
"step": 227
},
{
"epoch": 0.9011857707509882,
"grad_norm": 1.224311023303702,
"learning_rate": 5.7729726107777855e-06,
"loss": 0.1884,
"step": 228
},
{
"epoch": 0.9051383399209486,
"grad_norm": 1.1555445601349188,
"learning_rate": 5.742287707776034e-06,
"loss": 0.1746,
"step": 229
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.137507280031818,
"learning_rate": 5.711574191366427e-06,
"loss": 0.1767,
"step": 230
},
{
"epoch": 0.9130434782608695,
"grad_norm": 1.1300350500871745,
"learning_rate": 5.680833245481234e-06,
"loss": 0.1748,
"step": 231
},
{
"epoch": 0.9169960474308301,
"grad_norm": 1.061749424236297,
"learning_rate": 5.650066055110067e-06,
"loss": 0.1682,
"step": 232
},
{
"epoch": 0.9209486166007905,
"grad_norm": 1.270168772191801,
"learning_rate": 5.6192738062542e-06,
"loss": 0.1883,
"step": 233
},
{
"epoch": 0.924901185770751,
"grad_norm": 1.0546305674656924,
"learning_rate": 5.588457685880851e-06,
"loss": 0.1634,
"step": 234
},
{
"epoch": 0.9288537549407114,
"grad_norm": 1.1513270673921783,
"learning_rate": 5.557618881877428e-06,
"loss": 0.1745,
"step": 235
},
{
"epoch": 0.932806324110672,
"grad_norm": 1.4466685740671457,
"learning_rate": 5.526758583005736e-06,
"loss": 0.2153,
"step": 236
},
{
"epoch": 0.9367588932806324,
"grad_norm": 1.2211664220988006,
"learning_rate": 5.495877978856159e-06,
"loss": 0.1962,
"step": 237
},
{
"epoch": 0.9407114624505929,
"grad_norm": 1.1382037688163558,
"learning_rate": 5.464978259801797e-06,
"loss": 0.1765,
"step": 238
},
{
"epoch": 0.9446640316205533,
"grad_norm": 1.1600221011599767,
"learning_rate": 5.4340606169525915e-06,
"loss": 0.1948,
"step": 239
},
{
"epoch": 0.9486166007905138,
"grad_norm": 1.2329095686972222,
"learning_rate": 5.40312624210939e-06,
"loss": 0.1958,
"step": 240
},
{
"epoch": 0.9525691699604744,
"grad_norm": 1.1077759614799594,
"learning_rate": 5.372176327718029e-06,
"loss": 0.1772,
"step": 241
},
{
"epoch": 0.9565217391304348,
"grad_norm": 1.3525530316416365,
"learning_rate": 5.341212066823356e-06,
"loss": 0.2068,
"step": 242
},
{
"epoch": 0.9604743083003953,
"grad_norm": 1.1641895345672653,
"learning_rate": 5.3102346530232365e-06,
"loss": 0.1983,
"step": 243
},
{
"epoch": 0.9644268774703557,
"grad_norm": 1.210471873881558,
"learning_rate": 5.2792452804225535e-06,
"loss": 0.203,
"step": 244
},
{
"epoch": 0.9683794466403162,
"grad_norm": 1.1895101061042292,
"learning_rate": 5.248245143587172e-06,
"loss": 0.1838,
"step": 245
},
{
"epoch": 0.9723320158102767,
"grad_norm": 1.132321900931639,
"learning_rate": 5.2172354374978905e-06,
"loss": 0.1843,
"step": 246
},
{
"epoch": 0.9762845849802372,
"grad_norm": 1.2024875975928633,
"learning_rate": 5.186217357504382e-06,
"loss": 0.1971,
"step": 247
},
{
"epoch": 0.9802371541501976,
"grad_norm": 1.1385111787681224,
"learning_rate": 5.155192099279113e-06,
"loss": 0.1581,
"step": 248
},
{
"epoch": 0.9841897233201581,
"grad_norm": 1.1812364752105893,
"learning_rate": 5.124160858771252e-06,
"loss": 0.1868,
"step": 249
},
{
"epoch": 0.9881422924901185,
"grad_norm": 1.0663746142171262,
"learning_rate": 5.093124832160569e-06,
"loss": 0.1702,
"step": 250
},
{
"epoch": 0.9920948616600791,
"grad_norm": 1.0637107177598129,
"learning_rate": 5.06208521581133e-06,
"loss": 0.1758,
"step": 251
},
{
"epoch": 0.9960474308300395,
"grad_norm": 1.082027772728129,
"learning_rate": 5.0310432062261764e-06,
"loss": 0.1718,
"step": 252
},
{
"epoch": 1.0,
"grad_norm": 1.2004136292445025,
"learning_rate": 5e-06,
"loss": 0.18,
"step": 253
},
{
"epoch": 1.0039525691699605,
"grad_norm": 1.1438358979908518,
"learning_rate": 4.968956793773825e-06,
"loss": 0.128,
"step": 254
},
{
"epoch": 1.007905138339921,
"grad_norm": 1.0815724819973256,
"learning_rate": 4.9379147841886715e-06,
"loss": 0.1374,
"step": 255
},
{
"epoch": 1.0118577075098814,
"grad_norm": 0.9971041298616835,
"learning_rate": 4.906875167839433e-06,
"loss": 0.1295,
"step": 256
},
{
"epoch": 1.0158102766798418,
"grad_norm": 1.0834575346736253,
"learning_rate": 4.875839141228751e-06,
"loss": 0.1435,
"step": 257
},
{
"epoch": 1.0197628458498025,
"grad_norm": 1.068216857230524,
"learning_rate": 4.844807900720888e-06,
"loss": 0.1369,
"step": 258
},
{
"epoch": 1.023715415019763,
"grad_norm": 1.1436718795229075,
"learning_rate": 4.813782642495618e-06,
"loss": 0.1299,
"step": 259
},
{
"epoch": 1.0276679841897234,
"grad_norm": 1.2041205262898764,
"learning_rate": 4.78276456250211e-06,
"loss": 0.1393,
"step": 260
},
{
"epoch": 1.0316205533596838,
"grad_norm": 1.3153176210311313,
"learning_rate": 4.75175485641283e-06,
"loss": 0.1587,
"step": 261
},
{
"epoch": 1.0355731225296443,
"grad_norm": 1.2818237661945875,
"learning_rate": 4.720754719577448e-06,
"loss": 0.1417,
"step": 262
},
{
"epoch": 1.0395256916996047,
"grad_norm": 1.346590041013075,
"learning_rate": 4.689765346976765e-06,
"loss": 0.1414,
"step": 263
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.396103108426267,
"learning_rate": 4.6587879331766465e-06,
"loss": 0.1541,
"step": 264
},
{
"epoch": 1.0474308300395256,
"grad_norm": 1.3369506628795504,
"learning_rate": 4.627823672281972e-06,
"loss": 0.1566,
"step": 265
},
{
"epoch": 1.051383399209486,
"grad_norm": 1.2097827541662982,
"learning_rate": 4.596873757890612e-06,
"loss": 0.1381,
"step": 266
},
{
"epoch": 1.0553359683794465,
"grad_norm": 1.1170762868644442,
"learning_rate": 4.565939383047411e-06,
"loss": 0.1135,
"step": 267
},
{
"epoch": 1.0592885375494072,
"grad_norm": 1.2233155007588437,
"learning_rate": 4.535021740198202e-06,
"loss": 0.1609,
"step": 268
},
{
"epoch": 1.0632411067193677,
"grad_norm": 1.2914900494723907,
"learning_rate": 4.504122021143842e-06,
"loss": 0.1454,
"step": 269
},
{
"epoch": 1.0671936758893281,
"grad_norm": 1.0833968374959027,
"learning_rate": 4.473241416994265e-06,
"loss": 0.1374,
"step": 270
},
{
"epoch": 1.0711462450592886,
"grad_norm": 1.130910587654628,
"learning_rate": 4.442381118122573e-06,
"loss": 0.1249,
"step": 271
},
{
"epoch": 1.075098814229249,
"grad_norm": 1.0290406864536354,
"learning_rate": 4.41154231411915e-06,
"loss": 0.1179,
"step": 272
},
{
"epoch": 1.0790513833992095,
"grad_norm": 1.1351038290482456,
"learning_rate": 4.3807261937458005e-06,
"loss": 0.1446,
"step": 273
},
{
"epoch": 1.08300395256917,
"grad_norm": 0.9979191858289315,
"learning_rate": 4.349933944889934e-06,
"loss": 0.1085,
"step": 274
},
{
"epoch": 1.0869565217391304,
"grad_norm": 1.0703887978514581,
"learning_rate": 4.319166754518768e-06,
"loss": 0.1277,
"step": 275
},
{
"epoch": 1.0909090909090908,
"grad_norm": 1.1475934247053179,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.1394,
"step": 276
},
{
"epoch": 1.0948616600790513,
"grad_norm": 1.1836305673827001,
"learning_rate": 4.257712292223967e-06,
"loss": 0.1342,
"step": 277
},
{
"epoch": 1.098814229249012,
"grad_norm": 1.2046134166270397,
"learning_rate": 4.227027389222215e-06,
"loss": 0.1344,
"step": 278
},
{
"epoch": 1.1027667984189724,
"grad_norm": 1.035753316225185,
"learning_rate": 4.196372282457614e-06,
"loss": 0.1127,
"step": 279
},
{
"epoch": 1.1067193675889329,
"grad_norm": 1.1282169817707273,
"learning_rate": 4.165748153610881e-06,
"loss": 0.1293,
"step": 280
},
{
"epoch": 1.1106719367588933,
"grad_norm": 1.1372474070307015,
"learning_rate": 4.1351561831686136e-06,
"loss": 0.1332,
"step": 281
},
{
"epoch": 1.1146245059288538,
"grad_norm": 1.1594526273221435,
"learning_rate": 4.104597550377776e-06,
"loss": 0.1263,
"step": 282
},
{
"epoch": 1.1185770750988142,
"grad_norm": 1.15152490259451,
"learning_rate": 4.074073433200249e-06,
"loss": 0.1285,
"step": 283
},
{
"epoch": 1.1225296442687747,
"grad_norm": 1.1540429682296736,
"learning_rate": 4.043585008267418e-06,
"loss": 0.118,
"step": 284
},
{
"epoch": 1.1264822134387351,
"grad_norm": 1.2879322008831273,
"learning_rate": 4.013133450834818e-06,
"loss": 0.1532,
"step": 285
},
{
"epoch": 1.1304347826086956,
"grad_norm": 1.021378919711874,
"learning_rate": 3.982719934736832e-06,
"loss": 0.0988,
"step": 286
},
{
"epoch": 1.1343873517786562,
"grad_norm": 1.1026842967832002,
"learning_rate": 3.95234563234144e-06,
"loss": 0.1148,
"step": 287
},
{
"epoch": 1.1383399209486167,
"grad_norm": 1.1221443001758493,
"learning_rate": 3.9220117145050254e-06,
"loss": 0.1346,
"step": 288
},
{
"epoch": 1.1422924901185771,
"grad_norm": 1.179825748135588,
"learning_rate": 3.89171935052725e-06,
"loss": 0.132,
"step": 289
},
{
"epoch": 1.1462450592885376,
"grad_norm": 1.1145447568885478,
"learning_rate": 3.861469708105969e-06,
"loss": 0.1262,
"step": 290
},
{
"epoch": 1.150197628458498,
"grad_norm": 1.1805243512710089,
"learning_rate": 3.831263953292225e-06,
"loss": 0.1464,
"step": 291
},
{
"epoch": 1.1541501976284585,
"grad_norm": 1.1752099012387707,
"learning_rate": 3.8011032504453e-06,
"loss": 0.1303,
"step": 292
},
{
"epoch": 1.158102766798419,
"grad_norm": 1.1724153479966961,
"learning_rate": 3.7709887621878305e-06,
"loss": 0.1237,
"step": 293
},
{
"epoch": 1.1620553359683794,
"grad_norm": 1.1312088107686393,
"learning_rate": 3.740921649360991e-06,
"loss": 0.1329,
"step": 294
},
{
"epoch": 1.1660079051383399,
"grad_norm": 1.2140114398756852,
"learning_rate": 3.710903070979749e-06,
"loss": 0.146,
"step": 295
},
{
"epoch": 1.1699604743083003,
"grad_norm": 1.177316021638641,
"learning_rate": 3.680934184188182e-06,
"loss": 0.1369,
"step": 296
},
{
"epoch": 1.1739130434782608,
"grad_norm": 1.0958487706218825,
"learning_rate": 3.6510161442148783e-06,
"loss": 0.1224,
"step": 297
},
{
"epoch": 1.1778656126482214,
"grad_norm": 1.0391815446060055,
"learning_rate": 3.621150104328407e-06,
"loss": 0.1248,
"step": 298
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.0701427010593552,
"learning_rate": 3.5913372157928515e-06,
"loss": 0.1224,
"step": 299
},
{
"epoch": 1.1857707509881423,
"grad_norm": 1.1848224795154656,
"learning_rate": 3.5615786278234443e-06,
"loss": 0.1491,
"step": 300
},
{
"epoch": 1.1897233201581028,
"grad_norm": 1.1762535991099512,
"learning_rate": 3.5318754875422588e-06,
"loss": 0.1347,
"step": 301
},
{
"epoch": 1.1936758893280632,
"grad_norm": 1.0206443853029172,
"learning_rate": 3.5022289399339933e-06,
"loss": 0.1022,
"step": 302
},
{
"epoch": 1.1976284584980237,
"grad_norm": 1.0357149112196293,
"learning_rate": 3.4726401278018353e-06,
"loss": 0.1134,
"step": 303
},
{
"epoch": 1.2015810276679841,
"grad_norm": 1.156607546118926,
"learning_rate": 3.443110191723407e-06,
"loss": 0.127,
"step": 304
},
{
"epoch": 1.2055335968379446,
"grad_norm": 1.1452610773145862,
"learning_rate": 3.4136402700068034e-06,
"loss": 0.1242,
"step": 305
},
{
"epoch": 1.2094861660079053,
"grad_norm": 1.137660713683979,
"learning_rate": 3.384231498646706e-06,
"loss": 0.1302,
"step": 306
},
{
"epoch": 1.2134387351778657,
"grad_norm": 1.3751968299311201,
"learning_rate": 3.3548850112805985e-06,
"loss": 0.1345,
"step": 307
},
{
"epoch": 1.2173913043478262,
"grad_norm": 1.1820163169421647,
"learning_rate": 3.3256019391450696e-06,
"loss": 0.124,
"step": 308
},
{
"epoch": 1.2213438735177866,
"grad_norm": 1.254836523224082,
"learning_rate": 3.296383411032198e-06,
"loss": 0.1592,
"step": 309
},
{
"epoch": 1.225296442687747,
"grad_norm": 1.2092831479934754,
"learning_rate": 3.267230553246047e-06,
"loss": 0.1374,
"step": 310
},
{
"epoch": 1.2292490118577075,
"grad_norm": 1.3298953359466184,
"learning_rate": 3.2381444895592483e-06,
"loss": 0.1599,
"step": 311
},
{
"epoch": 1.233201581027668,
"grad_norm": 1.15714090782168,
"learning_rate": 3.209126341169681e-06,
"loss": 0.1178,
"step": 312
},
{
"epoch": 1.2371541501976284,
"grad_norm": 1.1321197195627186,
"learning_rate": 3.180177226657251e-06,
"loss": 0.1324,
"step": 313
},
{
"epoch": 1.2411067193675889,
"grad_norm": 1.1427780558349572,
"learning_rate": 3.151298261940775e-06,
"loss": 0.1354,
"step": 314
},
{
"epoch": 1.2450592885375493,
"grad_norm": 1.1792578676088221,
"learning_rate": 3.122490560234964e-06,
"loss": 0.1316,
"step": 315
},
{
"epoch": 1.2490118577075098,
"grad_norm": 1.173771729377104,
"learning_rate": 3.0937552320075116e-06,
"loss": 0.1237,
"step": 316
},
{
"epoch": 1.2529644268774702,
"grad_norm": 1.2142503247487184,
"learning_rate": 3.065093384936285e-06,
"loss": 0.1385,
"step": 317
},
{
"epoch": 1.256916996047431,
"grad_norm": 1.2169127173091283,
"learning_rate": 3.0365061238666336e-06,
"loss": 0.1442,
"step": 318
},
{
"epoch": 1.2608695652173914,
"grad_norm": 1.3055869261396447,
"learning_rate": 3.007994550768793e-06,
"loss": 0.136,
"step": 319
},
{
"epoch": 1.2648221343873518,
"grad_norm": 1.2043193143210125,
"learning_rate": 2.979559764695409e-06,
"loss": 0.1447,
"step": 320
},
{
"epoch": 1.2687747035573123,
"grad_norm": 1.1233412613298097,
"learning_rate": 2.951202861739173e-06,
"loss": 0.1282,
"step": 321
},
{
"epoch": 1.2727272727272727,
"grad_norm": 1.0606810788891305,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.1212,
"step": 322
},
{
"epoch": 1.2766798418972332,
"grad_norm": 1.0460756961226252,
"learning_rate": 2.8947270744957385e-06,
"loss": 0.1173,
"step": 323
},
{
"epoch": 1.2806324110671936,
"grad_norm": 1.1897624975044134,
"learning_rate": 2.8666103672144597e-06,
"loss": 0.1407,
"step": 324
},
{
"epoch": 1.2845849802371543,
"grad_norm": 1.1942264596916479,
"learning_rate": 2.8385758969782507e-06,
"loss": 0.1286,
"step": 325
},
{
"epoch": 1.2885375494071147,
"grad_norm": 1.1341245622744474,
"learning_rate": 2.810624744448588e-06,
"loss": 0.128,
"step": 326
},
{
"epoch": 1.2924901185770752,
"grad_norm": 1.165964008709128,
"learning_rate": 2.7827579870752542e-06,
"loss": 0.136,
"step": 327
},
{
"epoch": 1.2964426877470356,
"grad_norm": 1.1358629236681395,
"learning_rate": 2.7549766990547973e-06,
"loss": 0.1238,
"step": 328
},
{
"epoch": 1.300395256916996,
"grad_norm": 1.144473644830131,
"learning_rate": 2.727281951289128e-06,
"loss": 0.1333,
"step": 329
},
{
"epoch": 1.3043478260869565,
"grad_norm": 1.1761434599454856,
"learning_rate": 2.6996748113442397e-06,
"loss": 0.1223,
"step": 330
},
{
"epoch": 1.308300395256917,
"grad_norm": 1.1788613585620644,
"learning_rate": 2.672156343409053e-06,
"loss": 0.1349,
"step": 331
},
{
"epoch": 1.3122529644268774,
"grad_norm": 1.1047408820558797,
"learning_rate": 2.644727608254396e-06,
"loss": 0.1215,
"step": 332
},
{
"epoch": 1.316205533596838,
"grad_norm": 1.1915355492120698,
"learning_rate": 2.6173896631921134e-06,
"loss": 0.1405,
"step": 333
},
{
"epoch": 1.3201581027667983,
"grad_norm": 1.1396002338772575,
"learning_rate": 2.590143562034312e-06,
"loss": 0.1297,
"step": 334
},
{
"epoch": 1.3241106719367588,
"grad_norm": 0.9928956463128177,
"learning_rate": 2.5629903550527343e-06,
"loss": 0.1032,
"step": 335
},
{
"epoch": 1.3280632411067192,
"grad_norm": 1.1497535091692899,
"learning_rate": 2.535931088938274e-06,
"loss": 0.1342,
"step": 336
},
{
"epoch": 1.33201581027668,
"grad_norm": 1.1209952848082338,
"learning_rate": 2.5089668067606365e-06,
"loss": 0.1185,
"step": 337
},
{
"epoch": 1.3359683794466404,
"grad_norm": 1.0839706555945572,
"learning_rate": 2.4820985479281184e-06,
"loss": 0.1139,
"step": 338
},
{
"epoch": 1.3399209486166008,
"grad_norm": 1.1440761155145645,
"learning_rate": 2.45532734814755e-06,
"loss": 0.134,
"step": 339
},
{
"epoch": 1.3438735177865613,
"grad_norm": 1.082616882918915,
"learning_rate": 2.4286542393843665e-06,
"loss": 0.1281,
"step": 340
},
{
"epoch": 1.3478260869565217,
"grad_norm": 1.17500675790559,
"learning_rate": 2.4020802498228333e-06,
"loss": 0.1432,
"step": 341
},
{
"epoch": 1.3517786561264822,
"grad_norm": 1.1182114019109513,
"learning_rate": 2.3756064038264033e-06,
"loss": 0.134,
"step": 342
},
{
"epoch": 1.3557312252964426,
"grad_norm": 1.1566712806713801,
"learning_rate": 2.3492337218982396e-06,
"loss": 0.1285,
"step": 343
},
{
"epoch": 1.359683794466403,
"grad_norm": 1.0164245424014764,
"learning_rate": 2.3229632206418727e-06,
"loss": 0.1055,
"step": 344
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.1202273103032814,
"learning_rate": 2.296795912722014e-06,
"loss": 0.1164,
"step": 345
},
{
"epoch": 1.3675889328063242,
"grad_norm": 1.2025482829810825,
"learning_rate": 2.270732806825517e-06,
"loss": 0.1413,
"step": 346
},
{
"epoch": 1.3715415019762847,
"grad_norm": 1.1369886395664646,
"learning_rate": 2.244774907622504e-06,
"loss": 0.1312,
"step": 347
},
{
"epoch": 1.3754940711462451,
"grad_norm": 1.1150013437618065,
"learning_rate": 2.2189232157276247e-06,
"loss": 0.1379,
"step": 348
},
{
"epoch": 1.3794466403162056,
"grad_norm": 1.086272977870767,
"learning_rate": 2.1931787276614968e-06,
"loss": 0.1178,
"step": 349
},
{
"epoch": 1.383399209486166,
"grad_norm": 1.1640308203458913,
"learning_rate": 2.167542435812286e-06,
"loss": 0.1356,
"step": 350
},
{
"epoch": 1.3873517786561265,
"grad_norm": 1.1121490307629227,
"learning_rate": 2.142015328397454e-06,
"loss": 0.1237,
"step": 351
},
{
"epoch": 1.391304347826087,
"grad_norm": 1.1574528436812335,
"learning_rate": 2.1165983894256647e-06,
"loss": 0.1255,
"step": 352
},
{
"epoch": 1.3952569169960474,
"grad_norm": 1.361980674009939,
"learning_rate": 2.0912925986588547e-06,
"loss": 0.1389,
"step": 353
},
{
"epoch": 1.3992094861660078,
"grad_norm": 1.051150691395027,
"learning_rate": 2.0660989315744624e-06,
"loss": 0.1168,
"step": 354
},
{
"epoch": 1.4031620553359683,
"grad_norm": 1.1382181091816,
"learning_rate": 2.0410183593278287e-06,
"loss": 0.1367,
"step": 355
},
{
"epoch": 1.4071146245059287,
"grad_norm": 1.127194102052322,
"learning_rate": 2.016051848714758e-06,
"loss": 0.131,
"step": 356
},
{
"epoch": 1.4110671936758894,
"grad_norm": 1.3646197569162608,
"learning_rate": 1.991200362134258e-06,
"loss": 0.1668,
"step": 357
},
{
"epoch": 1.4150197628458498,
"grad_norm": 1.179231150635698,
"learning_rate": 1.9664648575514316e-06,
"loss": 0.1309,
"step": 358
},
{
"epoch": 1.4189723320158103,
"grad_norm": 1.1478284079295227,
"learning_rate": 1.9418462884605555e-06,
"loss": 0.1407,
"step": 359
},
{
"epoch": 1.4229249011857708,
"grad_norm": 0.966600715786351,
"learning_rate": 1.9173456038483244e-06,
"loss": 0.0989,
"step": 360
},
{
"epoch": 1.4268774703557312,
"grad_norm": 1.089318761705495,
"learning_rate": 1.8929637481572715e-06,
"loss": 0.1233,
"step": 361
},
{
"epoch": 1.4308300395256917,
"grad_norm": 1.1956595892466026,
"learning_rate": 1.8687016612493542e-06,
"loss": 0.1266,
"step": 362
},
{
"epoch": 1.434782608695652,
"grad_norm": 1.199815800783593,
"learning_rate": 1.8445602783697375e-06,
"loss": 0.1383,
"step": 363
},
{
"epoch": 1.4387351778656128,
"grad_norm": 1.0596031990819328,
"learning_rate": 1.8205405301107343e-06,
"loss": 0.1183,
"step": 364
},
{
"epoch": 1.4426877470355732,
"grad_norm": 1.1937639817958505,
"learning_rate": 1.7966433423759327e-06,
"loss": 0.1452,
"step": 365
},
{
"epoch": 1.4466403162055337,
"grad_norm": 1.0906265845240057,
"learning_rate": 1.772869636344512e-06,
"loss": 0.111,
"step": 366
},
{
"epoch": 1.4505928853754941,
"grad_norm": 1.2231633736793504,
"learning_rate": 1.7492203284357245e-06,
"loss": 0.1362,
"step": 367
},
{
"epoch": 1.4545454545454546,
"grad_norm": 1.0619383271235998,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.1202,
"step": 368
},
{
"epoch": 1.458498023715415,
"grad_norm": 1.0900241430901978,
"learning_rate": 1.702298548651678e-06,
"loss": 0.1162,
"step": 369
},
{
"epoch": 1.4624505928853755,
"grad_norm": 1.1512284006660594,
"learning_rate": 1.6790278854983033e-06,
"loss": 0.1322,
"step": 370
},
{
"epoch": 1.466403162055336,
"grad_norm": 1.1329200558876649,
"learning_rate": 1.6558852378416113e-06,
"loss": 0.1362,
"step": 371
},
{
"epoch": 1.4703557312252964,
"grad_norm": 1.0410869979854105,
"learning_rate": 1.6328714977750698e-06,
"loss": 0.1239,
"step": 372
},
{
"epoch": 1.4743083003952568,
"grad_norm": 1.2545052203915814,
"learning_rate": 1.6099875524230707e-06,
"loss": 0.1476,
"step": 373
},
{
"epoch": 1.4782608695652173,
"grad_norm": 1.2003692839751088,
"learning_rate": 1.5872342839067305e-06,
"loss": 0.1381,
"step": 374
},
{
"epoch": 1.4822134387351777,
"grad_norm": 1.1739098836348705,
"learning_rate": 1.5646125693098863e-06,
"loss": 0.1295,
"step": 375
},
{
"epoch": 1.4861660079051384,
"grad_norm": 1.1535530949440382,
"learning_rate": 1.542123280645292e-06,
"loss": 0.1234,
"step": 376
},
{
"epoch": 1.4901185770750989,
"grad_norm": 1.1168208484387032,
"learning_rate": 1.519767284820996e-06,
"loss": 0.1152,
"step": 377
},
{
"epoch": 1.4940711462450593,
"grad_norm": 1.0340813124452886,
"learning_rate": 1.4975454436069292e-06,
"loss": 0.1114,
"step": 378
},
{
"epoch": 1.4980237154150198,
"grad_norm": 1.117157676559404,
"learning_rate": 1.4754586136016841e-06,
"loss": 0.127,
"step": 379
},
{
"epoch": 1.5019762845849802,
"grad_norm": 1.1411823176863023,
"learning_rate": 1.4535076461994974e-06,
"loss": 0.1243,
"step": 380
},
{
"epoch": 1.5059288537549407,
"grad_norm": 1.0074369184939824,
"learning_rate": 1.431693387557424e-06,
"loss": 0.1007,
"step": 381
},
{
"epoch": 1.5098814229249014,
"grad_norm": 1.1674952879116636,
"learning_rate": 1.4100166785627301e-06,
"loss": 0.1201,
"step": 382
},
{
"epoch": 1.5138339920948618,
"grad_norm": 1.0610397050784564,
"learning_rate": 1.3884783548004704e-06,
"loss": 0.1208,
"step": 383
},
{
"epoch": 1.5177865612648223,
"grad_norm": 1.1703142856070294,
"learning_rate": 1.3670792465212828e-06,
"loss": 0.131,
"step": 384
},
{
"epoch": 1.5217391304347827,
"grad_norm": 1.24844816636085,
"learning_rate": 1.3458201786093795e-06,
"loss": 0.1426,
"step": 385
},
{
"epoch": 1.5256916996047432,
"grad_norm": 1.0289708643260822,
"learning_rate": 1.3247019705507596e-06,
"loss": 0.107,
"step": 386
},
{
"epoch": 1.5296442687747036,
"grad_norm": 1.0411982481948332,
"learning_rate": 1.3037254364016068e-06,
"loss": 0.1106,
"step": 387
},
{
"epoch": 1.533596837944664,
"grad_norm": 1.0887512215886415,
"learning_rate": 1.2828913847569185e-06,
"loss": 0.126,
"step": 388
},
{
"epoch": 1.5375494071146245,
"grad_norm": 1.1038917846986824,
"learning_rate": 1.2622006187193348e-06,
"loss": 0.1169,
"step": 389
},
{
"epoch": 1.541501976284585,
"grad_norm": 1.1650275815326414,
"learning_rate": 1.2416539358681772e-06,
"loss": 0.1243,
"step": 390
},
{
"epoch": 1.5454545454545454,
"grad_norm": 1.126326384634314,
"learning_rate": 1.2212521282287093e-06,
"loss": 0.129,
"step": 391
},
{
"epoch": 1.5494071146245059,
"grad_norm": 1.129180074421009,
"learning_rate": 1.2009959822416012e-06,
"loss": 0.1382,
"step": 392
},
{
"epoch": 1.5533596837944663,
"grad_norm": 1.164913976767265,
"learning_rate": 1.1808862787326176e-06,
"loss": 0.1217,
"step": 393
},
{
"epoch": 1.5573122529644268,
"grad_norm": 1.1842645447436517,
"learning_rate": 1.1609237928825174e-06,
"loss": 0.1392,
"step": 394
},
{
"epoch": 1.5612648221343872,
"grad_norm": 1.127758214888184,
"learning_rate": 1.1411092941971702e-06,
"loss": 0.1334,
"step": 395
},
{
"epoch": 1.5652173913043477,
"grad_norm": 1.0268028668010976,
"learning_rate": 1.1214435464779006e-06,
"loss": 0.1092,
"step": 396
},
{
"epoch": 1.5691699604743083,
"grad_norm": 1.1298748698800432,
"learning_rate": 1.1019273077920366e-06,
"loss": 0.1199,
"step": 397
},
{
"epoch": 1.5731225296442688,
"grad_norm": 1.151983299642192,
"learning_rate": 1.0825613304436938e-06,
"loss": 0.1214,
"step": 398
},
{
"epoch": 1.5770750988142292,
"grad_norm": 1.056563475272345,
"learning_rate": 1.0633463609447753e-06,
"loss": 0.1137,
"step": 399
},
{
"epoch": 1.5810276679841897,
"grad_norm": 1.0401127604187286,
"learning_rate": 1.0442831399861903e-06,
"loss": 0.1116,
"step": 400
},
{
"epoch": 1.5849802371541502,
"grad_norm": 1.137297843112467,
"learning_rate": 1.0253724024093103e-06,
"loss": 0.1259,
"step": 401
},
{
"epoch": 1.5889328063241108,
"grad_norm": 1.2731646482391143,
"learning_rate": 1.006614877177638e-06,
"loss": 0.1398,
"step": 402
},
{
"epoch": 1.5928853754940713,
"grad_norm": 1.2031755255565708,
"learning_rate": 9.880112873487068e-07,
"loss": 0.1435,
"step": 403
},
{
"epoch": 1.5968379446640317,
"grad_norm": 1.0673733506453118,
"learning_rate": 9.695623500462114e-07,
"loss": 0.1213,
"step": 404
},
{
"epoch": 1.6007905138339922,
"grad_norm": 1.1540390417696156,
"learning_rate": 9.512687764323647e-07,
"loss": 0.1309,
"step": 405
},
{
"epoch": 1.6047430830039526,
"grad_norm": 1.1719558092176683,
"learning_rate": 9.331312716804791e-07,
"loss": 0.1353,
"step": 406
},
{
"epoch": 1.608695652173913,
"grad_norm": 1.1328079202728492,
"learning_rate": 9.151505349477901e-07,
"loss": 0.1126,
"step": 407
},
{
"epoch": 1.6126482213438735,
"grad_norm": 1.1668524122475088,
"learning_rate": 8.973272593485011e-07,
"loss": 0.1232,
"step": 408
},
{
"epoch": 1.616600790513834,
"grad_norm": 1.1765999211223719,
"learning_rate": 8.796621319270676e-07,
"loss": 0.1381,
"step": 409
},
{
"epoch": 1.6205533596837944,
"grad_norm": 1.1580704897137293,
"learning_rate": 8.621558336317132e-07,
"loss": 0.125,
"step": 410
},
{
"epoch": 1.6245059288537549,
"grad_norm": 1.2221405236185294,
"learning_rate": 8.448090392881797e-07,
"loss": 0.1434,
"step": 411
},
{
"epoch": 1.6284584980237153,
"grad_norm": 1.1484711907465028,
"learning_rate": 8.276224175737152e-07,
"loss": 0.1341,
"step": 412
},
{
"epoch": 1.6324110671936758,
"grad_norm": 1.0668615052922243,
"learning_rate": 8.105966309912966e-07,
"loss": 0.119,
"step": 413
},
{
"epoch": 1.6363636363636362,
"grad_norm": 1.1708621908812695,
"learning_rate": 7.937323358440935e-07,
"loss": 0.1365,
"step": 414
},
{
"epoch": 1.6403162055335967,
"grad_norm": 1.0037003731577279,
"learning_rate": 7.770301822101712e-07,
"loss": 0.1056,
"step": 415
},
{
"epoch": 1.6442687747035574,
"grad_norm": 1.0648681490157104,
"learning_rate": 7.604908139174255e-07,
"loss": 0.1098,
"step": 416
},
{
"epoch": 1.6482213438735178,
"grad_norm": 1.1995736066849585,
"learning_rate": 7.441148685187694e-07,
"loss": 0.1341,
"step": 417
},
{
"epoch": 1.6521739130434783,
"grad_norm": 1.1848136469443982,
"learning_rate": 7.279029772675572e-07,
"loss": 0.1336,
"step": 418
},
{
"epoch": 1.6561264822134387,
"grad_norm": 1.1795612477078212,
"learning_rate": 7.11855765093249e-07,
"loss": 0.1379,
"step": 419
},
{
"epoch": 1.6600790513833992,
"grad_norm": 1.244369267153096,
"learning_rate": 6.959738505773211e-07,
"loss": 0.1373,
"step": 420
},
{
"epoch": 1.6640316205533598,
"grad_norm": 1.148156272898155,
"learning_rate": 6.802578459294235e-07,
"loss": 0.1274,
"step": 421
},
{
"epoch": 1.6679841897233203,
"grad_norm": 1.1618407598954121,
"learning_rate": 6.647083569637797e-07,
"loss": 0.1208,
"step": 422
},
{
"epoch": 1.6719367588932808,
"grad_norm": 1.1228370521334565,
"learning_rate": 6.493259830758325e-07,
"loss": 0.1181,
"step": 423
},
{
"epoch": 1.6758893280632412,
"grad_norm": 1.1659039027502425,
"learning_rate": 6.341113172191399e-07,
"loss": 0.1251,
"step": 424
},
{
"epoch": 1.6798418972332017,
"grad_norm": 1.1667891678872329,
"learning_rate": 6.190649458825204e-07,
"loss": 0.1373,
"step": 425
},
{
"epoch": 1.683794466403162,
"grad_norm": 1.1054656728972119,
"learning_rate": 6.041874490674416e-07,
"loss": 0.1183,
"step": 426
},
{
"epoch": 1.6877470355731226,
"grad_norm": 1.1318157363661536,
"learning_rate": 5.894794002656628e-07,
"loss": 0.1276,
"step": 427
},
{
"epoch": 1.691699604743083,
"grad_norm": 1.1427617467212088,
"learning_rate": 5.749413664371312e-07,
"loss": 0.1314,
"step": 428
},
{
"epoch": 1.6956521739130435,
"grad_norm": 1.123370168432176,
"learning_rate": 5.60573907988124e-07,
"loss": 0.1172,
"step": 429
},
{
"epoch": 1.699604743083004,
"grad_norm": 1.2640707026191205,
"learning_rate": 5.463775787496484e-07,
"loss": 0.1345,
"step": 430
},
{
"epoch": 1.7035573122529644,
"grad_norm": 1.2270465900901621,
"learning_rate": 5.323529259560911e-07,
"loss": 0.1337,
"step": 431
},
{
"epoch": 1.7075098814229248,
"grad_norm": 1.0714382150531283,
"learning_rate": 5.185004902241241e-07,
"loss": 0.1125,
"step": 432
},
{
"epoch": 1.7114624505928853,
"grad_norm": 1.0825327963083162,
"learning_rate": 5.04820805531866e-07,
"loss": 0.1059,
"step": 433
},
{
"epoch": 1.7154150197628457,
"grad_norm": 1.0596003222945678,
"learning_rate": 4.91314399198296e-07,
"loss": 0.0999,
"step": 434
},
{
"epoch": 1.7193675889328062,
"grad_norm": 1.1990608827420846,
"learning_rate": 4.779817918629326e-07,
"loss": 0.1257,
"step": 435
},
{
"epoch": 1.7233201581027668,
"grad_norm": 1.0887979080170394,
"learning_rate": 4.6482349746575783e-07,
"loss": 0.1236,
"step": 436
},
{
"epoch": 1.7272727272727273,
"grad_norm": 1.0412689170684666,
"learning_rate": 4.5184002322740784e-07,
"loss": 0.1066,
"step": 437
},
{
"epoch": 1.7312252964426877,
"grad_norm": 1.0802724381125226,
"learning_rate": 4.390318696296247e-07,
"loss": 0.1149,
"step": 438
},
{
"epoch": 1.7351778656126482,
"grad_norm": 1.1551936194230803,
"learning_rate": 4.2639953039595725e-07,
"loss": 0.135,
"step": 439
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.2135177448970054,
"learning_rate": 4.139434924727359e-07,
"loss": 0.1349,
"step": 440
},
{
"epoch": 1.7430830039525693,
"grad_norm": 1.0507122616854452,
"learning_rate": 4.0166423601029735e-07,
"loss": 0.1157,
"step": 441
},
{
"epoch": 1.7470355731225298,
"grad_norm": 1.1753401932840621,
"learning_rate": 3.8956223434447936e-07,
"loss": 0.136,
"step": 442
},
{
"epoch": 1.7509881422924902,
"grad_norm": 1.1404040824302084,
"learning_rate": 3.776379539783709e-07,
"loss": 0.1223,
"step": 443
},
{
"epoch": 1.7549407114624507,
"grad_norm": 1.2777386537799145,
"learning_rate": 3.658918545643353e-07,
"loss": 0.1485,
"step": 444
},
{
"epoch": 1.7588932806324111,
"grad_norm": 1.1413712412237624,
"learning_rate": 3.543243888862841e-07,
"loss": 0.1212,
"step": 445
},
{
"epoch": 1.7628458498023716,
"grad_norm": 1.0872960731651602,
"learning_rate": 3.429360028422307e-07,
"loss": 0.1193,
"step": 446
},
{
"epoch": 1.766798418972332,
"grad_norm": 1.0637737003753145,
"learning_rate": 3.317271354270968e-07,
"loss": 0.1133,
"step": 447
},
{
"epoch": 1.7707509881422925,
"grad_norm": 1.1840091720746755,
"learning_rate": 3.2069821871579255e-07,
"loss": 0.1333,
"step": 448
},
{
"epoch": 1.774703557312253,
"grad_norm": 1.1255230443713273,
"learning_rate": 3.098496778465621e-07,
"loss": 0.1181,
"step": 449
},
{
"epoch": 1.7786561264822134,
"grad_norm": 1.0775241906777941,
"learning_rate": 2.991819310045929e-07,
"loss": 0.1189,
"step": 450
},
{
"epoch": 1.7826086956521738,
"grad_norm": 1.0583221665971685,
"learning_rate": 2.88695389405898e-07,
"loss": 0.1103,
"step": 451
},
{
"epoch": 1.7865612648221343,
"grad_norm": 1.1242814725320212,
"learning_rate": 2.783904572814622e-07,
"loss": 0.1285,
"step": 452
},
{
"epoch": 1.7905138339920947,
"grad_norm": 1.1779401797110727,
"learning_rate": 2.682675318616618e-07,
"loss": 0.1294,
"step": 453
},
{
"epoch": 1.7944664031620552,
"grad_norm": 1.0859491545785036,
"learning_rate": 2.583270033609536e-07,
"loss": 0.1123,
"step": 454
},
{
"epoch": 1.7984189723320159,
"grad_norm": 1.0763948870282039,
"learning_rate": 2.4856925496283045e-07,
"loss": 0.1104,
"step": 455
},
{
"epoch": 1.8023715415019763,
"grad_norm": 1.0887108595666282,
"learning_rate": 2.3899466280504936e-07,
"loss": 0.1126,
"step": 456
},
{
"epoch": 1.8063241106719368,
"grad_norm": 1.1717149496208563,
"learning_rate": 2.2960359596513714e-07,
"loss": 0.1351,
"step": 457
},
{
"epoch": 1.8102766798418972,
"grad_norm": 1.229561030964994,
"learning_rate": 2.203964164461597e-07,
"loss": 0.1369,
"step": 458
},
{
"epoch": 1.8142292490118577,
"grad_norm": 1.1633472228135062,
"learning_rate": 2.113734791627664e-07,
"loss": 0.1291,
"step": 459
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.2035274009151151,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.1469,
"step": 460
},
{
"epoch": 1.8221343873517788,
"grad_norm": 1.1836270092696015,
"learning_rate": 1.9388171543745394e-07,
"loss": 0.1379,
"step": 461
},
{
"epoch": 1.8260869565217392,
"grad_norm": 1.0758870256725546,
"learning_rate": 1.8541356326100436e-07,
"loss": 0.1117,
"step": 462
},
{
"epoch": 1.8300395256916997,
"grad_norm": 1.1793736035903457,
"learning_rate": 1.7713100182508604e-07,
"loss": 0.1398,
"step": 463
},
{
"epoch": 1.8339920948616601,
"grad_norm": 1.0801974661120426,
"learning_rate": 1.6903435040254545e-07,
"loss": 0.1128,
"step": 464
},
{
"epoch": 1.8379446640316206,
"grad_norm": 1.07339663507577,
"learning_rate": 1.6112392109984386e-07,
"loss": 0.1099,
"step": 465
},
{
"epoch": 1.841897233201581,
"grad_norm": 1.1120046775263792,
"learning_rate": 1.5340001884502577e-07,
"loss": 0.1262,
"step": 466
},
{
"epoch": 1.8458498023715415,
"grad_norm": 1.0831243721534856,
"learning_rate": 1.4586294137596768e-07,
"loss": 0.1193,
"step": 467
},
{
"epoch": 1.849802371541502,
"grad_norm": 1.0472977237699574,
"learning_rate": 1.385129792288986e-07,
"loss": 0.1041,
"step": 468
},
{
"epoch": 1.8537549407114624,
"grad_norm": 1.1482729228107225,
"learning_rate": 1.313504157272022e-07,
"loss": 0.1301,
"step": 469
},
{
"epoch": 1.8577075098814229,
"grad_norm": 1.1397420372115987,
"learning_rate": 1.2437552697049327e-07,
"loss": 0.1289,
"step": 470
},
{
"epoch": 1.8616600790513833,
"grad_norm": 1.1569650321244938,
"learning_rate": 1.1758858182397692e-07,
"loss": 0.1283,
"step": 471
},
{
"epoch": 1.8656126482213438,
"grad_norm": 1.1694788716855629,
"learning_rate": 1.1098984190808403e-07,
"loss": 0.1335,
"step": 472
},
{
"epoch": 1.8695652173913042,
"grad_norm": 1.0086960578539177,
"learning_rate": 1.0457956158838545e-07,
"loss": 0.0996,
"step": 473
},
{
"epoch": 1.8735177865612647,
"grad_norm": 1.0308075869001883,
"learning_rate": 9.835798796578755e-08,
"loss": 0.1153,
"step": 474
},
{
"epoch": 1.8774703557312253,
"grad_norm": 1.170279753009744,
"learning_rate": 9.232536086700605e-08,
"loss": 0.1304,
"step": 475
},
{
"epoch": 1.8814229249011858,
"grad_norm": 1.0870287190560486,
"learning_rate": 8.648191283532337e-08,
"loss": 0.1213,
"step": 476
},
{
"epoch": 1.8853754940711462,
"grad_norm": 1.1165962048020976,
"learning_rate": 8.082786912162243e-08,
"loss": 0.1161,
"step": 477
},
{
"epoch": 1.8893280632411067,
"grad_norm": 1.1494916541958282,
"learning_rate": 7.536344767570536e-08,
"loss": 0.1141,
"step": 478
},
{
"epoch": 1.8932806324110671,
"grad_norm": 1.0640073229737435,
"learning_rate": 7.008885913789066e-08,
"loss": 0.1228,
"step": 479
},
{
"epoch": 1.8972332015810278,
"grad_norm": 1.1870546140040668,
"learning_rate": 6.500430683089532e-08,
"loss": 0.1303,
"step": 480
},
{
"epoch": 1.9011857707509883,
"grad_norm": 1.173615256902204,
"learning_rate": 6.010998675199554e-08,
"loss": 0.1338,
"step": 481
},
{
"epoch": 1.9051383399209487,
"grad_norm": 1.1958445869792897,
"learning_rate": 5.5406087565471054e-08,
"loss": 0.1405,
"step": 482
},
{
"epoch": 1.9090909090909092,
"grad_norm": 1.1122683154170514,
"learning_rate": 5.089279059533658e-08,
"loss": 0.1281,
"step": 483
},
{
"epoch": 1.9130434782608696,
"grad_norm": 1.0958024070105987,
"learning_rate": 4.657026981834623e-08,
"loss": 0.12,
"step": 484
},
{
"epoch": 1.91699604743083,
"grad_norm": 1.1024888588089012,
"learning_rate": 4.2438691857292215e-08,
"loss": 0.1144,
"step": 485
},
{
"epoch": 1.9209486166007905,
"grad_norm": 1.123264444499075,
"learning_rate": 3.849821597457892e-08,
"loss": 0.1347,
"step": 486
},
{
"epoch": 1.924901185770751,
"grad_norm": 1.0765658056683685,
"learning_rate": 3.474899406608501e-08,
"loss": 0.1129,
"step": 487
},
{
"epoch": 1.9288537549407114,
"grad_norm": 1.148979953908725,
"learning_rate": 3.119117065530808e-08,
"loss": 0.123,
"step": 488
},
{
"epoch": 1.9328063241106719,
"grad_norm": 1.2501363554379483,
"learning_rate": 2.7824882887793058e-08,
"loss": 0.1538,
"step": 489
},
{
"epoch": 1.9367588932806323,
"grad_norm": 1.196112067372065,
"learning_rate": 2.4650260525846404e-08,
"loss": 0.1416,
"step": 490
},
{
"epoch": 1.9407114624505928,
"grad_norm": 1.1128332896345188,
"learning_rate": 2.1667425943532884e-08,
"loss": 0.1147,
"step": 491
},
{
"epoch": 1.9446640316205532,
"grad_norm": 1.1016376226716715,
"learning_rate": 1.8876494121959908e-08,
"loss": 0.1289,
"step": 492
},
{
"epoch": 1.9486166007905137,
"grad_norm": 1.135611367054567,
"learning_rate": 1.627757264484442e-08,
"loss": 0.1294,
"step": 493
},
{
"epoch": 1.9525691699604744,
"grad_norm": 1.1398251434326172,
"learning_rate": 1.387076169436563e-08,
"loss": 0.1262,
"step": 494
},
{
"epoch": 1.9565217391304348,
"grad_norm": 1.0846986101599616,
"learning_rate": 1.1656154047303691e-08,
"loss": 0.1126,
"step": 495
},
{
"epoch": 1.9604743083003953,
"grad_norm": 1.2370763808268233,
"learning_rate": 9.633835071463094e-09,
"loss": 0.1485,
"step": 496
},
{
"epoch": 1.9644268774703557,
"grad_norm": 1.1863239612132057,
"learning_rate": 7.803882722381417e-09,
"loss": 0.1293,
"step": 497
},
{
"epoch": 1.9683794466403162,
"grad_norm": 1.2282535432480546,
"learning_rate": 6.166367540325624e-09,
"loss": 0.1428,
"step": 498
},
{
"epoch": 1.9723320158102768,
"grad_norm": 1.0276030484998475,
"learning_rate": 4.721352647572564e-09,
"loss": 0.1129,
"step": 499
},
{
"epoch": 1.9762845849802373,
"grad_norm": 1.2463780542335487,
"learning_rate": 3.4688937459737004e-09,
"loss": 0.1394,
"step": 500
},
{
"epoch": 1.9762845849802373,
"eval_loss": 0.16894643008708954,
"eval_runtime": 3.7534,
"eval_samples_per_second": 5.595,
"eval_steps_per_second": 1.599,
"step": 500
},
{
"epoch": 1.9802371541501977,
"grad_norm": 1.149104137588847,
"learning_rate": 2.4090391148112734e-09,
"loss": 0.126,
"step": 501
},
{
"epoch": 1.9841897233201582,
"grad_norm": 1.0644722994856588,
"learning_rate": 1.5418296089358964e-09,
"loss": 0.1161,
"step": 502
},
{
"epoch": 1.9881422924901186,
"grad_norm": 1.1078146584096278,
"learning_rate": 8.672986571894859e-10,
"loss": 0.1159,
"step": 503
},
{
"epoch": 1.992094861660079,
"grad_norm": 1.0306240076981967,
"learning_rate": 3.854722611201789e-10,
"loss": 0.1143,
"step": 504
},
{
"epoch": 1.9960474308300395,
"grad_norm": 1.150741799871838,
"learning_rate": 9.636899397813537e-11,
"loss": 0.1237,
"step": 505
},
{
"epoch": 2.0,
"grad_norm": 1.0328104166457073,
"learning_rate": 0.0,
"loss": 0.1028,
"step": 506
},
{
"epoch": 2.0,
"step": 506,
"total_flos": 6736634707968.0,
"train_loss": 0.1787011340083812,
"train_runtime": 1032.8943,
"train_samples_per_second": 3.913,
"train_steps_per_second": 0.49
}
],
"logging_steps": 1,
"max_steps": 506,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 70000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6736634707968.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}