|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 506, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003952569169960474, |
|
"grad_norm": 16.020900110158806, |
|
"learning_rate": 9.999903631006022e-06, |
|
"loss": 1.0657, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007905138339920948, |
|
"grad_norm": 9.896813279978835, |
|
"learning_rate": 9.999614527738882e-06, |
|
"loss": 0.8817, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.011857707509881422, |
|
"grad_norm": 4.710814039433313, |
|
"learning_rate": 9.99913270134281e-06, |
|
"loss": 0.6038, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.015810276679841896, |
|
"grad_norm": 3.213510727342975, |
|
"learning_rate": 9.998458170391065e-06, |
|
"loss": 0.5555, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.019762845849802372, |
|
"grad_norm": 3.399601233005852, |
|
"learning_rate": 9.99759096088519e-06, |
|
"loss": 0.4486, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.023715415019762844, |
|
"grad_norm": 3.2249204890399112, |
|
"learning_rate": 9.996531106254027e-06, |
|
"loss": 0.4616, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02766798418972332, |
|
"grad_norm": 2.5076762498803196, |
|
"learning_rate": 9.995278647352428e-06, |
|
"loss": 0.4705, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03162055335968379, |
|
"grad_norm": 2.865637105499682, |
|
"learning_rate": 9.993833632459675e-06, |
|
"loss": 0.4065, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03557312252964427, |
|
"grad_norm": 2.182471312496633, |
|
"learning_rate": 9.99219611727762e-06, |
|
"loss": 0.3791, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.039525691699604744, |
|
"grad_norm": 1.8817059063630837, |
|
"learning_rate": 9.990366164928538e-06, |
|
"loss": 0.3968, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.043478260869565216, |
|
"grad_norm": 1.7437964277471723, |
|
"learning_rate": 9.988343845952697e-06, |
|
"loss": 0.3593, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04743083003952569, |
|
"grad_norm": 1.718975131845364, |
|
"learning_rate": 9.986129238305635e-06, |
|
"loss": 0.3241, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05138339920948617, |
|
"grad_norm": 1.7519764781357152, |
|
"learning_rate": 9.983722427355157e-06, |
|
"loss": 0.3514, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.05533596837944664, |
|
"grad_norm": 1.8585607479109172, |
|
"learning_rate": 9.98112350587804e-06, |
|
"loss": 0.3307, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05928853754940711, |
|
"grad_norm": 1.648833246586736, |
|
"learning_rate": 9.978332574056468e-06, |
|
"loss": 0.3382, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06324110671936758, |
|
"grad_norm": 1.5144376829290096, |
|
"learning_rate": 9.975349739474156e-06, |
|
"loss": 0.3025, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06719367588932806, |
|
"grad_norm": 1.4722946130746835, |
|
"learning_rate": 9.972175117112208e-06, |
|
"loss": 0.3291, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.07114624505928854, |
|
"grad_norm": 1.5590402414570852, |
|
"learning_rate": 9.968808829344692e-06, |
|
"loss": 0.318, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07509881422924901, |
|
"grad_norm": 1.529364514445946, |
|
"learning_rate": 9.965251005933915e-06, |
|
"loss": 0.3029, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.07905138339920949, |
|
"grad_norm": 1.5355343280276874, |
|
"learning_rate": 9.961501784025423e-06, |
|
"loss": 0.3075, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08300395256916997, |
|
"grad_norm": 1.4284315641862246, |
|
"learning_rate": 9.95756130814271e-06, |
|
"loss": 0.2733, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.08695652173913043, |
|
"grad_norm": 1.529934818238827, |
|
"learning_rate": 9.953429730181653e-06, |
|
"loss": 0.3319, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09090909090909091, |
|
"grad_norm": 1.5594666788219782, |
|
"learning_rate": 9.949107209404664e-06, |
|
"loss": 0.2998, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.09486166007905138, |
|
"grad_norm": 1.43783031647119, |
|
"learning_rate": 9.94459391243453e-06, |
|
"loss": 0.2856, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.09881422924901186, |
|
"grad_norm": 1.6552583911804946, |
|
"learning_rate": 9.939890013248006e-06, |
|
"loss": 0.3287, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10276679841897234, |
|
"grad_norm": 1.4553289738630346, |
|
"learning_rate": 9.934995693169104e-06, |
|
"loss": 0.2726, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1067193675889328, |
|
"grad_norm": 1.7057032170116595, |
|
"learning_rate": 9.929911140862109e-06, |
|
"loss": 0.3412, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.11067193675889328, |
|
"grad_norm": 1.4032529471627393, |
|
"learning_rate": 9.924636552324296e-06, |
|
"loss": 0.2484, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.11462450592885376, |
|
"grad_norm": 1.6007146129563619, |
|
"learning_rate": 9.919172130878378e-06, |
|
"loss": 0.3265, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.11857707509881422, |
|
"grad_norm": 1.4391496848114302, |
|
"learning_rate": 9.913518087164678e-06, |
|
"loss": 0.285, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1225296442687747, |
|
"grad_norm": 1.4062720696044961, |
|
"learning_rate": 9.907674639132995e-06, |
|
"loss": 0.2548, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.12648221343873517, |
|
"grad_norm": 1.410271063639066, |
|
"learning_rate": 9.901642012034214e-06, |
|
"loss": 0.2508, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.13043478260869565, |
|
"grad_norm": 1.5527348593856665, |
|
"learning_rate": 9.895420438411616e-06, |
|
"loss": 0.2976, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.13438735177865613, |
|
"grad_norm": 1.321537975619614, |
|
"learning_rate": 9.889010158091917e-06, |
|
"loss": 0.2572, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.1383399209486166, |
|
"grad_norm": 1.421996806107464, |
|
"learning_rate": 9.882411418176023e-06, |
|
"loss": 0.2706, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1422924901185771, |
|
"grad_norm": 1.3572216549078873, |
|
"learning_rate": 9.875624473029508e-06, |
|
"loss": 0.2545, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.14624505928853754, |
|
"grad_norm": 1.2631690700776463, |
|
"learning_rate": 9.8686495842728e-06, |
|
"loss": 0.2625, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.15019762845849802, |
|
"grad_norm": 1.2621066835028363, |
|
"learning_rate": 9.861487020771103e-06, |
|
"loss": 0.2591, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.1541501976284585, |
|
"grad_norm": 1.2949235488783764, |
|
"learning_rate": 9.854137058624034e-06, |
|
"loss": 0.2309, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.15810276679841898, |
|
"grad_norm": 1.3003116841403821, |
|
"learning_rate": 9.846599981154975e-06, |
|
"loss": 0.2326, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.16205533596837945, |
|
"grad_norm": 1.3768166039897276, |
|
"learning_rate": 9.838876078900158e-06, |
|
"loss": 0.2397, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.16600790513833993, |
|
"grad_norm": 1.3538215396932514, |
|
"learning_rate": 9.830965649597455e-06, |
|
"loss": 0.252, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.16996047430830039, |
|
"grad_norm": 1.4163162387664199, |
|
"learning_rate": 9.822868998174914e-06, |
|
"loss": 0.2448, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 1.4257156730091618, |
|
"learning_rate": 9.814586436738998e-06, |
|
"loss": 0.2371, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.17786561264822134, |
|
"grad_norm": 1.4437391682465417, |
|
"learning_rate": 9.806118284562547e-06, |
|
"loss": 0.2907, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18181818181818182, |
|
"grad_norm": 1.391249530608691, |
|
"learning_rate": 9.797464868072489e-06, |
|
"loss": 0.2683, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1857707509881423, |
|
"grad_norm": 1.4016200934541518, |
|
"learning_rate": 9.788626520837235e-06, |
|
"loss": 0.2473, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.18972332015810275, |
|
"grad_norm": 1.279328106626107, |
|
"learning_rate": 9.779603583553842e-06, |
|
"loss": 0.2341, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.19367588932806323, |
|
"grad_norm": 1.3404807365252567, |
|
"learning_rate": 9.770396404034863e-06, |
|
"loss": 0.2675, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.1976284584980237, |
|
"grad_norm": 1.4585279577064512, |
|
"learning_rate": 9.76100533719495e-06, |
|
"loss": 0.2685, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2015810276679842, |
|
"grad_norm": 1.4424775598833863, |
|
"learning_rate": 9.75143074503717e-06, |
|
"loss": 0.2311, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.20553359683794467, |
|
"grad_norm": 1.3955125763500507, |
|
"learning_rate": 9.741672996639046e-06, |
|
"loss": 0.2627, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.20948616600790515, |
|
"grad_norm": 1.4097242793234939, |
|
"learning_rate": 9.731732468138338e-06, |
|
"loss": 0.2512, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.2134387351778656, |
|
"grad_norm": 1.2779409261860988, |
|
"learning_rate": 9.72160954271854e-06, |
|
"loss": 0.248, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.21739130434782608, |
|
"grad_norm": 1.3115141733721187, |
|
"learning_rate": 9.711304610594104e-06, |
|
"loss": 0.2548, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.22134387351778656, |
|
"grad_norm": 1.3040054071646883, |
|
"learning_rate": 9.700818068995407e-06, |
|
"loss": 0.2115, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.22529644268774704, |
|
"grad_norm": 1.3228671690231326, |
|
"learning_rate": 9.69015032215344e-06, |
|
"loss": 0.2372, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.22924901185770752, |
|
"grad_norm": 1.2810295113715335, |
|
"learning_rate": 9.679301781284209e-06, |
|
"loss": 0.2586, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.233201581027668, |
|
"grad_norm": 1.2300547258655599, |
|
"learning_rate": 9.668272864572904e-06, |
|
"loss": 0.2122, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.23715415019762845, |
|
"grad_norm": 1.331764793327734, |
|
"learning_rate": 9.65706399715777e-06, |
|
"loss": 0.2294, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24110671936758893, |
|
"grad_norm": 1.245135656410948, |
|
"learning_rate": 9.645675611113715e-06, |
|
"loss": 0.2234, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2450592885375494, |
|
"grad_norm": 1.2821048117267706, |
|
"learning_rate": 9.634108145435665e-06, |
|
"loss": 0.2111, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2490118577075099, |
|
"grad_norm": 1.3953990850010947, |
|
"learning_rate": 9.62236204602163e-06, |
|
"loss": 0.2423, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.25296442687747034, |
|
"grad_norm": 1.3305442952896998, |
|
"learning_rate": 9.610437765655522e-06, |
|
"loss": 0.2169, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.25691699604743085, |
|
"grad_norm": 1.3427264285897607, |
|
"learning_rate": 9.598335763989703e-06, |
|
"loss": 0.2387, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2608695652173913, |
|
"grad_norm": 1.2462397253811373, |
|
"learning_rate": 9.586056507527266e-06, |
|
"loss": 0.2002, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.2648221343873518, |
|
"grad_norm": 1.4106601403861017, |
|
"learning_rate": 9.573600469604044e-06, |
|
"loss": 0.2345, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.26877470355731226, |
|
"grad_norm": 1.4558970695841007, |
|
"learning_rate": 9.560968130370376e-06, |
|
"loss": 0.257, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2727272727272727, |
|
"grad_norm": 1.2869342015693788, |
|
"learning_rate": 9.548159976772593e-06, |
|
"loss": 0.2272, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2766798418972332, |
|
"grad_norm": 1.2457392708337578, |
|
"learning_rate": 9.535176502534242e-06, |
|
"loss": 0.2011, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.28063241106719367, |
|
"grad_norm": 1.2710277485877495, |
|
"learning_rate": 9.522018208137066e-06, |
|
"loss": 0.2198, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2845849802371542, |
|
"grad_norm": 1.4429390388818002, |
|
"learning_rate": 9.508685600801704e-06, |
|
"loss": 0.2277, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2885375494071146, |
|
"grad_norm": 1.2389843654949855, |
|
"learning_rate": 9.495179194468135e-06, |
|
"loss": 0.2269, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2924901185770751, |
|
"grad_norm": 1.2313451113400058, |
|
"learning_rate": 9.481499509775878e-06, |
|
"loss": 0.2229, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2964426877470356, |
|
"grad_norm": 1.2005736870957262, |
|
"learning_rate": 9.467647074043911e-06, |
|
"loss": 0.2132, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.30039525691699603, |
|
"grad_norm": 1.3356666012024943, |
|
"learning_rate": 9.453622421250353e-06, |
|
"loss": 0.2261, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.30434782608695654, |
|
"grad_norm": 1.307750115235254, |
|
"learning_rate": 9.439426092011877e-06, |
|
"loss": 0.2186, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.308300395256917, |
|
"grad_norm": 1.3023411955103703, |
|
"learning_rate": 9.42505863356287e-06, |
|
"loss": 0.2241, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.31225296442687744, |
|
"grad_norm": 1.2340243980259775, |
|
"learning_rate": 9.410520599734338e-06, |
|
"loss": 0.2195, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.31620553359683795, |
|
"grad_norm": 1.2136232872238946, |
|
"learning_rate": 9.395812550932559e-06, |
|
"loss": 0.2105, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3201581027667984, |
|
"grad_norm": 1.3027913426674749, |
|
"learning_rate": 9.38093505411748e-06, |
|
"loss": 0.2112, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3241106719367589, |
|
"grad_norm": 1.2355979390019485, |
|
"learning_rate": 9.365888682780862e-06, |
|
"loss": 0.2041, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.32806324110671936, |
|
"grad_norm": 1.3255611172622812, |
|
"learning_rate": 9.35067401692417e-06, |
|
"loss": 0.2133, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.33201581027667987, |
|
"grad_norm": 1.2110730151497104, |
|
"learning_rate": 9.335291643036221e-06, |
|
"loss": 0.1937, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3359683794466403, |
|
"grad_norm": 1.2302820012512985, |
|
"learning_rate": 9.319742154070578e-06, |
|
"loss": 0.2127, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.33992094861660077, |
|
"grad_norm": 1.396952411441183, |
|
"learning_rate": 9.30402614942268e-06, |
|
"loss": 0.2498, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.3438735177865613, |
|
"grad_norm": 1.4467617558469186, |
|
"learning_rate": 9.288144234906753e-06, |
|
"loss": 0.2582, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 1.2724563579817645, |
|
"learning_rate": 9.272097022732444e-06, |
|
"loss": 0.2094, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.35177865612648224, |
|
"grad_norm": 1.374464706651341, |
|
"learning_rate": 9.255885131481231e-06, |
|
"loss": 0.2333, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3557312252964427, |
|
"grad_norm": 1.1876568411175543, |
|
"learning_rate": 9.239509186082574e-06, |
|
"loss": 0.1893, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.35968379446640314, |
|
"grad_norm": 1.3268240601065355, |
|
"learning_rate": 9.222969817789829e-06, |
|
"loss": 0.2124, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 1.2681330364031638, |
|
"learning_rate": 9.206267664155906e-06, |
|
"loss": 0.1973, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.3675889328063241, |
|
"grad_norm": 1.2255166053795126, |
|
"learning_rate": 9.189403369008704e-06, |
|
"loss": 0.224, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3715415019762846, |
|
"grad_norm": 1.3042156326130103, |
|
"learning_rate": 9.172377582426286e-06, |
|
"loss": 0.2366, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.37549407114624506, |
|
"grad_norm": 1.2467295856925376, |
|
"learning_rate": 9.155190960711822e-06, |
|
"loss": 0.2307, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3794466403162055, |
|
"grad_norm": 1.3438520348323209, |
|
"learning_rate": 9.137844166368289e-06, |
|
"loss": 0.2358, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.383399209486166, |
|
"grad_norm": 1.195811544532563, |
|
"learning_rate": 9.120337868072933e-06, |
|
"loss": 0.1998, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.38735177865612647, |
|
"grad_norm": 1.1483129999113106, |
|
"learning_rate": 9.1026727406515e-06, |
|
"loss": 0.1992, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.391304347826087, |
|
"grad_norm": 1.2903993706253556, |
|
"learning_rate": 9.08484946505221e-06, |
|
"loss": 0.2103, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.3952569169960474, |
|
"grad_norm": 1.3546933181924383, |
|
"learning_rate": 9.066868728319522e-06, |
|
"loss": 0.2431, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.39920948616600793, |
|
"grad_norm": 1.2376669346579956, |
|
"learning_rate": 9.048731223567636e-06, |
|
"loss": 0.2112, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.4031620553359684, |
|
"grad_norm": 1.1433249084583228, |
|
"learning_rate": 9.03043764995379e-06, |
|
"loss": 0.187, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.40711462450592883, |
|
"grad_norm": 1.2494316975005797, |
|
"learning_rate": 9.011988712651295e-06, |
|
"loss": 0.2254, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.41106719367588934, |
|
"grad_norm": 1.164269222531622, |
|
"learning_rate": 8.993385122822364e-06, |
|
"loss": 0.196, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.4150197628458498, |
|
"grad_norm": 1.2168120947010408, |
|
"learning_rate": 8.974627597590693e-06, |
|
"loss": 0.1871, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4189723320158103, |
|
"grad_norm": 1.2068782855519822, |
|
"learning_rate": 8.955716860013812e-06, |
|
"loss": 0.2061, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.42292490118577075, |
|
"grad_norm": 1.217200175520014, |
|
"learning_rate": 8.936653639055225e-06, |
|
"loss": 0.2128, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.4268774703557312, |
|
"grad_norm": 1.1978052117951064, |
|
"learning_rate": 8.917438669556307e-06, |
|
"loss": 0.2154, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.4308300395256917, |
|
"grad_norm": 1.2353347282860097, |
|
"learning_rate": 8.898072692207964e-06, |
|
"loss": 0.2166, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 1.2843635498058075, |
|
"learning_rate": 8.8785564535221e-06, |
|
"loss": 0.222, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.43873517786561267, |
|
"grad_norm": 1.2399015935864328, |
|
"learning_rate": 8.85889070580283e-06, |
|
"loss": 0.2106, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4426877470355731, |
|
"grad_norm": 1.2553929591861368, |
|
"learning_rate": 8.839076207117485e-06, |
|
"loss": 0.2428, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.44664031620553357, |
|
"grad_norm": 1.344840565207265, |
|
"learning_rate": 8.819113721267385e-06, |
|
"loss": 0.231, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4505928853754941, |
|
"grad_norm": 1.1925173566212948, |
|
"learning_rate": 8.7990040177584e-06, |
|
"loss": 0.1862, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 1.2960852273499728, |
|
"learning_rate": 8.778747871771293e-06, |
|
"loss": 0.2015, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.45849802371541504, |
|
"grad_norm": 1.222878498466299, |
|
"learning_rate": 8.758346064131824e-06, |
|
"loss": 0.2153, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.4624505928853755, |
|
"grad_norm": 1.274954724070636, |
|
"learning_rate": 8.737799381280667e-06, |
|
"loss": 0.2027, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.466403162055336, |
|
"grad_norm": 1.387737355454323, |
|
"learning_rate": 8.717108615243081e-06, |
|
"loss": 0.2221, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.47035573122529645, |
|
"grad_norm": 1.3426117595660954, |
|
"learning_rate": 8.696274563598395e-06, |
|
"loss": 0.2255, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.4743083003952569, |
|
"grad_norm": 1.119139769942221, |
|
"learning_rate": 8.675298029449241e-06, |
|
"loss": 0.18, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4782608695652174, |
|
"grad_norm": 1.2709986698183513, |
|
"learning_rate": 8.65417982139062e-06, |
|
"loss": 0.2204, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.48221343873517786, |
|
"grad_norm": 1.3155178470853532, |
|
"learning_rate": 8.63292075347872e-06, |
|
"loss": 0.2098, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.48616600790513836, |
|
"grad_norm": 1.1164093498148737, |
|
"learning_rate": 8.611521645199532e-06, |
|
"loss": 0.1699, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4901185770750988, |
|
"grad_norm": 1.2232423322705634, |
|
"learning_rate": 8.589983321437271e-06, |
|
"loss": 0.2048, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.49407114624505927, |
|
"grad_norm": 1.19896747867504, |
|
"learning_rate": 8.568306612442579e-06, |
|
"loss": 0.1818, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4980237154150198, |
|
"grad_norm": 1.1935132661062275, |
|
"learning_rate": 8.546492353800504e-06, |
|
"loss": 0.1905, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5019762845849802, |
|
"grad_norm": 1.3129073388913475, |
|
"learning_rate": 8.524541386398318e-06, |
|
"loss": 0.2249, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5059288537549407, |
|
"grad_norm": 1.128105255244932, |
|
"learning_rate": 8.502454556393071e-06, |
|
"loss": 0.1853, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5098814229249012, |
|
"grad_norm": 1.275277474434624, |
|
"learning_rate": 8.480232715179004e-06, |
|
"loss": 0.2268, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5138339920948617, |
|
"grad_norm": 1.199574317058949, |
|
"learning_rate": 8.457876719354708e-06, |
|
"loss": 0.2159, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5177865612648221, |
|
"grad_norm": 1.333440075825727, |
|
"learning_rate": 8.435387430690114e-06, |
|
"loss": 0.2071, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 1.2298826627346486, |
|
"learning_rate": 8.412765716093273e-06, |
|
"loss": 0.1978, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.525691699604743, |
|
"grad_norm": 1.1487447920512157, |
|
"learning_rate": 8.390012447576931e-06, |
|
"loss": 0.1681, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5296442687747036, |
|
"grad_norm": 1.1422378679889824, |
|
"learning_rate": 8.367128502224931e-06, |
|
"loss": 0.1799, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5335968379446641, |
|
"grad_norm": 1.1642741860659866, |
|
"learning_rate": 8.344114762158391e-06, |
|
"loss": 0.1945, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5375494071146245, |
|
"grad_norm": 1.3124275732270296, |
|
"learning_rate": 8.320972114501698e-06, |
|
"loss": 0.1994, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.541501976284585, |
|
"grad_norm": 1.1914086199917326, |
|
"learning_rate": 8.297701451348324e-06, |
|
"loss": 0.1982, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5454545454545454, |
|
"grad_norm": 1.251119288035372, |
|
"learning_rate": 8.274303669726427e-06, |
|
"loss": 0.1961, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.549407114624506, |
|
"grad_norm": 1.1421929686539185, |
|
"learning_rate": 8.250779671564277e-06, |
|
"loss": 0.205, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5533596837944664, |
|
"grad_norm": 1.26120508447205, |
|
"learning_rate": 8.22713036365549e-06, |
|
"loss": 0.2056, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5573122529644269, |
|
"grad_norm": 1.1982016744926482, |
|
"learning_rate": 8.20335665762407e-06, |
|
"loss": 0.2152, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5612648221343873, |
|
"grad_norm": 1.2176314854159234, |
|
"learning_rate": 8.179459469889269e-06, |
|
"loss": 0.2154, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5652173913043478, |
|
"grad_norm": 1.2473624074495893, |
|
"learning_rate": 8.155439721630265e-06, |
|
"loss": 0.2175, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5691699604743083, |
|
"grad_norm": 1.1267383357984246, |
|
"learning_rate": 8.131298338750648e-06, |
|
"loss": 0.1892, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5731225296442688, |
|
"grad_norm": 1.182358551787801, |
|
"learning_rate": 8.10703625184273e-06, |
|
"loss": 0.1972, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5770750988142292, |
|
"grad_norm": 1.2568983709583992, |
|
"learning_rate": 8.082654396151676e-06, |
|
"loss": 0.2156, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.5810276679841897, |
|
"grad_norm": 1.3251313569923084, |
|
"learning_rate": 8.058153711539444e-06, |
|
"loss": 0.2089, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5849802371541502, |
|
"grad_norm": 1.0726734543721594, |
|
"learning_rate": 8.03353514244857e-06, |
|
"loss": 0.1809, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5889328063241107, |
|
"grad_norm": 1.217655834438009, |
|
"learning_rate": 8.008799637865741e-06, |
|
"loss": 0.2023, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5928853754940712, |
|
"grad_norm": 1.0772407801697832, |
|
"learning_rate": 7.983948151285242e-06, |
|
"loss": 0.1796, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5968379446640316, |
|
"grad_norm": 1.2413074180785202, |
|
"learning_rate": 7.958981640672173e-06, |
|
"loss": 0.1977, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6007905138339921, |
|
"grad_norm": 1.0852808186316445, |
|
"learning_rate": 7.933901068425539e-06, |
|
"loss": 0.1731, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6047430830039525, |
|
"grad_norm": 1.299528282255904, |
|
"learning_rate": 7.908707401341146e-06, |
|
"loss": 0.2216, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6086956521739131, |
|
"grad_norm": 1.185556918204072, |
|
"learning_rate": 7.883401610574338e-06, |
|
"loss": 0.1908, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6126482213438735, |
|
"grad_norm": 1.304948505750968, |
|
"learning_rate": 7.857984671602547e-06, |
|
"loss": 0.2008, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.616600790513834, |
|
"grad_norm": 1.1900402122684852, |
|
"learning_rate": 7.832457564187715e-06, |
|
"loss": 0.1706, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6205533596837944, |
|
"grad_norm": 1.210872241558616, |
|
"learning_rate": 7.806821272338504e-06, |
|
"loss": 0.2054, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6245059288537549, |
|
"grad_norm": 1.1791680850500765, |
|
"learning_rate": 7.781076784272377e-06, |
|
"loss": 0.1911, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.6284584980237155, |
|
"grad_norm": 1.2064915376432837, |
|
"learning_rate": 7.755225092377498e-06, |
|
"loss": 0.2123, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.6324110671936759, |
|
"grad_norm": 1.1788766410507692, |
|
"learning_rate": 7.729267193174483e-06, |
|
"loss": 0.2166, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6363636363636364, |
|
"grad_norm": 1.2252698894797731, |
|
"learning_rate": 7.703204087277989e-06, |
|
"loss": 0.1919, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6403162055335968, |
|
"grad_norm": 1.349649897365813, |
|
"learning_rate": 7.67703677935813e-06, |
|
"loss": 0.2178, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6442687747035574, |
|
"grad_norm": 1.1345917440027748, |
|
"learning_rate": 7.650766278101762e-06, |
|
"loss": 0.197, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.6482213438735178, |
|
"grad_norm": 1.287989390837344, |
|
"learning_rate": 7.624393596173598e-06, |
|
"loss": 0.2381, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6521739130434783, |
|
"grad_norm": 1.1299739332721368, |
|
"learning_rate": 7.597919750177168e-06, |
|
"loss": 0.1808, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6561264822134387, |
|
"grad_norm": 1.2891189521599526, |
|
"learning_rate": 7.5713457606156335e-06, |
|
"loss": 0.219, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.6600790513833992, |
|
"grad_norm": 1.2643945396009584, |
|
"learning_rate": 7.5446726518524505e-06, |
|
"loss": 0.2049, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.6640316205533597, |
|
"grad_norm": 1.2293140206996043, |
|
"learning_rate": 7.51790145207188e-06, |
|
"loss": 0.2103, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6679841897233202, |
|
"grad_norm": 1.178935960357507, |
|
"learning_rate": 7.4910331932393634e-06, |
|
"loss": 0.199, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6719367588932806, |
|
"grad_norm": 1.2052463331102627, |
|
"learning_rate": 7.464068911061726e-06, |
|
"loss": 0.2096, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6758893280632411, |
|
"grad_norm": 1.1862206001747997, |
|
"learning_rate": 7.437009644947268e-06, |
|
"loss": 0.1987, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6798418972332015, |
|
"grad_norm": 1.256150663623612, |
|
"learning_rate": 7.40985643796569e-06, |
|
"loss": 0.2198, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.6837944664031621, |
|
"grad_norm": 1.1442878472683295, |
|
"learning_rate": 7.382610336807887e-06, |
|
"loss": 0.1735, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.6877470355731226, |
|
"grad_norm": 1.21934881533331, |
|
"learning_rate": 7.355272391745605e-06, |
|
"loss": 0.201, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.691699604743083, |
|
"grad_norm": 1.2085810780811668, |
|
"learning_rate": 7.327843656590948e-06, |
|
"loss": 0.1997, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 1.1706763346917641, |
|
"learning_rate": 7.300325188655762e-06, |
|
"loss": 0.175, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.6996047430830039, |
|
"grad_norm": 1.3103080637398228, |
|
"learning_rate": 7.2727180487108725e-06, |
|
"loss": 0.2316, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.7035573122529645, |
|
"grad_norm": 1.2414175503064024, |
|
"learning_rate": 7.245023300945203e-06, |
|
"loss": 0.2086, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7075098814229249, |
|
"grad_norm": 1.1028313396028102, |
|
"learning_rate": 7.217242012924747e-06, |
|
"loss": 0.1614, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7114624505928854, |
|
"grad_norm": 1.2545708120381063, |
|
"learning_rate": 7.189375255551413e-06, |
|
"loss": 0.2129, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7154150197628458, |
|
"grad_norm": 1.2208082871634096, |
|
"learning_rate": 7.161424103021752e-06, |
|
"loss": 0.186, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7193675889328063, |
|
"grad_norm": 1.1856323959588346, |
|
"learning_rate": 7.133389632785543e-06, |
|
"loss": 0.18, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.7233201581027668, |
|
"grad_norm": 1.0491684795158043, |
|
"learning_rate": 7.1052729255042645e-06, |
|
"loss": 0.1738, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 1.1602063012667307, |
|
"learning_rate": 7.0770750650094335e-06, |
|
"loss": 0.1888, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.7312252964426877, |
|
"grad_norm": 1.168233348357676, |
|
"learning_rate": 7.048797138260829e-06, |
|
"loss": 0.1938, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7351778656126482, |
|
"grad_norm": 1.0998419207012584, |
|
"learning_rate": 7.020440235304593e-06, |
|
"loss": 0.1541, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.7391304347826086, |
|
"grad_norm": 1.2094474362406284, |
|
"learning_rate": 6.9920054492312086e-06, |
|
"loss": 0.1863, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.7430830039525692, |
|
"grad_norm": 1.1862926984636035, |
|
"learning_rate": 6.963493876133367e-06, |
|
"loss": 0.1991, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.7470355731225297, |
|
"grad_norm": 1.309657217121461, |
|
"learning_rate": 6.934906615063716e-06, |
|
"loss": 0.2202, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7509881422924901, |
|
"grad_norm": 1.1164109935733424, |
|
"learning_rate": 6.90624476799249e-06, |
|
"loss": 0.1755, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7549407114624506, |
|
"grad_norm": 1.0725104657393945, |
|
"learning_rate": 6.8775094397650375e-06, |
|
"loss": 0.1669, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.758893280632411, |
|
"grad_norm": 1.2352816410215464, |
|
"learning_rate": 6.8487017380592266e-06, |
|
"loss": 0.1962, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7628458498023716, |
|
"grad_norm": 1.145478161105048, |
|
"learning_rate": 6.81982277334275e-06, |
|
"loss": 0.1864, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.766798418972332, |
|
"grad_norm": 1.2512415214036399, |
|
"learning_rate": 6.790873658830321e-06, |
|
"loss": 0.1972, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.7707509881422925, |
|
"grad_norm": 1.2466180949277024, |
|
"learning_rate": 6.761855510440752e-06, |
|
"loss": 0.1817, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.7747035573122529, |
|
"grad_norm": 1.181804911384916, |
|
"learning_rate": 6.732769446753954e-06, |
|
"loss": 0.175, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.7786561264822134, |
|
"grad_norm": 1.2910997699602191, |
|
"learning_rate": 6.703616588967804e-06, |
|
"loss": 0.2146, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.782608695652174, |
|
"grad_norm": 1.2156034432115361, |
|
"learning_rate": 6.674398060854931e-06, |
|
"loss": 0.2065, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.7865612648221344, |
|
"grad_norm": 1.2324700294150777, |
|
"learning_rate": 6.645114988719401e-06, |
|
"loss": 0.218, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.7905138339920948, |
|
"grad_norm": 1.1788814940788603, |
|
"learning_rate": 6.615768501353297e-06, |
|
"loss": 0.1911, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7944664031620553, |
|
"grad_norm": 1.239782727484998, |
|
"learning_rate": 6.5863597299932e-06, |
|
"loss": 0.2033, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.7984189723320159, |
|
"grad_norm": 1.2328166491787746, |
|
"learning_rate": 6.5568898082765945e-06, |
|
"loss": 0.1885, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.8023715415019763, |
|
"grad_norm": 1.2257569520585552, |
|
"learning_rate": 6.527359872198166e-06, |
|
"loss": 0.2087, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.8063241106719368, |
|
"grad_norm": 1.090893060994598, |
|
"learning_rate": 6.497771060066008e-06, |
|
"loss": 0.1673, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.8102766798418972, |
|
"grad_norm": 1.262115835491651, |
|
"learning_rate": 6.468124512457743e-06, |
|
"loss": 0.2173, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8142292490118577, |
|
"grad_norm": 1.1404830620435313, |
|
"learning_rate": 6.4384213721765565e-06, |
|
"loss": 0.1821, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.8181818181818182, |
|
"grad_norm": 1.0789980915654802, |
|
"learning_rate": 6.408662784207149e-06, |
|
"loss": 0.1809, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.8221343873517787, |
|
"grad_norm": 1.1275632208098325, |
|
"learning_rate": 6.378849895671594e-06, |
|
"loss": 0.1807, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.8260869565217391, |
|
"grad_norm": 1.1869280328722958, |
|
"learning_rate": 6.348983855785122e-06, |
|
"loss": 0.193, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.8300395256916996, |
|
"grad_norm": 1.1369831660790293, |
|
"learning_rate": 6.3190658158118205e-06, |
|
"loss": 0.1736, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.83399209486166, |
|
"grad_norm": 1.099689140622585, |
|
"learning_rate": 6.289096929020254e-06, |
|
"loss": 0.1737, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.8379446640316206, |
|
"grad_norm": 1.1149596155598573, |
|
"learning_rate": 6.25907835063901e-06, |
|
"loss": 0.185, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.841897233201581, |
|
"grad_norm": 1.2181136196244295, |
|
"learning_rate": 6.229011237812172e-06, |
|
"loss": 0.2183, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.8458498023715415, |
|
"grad_norm": 1.1288870761802783, |
|
"learning_rate": 6.1988967495547016e-06, |
|
"loss": 0.1705, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.849802371541502, |
|
"grad_norm": 1.2184909621033004, |
|
"learning_rate": 6.168736046707777e-06, |
|
"loss": 0.2087, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8537549407114624, |
|
"grad_norm": 1.0707635750160447, |
|
"learning_rate": 6.138530291894033e-06, |
|
"loss": 0.1599, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.857707509881423, |
|
"grad_norm": 1.15244187620798, |
|
"learning_rate": 6.108280649472751e-06, |
|
"loss": 0.1977, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.8616600790513834, |
|
"grad_norm": 1.140414407474285, |
|
"learning_rate": 6.0779882854949745e-06, |
|
"loss": 0.1916, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.8656126482213439, |
|
"grad_norm": 1.168184854563085, |
|
"learning_rate": 6.047654367658563e-06, |
|
"loss": 0.1877, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 1.095382559762059, |
|
"learning_rate": 6.0172800652631706e-06, |
|
"loss": 0.1941, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8735177865612648, |
|
"grad_norm": 1.170558717112789, |
|
"learning_rate": 5.986866549165185e-06, |
|
"loss": 0.1841, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.8774703557312253, |
|
"grad_norm": 1.1273692690940986, |
|
"learning_rate": 5.9564149917325845e-06, |
|
"loss": 0.1776, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.8814229249011858, |
|
"grad_norm": 1.1615247705497285, |
|
"learning_rate": 5.925926566799754e-06, |
|
"loss": 0.1923, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.8853754940711462, |
|
"grad_norm": 1.1310915029362631, |
|
"learning_rate": 5.895402449622226e-06, |
|
"loss": 0.1772, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.8893280632411067, |
|
"grad_norm": 1.168875863428828, |
|
"learning_rate": 5.864843816831388e-06, |
|
"loss": 0.1844, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.8932806324110671, |
|
"grad_norm": 1.1674668972919495, |
|
"learning_rate": 5.8342518463891195e-06, |
|
"loss": 0.1632, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.8972332015810277, |
|
"grad_norm": 1.2880944650610078, |
|
"learning_rate": 5.803627717542386e-06, |
|
"loss": 0.1961, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.9011857707509882, |
|
"grad_norm": 1.224311023303702, |
|
"learning_rate": 5.7729726107777855e-06, |
|
"loss": 0.1884, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.9051383399209486, |
|
"grad_norm": 1.1555445601349188, |
|
"learning_rate": 5.742287707776034e-06, |
|
"loss": 0.1746, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.137507280031818, |
|
"learning_rate": 5.711574191366427e-06, |
|
"loss": 0.1767, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9130434782608695, |
|
"grad_norm": 1.1300350500871745, |
|
"learning_rate": 5.680833245481234e-06, |
|
"loss": 0.1748, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.9169960474308301, |
|
"grad_norm": 1.061749424236297, |
|
"learning_rate": 5.650066055110067e-06, |
|
"loss": 0.1682, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.9209486166007905, |
|
"grad_norm": 1.270168772191801, |
|
"learning_rate": 5.6192738062542e-06, |
|
"loss": 0.1883, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.924901185770751, |
|
"grad_norm": 1.0546305674656924, |
|
"learning_rate": 5.588457685880851e-06, |
|
"loss": 0.1634, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.9288537549407114, |
|
"grad_norm": 1.1513270673921783, |
|
"learning_rate": 5.557618881877428e-06, |
|
"loss": 0.1745, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.932806324110672, |
|
"grad_norm": 1.4466685740671457, |
|
"learning_rate": 5.526758583005736e-06, |
|
"loss": 0.2153, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.9367588932806324, |
|
"grad_norm": 1.2211664220988006, |
|
"learning_rate": 5.495877978856159e-06, |
|
"loss": 0.1962, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.9407114624505929, |
|
"grad_norm": 1.1382037688163558, |
|
"learning_rate": 5.464978259801797e-06, |
|
"loss": 0.1765, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.9446640316205533, |
|
"grad_norm": 1.1600221011599767, |
|
"learning_rate": 5.4340606169525915e-06, |
|
"loss": 0.1948, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.9486166007905138, |
|
"grad_norm": 1.2329095686972222, |
|
"learning_rate": 5.40312624210939e-06, |
|
"loss": 0.1958, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9525691699604744, |
|
"grad_norm": 1.1077759614799594, |
|
"learning_rate": 5.372176327718029e-06, |
|
"loss": 0.1772, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.9565217391304348, |
|
"grad_norm": 1.3525530316416365, |
|
"learning_rate": 5.341212066823356e-06, |
|
"loss": 0.2068, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.9604743083003953, |
|
"grad_norm": 1.1641895345672653, |
|
"learning_rate": 5.3102346530232365e-06, |
|
"loss": 0.1983, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.9644268774703557, |
|
"grad_norm": 1.210471873881558, |
|
"learning_rate": 5.2792452804225535e-06, |
|
"loss": 0.203, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.9683794466403162, |
|
"grad_norm": 1.1895101061042292, |
|
"learning_rate": 5.248245143587172e-06, |
|
"loss": 0.1838, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9723320158102767, |
|
"grad_norm": 1.132321900931639, |
|
"learning_rate": 5.2172354374978905e-06, |
|
"loss": 0.1843, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.9762845849802372, |
|
"grad_norm": 1.2024875975928633, |
|
"learning_rate": 5.186217357504382e-06, |
|
"loss": 0.1971, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.9802371541501976, |
|
"grad_norm": 1.1385111787681224, |
|
"learning_rate": 5.155192099279113e-06, |
|
"loss": 0.1581, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.9841897233201581, |
|
"grad_norm": 1.1812364752105893, |
|
"learning_rate": 5.124160858771252e-06, |
|
"loss": 0.1868, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.9881422924901185, |
|
"grad_norm": 1.0663746142171262, |
|
"learning_rate": 5.093124832160569e-06, |
|
"loss": 0.1702, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9920948616600791, |
|
"grad_norm": 1.0637107177598129, |
|
"learning_rate": 5.06208521581133e-06, |
|
"loss": 0.1758, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.9960474308300395, |
|
"grad_norm": 1.082027772728129, |
|
"learning_rate": 5.0310432062261764e-06, |
|
"loss": 0.1718, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.2004136292445025, |
|
"learning_rate": 5e-06, |
|
"loss": 0.18, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.0039525691699605, |
|
"grad_norm": 1.1438358979908518, |
|
"learning_rate": 4.968956793773825e-06, |
|
"loss": 0.128, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.007905138339921, |
|
"grad_norm": 1.0815724819973256, |
|
"learning_rate": 4.9379147841886715e-06, |
|
"loss": 0.1374, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.0118577075098814, |
|
"grad_norm": 0.9971041298616835, |
|
"learning_rate": 4.906875167839433e-06, |
|
"loss": 0.1295, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.0158102766798418, |
|
"grad_norm": 1.0834575346736253, |
|
"learning_rate": 4.875839141228751e-06, |
|
"loss": 0.1435, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.0197628458498025, |
|
"grad_norm": 1.068216857230524, |
|
"learning_rate": 4.844807900720888e-06, |
|
"loss": 0.1369, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.023715415019763, |
|
"grad_norm": 1.1436718795229075, |
|
"learning_rate": 4.813782642495618e-06, |
|
"loss": 0.1299, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.0276679841897234, |
|
"grad_norm": 1.2041205262898764, |
|
"learning_rate": 4.78276456250211e-06, |
|
"loss": 0.1393, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.0316205533596838, |
|
"grad_norm": 1.3153176210311313, |
|
"learning_rate": 4.75175485641283e-06, |
|
"loss": 0.1587, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.0355731225296443, |
|
"grad_norm": 1.2818237661945875, |
|
"learning_rate": 4.720754719577448e-06, |
|
"loss": 0.1417, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.0395256916996047, |
|
"grad_norm": 1.346590041013075, |
|
"learning_rate": 4.689765346976765e-06, |
|
"loss": 0.1414, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 1.396103108426267, |
|
"learning_rate": 4.6587879331766465e-06, |
|
"loss": 0.1541, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.0474308300395256, |
|
"grad_norm": 1.3369506628795504, |
|
"learning_rate": 4.627823672281972e-06, |
|
"loss": 0.1566, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.051383399209486, |
|
"grad_norm": 1.2097827541662982, |
|
"learning_rate": 4.596873757890612e-06, |
|
"loss": 0.1381, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.0553359683794465, |
|
"grad_norm": 1.1170762868644442, |
|
"learning_rate": 4.565939383047411e-06, |
|
"loss": 0.1135, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.0592885375494072, |
|
"grad_norm": 1.2233155007588437, |
|
"learning_rate": 4.535021740198202e-06, |
|
"loss": 0.1609, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.0632411067193677, |
|
"grad_norm": 1.2914900494723907, |
|
"learning_rate": 4.504122021143842e-06, |
|
"loss": 0.1454, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.0671936758893281, |
|
"grad_norm": 1.0833968374959027, |
|
"learning_rate": 4.473241416994265e-06, |
|
"loss": 0.1374, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.0711462450592886, |
|
"grad_norm": 1.130910587654628, |
|
"learning_rate": 4.442381118122573e-06, |
|
"loss": 0.1249, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.075098814229249, |
|
"grad_norm": 1.0290406864536354, |
|
"learning_rate": 4.41154231411915e-06, |
|
"loss": 0.1179, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.0790513833992095, |
|
"grad_norm": 1.1351038290482456, |
|
"learning_rate": 4.3807261937458005e-06, |
|
"loss": 0.1446, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.08300395256917, |
|
"grad_norm": 0.9979191858289315, |
|
"learning_rate": 4.349933944889934e-06, |
|
"loss": 0.1085, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.0869565217391304, |
|
"grad_norm": 1.0703887978514581, |
|
"learning_rate": 4.319166754518768e-06, |
|
"loss": 0.1277, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.0909090909090908, |
|
"grad_norm": 1.1475934247053179, |
|
"learning_rate": 4.2884258086335755e-06, |
|
"loss": 0.1394, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.0948616600790513, |
|
"grad_norm": 1.1836305673827001, |
|
"learning_rate": 4.257712292223967e-06, |
|
"loss": 0.1342, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.098814229249012, |
|
"grad_norm": 1.2046134166270397, |
|
"learning_rate": 4.227027389222215e-06, |
|
"loss": 0.1344, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.1027667984189724, |
|
"grad_norm": 1.035753316225185, |
|
"learning_rate": 4.196372282457614e-06, |
|
"loss": 0.1127, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.1067193675889329, |
|
"grad_norm": 1.1282169817707273, |
|
"learning_rate": 4.165748153610881e-06, |
|
"loss": 0.1293, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.1106719367588933, |
|
"grad_norm": 1.1372474070307015, |
|
"learning_rate": 4.1351561831686136e-06, |
|
"loss": 0.1332, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.1146245059288538, |
|
"grad_norm": 1.1594526273221435, |
|
"learning_rate": 4.104597550377776e-06, |
|
"loss": 0.1263, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.1185770750988142, |
|
"grad_norm": 1.15152490259451, |
|
"learning_rate": 4.074073433200249e-06, |
|
"loss": 0.1285, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.1225296442687747, |
|
"grad_norm": 1.1540429682296736, |
|
"learning_rate": 4.043585008267418e-06, |
|
"loss": 0.118, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.1264822134387351, |
|
"grad_norm": 1.2879322008831273, |
|
"learning_rate": 4.013133450834818e-06, |
|
"loss": 0.1532, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.1304347826086956, |
|
"grad_norm": 1.021378919711874, |
|
"learning_rate": 3.982719934736832e-06, |
|
"loss": 0.0988, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.1343873517786562, |
|
"grad_norm": 1.1026842967832002, |
|
"learning_rate": 3.95234563234144e-06, |
|
"loss": 0.1148, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.1383399209486167, |
|
"grad_norm": 1.1221443001758493, |
|
"learning_rate": 3.9220117145050254e-06, |
|
"loss": 0.1346, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.1422924901185771, |
|
"grad_norm": 1.179825748135588, |
|
"learning_rate": 3.89171935052725e-06, |
|
"loss": 0.132, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.1462450592885376, |
|
"grad_norm": 1.1145447568885478, |
|
"learning_rate": 3.861469708105969e-06, |
|
"loss": 0.1262, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.150197628458498, |
|
"grad_norm": 1.1805243512710089, |
|
"learning_rate": 3.831263953292225e-06, |
|
"loss": 0.1464, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.1541501976284585, |
|
"grad_norm": 1.1752099012387707, |
|
"learning_rate": 3.8011032504453e-06, |
|
"loss": 0.1303, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.158102766798419, |
|
"grad_norm": 1.1724153479966961, |
|
"learning_rate": 3.7709887621878305e-06, |
|
"loss": 0.1237, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.1620553359683794, |
|
"grad_norm": 1.1312088107686393, |
|
"learning_rate": 3.740921649360991e-06, |
|
"loss": 0.1329, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.1660079051383399, |
|
"grad_norm": 1.2140114398756852, |
|
"learning_rate": 3.710903070979749e-06, |
|
"loss": 0.146, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.1699604743083003, |
|
"grad_norm": 1.177316021638641, |
|
"learning_rate": 3.680934184188182e-06, |
|
"loss": 0.1369, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.1739130434782608, |
|
"grad_norm": 1.0958487706218825, |
|
"learning_rate": 3.6510161442148783e-06, |
|
"loss": 0.1224, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.1778656126482214, |
|
"grad_norm": 1.0391815446060055, |
|
"learning_rate": 3.621150104328407e-06, |
|
"loss": 0.1248, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.1818181818181819, |
|
"grad_norm": 1.0701427010593552, |
|
"learning_rate": 3.5913372157928515e-06, |
|
"loss": 0.1224, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.1857707509881423, |
|
"grad_norm": 1.1848224795154656, |
|
"learning_rate": 3.5615786278234443e-06, |
|
"loss": 0.1491, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.1897233201581028, |
|
"grad_norm": 1.1762535991099512, |
|
"learning_rate": 3.5318754875422588e-06, |
|
"loss": 0.1347, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.1936758893280632, |
|
"grad_norm": 1.0206443853029172, |
|
"learning_rate": 3.5022289399339933e-06, |
|
"loss": 0.1022, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.1976284584980237, |
|
"grad_norm": 1.0357149112196293, |
|
"learning_rate": 3.4726401278018353e-06, |
|
"loss": 0.1134, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.2015810276679841, |
|
"grad_norm": 1.156607546118926, |
|
"learning_rate": 3.443110191723407e-06, |
|
"loss": 0.127, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.2055335968379446, |
|
"grad_norm": 1.1452610773145862, |
|
"learning_rate": 3.4136402700068034e-06, |
|
"loss": 0.1242, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.2094861660079053, |
|
"grad_norm": 1.137660713683979, |
|
"learning_rate": 3.384231498646706e-06, |
|
"loss": 0.1302, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.2134387351778657, |
|
"grad_norm": 1.3751968299311201, |
|
"learning_rate": 3.3548850112805985e-06, |
|
"loss": 0.1345, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.2173913043478262, |
|
"grad_norm": 1.1820163169421647, |
|
"learning_rate": 3.3256019391450696e-06, |
|
"loss": 0.124, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.2213438735177866, |
|
"grad_norm": 1.254836523224082, |
|
"learning_rate": 3.296383411032198e-06, |
|
"loss": 0.1592, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.225296442687747, |
|
"grad_norm": 1.2092831479934754, |
|
"learning_rate": 3.267230553246047e-06, |
|
"loss": 0.1374, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.2292490118577075, |
|
"grad_norm": 1.3298953359466184, |
|
"learning_rate": 3.2381444895592483e-06, |
|
"loss": 0.1599, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.233201581027668, |
|
"grad_norm": 1.15714090782168, |
|
"learning_rate": 3.209126341169681e-06, |
|
"loss": 0.1178, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.2371541501976284, |
|
"grad_norm": 1.1321197195627186, |
|
"learning_rate": 3.180177226657251e-06, |
|
"loss": 0.1324, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.2411067193675889, |
|
"grad_norm": 1.1427780558349572, |
|
"learning_rate": 3.151298261940775e-06, |
|
"loss": 0.1354, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.2450592885375493, |
|
"grad_norm": 1.1792578676088221, |
|
"learning_rate": 3.122490560234964e-06, |
|
"loss": 0.1316, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.2490118577075098, |
|
"grad_norm": 1.173771729377104, |
|
"learning_rate": 3.0937552320075116e-06, |
|
"loss": 0.1237, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.2529644268774702, |
|
"grad_norm": 1.2142503247487184, |
|
"learning_rate": 3.065093384936285e-06, |
|
"loss": 0.1385, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.256916996047431, |
|
"grad_norm": 1.2169127173091283, |
|
"learning_rate": 3.0365061238666336e-06, |
|
"loss": 0.1442, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.2608695652173914, |
|
"grad_norm": 1.3055869261396447, |
|
"learning_rate": 3.007994550768793e-06, |
|
"loss": 0.136, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.2648221343873518, |
|
"grad_norm": 1.2043193143210125, |
|
"learning_rate": 2.979559764695409e-06, |
|
"loss": 0.1447, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.2687747035573123, |
|
"grad_norm": 1.1233412613298097, |
|
"learning_rate": 2.951202861739173e-06, |
|
"loss": 0.1282, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.2727272727272727, |
|
"grad_norm": 1.0606810788891305, |
|
"learning_rate": 2.9229249349905686e-06, |
|
"loss": 0.1212, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.2766798418972332, |
|
"grad_norm": 1.0460756961226252, |
|
"learning_rate": 2.8947270744957385e-06, |
|
"loss": 0.1173, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.2806324110671936, |
|
"grad_norm": 1.1897624975044134, |
|
"learning_rate": 2.8666103672144597e-06, |
|
"loss": 0.1407, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.2845849802371543, |
|
"grad_norm": 1.1942264596916479, |
|
"learning_rate": 2.8385758969782507e-06, |
|
"loss": 0.1286, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.2885375494071147, |
|
"grad_norm": 1.1341245622744474, |
|
"learning_rate": 2.810624744448588e-06, |
|
"loss": 0.128, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.2924901185770752, |
|
"grad_norm": 1.165964008709128, |
|
"learning_rate": 2.7827579870752542e-06, |
|
"loss": 0.136, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.2964426877470356, |
|
"grad_norm": 1.1358629236681395, |
|
"learning_rate": 2.7549766990547973e-06, |
|
"loss": 0.1238, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.300395256916996, |
|
"grad_norm": 1.144473644830131, |
|
"learning_rate": 2.727281951289128e-06, |
|
"loss": 0.1333, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 1.1761434599454856, |
|
"learning_rate": 2.6996748113442397e-06, |
|
"loss": 0.1223, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.308300395256917, |
|
"grad_norm": 1.1788613585620644, |
|
"learning_rate": 2.672156343409053e-06, |
|
"loss": 0.1349, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.3122529644268774, |
|
"grad_norm": 1.1047408820558797, |
|
"learning_rate": 2.644727608254396e-06, |
|
"loss": 0.1215, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.316205533596838, |
|
"grad_norm": 1.1915355492120698, |
|
"learning_rate": 2.6173896631921134e-06, |
|
"loss": 0.1405, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.3201581027667983, |
|
"grad_norm": 1.1396002338772575, |
|
"learning_rate": 2.590143562034312e-06, |
|
"loss": 0.1297, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.3241106719367588, |
|
"grad_norm": 0.9928956463128177, |
|
"learning_rate": 2.5629903550527343e-06, |
|
"loss": 0.1032, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.3280632411067192, |
|
"grad_norm": 1.1497535091692899, |
|
"learning_rate": 2.535931088938274e-06, |
|
"loss": 0.1342, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.33201581027668, |
|
"grad_norm": 1.1209952848082338, |
|
"learning_rate": 2.5089668067606365e-06, |
|
"loss": 0.1185, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.3359683794466404, |
|
"grad_norm": 1.0839706555945572, |
|
"learning_rate": 2.4820985479281184e-06, |
|
"loss": 0.1139, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.3399209486166008, |
|
"grad_norm": 1.1440761155145645, |
|
"learning_rate": 2.45532734814755e-06, |
|
"loss": 0.134, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.3438735177865613, |
|
"grad_norm": 1.082616882918915, |
|
"learning_rate": 2.4286542393843665e-06, |
|
"loss": 0.1281, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.3478260869565217, |
|
"grad_norm": 1.17500675790559, |
|
"learning_rate": 2.4020802498228333e-06, |
|
"loss": 0.1432, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.3517786561264822, |
|
"grad_norm": 1.1182114019109513, |
|
"learning_rate": 2.3756064038264033e-06, |
|
"loss": 0.134, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.3557312252964426, |
|
"grad_norm": 1.1566712806713801, |
|
"learning_rate": 2.3492337218982396e-06, |
|
"loss": 0.1285, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.359683794466403, |
|
"grad_norm": 1.0164245424014764, |
|
"learning_rate": 2.3229632206418727e-06, |
|
"loss": 0.1055, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 1.1202273103032814, |
|
"learning_rate": 2.296795912722014e-06, |
|
"loss": 0.1164, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.3675889328063242, |
|
"grad_norm": 1.2025482829810825, |
|
"learning_rate": 2.270732806825517e-06, |
|
"loss": 0.1413, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.3715415019762847, |
|
"grad_norm": 1.1369886395664646, |
|
"learning_rate": 2.244774907622504e-06, |
|
"loss": 0.1312, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.3754940711462451, |
|
"grad_norm": 1.1150013437618065, |
|
"learning_rate": 2.2189232157276247e-06, |
|
"loss": 0.1379, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.3794466403162056, |
|
"grad_norm": 1.086272977870767, |
|
"learning_rate": 2.1931787276614968e-06, |
|
"loss": 0.1178, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.383399209486166, |
|
"grad_norm": 1.1640308203458913, |
|
"learning_rate": 2.167542435812286e-06, |
|
"loss": 0.1356, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.3873517786561265, |
|
"grad_norm": 1.1121490307629227, |
|
"learning_rate": 2.142015328397454e-06, |
|
"loss": 0.1237, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 1.1574528436812335, |
|
"learning_rate": 2.1165983894256647e-06, |
|
"loss": 0.1255, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.3952569169960474, |
|
"grad_norm": 1.361980674009939, |
|
"learning_rate": 2.0912925986588547e-06, |
|
"loss": 0.1389, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.3992094861660078, |
|
"grad_norm": 1.051150691395027, |
|
"learning_rate": 2.0660989315744624e-06, |
|
"loss": 0.1168, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.4031620553359683, |
|
"grad_norm": 1.1382181091816, |
|
"learning_rate": 2.0410183593278287e-06, |
|
"loss": 0.1367, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.4071146245059287, |
|
"grad_norm": 1.127194102052322, |
|
"learning_rate": 2.016051848714758e-06, |
|
"loss": 0.131, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.4110671936758894, |
|
"grad_norm": 1.3646197569162608, |
|
"learning_rate": 1.991200362134258e-06, |
|
"loss": 0.1668, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.4150197628458498, |
|
"grad_norm": 1.179231150635698, |
|
"learning_rate": 1.9664648575514316e-06, |
|
"loss": 0.1309, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.4189723320158103, |
|
"grad_norm": 1.1478284079295227, |
|
"learning_rate": 1.9418462884605555e-06, |
|
"loss": 0.1407, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.4229249011857708, |
|
"grad_norm": 0.966600715786351, |
|
"learning_rate": 1.9173456038483244e-06, |
|
"loss": 0.0989, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.4268774703557312, |
|
"grad_norm": 1.089318761705495, |
|
"learning_rate": 1.8929637481572715e-06, |
|
"loss": 0.1233, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.4308300395256917, |
|
"grad_norm": 1.1956595892466026, |
|
"learning_rate": 1.8687016612493542e-06, |
|
"loss": 0.1266, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.434782608695652, |
|
"grad_norm": 1.199815800783593, |
|
"learning_rate": 1.8445602783697375e-06, |
|
"loss": 0.1383, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.4387351778656128, |
|
"grad_norm": 1.0596031990819328, |
|
"learning_rate": 1.8205405301107343e-06, |
|
"loss": 0.1183, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.4426877470355732, |
|
"grad_norm": 1.1937639817958505, |
|
"learning_rate": 1.7966433423759327e-06, |
|
"loss": 0.1452, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.4466403162055337, |
|
"grad_norm": 1.0906265845240057, |
|
"learning_rate": 1.772869636344512e-06, |
|
"loss": 0.111, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.4505928853754941, |
|
"grad_norm": 1.2231633736793504, |
|
"learning_rate": 1.7492203284357245e-06, |
|
"loss": 0.1362, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.4545454545454546, |
|
"grad_norm": 1.0619383271235998, |
|
"learning_rate": 1.7256963302735752e-06, |
|
"loss": 0.1202, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.458498023715415, |
|
"grad_norm": 1.0900241430901978, |
|
"learning_rate": 1.702298548651678e-06, |
|
"loss": 0.1162, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.4624505928853755, |
|
"grad_norm": 1.1512284006660594, |
|
"learning_rate": 1.6790278854983033e-06, |
|
"loss": 0.1322, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.466403162055336, |
|
"grad_norm": 1.1329200558876649, |
|
"learning_rate": 1.6558852378416113e-06, |
|
"loss": 0.1362, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.4703557312252964, |
|
"grad_norm": 1.0410869979854105, |
|
"learning_rate": 1.6328714977750698e-06, |
|
"loss": 0.1239, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.4743083003952568, |
|
"grad_norm": 1.2545052203915814, |
|
"learning_rate": 1.6099875524230707e-06, |
|
"loss": 0.1476, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.4782608695652173, |
|
"grad_norm": 1.2003692839751088, |
|
"learning_rate": 1.5872342839067305e-06, |
|
"loss": 0.1381, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.4822134387351777, |
|
"grad_norm": 1.1739098836348705, |
|
"learning_rate": 1.5646125693098863e-06, |
|
"loss": 0.1295, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.4861660079051384, |
|
"grad_norm": 1.1535530949440382, |
|
"learning_rate": 1.542123280645292e-06, |
|
"loss": 0.1234, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.4901185770750989, |
|
"grad_norm": 1.1168208484387032, |
|
"learning_rate": 1.519767284820996e-06, |
|
"loss": 0.1152, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.4940711462450593, |
|
"grad_norm": 1.0340813124452886, |
|
"learning_rate": 1.4975454436069292e-06, |
|
"loss": 0.1114, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.4980237154150198, |
|
"grad_norm": 1.117157676559404, |
|
"learning_rate": 1.4754586136016841e-06, |
|
"loss": 0.127, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.5019762845849802, |
|
"grad_norm": 1.1411823176863023, |
|
"learning_rate": 1.4535076461994974e-06, |
|
"loss": 0.1243, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5059288537549407, |
|
"grad_norm": 1.0074369184939824, |
|
"learning_rate": 1.431693387557424e-06, |
|
"loss": 0.1007, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.5098814229249014, |
|
"grad_norm": 1.1674952879116636, |
|
"learning_rate": 1.4100166785627301e-06, |
|
"loss": 0.1201, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.5138339920948618, |
|
"grad_norm": 1.0610397050784564, |
|
"learning_rate": 1.3884783548004704e-06, |
|
"loss": 0.1208, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.5177865612648223, |
|
"grad_norm": 1.1703142856070294, |
|
"learning_rate": 1.3670792465212828e-06, |
|
"loss": 0.131, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.5217391304347827, |
|
"grad_norm": 1.24844816636085, |
|
"learning_rate": 1.3458201786093795e-06, |
|
"loss": 0.1426, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.5256916996047432, |
|
"grad_norm": 1.0289708643260822, |
|
"learning_rate": 1.3247019705507596e-06, |
|
"loss": 0.107, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.5296442687747036, |
|
"grad_norm": 1.0411982481948332, |
|
"learning_rate": 1.3037254364016068e-06, |
|
"loss": 0.1106, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.533596837944664, |
|
"grad_norm": 1.0887512215886415, |
|
"learning_rate": 1.2828913847569185e-06, |
|
"loss": 0.126, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.5375494071146245, |
|
"grad_norm": 1.1038917846986824, |
|
"learning_rate": 1.2622006187193348e-06, |
|
"loss": 0.1169, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.541501976284585, |
|
"grad_norm": 1.1650275815326414, |
|
"learning_rate": 1.2416539358681772e-06, |
|
"loss": 0.1243, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.5454545454545454, |
|
"grad_norm": 1.126326384634314, |
|
"learning_rate": 1.2212521282287093e-06, |
|
"loss": 0.129, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.5494071146245059, |
|
"grad_norm": 1.129180074421009, |
|
"learning_rate": 1.2009959822416012e-06, |
|
"loss": 0.1382, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.5533596837944663, |
|
"grad_norm": 1.164913976767265, |
|
"learning_rate": 1.1808862787326176e-06, |
|
"loss": 0.1217, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.5573122529644268, |
|
"grad_norm": 1.1842645447436517, |
|
"learning_rate": 1.1609237928825174e-06, |
|
"loss": 0.1392, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.5612648221343872, |
|
"grad_norm": 1.127758214888184, |
|
"learning_rate": 1.1411092941971702e-06, |
|
"loss": 0.1334, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.5652173913043477, |
|
"grad_norm": 1.0268028668010976, |
|
"learning_rate": 1.1214435464779006e-06, |
|
"loss": 0.1092, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.5691699604743083, |
|
"grad_norm": 1.1298748698800432, |
|
"learning_rate": 1.1019273077920366e-06, |
|
"loss": 0.1199, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.5731225296442688, |
|
"grad_norm": 1.151983299642192, |
|
"learning_rate": 1.0825613304436938e-06, |
|
"loss": 0.1214, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.5770750988142292, |
|
"grad_norm": 1.056563475272345, |
|
"learning_rate": 1.0633463609447753e-06, |
|
"loss": 0.1137, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.5810276679841897, |
|
"grad_norm": 1.0401127604187286, |
|
"learning_rate": 1.0442831399861903e-06, |
|
"loss": 0.1116, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.5849802371541502, |
|
"grad_norm": 1.137297843112467, |
|
"learning_rate": 1.0253724024093103e-06, |
|
"loss": 0.1259, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.5889328063241108, |
|
"grad_norm": 1.2731646482391143, |
|
"learning_rate": 1.006614877177638e-06, |
|
"loss": 0.1398, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.5928853754940713, |
|
"grad_norm": 1.2031755255565708, |
|
"learning_rate": 9.880112873487068e-07, |
|
"loss": 0.1435, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.5968379446640317, |
|
"grad_norm": 1.0673733506453118, |
|
"learning_rate": 9.695623500462114e-07, |
|
"loss": 0.1213, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.6007905138339922, |
|
"grad_norm": 1.1540390417696156, |
|
"learning_rate": 9.512687764323647e-07, |
|
"loss": 0.1309, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.6047430830039526, |
|
"grad_norm": 1.1719558092176683, |
|
"learning_rate": 9.331312716804791e-07, |
|
"loss": 0.1353, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.608695652173913, |
|
"grad_norm": 1.1328079202728492, |
|
"learning_rate": 9.151505349477901e-07, |
|
"loss": 0.1126, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.6126482213438735, |
|
"grad_norm": 1.1668524122475088, |
|
"learning_rate": 8.973272593485011e-07, |
|
"loss": 0.1232, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.616600790513834, |
|
"grad_norm": 1.1765999211223719, |
|
"learning_rate": 8.796621319270676e-07, |
|
"loss": 0.1381, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.6205533596837944, |
|
"grad_norm": 1.1580704897137293, |
|
"learning_rate": 8.621558336317132e-07, |
|
"loss": 0.125, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6245059288537549, |
|
"grad_norm": 1.2221405236185294, |
|
"learning_rate": 8.448090392881797e-07, |
|
"loss": 0.1434, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.6284584980237153, |
|
"grad_norm": 1.1484711907465028, |
|
"learning_rate": 8.276224175737152e-07, |
|
"loss": 0.1341, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.6324110671936758, |
|
"grad_norm": 1.0668615052922243, |
|
"learning_rate": 8.105966309912966e-07, |
|
"loss": 0.119, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.6363636363636362, |
|
"grad_norm": 1.1708621908812695, |
|
"learning_rate": 7.937323358440935e-07, |
|
"loss": 0.1365, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.6403162055335967, |
|
"grad_norm": 1.0037003731577279, |
|
"learning_rate": 7.770301822101712e-07, |
|
"loss": 0.1056, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.6442687747035574, |
|
"grad_norm": 1.0648681490157104, |
|
"learning_rate": 7.604908139174255e-07, |
|
"loss": 0.1098, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.6482213438735178, |
|
"grad_norm": 1.1995736066849585, |
|
"learning_rate": 7.441148685187694e-07, |
|
"loss": 0.1341, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.6521739130434783, |
|
"grad_norm": 1.1848136469443982, |
|
"learning_rate": 7.279029772675572e-07, |
|
"loss": 0.1336, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.6561264822134387, |
|
"grad_norm": 1.1795612477078212, |
|
"learning_rate": 7.11855765093249e-07, |
|
"loss": 0.1379, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.6600790513833992, |
|
"grad_norm": 1.244369267153096, |
|
"learning_rate": 6.959738505773211e-07, |
|
"loss": 0.1373, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.6640316205533598, |
|
"grad_norm": 1.148156272898155, |
|
"learning_rate": 6.802578459294235e-07, |
|
"loss": 0.1274, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.6679841897233203, |
|
"grad_norm": 1.1618407598954121, |
|
"learning_rate": 6.647083569637797e-07, |
|
"loss": 0.1208, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.6719367588932808, |
|
"grad_norm": 1.1228370521334565, |
|
"learning_rate": 6.493259830758325e-07, |
|
"loss": 0.1181, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.6758893280632412, |
|
"grad_norm": 1.1659039027502425, |
|
"learning_rate": 6.341113172191399e-07, |
|
"loss": 0.1251, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.6798418972332017, |
|
"grad_norm": 1.1667891678872329, |
|
"learning_rate": 6.190649458825204e-07, |
|
"loss": 0.1373, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.683794466403162, |
|
"grad_norm": 1.1054656728972119, |
|
"learning_rate": 6.041874490674416e-07, |
|
"loss": 0.1183, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.6877470355731226, |
|
"grad_norm": 1.1318157363661536, |
|
"learning_rate": 5.894794002656628e-07, |
|
"loss": 0.1276, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.691699604743083, |
|
"grad_norm": 1.1427617467212088, |
|
"learning_rate": 5.749413664371312e-07, |
|
"loss": 0.1314, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.6956521739130435, |
|
"grad_norm": 1.123370168432176, |
|
"learning_rate": 5.60573907988124e-07, |
|
"loss": 0.1172, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.699604743083004, |
|
"grad_norm": 1.2640707026191205, |
|
"learning_rate": 5.463775787496484e-07, |
|
"loss": 0.1345, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7035573122529644, |
|
"grad_norm": 1.2270465900901621, |
|
"learning_rate": 5.323529259560911e-07, |
|
"loss": 0.1337, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.7075098814229248, |
|
"grad_norm": 1.0714382150531283, |
|
"learning_rate": 5.185004902241241e-07, |
|
"loss": 0.1125, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.7114624505928853, |
|
"grad_norm": 1.0825327963083162, |
|
"learning_rate": 5.04820805531866e-07, |
|
"loss": 0.1059, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.7154150197628457, |
|
"grad_norm": 1.0596003222945678, |
|
"learning_rate": 4.91314399198296e-07, |
|
"loss": 0.0999, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.7193675889328062, |
|
"grad_norm": 1.1990608827420846, |
|
"learning_rate": 4.779817918629326e-07, |
|
"loss": 0.1257, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.7233201581027668, |
|
"grad_norm": 1.0887979080170394, |
|
"learning_rate": 4.6482349746575783e-07, |
|
"loss": 0.1236, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 1.0412689170684666, |
|
"learning_rate": 4.5184002322740784e-07, |
|
"loss": 0.1066, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.7312252964426877, |
|
"grad_norm": 1.0802724381125226, |
|
"learning_rate": 4.390318696296247e-07, |
|
"loss": 0.1149, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.7351778656126482, |
|
"grad_norm": 1.1551936194230803, |
|
"learning_rate": 4.2639953039595725e-07, |
|
"loss": 0.135, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 1.2135177448970054, |
|
"learning_rate": 4.139434924727359e-07, |
|
"loss": 0.1349, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.7430830039525693, |
|
"grad_norm": 1.0507122616854452, |
|
"learning_rate": 4.0166423601029735e-07, |
|
"loss": 0.1157, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.7470355731225298, |
|
"grad_norm": 1.1753401932840621, |
|
"learning_rate": 3.8956223434447936e-07, |
|
"loss": 0.136, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.7509881422924902, |
|
"grad_norm": 1.1404040824302084, |
|
"learning_rate": 3.776379539783709e-07, |
|
"loss": 0.1223, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.7549407114624507, |
|
"grad_norm": 1.2777386537799145, |
|
"learning_rate": 3.658918545643353e-07, |
|
"loss": 0.1485, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.7588932806324111, |
|
"grad_norm": 1.1413712412237624, |
|
"learning_rate": 3.543243888862841e-07, |
|
"loss": 0.1212, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.7628458498023716, |
|
"grad_norm": 1.0872960731651602, |
|
"learning_rate": 3.429360028422307e-07, |
|
"loss": 0.1193, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.766798418972332, |
|
"grad_norm": 1.0637737003753145, |
|
"learning_rate": 3.317271354270968e-07, |
|
"loss": 0.1133, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.7707509881422925, |
|
"grad_norm": 1.1840091720746755, |
|
"learning_rate": 3.2069821871579255e-07, |
|
"loss": 0.1333, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.774703557312253, |
|
"grad_norm": 1.1255230443713273, |
|
"learning_rate": 3.098496778465621e-07, |
|
"loss": 0.1181, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.7786561264822134, |
|
"grad_norm": 1.0775241906777941, |
|
"learning_rate": 2.991819310045929e-07, |
|
"loss": 0.1189, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.7826086956521738, |
|
"grad_norm": 1.0583221665971685, |
|
"learning_rate": 2.88695389405898e-07, |
|
"loss": 0.1103, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.7865612648221343, |
|
"grad_norm": 1.1242814725320212, |
|
"learning_rate": 2.783904572814622e-07, |
|
"loss": 0.1285, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.7905138339920947, |
|
"grad_norm": 1.1779401797110727, |
|
"learning_rate": 2.682675318616618e-07, |
|
"loss": 0.1294, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.7944664031620552, |
|
"grad_norm": 1.0859491545785036, |
|
"learning_rate": 2.583270033609536e-07, |
|
"loss": 0.1123, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.7984189723320159, |
|
"grad_norm": 1.0763948870282039, |
|
"learning_rate": 2.4856925496283045e-07, |
|
"loss": 0.1104, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.8023715415019763, |
|
"grad_norm": 1.0887108595666282, |
|
"learning_rate": 2.3899466280504936e-07, |
|
"loss": 0.1126, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.8063241106719368, |
|
"grad_norm": 1.1717149496208563, |
|
"learning_rate": 2.2960359596513714e-07, |
|
"loss": 0.1351, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.8102766798418972, |
|
"grad_norm": 1.229561030964994, |
|
"learning_rate": 2.203964164461597e-07, |
|
"loss": 0.1369, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.8142292490118577, |
|
"grad_norm": 1.1633472228135062, |
|
"learning_rate": 2.113734791627664e-07, |
|
"loss": 0.1291, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 1.2035274009151151, |
|
"learning_rate": 2.0253513192751374e-07, |
|
"loss": 0.1469, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.8221343873517788, |
|
"grad_norm": 1.1836270092696015, |
|
"learning_rate": 1.9388171543745394e-07, |
|
"loss": 0.1379, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.8260869565217392, |
|
"grad_norm": 1.0758870256725546, |
|
"learning_rate": 1.8541356326100436e-07, |
|
"loss": 0.1117, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.8300395256916997, |
|
"grad_norm": 1.1793736035903457, |
|
"learning_rate": 1.7713100182508604e-07, |
|
"loss": 0.1398, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.8339920948616601, |
|
"grad_norm": 1.0801974661120426, |
|
"learning_rate": 1.6903435040254545e-07, |
|
"loss": 0.1128, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.8379446640316206, |
|
"grad_norm": 1.07339663507577, |
|
"learning_rate": 1.6112392109984386e-07, |
|
"loss": 0.1099, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.841897233201581, |
|
"grad_norm": 1.1120046775263792, |
|
"learning_rate": 1.5340001884502577e-07, |
|
"loss": 0.1262, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.8458498023715415, |
|
"grad_norm": 1.0831243721534856, |
|
"learning_rate": 1.4586294137596768e-07, |
|
"loss": 0.1193, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.849802371541502, |
|
"grad_norm": 1.0472977237699574, |
|
"learning_rate": 1.385129792288986e-07, |
|
"loss": 0.1041, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.8537549407114624, |
|
"grad_norm": 1.1482729228107225, |
|
"learning_rate": 1.313504157272022e-07, |
|
"loss": 0.1301, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.8577075098814229, |
|
"grad_norm": 1.1397420372115987, |
|
"learning_rate": 1.2437552697049327e-07, |
|
"loss": 0.1289, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.8616600790513833, |
|
"grad_norm": 1.1569650321244938, |
|
"learning_rate": 1.1758858182397692e-07, |
|
"loss": 0.1283, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.8656126482213438, |
|
"grad_norm": 1.1694788716855629, |
|
"learning_rate": 1.1098984190808403e-07, |
|
"loss": 0.1335, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.8695652173913042, |
|
"grad_norm": 1.0086960578539177, |
|
"learning_rate": 1.0457956158838545e-07, |
|
"loss": 0.0996, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.8735177865612647, |
|
"grad_norm": 1.0308075869001883, |
|
"learning_rate": 9.835798796578755e-08, |
|
"loss": 0.1153, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.8774703557312253, |
|
"grad_norm": 1.170279753009744, |
|
"learning_rate": 9.232536086700605e-08, |
|
"loss": 0.1304, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.8814229249011858, |
|
"grad_norm": 1.0870287190560486, |
|
"learning_rate": 8.648191283532337e-08, |
|
"loss": 0.1213, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.8853754940711462, |
|
"grad_norm": 1.1165962048020976, |
|
"learning_rate": 8.082786912162243e-08, |
|
"loss": 0.1161, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.8893280632411067, |
|
"grad_norm": 1.1494916541958282, |
|
"learning_rate": 7.536344767570536e-08, |
|
"loss": 0.1141, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.8932806324110671, |
|
"grad_norm": 1.0640073229737435, |
|
"learning_rate": 7.008885913789066e-08, |
|
"loss": 0.1228, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.8972332015810278, |
|
"grad_norm": 1.1870546140040668, |
|
"learning_rate": 6.500430683089532e-08, |
|
"loss": 0.1303, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.9011857707509883, |
|
"grad_norm": 1.173615256902204, |
|
"learning_rate": 6.010998675199554e-08, |
|
"loss": 0.1338, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.9051383399209487, |
|
"grad_norm": 1.1958445869792897, |
|
"learning_rate": 5.5406087565471054e-08, |
|
"loss": 0.1405, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.9090909090909092, |
|
"grad_norm": 1.1122683154170514, |
|
"learning_rate": 5.089279059533658e-08, |
|
"loss": 0.1281, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.9130434782608696, |
|
"grad_norm": 1.0958024070105987, |
|
"learning_rate": 4.657026981834623e-08, |
|
"loss": 0.12, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.91699604743083, |
|
"grad_norm": 1.1024888588089012, |
|
"learning_rate": 4.2438691857292215e-08, |
|
"loss": 0.1144, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.9209486166007905, |
|
"grad_norm": 1.123264444499075, |
|
"learning_rate": 3.849821597457892e-08, |
|
"loss": 0.1347, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.924901185770751, |
|
"grad_norm": 1.0765658056683685, |
|
"learning_rate": 3.474899406608501e-08, |
|
"loss": 0.1129, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.9288537549407114, |
|
"grad_norm": 1.148979953908725, |
|
"learning_rate": 3.119117065530808e-08, |
|
"loss": 0.123, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.9328063241106719, |
|
"grad_norm": 1.2501363554379483, |
|
"learning_rate": 2.7824882887793058e-08, |
|
"loss": 0.1538, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.9367588932806323, |
|
"grad_norm": 1.196112067372065, |
|
"learning_rate": 2.4650260525846404e-08, |
|
"loss": 0.1416, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.9407114624505928, |
|
"grad_norm": 1.1128332896345188, |
|
"learning_rate": 2.1667425943532884e-08, |
|
"loss": 0.1147, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.9446640316205532, |
|
"grad_norm": 1.1016376226716715, |
|
"learning_rate": 1.8876494121959908e-08, |
|
"loss": 0.1289, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.9486166007905137, |
|
"grad_norm": 1.135611367054567, |
|
"learning_rate": 1.627757264484442e-08, |
|
"loss": 0.1294, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.9525691699604744, |
|
"grad_norm": 1.1398251434326172, |
|
"learning_rate": 1.387076169436563e-08, |
|
"loss": 0.1262, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.9565217391304348, |
|
"grad_norm": 1.0846986101599616, |
|
"learning_rate": 1.1656154047303691e-08, |
|
"loss": 0.1126, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.9604743083003953, |
|
"grad_norm": 1.2370763808268233, |
|
"learning_rate": 9.633835071463094e-09, |
|
"loss": 0.1485, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.9644268774703557, |
|
"grad_norm": 1.1863239612132057, |
|
"learning_rate": 7.803882722381417e-09, |
|
"loss": 0.1293, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.9683794466403162, |
|
"grad_norm": 1.2282535432480546, |
|
"learning_rate": 6.166367540325624e-09, |
|
"loss": 0.1428, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.9723320158102768, |
|
"grad_norm": 1.0276030484998475, |
|
"learning_rate": 4.721352647572564e-09, |
|
"loss": 0.1129, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.9762845849802373, |
|
"grad_norm": 1.2463780542335487, |
|
"learning_rate": 3.4688937459737004e-09, |
|
"loss": 0.1394, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.9762845849802373, |
|
"eval_loss": 0.16894643008708954, |
|
"eval_runtime": 3.7534, |
|
"eval_samples_per_second": 5.595, |
|
"eval_steps_per_second": 1.599, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.9802371541501977, |
|
"grad_norm": 1.149104137588847, |
|
"learning_rate": 2.4090391148112734e-09, |
|
"loss": 0.126, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.9841897233201582, |
|
"grad_norm": 1.0644722994856588, |
|
"learning_rate": 1.5418296089358964e-09, |
|
"loss": 0.1161, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.9881422924901186, |
|
"grad_norm": 1.1078146584096278, |
|
"learning_rate": 8.672986571894859e-10, |
|
"loss": 0.1159, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.992094861660079, |
|
"grad_norm": 1.0306240076981967, |
|
"learning_rate": 3.854722611201789e-10, |
|
"loss": 0.1143, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.9960474308300395, |
|
"grad_norm": 1.150741799871838, |
|
"learning_rate": 9.636899397813537e-11, |
|
"loss": 0.1237, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.0328104166457073, |
|
"learning_rate": 0.0, |
|
"loss": 0.1028, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 506, |
|
"total_flos": 6736634707968.0, |
|
"train_loss": 0.1787011340083812, |
|
"train_runtime": 1032.8943, |
|
"train_samples_per_second": 3.913, |
|
"train_steps_per_second": 0.49 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 506, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 70000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6736634707968.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |