|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9941672067401166,
  "eval_steps": 50,
  "global_step": 1155,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.012961762799740765, "grad_norm": 17.21252896134672, "learning_rate": 5e-07, "loss": 1.6823, "step": 5 },
    { "epoch": 0.02592352559948153, "grad_norm": 12.008964700255303, "learning_rate": 1e-06, "loss": 1.566, "step": 10 },
    { "epoch": 0.03888528839922229, "grad_norm": 7.9698215919555055, "learning_rate": 9.99952949745378e-07, "loss": 1.2721, "step": 15 },
    { "epoch": 0.05184705119896306, "grad_norm": 4.447326184155441, "learning_rate": 9.998118078364185e-07, "loss": 1.1461, "step": 20 },
    { "epoch": 0.06480881399870382, "grad_norm": 3.9602084655818675, "learning_rate": 9.995766008361718e-07, "loss": 1.0509, "step": 25 },
    { "epoch": 0.07777057679844458, "grad_norm": 3.524237361034851, "learning_rate": 9.992473730108354e-07, "loss": 0.9982, "step": 30 },
    { "epoch": 0.09073233959818536, "grad_norm": 3.633387524272681, "learning_rate": 9.98824186321421e-07, "loss": 0.9804, "step": 35 },
    { "epoch": 0.10369410239792612, "grad_norm": 3.4822921075242577, "learning_rate": 9.98307120412095e-07, "loss": 0.954, "step": 40 },
    { "epoch": 0.11665586519766688, "grad_norm": 3.519195574393169, "learning_rate": 9.976962725951878e-07, "loss": 0.928, "step": 45 },
    { "epoch": 0.12961762799740764, "grad_norm": 3.3308237206179423, "learning_rate": 9.969917578328807e-07, "loss": 0.9184, "step": 50 },
    { "epoch": 0.12961762799740764, "eval_loss": 0.92087721824646, "eval_runtime": 190.61, "eval_samples_per_second": 57.547, "eval_steps_per_second": 0.902, "step": 50 },
    { "epoch": 0.1425793907971484, "grad_norm": 3.343853179925445, "learning_rate": 9.961937087155695e-07, "loss": 0.9194, "step": 55 },
    { "epoch": 0.15554115359688916, "grad_norm": 3.368850277351612, "learning_rate": 9.953022754369114e-07, "loss": 0.8832, "step": 60 },
    { "epoch": 0.16850291639662995, "grad_norm": 3.3163787130271003, "learning_rate": 9.943176257655565e-07, "loss": 0.8923, "step": 65 },
    { "epoch": 0.18146467919637072, "grad_norm": 3.242609823802845, "learning_rate": 9.932399450135765e-07, "loss": 0.8883, "step": 70 },
    { "epoch": 0.19442644199611148, "grad_norm": 3.402686266652709, "learning_rate": 9.920694360015862e-07, "loss": 0.8639, "step": 75 },
    { "epoch": 0.20738820479585224, "grad_norm": 3.404608035399376, "learning_rate": 9.908063190205739e-07, "loss": 0.8732, "step": 80 },
    { "epoch": 0.220349967595593, "grad_norm": 3.3975642435264732, "learning_rate": 9.894508317904417e-07, "loss": 0.8883, "step": 85 },
    { "epoch": 0.23331173039533376, "grad_norm": 3.558535819762842, "learning_rate": 9.88003229415267e-07, "loss": 0.8671, "step": 90 },
    { "epoch": 0.24627349319507452, "grad_norm": 3.6228022238357815, "learning_rate": 9.864637843352913e-07, "loss": 0.8771, "step": 95 },
    { "epoch": 0.2592352559948153, "grad_norm": 3.6210872209063174, "learning_rate": 9.848327862756466e-07, "loss": 0.8611, "step": 100 },
    { "epoch": 0.2592352559948153, "eval_loss": 0.8652921915054321, "eval_runtime": 190.2747, "eval_samples_per_second": 57.648, "eval_steps_per_second": 0.904, "step": 100 },
    { "epoch": 0.27219701879455604, "grad_norm": 3.312411561694666, "learning_rate": 9.831105421918285e-07, "loss": 0.8547, "step": 105 },
    { "epoch": 0.2851587815942968, "grad_norm": 3.4066727133401598, "learning_rate": 9.81297376211928e-07, "loss": 0.8369, "step": 110 },
    { "epoch": 0.29812054439403757, "grad_norm": 3.2867314575644575, "learning_rate": 9.79393629575629e-07, "loss": 0.8403, "step": 115 },
    { "epoch": 0.31108230719377833, "grad_norm": 3.599628911972995, "learning_rate": 9.773996605699875e-07, "loss": 0.8437, "step": 120 },
    { "epoch": 0.32404406999351915, "grad_norm": 3.3280113051932183, "learning_rate": 9.753158444620013e-07, "loss": 0.8295, "step": 125 },
    { "epoch": 0.3370058327932599, "grad_norm": 3.3041114591018035, "learning_rate": 9.73142573427984e-07, "loss": 0.8416, "step": 130 },
    { "epoch": 0.34996759559300067, "grad_norm": 3.1677202986884647, "learning_rate": 9.708802564797578e-07, "loss": 0.8372, "step": 135 },
    { "epoch": 0.36292935839274143, "grad_norm": 3.4261833858436126, "learning_rate": 9.685293193876765e-07, "loss": 0.8407, "step": 140 },
    { "epoch": 0.3758911211924822, "grad_norm": 3.2025888685071995, "learning_rate": 9.660902046004953e-07, "loss": 0.8218, "step": 145 },
    { "epoch": 0.38885288399222295, "grad_norm": 3.3672158659218945, "learning_rate": 9.635633711621011e-07, "loss": 0.8257, "step": 150 },
    { "epoch": 0.38885288399222295, "eval_loss": 0.8396357893943787, "eval_runtime": 190.236, "eval_samples_per_second": 57.66, "eval_steps_per_second": 0.904, "step": 150 },
    { "epoch": 0.4018146467919637, "grad_norm": 3.5382812532295653, "learning_rate": 9.609492946251208e-07, "loss": 0.8335, "step": 155 },
    { "epoch": 0.4147764095917045, "grad_norm": 3.2987076575989396, "learning_rate": 9.58248466961421e-07, "loss": 0.8274, "step": 160 },
    { "epoch": 0.42773817239144524, "grad_norm": 3.209384087480358, "learning_rate": 9.554613964695188e-07, "loss": 0.8419, "step": 165 },
    { "epoch": 0.440699935191186, "grad_norm": 3.2525057388499867, "learning_rate": 9.525886076789193e-07, "loss": 0.8264, "step": 170 },
    { "epoch": 0.45366169799092676, "grad_norm": 3.2659862173369225, "learning_rate": 9.496306412513988e-07, "loss": 0.813, "step": 175 },
    { "epoch": 0.4666234607906675, "grad_norm": 3.617112426560761, "learning_rate": 9.465880538792517e-07, "loss": 0.8209, "step": 180 },
    { "epoch": 0.4795852235904083, "grad_norm": 3.8241965496757055, "learning_rate": 9.434614181805202e-07, "loss": 0.8246, "step": 185 },
    { "epoch": 0.49254698639014904, "grad_norm": 3.389966079797953, "learning_rate": 9.402513225912271e-07, "loss": 0.8236, "step": 190 },
    { "epoch": 0.5055087491898899, "grad_norm": 3.31422039420999, "learning_rate": 9.36958371254632e-07, "loss": 0.8158, "step": 195 },
    { "epoch": 0.5184705119896306, "grad_norm": 3.365373850064581, "learning_rate": 9.335831839075302e-07, "loss": 0.8308, "step": 200 },
    { "epoch": 0.5184705119896306, "eval_loss": 0.8231202960014343, "eval_runtime": 190.2867, "eval_samples_per_second": 57.645, "eval_steps_per_second": 0.904, "step": 200 },
    { "epoch": 0.5314322747893714, "grad_norm": 3.441006121650103, "learning_rate": 9.301263957636179e-07, "loss": 0.8191, "step": 205 },
    { "epoch": 0.5443940375891121, "grad_norm": 3.428707563220311, "learning_rate": 9.265886573939446e-07, "loss": 0.8191, "step": 210 },
    { "epoch": 0.5573558003888529, "grad_norm": 3.5640266455568166, "learning_rate": 9.229706346044747e-07, "loss": 0.8255, "step": 215 },
    { "epoch": 0.5703175631885936, "grad_norm": 3.551803069534866, "learning_rate": 9.192730083107818e-07, "loss": 0.8164, "step": 220 },
    { "epoch": 0.5832793259883344, "grad_norm": 3.414031666969802, "learning_rate": 9.154964744099005e-07, "loss": 0.7939, "step": 225 },
    { "epoch": 0.5962410887880751, "grad_norm": 3.442613850161029, "learning_rate": 9.116417436493573e-07, "loss": 0.8034, "step": 230 },
    { "epoch": 0.609202851587816, "grad_norm": 3.462275139287757, "learning_rate": 9.077095414934075e-07, "loss": 0.8063, "step": 235 },
    { "epoch": 0.6221646143875567, "grad_norm": 3.447345184549608, "learning_rate": 9.037006079865015e-07, "loss": 0.8233, "step": 240 },
    { "epoch": 0.6351263771872975, "grad_norm": 3.5938927981457933, "learning_rate": 8.996156976140086e-07, "loss": 0.8283, "step": 245 },
    { "epoch": 0.6480881399870383, "grad_norm": 3.5811512536833523, "learning_rate": 8.95455579160221e-07, "loss": 0.8168, "step": 250 },
    { "epoch": 0.6480881399870383, "eval_loss": 0.810850977897644, "eval_runtime": 190.2136, "eval_samples_per_second": 57.667, "eval_steps_per_second": 0.904, "step": 250 },
    { "epoch": 0.661049902786779, "grad_norm": 3.3124363238635715, "learning_rate": 8.912210355636689e-07, "loss": 0.7922, "step": 255 },
    { "epoch": 0.6740116655865198, "grad_norm": 3.3955357238579693, "learning_rate": 8.8691286376977e-07, "loss": 0.8044, "step": 260 },
    { "epoch": 0.6869734283862605, "grad_norm": 3.1989325895717053, "learning_rate": 8.825318745808439e-07, "loss": 0.7933, "step": 265 },
    { "epoch": 0.6999351911860013, "grad_norm": 3.393447117563635, "learning_rate": 8.780788925035177e-07, "loss": 0.8018, "step": 270 },
    { "epoch": 0.712896953985742, "grad_norm": 3.2831528447418132, "learning_rate": 8.735547555935537e-07, "loss": 0.7961, "step": 275 },
    { "epoch": 0.7258587167854829, "grad_norm": 3.4250845276838486, "learning_rate": 8.689603152981262e-07, "loss": 0.812, "step": 280 },
    { "epoch": 0.7388204795852236, "grad_norm": 3.2575625313957035, "learning_rate": 8.64296436295578e-07, "loss": 0.7971, "step": 285 },
    { "epoch": 0.7517822423849644, "grad_norm": 3.5149123785883014, "learning_rate": 8.595639963326879e-07, "loss": 0.8067, "step": 290 },
    { "epoch": 0.7647440051847051, "grad_norm": 3.5091201056207533, "learning_rate": 8.547638860594764e-07, "loss": 0.8155, "step": 295 },
    { "epoch": 0.7777057679844459, "grad_norm": 3.5343144588793036, "learning_rate": 8.49897008861586e-07, "loss": 0.8021, "step": 300 },
    { "epoch": 0.7777057679844459, "eval_loss": 0.800530195236206, "eval_runtime": 190.6317, "eval_samples_per_second": 57.54, "eval_steps_per_second": 0.902, "step": 300 },
    { "epoch": 0.7906675307841866, "grad_norm": 3.3863012076670542, "learning_rate": 8.449642806902622e-07, "loss": 0.7993, "step": 305 },
    { "epoch": 0.8036292935839274, "grad_norm": 3.5586423328389585, "learning_rate": 8.399666298899706e-07, "loss": 0.7918, "step": 310 },
    { "epoch": 0.8165910563836681, "grad_norm": 3.12605831331216, "learning_rate": 8.34904997023682e-07, "loss": 0.7943, "step": 315 },
    { "epoch": 0.829552819183409, "grad_norm": 3.365606947704741, "learning_rate": 8.297803346958569e-07, "loss": 0.783, "step": 320 },
    { "epoch": 0.8425145819831497, "grad_norm": 3.525340806653281, "learning_rate": 8.245936073731651e-07, "loss": 0.7976, "step": 325 },
    { "epoch": 0.8554763447828905, "grad_norm": 3.244247748765073, "learning_rate": 8.193457912029712e-07, "loss": 0.7807, "step": 330 },
    { "epoch": 0.8684381075826313, "grad_norm": 3.299152179523224, "learning_rate": 8.140378738296232e-07, "loss": 0.7836, "step": 335 },
    { "epoch": 0.881399870382372, "grad_norm": 3.6966297262494683, "learning_rate": 8.086708542085767e-07, "loss": 0.8063, "step": 340 },
    { "epoch": 0.8943616331821128, "grad_norm": 3.459714731085819, "learning_rate": 8.032457424183909e-07, "loss": 0.7911, "step": 345 },
    { "epoch": 0.9073233959818535, "grad_norm": 3.0350617290741373, "learning_rate": 7.977635594706298e-07, "loss": 0.7741, "step": 350 },
    { "epoch": 0.9073233959818535, "eval_loss": 0.7928882837295532, "eval_runtime": 190.396, "eval_samples_per_second": 57.612, "eval_steps_per_second": 0.903, "step": 350 },
    { "epoch": 0.9202851587815943, "grad_norm": 3.3620653686755686, "learning_rate": 7.922253371177082e-07, "loss": 0.7932, "step": 355 },
    { "epoch": 0.933246921581335, "grad_norm": 3.278928351388705, "learning_rate": 7.866321176587128e-07, "loss": 0.7871, "step": 360 },
    { "epoch": 0.9462086843810759, "grad_norm": 3.296771284934289, "learning_rate": 7.809849537432431e-07, "loss": 0.7792, "step": 365 },
    { "epoch": 0.9591704471808166, "grad_norm": 4.571766433220969, "learning_rate": 7.752849081732991e-07, "loss": 0.788, "step": 370 },
    { "epoch": 0.9721322099805574, "grad_norm": 3.4508851083525007, "learning_rate": 7.695330537032627e-07, "loss": 0.7614, "step": 375 },
    { "epoch": 0.9850939727802981, "grad_norm": 3.4026555887076433, "learning_rate": 7.637304728380037e-07, "loss": 0.7964, "step": 380 },
    { "epoch": 0.9980557355800389, "grad_norm": 3.4049969426822644, "learning_rate": 7.5787825762915e-07, "loss": 0.7659, "step": 385 },
    { "epoch": 1.0110174983797797, "grad_norm": 3.7467236394334593, "learning_rate": 7.519775094695648e-07, "loss": 0.729, "step": 390 },
    { "epoch": 1.0239792611795204, "grad_norm": 3.3057229188741797, "learning_rate": 7.460293388860614e-07, "loss": 0.7421, "step": 395 },
    { "epoch": 1.0369410239792611, "grad_norm": 3.1846298307152545, "learning_rate": 7.400348653304021e-07, "loss": 0.7155, "step": 400 },
    { "epoch": 1.0369410239792611, "eval_loss": 0.7879951596260071, "eval_runtime": 190.5219, "eval_samples_per_second": 57.573, "eval_steps_per_second": 0.903, "step": 400 },
    { "epoch": 1.0499027867790018, "grad_norm": 3.555528496823252, "learning_rate": 7.33995216968615e-07, "loss": 0.721, "step": 405 },
    { "epoch": 1.0628645495787428, "grad_norm": 3.17101772663646, "learning_rate": 7.279115304686733e-07, "loss": 0.7251, "step": 410 },
    { "epoch": 1.0758263123784835, "grad_norm": 3.4524740982989575, "learning_rate": 7.217849507865723e-07, "loss": 0.7079, "step": 415 },
    { "epoch": 1.0887880751782242, "grad_norm": 3.5488503758648604, "learning_rate": 7.156166309508481e-07, "loss": 0.716, "step": 420 },
    { "epoch": 1.101749837977965, "grad_norm": 3.2843814722509754, "learning_rate": 7.094077318455761e-07, "loss": 0.7248, "step": 425 },
    { "epoch": 1.1147116007777058, "grad_norm": 3.5818373829792436, "learning_rate": 7.031594219918915e-07, "loss": 0.7282, "step": 430 },
    { "epoch": 1.1276733635774465, "grad_norm": 3.545436547315646, "learning_rate": 6.968728773280729e-07, "loss": 0.7214, "step": 435 },
    { "epoch": 1.1406351263771872, "grad_norm": 3.7309987258945956, "learning_rate": 6.905492809882285e-07, "loss": 0.7149, "step": 440 },
    { "epoch": 1.1535968891769282, "grad_norm": 3.342264257213339, "learning_rate": 6.841898230796302e-07, "loss": 0.7118, "step": 445 },
    { "epoch": 1.1665586519766689, "grad_norm": 3.5498283019941668, "learning_rate": 6.777957004587331e-07, "loss": 0.7322, "step": 450 },
    { "epoch": 1.1665586519766689, "eval_loss": 0.7836564779281616, "eval_runtime": 190.524, "eval_samples_per_second": 57.573, "eval_steps_per_second": 0.903, "step": 450 },
    { "epoch": 1.1795204147764096, "grad_norm": 3.4407614110968274, "learning_rate": 6.713681165059271e-07, "loss": 0.731, "step": 455 },
    { "epoch": 1.1924821775761503, "grad_norm": 3.606482531048048, "learning_rate": 6.649082808990585e-07, "loss": 0.7263, "step": 460 },
    { "epoch": 1.2054439403758912, "grad_norm": 3.4160389968965834, "learning_rate": 6.584174093857675e-07, "loss": 0.7138, "step": 465 },
    { "epoch": 1.218405703175632, "grad_norm": 3.6414465107657885, "learning_rate": 6.518967235546841e-07, "loss": 0.7131, "step": 470 },
    { "epoch": 1.2313674659753726, "grad_norm": 3.5928947738604835, "learning_rate": 6.453474506055227e-07, "loss": 0.7056, "step": 475 },
    { "epoch": 1.2443292287751135, "grad_norm": 3.3876054178886337, "learning_rate": 6.387708231181228e-07, "loss": 0.7142, "step": 480 },
    { "epoch": 1.2572909915748542, "grad_norm": 3.9025269357210606, "learning_rate": 6.321680788204757e-07, "loss": 0.7295, "step": 485 },
    { "epoch": 1.270252754374595, "grad_norm": 3.6340219528437747, "learning_rate": 6.255404603557833e-07, "loss": 0.6984, "step": 490 },
    { "epoch": 1.2832145171743357, "grad_norm": 3.441153025186208, "learning_rate": 6.188892150485902e-07, "loss": 0.7286, "step": 495 },
    { "epoch": 1.2961762799740764, "grad_norm": 3.509621524777449, "learning_rate": 6.122155946700381e-07, "loss": 0.7214, "step": 500 },
    { "epoch": 1.2961762799740764, "eval_loss": 0.7790202498435974, "eval_runtime": 190.1439, "eval_samples_per_second": 57.688, "eval_steps_per_second": 0.905, "step": 500 },
    { "epoch": 1.3091380427738173, "grad_norm": 3.9718306817617957, "learning_rate": 6.055208552022787e-07, "loss": 0.7183, "step": 505 },
    { "epoch": 1.322099805573558, "grad_norm": 3.6250018779596203, "learning_rate": 5.988062566020986e-07, "loss": 0.7188, "step": 510 },
    { "epoch": 1.3350615683732987, "grad_norm": 3.58876192132372, "learning_rate": 5.920730625637933e-07, "loss": 0.6976, "step": 515 },
    { "epoch": 1.3480233311730396, "grad_norm": 3.463594237695507, "learning_rate": 5.85322540281338e-07, "loss": 0.7274, "step": 520 },
    { "epoch": 1.3609850939727803, "grad_norm": 3.755952042482129, "learning_rate": 5.785559602099018e-07, "loss": 0.7084, "step": 525 },
    { "epoch": 1.373946856772521, "grad_norm": 3.306784860944235, "learning_rate": 5.717745958267459e-07, "loss": 0.7119, "step": 530 },
    { "epoch": 1.3869086195722617, "grad_norm": 3.534119314182442, "learning_rate": 5.649797233915538e-07, "loss": 0.7086, "step": 535 },
    { "epoch": 1.3998703823720027, "grad_norm": 3.236567266341926, "learning_rate": 5.581726217062386e-07, "loss": 0.716, "step": 540 },
    { "epoch": 1.4128321451717434, "grad_norm": 3.4785698979562456, "learning_rate": 5.513545718742701e-07, "loss": 0.7071, "step": 545 },
    { "epoch": 1.425793907971484, "grad_norm": 3.675603397627949, "learning_rate": 5.445268570595708e-07, "loss": 0.6936, "step": 550 },
    { "epoch": 1.425793907971484, "eval_loss": 0.7752651572227478, "eval_runtime": 190.3759, "eval_samples_per_second": 57.618, "eval_steps_per_second": 0.903, "step": 550 },
    { "epoch": 1.4387556707712248, "grad_norm": 3.399382435225056, "learning_rate": 5.376907622450228e-07, "loss": 0.7155, "step": 555 },
    { "epoch": 1.4517174335709657, "grad_norm": 3.564747777066771, "learning_rate": 5.308475739906328e-07, "loss": 0.7086, "step": 560 },
    { "epoch": 1.4646791963707064, "grad_norm": 3.346047715273763, "learning_rate": 5.239985801913999e-07, "loss": 0.7136, "step": 565 },
    { "epoch": 1.4776409591704471, "grad_norm": 3.716468641603961, "learning_rate": 5.171450698349329e-07, "loss": 0.7208, "step": 570 },
    { "epoch": 1.490602721970188, "grad_norm": 3.8457195513896703, "learning_rate": 5.102883327588608e-07, "loss": 0.7055, "step": 575 },
    { "epoch": 1.5035644847699285, "grad_norm": 3.6398948460452325, "learning_rate": 5.034296594080848e-07, "loss": 0.6824, "step": 580 },
    { "epoch": 1.5165262475696695, "grad_norm": 3.3427427312285425, "learning_rate": 4.965703405919153e-07, "loss": 0.733, "step": 585 },
    { "epoch": 1.5294880103694104, "grad_norm": 3.7140860027489504, "learning_rate": 4.897116672411394e-07, "loss": 0.7011, "step": 590 },
    { "epoch": 1.5424497731691509, "grad_norm": 3.810495577054968, "learning_rate": 4.828549301650673e-07, "loss": 0.7216, "step": 595 },
    { "epoch": 1.5554115359688918, "grad_norm": 3.4920510717873925, "learning_rate": 4.760014198086001e-07, "loss": 0.7046, "step": 600 },
    { "epoch": 1.5554115359688918, "eval_loss": 0.7716971039772034, "eval_runtime": 190.4272, "eval_samples_per_second": 57.602, "eval_steps_per_second": 0.903, "step": 600 },
    { "epoch": 1.5683732987686325, "grad_norm": 3.7379962063873244, "learning_rate": 4.691524260093672e-07, "loss": 0.7177, "step": 605 },
    { "epoch": 1.5813350615683732, "grad_norm": 3.5163342715426493, "learning_rate": 4.6230923775497714e-07, "loss": 0.698, "step": 610 },
    { "epoch": 1.5942968243681142, "grad_norm": 3.7752520950220316, "learning_rate": 4.554731429404293e-07, "loss": 0.7068, "step": 615 },
    { "epoch": 1.6072585871678549, "grad_norm": 3.6497035959305637, "learning_rate": 4.486454281257299e-07, "loss": 0.7105, "step": 620 },
    { "epoch": 1.6202203499675956, "grad_norm": 3.3786220834131817, "learning_rate": 4.4182737829376135e-07, "loss": 0.7002, "step": 625 },
    { "epoch": 1.6331821127673365, "grad_norm": 3.745654045093921, "learning_rate": 4.35020276608446e-07, "loss": 0.7132, "step": 630 },
    { "epoch": 1.646143875567077, "grad_norm": 3.7550095562633294, "learning_rate": 4.2822540417325394e-07, "loss": 0.7239, "step": 635 },
    { "epoch": 1.659105638366818, "grad_norm": 3.6978911774202743, "learning_rate": 4.2144403979009823e-07, "loss": 0.708, "step": 640 },
    { "epoch": 1.6720674011665586, "grad_norm": 3.380536692165867, "learning_rate": 4.1467745971866214e-07, "loss": 0.7152, "step": 645 },
    { "epoch": 1.6850291639662993, "grad_norm": 3.7325575613565922, "learning_rate": 4.0792693743620686e-07, "loss": 0.6967, "step": 650 },
    { "epoch": 1.6850291639662993, "eval_loss": 0.7689746618270874, "eval_runtime": 191.11, "eval_samples_per_second": 57.396, "eval_steps_per_second": 0.9, "step": 650 },
    { "epoch": 1.6979909267660402, "grad_norm": 3.51032707119909, "learning_rate": 4.0119374339790133e-07, "loss": 0.7024, "step": 655 },
    { "epoch": 1.710952689565781, "grad_norm": 3.6309443687197933, "learning_rate": 3.944791447977213e-07, "loss": 0.7242, "step": 660 },
    { "epoch": 1.7239144523655217, "grad_norm": 3.4720213982057175, "learning_rate": 3.87784405329962e-07, "loss": 0.7089, "step": 665 },
    { "epoch": 1.7368762151652626, "grad_norm": 3.503078080597499, "learning_rate": 3.8111078495140973e-07, "loss": 0.7219, "step": 670 },
    { "epoch": 1.7498379779650033, "grad_norm": 3.801326705711802, "learning_rate": 3.7445953964421684e-07, "loss": 0.7078, "step": 675 },
    { "epoch": 1.762799740764744, "grad_norm": 3.673701397433746, "learning_rate": 3.678319211795242e-07, "loss": 0.7263, "step": 680 },
    { "epoch": 1.775761503564485, "grad_norm": 3.4237663715763023, "learning_rate": 3.6122917688187717e-07, "loss": 0.6994, "step": 685 },
    { "epoch": 1.7887232663642254, "grad_norm": 3.705157971233802, "learning_rate": 3.546525493944773e-07, "loss": 0.7112, "step": 690 },
    { "epoch": 1.8016850291639663, "grad_norm": 3.785127652648907, "learning_rate": 3.48103276445316e-07, "loss": 0.7145, "step": 695 },
    { "epoch": 1.814646791963707, "grad_norm": 3.3911683616136177, "learning_rate": 3.4158259061423255e-07, "loss": 0.7197, "step": 700 },
    { "epoch": 1.814646791963707, "eval_loss": 0.765801727771759, "eval_runtime": 190.378, "eval_samples_per_second": 57.617, "eval_steps_per_second": 0.903, "step": 700 },
    { "epoch": 1.8276085547634477, "grad_norm": 3.3840783324475994, "learning_rate": 3.3509171910094156e-07, "loss": 0.7008, "step": 705 },
    { "epoch": 1.8405703175631887, "grad_norm": 3.612580310866905, "learning_rate": 3.286318834940729e-07, "loss": 0.7101, "step": 710 },
    { "epoch": 1.8535320803629294, "grad_norm": 3.7400467933207557, "learning_rate": 3.2220429954126686e-07, "loss": 0.6797, "step": 715 },
    { "epoch": 1.86649384316267, "grad_norm": 3.6560156110852464, "learning_rate": 3.158101769203698e-07, "loss": 0.7056, "step": 720 },
    { "epoch": 1.879455605962411, "grad_norm": 3.707238700659647, "learning_rate": 3.0945071901177145e-07, "loss": 0.7093, "step": 725 },
    { "epoch": 1.8924173687621515, "grad_norm": 3.606777649061402, "learning_rate": 3.031271226719271e-07, "loss": 0.7084, "step": 730 },
    { "epoch": 1.9053791315618924, "grad_norm": 3.582291971931833, "learning_rate": 2.968405780081084e-07, "loss": 0.7072, "step": 735 },
    { "epoch": 1.9183408943616331, "grad_norm": 3.6451496181338103, "learning_rate": 2.905922681544238e-07, "loss": 0.6969, "step": 740 },
    { "epoch": 1.9313026571613738, "grad_norm": 3.4586471118745656, "learning_rate": 2.8438336904915184e-07, "loss": 0.6882, "step": 745 },
    { "epoch": 1.9442644199611148, "grad_norm": 3.7194825643308853, "learning_rate": 2.7821504921342774e-07, "loss": 0.704, "step": 750 },
    { "epoch": 1.9442644199611148, "eval_loss": 0.7632837295532227, "eval_runtime": 190.2948, "eval_samples_per_second": 57.642, "eval_steps_per_second": 0.904, "step": 750 },
    { "epoch": 1.9572261827608555, "grad_norm": 3.5452530896647696, "learning_rate": 2.7208846953132683e-07, "loss": 0.6927, "step": 755 },
    { "epoch": 1.9701879455605962, "grad_norm": 3.5705371811039504, "learning_rate": 2.66004783031385e-07, "loss": 0.7037, "step": 760 },
    { "epoch": 1.983149708360337, "grad_norm": 3.4298716866923624, "learning_rate": 2.599651346695979e-07, "loss": 0.6926, "step": 765 },
    { "epoch": 1.9961114711600778, "grad_norm": 3.7912871601195284, "learning_rate": 2.539706611139385e-07, "loss": 0.7111, "step": 770 },
    { "epoch": 2.0090732339598185, "grad_norm": 3.769564370048527, "learning_rate": 2.480224905304352e-07, "loss": 0.6801, "step": 775 },
    { "epoch": 2.0220349967595594, "grad_norm": 3.418564384607543, "learning_rate": 2.4212174237085005e-07, "loss": 0.663, "step": 780 },
    { "epoch": 2.0349967595593, "grad_norm": 3.730605140988728, "learning_rate": 2.3626952716199644e-07, "loss": 0.6784, "step": 785 },
    { "epoch": 2.047958522359041, "grad_norm": 3.7129740066342882, "learning_rate": 2.3046694629673712e-07, "loss": 0.6293, "step": 790 },
    { "epoch": 2.060920285158782, "grad_norm": 3.6275068157375108, "learning_rate": 2.247150918267008e-07, "loss": 0.6489, "step": 795 },
    { "epoch": 2.0738820479585223, "grad_norm": 3.625813889475341, "learning_rate": 2.1901504625675688e-07, "loss": 0.6546, "step": 800 },
    { "epoch": 2.0738820479585223, "eval_loss": 0.7677435874938965, "eval_runtime": 190.4024, "eval_samples_per_second": 57.61, "eval_steps_per_second": 0.903, "step": 800 },
    { "epoch": 2.086843810758263, "grad_norm": 3.4182137647327764, "learning_rate": 2.1336788234128729e-07, "loss": 0.6676, "step": 805 },
    { "epoch": 2.0998055735580037, "grad_norm": 3.6556183671319293, "learning_rate": 2.0777466288229205e-07, "loss": 0.6629, "step": 810 },
    { "epoch": 2.1127673363577446, "grad_norm": 3.744401914160934, "learning_rate": 2.0223644052937028e-07, "loss": 0.6418, "step": 815 },
    { "epoch": 2.1257290991574855, "grad_norm": 3.865359361945009, "learning_rate": 1.9675425758160924e-07, "loss": 0.6531, "step": 820 },
    { "epoch": 2.138690861957226, "grad_norm": 3.69643616461443, "learning_rate": 1.9132914579142338e-07, "loss": 0.6346, "step": 825 },
    { "epoch": 2.151652624756967, "grad_norm": 3.7535585023145797, "learning_rate": 1.8596212617037693e-07, "loss": 0.6654, "step": 830 },
    { "epoch": 2.164614387556708, "grad_norm": 4.0159793685962475, "learning_rate": 1.8065420879702887e-07, "loss": 0.6649, "step": 835 },
    { "epoch": 2.1775761503564484, "grad_norm": 3.644285842038128, "learning_rate": 1.7540639262683487e-07, "loss": 0.6433, "step": 840 },
    { "epoch": 2.1905379131561893, "grad_norm": 3.9056650075753665, "learning_rate": 1.70219665304143e-07, "loss": 0.6769, "step": 845 },
    { "epoch": 2.20349967595593, "grad_norm": 3.7184862818541933, "learning_rate": 1.6509500297631785e-07, "loss": 0.651, "step": 850 },
    { "epoch": 2.20349967595593, "eval_loss": 0.7672791481018066, "eval_runtime": 190.5782, "eval_samples_per_second": 57.556, "eval_steps_per_second": 0.903, "step": 850 },
    { "epoch": 2.2164614387556707, "grad_norm": 3.8299853960357906, "learning_rate": 1.6003337011002927e-07, "loss": 0.6564, "step": 855 },
    { "epoch": 2.2294232015554116, "grad_norm": 3.8417509954938085, "learning_rate": 1.5503571930973785e-07, "loss": 0.6426, "step": 860 },
    { "epoch": 2.242384964355152, "grad_norm": 3.647265480928013, "learning_rate": 1.5010299113841397e-07, "loss": 0.6518, "step": 865 },
    { "epoch": 2.255346727154893, "grad_norm": 3.5486365560504827, "learning_rate": 1.4523611394052355e-07, "loss": 0.6475, "step": 870 },
    { "epoch": 2.268308489954634, "grad_norm": 3.4682735371792526, "learning_rate": 1.4043600366731213e-07, "loss": 0.662, "step": 875 },
    { "epoch": 2.2812702527543745, "grad_norm": 3.824875125016563, "learning_rate": 1.3570356370442189e-07, "loss": 0.6552, "step": 880 },
    { "epoch": 2.2942320155541154, "grad_norm": 3.828027807398829, "learning_rate": 1.3103968470187382e-07, "loss": 0.6476, "step": 885 },
    { "epoch": 2.3071937783538563, "grad_norm": 3.842354090272816, "learning_rate": 1.2644524440644627e-07, "loss": 0.6685, "step": 890 },
    { "epoch": 2.320155541153597, "grad_norm": 3.6920011302134044, "learning_rate": 1.2192110749648232e-07, "loss": 0.6655, "step": 895 },
    { "epoch": 2.3331173039533377, "grad_norm": 3.7425896847607256, "learning_rate": 1.1746812541915607e-07, "loss": 0.6601, "step": 900 },
    { "epoch": 2.3331173039533377, "eval_loss": 0.7666835188865662, "eval_runtime": 190.628, "eval_samples_per_second": 57.541, "eval_steps_per_second": 0.902, "step": 900 },
    { "epoch": 2.346079066753078, "grad_norm": 3.9070327756742107, "learning_rate": 1.1308713623022986e-07, "loss": 0.6627, "step": 905 },
    { "epoch": 2.359040829552819, "grad_norm": 3.7802235911895217, "learning_rate": 1.0877896443633117e-07, "loss": 0.6593, "step": 910 },
    { "epoch": 2.37200259235256, "grad_norm": 3.61433280563944, "learning_rate": 1.045444208397791e-07, "loss": 0.6709, "step": 915 },
    { "epoch": 2.3849643551523005, "grad_norm": 3.972094909530566, "learning_rate": 1.0038430238599154e-07, "loss": 0.651, "step": 920 },
    { "epoch": 2.3979261179520415, "grad_norm": 3.796273978107305, "learning_rate": 9.629939201349852e-08, "loss": 0.6588, "step": 925 },
    { "epoch": 2.4108878807517824, "grad_norm": 3.7293416826903525, "learning_rate": 9.229045850659251e-08, "loss": 0.656, "step": 930 },
    { "epoch": 2.423849643551523, "grad_norm": 3.6150556912476945, "learning_rate": 8.835825635064265e-08, "loss": 0.6603, "step": 935 },
    { "epoch": 2.436811406351264, "grad_norm": 3.5117438967831145, "learning_rate": 8.450352559009949e-08, "loss": 0.6572, "step": 940 },
    { "epoch": 2.4497731691510047, "grad_norm": 3.599439277396364, "learning_rate": 8.072699168921825e-08, "loss": 0.6523, "step": 945 },
    { "epoch": 2.462734931950745, "grad_norm": 3.815395750371357, "learning_rate": 7.70293653955254e-08, "loss": 0.6669, "step": 950 },
    { "epoch": 2.462734931950745, "eval_loss": 0.7661544680595398, "eval_runtime": 190.5379, "eval_samples_per_second": 57.569, "eval_steps_per_second": 0.903, "step": 950 },
    { "epoch": 2.475696694750486, "grad_norm": 3.83907673537174, "learning_rate": 7.341134260605536e-08, "loss": 0.6467, "step": 955 },
    { "epoch": 2.488658457550227, "grad_norm": 3.71914525200344, "learning_rate": 6.987360423638205e-08, "loss": 0.6451, "step": 960 },
    { "epoch": 2.5016202203499676, "grad_norm": 3.8744678609767926, "learning_rate": 6.641681609246979e-08, "loss": 0.6499, "step": 965 },
    { "epoch": 2.5145819831497085, "grad_norm": 3.839049543501993, "learning_rate": 6.304162874536795e-08, "loss": 0.6381, "step": 970 },
    { "epoch": 2.527543745949449, "grad_norm": 3.890978467270095, "learning_rate": 5.974867740877281e-08, "loss": 0.6611, "step": 975 },
    { "epoch": 2.54050550874919, "grad_norm": 3.655850976914982, "learning_rate": 5.653858181947979e-08, "loss": 0.6511, "step": 980 },
    { "epoch": 2.5534672715489304, "grad_norm": 3.553408249450878, "learning_rate": 5.341194612074823e-08, "loss": 0.645, "step": 985 },
    { "epoch": 2.5664290343486713, "grad_norm": 3.772171071477516, "learning_rate": 5.0369358748601096e-08, "loss": 0.664, "step": 990 },
    { "epoch": 2.5793907971484122, "grad_norm": 3.9844066962823783, "learning_rate": 4.74113923210806e-08, "loss": 0.6768, "step": 995 },
    { "epoch": 2.5923525599481527, "grad_norm": 3.709224867062063, "learning_rate": 4.453860353048111e-08, "loss": 0.6566, "step": 1000 },
    { "epoch": 2.5923525599481527, "eval_loss": 0.7657259702682495, "eval_runtime": 190.4841, "eval_samples_per_second": 57.585, "eval_steps_per_second": 0.903, "step": 1000 },
    { "epoch": 2.6053143227478937, "grad_norm": 3.6125710008133014, "learning_rate": 4.1751533038578866e-08, "loss": 0.6588, "step": 1005 },
    { "epoch": 2.6182760855476346, "grad_norm": 3.9281818636928305, "learning_rate": 3.9050705374879086e-08, "loss": 0.6451, "step": 1010 },
    { "epoch": 2.631237848347375, "grad_norm": 3.6915219360218794, "learning_rate": 3.6436628837898773e-08, "loss": 0.6537, "step": 1015 },
    { "epoch": 2.644199611147116, "grad_norm": 3.8956742656352756, "learning_rate": 3.390979539950478e-08, "loss": 0.6646, "step": 1020 },
    { "epoch": 2.657161373946857, "grad_norm": 3.693338526233881, "learning_rate": 3.1470680612323494e-08, "loss": 0.6398, "step": 1025 },
    { "epoch": 2.6701231367465974, "grad_norm": 3.676860096119893, "learning_rate": 2.9119743520242213e-08, "loss": 0.656, "step": 1030 },
    { "epoch": 2.6830848995463383, "grad_norm": 3.839586455095117, "learning_rate": 2.6857426572016007e-08, "loss": 0.6522, "step": 1035 },
    { "epoch": 2.6960466623460793, "grad_norm": 3.7505570827639754, "learning_rate": 2.468415553799874e-08, "loss": 0.661, "step": 1040 },
    { "epoch": 2.7090084251458197, "grad_norm": 3.74016254328062, "learning_rate": 2.2600339430012438e-08, "loss": 0.6543, "step": 1045 },
    { "epoch": 2.7219701879455607, "grad_norm": 4.044963430877048, "learning_rate": 2.0606370424370966e-08, "loss": 0.6654, "step": 1050 },
    { "epoch": 2.7219701879455607, "eval_loss": 0.765480101108551, "eval_runtime": 190.595, "eval_samples_per_second": 57.551, "eval_steps_per_second": 0.902, "step": 1050 },
    { "epoch": 2.7349319507453016, "grad_norm": 3.8050950373574115, "learning_rate": 1.8702623788072024e-08, "loss": 0.6585, "step": 1055 },
    { "epoch": 2.747893713545042, "grad_norm": 3.7424493675710604, "learning_rate": 1.688945780817147e-08, "loss": 0.6765, "step": 1060 },
    { "epoch": 2.760855476344783, "grad_norm": 3.7803862294906496, "learning_rate": 1.516721372435342e-08, "loss": 0.6578, "step": 1065 },
    { "epoch": 2.7738172391445235, "grad_norm": 3.7879361506176332, "learning_rate": 1.3536215664708583e-08, "loss": 0.6681, "step": 1070 },
    { "epoch": 2.7867790019442644, "grad_norm": 3.8430280460507524, "learning_rate": 1.1996770584732919e-08, "loss": 0.6412, "step": 1075 },
    { "epoch": 2.7997407647440054, "grad_norm": 3.741428860483068, "learning_rate": 1.0549168209558312e-08, "loss": 0.6748, "step": 1080 },
    { "epoch": 2.812702527543746, "grad_norm": 3.7122256590189564, "learning_rate": 9.193680979426189e-09, "loss": 0.6433, "step": 1085 },
    { "epoch": 2.8256642903434868, "grad_norm": 3.6981576894413415, "learning_rate": 7.930563998413797e-09, "loss": 0.6621, "step": 1090 },
    { "epoch": 2.8386260531432272, "grad_norm": 3.7162009738597526, "learning_rate": 6.760054986423458e-09, "loss": 0.6623, "step": 1095 },
    { "epoch": 2.851587815942968, "grad_norm": 3.76189814784874, "learning_rate": 5.6823742344433435e-09, "loss": 0.6389, "step": 1100 },
    { "epoch": 2.851587815942968, "eval_loss": 0.7652862668037415, "eval_runtime": 190.4719, "eval_samples_per_second": 57.589, "eval_steps_per_second": 0.903, "step": 1100 },
    { "epoch": 2.864549578742709, "grad_norm": 4.07593882169201, "learning_rate": 4.697724563088645e-09, "loss": 0.6616, "step": 1105 },
    { "epoch": 2.8775113415424496, "grad_norm": 3.95049669925965, "learning_rate": 3.806291284430274e-09, "loss": 0.6678, "step": 1110 },
    { "epoch": 2.8904731043421905, "grad_norm": 3.692127174629793, "learning_rate": 3.008242167119257e-09, "loss": 0.6533, "step": 1115 },
    { "epoch": 2.9034348671419314, "grad_norm": 3.9224928557653254, "learning_rate": 2.303727404812217e-09, "loss": 0.6568, "step": 1120 },
    { "epoch": 2.916396629941672, "grad_norm": 3.700977024408014, "learning_rate": 1.6928795879049828e-09, "loss": 0.658, "step": 1125 },
    { "epoch": 2.929358392741413, "grad_norm": 3.671559049022848, "learning_rate": 1.1758136785788853e-09, "loss": 0.6591, "step": 1130 },
    { "epoch": 2.942320155541154, "grad_norm": 3.9515548315685107, "learning_rate": 7.526269891646175e-10, "loss": 0.6448, "step": 1135 },
    { "epoch": 2.9552819183408943, "grad_norm": 3.6083444577524615, "learning_rate": 4.233991638281642e-10, "loss": 0.6421, "step": 1140 },
    { "epoch": 2.968243681140635, "grad_norm": 3.6842766767841733, "learning_rate": 1.8819216358156865e-10, "loss": 0.6606, "step": 1145 },
    { "epoch": 2.981205443940376, "grad_norm": 4.03690890015612, "learning_rate": 4.7050254621872064e-11, "loss": 0.6574, "step": 1150 },
    { "epoch": 2.981205443940376, "eval_loss": 0.7652673125267029, "eval_runtime": 190.6904, "eval_samples_per_second": 57.523, "eval_steps_per_second": 0.902, "step": 1150 },
    { "epoch": 2.9941672067401166, "grad_norm": 4.036929712704943, "learning_rate": 0.0, "loss": 0.6573, "step": 1155 },
    { "epoch": 2.9941672067401166, "step": 1155, "total_flos": 6810169535102976.0, "train_loss": 0.7440064066932315, "train_runtime": 19885.6035, "train_samples_per_second": 14.893, "train_steps_per_second": 0.058 }
  ],
  "logging_steps": 5,
  "max_steps": 1155,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6810169535102976.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}
|
|