{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99492385786802, "eval_steps": 50, "global_step": 885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01692047377326565, "grad_norm": 18.89720056709149, "learning_rate": 5e-07, "loss": 1.7331, "step": 5 }, { "epoch": 0.0338409475465313, "grad_norm": 12.380230951490994, "learning_rate": 1e-06, "loss": 1.5909, "step": 10 }, { "epoch": 0.050761421319796954, "grad_norm": 7.703103534110165, "learning_rate": 9.99919433964529e-07, "loss": 1.3028, "step": 15 }, { "epoch": 0.0676818950930626, "grad_norm": 4.2630631588076096, "learning_rate": 9.996777618216605e-07, "loss": 1.1478, "step": 20 }, { "epoch": 0.08460236886632826, "grad_norm": 3.9048597217577643, "learning_rate": 9.992750614536604e-07, "loss": 1.0675, "step": 25 }, { "epoch": 0.10152284263959391, "grad_norm": 4.006647373245602, "learning_rate": 9.98711462636417e-07, "loss": 1.0324, "step": 30 }, { "epoch": 0.11844331641285956, "grad_norm": 3.5760575012528384, "learning_rate": 9.979871469976195e-07, "loss": 0.9783, "step": 35 }, { "epoch": 0.1353637901861252, "grad_norm": 3.7598475655002166, "learning_rate": 9.971023479582256e-07, "loss": 0.9717, "step": 40 }, { "epoch": 0.15228426395939088, "grad_norm": 3.841087015616369, "learning_rate": 9.960573506572389e-07, "loss": 0.9541, "step": 45 }, { "epoch": 0.1692047377326565, "grad_norm": 3.3432807772469215, "learning_rate": 9.948524918598173e-07, "loss": 0.9459, "step": 50 }, { "epoch": 0.1692047377326565, "eval_loss": 0.9274849891662598, "eval_runtime": 148.0955, "eval_samples_per_second": 56.72, "eval_steps_per_second": 0.891, "step": 50 }, { "epoch": 0.18612521150592218, "grad_norm": 3.3571009660316427, "learning_rate": 9.934881598487478e-07, "loss": 0.9205, "step": 55 }, { "epoch": 0.20304568527918782, "grad_norm": 3.658058336105581, "learning_rate": 9.919647942993147e-07, "loss": 0.9129, "step": 60 }, { "epoch": 0.21996615905245348, "grad_norm": 3.5170771780637082, "learning_rate": 9.9028288613761e-07, "loss": 0.893, "step": 65 }, { "epoch": 0.23688663282571912, "grad_norm": 3.4488412657588325, "learning_rate": 9.884429773823236e-07, "loss": 0.9107, "step": 70 }, { "epoch": 0.25380710659898476, "grad_norm": 3.5097568943852737, "learning_rate": 9.864456609700723e-07, "loss": 0.8996, "step": 75 }, { "epoch": 0.2707275803722504, "grad_norm": 3.3607900424314705, "learning_rate": 9.842915805643156e-07, "loss": 0.897, "step": 80 }, { "epoch": 0.2876480541455161, "grad_norm": 3.5364404109551226, "learning_rate": 9.819814303479267e-07, "loss": 0.8995, "step": 85 }, { "epoch": 0.30456852791878175, "grad_norm": 3.6326997998059873, "learning_rate": 9.795159547994828e-07, "loss": 0.8845, "step": 90 }, { "epoch": 0.32148900169204736, "grad_norm": 3.342400707382201, "learning_rate": 9.76895948453346e-07, "loss": 0.8761, "step": 95 }, { "epoch": 0.338409475465313, "grad_norm": 3.5106421873680307, "learning_rate": 9.74122255643613e-07, "loss": 0.8992, "step": 100 }, { "epoch": 0.338409475465313, "eval_loss": 0.8709495663642883, "eval_runtime": 146.2171, "eval_samples_per_second": 57.449, "eval_steps_per_second": 0.903, "step": 100 }, { "epoch": 0.3553299492385787, "grad_norm": 3.412793355215337, "learning_rate": 9.711957702320174e-07, "loss": 0.8915, "step": 105 }, { "epoch": 0.37225042301184436, "grad_norm": 3.4690823915126363, "learning_rate": 9.681174353198686e-07, "loss": 0.8814, "step": 110 }, { "epoch": 0.38917089678510997, "grad_norm": 3.370071251369415, "learning_rate": 9.648882429441256e-07, "loss": 0.8661, "step": 115 }, { "epoch": 0.40609137055837563, "grad_norm": 3.687286677170038, "learning_rate": 9.615092337576987e-07, "loss": 0.8737, "step": 120 }, { "epoch": 0.4230118443316413, "grad_norm": 3.4414847568273346, "learning_rate": 9.579814966940833e-07, "loss": 0.8585, "step": 125 }, { "epoch": 0.43993231810490696, "grad_norm": 3.555637933684099, "learning_rate": 9.543061686164372e-07, "loss": 0.8617, "step": 130 }, { "epoch": 0.45685279187817257, "grad_norm": 3.364839125431491, "learning_rate": 9.504844339512094e-07, "loss": 0.843, "step": 135 }, { "epoch": 0.47377326565143824, "grad_norm": 3.5040756898524745, "learning_rate": 9.465175243064428e-07, "loss": 0.864, "step": 140 }, { "epoch": 0.4906937394247039, "grad_norm": 3.4002571057955366, "learning_rate": 9.424067180748691e-07, "loss": 0.853, "step": 145 }, { "epoch": 0.5076142131979695, "grad_norm": 3.5879355202073193, "learning_rate": 9.381533400219317e-07, "loss": 0.8515, "step": 150 }, { "epoch": 0.5076142131979695, "eval_loss": 0.844601571559906, "eval_runtime": 146.2138, "eval_samples_per_second": 57.45, "eval_steps_per_second": 0.903, "step": 150 }, { "epoch": 0.5245346869712352, "grad_norm": 3.3747757696246925, "learning_rate": 9.337587608588588e-07, "loss": 0.8511, "step": 155 }, { "epoch": 0.5414551607445008, "grad_norm": 3.5565216959326644, "learning_rate": 9.29224396800933e-07, "loss": 0.8405, "step": 160 }, { "epoch": 0.5583756345177665, "grad_norm": 4.512291872844238, "learning_rate": 9.245517091110968e-07, "loss": 0.8318, "step": 165 }, { "epoch": 0.5752961082910322, "grad_norm": 3.332018798468554, "learning_rate": 9.197422036290386e-07, "loss": 0.8507, "step": 170 }, { "epoch": 0.5922165820642978, "grad_norm": 3.5149539879760603, "learning_rate": 9.147974302859156e-07, "loss": 0.8679, "step": 175 }, { "epoch": 0.6091370558375635, "grad_norm": 3.5792424943759924, "learning_rate": 9.097189826048659e-07, "loss": 0.8422, "step": 180 }, { "epoch": 0.626057529610829, "grad_norm": 3.6495743743155806, "learning_rate": 9.045084971874737e-07, "loss": 0.8392, "step": 185 }, { "epoch": 0.6429780033840947, "grad_norm": 3.4613931720229836, "learning_rate": 8.991676531863507e-07, "loss": 0.8423, "step": 190 }, { "epoch": 0.6598984771573604, "grad_norm": 3.552450649531331, "learning_rate": 8.93698171764006e-07, "loss": 0.838, "step": 195 }, { "epoch": 0.676818950930626, "grad_norm": 3.4396681594412057, "learning_rate": 8.881018155381765e-07, "loss": 0.8498, "step": 200 }, { "epoch": 0.676818950930626, "eval_loss": 0.8278397917747498, "eval_runtime": 146.1617, "eval_samples_per_second": 57.471, "eval_steps_per_second": 0.903, "step": 200 }, { "epoch": 0.6937394247038917, "grad_norm": 3.395442461461206, "learning_rate": 8.823803880137992e-07, "loss": 0.8466, "step": 205 }, { "epoch": 0.7106598984771574, "grad_norm": 3.3738365361465825, "learning_rate": 8.765357330018055e-07, "loss": 0.8115, "step": 210 }, { "epoch": 0.727580372250423, "grad_norm": 3.492914457997946, "learning_rate": 8.705697340249274e-07, "loss": 0.8325, "step": 215 }, { "epoch": 0.7445008460236887, "grad_norm": 3.8131605424361084, "learning_rate": 8.644843137107057e-07, "loss": 0.8204, "step": 220 }, { "epoch": 0.7614213197969543, "grad_norm": 3.243553096025284, "learning_rate": 8.58281433171896e-07, "loss": 0.8196, "step": 225 }, { "epoch": 0.7783417935702199, "grad_norm": 3.6561787618198625, "learning_rate": 8.519630913744724e-07, "loss": 0.8173, "step": 230 }, { "epoch": 0.7952622673434856, "grad_norm": 3.411165657715004, "learning_rate": 8.455313244934324e-07, "loss": 0.8248, "step": 235 }, { "epoch": 0.8121827411167513, "grad_norm": 3.302331738504403, "learning_rate": 8.389882052566105e-07, "loss": 0.8138, "step": 240 }, { "epoch": 0.8291032148900169, "grad_norm": 3.8604845681587605, "learning_rate": 8.323358422767128e-07, "loss": 0.8301, "step": 245 }, { "epoch": 0.8460236886632826, "grad_norm": 3.3819331633437617, "learning_rate": 8.255763793717867e-07, "loss": 0.8105, "step": 250 }, { "epoch": 0.8460236886632826, "eval_loss": 0.815843939781189, "eval_runtime": 146.2558, "eval_samples_per_second": 57.434, "eval_steps_per_second": 0.903, "step": 250 }, { "epoch": 0.8629441624365483, "grad_norm": 3.693173177209318, "learning_rate": 8.187119948743449e-07, "loss": 0.8241, "step": 255 }, { "epoch": 0.8798646362098139, "grad_norm": 3.557032123737138, "learning_rate": 8.117449009293668e-07, "loss": 0.8147, "step": 260 }, { "epoch": 0.8967851099830795, "grad_norm": 3.681990028371156, "learning_rate": 8.046773427814041e-07, "loss": 0.8281, "step": 265 }, { "epoch": 0.9137055837563451, "grad_norm": 3.472801541082773, "learning_rate": 7.975115980510185e-07, "loss": 0.8243, "step": 270 }, { "epoch": 0.9306260575296108, "grad_norm": 3.421718448250381, "learning_rate": 7.902499760007867e-07, "loss": 0.8296, "step": 275 }, { "epoch": 0.9475465313028765, "grad_norm": 3.3839679140101864, "learning_rate": 7.828948167911073e-07, "loss": 0.8113, "step": 280 }, { "epoch": 0.9644670050761421, "grad_norm": 3.368606848173354, "learning_rate": 7.754484907260512e-07, "loss": 0.8051, "step": 285 }, { "epoch": 0.9813874788494078, "grad_norm": 3.6569030875938027, "learning_rate": 7.679133974894982e-07, "loss": 0.8208, "step": 290 }, { "epoch": 0.9983079526226735, "grad_norm": 3.425142212232347, "learning_rate": 7.602919653718043e-07, "loss": 0.8058, "step": 295 }, { "epoch": 1.015228426395939, "grad_norm": 3.456597451379273, "learning_rate": 7.525866504872506e-07, "loss": 0.7739, "step": 300 }, { "epoch": 1.015228426395939, "eval_loss": 0.8076632022857666, "eval_runtime": 146.1422, "eval_samples_per_second": 57.478, "eval_steps_per_second": 0.903, "step": 300 }, { "epoch": 1.0321489001692048, "grad_norm": 3.4266068907750933, "learning_rate": 7.447999359825262e-07, "loss": 0.7588, "step": 305 }, { "epoch": 1.0490693739424704, "grad_norm": 3.4868988018143354, "learning_rate": 7.369343312364993e-07, "loss": 0.7289, "step": 310 }, { "epoch": 1.0659898477157361, "grad_norm": 3.4449977296428105, "learning_rate": 7.289923710515338e-07, "loss": 0.7482, "step": 315 }, { "epoch": 1.0829103214890017, "grad_norm": 3.658957259722615, "learning_rate": 7.209766148366134e-07, "loss": 0.7309, "step": 320 }, { "epoch": 1.0998307952622675, "grad_norm": 3.359992422940204, "learning_rate": 7.128896457825363e-07, "loss": 0.7371, "step": 325 }, { "epoch": 1.116751269035533, "grad_norm": 3.7951723668245902, "learning_rate": 7.047340700294453e-07, "loss": 0.75, "step": 330 }, { "epoch": 1.1336717428087986, "grad_norm": 3.5347910519960837, "learning_rate": 6.965125158269618e-07, "loss": 0.7582, "step": 335 }, { "epoch": 1.1505922165820643, "grad_norm": 3.5966614567208928, "learning_rate": 6.882276326871959e-07, "loss": 0.7455, "step": 340 }, { "epoch": 1.16751269035533, "grad_norm": 3.488331150730137, "learning_rate": 6.798820905309035e-07, "loss": 0.7327, "step": 345 }, { "epoch": 1.1844331641285957, "grad_norm": 3.5101724849934444, "learning_rate": 6.714785788270657e-07, "loss": 0.7286, "step": 350 }, { "epoch": 1.1844331641285957, "eval_loss": 0.8026402592658997, "eval_runtime": 146.1899, "eval_samples_per_second": 57.459, "eval_steps_per_second": 0.903, "step": 350 }, { "epoch": 1.2013536379018612, "grad_norm": 3.6348180590276526, "learning_rate": 6.630198057261709e-07, "loss": 0.751, "step": 355 }, { "epoch": 1.218274111675127, "grad_norm": 3.7503514910524793, "learning_rate": 6.545084971874736e-07, "loss": 0.7379, "step": 360 }, { "epoch": 1.2351945854483926, "grad_norm": 3.5989366875197244, "learning_rate": 6.459473961005168e-07, "loss": 0.7451, "step": 365 }, { "epoch": 1.252115059221658, "grad_norm": 3.522773718959504, "learning_rate": 6.373392614011951e-07, "loss": 0.745, "step": 370 }, { "epoch": 1.2690355329949239, "grad_norm": 3.44987123406693, "learning_rate": 6.286868671826511e-07, "loss": 0.7508, "step": 375 }, { "epoch": 1.2859560067681894, "grad_norm": 4.122111211031723, "learning_rate": 6.199930018012829e-07, "loss": 0.7226, "step": 380 }, { "epoch": 1.3028764805414552, "grad_norm": 3.73599818619215, "learning_rate": 6.112604669781572e-07, "loss": 0.7295, "step": 385 }, { "epoch": 1.3197969543147208, "grad_norm": 3.529190177869209, "learning_rate": 6.024920768961152e-07, "loss": 0.7356, "step": 390 }, { "epoch": 1.3367174280879865, "grad_norm": 3.461330446213272, "learning_rate": 5.936906572928624e-07, "loss": 0.7556, "step": 395 }, { "epoch": 1.353637901861252, "grad_norm": 3.571724685541136, "learning_rate": 5.848590445503344e-07, "loss": 0.7582, "step": 400 }, { "epoch": 1.353637901861252, "eval_loss": 0.7977383732795715, "eval_runtime": 146.2021, "eval_samples_per_second": 57.455, "eval_steps_per_second": 0.903, "step": 400 }, { "epoch": 1.3705583756345177, "grad_norm": 3.6572559945618304, "learning_rate": 5.760000847806337e-07, "loss": 0.7725, "step": 405 }, { "epoch": 1.3874788494077834, "grad_norm": 3.697093126496799, "learning_rate": 5.671166329088277e-07, "loss": 0.7514, "step": 410 }, { "epoch": 1.404399323181049, "grad_norm": 3.836867403357349, "learning_rate": 5.582115517529114e-07, "loss": 0.7483, "step": 415 }, { "epoch": 1.4213197969543148, "grad_norm": 3.4989314616970764, "learning_rate": 5.492877111012218e-07, "loss": 0.7557, "step": 420 }, { "epoch": 1.4382402707275803, "grad_norm": 3.597183035056808, "learning_rate": 5.403479867876087e-07, "loss": 0.7436, "step": 425 }, { "epoch": 1.455160744500846, "grad_norm": 3.7802523149502734, "learning_rate": 5.313952597646567e-07, "loss": 0.7424, "step": 430 }, { "epoch": 1.4720812182741116, "grad_norm": 3.5635818578964344, "learning_rate": 5.224324151752575e-07, "loss": 0.7271, "step": 435 }, { "epoch": 1.4890016920473772, "grad_norm": 3.577963950120742, "learning_rate": 5.134623414228315e-07, "loss": 0.7609, "step": 440 }, { "epoch": 1.505922165820643, "grad_norm": 3.54002403050294, "learning_rate": 5.044879292404989e-07, "loss": 0.7734, "step": 445 }, { "epoch": 1.5228426395939088, "grad_norm": 3.6965077572035665, "learning_rate": 4.95512070759501e-07, "loss": 0.7386, "step": 450 }, { "epoch": 1.5228426395939088, "eval_loss": 0.7924867272377014, "eval_runtime": 146.1132, "eval_samples_per_second": 57.49, "eval_steps_per_second": 0.903, "step": 450 }, { "epoch": 1.5397631133671743, "grad_norm": 3.731715501345578, "learning_rate": 4.865376585771687e-07, "loss": 0.7393, "step": 455 }, { "epoch": 1.5566835871404399, "grad_norm": 3.5303543042860452, "learning_rate": 4.775675848247427e-07, "loss": 0.7362, "step": 460 }, { "epoch": 1.5736040609137056, "grad_norm": 3.6723372038452693, "learning_rate": 4.686047402353433e-07, "loss": 0.7356, "step": 465 }, { "epoch": 1.5905245346869712, "grad_norm": 3.3963951427952694, "learning_rate": 4.596520132123914e-07, "loss": 0.7284, "step": 470 }, { "epoch": 1.6074450084602367, "grad_norm": 3.654129078679521, "learning_rate": 4.507122888987782e-07, "loss": 0.7504, "step": 475 }, { "epoch": 1.6243654822335025, "grad_norm": 3.5729221837567438, "learning_rate": 4.417884482470886e-07, "loss": 0.7505, "step": 480 }, { "epoch": 1.6412859560067683, "grad_norm": 3.7212291449669457, "learning_rate": 4.328833670911724e-07, "loss": 0.7426, "step": 485 }, { "epoch": 1.6582064297800339, "grad_norm": 3.6827531820047894, "learning_rate": 4.239999152193664e-07, "loss": 0.7299, "step": 490 }, { "epoch": 1.6751269035532994, "grad_norm": 3.5324806419119805, "learning_rate": 4.1514095544966557e-07, "loss": 0.7259, "step": 495 }, { "epoch": 1.6920473773265652, "grad_norm": 3.3737883396364463, "learning_rate": 4.0630934270713755e-07, "loss": 0.7479, "step": 500 }, { "epoch": 1.6920473773265652, "eval_loss": 0.7887324690818787, "eval_runtime": 146.106, "eval_samples_per_second": 57.493, "eval_steps_per_second": 0.903, "step": 500 }, { "epoch": 1.708967851099831, "grad_norm": 3.645393673633237, "learning_rate": 3.9750792310388483e-07, "loss": 0.7429, "step": 505 }, { "epoch": 1.7258883248730963, "grad_norm": 3.826381146162921, "learning_rate": 3.8873953302184283e-07, "loss": 0.7207, "step": 510 }, { "epoch": 1.742808798646362, "grad_norm": 3.6852464302004786, "learning_rate": 3.80006998198717e-07, "loss": 0.7523, "step": 515 }, { "epoch": 1.7597292724196278, "grad_norm": 3.53598971920285, "learning_rate": 3.713131328173489e-07, "loss": 0.7447, "step": 520 }, { "epoch": 1.7766497461928934, "grad_norm": 3.468275223623456, "learning_rate": 3.62660738598805e-07, "loss": 0.7406, "step": 525 }, { "epoch": 1.793570219966159, "grad_norm": 3.750572464422938, "learning_rate": 3.5405260389948333e-07, "loss": 0.7472, "step": 530 }, { "epoch": 1.8104906937394247, "grad_norm": 3.804608232839517, "learning_rate": 3.454915028125263e-07, "loss": 0.7269, "step": 535 }, { "epoch": 1.8274111675126905, "grad_norm": 3.6798567744235626, "learning_rate": 3.369801942738291e-07, "loss": 0.7194, "step": 540 }, { "epoch": 1.844331641285956, "grad_norm": 3.5376430558792604, "learning_rate": 3.285214211729343e-07, "loss": 0.7161, "step": 545 }, { "epoch": 1.8612521150592216, "grad_norm": 3.508281360828586, "learning_rate": 3.2011790946909666e-07, "loss": 0.7374, "step": 550 }, { "epoch": 1.8612521150592216, "eval_loss": 0.7846313118934631, "eval_runtime": 146.1979, "eval_samples_per_second": 57.456, "eval_steps_per_second": 0.903, "step": 550 }, { "epoch": 1.8781725888324874, "grad_norm": 3.6370839999982794, "learning_rate": 3.11772367312804e-07, "loss": 0.7363, "step": 555 }, { "epoch": 1.895093062605753, "grad_norm": 3.6667447920505767, "learning_rate": 3.034874841730382e-07, "loss": 0.7205, "step": 560 }, { "epoch": 1.9120135363790185, "grad_norm": 3.6329553320121333, "learning_rate": 2.9526592997055483e-07, "loss": 0.7405, "step": 565 }, { "epoch": 1.9289340101522843, "grad_norm": 3.625112558207132, "learning_rate": 2.8711035421746363e-07, "loss": 0.7262, "step": 570 }, { "epoch": 1.94585448392555, "grad_norm": 3.593092005119921, "learning_rate": 2.7902338516338674e-07, "loss": 0.7321, "step": 575 }, { "epoch": 1.9627749576988156, "grad_norm": 3.8305447044984087, "learning_rate": 2.7100762894846627e-07, "loss": 0.7318, "step": 580 }, { "epoch": 1.9796954314720812, "grad_norm": 3.7348260486234075, "learning_rate": 2.6306566876350067e-07, "loss": 0.7254, "step": 585 }, { "epoch": 1.996615905245347, "grad_norm": 3.575785573364731, "learning_rate": 2.5520006401747395e-07, "loss": 0.7506, "step": 590 }, { "epoch": 2.0135363790186127, "grad_norm": 3.91770848066435, "learning_rate": 2.474133495127494e-07, "loss": 0.7056, "step": 595 }, { "epoch": 2.030456852791878, "grad_norm": 3.5555997135824184, "learning_rate": 2.3970803462819583e-07, "loss": 0.7075, "step": 600 }, { "epoch": 2.030456852791878, "eval_loss": 0.7871306538581848, "eval_runtime": 146.0491, "eval_samples_per_second": 57.515, "eval_steps_per_second": 0.904, "step": 600 }, { "epoch": 2.047377326565144, "grad_norm": 3.7633202447201377, "learning_rate": 2.3208660251050156e-07, "loss": 0.6935, "step": 605 }, { "epoch": 2.0642978003384096, "grad_norm": 3.671976347169534, "learning_rate": 2.2455150927394878e-07, "loss": 0.6892, "step": 610 }, { "epoch": 2.081218274111675, "grad_norm": 3.7546761949911165, "learning_rate": 2.1710518320889276e-07, "loss": 0.6762, "step": 615 }, { "epoch": 2.0981387478849407, "grad_norm": 3.6517301083486684, "learning_rate": 2.097500239992132e-07, "loss": 0.6789, "step": 620 }, { "epoch": 2.1150592216582065, "grad_norm": 3.8614591719892286, "learning_rate": 2.0248840194898155e-07, "loss": 0.6782, "step": 625 }, { "epoch": 2.1319796954314723, "grad_norm": 3.7437837099098137, "learning_rate": 1.9532265721859597e-07, "loss": 0.6694, "step": 630 }, { "epoch": 2.1489001692047376, "grad_norm": 3.7205556217713482, "learning_rate": 1.8825509907063326e-07, "loss": 0.6848, "step": 635 }, { "epoch": 2.1658206429780034, "grad_norm": 3.957502578909951, "learning_rate": 1.812880051256551e-07, "loss": 0.6742, "step": 640 }, { "epoch": 2.182741116751269, "grad_norm": 3.764749715334981, "learning_rate": 1.744236206282132e-07, "loss": 0.6949, "step": 645 }, { "epoch": 2.199661590524535, "grad_norm": 3.6886209727019326, "learning_rate": 1.6766415772328728e-07, "loss": 0.6818, "step": 650 }, { "epoch": 2.199661590524535, "eval_loss": 0.7874469757080078, "eval_runtime": 146.2957, "eval_samples_per_second": 57.418, "eval_steps_per_second": 0.902, "step": 650 }, { "epoch": 2.2165820642978002, "grad_norm": 3.8767093338924434, "learning_rate": 1.6101179474338966e-07, "loss": 0.698, "step": 655 }, { "epoch": 2.233502538071066, "grad_norm": 3.9008502666439955, "learning_rate": 1.5446867550656767e-07, "loss": 0.6951, "step": 660 }, { "epoch": 2.250423011844332, "grad_norm": 3.600044317004702, "learning_rate": 1.4803690862552753e-07, "loss": 0.6736, "step": 665 }, { "epoch": 2.267343485617597, "grad_norm": 3.8954074320863623, "learning_rate": 1.4171856682810384e-07, "loss": 0.6796, "step": 670 }, { "epoch": 2.284263959390863, "grad_norm": 4.011977846573959, "learning_rate": 1.3551568628929432e-07, "loss": 0.673, "step": 675 }, { "epoch": 2.3011844331641287, "grad_norm": 3.823282280716888, "learning_rate": 1.2943026597507267e-07, "loss": 0.6729, "step": 680 }, { "epoch": 2.3181049069373945, "grad_norm": 3.992535275167548, "learning_rate": 1.2346426699819456e-07, "loss": 0.6919, "step": 685 }, { "epoch": 2.33502538071066, "grad_norm": 3.7501814671126255, "learning_rate": 1.176196119862008e-07, "loss": 0.6808, "step": 690 }, { "epoch": 2.3519458544839256, "grad_norm": 3.886306791969186, "learning_rate": 1.1189818446182358e-07, "loss": 0.6817, "step": 695 }, { "epoch": 2.3688663282571913, "grad_norm": 3.795337541416798, "learning_rate": 1.0630182823599399e-07, "loss": 0.6747, "step": 700 }, { "epoch": 2.3688663282571913, "eval_loss": 0.7868276834487915, "eval_runtime": 146.1359, "eval_samples_per_second": 57.481, "eval_steps_per_second": 0.903, "step": 700 }, { "epoch": 2.3857868020304567, "grad_norm": 3.9750084710216482, "learning_rate": 1.0083234681364932e-07, "loss": 0.6816, "step": 705 }, { "epoch": 2.4027072758037225, "grad_norm": 3.8888332091113544, "learning_rate": 9.549150281252632e-08, "loss": 0.6798, "step": 710 }, { "epoch": 2.4196277495769882, "grad_norm": 3.8260776584323786, "learning_rate": 9.028101739513405e-08, "loss": 0.6767, "step": 715 }, { "epoch": 2.436548223350254, "grad_norm": 3.9871840761205415, "learning_rate": 8.520256971408452e-08, "loss": 0.6864, "step": 720 }, { "epoch": 2.4534686971235193, "grad_norm": 3.689159095170659, "learning_rate": 8.025779637096137e-08, "loss": 0.6879, "step": 725 }, { "epoch": 2.470389170896785, "grad_norm": 4.029588500065508, "learning_rate": 7.544829088890325e-08, "loss": 0.6955, "step": 730 }, { "epoch": 2.487309644670051, "grad_norm": 3.911239676026825, "learning_rate": 7.077560319906694e-08, "loss": 0.68, "step": 735 }, { "epoch": 2.504230118443316, "grad_norm": 3.9088480475319742, "learning_rate": 6.624123914114122e-08, "loss": 0.6914, "step": 740 }, { "epoch": 2.521150592216582, "grad_norm": 3.8604080998201677, "learning_rate": 6.184665997806831e-08, "loss": 0.6944, "step": 745 }, { "epoch": 2.5380710659898478, "grad_norm": 3.7419532051364546, "learning_rate": 5.759328192513074e-08, "loss": 0.6849, "step": 750 }, { "epoch": 2.5380710659898478, "eval_loss": 0.7860944271087646, "eval_runtime": 146.2056, "eval_samples_per_second": 57.453, "eval_steps_per_second": 0.903, "step": 750 }, { "epoch": 2.5549915397631136, "grad_norm": 3.8511488857625675, "learning_rate": 5.348247569355735e-08, "loss": 0.6915, "step": 755 }, { "epoch": 2.571912013536379, "grad_norm": 3.820407730040828, "learning_rate": 4.951556604879048e-08, "loss": 0.6753, "step": 760 }, { "epoch": 2.5888324873096447, "grad_norm": 4.10704692193588, "learning_rate": 4.569383138356275e-08, "loss": 0.6839, "step": 765 }, { "epoch": 2.6057529610829104, "grad_norm": 3.8660705841934315, "learning_rate": 4.201850330591677e-08, "loss": 0.6748, "step": 770 }, { "epoch": 2.6226734348561758, "grad_norm": 3.7908753446624086, "learning_rate": 3.8490766242301353e-08, "loss": 0.6832, "step": 775 }, { "epoch": 2.6395939086294415, "grad_norm": 3.8661392064625137, "learning_rate": 3.5111757055874326e-08, "loss": 0.6902, "step": 780 }, { "epoch": 2.6565143824027073, "grad_norm": 3.7157772134571623, "learning_rate": 3.188256468013139e-08, "loss": 0.6742, "step": 785 }, { "epoch": 2.673434856175973, "grad_norm": 3.6900269313746743, "learning_rate": 2.8804229767982636e-08, "loss": 0.6769, "step": 790 }, { "epoch": 2.6903553299492384, "grad_norm": 3.8096781294038538, "learning_rate": 2.587774435638679e-08, "loss": 0.6962, "step": 795 }, { "epoch": 2.707275803722504, "grad_norm": 3.790546594792546, "learning_rate": 2.3104051546654013e-08, "loss": 0.6922, "step": 800 }, { "epoch": 2.707275803722504, "eval_loss": 0.7856406569480896, "eval_runtime": 146.2171, "eval_samples_per_second": 57.449, "eval_steps_per_second": 0.903, "step": 800 }, { "epoch": 2.72419627749577, "grad_norm": 3.8429877767556664, "learning_rate": 2.048404520051722e-08, "loss": 0.6888, "step": 805 }, { "epoch": 2.7411167512690353, "grad_norm": 3.6312764917444165, "learning_rate": 1.8018569652073378e-08, "loss": 0.6814, "step": 810 }, { "epoch": 2.758037225042301, "grad_norm": 3.787975571643184, "learning_rate": 1.570841943568446e-08, "loss": 0.6807, "step": 815 }, { "epoch": 2.774957698815567, "grad_norm": 3.9473751919547055, "learning_rate": 1.3554339029927531e-08, "loss": 0.6719, "step": 820 }, { "epoch": 2.7918781725888326, "grad_norm": 4.009002167477452, "learning_rate": 1.1557022617676216e-08, "loss": 0.6869, "step": 825 }, { "epoch": 2.808798646362098, "grad_norm": 3.8014391891380934, "learning_rate": 9.717113862389992e-09, "loss": 0.6805, "step": 830 }, { "epoch": 2.8257191201353637, "grad_norm": 3.6206775186302282, "learning_rate": 8.035205700685165e-09, "loss": 0.6712, "step": 835 }, { "epoch": 2.8426395939086295, "grad_norm": 3.7790956293706195, "learning_rate": 6.511840151252168e-09, "loss": 0.6857, "step": 840 }, { "epoch": 2.859560067681895, "grad_norm": 3.946872462701907, "learning_rate": 5.147508140182555e-09, "loss": 0.6891, "step": 845 }, { "epoch": 2.8764805414551606, "grad_norm": 4.188561175850185, "learning_rate": 3.9426493427611175e-09, "loss": 0.6948, "step": 850 }, { "epoch": 2.8764805414551606, "eval_loss": 0.7854181528091431, "eval_runtime": 146.4648, "eval_samples_per_second": 57.352, "eval_steps_per_second": 0.901, "step": 850 }, { "epoch": 2.8934010152284264, "grad_norm": 3.9493971658293505, "learning_rate": 2.897652041774279e-09, "loss": 0.6828, "step": 855 }, { "epoch": 2.910321489001692, "grad_norm": 3.921042083811142, "learning_rate": 2.0128530023804656e-09, "loss": 0.6912, "step": 860 }, { "epoch": 2.927241962774958, "grad_norm": 3.765705430002199, "learning_rate": 1.2885373635829754e-09, "loss": 0.6955, "step": 865 }, { "epoch": 2.9441624365482233, "grad_norm": 3.7974861126821366, "learning_rate": 7.249385463395374e-10, "loss": 0.691, "step": 870 }, { "epoch": 2.961082910321489, "grad_norm": 3.7353656428622743, "learning_rate": 3.22238178339318e-10, "loss": 0.6855, "step": 875 }, { "epoch": 2.9780033840947544, "grad_norm": 3.7079032515430557, "learning_rate": 8.056603547090812e-11, "loss": 0.692, "step": 880 }, { "epoch": 2.99492385786802, "grad_norm": 3.6976604610814037, "learning_rate": 0.0, "loss": 0.6867, "step": 885 }, { "epoch": 2.99492385786802, "step": 885, "total_flos": 5218127163949056.0, "train_loss": 0.7771235016106213, "train_runtime": 14098.017, "train_samples_per_second": 16.087, "train_steps_per_second": 0.063 } ], "logging_steps": 5, "max_steps": 885, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5218127163949056.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }