{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99492385786802, "eval_steps": 50, "global_step": 885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01692047377326565, "grad_norm": 18.11811168155941, "learning_rate": 5e-07, "loss": 1.7276, "step": 5 }, { "epoch": 0.0338409475465313, "grad_norm": 12.465284537803942, "learning_rate": 1e-06, "loss": 1.5961, "step": 10 }, { "epoch": 0.050761421319796954, "grad_norm": 8.029594619011075, "learning_rate": 9.99919433964529e-07, "loss": 1.2868, "step": 15 }, { "epoch": 0.0676818950930626, "grad_norm": 4.322443518606483, "learning_rate": 9.996777618216605e-07, "loss": 1.1317, "step": 20 }, { "epoch": 0.08460236886632826, "grad_norm": 3.908382290362389, "learning_rate": 9.992750614536604e-07, "loss": 1.067, "step": 25 }, { "epoch": 0.10152284263959391, "grad_norm": 3.3966731740840395, "learning_rate": 9.98711462636417e-07, "loss": 1.0087, "step": 30 }, { "epoch": 0.11844331641285956, "grad_norm": 3.404122104363119, "learning_rate": 9.979871469976195e-07, "loss": 0.9993, "step": 35 }, { "epoch": 0.1353637901861252, "grad_norm": 3.5130122180648637, "learning_rate": 9.971023479582256e-07, "loss": 0.9808, "step": 40 }, { "epoch": 0.15228426395939088, "grad_norm": 3.8017404235610677, "learning_rate": 9.960573506572389e-07, "loss": 0.9589, "step": 45 }, { "epoch": 0.1692047377326565, "grad_norm": 3.7077817350880475, "learning_rate": 9.948524918598173e-07, "loss": 0.9418, "step": 50 }, { "epoch": 0.1692047377326565, "eval_loss": 0.9316250085830688, "eval_runtime": 146.1453, "eval_samples_per_second": 57.477, "eval_steps_per_second": 0.903, "step": 50 }, { "epoch": 0.18612521150592218, "grad_norm": 3.8413604284063037, "learning_rate": 9.934881598487478e-07, "loss": 0.9442, "step": 55 }, { "epoch": 0.20304568527918782, "grad_norm": 3.4583776939446076, "learning_rate": 9.919647942993147e-07, "loss": 0.928, "step": 60 }, { "epoch": 0.21996615905245348, "grad_norm": 4.21857142186165, "learning_rate": 9.9028288613761e-07, "loss": 0.9234, "step": 65 }, { "epoch": 0.23688663282571912, "grad_norm": 3.6660260059909637, "learning_rate": 9.884429773823236e-07, "loss": 0.9051, "step": 70 }, { "epoch": 0.25380710659898476, "grad_norm": 3.3907413557630837, "learning_rate": 9.864456609700723e-07, "loss": 0.8896, "step": 75 }, { "epoch": 0.2707275803722504, "grad_norm": 3.4712295626998713, "learning_rate": 9.842915805643156e-07, "loss": 0.8869, "step": 80 }, { "epoch": 0.2876480541455161, "grad_norm": 3.51256446643522, "learning_rate": 9.819814303479267e-07, "loss": 0.877, "step": 85 }, { "epoch": 0.30456852791878175, "grad_norm": 3.3964554088877685, "learning_rate": 9.795159547994828e-07, "loss": 0.8925, "step": 90 }, { "epoch": 0.32148900169204736, "grad_norm": 3.372268647607599, "learning_rate": 9.76895948453346e-07, "loss": 0.893, "step": 95 }, { "epoch": 0.338409475465313, "grad_norm": 3.5879097796760644, "learning_rate": 9.74122255643613e-07, "loss": 0.8739, "step": 100 }, { "epoch": 0.338409475465313, "eval_loss": 0.8766918778419495, "eval_runtime": 146.0181, "eval_samples_per_second": 57.527, "eval_steps_per_second": 0.904, "step": 100 }, { "epoch": 0.3553299492385787, "grad_norm": 3.6525162260417656, "learning_rate": 9.711957702320174e-07, "loss": 0.8943, "step": 105 }, { "epoch": 0.37225042301184436, "grad_norm": 3.825829336255867, "learning_rate": 9.681174353198686e-07, "loss": 0.869, "step": 110 }, { "epoch": 0.38917089678510997, "grad_norm": 3.5161921261333418, "learning_rate": 9.648882429441256e-07, "loss": 0.8732, "step": 115 }, { "epoch": 0.40609137055837563, "grad_norm": 3.754342765222721, "learning_rate": 9.615092337576987e-07, "loss": 0.8695, "step": 120 }, { "epoch": 0.4230118443316413, "grad_norm": 3.8927983233298966, "learning_rate": 9.579814966940833e-07, "loss": 0.8676, "step": 125 }, { "epoch": 0.43993231810490696, "grad_norm": 3.313844438659833, "learning_rate": 9.543061686164372e-07, "loss": 0.8448, "step": 130 }, { "epoch": 0.45685279187817257, "grad_norm": 3.5030033367963354, "learning_rate": 9.504844339512094e-07, "loss": 0.848, "step": 135 }, { "epoch": 0.47377326565143824, "grad_norm": 3.482428868379046, "learning_rate": 9.465175243064428e-07, "loss": 0.853, "step": 140 }, { "epoch": 0.4906937394247039, "grad_norm": 3.4957931844026326, "learning_rate": 9.424067180748691e-07, "loss": 0.8768, "step": 145 }, { "epoch": 0.5076142131979695, "grad_norm": 3.6539662236846318, "learning_rate": 9.381533400219317e-07, "loss": 0.8408, "step": 150 }, { "epoch": 0.5076142131979695, "eval_loss": 0.8505008220672607, "eval_runtime": 145.9826, "eval_samples_per_second": 57.541, "eval_steps_per_second": 0.904, "step": 150 }, { "epoch": 0.5245346869712352, "grad_norm": 3.5394178909999634, "learning_rate": 9.337587608588588e-07, "loss": 0.8527, "step": 155 }, { "epoch": 0.5414551607445008, "grad_norm": 3.3947598872414386, "learning_rate": 9.29224396800933e-07, "loss": 0.8408, "step": 160 }, { "epoch": 0.5583756345177665, "grad_norm": 3.4873778908245434, "learning_rate": 9.245517091110968e-07, "loss": 0.8367, "step": 165 }, { "epoch": 0.5752961082910322, "grad_norm": 3.50355137921012, "learning_rate": 9.197422036290386e-07, "loss": 0.8654, "step": 170 }, { "epoch": 0.5922165820642978, "grad_norm": 3.665320201247955, "learning_rate": 9.147974302859156e-07, "loss": 0.8467, "step": 175 }, { "epoch": 0.6091370558375635, "grad_norm": 3.5032254780990266, "learning_rate": 9.097189826048659e-07, "loss": 0.8456, "step": 180 }, { "epoch": 0.626057529610829, "grad_norm": 3.3742001228223693, "learning_rate": 9.045084971874737e-07, "loss": 0.8186, "step": 185 }, { "epoch": 0.6429780033840947, "grad_norm": 3.5363272285750442, "learning_rate": 8.991676531863507e-07, "loss": 0.8354, "step": 190 }, { "epoch": 0.6598984771573604, "grad_norm": 3.6685050093786975, "learning_rate": 8.93698171764006e-07, "loss": 0.8415, "step": 195 }, { "epoch": 0.676818950930626, "grad_norm": 3.580018631107888, "learning_rate": 8.881018155381765e-07, "loss": 0.8415, "step": 200 }, { "epoch": 0.676818950930626, "eval_loss": 0.8333195447921753, "eval_runtime": 145.929, "eval_samples_per_second": 57.562, "eval_steps_per_second": 0.905, "step": 200 }, { "epoch": 0.6937394247038917, "grad_norm": 3.4977190055813225, "learning_rate": 8.823803880137992e-07, "loss": 0.8425, "step": 205 }, { "epoch": 0.7106598984771574, "grad_norm": 3.424911861726825, "learning_rate": 8.765357330018055e-07, "loss": 0.8197, "step": 210 }, { "epoch": 0.727580372250423, "grad_norm": 3.5180014609137804, "learning_rate": 8.705697340249274e-07, "loss": 0.84, "step": 215 }, { "epoch": 0.7445008460236887, "grad_norm": 3.269933787195742, "learning_rate": 8.644843137107057e-07, "loss": 0.8192, "step": 220 }, { "epoch": 0.7614213197969543, "grad_norm": 3.490737285532297, "learning_rate": 8.58281433171896e-07, "loss": 0.8392, "step": 225 }, { "epoch": 0.7783417935702199, "grad_norm": 3.6006380297077745, "learning_rate": 8.519630913744724e-07, "loss": 0.8285, "step": 230 }, { "epoch": 0.7952622673434856, "grad_norm": 3.5073056320425744, "learning_rate": 8.455313244934324e-07, "loss": 0.8157, "step": 235 }, { "epoch": 0.8121827411167513, "grad_norm": 3.384395618192681, "learning_rate": 8.389882052566105e-07, "loss": 0.8205, "step": 240 }, { "epoch": 0.8291032148900169, "grad_norm": 3.4511971640264494, "learning_rate": 8.323358422767128e-07, "loss": 0.8236, "step": 245 }, { "epoch": 0.8460236886632826, "grad_norm": 3.454441667633359, "learning_rate": 8.255763793717867e-07, "loss": 0.8121, "step": 250 }, { "epoch": 0.8460236886632826, "eval_loss": 0.8217905163764954, "eval_runtime": 146.2621, "eval_samples_per_second": 57.431, "eval_steps_per_second": 0.902, "step": 250 }, { "epoch": 0.8629441624365483, "grad_norm": 3.4938458445163985, "learning_rate": 8.187119948743449e-07, "loss": 0.8432, "step": 255 }, { "epoch": 0.8798646362098139, "grad_norm": 3.6058018876142834, "learning_rate": 8.117449009293668e-07, "loss": 0.8015, "step": 260 }, { "epoch": 0.8967851099830795, "grad_norm": 3.5735964122028037, "learning_rate": 8.046773427814041e-07, "loss": 0.8385, "step": 265 }, { "epoch": 0.9137055837563451, "grad_norm": 3.6543886570209496, "learning_rate": 7.975115980510185e-07, "loss": 0.8056, "step": 270 }, { "epoch": 0.9306260575296108, "grad_norm": 3.3189069792277515, "learning_rate": 7.902499760007867e-07, "loss": 0.8269, "step": 275 }, { "epoch": 0.9475465313028765, "grad_norm": 3.3073330589639034, "learning_rate": 7.828948167911073e-07, "loss": 0.8222, "step": 280 }, { "epoch": 0.9644670050761421, "grad_norm": 3.4922958115443046, "learning_rate": 7.754484907260512e-07, "loss": 0.8149, "step": 285 }, { "epoch": 0.9813874788494078, "grad_norm": 3.327799524845609, "learning_rate": 7.679133974894982e-07, "loss": 0.846, "step": 290 }, { "epoch": 0.9983079526226735, "grad_norm": 3.3720725389154973, "learning_rate": 7.602919653718043e-07, "loss": 0.8155, "step": 295 }, { "epoch": 1.015228426395939, "grad_norm": 3.440889104712857, "learning_rate": 7.525866504872506e-07, "loss": 0.7579, "step": 300 }, { "epoch": 1.015228426395939, "eval_loss": 0.8133670687675476, "eval_runtime": 146.0887, "eval_samples_per_second": 57.499, "eval_steps_per_second": 0.904, "step": 300 }, { "epoch": 1.0321489001692048, "grad_norm": 3.523402139992769, "learning_rate": 7.447999359825262e-07, "loss": 0.757, "step": 305 }, { "epoch": 1.0490693739424704, "grad_norm": 3.736820649423743, "learning_rate": 7.369343312364993e-07, "loss": 0.7555, "step": 310 }, { "epoch": 1.0659898477157361, "grad_norm": 3.63625396256149, "learning_rate": 7.289923710515338e-07, "loss": 0.7562, "step": 315 }, { "epoch": 1.0829103214890017, "grad_norm": 3.444640371019277, "learning_rate": 7.209766148366134e-07, "loss": 0.7431, "step": 320 }, { "epoch": 1.0998307952622675, "grad_norm": 3.33725792795571, "learning_rate": 7.128896457825363e-07, "loss": 0.7419, "step": 325 }, { "epoch": 1.116751269035533, "grad_norm": 3.463759633912316, "learning_rate": 7.047340700294453e-07, "loss": 0.7359, "step": 330 }, { "epoch": 1.1336717428087986, "grad_norm": 3.6911828143235175, "learning_rate": 6.965125158269618e-07, "loss": 0.7483, "step": 335 }, { "epoch": 1.1505922165820643, "grad_norm": 3.425162825369402, "learning_rate": 6.882276326871959e-07, "loss": 0.7523, "step": 340 }, { "epoch": 1.16751269035533, "grad_norm": 3.534755736267555, "learning_rate": 6.798820905309035e-07, "loss": 0.7477, "step": 345 }, { "epoch": 1.1844331641285957, "grad_norm": 3.355585464713691, "learning_rate": 6.714785788270657e-07, "loss": 0.7469, "step": 350 }, { "epoch": 1.1844331641285957, "eval_loss": 0.8087317943572998, "eval_runtime": 146.194, "eval_samples_per_second": 57.458, "eval_steps_per_second": 0.903, "step": 350 }, { "epoch": 1.2013536379018612, "grad_norm": 3.7479898725924947, "learning_rate": 6.630198057261709e-07, "loss": 0.7506, "step": 355 }, { "epoch": 1.218274111675127, "grad_norm": 3.57646668569766, "learning_rate": 6.545084971874736e-07, "loss": 0.7418, "step": 360 }, { "epoch": 1.2351945854483926, "grad_norm": 3.4779353040269227, "learning_rate": 6.459473961005168e-07, "loss": 0.7385, "step": 365 }, { "epoch": 1.252115059221658, "grad_norm": 3.5090655067073677, "learning_rate": 6.373392614011951e-07, "loss": 0.7361, "step": 370 }, { "epoch": 1.2690355329949239, "grad_norm": 3.5457643005290005, "learning_rate": 6.286868671826511e-07, "loss": 0.7667, "step": 375 }, { "epoch": 1.2859560067681894, "grad_norm": 3.700629853039733, "learning_rate": 6.199930018012829e-07, "loss": 0.751, "step": 380 }, { "epoch": 1.3028764805414552, "grad_norm": 3.7733854906847215, "learning_rate": 6.112604669781572e-07, "loss": 0.7435, "step": 385 }, { "epoch": 1.3197969543147208, "grad_norm": 3.609967851597449, "learning_rate": 6.024920768961152e-07, "loss": 0.7506, "step": 390 }, { "epoch": 1.3367174280879865, "grad_norm": 3.5839373687018057, "learning_rate": 5.936906572928624e-07, "loss": 0.7525, "step": 395 }, { "epoch": 1.353637901861252, "grad_norm": 3.7104702064535697, "learning_rate": 5.848590445503344e-07, "loss": 0.7411, "step": 400 }, { "epoch": 1.353637901861252, "eval_loss": 0.803146243095398, "eval_runtime": 146.173, "eval_samples_per_second": 57.466, "eval_steps_per_second": 0.903, "step": 400 }, { "epoch": 1.3705583756345177, "grad_norm": 3.586978273247657, "learning_rate": 5.760000847806337e-07, "loss": 0.7262, "step": 405 }, { "epoch": 1.3874788494077834, "grad_norm": 3.5534067032813033, "learning_rate": 5.671166329088277e-07, "loss": 0.7478, "step": 410 }, { "epoch": 1.404399323181049, "grad_norm": 3.5702928476693505, "learning_rate": 5.582115517529114e-07, "loss": 0.7482, "step": 415 }, { "epoch": 1.4213197969543148, "grad_norm": 3.497025840273878, "learning_rate": 5.492877111012218e-07, "loss": 0.7305, "step": 420 }, { "epoch": 1.4382402707275803, "grad_norm": 3.8312529281102865, "learning_rate": 5.403479867876087e-07, "loss": 0.7417, "step": 425 }, { "epoch": 1.455160744500846, "grad_norm": 3.5923517702731833, "learning_rate": 5.313952597646567e-07, "loss": 0.7552, "step": 430 }, { "epoch": 1.4720812182741116, "grad_norm": 3.441823126296519, "learning_rate": 5.224324151752575e-07, "loss": 0.7504, "step": 435 }, { "epoch": 1.4890016920473772, "grad_norm": 3.5174370036638534, "learning_rate": 5.134623414228315e-07, "loss": 0.7511, "step": 440 }, { "epoch": 1.505922165820643, "grad_norm": 4.018568644922475, "learning_rate": 5.044879292404989e-07, "loss": 0.7493, "step": 445 }, { "epoch": 1.5228426395939088, "grad_norm": 3.5869525261938184, "learning_rate": 4.95512070759501e-07, "loss": 0.7378, "step": 450 }, { "epoch": 1.5228426395939088, "eval_loss": 0.798000693321228, "eval_runtime": 146.3627, "eval_samples_per_second": 57.392, "eval_steps_per_second": 0.902, "step": 450 }, { "epoch": 1.5397631133671743, "grad_norm": 3.749721313687578, "learning_rate": 4.865376585771687e-07, "loss": 0.762, "step": 455 }, { "epoch": 1.5566835871404399, "grad_norm": 3.680890544965231, "learning_rate": 4.775675848247427e-07, "loss": 0.7417, "step": 460 }, { "epoch": 1.5736040609137056, "grad_norm": 3.5281531142193874, "learning_rate": 4.686047402353433e-07, "loss": 0.736, "step": 465 }, { "epoch": 1.5905245346869712, "grad_norm": 3.555081148710941, "learning_rate": 4.596520132123914e-07, "loss": 0.7351, "step": 470 }, { "epoch": 1.6074450084602367, "grad_norm": 3.4493804884671944, "learning_rate": 4.507122888987782e-07, "loss": 0.7514, "step": 475 }, { "epoch": 1.6243654822335025, "grad_norm": 3.7270963408742452, "learning_rate": 4.417884482470886e-07, "loss": 0.7374, "step": 480 }, { "epoch": 1.6412859560067683, "grad_norm": 3.845425412960905, "learning_rate": 4.328833670911724e-07, "loss": 0.7547, "step": 485 }, { "epoch": 1.6582064297800339, "grad_norm": 3.4772313441838505, "learning_rate": 4.239999152193664e-07, "loss": 0.7345, "step": 490 }, { "epoch": 1.6751269035532994, "grad_norm": 3.8728376091114436, "learning_rate": 4.1514095544966557e-07, "loss": 0.731, "step": 495 }, { "epoch": 1.6920473773265652, "grad_norm": 3.5535644482480633, "learning_rate": 4.0630934270713755e-07, "loss": 0.7423, "step": 500 }, { "epoch": 1.6920473773265652, "eval_loss": 0.7937961220741272, "eval_runtime": 146.133, "eval_samples_per_second": 57.482, "eval_steps_per_second": 0.903, "step": 500 }, { "epoch": 1.708967851099831, "grad_norm": 3.60894521319071, "learning_rate": 3.9750792310388483e-07, "loss": 0.7264, "step": 505 }, { "epoch": 1.7258883248730963, "grad_norm": 3.5752693076457227, "learning_rate": 3.8873953302184283e-07, "loss": 0.7129, "step": 510 }, { "epoch": 1.742808798646362, "grad_norm": 3.5319598859647208, "learning_rate": 3.80006998198717e-07, "loss": 0.7379, "step": 515 }, { "epoch": 1.7597292724196278, "grad_norm": 3.7064342382307136, "learning_rate": 3.713131328173489e-07, "loss": 0.7291, "step": 520 }, { "epoch": 1.7766497461928934, "grad_norm": 3.6278369087716413, "learning_rate": 3.62660738598805e-07, "loss": 0.7248, "step": 525 }, { "epoch": 1.793570219966159, "grad_norm": 3.888359889647679, "learning_rate": 3.5405260389948333e-07, "loss": 0.7516, "step": 530 }, { "epoch": 1.8104906937394247, "grad_norm": 3.4352787394554727, "learning_rate": 3.454915028125263e-07, "loss": 0.7398, "step": 535 }, { "epoch": 1.8274111675126905, "grad_norm": 3.5354090229387714, "learning_rate": 3.369801942738291e-07, "loss": 0.7415, "step": 540 }, { "epoch": 1.844331641285956, "grad_norm": 3.6324342637431912, "learning_rate": 3.285214211729343e-07, "loss": 0.7343, "step": 545 }, { "epoch": 1.8612521150592216, "grad_norm": 4.0734439950626555, "learning_rate": 3.2011790946909666e-07, "loss": 0.7363, "step": 550 }, { "epoch": 1.8612521150592216, "eval_loss": 0.7899667620658875, "eval_runtime": 146.2033, "eval_samples_per_second": 57.454, "eval_steps_per_second": 0.903, "step": 550 }, { "epoch": 1.8781725888324874, "grad_norm": 3.488509809559841, "learning_rate": 3.11772367312804e-07, "loss": 0.722, "step": 555 }, { "epoch": 1.895093062605753, "grad_norm": 3.568702481260837, "learning_rate": 3.034874841730382e-07, "loss": 0.7268, "step": 560 }, { "epoch": 1.9120135363790185, "grad_norm": 3.5437718957391584, "learning_rate": 2.9526592997055483e-07, "loss": 0.7397, "step": 565 }, { "epoch": 1.9289340101522843, "grad_norm": 3.467818973564197, "learning_rate": 2.8711035421746363e-07, "loss": 0.7438, "step": 570 }, { "epoch": 1.94585448392555, "grad_norm": 3.7876015240002396, "learning_rate": 2.7902338516338674e-07, "loss": 0.7221, "step": 575 }, { "epoch": 1.9627749576988156, "grad_norm": 3.8139203476929824, "learning_rate": 2.7100762894846627e-07, "loss": 0.7257, "step": 580 }, { "epoch": 1.9796954314720812, "grad_norm": 3.6272530624543853, "learning_rate": 2.6306566876350067e-07, "loss": 0.7254, "step": 585 }, { "epoch": 1.996615905245347, "grad_norm": 3.676365257262644, "learning_rate": 2.5520006401747395e-07, "loss": 0.7208, "step": 590 }, { "epoch": 2.0135363790186127, "grad_norm": 3.8206014953262084, "learning_rate": 2.474133495127494e-07, "loss": 0.7074, "step": 595 }, { "epoch": 2.030456852791878, "grad_norm": 3.876566280546555, "learning_rate": 2.3970803462819583e-07, "loss": 0.6778, "step": 600 }, { "epoch": 2.030456852791878, "eval_loss": 0.7937635183334351, "eval_runtime": 146.134, "eval_samples_per_second": 57.481, "eval_steps_per_second": 0.903, "step": 600 }, { "epoch": 2.047377326565144, "grad_norm": 3.9168575166004937, "learning_rate": 2.3208660251050156e-07, "loss": 0.6807, "step": 605 }, { "epoch": 2.0642978003384096, "grad_norm": 3.894680280362581, "learning_rate": 2.2455150927394878e-07, "loss": 0.6935, "step": 610 }, { "epoch": 2.081218274111675, "grad_norm": 3.9693943038444104, "learning_rate": 2.1710518320889276e-07, "loss": 0.691, "step": 615 }, { "epoch": 2.0981387478849407, "grad_norm": 3.794663051980087, "learning_rate": 2.097500239992132e-07, "loss": 0.6957, "step": 620 }, { "epoch": 2.1150592216582065, "grad_norm": 3.9622114678017306, "learning_rate": 2.0248840194898155e-07, "loss": 0.6752, "step": 625 }, { "epoch": 2.1319796954314723, "grad_norm": 3.7595934491117573, "learning_rate": 1.9532265721859597e-07, "loss": 0.6771, "step": 630 }, { "epoch": 2.1489001692047376, "grad_norm": 3.9388724645377162, "learning_rate": 1.8825509907063326e-07, "loss": 0.6858, "step": 635 }, { "epoch": 2.1658206429780034, "grad_norm": 3.7788201106321972, "learning_rate": 1.812880051256551e-07, "loss": 0.6982, "step": 640 }, { "epoch": 2.182741116751269, "grad_norm": 3.947050032362458, "learning_rate": 1.744236206282132e-07, "loss": 0.6895, "step": 645 }, { "epoch": 2.199661590524535, "grad_norm": 3.600017791091893, "learning_rate": 1.6766415772328728e-07, "loss": 0.6716, "step": 650 }, { "epoch": 2.199661590524535, "eval_loss": 0.7936422824859619, "eval_runtime": 146.1456, "eval_samples_per_second": 57.477, "eval_steps_per_second": 0.903, "step": 650 }, { "epoch": 2.2165820642978002, "grad_norm": 4.081340915582073, "learning_rate": 1.6101179474338966e-07, "loss": 0.6804, "step": 655 }, { "epoch": 2.233502538071066, "grad_norm": 3.7657350706904342, "learning_rate": 1.5446867550656767e-07, "loss": 0.6899, "step": 660 }, { "epoch": 2.250423011844332, "grad_norm": 3.9022387275901536, "learning_rate": 1.4803690862552753e-07, "loss": 0.6798, "step": 665 }, { "epoch": 2.267343485617597, "grad_norm": 3.682924093787561, "learning_rate": 1.4171856682810384e-07, "loss": 0.6814, "step": 670 }, { "epoch": 2.284263959390863, "grad_norm": 3.6877950643476938, "learning_rate": 1.3551568628929432e-07, "loss": 0.6739, "step": 675 }, { "epoch": 2.3011844331641287, "grad_norm": 3.935618521241239, "learning_rate": 1.2943026597507267e-07, "loss": 0.6929, "step": 680 }, { "epoch": 2.3181049069373945, "grad_norm": 3.5924620792590534, "learning_rate": 1.2346426699819456e-07, "loss": 0.6951, "step": 685 }, { "epoch": 2.33502538071066, "grad_norm": 3.8665100002377204, "learning_rate": 1.176196119862008e-07, "loss": 0.6861, "step": 690 }, { "epoch": 2.3519458544839256, "grad_norm": 3.7323294473109843, "learning_rate": 1.1189818446182358e-07, "loss": 0.6934, "step": 695 }, { "epoch": 2.3688663282571913, "grad_norm": 3.689893104326803, "learning_rate": 1.0630182823599399e-07, "loss": 0.6809, "step": 700 }, { "epoch": 2.3688663282571913, "eval_loss": 0.7926481366157532, "eval_runtime": 146.1872, "eval_samples_per_second": 57.461, "eval_steps_per_second": 0.903, "step": 700 }, { "epoch": 2.3857868020304567, "grad_norm": 3.724488260369369, "learning_rate": 1.0083234681364932e-07, "loss": 0.6874, "step": 705 }, { "epoch": 2.4027072758037225, "grad_norm": 4.0223025153408285, "learning_rate": 9.549150281252632e-08, "loss": 0.6796, "step": 710 }, { "epoch": 2.4196277495769882, "grad_norm": 3.8561396496122247, "learning_rate": 9.028101739513405e-08, "loss": 0.6843, "step": 715 }, { "epoch": 2.436548223350254, "grad_norm": 3.801934759112675, "learning_rate": 8.520256971408452e-08, "loss": 0.685, "step": 720 }, { "epoch": 2.4534686971235193, "grad_norm": 3.83257434946296, "learning_rate": 8.025779637096137e-08, "loss": 0.6828, "step": 725 }, { "epoch": 2.470389170896785, "grad_norm": 3.6401708717024146, "learning_rate": 7.544829088890325e-08, "loss": 0.6717, "step": 730 }, { "epoch": 2.487309644670051, "grad_norm": 3.9122520606711917, "learning_rate": 7.077560319906694e-08, "loss": 0.6897, "step": 735 }, { "epoch": 2.504230118443316, "grad_norm": 3.9524113141623154, "learning_rate": 6.624123914114122e-08, "loss": 0.6848, "step": 740 }, { "epoch": 2.521150592216582, "grad_norm": 3.670900471362486, "learning_rate": 6.184665997806831e-08, "loss": 0.6847, "step": 745 }, { "epoch": 2.5380710659898478, "grad_norm": 3.9559042523399115, "learning_rate": 5.759328192513074e-08, "loss": 0.6973, "step": 750 }, { "epoch": 2.5380710659898478, "eval_loss": 0.7919026017189026, "eval_runtime": 146.2281, "eval_samples_per_second": 57.444, "eval_steps_per_second": 0.903, "step": 750 }, { "epoch": 2.5549915397631136, "grad_norm": 3.8423336960320538, "learning_rate": 5.348247569355735e-08, "loss": 0.6949, "step": 755 }, { "epoch": 2.571912013536379, "grad_norm": 3.7975655402725943, "learning_rate": 4.951556604879048e-08, "loss": 0.6866, "step": 760 }, { "epoch": 2.5888324873096447, "grad_norm": 3.568623425566606, "learning_rate": 4.569383138356275e-08, "loss": 0.683, "step": 765 }, { "epoch": 2.6057529610829104, "grad_norm": 3.776851694466855, "learning_rate": 4.201850330591677e-08, "loss": 0.6749, "step": 770 }, { "epoch": 2.6226734348561758, "grad_norm": 3.7650644536804894, "learning_rate": 3.8490766242301353e-08, "loss": 0.6824, "step": 775 }, { "epoch": 2.6395939086294415, "grad_norm": 4.1315575147110515, "learning_rate": 3.5111757055874326e-08, "loss": 0.6915, "step": 780 }, { "epoch": 2.6565143824027073, "grad_norm": 3.945733483403889, "learning_rate": 3.188256468013139e-08, "loss": 0.6948, "step": 785 }, { "epoch": 2.673434856175973, "grad_norm": 4.0702928933504445, "learning_rate": 2.8804229767982636e-08, "loss": 0.7135, "step": 790 }, { "epoch": 2.6903553299492384, "grad_norm": 4.049036919724828, "learning_rate": 2.587774435638679e-08, "loss": 0.7004, "step": 795 }, { "epoch": 2.707275803722504, "grad_norm": 3.686063905885555, "learning_rate": 2.3104051546654013e-08, "loss": 0.6804, "step": 800 }, { "epoch": 2.707275803722504, "eval_loss": 0.7912485599517822, "eval_runtime": 146.0875, "eval_samples_per_second": 57.5, "eval_steps_per_second": 0.904, "step": 800 }, { "epoch": 2.72419627749577, "grad_norm": 3.7450509400766707, "learning_rate": 2.048404520051722e-08, "loss": 0.6934, "step": 805 }, { "epoch": 2.7411167512690353, "grad_norm": 3.7829378959141673, "learning_rate": 1.8018569652073378e-08, "loss": 0.66, "step": 810 }, { "epoch": 2.758037225042301, "grad_norm": 3.790421803380348, "learning_rate": 1.570841943568446e-08, "loss": 0.6893, "step": 815 }, { "epoch": 2.774957698815567, "grad_norm": 3.724677293686406, "learning_rate": 1.3554339029927531e-08, "loss": 0.6814, "step": 820 }, { "epoch": 2.7918781725888326, "grad_norm": 3.9688985767320704, "learning_rate": 1.1557022617676216e-08, "loss": 0.673, "step": 825 }, { "epoch": 2.808798646362098, "grad_norm": 4.22397815551477, "learning_rate": 9.717113862389992e-09, "loss": 0.689, "step": 830 }, { "epoch": 2.8257191201353637, "grad_norm": 3.740661431646974, "learning_rate": 8.035205700685165e-09, "loss": 0.6853, "step": 835 }, { "epoch": 2.8426395939086295, "grad_norm": 4.039653126712931, "learning_rate": 6.511840151252168e-09, "loss": 0.6823, "step": 840 }, { "epoch": 2.859560067681895, "grad_norm": 3.665159603413248, "learning_rate": 5.147508140182555e-09, "loss": 0.6717, "step": 845 }, { "epoch": 2.8764805414551606, "grad_norm": 3.7704278001935525, "learning_rate": 3.9426493427611175e-09, "loss": 0.6684, "step": 850 }, { "epoch": 2.8764805414551606, "eval_loss": 0.791114091873169, "eval_runtime": 146.2505, "eval_samples_per_second": 57.436, "eval_steps_per_second": 0.903, "step": 850 }, { "epoch": 2.8934010152284264, "grad_norm": 3.6709944458168975, "learning_rate": 2.897652041774279e-09, "loss": 0.6849, "step": 855 }, { "epoch": 2.910321489001692, "grad_norm": 3.8441912749301244, "learning_rate": 2.0128530023804656e-09, "loss": 0.6833, "step": 860 }, { "epoch": 2.927241962774958, "grad_norm": 3.8325829088626713, "learning_rate": 1.2885373635829754e-09, "loss": 0.691, "step": 865 }, { "epoch": 2.9441624365482233, "grad_norm": 3.6989879423314207, "learning_rate": 7.249385463395374e-10, "loss": 0.6847, "step": 870 }, { "epoch": 2.961082910321489, "grad_norm": 3.697758295388761, "learning_rate": 3.22238178339318e-10, "loss": 0.6721, "step": 875 }, { "epoch": 2.9780033840947544, "grad_norm": 4.082760711386711, "learning_rate": 8.056603547090812e-11, "loss": 0.6798, "step": 880 }, { "epoch": 2.99492385786802, "grad_norm": 4.0230745119972795, "learning_rate": 0.0, "loss": 0.7023, "step": 885 }, { "epoch": 2.99492385786802, "step": 885, "total_flos": 5218127163949056.0, "train_loss": 0.7775604002893308, "train_runtime": 14098.6672, "train_samples_per_second": 16.086, "train_steps_per_second": 0.063 } ], "logging_steps": 5, "max_steps": 885, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5218127163949056.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }