{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99492385786802, "eval_steps": 50, "global_step": 885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01692047377326565, "grad_norm": 17.95940849112974, "learning_rate": 5e-07, "loss": 1.7425, "step": 5 }, { "epoch": 0.0338409475465313, "grad_norm": 12.23967055037602, "learning_rate": 1e-06, "loss": 1.599, "step": 10 }, { "epoch": 0.050761421319796954, "grad_norm": 7.600510364072396, "learning_rate": 9.99919433964529e-07, "loss": 1.2976, "step": 15 }, { "epoch": 0.0676818950930626, "grad_norm": 4.134278668785207, "learning_rate": 9.996777618216605e-07, "loss": 1.1572, "step": 20 }, { "epoch": 0.08460236886632826, "grad_norm": 3.846352071719652, "learning_rate": 9.992750614536604e-07, "loss": 1.0495, "step": 25 }, { "epoch": 0.10152284263959391, "grad_norm": 3.4799081252721886, "learning_rate": 9.98711462636417e-07, "loss": 1.0222, "step": 30 }, { "epoch": 0.11844331641285956, "grad_norm": 3.6435583580883644, "learning_rate": 9.979871469976195e-07, "loss": 0.982, "step": 35 }, { "epoch": 0.1353637901861252, "grad_norm": 3.474443331449369, "learning_rate": 9.971023479582256e-07, "loss": 0.9659, "step": 40 }, { "epoch": 0.15228426395939088, "grad_norm": 3.5291718369580094, "learning_rate": 9.960573506572389e-07, "loss": 0.9517, "step": 45 }, { "epoch": 0.1692047377326565, "grad_norm": 3.6607844369820763, "learning_rate": 9.948524918598173e-07, "loss": 0.9744, "step": 50 }, { "epoch": 0.1692047377326565, "eval_loss": 0.9363481402397156, "eval_runtime": 147.9244, "eval_samples_per_second": 56.786, "eval_steps_per_second": 0.892, "step": 50 }, { "epoch": 0.18612521150592218, "grad_norm": 3.5029465776623305, "learning_rate": 9.934881598487478e-07, "loss": 0.9291, "step": 55 }, { "epoch": 0.20304568527918782, "grad_norm": 3.420583685374325, "learning_rate": 9.919647942993147e-07, "loss": 0.9373, "step": 60 }, { "epoch": 0.21996615905245348, "grad_norm": 3.639006411388483, "learning_rate": 9.9028288613761e-07, "loss": 0.9371, "step": 65 }, { "epoch": 0.23688663282571912, "grad_norm": 3.739200188919654, "learning_rate": 9.884429773823236e-07, "loss": 0.9168, "step": 70 }, { "epoch": 0.25380710659898476, "grad_norm": 3.6961021681290647, "learning_rate": 9.864456609700723e-07, "loss": 0.9036, "step": 75 }, { "epoch": 0.2707275803722504, "grad_norm": 3.469126484018746, "learning_rate": 9.842915805643156e-07, "loss": 0.8789, "step": 80 }, { "epoch": 0.2876480541455161, "grad_norm": 3.4540644342177953, "learning_rate": 9.819814303479267e-07, "loss": 0.8843, "step": 85 }, { "epoch": 0.30456852791878175, "grad_norm": 3.491589457047051, "learning_rate": 9.795159547994828e-07, "loss": 0.878, "step": 90 }, { "epoch": 0.32148900169204736, "grad_norm": 3.3672158895857582, "learning_rate": 9.76895948453346e-07, "loss": 0.8817, "step": 95 }, { "epoch": 0.338409475465313, "grad_norm": 3.4204437907535032, "learning_rate": 9.74122255643613e-07, "loss": 0.8749, "step": 100 }, { "epoch": 0.338409475465313, "eval_loss": 0.8794726729393005, "eval_runtime": 146.5584, "eval_samples_per_second": 57.315, "eval_steps_per_second": 0.901, "step": 100 }, { "epoch": 0.3553299492385787, "grad_norm": 3.499218396193934, "learning_rate": 9.711957702320174e-07, "loss": 0.8689, "step": 105 }, { "epoch": 0.37225042301184436, "grad_norm": 3.575982847323226, "learning_rate": 9.681174353198686e-07, "loss": 0.8552, "step": 110 }, { "epoch": 0.38917089678510997, "grad_norm": 3.5820322461640624, "learning_rate": 9.648882429441256e-07, "loss": 0.8723, "step": 115 }, { "epoch": 0.40609137055837563, "grad_norm": 3.428649928071515, "learning_rate": 9.615092337576987e-07, "loss": 0.8737, "step": 120 }, { "epoch": 0.4230118443316413, "grad_norm": 3.439041413094408, "learning_rate": 9.579814966940833e-07, "loss": 0.8574, "step": 125 }, { "epoch": 0.43993231810490696, "grad_norm": 3.611321063385662, "learning_rate": 9.543061686164372e-07, "loss": 0.8774, "step": 130 }, { "epoch": 0.45685279187817257, "grad_norm": 3.3003493132482657, "learning_rate": 9.504844339512094e-07, "loss": 0.8594, "step": 135 }, { "epoch": 0.47377326565143824, "grad_norm": 3.4741051861556684, "learning_rate": 9.465175243064428e-07, "loss": 0.8674, "step": 140 }, { "epoch": 0.4906937394247039, "grad_norm": 3.2142657401436416, "learning_rate": 9.424067180748691e-07, "loss": 0.8648, "step": 145 }, { "epoch": 0.5076142131979695, "grad_norm": 3.5722793490057336, "learning_rate": 9.381533400219317e-07, "loss": 0.8423, "step": 150 }, { "epoch": 0.5076142131979695, "eval_loss": 0.8532436490058899, "eval_runtime": 146.5502, "eval_samples_per_second": 57.318, "eval_steps_per_second": 0.901, "step": 150 }, { "epoch": 0.5245346869712352, "grad_norm": 3.387238895689527, "learning_rate": 9.337587608588588e-07, "loss": 0.8344, "step": 155 }, { "epoch": 0.5414551607445008, "grad_norm": 3.3257959351398165, "learning_rate": 9.29224396800933e-07, "loss": 0.8296, "step": 160 }, { "epoch": 0.5583756345177665, "grad_norm": 3.4206543705067425, "learning_rate": 9.245517091110968e-07, "loss": 0.8281, "step": 165 }, { "epoch": 0.5752961082910322, "grad_norm": 3.2917606038623672, "learning_rate": 9.197422036290386e-07, "loss": 0.8388, "step": 170 }, { "epoch": 0.5922165820642978, "grad_norm": 3.6356342856497874, "learning_rate": 9.147974302859156e-07, "loss": 0.8479, "step": 175 }, { "epoch": 0.6091370558375635, "grad_norm": 3.531537582517598, "learning_rate": 9.097189826048659e-07, "loss": 0.8465, "step": 180 }, { "epoch": 0.626057529610829, "grad_norm": 3.534890106733192, "learning_rate": 9.045084971874737e-07, "loss": 0.8338, "step": 185 }, { "epoch": 0.6429780033840947, "grad_norm": 3.5162595243845476, "learning_rate": 8.991676531863507e-07, "loss": 0.8454, "step": 190 }, { "epoch": 0.6598984771573604, "grad_norm": 3.6712481126005607, "learning_rate": 8.93698171764006e-07, "loss": 0.8239, "step": 195 }, { "epoch": 0.676818950930626, "grad_norm": 3.3780652656023573, "learning_rate": 8.881018155381765e-07, "loss": 0.8269, "step": 200 }, { "epoch": 0.676818950930626, "eval_loss": 0.8365465998649597, "eval_runtime": 146.6071, "eval_samples_per_second": 57.296, "eval_steps_per_second": 0.9, "step": 200 }, { "epoch": 0.6937394247038917, "grad_norm": 3.3425859477068123, "learning_rate": 8.823803880137992e-07, "loss": 0.8382, "step": 205 }, { "epoch": 0.7106598984771574, "grad_norm": 3.2738301474432956, "learning_rate": 8.765357330018055e-07, "loss": 0.8456, "step": 210 }, { "epoch": 0.727580372250423, "grad_norm": 3.563151004829781, "learning_rate": 8.705697340249274e-07, "loss": 0.8266, "step": 215 }, { "epoch": 0.7445008460236887, "grad_norm": 3.1683419089159126, "learning_rate": 8.644843137107057e-07, "loss": 0.8403, "step": 220 }, { "epoch": 0.7614213197969543, "grad_norm": 3.609357791743884, "learning_rate": 8.58281433171896e-07, "loss": 0.8244, "step": 225 }, { "epoch": 0.7783417935702199, "grad_norm": 3.421312500448169, "learning_rate": 8.519630913744724e-07, "loss": 0.8288, "step": 230 }, { "epoch": 0.7952622673434856, "grad_norm": 3.577171842922122, "learning_rate": 8.455313244934324e-07, "loss": 0.8167, "step": 235 }, { "epoch": 0.8121827411167513, "grad_norm": 3.325456007325782, "learning_rate": 8.389882052566105e-07, "loss": 0.8118, "step": 240 }, { "epoch": 0.8291032148900169, "grad_norm": 3.5627054194832914, "learning_rate": 8.323358422767128e-07, "loss": 0.8378, "step": 245 }, { "epoch": 0.8460236886632826, "grad_norm": 3.57666960479274, "learning_rate": 8.255763793717867e-07, "loss": 0.8223, "step": 250 }, { "epoch": 0.8460236886632826, "eval_loss": 0.8226217031478882, "eval_runtime": 146.6337, "eval_samples_per_second": 57.286, "eval_steps_per_second": 0.9, "step": 250 }, { "epoch": 0.8629441624365483, "grad_norm": 3.4619536067260244, "learning_rate": 8.187119948743449e-07, "loss": 0.8215, "step": 255 }, { "epoch": 0.8798646362098139, "grad_norm": 3.3755438705432805, "learning_rate": 8.117449009293668e-07, "loss": 0.7959, "step": 260 }, { "epoch": 0.8967851099830795, "grad_norm": 3.3504490747841595, "learning_rate": 8.046773427814041e-07, "loss": 0.8198, "step": 265 }, { "epoch": 0.9137055837563451, "grad_norm": 3.652293584185451, "learning_rate": 7.975115980510185e-07, "loss": 0.8198, "step": 270 }, { "epoch": 0.9306260575296108, "grad_norm": 3.497277962331386, "learning_rate": 7.902499760007867e-07, "loss": 0.8181, "step": 275 }, { "epoch": 0.9475465313028765, "grad_norm": 3.5043433150139336, "learning_rate": 7.828948167911073e-07, "loss": 0.8151, "step": 280 }, { "epoch": 0.9644670050761421, "grad_norm": 3.5662591692739034, "learning_rate": 7.754484907260512e-07, "loss": 0.8192, "step": 285 }, { "epoch": 0.9813874788494078, "grad_norm": 3.741586297285277, "learning_rate": 7.679133974894982e-07, "loss": 0.7975, "step": 290 }, { "epoch": 0.9983079526226735, "grad_norm": 3.2550447105418057, "learning_rate": 7.602919653718043e-07, "loss": 0.7885, "step": 295 }, { "epoch": 1.015228426395939, "grad_norm": 3.3766404706055613, "learning_rate": 7.525866504872506e-07, "loss": 0.7651, "step": 300 }, { "epoch": 1.015228426395939, "eval_loss": 0.8148965239524841, "eval_runtime": 146.61, "eval_samples_per_second": 57.295, "eval_steps_per_second": 0.9, "step": 300 }, { "epoch": 1.0321489001692048, "grad_norm": 3.5183190070892314, "learning_rate": 7.447999359825262e-07, "loss": 0.7393, "step": 305 }, { "epoch": 1.0490693739424704, "grad_norm": 3.7614891711473657, "learning_rate": 7.369343312364993e-07, "loss": 0.7621, "step": 310 }, { "epoch": 1.0659898477157361, "grad_norm": 3.5318777711133125, "learning_rate": 7.289923710515338e-07, "loss": 0.7546, "step": 315 }, { "epoch": 1.0829103214890017, "grad_norm": 3.5586679876971754, "learning_rate": 7.209766148366134e-07, "loss": 0.759, "step": 320 }, { "epoch": 1.0998307952622675, "grad_norm": 3.3557844891140305, "learning_rate": 7.128896457825363e-07, "loss": 0.7445, "step": 325 }, { "epoch": 1.116751269035533, "grad_norm": 3.439711853714508, "learning_rate": 7.047340700294453e-07, "loss": 0.7406, "step": 330 }, { "epoch": 1.1336717428087986, "grad_norm": 3.522824978614251, "learning_rate": 6.965125158269618e-07, "loss": 0.7368, "step": 335 }, { "epoch": 1.1505922165820643, "grad_norm": 4.008601044386287, "learning_rate": 6.882276326871959e-07, "loss": 0.7578, "step": 340 }, { "epoch": 1.16751269035533, "grad_norm": 3.6557873733426955, "learning_rate": 6.798820905309035e-07, "loss": 0.7332, "step": 345 }, { "epoch": 1.1844331641285957, "grad_norm": 3.5152732593214515, "learning_rate": 6.714785788270657e-07, "loss": 0.7388, "step": 350 }, { "epoch": 1.1844331641285957, "eval_loss": 0.8107805252075195, "eval_runtime": 146.5199, "eval_samples_per_second": 57.33, "eval_steps_per_second": 0.901, "step": 350 }, { "epoch": 1.2013536379018612, "grad_norm": 3.7338182802228093, "learning_rate": 6.630198057261709e-07, "loss": 0.7406, "step": 355 }, { "epoch": 1.218274111675127, "grad_norm": 3.5135812697699724, "learning_rate": 6.545084971874736e-07, "loss": 0.7421, "step": 360 }, { "epoch": 1.2351945854483926, "grad_norm": 3.508021675469905, "learning_rate": 6.459473961005168e-07, "loss": 0.7755, "step": 365 }, { "epoch": 1.252115059221658, "grad_norm": 3.5287017860167196, "learning_rate": 6.373392614011951e-07, "loss": 0.7408, "step": 370 }, { "epoch": 1.2690355329949239, "grad_norm": 3.6233235029794093, "learning_rate": 6.286868671826511e-07, "loss": 0.751, "step": 375 }, { "epoch": 1.2859560067681894, "grad_norm": 3.5669498367227304, "learning_rate": 6.199930018012829e-07, "loss": 0.7276, "step": 380 }, { "epoch": 1.3028764805414552, "grad_norm": 3.7287000280408176, "learning_rate": 6.112604669781572e-07, "loss": 0.7278, "step": 385 }, { "epoch": 1.3197969543147208, "grad_norm": 3.824405237133237, "learning_rate": 6.024920768961152e-07, "loss": 0.743, "step": 390 }, { "epoch": 1.3367174280879865, "grad_norm": 3.5197677626280965, "learning_rate": 5.936906572928624e-07, "loss": 0.7159, "step": 395 }, { "epoch": 1.353637901861252, "grad_norm": 3.759524343808812, "learning_rate": 5.848590445503344e-07, "loss": 0.7429, "step": 400 }, { "epoch": 1.353637901861252, "eval_loss": 0.805133044719696, "eval_runtime": 146.7971, "eval_samples_per_second": 57.222, "eval_steps_per_second": 0.899, "step": 400 }, { "epoch": 1.3705583756345177, "grad_norm": 3.797267695279564, "learning_rate": 5.760000847806337e-07, "loss": 0.7464, "step": 405 }, { "epoch": 1.3874788494077834, "grad_norm": 3.439223330784389, "learning_rate": 5.671166329088277e-07, "loss": 0.725, "step": 410 }, { "epoch": 1.404399323181049, "grad_norm": 3.6761682639396653, "learning_rate": 5.582115517529114e-07, "loss": 0.7311, "step": 415 }, { "epoch": 1.4213197969543148, "grad_norm": 3.571768390407566, "learning_rate": 5.492877111012218e-07, "loss": 0.7393, "step": 420 }, { "epoch": 1.4382402707275803, "grad_norm": 3.8046958424761623, "learning_rate": 5.403479867876087e-07, "loss": 0.758, "step": 425 }, { "epoch": 1.455160744500846, "grad_norm": 3.552061598209118, "learning_rate": 5.313952597646567e-07, "loss": 0.741, "step": 430 }, { "epoch": 1.4720812182741116, "grad_norm": 3.5137582048526546, "learning_rate": 5.224324151752575e-07, "loss": 0.736, "step": 435 }, { "epoch": 1.4890016920473772, "grad_norm": 3.6806640730520046, "learning_rate": 5.134623414228315e-07, "loss": 0.7414, "step": 440 }, { "epoch": 1.505922165820643, "grad_norm": 3.7306988391241203, "learning_rate": 5.044879292404989e-07, "loss": 0.7578, "step": 445 }, { "epoch": 1.5228426395939088, "grad_norm": 3.5044826704791543, "learning_rate": 4.95512070759501e-07, "loss": 0.7481, "step": 450 }, { "epoch": 1.5228426395939088, "eval_loss": 0.8002220392227173, "eval_runtime": 146.6261, "eval_samples_per_second": 57.289, "eval_steps_per_second": 0.9, "step": 450 }, { "epoch": 1.5397631133671743, "grad_norm": 3.5593238876932416, "learning_rate": 4.865376585771687e-07, "loss": 0.741, "step": 455 }, { "epoch": 1.5566835871404399, "grad_norm": 3.9021045537145174, "learning_rate": 4.775675848247427e-07, "loss": 0.7462, "step": 460 }, { "epoch": 1.5736040609137056, "grad_norm": 3.603020142861588, "learning_rate": 4.686047402353433e-07, "loss": 0.7344, "step": 465 }, { "epoch": 1.5905245346869712, "grad_norm": 3.5798855947417247, "learning_rate": 4.596520132123914e-07, "loss": 0.7246, "step": 470 }, { "epoch": 1.6074450084602367, "grad_norm": 3.392440988553216, "learning_rate": 4.507122888987782e-07, "loss": 0.7304, "step": 475 }, { "epoch": 1.6243654822335025, "grad_norm": 3.7346005543444307, "learning_rate": 4.417884482470886e-07, "loss": 0.7329, "step": 480 }, { "epoch": 1.6412859560067683, "grad_norm": 3.929271128512869, "learning_rate": 4.328833670911724e-07, "loss": 0.7529, "step": 485 }, { "epoch": 1.6582064297800339, "grad_norm": 3.5171536414776163, "learning_rate": 4.239999152193664e-07, "loss": 0.7531, "step": 490 }, { "epoch": 1.6751269035532994, "grad_norm": 3.574806818948794, "learning_rate": 4.1514095544966557e-07, "loss": 0.7418, "step": 495 }, { "epoch": 1.6920473773265652, "grad_norm": 3.5129274484405486, "learning_rate": 4.0630934270713755e-07, "loss": 0.7308, "step": 500 }, { "epoch": 1.6920473773265652, "eval_loss": 0.795360267162323, "eval_runtime": 146.6381, "eval_samples_per_second": 57.284, "eval_steps_per_second": 0.9, "step": 500 }, { "epoch": 1.708967851099831, "grad_norm": 3.483481666306759, "learning_rate": 3.9750792310388483e-07, "loss": 0.7311, "step": 505 }, { "epoch": 1.7258883248730963, "grad_norm": 3.5163371793641387, "learning_rate": 3.8873953302184283e-07, "loss": 0.7268, "step": 510 }, { "epoch": 1.742808798646362, "grad_norm": 3.746899781553285, "learning_rate": 3.80006998198717e-07, "loss": 0.7471, "step": 515 }, { "epoch": 1.7597292724196278, "grad_norm": 3.6600329176410735, "learning_rate": 3.713131328173489e-07, "loss": 0.7426, "step": 520 }, { "epoch": 1.7766497461928934, "grad_norm": 3.715103932641678, "learning_rate": 3.62660738598805e-07, "loss": 0.7452, "step": 525 }, { "epoch": 1.793570219966159, "grad_norm": 3.6219953866879036, "learning_rate": 3.5405260389948333e-07, "loss": 0.7447, "step": 530 }, { "epoch": 1.8104906937394247, "grad_norm": 3.5483251792927866, "learning_rate": 3.454915028125263e-07, "loss": 0.7219, "step": 535 }, { "epoch": 1.8274111675126905, "grad_norm": 3.687667827428392, "learning_rate": 3.369801942738291e-07, "loss": 0.7297, "step": 540 }, { "epoch": 1.844331641285956, "grad_norm": 3.538997656633232, "learning_rate": 3.285214211729343e-07, "loss": 0.7498, "step": 545 }, { "epoch": 1.8612521150592216, "grad_norm": 3.763523903194986, "learning_rate": 3.2011790946909666e-07, "loss": 0.7306, "step": 550 }, { "epoch": 1.8612521150592216, "eval_loss": 0.7919500470161438, "eval_runtime": 146.647, "eval_samples_per_second": 57.28, "eval_steps_per_second": 0.9, "step": 550 }, { "epoch": 1.8781725888324874, "grad_norm": 4.154646770046085, "learning_rate": 3.11772367312804e-07, "loss": 0.7364, "step": 555 }, { "epoch": 1.895093062605753, "grad_norm": 3.6964867615997874, "learning_rate": 3.034874841730382e-07, "loss": 0.7357, "step": 560 }, { "epoch": 1.9120135363790185, "grad_norm": 3.6098682228440024, "learning_rate": 2.9526592997055483e-07, "loss": 0.7435, "step": 565 }, { "epoch": 1.9289340101522843, "grad_norm": 3.5510859742021514, "learning_rate": 2.8711035421746363e-07, "loss": 0.7401, "step": 570 }, { "epoch": 1.94585448392555, "grad_norm": 3.5339412485916477, "learning_rate": 2.7902338516338674e-07, "loss": 0.7196, "step": 575 }, { "epoch": 1.9627749576988156, "grad_norm": 3.654769871220945, "learning_rate": 2.7100762894846627e-07, "loss": 0.7427, "step": 580 }, { "epoch": 1.9796954314720812, "grad_norm": 3.5585570542699485, "learning_rate": 2.6306566876350067e-07, "loss": 0.7549, "step": 585 }, { "epoch": 1.996615905245347, "grad_norm": 3.5792060121886804, "learning_rate": 2.5520006401747395e-07, "loss": 0.7306, "step": 590 }, { "epoch": 2.0135363790186127, "grad_norm": 3.629067553958508, "learning_rate": 2.474133495127494e-07, "loss": 0.7062, "step": 595 }, { "epoch": 2.030456852791878, "grad_norm": 3.646688738000953, "learning_rate": 2.3970803462819583e-07, "loss": 0.7065, "step": 600 }, { "epoch": 2.030456852791878, "eval_loss": 0.7942918539047241, "eval_runtime": 304.0198, "eval_samples_per_second": 27.63, "eval_steps_per_second": 0.434, "step": 600 }, { "epoch": 2.047377326565144, "grad_norm": 4.052211962047984, "learning_rate": 2.3208660251050156e-07, "loss": 0.675, "step": 605 }, { "epoch": 2.0642978003384096, "grad_norm": 3.9688880359901444, "learning_rate": 2.2455150927394878e-07, "loss": 0.6934, "step": 610 }, { "epoch": 2.081218274111675, "grad_norm": 3.8191830448574575, "learning_rate": 2.1710518320889276e-07, "loss": 0.695, "step": 615 }, { "epoch": 2.0981387478849407, "grad_norm": 3.9122017489827488, "learning_rate": 2.097500239992132e-07, "loss": 0.6909, "step": 620 }, { "epoch": 2.1150592216582065, "grad_norm": 4.035937936283847, "learning_rate": 2.0248840194898155e-07, "loss": 0.6869, "step": 625 }, { "epoch": 2.1319796954314723, "grad_norm": 4.0592719997150875, "learning_rate": 1.9532265721859597e-07, "loss": 0.6758, "step": 630 }, { "epoch": 2.1489001692047376, "grad_norm": 3.99048820587498, "learning_rate": 1.8825509907063326e-07, "loss": 0.6717, "step": 635 }, { "epoch": 2.1658206429780034, "grad_norm": 3.6085759994280413, "learning_rate": 1.812880051256551e-07, "loss": 0.7084, "step": 640 }, { "epoch": 2.182741116751269, "grad_norm": 3.8339000315450633, "learning_rate": 1.744236206282132e-07, "loss": 0.6795, "step": 645 }, { "epoch": 2.199661590524535, "grad_norm": 3.8042578709482937, "learning_rate": 1.6766415772328728e-07, "loss": 0.695, "step": 650 }, { "epoch": 2.199661590524535, "eval_loss": 0.7947296500205994, "eval_runtime": 146.6011, "eval_samples_per_second": 57.298, "eval_steps_per_second": 0.9, "step": 650 }, { "epoch": 2.2165820642978002, "grad_norm": 3.6822683929855318, "learning_rate": 1.6101179474338966e-07, "loss": 0.6637, "step": 655 }, { "epoch": 2.233502538071066, "grad_norm": 3.8036837748646746, "learning_rate": 1.5446867550656767e-07, "loss": 0.6846, "step": 660 }, { "epoch": 2.250423011844332, "grad_norm": 3.7154882715692747, "learning_rate": 1.4803690862552753e-07, "loss": 0.6761, "step": 665 }, { "epoch": 2.267343485617597, "grad_norm": 3.8188043343483167, "learning_rate": 1.4171856682810384e-07, "loss": 0.6834, "step": 670 }, { "epoch": 2.284263959390863, "grad_norm": 3.7763683799555494, "learning_rate": 1.3551568628929432e-07, "loss": 0.682, "step": 675 }, { "epoch": 2.3011844331641287, "grad_norm": 3.7184968099032742, "learning_rate": 1.2943026597507267e-07, "loss": 0.6758, "step": 680 }, { "epoch": 2.3181049069373945, "grad_norm": 3.6752732128517978, "learning_rate": 1.2346426699819456e-07, "loss": 0.6778, "step": 685 }, { "epoch": 2.33502538071066, "grad_norm": 3.914291257516614, "learning_rate": 1.176196119862008e-07, "loss": 0.6915, "step": 690 }, { "epoch": 2.3519458544839256, "grad_norm": 3.6920364954305307, "learning_rate": 1.1189818446182358e-07, "loss": 0.6858, "step": 695 }, { "epoch": 2.3688663282571913, "grad_norm": 3.848115462041246, "learning_rate": 1.0630182823599399e-07, "loss": 0.7013, "step": 700 }, { "epoch": 2.3688663282571913, "eval_loss": 0.7938902378082275, "eval_runtime": 146.616, "eval_samples_per_second": 57.292, "eval_steps_per_second": 0.9, "step": 700 }, { "epoch": 2.3857868020304567, "grad_norm": 3.7700574896430665, "learning_rate": 1.0083234681364932e-07, "loss": 0.6637, "step": 705 }, { "epoch": 2.4027072758037225, "grad_norm": 3.662805050537354, "learning_rate": 9.549150281252632e-08, "loss": 0.6885, "step": 710 }, { "epoch": 2.4196277495769882, "grad_norm": 3.7513042616130616, "learning_rate": 9.028101739513405e-08, "loss": 0.6949, "step": 715 }, { "epoch": 2.436548223350254, "grad_norm": 3.5781011423869193, "learning_rate": 8.520256971408452e-08, "loss": 0.6796, "step": 720 }, { "epoch": 2.4534686971235193, "grad_norm": 3.8550128332718465, "learning_rate": 8.025779637096137e-08, "loss": 0.6724, "step": 725 }, { "epoch": 2.470389170896785, "grad_norm": 3.908604741906314, "learning_rate": 7.544829088890325e-08, "loss": 0.6789, "step": 730 }, { "epoch": 2.487309644670051, "grad_norm": 3.815438363979128, "learning_rate": 7.077560319906694e-08, "loss": 0.6878, "step": 735 }, { "epoch": 2.504230118443316, "grad_norm": 3.678241183457789, "learning_rate": 6.624123914114122e-08, "loss": 0.6758, "step": 740 }, { "epoch": 2.521150592216582, "grad_norm": 3.766677918686516, "learning_rate": 6.184665997806831e-08, "loss": 0.6743, "step": 745 }, { "epoch": 2.5380710659898478, "grad_norm": 3.7999340905324974, "learning_rate": 5.759328192513074e-08, "loss": 0.6743, "step": 750 }, { "epoch": 2.5380710659898478, "eval_loss": 0.7932254672050476, "eval_runtime": 146.605, "eval_samples_per_second": 57.297, "eval_steps_per_second": 0.9, "step": 750 }, { "epoch": 2.5549915397631136, "grad_norm": 3.865893271080869, "learning_rate": 5.348247569355735e-08, "loss": 0.6872, "step": 755 }, { "epoch": 2.571912013536379, "grad_norm": 4.144060563570108, "learning_rate": 4.951556604879048e-08, "loss": 0.694, "step": 760 }, { "epoch": 2.5888324873096447, "grad_norm": 3.780680669356305, "learning_rate": 4.569383138356275e-08, "loss": 0.6826, "step": 765 }, { "epoch": 2.6057529610829104, "grad_norm": 4.114587718099576, "learning_rate": 4.201850330591677e-08, "loss": 0.68, "step": 770 }, { "epoch": 2.6226734348561758, "grad_norm": 3.4623751046204285, "learning_rate": 3.8490766242301353e-08, "loss": 0.6658, "step": 775 }, { "epoch": 2.6395939086294415, "grad_norm": 3.9196860968662137, "learning_rate": 3.5111757055874326e-08, "loss": 0.6898, "step": 780 }, { "epoch": 2.6565143824027073, "grad_norm": 3.820036546151991, "learning_rate": 3.188256468013139e-08, "loss": 0.6732, "step": 785 }, { "epoch": 2.673434856175973, "grad_norm": 4.231574698760902, "learning_rate": 2.8804229767982636e-08, "loss": 0.6728, "step": 790 }, { "epoch": 2.6903553299492384, "grad_norm": 4.115986044491551, "learning_rate": 2.587774435638679e-08, "loss": 0.6755, "step": 795 }, { "epoch": 2.707275803722504, "grad_norm": 3.724220098163462, "learning_rate": 2.3104051546654013e-08, "loss": 0.6778, "step": 800 }, { "epoch": 2.707275803722504, "eval_loss": 0.7928686141967773, "eval_runtime": 146.5361, "eval_samples_per_second": 57.324, "eval_steps_per_second": 0.901, "step": 800 }, { "epoch": 2.72419627749577, "grad_norm": 4.055902737307998, "learning_rate": 2.048404520051722e-08, "loss": 0.6846, "step": 805 }, { "epoch": 2.7411167512690353, "grad_norm": 3.761429857208659, "learning_rate": 1.8018569652073378e-08, "loss": 0.6865, "step": 810 }, { "epoch": 2.758037225042301, "grad_norm": 3.828128027906773, "learning_rate": 1.570841943568446e-08, "loss": 0.6712, "step": 815 }, { "epoch": 2.774957698815567, "grad_norm": 3.765063878680342, "learning_rate": 1.3554339029927531e-08, "loss": 0.6816, "step": 820 }, { "epoch": 2.7918781725888326, "grad_norm": 3.827232987418165, "learning_rate": 1.1557022617676216e-08, "loss": 0.6789, "step": 825 }, { "epoch": 2.808798646362098, "grad_norm": 3.9934159183359106, "learning_rate": 9.717113862389992e-09, "loss": 0.6961, "step": 830 }, { "epoch": 2.8257191201353637, "grad_norm": 3.905375697814281, "learning_rate": 8.035205700685165e-09, "loss": 0.6794, "step": 835 }, { "epoch": 2.8426395939086295, "grad_norm": 3.8176339677184687, "learning_rate": 6.511840151252168e-09, "loss": 0.6858, "step": 840 }, { "epoch": 2.859560067681895, "grad_norm": 3.669045035086845, "learning_rate": 5.147508140182555e-09, "loss": 0.6709, "step": 845 }, { "epoch": 2.8764805414551606, "grad_norm": 3.7873007524702644, "learning_rate": 3.9426493427611175e-09, "loss": 0.6951, "step": 850 }, { "epoch": 2.8764805414551606, "eval_loss": 0.7926760911941528, "eval_runtime": 146.7334, "eval_samples_per_second": 57.247, "eval_steps_per_second": 0.9, "step": 850 }, { "epoch": 2.8934010152284264, "grad_norm": 3.6861972611151548, "learning_rate": 2.897652041774279e-09, "loss": 0.6878, "step": 855 }, { "epoch": 2.910321489001692, "grad_norm": 3.6981706590030003, "learning_rate": 2.0128530023804656e-09, "loss": 0.6979, "step": 860 }, { "epoch": 2.927241962774958, "grad_norm": 3.788952889279561, "learning_rate": 1.2885373635829754e-09, "loss": 0.6897, "step": 865 }, { "epoch": 2.9441624365482233, "grad_norm": 3.818071711218172, "learning_rate": 7.249385463395374e-10, "loss": 0.6883, "step": 870 }, { "epoch": 2.961082910321489, "grad_norm": 3.612189636258872, "learning_rate": 3.22238178339318e-10, "loss": 0.6879, "step": 875 }, { "epoch": 2.9780033840947544, "grad_norm": 3.6807850492378975, "learning_rate": 8.056603547090812e-11, "loss": 0.6952, "step": 880 }, { "epoch": 2.99492385786802, "grad_norm": 3.843102017586158, "learning_rate": 0.0, "loss": 0.6819, "step": 885 }, { "epoch": 2.99492385786802, "step": 885, "total_flos": 5218127163949056.0, "train_loss": 0.776407975396194, "train_runtime": 14269.6597, "train_samples_per_second": 15.893, "train_steps_per_second": 0.062 } ], "logging_steps": 5, "max_steps": 885, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5218127163949056.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }