{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9991836734693877, "eval_steps": 500, "global_step": 153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006530612244897959, "grad_norm": 0.6802010536193848, "learning_rate": 4.999472998758978e-05, "loss": 0.7016, "num_input_tokens_seen": 2097152, "step": 1 }, { "epoch": 0.013061224489795919, "grad_norm": 0.6286860704421997, "learning_rate": 4.99789221722016e-05, "loss": 0.6549, "num_input_tokens_seen": 4194304, "step": 2 }, { "epoch": 0.019591836734693877, "grad_norm": 0.5965273976325989, "learning_rate": 4.995258321842611e-05, "loss": 0.6414, "num_input_tokens_seen": 6291456, "step": 3 }, { "epoch": 0.026122448979591838, "grad_norm": 0.513691782951355, "learning_rate": 4.991572423079236e-05, "loss": 0.6061, "num_input_tokens_seen": 8388608, "step": 4 }, { "epoch": 0.0326530612244898, "grad_norm": 0.5384091138839722, "learning_rate": 4.986836074908616e-05, "loss": 0.6026, "num_input_tokens_seen": 10485760, "step": 5 }, { "epoch": 0.03918367346938775, "grad_norm": 0.48317599296569824, "learning_rate": 4.98105127417984e-05, "loss": 0.5596, "num_input_tokens_seen": 12582912, "step": 6 }, { "epoch": 0.045714285714285714, "grad_norm": 0.35217732191085815, "learning_rate": 4.974220459770639e-05, "loss": 0.557, "num_input_tokens_seen": 14680064, "step": 7 }, { "epoch": 0.052244897959183675, "grad_norm": 0.15410029888153076, "learning_rate": 4.966346511559149e-05, "loss": 0.5416, "num_input_tokens_seen": 16777216, "step": 8 }, { "epoch": 0.05877551020408163, "grad_norm": 0.10962219536304474, "learning_rate": 4.957432749209755e-05, "loss": 0.5424, "num_input_tokens_seen": 18874368, "step": 9 }, { "epoch": 0.0653061224489796, "grad_norm": 0.0843987911939621, "learning_rate": 4.9474829307735115e-05, "loss": 0.5242, "num_input_tokens_seen": 20971520, "step": 10 }, { "epoch": 0.07183673469387755, "grad_norm": 0.06282058358192444, "learning_rate": 4.9365012511037514e-05, "loss": 0.5195, "num_input_tokens_seen": 23068672, "step": 11 }, { "epoch": 0.0783673469387755, "grad_norm": 0.05317516624927521, "learning_rate": 4.9244923400875245e-05, "loss": 0.5259, "num_input_tokens_seen": 25165824, "step": 12 }, { "epoch": 0.08489795918367347, "grad_norm": 0.04503787308931351, "learning_rate": 4.911461260693638e-05, "loss": 0.521, "num_input_tokens_seen": 27262976, "step": 13 }, { "epoch": 0.09142857142857143, "grad_norm": 0.040327247232198715, "learning_rate": 4.8974135068381036e-05, "loss": 0.5269, "num_input_tokens_seen": 29360128, "step": 14 }, { "epoch": 0.09795918367346938, "grad_norm": 0.03843770548701286, "learning_rate": 4.882355001067892e-05, "loss": 0.5208, "num_input_tokens_seen": 31457280, "step": 15 }, { "epoch": 0.10448979591836735, "grad_norm": 0.03380432352423668, "learning_rate": 4.8662920920639866e-05, "loss": 0.5193, "num_input_tokens_seen": 33554432, "step": 16 }, { "epoch": 0.1110204081632653, "grad_norm": 0.031558409333229065, "learning_rate": 4.849231551964771e-05, "loss": 0.4995, "num_input_tokens_seen": 35651584, "step": 17 }, { "epoch": 0.11755102040816326, "grad_norm": 0.031164532527327538, "learning_rate": 4.8311805735108894e-05, "loss": 0.5277, "num_input_tokens_seen": 37748736, "step": 18 }, { "epoch": 0.12408163265306123, "grad_norm": 0.030629999935626984, "learning_rate": 4.81214676701278e-05, "loss": 0.5187, "num_input_tokens_seen": 39845888, "step": 19 }, { "epoch": 0.1306122448979592, "grad_norm": 0.028932394459843636, "learning_rate": 4.792138157142158e-05, "loss": 0.5322, "num_input_tokens_seen": 41943040, "step": 20 }, { "epoch": 0.13714285714285715, "grad_norm": 0.027270587161183357, "learning_rate": 4.7711631795488096e-05, "loss": 0.4978, "num_input_tokens_seen": 44040192, "step": 21 }, { "epoch": 0.1436734693877551, "grad_norm": 0.027742592617869377, "learning_rate": 4.749230677304114e-05, "loss": 0.5253, "num_input_tokens_seen": 46137344, "step": 22 }, { "epoch": 0.15020408163265306, "grad_norm": 0.027034137398004532, "learning_rate": 4.726349897172791e-05, "loss": 0.5309, "num_input_tokens_seen": 48234496, "step": 23 }, { "epoch": 0.156734693877551, "grad_norm": 0.026147669181227684, "learning_rate": 4.702530485714461e-05, "loss": 0.5197, "num_input_tokens_seen": 50331648, "step": 24 }, { "epoch": 0.16326530612244897, "grad_norm": 0.025035962462425232, "learning_rate": 4.677782485216644e-05, "loss": 0.5115, "num_input_tokens_seen": 52428800, "step": 25 }, { "epoch": 0.16979591836734695, "grad_norm": 0.024556027725338936, "learning_rate": 4.6521163294609196e-05, "loss": 0.4952, "num_input_tokens_seen": 54525952, "step": 26 }, { "epoch": 0.1763265306122449, "grad_norm": 0.02424173429608345, "learning_rate": 4.625542839324036e-05, "loss": 0.4781, "num_input_tokens_seen": 56623104, "step": 27 }, { "epoch": 0.18285714285714286, "grad_norm": 0.02580653876066208, "learning_rate": 4.598073218215817e-05, "loss": 0.5067, "num_input_tokens_seen": 58720256, "step": 28 }, { "epoch": 0.1893877551020408, "grad_norm": 0.024303199723362923, "learning_rate": 4.5697190473557946e-05, "loss": 0.514, "num_input_tokens_seen": 60817408, "step": 29 }, { "epoch": 0.19591836734693877, "grad_norm": 0.02427365817129612, "learning_rate": 4.540492280890555e-05, "loss": 0.5062, "num_input_tokens_seen": 62914560, "step": 30 }, { "epoch": 0.20244897959183675, "grad_norm": 0.023720987141132355, "learning_rate": 4.510405240853854e-05, "loss": 0.494, "num_input_tokens_seen": 65011712, "step": 31 }, { "epoch": 0.2089795918367347, "grad_norm": 0.0251515731215477, "learning_rate": 4.4794706119716455e-05, "loss": 0.5056, "num_input_tokens_seen": 67108864, "step": 32 }, { "epoch": 0.21551020408163266, "grad_norm": 0.022940408438444138, "learning_rate": 4.447701436314176e-05, "loss": 0.5128, "num_input_tokens_seen": 69206016, "step": 33 }, { "epoch": 0.2220408163265306, "grad_norm": 0.023887069895863533, "learning_rate": 4.415111107797445e-05, "loss": 0.5024, "num_input_tokens_seen": 71303168, "step": 34 }, { "epoch": 0.22857142857142856, "grad_norm": 0.022622637450695038, "learning_rate": 4.381713366536311e-05, "loss": 0.4998, "num_input_tokens_seen": 73400320, "step": 35 }, { "epoch": 0.23510204081632652, "grad_norm": 0.02263909950852394, "learning_rate": 4.347522293051648e-05, "loss": 0.5222, "num_input_tokens_seen": 75497472, "step": 36 }, { "epoch": 0.2416326530612245, "grad_norm": 0.023353304713964462, "learning_rate": 4.312552302333982e-05, "loss": 0.501, "num_input_tokens_seen": 77594624, "step": 37 }, { "epoch": 0.24816326530612245, "grad_norm": 0.02225712686777115, "learning_rate": 4.276818137766118e-05, "loss": 0.5035, "num_input_tokens_seen": 79691776, "step": 38 }, { "epoch": 0.2546938775510204, "grad_norm": 0.022769000381231308, "learning_rate": 4.2403348649073174e-05, "loss": 0.5115, "num_input_tokens_seen": 81788928, "step": 39 }, { "epoch": 0.2612244897959184, "grad_norm": 0.0222884863615036, "learning_rate": 4.203117865141635e-05, "loss": 0.5087, "num_input_tokens_seen": 83886080, "step": 40 }, { "epoch": 0.2677551020408163, "grad_norm": 0.02289729192852974, "learning_rate": 4.1651828291931264e-05, "loss": 0.5095, "num_input_tokens_seen": 85983232, "step": 41 }, { "epoch": 0.2742857142857143, "grad_norm": 0.023385386914014816, "learning_rate": 4.126545750510605e-05, "loss": 0.5033, "num_input_tokens_seen": 88080384, "step": 42 }, { "epoch": 0.2808163265306122, "grad_norm": 0.02286476455628872, "learning_rate": 4.0872229185248075e-05, "loss": 0.5245, "num_input_tokens_seen": 90177536, "step": 43 }, { "epoch": 0.2873469387755102, "grad_norm": 0.02207457832992077, "learning_rate": 4.047230911780737e-05, "loss": 0.5041, "num_input_tokens_seen": 92274688, "step": 44 }, { "epoch": 0.2938775510204082, "grad_norm": 0.021405808627605438, "learning_rate": 4.0065865909481417e-05, "loss": 0.4929, "num_input_tokens_seen": 94371840, "step": 45 }, { "epoch": 0.3004081632653061, "grad_norm": 0.022242728620767593, "learning_rate": 3.965307091713037e-05, "loss": 0.4939, "num_input_tokens_seen": 96468992, "step": 46 }, { "epoch": 0.3069387755102041, "grad_norm": 0.02234838157892227, "learning_rate": 3.923409817553284e-05, "loss": 0.5008, "num_input_tokens_seen": 98566144, "step": 47 }, { "epoch": 0.313469387755102, "grad_norm": 0.020911742001771927, "learning_rate": 3.880912432401265e-05, "loss": 0.5023, "num_input_tokens_seen": 100663296, "step": 48 }, { "epoch": 0.32, "grad_norm": 0.02207380160689354, "learning_rate": 3.837832853196751e-05, "loss": 0.5124, "num_input_tokens_seen": 102760448, "step": 49 }, { "epoch": 0.32653061224489793, "grad_norm": 0.021706702187657356, "learning_rate": 3.794189242333106e-05, "loss": 0.5033, "num_input_tokens_seen": 104857600, "step": 50 }, { "epoch": 0.3330612244897959, "grad_norm": 0.02188958041369915, "learning_rate": 3.7500000000000003e-05, "loss": 0.4958, "num_input_tokens_seen": 106954752, "step": 51 }, { "epoch": 0.3395918367346939, "grad_norm": 0.02165570855140686, "learning_rate": 3.705283756425872e-05, "loss": 0.5006, "num_input_tokens_seen": 109051904, "step": 52 }, { "epoch": 0.3461224489795918, "grad_norm": 0.02152332104742527, "learning_rate": 3.6600593640234086e-05, "loss": 0.5033, "num_input_tokens_seen": 111149056, "step": 53 }, { "epoch": 0.3526530612244898, "grad_norm": 0.02067963406443596, "learning_rate": 3.6143458894413465e-05, "loss": 0.5038, "num_input_tokens_seen": 113246208, "step": 54 }, { "epoch": 0.35918367346938773, "grad_norm": 0.02125193364918232, "learning_rate": 3.568162605525953e-05, "loss": 0.4973, "num_input_tokens_seen": 115343360, "step": 55 }, { "epoch": 0.3657142857142857, "grad_norm": 0.020537488162517548, "learning_rate": 3.5215289831955786e-05, "loss": 0.495, "num_input_tokens_seen": 117440512, "step": 56 }, { "epoch": 0.3722448979591837, "grad_norm": 0.021631518378853798, "learning_rate": 3.474464683231698e-05, "loss": 0.4971, "num_input_tokens_seen": 119537664, "step": 57 }, { "epoch": 0.3787755102040816, "grad_norm": 0.023077189922332764, "learning_rate": 3.426989547989902e-05, "loss": 0.5412, "num_input_tokens_seen": 121634816, "step": 58 }, { "epoch": 0.3853061224489796, "grad_norm": 0.02074201963841915, "learning_rate": 3.379123593034342e-05, "loss": 0.5219, "num_input_tokens_seen": 123731968, "step": 59 }, { "epoch": 0.39183673469387753, "grad_norm": 0.021061761304736137, "learning_rate": 3.330886998699149e-05, "loss": 0.4989, "num_input_tokens_seen": 125829120, "step": 60 }, { "epoch": 0.3983673469387755, "grad_norm": 0.021866271272301674, "learning_rate": 3.282300101580386e-05, "loss": 0.5122, "num_input_tokens_seen": 127926272, "step": 61 }, { "epoch": 0.4048979591836735, "grad_norm": 0.020894547924399376, "learning_rate": 3.2333833859621153e-05, "loss": 0.4987, "num_input_tokens_seen": 130023424, "step": 62 }, { "epoch": 0.4114285714285714, "grad_norm": 0.020912354812026024, "learning_rate": 3.1841574751802076e-05, "loss": 0.4968, "num_input_tokens_seen": 132120576, "step": 63 }, { "epoch": 0.4179591836734694, "grad_norm": 0.02170964516699314, "learning_rate": 3.13464312292752e-05, "loss": 0.499, "num_input_tokens_seen": 134217728, "step": 64 }, { "epoch": 0.42448979591836733, "grad_norm": 0.020900549367070198, "learning_rate": 3.084861204504122e-05, "loss": 0.5055, "num_input_tokens_seen": 136314880, "step": 65 }, { "epoch": 0.4310204081632653, "grad_norm": 0.020687857642769814, "learning_rate": 3.0348327080162435e-05, "loss": 0.5014, "num_input_tokens_seen": 138412032, "step": 66 }, { "epoch": 0.43755102040816324, "grad_norm": 0.02071143500506878, "learning_rate": 2.9845787255276753e-05, "loss": 0.4892, "num_input_tokens_seen": 140509184, "step": 67 }, { "epoch": 0.4440816326530612, "grad_norm": 0.021139299497008324, "learning_rate": 2.9341204441673266e-05, "loss": 0.5069, "num_input_tokens_seen": 142606336, "step": 68 }, { "epoch": 0.4506122448979592, "grad_norm": 0.020430322736501694, "learning_rate": 2.8834791371967142e-05, "loss": 0.4988, "num_input_tokens_seen": 144703488, "step": 69 }, { "epoch": 0.45714285714285713, "grad_norm": 0.020096473395824432, "learning_rate": 2.8326761550411345e-05, "loss": 0.4771, "num_input_tokens_seen": 146800640, "step": 70 }, { "epoch": 0.4636734693877551, "grad_norm": 0.020946258679032326, "learning_rate": 2.781732916288303e-05, "loss": 0.5036, "num_input_tokens_seen": 148897792, "step": 71 }, { "epoch": 0.47020408163265304, "grad_norm": 0.019651729613542557, "learning_rate": 2.7306708986582553e-05, "loss": 0.499, "num_input_tokens_seen": 150994944, "step": 72 }, { "epoch": 0.476734693877551, "grad_norm": 0.020175347104668617, "learning_rate": 2.679511629948319e-05, "loss": 0.5067, "num_input_tokens_seen": 153092096, "step": 73 }, { "epoch": 0.483265306122449, "grad_norm": 0.02037941850721836, "learning_rate": 2.628276678956974e-05, "loss": 0.5015, "num_input_tokens_seen": 155189248, "step": 74 }, { "epoch": 0.4897959183673469, "grad_norm": 0.020525282248854637, "learning_rate": 2.5769876463904265e-05, "loss": 0.504, "num_input_tokens_seen": 157286400, "step": 75 }, { "epoch": 0.4963265306122449, "grad_norm": 0.02100261114537716, "learning_rate": 2.5256661557557247e-05, "loss": 0.513, "num_input_tokens_seen": 159383552, "step": 76 }, { "epoch": 0.5028571428571429, "grad_norm": 0.02081277035176754, "learning_rate": 2.4743338442442755e-05, "loss": 0.5093, "num_input_tokens_seen": 161480704, "step": 77 }, { "epoch": 0.5093877551020408, "grad_norm": 0.01962732896208763, "learning_rate": 2.4230123536095748e-05, "loss": 0.475, "num_input_tokens_seen": 163577856, "step": 78 }, { "epoch": 0.5159183673469387, "grad_norm": 0.020476974546909332, "learning_rate": 2.3717233210430256e-05, "loss": 0.4827, "num_input_tokens_seen": 165675008, "step": 79 }, { "epoch": 0.5224489795918368, "grad_norm": 0.02092832699418068, "learning_rate": 2.3204883700516812e-05, "loss": 0.5234, "num_input_tokens_seen": 167772160, "step": 80 }, { "epoch": 0.5289795918367347, "grad_norm": 0.020366327837109566, "learning_rate": 2.2693291013417453e-05, "loss": 0.5204, "num_input_tokens_seen": 169869312, "step": 81 }, { "epoch": 0.5355102040816326, "grad_norm": 0.020487571135163307, "learning_rate": 2.2182670837116975e-05, "loss": 0.4886, "num_input_tokens_seen": 171966464, "step": 82 }, { "epoch": 0.5420408163265306, "grad_norm": 0.01977311633527279, "learning_rate": 2.1673238449588668e-05, "loss": 0.5005, "num_input_tokens_seen": 174063616, "step": 83 }, { "epoch": 0.5485714285714286, "grad_norm": 0.01979038491845131, "learning_rate": 2.116520862803286e-05, "loss": 0.4832, "num_input_tokens_seen": 176160768, "step": 84 }, { "epoch": 0.5551020408163265, "grad_norm": 0.020210135728120804, "learning_rate": 2.0658795558326743e-05, "loss": 0.5021, "num_input_tokens_seen": 178257920, "step": 85 }, { "epoch": 0.5616326530612245, "grad_norm": 0.020240608602762222, "learning_rate": 2.015421274472325e-05, "loss": 0.4839, "num_input_tokens_seen": 180355072, "step": 86 }, { "epoch": 0.5681632653061225, "grad_norm": 0.020403753966093063, "learning_rate": 1.965167291983757e-05, "loss": 0.5078, "num_input_tokens_seen": 182452224, "step": 87 }, { "epoch": 0.5746938775510204, "grad_norm": 0.020527003332972527, "learning_rate": 1.9151387954958794e-05, "loss": 0.4981, "num_input_tokens_seen": 184549376, "step": 88 }, { "epoch": 0.5812244897959183, "grad_norm": 0.02056964300572872, "learning_rate": 1.8653568770724806e-05, "loss": 0.4913, "num_input_tokens_seen": 186646528, "step": 89 }, { "epoch": 0.5877551020408164, "grad_norm": 0.019625499844551086, "learning_rate": 1.815842524819793e-05, "loss": 0.4862, "num_input_tokens_seen": 188743680, "step": 90 }, { "epoch": 0.5942857142857143, "grad_norm": 0.02040235698223114, "learning_rate": 1.7666166140378852e-05, "loss": 0.5146, "num_input_tokens_seen": 190840832, "step": 91 }, { "epoch": 0.6008163265306122, "grad_norm": 0.02010432258248329, "learning_rate": 1.7176998984196146e-05, "loss": 0.4952, "num_input_tokens_seen": 192937984, "step": 92 }, { "epoch": 0.6073469387755102, "grad_norm": 0.01999637298285961, "learning_rate": 1.6691130013008514e-05, "loss": 0.4845, "num_input_tokens_seen": 195035136, "step": 93 }, { "epoch": 0.6138775510204082, "grad_norm": 0.019539078697562218, "learning_rate": 1.620876406965658e-05, "loss": 0.4881, "num_input_tokens_seen": 197132288, "step": 94 }, { "epoch": 0.6204081632653061, "grad_norm": 0.020548058673739433, "learning_rate": 1.5730104520100982e-05, "loss": 0.498, "num_input_tokens_seen": 199229440, "step": 95 }, { "epoch": 0.626938775510204, "grad_norm": 0.020467426627874374, "learning_rate": 1.5255353167683017e-05, "loss": 0.4989, "num_input_tokens_seen": 201326592, "step": 96 }, { "epoch": 0.6334693877551021, "grad_norm": 0.019751951098442078, "learning_rate": 1.4784710168044213e-05, "loss": 0.4941, "num_input_tokens_seen": 203423744, "step": 97 }, { "epoch": 0.64, "grad_norm": 0.01974237710237503, "learning_rate": 1.4318373944740484e-05, "loss": 0.4919, "num_input_tokens_seen": 205520896, "step": 98 }, { "epoch": 0.6465306122448979, "grad_norm": 0.019516095519065857, "learning_rate": 1.3856541105586545e-05, "loss": 0.5047, "num_input_tokens_seen": 207618048, "step": 99 }, { "epoch": 0.6530612244897959, "grad_norm": 0.01965564861893654, "learning_rate": 1.339940635976592e-05, "loss": 0.5104, "num_input_tokens_seen": 209715200, "step": 100 }, { "epoch": 0.6595918367346939, "grad_norm": 0.019674964249134064, "learning_rate": 1.2947162435741278e-05, "loss": 0.5008, "num_input_tokens_seen": 211812352, "step": 101 }, { "epoch": 0.6661224489795918, "grad_norm": 0.019747601822018623, "learning_rate": 1.2500000000000006e-05, "loss": 0.5022, "num_input_tokens_seen": 213909504, "step": 102 }, { "epoch": 0.6726530612244898, "grad_norm": 0.0198560431599617, "learning_rate": 1.205810757666894e-05, "loss": 0.4979, "num_input_tokens_seen": 216006656, "step": 103 }, { "epoch": 0.6791836734693878, "grad_norm": 0.020007461309432983, "learning_rate": 1.1621671468032493e-05, "loss": 0.4972, "num_input_tokens_seen": 218103808, "step": 104 }, { "epoch": 0.6857142857142857, "grad_norm": 0.019895223900675774, "learning_rate": 1.1190875675987356e-05, "loss": 0.5058, "num_input_tokens_seen": 220200960, "step": 105 }, { "epoch": 0.6922448979591836, "grad_norm": 0.019599556922912598, "learning_rate": 1.0765901824467167e-05, "loss": 0.4935, "num_input_tokens_seen": 222298112, "step": 106 }, { "epoch": 0.6987755102040817, "grad_norm": 0.019807366654276848, "learning_rate": 1.0346929082869641e-05, "loss": 0.5041, "num_input_tokens_seen": 224395264, "step": 107 }, { "epoch": 0.7053061224489796, "grad_norm": 0.019651690497994423, "learning_rate": 9.934134090518593e-06, "loss": 0.5066, "num_input_tokens_seen": 226492416, "step": 108 }, { "epoch": 0.7118367346938775, "grad_norm": 0.020175212994217873, "learning_rate": 9.527690882192636e-06, "loss": 0.5181, "num_input_tokens_seen": 228589568, "step": 109 }, { "epoch": 0.7183673469387755, "grad_norm": 0.021520880982279778, "learning_rate": 9.127770814751933e-06, "loss": 0.4877, "num_input_tokens_seen": 230686720, "step": 110 }, { "epoch": 0.7248979591836735, "grad_norm": 0.02041519619524479, "learning_rate": 8.734542494893955e-06, "loss": 0.4886, "num_input_tokens_seen": 232783872, "step": 111 }, { "epoch": 0.7314285714285714, "grad_norm": 0.020032085478305817, "learning_rate": 8.348171708068747e-06, "loss": 0.4986, "num_input_tokens_seen": 234881024, "step": 112 }, { "epoch": 0.7379591836734694, "grad_norm": 0.01945379003882408, "learning_rate": 7.968821348583644e-06, "loss": 0.4892, "num_input_tokens_seen": 236978176, "step": 113 }, { "epoch": 0.7444897959183674, "grad_norm": 0.019653445109725, "learning_rate": 7.5966513509268365e-06, "loss": 0.5048, "num_input_tokens_seen": 239075328, "step": 114 }, { "epoch": 0.7510204081632653, "grad_norm": 0.020067734643816948, "learning_rate": 7.231818622338823e-06, "loss": 0.4907, "num_input_tokens_seen": 241172480, "step": 115 }, { "epoch": 0.7575510204081632, "grad_norm": 0.0194945577532053, "learning_rate": 6.8744769766601854e-06, "loss": 0.5074, "num_input_tokens_seen": 243269632, "step": 116 }, { "epoch": 0.7640816326530612, "grad_norm": 0.019672313705086708, "learning_rate": 6.524777069483526e-06, "loss": 0.4954, "num_input_tokens_seen": 245366784, "step": 117 }, { "epoch": 0.7706122448979592, "grad_norm": 0.019437000155448914, "learning_rate": 6.182866334636889e-06, "loss": 0.5036, "num_input_tokens_seen": 247463936, "step": 118 }, { "epoch": 0.7771428571428571, "grad_norm": 0.020035067573189735, "learning_rate": 5.848888922025553e-06, "loss": 0.5133, "num_input_tokens_seen": 249561088, "step": 119 }, { "epoch": 0.7836734693877551, "grad_norm": 0.019359605386853218, "learning_rate": 5.522985636858239e-06, "loss": 0.4965, "num_input_tokens_seen": 251658240, "step": 120 }, { "epoch": 0.7902040816326531, "grad_norm": 0.01978667639195919, "learning_rate": 5.205293880283552e-06, "loss": 0.5068, "num_input_tokens_seen": 253755392, "step": 121 }, { "epoch": 0.796734693877551, "grad_norm": 0.019405698403716087, "learning_rate": 4.8959475914614554e-06, "loss": 0.4993, "num_input_tokens_seen": 255852544, "step": 122 }, { "epoch": 0.803265306122449, "grad_norm": 0.01977471075952053, "learning_rate": 4.5950771910944605e-06, "loss": 0.5173, "num_input_tokens_seen": 257949696, "step": 123 }, { "epoch": 0.809795918367347, "grad_norm": 0.01949562318623066, "learning_rate": 4.3028095264420535e-06, "loss": 0.5032, "num_input_tokens_seen": 260046848, "step": 124 }, { "epoch": 0.8163265306122449, "grad_norm": 0.019873222336173058, "learning_rate": 4.019267817841835e-06, "loss": 0.505, "num_input_tokens_seen": 262144000, "step": 125 }, { "epoch": 0.8228571428571428, "grad_norm": 0.019852887839078903, "learning_rate": 3.7445716067596503e-06, "loss": 0.5145, "num_input_tokens_seen": 264241152, "step": 126 }, { "epoch": 0.8293877551020408, "grad_norm": 0.019747605547308922, "learning_rate": 3.478836705390809e-06, "loss": 0.4967, "num_input_tokens_seen": 266338304, "step": 127 }, { "epoch": 0.8359183673469388, "grad_norm": 0.019296282902359962, "learning_rate": 3.222175147833556e-06, "loss": 0.5001, "num_input_tokens_seen": 268435456, "step": 128 }, { "epoch": 0.8424489795918367, "grad_norm": 0.020198358222842216, "learning_rate": 2.974695142855388e-06, "loss": 0.5037, "num_input_tokens_seen": 270532608, "step": 129 }, { "epoch": 0.8489795918367347, "grad_norm": 0.020021170377731323, "learning_rate": 2.7365010282720952e-06, "loss": 0.5238, "num_input_tokens_seen": 272629760, "step": 130 }, { "epoch": 0.8555102040816327, "grad_norm": 0.019700711593031883, "learning_rate": 2.507693226958871e-06, "loss": 0.5007, "num_input_tokens_seen": 274726912, "step": 131 }, { "epoch": 0.8620408163265306, "grad_norm": 0.019315095618367195, "learning_rate": 2.2883682045119063e-06, "loss": 0.5119, "num_input_tokens_seen": 276824064, "step": 132 }, { "epoch": 0.8685714285714285, "grad_norm": 0.02047501504421234, "learning_rate": 2.0786184285784297e-06, "loss": 0.4874, "num_input_tokens_seen": 278921216, "step": 133 }, { "epoch": 0.8751020408163265, "grad_norm": 0.019602535292506218, "learning_rate": 1.8785323298722097e-06, "loss": 0.4969, "num_input_tokens_seen": 281018368, "step": 134 }, { "epoch": 0.8816326530612245, "grad_norm": 0.019447464495897293, "learning_rate": 1.6881942648911076e-06, "loss": 0.4835, "num_input_tokens_seen": 283115520, "step": 135 }, { "epoch": 0.8881632653061224, "grad_norm": 0.0193500854074955, "learning_rate": 1.5076844803522922e-06, "loss": 0.4863, "num_input_tokens_seen": 285212672, "step": 136 }, { "epoch": 0.8946938775510204, "grad_norm": 0.019578518345952034, "learning_rate": 1.3370790793601373e-06, "loss": 0.5046, "num_input_tokens_seen": 287309824, "step": 137 }, { "epoch": 0.9012244897959184, "grad_norm": 0.020458122715353966, "learning_rate": 1.1764499893210878e-06, "loss": 0.5018, "num_input_tokens_seen": 289406976, "step": 138 }, { "epoch": 0.9077551020408163, "grad_norm": 0.019856926053762436, "learning_rate": 1.0258649316189722e-06, "loss": 0.5025, "num_input_tokens_seen": 291504128, "step": 139 }, { "epoch": 0.9142857142857143, "grad_norm": 0.019992457702755928, "learning_rate": 8.85387393063622e-07, "loss": 0.5139, "num_input_tokens_seen": 293601280, "step": 140 }, { "epoch": 0.9208163265306123, "grad_norm": 0.019166303798556328, "learning_rate": 7.550765991247654e-07, "loss": 0.484, "num_input_tokens_seen": 295698432, "step": 141 }, { "epoch": 0.9273469387755102, "grad_norm": 0.019759617745876312, "learning_rate": 6.349874889624962e-07, "loss": 0.5127, "num_input_tokens_seen": 297795584, "step": 142 }, { "epoch": 0.9338775510204081, "grad_norm": 0.019604109227657318, "learning_rate": 5.25170692264887e-07, "loss": 0.4876, "num_input_tokens_seen": 299892736, "step": 143 }, { "epoch": 0.9404081632653061, "grad_norm": 0.019114624708890915, "learning_rate": 4.256725079024554e-07, "loss": 0.4925, "num_input_tokens_seen": 301989888, "step": 144 }, { "epoch": 0.9469387755102041, "grad_norm": 0.019426511600613594, "learning_rate": 3.3653488440851255e-07, "loss": 0.4861, "num_input_tokens_seen": 304087040, "step": 145 }, { "epoch": 0.953469387755102, "grad_norm": 0.019297398626804352, "learning_rate": 2.5779540229361745e-07, "loss": 0.493, "num_input_tokens_seen": 306184192, "step": 146 }, { "epoch": 0.96, "grad_norm": 0.01964535564184189, "learning_rate": 1.8948725820160662e-07, "loss": 0.5109, "num_input_tokens_seen": 308281344, "step": 147 }, { "epoch": 0.966530612244898, "grad_norm": 0.019757628440856934, "learning_rate": 1.3163925091384533e-07, "loss": 0.5182, "num_input_tokens_seen": 310378496, "step": 148 }, { "epoch": 0.9730612244897959, "grad_norm": 0.019987070932984352, "learning_rate": 8.427576920763958e-08, "loss": 0.4902, "num_input_tokens_seen": 312475648, "step": 149 }, { "epoch": 0.9795918367346939, "grad_norm": 0.020403273403644562, "learning_rate": 4.741678157389739e-08, "loss": 0.5019, "num_input_tokens_seen": 314572800, "step": 150 }, { "epoch": 0.9861224489795918, "grad_norm": 0.0199234988540411, "learning_rate": 2.1077827798404726e-08, "loss": 0.5081, "num_input_tokens_seen": 316669952, "step": 151 }, { "epoch": 0.9926530612244898, "grad_norm": 0.019740723073482513, "learning_rate": 5.270012410216185e-09, "loss": 0.4872, "num_input_tokens_seen": 318767104, "step": 152 }, { "epoch": 0.9991836734693877, "grad_norm": 0.01960798352956772, "learning_rate": 0.0, "loss": 0.4965, "num_input_tokens_seen": 320864256, "step": 153 }, { "epoch": 0.9991836734693877, "num_input_tokens_seen": 320864256, "step": 153, "total_flos": 1.2496254746971079e+19, "train_loss": 0.5089508540490094, "train_runtime": 7110.2315, "train_samples_per_second": 11.018, "train_steps_per_second": 0.022 } ], "logging_steps": 1.0, "max_steps": 153, "num_input_tokens_seen": 320864256, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2496254746971079e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }