{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9955869373345102, "eval_steps": 500, "global_step": 141, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00706090026478376, "grad_norm": 0.663875937461853, "learning_rate": 4.999379483168148e-05, "loss": 0.7689, "num_input_tokens_seen": 2097152, "step": 1 }, { "epoch": 0.01412180052956752, "grad_norm": 0.6330733895301819, "learning_rate": 4.997518240705502e-05, "loss": 0.7566, "num_input_tokens_seen": 4194304, "step": 2 }, { "epoch": 0.02118270079435128, "grad_norm": 0.5803675651550293, "learning_rate": 4.9944171965578836e-05, "loss": 0.7349, "num_input_tokens_seen": 6291456, "step": 3 }, { "epoch": 0.02824360105913504, "grad_norm": 0.5279229879379272, "learning_rate": 4.9900778901253635e-05, "loss": 0.7034, "num_input_tokens_seen": 8388608, "step": 4 }, { "epoch": 0.0353045013239188, "grad_norm": 0.5274126529693604, "learning_rate": 4.9845024754980876e-05, "loss": 0.7054, "num_input_tokens_seen": 10485760, "step": 5 }, { "epoch": 0.04236540158870256, "grad_norm": 0.4697350859642029, "learning_rate": 4.97769372038695e-05, "loss": 0.6654, "num_input_tokens_seen": 12582912, "step": 6 }, { "epoch": 0.04942630185348632, "grad_norm": 0.4090496301651001, "learning_rate": 4.969655004749674e-05, "loss": 0.657, "num_input_tokens_seen": 14680064, "step": 7 }, { "epoch": 0.05648720211827008, "grad_norm": 0.230828195810318, "learning_rate": 4.960390319112945e-05, "loss": 0.6431, "num_input_tokens_seen": 16777216, "step": 8 }, { "epoch": 0.06354810238305383, "grad_norm": 0.13997790217399597, "learning_rate": 4.9499042625914674e-05, "loss": 0.6083, "num_input_tokens_seen": 18874368, "step": 9 }, { "epoch": 0.0706090026478376, "grad_norm": 0.10757266730070114, "learning_rate": 4.938202040604898e-05, "loss": 0.6054, "num_input_tokens_seen": 20971520, "step": 10 }, { "epoch": 0.07766990291262135, "grad_norm": 0.0809173658490181, "learning_rate": 4.925289462293807e-05, "loss": 0.6115, "num_input_tokens_seen": 23068672, "step": 11 }, { "epoch": 0.08473080317740513, "grad_norm": 0.0680922418832779, "learning_rate": 4.911172937635942e-05, "loss": 0.6222, "num_input_tokens_seen": 25165824, "step": 12 }, { "epoch": 0.09179170344218888, "grad_norm": 0.05811101943254471, "learning_rate": 4.895859474264229e-05, "loss": 0.5964, "num_input_tokens_seen": 27262976, "step": 13 }, { "epoch": 0.09885260370697264, "grad_norm": 0.05454862490296364, "learning_rate": 4.8793566739880894e-05, "loss": 0.5953, "num_input_tokens_seen": 29360128, "step": 14 }, { "epoch": 0.1059135039717564, "grad_norm": 0.04921136051416397, "learning_rate": 4.861672729019797e-05, "loss": 0.6082, "num_input_tokens_seen": 31457280, "step": 15 }, { "epoch": 0.11297440423654016, "grad_norm": 0.042110029608011246, "learning_rate": 4.842816417907759e-05, "loss": 0.5984, "num_input_tokens_seen": 33554432, "step": 16 }, { "epoch": 0.12003530450132392, "grad_norm": 0.040638867765665054, "learning_rate": 4.8227971011787196e-05, "loss": 0.6054, "num_input_tokens_seen": 35651584, "step": 17 }, { "epoch": 0.12709620476610767, "grad_norm": 0.03651794046163559, "learning_rate": 4.801624716691072e-05, "loss": 0.6196, "num_input_tokens_seen": 37748736, "step": 18 }, { "epoch": 0.13415710503089143, "grad_norm": 0.03351139277219772, "learning_rate": 4.779309774701574e-05, "loss": 0.5951, "num_input_tokens_seen": 39845888, "step": 19 }, { "epoch": 0.1412180052956752, "grad_norm": 0.033977996557950974, "learning_rate": 4.755863352647909e-05, "loss": 0.5973, "num_input_tokens_seen": 41943040, "step": 20 }, { "epoch": 0.14827890556045895, "grad_norm": 0.0329577811062336, "learning_rate": 4.731297089649703e-05, "loss": 0.6119, "num_input_tokens_seen": 44040192, "step": 21 }, { "epoch": 0.1553398058252427, "grad_norm": 0.030406907200813293, "learning_rate": 4.705623180730705e-05, "loss": 0.5967, "num_input_tokens_seen": 46137344, "step": 22 }, { "epoch": 0.1624007060900265, "grad_norm": 0.030717285349965096, "learning_rate": 4.6788543707650124e-05, "loss": 0.6038, "num_input_tokens_seen": 48234496, "step": 23 }, { "epoch": 0.16946160635481025, "grad_norm": 0.028465356677770615, "learning_rate": 4.651003948150349e-05, "loss": 0.6086, "num_input_tokens_seen": 50331648, "step": 24 }, { "epoch": 0.176522506619594, "grad_norm": 0.029263464733958244, "learning_rate": 4.622085738211518e-05, "loss": 0.6128, "num_input_tokens_seen": 52428800, "step": 25 }, { "epoch": 0.18358340688437777, "grad_norm": 0.027932966127991676, "learning_rate": 4.5921140963373335e-05, "loss": 0.6023, "num_input_tokens_seen": 54525952, "step": 26 }, { "epoch": 0.19064430714916153, "grad_norm": 0.02711024135351181, "learning_rate": 4.561103900854401e-05, "loss": 0.5856, "num_input_tokens_seen": 56623104, "step": 27 }, { "epoch": 0.1977052074139453, "grad_norm": 0.027511639520525932, "learning_rate": 4.529070545641328e-05, "loss": 0.5871, "num_input_tokens_seen": 58720256, "step": 28 }, { "epoch": 0.20476610767872905, "grad_norm": 0.026679888367652893, "learning_rate": 4.496029932486986e-05, "loss": 0.5826, "num_input_tokens_seen": 60817408, "step": 29 }, { "epoch": 0.2118270079435128, "grad_norm": 0.026770930737257004, "learning_rate": 4.4619984631966524e-05, "loss": 0.5782, "num_input_tokens_seen": 62914560, "step": 30 }, { "epoch": 0.21888790820829657, "grad_norm": 0.026902856305241585, "learning_rate": 4.426993031449934e-05, "loss": 0.5861, "num_input_tokens_seen": 65011712, "step": 31 }, { "epoch": 0.22594880847308033, "grad_norm": 0.02593802474439144, "learning_rate": 4.391031014414514e-05, "loss": 0.5944, "num_input_tokens_seen": 67108864, "step": 32 }, { "epoch": 0.23300970873786409, "grad_norm": 0.02516627125442028, "learning_rate": 4.354130264119894e-05, "loss": 0.5921, "num_input_tokens_seen": 69206016, "step": 33 }, { "epoch": 0.24007060900264784, "grad_norm": 0.024535510689020157, "learning_rate": 4.316309098595408e-05, "loss": 0.5978, "num_input_tokens_seen": 71303168, "step": 34 }, { "epoch": 0.2471315092674316, "grad_norm": 0.026559777557849884, "learning_rate": 4.2775862927769025e-05, "loss": 0.6108, "num_input_tokens_seen": 73400320, "step": 35 }, { "epoch": 0.25419240953221534, "grad_norm": 0.024654753506183624, "learning_rate": 4.2379810691866064e-05, "loss": 0.5834, "num_input_tokens_seen": 75497472, "step": 36 }, { "epoch": 0.2612533097969991, "grad_norm": 0.025327278301119804, "learning_rate": 4.197513088390813e-05, "loss": 0.6124, "num_input_tokens_seen": 77594624, "step": 37 }, { "epoch": 0.26831421006178285, "grad_norm": 0.025609686970710754, "learning_rate": 4.156202439240111e-05, "loss": 0.6011, "num_input_tokens_seen": 79691776, "step": 38 }, { "epoch": 0.27537511032656664, "grad_norm": 0.025090716779232025, "learning_rate": 4.114069628897006e-05, "loss": 0.5898, "num_input_tokens_seen": 81788928, "step": 39 }, { "epoch": 0.2824360105913504, "grad_norm": 0.026486800983548164, "learning_rate": 4.071135572655892e-05, "loss": 0.5934, "num_input_tokens_seen": 83886080, "step": 40 }, { "epoch": 0.28949691085613416, "grad_norm": 0.023945607244968414, "learning_rate": 4.027421583560414e-05, "loss": 0.6118, "num_input_tokens_seen": 85983232, "step": 41 }, { "epoch": 0.2965578111209179, "grad_norm": 0.023670075461268425, "learning_rate": 3.982949361823388e-05, "loss": 0.5913, "num_input_tokens_seen": 88080384, "step": 42 }, { "epoch": 0.3036187113857017, "grad_norm": 0.02379385009407997, "learning_rate": 3.937740984054526e-05, "loss": 0.5849, "num_input_tokens_seen": 90177536, "step": 43 }, { "epoch": 0.3106796116504854, "grad_norm": 0.02416352741420269, "learning_rate": 3.8918188923013046e-05, "loss": 0.5883, "num_input_tokens_seen": 92274688, "step": 44 }, { "epoch": 0.3177405119152692, "grad_norm": 0.024241218343377113, "learning_rate": 3.845205882908432e-05, "loss": 0.5985, "num_input_tokens_seen": 94371840, "step": 45 }, { "epoch": 0.324801412180053, "grad_norm": 0.02356942743062973, "learning_rate": 3.797925095201438e-05, "loss": 0.5949, "num_input_tokens_seen": 96468992, "step": 46 }, { "epoch": 0.3318623124448367, "grad_norm": 0.022860877215862274, "learning_rate": 3.7500000000000003e-05, "loss": 0.5653, "num_input_tokens_seen": 98566144, "step": 47 }, { "epoch": 0.3389232127096205, "grad_norm": 0.023545918986201286, "learning_rate": 3.7014543879667094e-05, "loss": 0.6064, "num_input_tokens_seen": 100663296, "step": 48 }, { "epoch": 0.34598411297440423, "grad_norm": 0.02262749709188938, "learning_rate": 3.6523123577970694e-05, "loss": 0.5822, "num_input_tokens_seen": 102760448, "step": 49 }, { "epoch": 0.353045013239188, "grad_norm": 0.023115364834666252, "learning_rate": 3.6025983042565795e-05, "loss": 0.6011, "num_input_tokens_seen": 104857600, "step": 50 }, { "epoch": 0.36010591350397175, "grad_norm": 0.02323303557932377, "learning_rate": 3.552336906070838e-05, "loss": 0.5881, "num_input_tokens_seen": 106954752, "step": 51 }, { "epoch": 0.36716681376875554, "grad_norm": 0.022982951253652573, "learning_rate": 3.501553113674699e-05, "loss": 0.5804, "num_input_tokens_seen": 109051904, "step": 52 }, { "epoch": 0.37422771403353927, "grad_norm": 0.023010022938251495, "learning_rate": 3.450272136826537e-05, "loss": 0.5919, "num_input_tokens_seen": 111149056, "step": 53 }, { "epoch": 0.38128861429832306, "grad_norm": 0.02248605340719223, "learning_rate": 3.398519432093782e-05, "loss": 0.6077, "num_input_tokens_seen": 113246208, "step": 54 }, { "epoch": 0.3883495145631068, "grad_norm": 0.021844904869794846, "learning_rate": 3.3463206902159395e-05, "loss": 0.5748, "num_input_tokens_seen": 115343360, "step": 55 }, { "epoch": 0.3954104148278906, "grad_norm": 0.022307131439447403, "learning_rate": 3.293701823351357e-05, "loss": 0.6056, "num_input_tokens_seen": 117440512, "step": 56 }, { "epoch": 0.4024713150926743, "grad_norm": 0.022010240703821182, "learning_rate": 3.2406889522140856e-05, "loss": 0.5997, "num_input_tokens_seen": 119537664, "step": 57 }, { "epoch": 0.4095322153574581, "grad_norm": 0.0223658736795187, "learning_rate": 3.187308393107201e-05, "loss": 0.5826, "num_input_tokens_seen": 121634816, "step": 58 }, { "epoch": 0.4165931156222418, "grad_norm": 0.022175313904881477, "learning_rate": 3.13358664485904e-05, "loss": 0.5927, "num_input_tokens_seen": 123731968, "step": 59 }, { "epoch": 0.4236540158870256, "grad_norm": 0.022254353389143944, "learning_rate": 3.079550375668821e-05, "loss": 0.5962, "num_input_tokens_seen": 125829120, "step": 60 }, { "epoch": 0.43071491615180935, "grad_norm": 0.022930629551410675, "learning_rate": 3.0252264098681947e-05, "loss": 0.5808, "num_input_tokens_seen": 127926272, "step": 61 }, { "epoch": 0.43777581641659313, "grad_norm": 0.022809607908129692, "learning_rate": 2.9706417146052838e-05, "loss": 0.5962, "num_input_tokens_seen": 130023424, "step": 62 }, { "epoch": 0.44483671668137686, "grad_norm": 0.02221154049038887, "learning_rate": 2.9158233864578254e-05, "loss": 0.5927, "num_input_tokens_seen": 132120576, "step": 63 }, { "epoch": 0.45189761694616065, "grad_norm": 0.022515252232551575, "learning_rate": 2.8607986379820666e-05, "loss": 0.5744, "num_input_tokens_seen": 134217728, "step": 64 }, { "epoch": 0.4589585172109444, "grad_norm": 0.022104725241661072, "learning_rate": 2.8055947842040862e-05, "loss": 0.5835, "num_input_tokens_seen": 136314880, "step": 65 }, { "epoch": 0.46601941747572817, "grad_norm": 0.02250700816512108, "learning_rate": 2.7502392290602463e-05, "loss": 0.5763, "num_input_tokens_seen": 138412032, "step": 66 }, { "epoch": 0.4730803177405119, "grad_norm": 0.021792296320199966, "learning_rate": 2.6947594517935083e-05, "loss": 0.5854, "num_input_tokens_seen": 140509184, "step": 67 }, { "epoch": 0.4801412180052957, "grad_norm": 0.021566050127148628, "learning_rate": 2.6391829933123712e-05, "loss": 0.5715, "num_input_tokens_seen": 142606336, "step": 68 }, { "epoch": 0.4872021182700794, "grad_norm": 0.022085102275013924, "learning_rate": 2.5835374425191866e-05, "loss": 0.5846, "num_input_tokens_seen": 144703488, "step": 69 }, { "epoch": 0.4942630185348632, "grad_norm": 0.02136918157339096, "learning_rate": 2.5278504226146636e-05, "loss": 0.5547, "num_input_tokens_seen": 146800640, "step": 70 }, { "epoch": 0.501323918799647, "grad_norm": 0.02222236432135105, "learning_rate": 2.4721495773853366e-05, "loss": 0.6042, "num_input_tokens_seen": 148897792, "step": 71 }, { "epoch": 0.5083848190644307, "grad_norm": 0.022352294996380806, "learning_rate": 2.4164625574808146e-05, "loss": 0.5754, "num_input_tokens_seen": 150994944, "step": 72 }, { "epoch": 0.5154457193292145, "grad_norm": 0.022009560838341713, "learning_rate": 2.36081700668763e-05, "loss": 0.5847, "num_input_tokens_seen": 153092096, "step": 73 }, { "epoch": 0.5225066195939982, "grad_norm": 0.020856056362390518, "learning_rate": 2.305240548206492e-05, "loss": 0.5949, "num_input_tokens_seen": 155189248, "step": 74 }, { "epoch": 0.529567519858782, "grad_norm": 0.02195914275944233, "learning_rate": 2.2497607709397543e-05, "loss": 0.6038, "num_input_tokens_seen": 157286400, "step": 75 }, { "epoch": 0.5366284201235657, "grad_norm": 0.021507592871785164, "learning_rate": 2.1944052157959143e-05, "loss": 0.5779, "num_input_tokens_seen": 159383552, "step": 76 }, { "epoch": 0.5436893203883495, "grad_norm": 0.022091282531619072, "learning_rate": 2.1392013620179337e-05, "loss": 0.601, "num_input_tokens_seen": 161480704, "step": 77 }, { "epoch": 0.5507502206531333, "grad_norm": 0.022038882598280907, "learning_rate": 2.0841766135421752e-05, "loss": 0.5665, "num_input_tokens_seen": 163577856, "step": 78 }, { "epoch": 0.5578111209179171, "grad_norm": 0.02158939093351364, "learning_rate": 2.0293582853947164e-05, "loss": 0.5928, "num_input_tokens_seen": 165675008, "step": 79 }, { "epoch": 0.5648720211827007, "grad_norm": 0.02173718996345997, "learning_rate": 1.974773590131805e-05, "loss": 0.579, "num_input_tokens_seen": 167772160, "step": 80 }, { "epoch": 0.5719329214474845, "grad_norm": 0.020998917520046234, "learning_rate": 1.920449624331179e-05, "loss": 0.5939, "num_input_tokens_seen": 169869312, "step": 81 }, { "epoch": 0.5789938217122683, "grad_norm": 0.02156190574169159, "learning_rate": 1.8664133551409612e-05, "loss": 0.6001, "num_input_tokens_seen": 171966464, "step": 82 }, { "epoch": 0.5860547219770521, "grad_norm": 0.02090943045914173, "learning_rate": 1.8126916068928e-05, "loss": 0.5604, "num_input_tokens_seen": 174063616, "step": 83 }, { "epoch": 0.5931156222418358, "grad_norm": 0.021919352933764458, "learning_rate": 1.7593110477859153e-05, "loss": 0.5929, "num_input_tokens_seen": 176160768, "step": 84 }, { "epoch": 0.6001765225066196, "grad_norm": 0.021490445360541344, "learning_rate": 1.7062981766486437e-05, "loss": 0.5945, "num_input_tokens_seen": 178257920, "step": 85 }, { "epoch": 0.6072374227714034, "grad_norm": 0.021763818338513374, "learning_rate": 1.6536793097840615e-05, "loss": 0.5829, "num_input_tokens_seen": 180355072, "step": 86 }, { "epoch": 0.6142983230361871, "grad_norm": 0.022147417068481445, "learning_rate": 1.6014805679062185e-05, "loss": 0.5844, "num_input_tokens_seen": 182452224, "step": 87 }, { "epoch": 0.6213592233009708, "grad_norm": 0.020599111914634705, "learning_rate": 1.5497278631734632e-05, "loss": 0.5519, "num_input_tokens_seen": 184549376, "step": 88 }, { "epoch": 0.6284201235657546, "grad_norm": 0.021077901124954224, "learning_rate": 1.498446886325301e-05, "loss": 0.5874, "num_input_tokens_seen": 186646528, "step": 89 }, { "epoch": 0.6354810238305384, "grad_norm": 0.02132106013596058, "learning_rate": 1.447663093929163e-05, "loss": 0.573, "num_input_tokens_seen": 188743680, "step": 90 }, { "epoch": 0.6425419240953222, "grad_norm": 0.021409865468740463, "learning_rate": 1.3974016957434208e-05, "loss": 0.5745, "num_input_tokens_seen": 190840832, "step": 91 }, { "epoch": 0.649602824360106, "grad_norm": 0.021222982555627823, "learning_rate": 1.34768764220293e-05, "loss": 0.5904, "num_input_tokens_seen": 192937984, "step": 92 }, { "epoch": 0.6566637246248896, "grad_norm": 0.020569872111082077, "learning_rate": 1.2985456120332906e-05, "loss": 0.5794, "num_input_tokens_seen": 195035136, "step": 93 }, { "epoch": 0.6637246248896734, "grad_norm": 0.020798902958631516, "learning_rate": 1.2500000000000006e-05, "loss": 0.5734, "num_input_tokens_seen": 197132288, "step": 94 }, { "epoch": 0.6707855251544572, "grad_norm": 0.021490171551704407, "learning_rate": 1.2020749047985627e-05, "loss": 0.5777, "num_input_tokens_seen": 199229440, "step": 95 }, { "epoch": 0.677846425419241, "grad_norm": 0.021630477160215378, "learning_rate": 1.1547941170915686e-05, "loss": 0.5545, "num_input_tokens_seen": 201326592, "step": 96 }, { "epoch": 0.6849073256840247, "grad_norm": 0.021329065784811974, "learning_rate": 1.1081811076986965e-05, "loss": 0.5857, "num_input_tokens_seen": 203423744, "step": 97 }, { "epoch": 0.6919682259488085, "grad_norm": 0.021425435319542885, "learning_rate": 1.062259015945474e-05, "loss": 0.5558, "num_input_tokens_seen": 205520896, "step": 98 }, { "epoch": 0.6990291262135923, "grad_norm": 0.021290112286806107, "learning_rate": 1.0170506381766121e-05, "loss": 0.6004, "num_input_tokens_seen": 207618048, "step": 99 }, { "epoch": 0.706090026478376, "grad_norm": 0.02233896404504776, "learning_rate": 9.72578416439587e-06, "loss": 0.595, "num_input_tokens_seen": 209715200, "step": 100 }, { "epoch": 0.7131509267431597, "grad_norm": 0.021383240818977356, "learning_rate": 9.288644273441083e-06, "loss": 0.592, "num_input_tokens_seen": 211812352, "step": 101 }, { "epoch": 0.7202118270079435, "grad_norm": 0.02151460386812687, "learning_rate": 8.85930371102994e-06, "loss": 0.5802, "num_input_tokens_seen": 213909504, "step": 102 }, { "epoch": 0.7272727272727273, "grad_norm": 0.02100866287946701, "learning_rate": 8.43797560759889e-06, "loss": 0.5622, "num_input_tokens_seen": 216006656, "step": 103 }, { "epoch": 0.7343336275375111, "grad_norm": 0.020722227171063423, "learning_rate": 8.02486911609188e-06, "loss": 0.5596, "num_input_tokens_seen": 218103808, "step": 104 }, { "epoch": 0.7413945278022948, "grad_norm": 0.021745288744568825, "learning_rate": 7.620189308133943e-06, "loss": 0.5861, "num_input_tokens_seen": 220200960, "step": 105 }, { "epoch": 0.7484554280670785, "grad_norm": 0.021946506574749947, "learning_rate": 7.224137072230982e-06, "loss": 0.607, "num_input_tokens_seen": 222298112, "step": 106 }, { "epoch": 0.7555163283318623, "grad_norm": 0.02115868404507637, "learning_rate": 6.836909014045925e-06, "loss": 0.5812, "num_input_tokens_seen": 224395264, "step": 107 }, { "epoch": 0.7625772285966461, "grad_norm": 0.02150718681514263, "learning_rate": 6.458697358801061e-06, "loss": 0.5749, "num_input_tokens_seen": 226492416, "step": 108 }, { "epoch": 0.7696381288614298, "grad_norm": 0.02062981016933918, "learning_rate": 6.0896898558548685e-06, "loss": 0.5641, "num_input_tokens_seen": 228589568, "step": 109 }, { "epoch": 0.7766990291262136, "grad_norm": 0.021470438688993454, "learning_rate": 5.730069685500669e-06, "loss": 0.5815, "num_input_tokens_seen": 230686720, "step": 110 }, { "epoch": 0.7837599293909974, "grad_norm": 0.020590469241142273, "learning_rate": 5.380015368033476e-06, "loss": 0.5747, "num_input_tokens_seen": 232783872, "step": 111 }, { "epoch": 0.7908208296557812, "grad_norm": 0.021280275657773018, "learning_rate": 5.0397006751301435e-06, "loss": 0.5966, "num_input_tokens_seen": 234881024, "step": 112 }, { "epoch": 0.7978817299205648, "grad_norm": 0.02132592350244522, "learning_rate": 4.70929454358672e-06, "loss": 0.579, "num_input_tokens_seen": 236978176, "step": 113 }, { "epoch": 0.8049426301853486, "grad_norm": 0.021257393062114716, "learning_rate": 4.388960991455998e-06, "loss": 0.5965, "num_input_tokens_seen": 239075328, "step": 114 }, { "epoch": 0.8120035304501324, "grad_norm": 0.020477108657360077, "learning_rate": 4.078859036626676e-06, "loss": 0.5736, "num_input_tokens_seen": 241172480, "step": 115 }, { "epoch": 0.8190644307149162, "grad_norm": 0.021446263417601585, "learning_rate": 3.779142617884823e-06, "loss": 0.5848, "num_input_tokens_seen": 243269632, "step": 116 }, { "epoch": 0.8261253309796999, "grad_norm": 0.0212736614048481, "learning_rate": 3.489960518496521e-06, "loss": 0.5726, "num_input_tokens_seen": 245366784, "step": 117 }, { "epoch": 0.8331862312444837, "grad_norm": 0.021760208532214165, "learning_rate": 3.2114562923498766e-06, "loss": 0.6091, "num_input_tokens_seen": 247463936, "step": 118 }, { "epoch": 0.8402471315092674, "grad_norm": 0.02195391058921814, "learning_rate": 2.9437681926929584e-06, "loss": 0.5844, "num_input_tokens_seen": 249561088, "step": 119 }, { "epoch": 0.8473080317740512, "grad_norm": 0.020647749304771423, "learning_rate": 2.687029103502972e-06, "loss": 0.5592, "num_input_tokens_seen": 251658240, "step": 120 }, { "epoch": 0.8543689320388349, "grad_norm": 0.02081936039030552, "learning_rate": 2.441366473520909e-06, "loss": 0.582, "num_input_tokens_seen": 253755392, "step": 121 }, { "epoch": 0.8614298323036187, "grad_norm": 0.020950596779584885, "learning_rate": 2.2069022529842664e-06, "loss": 0.5764, "num_input_tokens_seen": 255852544, "step": 122 }, { "epoch": 0.8684907325684025, "grad_norm": 0.02197173982858658, "learning_rate": 1.983752833089278e-06, "loss": 0.589, "num_input_tokens_seen": 257949696, "step": 123 }, { "epoch": 0.8755516328331863, "grad_norm": 0.022093698382377625, "learning_rate": 1.7720289882128095e-06, "loss": 0.5875, "num_input_tokens_seen": 260046848, "step": 124 }, { "epoch": 0.8826125330979699, "grad_norm": 0.021470658481121063, "learning_rate": 1.5718358209224153e-06, "loss": 0.5902, "num_input_tokens_seen": 262144000, "step": 125 }, { "epoch": 0.8896734333627537, "grad_norm": 0.022243961691856384, "learning_rate": 1.3832727098020332e-06, "loss": 0.572, "num_input_tokens_seen": 264241152, "step": 126 }, { "epoch": 0.8967343336275375, "grad_norm": 0.020884385332465172, "learning_rate": 1.2064332601191163e-06, "loss": 0.5709, "num_input_tokens_seen": 266338304, "step": 127 }, { "epoch": 0.9037952338923213, "grad_norm": 0.0208596121519804, "learning_rate": 1.0414052573577137e-06, "loss": 0.579, "num_input_tokens_seen": 268435456, "step": 128 }, { "epoch": 0.910856134157105, "grad_norm": 0.020583724603056908, "learning_rate": 8.882706236405886e-07, "loss": 0.5761, "num_input_tokens_seen": 270532608, "step": 129 }, { "epoch": 0.9179170344218888, "grad_norm": 0.020887693390250206, "learning_rate": 7.471053770619352e-07, "loss": 0.5812, "num_input_tokens_seen": 272629760, "step": 130 }, { "epoch": 0.9249779346866726, "grad_norm": 0.020977560430765152, "learning_rate": 6.179795939510263e-07, "loss": 0.5849, "num_input_tokens_seen": 274726912, "step": 131 }, { "epoch": 0.9320388349514563, "grad_norm": 0.021589655429124832, "learning_rate": 5.009573740853313e-07, "loss": 0.5783, "num_input_tokens_seen": 276824064, "step": 132 }, { "epoch": 0.93909973521624, "grad_norm": 0.021140016615390778, "learning_rate": 3.9609680887055243e-07, "loss": 0.5745, "num_input_tokens_seen": 278921216, "step": 133 }, { "epoch": 0.9461606354810238, "grad_norm": 0.020653944462537766, "learning_rate": 3.034499525032625e-07, "loss": 0.5815, "num_input_tokens_seen": 281018368, "step": 134 }, { "epoch": 0.9532215357458076, "grad_norm": 0.02038555219769478, "learning_rate": 2.230627961304993e-07, "loss": 0.5651, "num_input_tokens_seen": 283115520, "step": 135 }, { "epoch": 0.9602824360105914, "grad_norm": 0.020857810974121094, "learning_rate": 1.5497524501913163e-07, "loss": 0.5691, "num_input_tokens_seen": 285212672, "step": 136 }, { "epoch": 0.9673433362753752, "grad_norm": 0.022090895101428032, "learning_rate": 9.922109874636876e-08, "loss": 0.583, "num_input_tokens_seen": 287309824, "step": 137 }, { "epoch": 0.9744042365401588, "grad_norm": 0.019660325720906258, "learning_rate": 5.5828034421170907e-08, "loss": 0.5648, "num_input_tokens_seen": 289406976, "step": 138 }, { "epoch": 0.9814651368049426, "grad_norm": 0.021183328703045845, "learning_rate": 2.481759294498398e-08, "loss": 0.5765, "num_input_tokens_seen": 291504128, "step": 139 }, { "epoch": 0.9885260370697264, "grad_norm": 0.021396314725279808, "learning_rate": 6.205168318523802e-09, "loss": 0.5787, "num_input_tokens_seen": 293601280, "step": 140 }, { "epoch": 0.9955869373345102, "grad_norm": 0.020481785759329796, "learning_rate": 0.0, "loss": 0.5791, "num_input_tokens_seen": 295698432, "step": 141 }, { "epoch": 0.9955869373345102, "num_input_tokens_seen": 295698432, "step": 141, "total_flos": 1.1516156335443935e+19, "train_loss": 0.5937648889020826, "train_runtime": 6561.4436, "train_samples_per_second": 11.048, "train_steps_per_second": 0.021 } ], "logging_steps": 1.0, "max_steps": 141, "num_input_tokens_seen": 295698432, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1516156335443935e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }