{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 40, "global_step": 157, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006369426751592357, "grad_norm": 0.1806156039237976, "learning_rate": 1e-05, "loss": 1.3031, "step": 1 }, { "epoch": 0.006369426751592357, "eval_loss": 1.5003942251205444, "eval_runtime": 19.6641, "eval_samples_per_second": 55.533, "eval_steps_per_second": 0.915, "step": 1 }, { "epoch": 0.012738853503184714, "grad_norm": 0.1688886284828186, "learning_rate": 2e-05, "loss": 1.3305, "step": 2 }, { "epoch": 0.01910828025477707, "grad_norm": 0.20123907923698425, "learning_rate": 3e-05, "loss": 1.324, "step": 3 }, { "epoch": 0.025477707006369428, "grad_norm": 0.18879620730876923, "learning_rate": 4e-05, "loss": 1.3638, "step": 4 }, { "epoch": 0.03184713375796178, "grad_norm": 0.20348915457725525, "learning_rate": 5e-05, "loss": 1.3686, "step": 5 }, { "epoch": 0.03821656050955414, "grad_norm": 0.212239071726799, "learning_rate": 6e-05, "loss": 1.2865, "step": 6 }, { "epoch": 0.044585987261146494, "grad_norm": 0.19280897080898285, "learning_rate": 7e-05, "loss": 1.313, "step": 7 }, { "epoch": 0.050955414012738856, "grad_norm": 0.1767151653766632, "learning_rate": 8e-05, "loss": 1.3207, "step": 8 }, { "epoch": 0.05732484076433121, "grad_norm": 0.20014327764511108, "learning_rate": 9e-05, "loss": 1.3143, "step": 9 }, { "epoch": 0.06369426751592357, "grad_norm": 0.18035855889320374, "learning_rate": 0.0001, "loss": 1.252, "step": 10 }, { "epoch": 0.07006369426751592, "grad_norm": 0.19993054866790771, "learning_rate": 0.00011000000000000002, "loss": 1.302, "step": 11 }, { "epoch": 0.07643312101910828, "grad_norm": 0.18973341584205627, "learning_rate": 0.00012, "loss": 1.2608, "step": 12 }, { "epoch": 0.08280254777070063, "grad_norm": 0.19669465720653534, "learning_rate": 0.00013000000000000002, "loss": 1.2329, "step": 13 }, { "epoch": 0.08917197452229299, "grad_norm": 0.1886417716741562, "learning_rate": 0.00014, "loss": 1.241, "step": 14 }, { "epoch": 0.09554140127388536, "grad_norm": 0.19076582789421082, "learning_rate": 0.00015000000000000001, "loss": 1.2539, "step": 15 }, { "epoch": 0.10191082802547771, "grad_norm": 0.16027267277240753, "learning_rate": 0.00016, "loss": 1.2123, "step": 16 }, { "epoch": 0.10828025477707007, "grad_norm": 0.16112814843654633, "learning_rate": 0.00017, "loss": 1.2465, "step": 17 }, { "epoch": 0.11464968152866242, "grad_norm": 0.15539830923080444, "learning_rate": 0.00018, "loss": 1.1717, "step": 18 }, { "epoch": 0.12101910828025478, "grad_norm": 0.15739695727825165, "learning_rate": 0.00019, "loss": 1.1412, "step": 19 }, { "epoch": 0.12738853503184713, "grad_norm": 0.15658576786518097, "learning_rate": 0.0002, "loss": 1.1731, "step": 20 }, { "epoch": 0.1337579617834395, "grad_norm": 0.1474328637123108, "learning_rate": 0.00019999866506037345, "loss": 1.2051, "step": 21 }, { "epoch": 0.14012738853503184, "grad_norm": 0.11234907805919647, "learning_rate": 0.00019999466027713507, "loss": 1.1803, "step": 22 }, { "epoch": 0.1464968152866242, "grad_norm": 0.1053839772939682, "learning_rate": 0.00019998798575720776, "loss": 1.1436, "step": 23 }, { "epoch": 0.15286624203821655, "grad_norm": 0.1049942821264267, "learning_rate": 0.00019997864167879312, "loss": 1.1881, "step": 24 }, { "epoch": 0.1592356687898089, "grad_norm": 0.11039146035909653, "learning_rate": 0.00019996662829136676, "loss": 1.1528, "step": 25 }, { "epoch": 0.16560509554140126, "grad_norm": 0.09678228944540024, "learning_rate": 0.0001999519459156716, "loss": 1.1496, "step": 26 }, { "epoch": 0.17197452229299362, "grad_norm": 0.09857058525085449, "learning_rate": 0.0001999345949437094, "loss": 1.1304, "step": 27 }, { "epoch": 0.17834394904458598, "grad_norm": 0.10835567116737366, "learning_rate": 0.0001999145758387301, "loss": 1.2262, "step": 28 }, { "epoch": 0.18471337579617833, "grad_norm": 0.09927600622177124, "learning_rate": 0.0001998918891352197, "loss": 1.1382, "step": 29 }, { "epoch": 0.1910828025477707, "grad_norm": 0.09861327707767487, "learning_rate": 0.00019986653543888568, "loss": 1.1987, "step": 30 }, { "epoch": 0.19745222929936307, "grad_norm": 0.09174010157585144, "learning_rate": 0.00019983851542664126, "loss": 1.127, "step": 31 }, { "epoch": 0.20382165605095542, "grad_norm": 0.08863182365894318, "learning_rate": 0.00019980782984658683, "loss": 1.211, "step": 32 }, { "epoch": 0.21019108280254778, "grad_norm": 0.08810263872146606, "learning_rate": 0.00019977447951799034, "loss": 1.1476, "step": 33 }, { "epoch": 0.21656050955414013, "grad_norm": 0.08641776442527771, "learning_rate": 0.00019973846533126533, "loss": 1.1497, "step": 34 }, { "epoch": 0.2229299363057325, "grad_norm": 0.09637051075696945, "learning_rate": 0.00019969978824794707, "loss": 1.1471, "step": 35 }, { "epoch": 0.22929936305732485, "grad_norm": 0.09402573108673096, "learning_rate": 0.000199658449300667, "loss": 1.0976, "step": 36 }, { "epoch": 0.2356687898089172, "grad_norm": 0.09077832847833633, "learning_rate": 0.00019961444959312508, "loss": 1.1119, "step": 37 }, { "epoch": 0.24203821656050956, "grad_norm": 0.08864310383796692, "learning_rate": 0.0001995677903000604, "loss": 1.1157, "step": 38 }, { "epoch": 0.2484076433121019, "grad_norm": 0.09867957979440689, "learning_rate": 0.0001995184726672197, "loss": 1.1656, "step": 39 }, { "epoch": 0.25477707006369427, "grad_norm": 0.09343115240335464, "learning_rate": 0.00019946649801132427, "loss": 1.1084, "step": 40 }, { "epoch": 0.25477707006369427, "eval_loss": 1.1224156618118286, "eval_runtime": 19.2915, "eval_samples_per_second": 56.605, "eval_steps_per_second": 0.933, "step": 40 }, { "epoch": 0.2611464968152866, "grad_norm": 0.09474795311689377, "learning_rate": 0.00019941186772003464, "loss": 1.1486, "step": 41 }, { "epoch": 0.267515923566879, "grad_norm": 0.09726471453905106, "learning_rate": 0.00019935458325191365, "loss": 1.1499, "step": 42 }, { "epoch": 0.27388535031847133, "grad_norm": 0.09273070096969604, "learning_rate": 0.0001992946461363874, "loss": 1.1361, "step": 43 }, { "epoch": 0.2802547770700637, "grad_norm": 0.10344096273183823, "learning_rate": 0.0001992320579737045, "loss": 1.0999, "step": 44 }, { "epoch": 0.28662420382165604, "grad_norm": 0.09499648213386536, "learning_rate": 0.00019916682043489336, "loss": 1.0919, "step": 45 }, { "epoch": 0.2929936305732484, "grad_norm": 0.09483088552951813, "learning_rate": 0.00019909893526171745, "loss": 1.0992, "step": 46 }, { "epoch": 0.29936305732484075, "grad_norm": 0.10382100939750671, "learning_rate": 0.00019902840426662895, "loss": 1.1093, "step": 47 }, { "epoch": 0.3057324840764331, "grad_norm": 0.10187891870737076, "learning_rate": 0.00019895522933272028, "loss": 1.1063, "step": 48 }, { "epoch": 0.31210191082802546, "grad_norm": 0.1022520437836647, "learning_rate": 0.00019887941241367377, "loss": 1.1095, "step": 49 }, { "epoch": 0.3184713375796178, "grad_norm": 0.11470162868499756, "learning_rate": 0.00019880095553370967, "loss": 1.0859, "step": 50 }, { "epoch": 0.3248407643312102, "grad_norm": 0.09845008701086044, "learning_rate": 0.0001987198607875319, "loss": 1.0941, "step": 51 }, { "epoch": 0.33121019108280253, "grad_norm": 0.1080709770321846, "learning_rate": 0.00019863613034027224, "loss": 1.084, "step": 52 }, { "epoch": 0.3375796178343949, "grad_norm": 0.11064234375953674, "learning_rate": 0.0001985497664274326, "loss": 1.1018, "step": 53 }, { "epoch": 0.34394904458598724, "grad_norm": 0.10099776834249496, "learning_rate": 0.0001984607713548251, "loss": 1.0881, "step": 54 }, { "epoch": 0.3503184713375796, "grad_norm": 0.11960357427597046, "learning_rate": 0.0001983691474985108, "loss": 1.0845, "step": 55 }, { "epoch": 0.35668789808917195, "grad_norm": 0.10840114951133728, "learning_rate": 0.00019827489730473596, "loss": 1.131, "step": 56 }, { "epoch": 0.3630573248407643, "grad_norm": 0.10177604109048843, "learning_rate": 0.00019817802328986697, "loss": 1.079, "step": 57 }, { "epoch": 0.36942675159235666, "grad_norm": 0.11752859503030777, "learning_rate": 0.00019807852804032305, "loss": 1.0833, "step": 58 }, { "epoch": 0.37579617834394907, "grad_norm": 0.11149834841489792, "learning_rate": 0.00019797641421250725, "loss": 1.1009, "step": 59 }, { "epoch": 0.3821656050955414, "grad_norm": 0.10446681827306747, "learning_rate": 0.00019787168453273544, "loss": 1.1211, "step": 60 }, { "epoch": 0.3885350318471338, "grad_norm": 0.12820479273796082, "learning_rate": 0.00019776434179716366, "loss": 1.1455, "step": 61 }, { "epoch": 0.39490445859872614, "grad_norm": 0.10011500865221024, "learning_rate": 0.00019765438887171327, "loss": 1.0779, "step": 62 }, { "epoch": 0.4012738853503185, "grad_norm": 0.11496227979660034, "learning_rate": 0.0001975418286919947, "loss": 1.1174, "step": 63 }, { "epoch": 0.40764331210191085, "grad_norm": 0.10938404500484467, "learning_rate": 0.00019742666426322876, "loss": 1.0576, "step": 64 }, { "epoch": 0.4140127388535032, "grad_norm": 0.12636032700538635, "learning_rate": 0.0001973088986601667, "loss": 1.083, "step": 65 }, { "epoch": 0.42038216560509556, "grad_norm": 0.10620423406362534, "learning_rate": 0.00019718853502700783, "loss": 1.0728, "step": 66 }, { "epoch": 0.4267515923566879, "grad_norm": 0.11206210404634476, "learning_rate": 0.0001970655765773159, "loss": 1.1107, "step": 67 }, { "epoch": 0.43312101910828027, "grad_norm": 0.12613879144191742, "learning_rate": 0.00019694002659393305, "loss": 1.1065, "step": 68 }, { "epoch": 0.4394904458598726, "grad_norm": 0.10636976361274719, "learning_rate": 0.00019681188842889222, "loss": 1.1192, "step": 69 }, { "epoch": 0.445859872611465, "grad_norm": 0.11036239564418793, "learning_rate": 0.00019668116550332766, "loss": 1.1362, "step": 70 }, { "epoch": 0.45222929936305734, "grad_norm": 0.11907072365283966, "learning_rate": 0.0001965478613073837, "loss": 1.1009, "step": 71 }, { "epoch": 0.4585987261146497, "grad_norm": 0.11267364770174026, "learning_rate": 0.00019641197940012137, "loss": 1.0694, "step": 72 }, { "epoch": 0.46496815286624205, "grad_norm": 0.10659351199865341, "learning_rate": 0.00019627352340942353, "loss": 1.0844, "step": 73 }, { "epoch": 0.4713375796178344, "grad_norm": 0.12426211684942245, "learning_rate": 0.00019613249703189796, "loss": 1.1203, "step": 74 }, { "epoch": 0.47770700636942676, "grad_norm": 0.11883872747421265, "learning_rate": 0.00019598890403277864, "loss": 1.0879, "step": 75 }, { "epoch": 0.4840764331210191, "grad_norm": 0.11355262994766235, "learning_rate": 0.0001958427482458253, "loss": 1.1045, "step": 76 }, { "epoch": 0.49044585987261147, "grad_norm": 0.11006154865026474, "learning_rate": 0.0001956940335732209, "loss": 1.1058, "step": 77 }, { "epoch": 0.4968152866242038, "grad_norm": 0.11379122734069824, "learning_rate": 0.00019554276398546768, "loss": 1.1224, "step": 78 }, { "epoch": 0.5031847133757962, "grad_norm": 0.11065732687711716, "learning_rate": 0.000195388943521281, "loss": 1.1033, "step": 79 }, { "epoch": 0.5095541401273885, "grad_norm": 0.11113402247428894, "learning_rate": 0.00019523257628748146, "loss": 1.0912, "step": 80 }, { "epoch": 0.5095541401273885, "eval_loss": 1.0586377382278442, "eval_runtime": 19.2899, "eval_samples_per_second": 56.61, "eval_steps_per_second": 0.933, "step": 80 }, { "epoch": 0.5159235668789809, "grad_norm": 0.11783529818058014, "learning_rate": 0.00019507366645888543, "loss": 1.0938, "step": 81 }, { "epoch": 0.5222929936305732, "grad_norm": 0.12089723348617554, "learning_rate": 0.00019491221827819347, "loss": 1.1068, "step": 82 }, { "epoch": 0.5286624203821656, "grad_norm": 0.10991813987493515, "learning_rate": 0.00019474823605587703, "loss": 1.1393, "step": 83 }, { "epoch": 0.535031847133758, "grad_norm": 0.11100416630506516, "learning_rate": 0.00019458172417006347, "loss": 1.1081, "step": 84 }, { "epoch": 0.5414012738853503, "grad_norm": 0.11886284500360489, "learning_rate": 0.00019441268706641907, "loss": 1.1168, "step": 85 }, { "epoch": 0.5477707006369427, "grad_norm": 0.11771067976951599, "learning_rate": 0.00019424112925803039, "loss": 1.098, "step": 86 }, { "epoch": 0.554140127388535, "grad_norm": 0.11022554337978363, "learning_rate": 0.00019406705532528374, "loss": 1.1179, "step": 87 }, { "epoch": 0.5605095541401274, "grad_norm": 0.11891311407089233, "learning_rate": 0.00019389046991574298, "loss": 1.0866, "step": 88 }, { "epoch": 0.5668789808917197, "grad_norm": 0.11594802141189575, "learning_rate": 0.00019371137774402527, "loss": 1.1146, "step": 89 }, { "epoch": 0.5732484076433121, "grad_norm": 0.1181577518582344, "learning_rate": 0.0001935297835916754, "loss": 1.1213, "step": 90 }, { "epoch": 0.5796178343949044, "grad_norm": 0.10821503400802612, "learning_rate": 0.00019334569230703794, "loss": 1.1121, "step": 91 }, { "epoch": 0.5859872611464968, "grad_norm": 0.118013896048069, "learning_rate": 0.0001931591088051279, "loss": 1.117, "step": 92 }, { "epoch": 0.5923566878980892, "grad_norm": 0.11678043752908707, "learning_rate": 0.0001929700380674995, "loss": 1.0974, "step": 93 }, { "epoch": 0.5987261146496815, "grad_norm": 0.11073200404644012, "learning_rate": 0.00019277848514211317, "loss": 1.1059, "step": 94 }, { "epoch": 0.6050955414012739, "grad_norm": 0.11440474539995193, "learning_rate": 0.00019258445514320065, "loss": 1.0913, "step": 95 }, { "epoch": 0.6114649681528662, "grad_norm": 0.11020273715257645, "learning_rate": 0.0001923879532511287, "loss": 1.0836, "step": 96 }, { "epoch": 0.6178343949044586, "grad_norm": 0.11285867542028427, "learning_rate": 0.0001921889847122605, "loss": 1.0842, "step": 97 }, { "epoch": 0.6242038216560509, "grad_norm": 0.11981746554374695, "learning_rate": 0.00019198755483881583, "loss": 1.1062, "step": 98 }, { "epoch": 0.6305732484076433, "grad_norm": 0.11882256716489792, "learning_rate": 0.0001917836690087291, "loss": 1.1012, "step": 99 }, { "epoch": 0.6369426751592356, "grad_norm": 0.11642686277627945, "learning_rate": 0.00019157733266550575, "loss": 1.0823, "step": 100 }, { "epoch": 0.643312101910828, "grad_norm": 0.11980683356523514, "learning_rate": 0.00019136855131807705, "loss": 1.105, "step": 101 }, { "epoch": 0.6496815286624203, "grad_norm": 0.1147085651755333, "learning_rate": 0.0001911573305406528, "loss": 1.0794, "step": 102 }, { "epoch": 0.6560509554140127, "grad_norm": 0.12037765234708786, "learning_rate": 0.00019094367597257282, "loss": 1.1059, "step": 103 }, { "epoch": 0.6624203821656051, "grad_norm": 0.12135636061429977, "learning_rate": 0.000190727593318156, "loss": 1.118, "step": 104 }, { "epoch": 0.6687898089171974, "grad_norm": 0.13285911083221436, "learning_rate": 0.00019050908834654834, "loss": 1.0817, "step": 105 }, { "epoch": 0.6751592356687898, "grad_norm": 0.11360063403844833, "learning_rate": 0.00019028816689156878, "loss": 1.0711, "step": 106 }, { "epoch": 0.6815286624203821, "grad_norm": 0.13178926706314087, "learning_rate": 0.00019006483485155338, "loss": 1.1266, "step": 107 }, { "epoch": 0.6878980891719745, "grad_norm": 0.1290571093559265, "learning_rate": 0.0001898390981891979, "loss": 1.0776, "step": 108 }, { "epoch": 0.6942675159235668, "grad_norm": 0.11376259475946426, "learning_rate": 0.0001896109629313987, "loss": 1.1026, "step": 109 }, { "epoch": 0.7006369426751592, "grad_norm": 0.12076874077320099, "learning_rate": 0.0001893804351690917, "loss": 1.104, "step": 110 }, { "epoch": 0.7070063694267515, "grad_norm": 0.12165362387895584, "learning_rate": 0.0001891475210570898, "loss": 1.0884, "step": 111 }, { "epoch": 0.7133757961783439, "grad_norm": 0.10634943842887878, "learning_rate": 0.00018891222681391851, "loss": 1.0844, "step": 112 }, { "epoch": 0.7197452229299363, "grad_norm": 0.11928383260965347, "learning_rate": 0.00018867455872165008, "loss": 1.1205, "step": 113 }, { "epoch": 0.7261146496815286, "grad_norm": 0.1243489533662796, "learning_rate": 0.00018843452312573554, "loss": 1.0704, "step": 114 }, { "epoch": 0.732484076433121, "grad_norm": 0.11439479887485504, "learning_rate": 0.0001881921264348355, "loss": 1.0809, "step": 115 }, { "epoch": 0.7388535031847133, "grad_norm": 0.1184995099902153, "learning_rate": 0.0001879473751206489, "loss": 1.1619, "step": 116 }, { "epoch": 0.7452229299363057, "grad_norm": 0.11846223473548889, "learning_rate": 0.00018770027571774031, "loss": 1.0835, "step": 117 }, { "epoch": 0.7515923566878981, "grad_norm": 0.11566226184368134, "learning_rate": 0.00018745083482336544, "loss": 1.0658, "step": 118 }, { "epoch": 0.7579617834394905, "grad_norm": 0.11553015559911728, "learning_rate": 0.00018719905909729494, "loss": 1.0773, "step": 119 }, { "epoch": 0.7643312101910829, "grad_norm": 0.13605500757694244, "learning_rate": 0.0001869449552616367, "loss": 1.0727, "step": 120 }, { "epoch": 0.7643312101910829, "eval_loss": 1.0301120281219482, "eval_runtime": 19.2781, "eval_samples_per_second": 56.645, "eval_steps_per_second": 0.934, "step": 120 }, { "epoch": 0.7707006369426752, "grad_norm": 0.1149601861834526, "learning_rate": 0.00018668853010065634, "loss": 1.0745, "step": 121 }, { "epoch": 0.7770700636942676, "grad_norm": 0.11904130131006241, "learning_rate": 0.00018642979046059593, "loss": 1.0574, "step": 122 }, { "epoch": 0.7834394904458599, "grad_norm": 0.11868870258331299, "learning_rate": 0.00018616874324949159, "loss": 1.0681, "step": 123 }, { "epoch": 0.7898089171974523, "grad_norm": 0.11400648951530457, "learning_rate": 0.00018590539543698854, "loss": 1.0874, "step": 124 }, { "epoch": 0.7961783439490446, "grad_norm": 0.12247481942176819, "learning_rate": 0.0001856397540541554, "loss": 1.0832, "step": 125 }, { "epoch": 0.802547770700637, "grad_norm": 0.11855783313512802, "learning_rate": 0.0001853718261932964, "loss": 1.0775, "step": 126 }, { "epoch": 0.8089171974522293, "grad_norm": 0.11434577405452728, "learning_rate": 0.00018510161900776187, "loss": 1.048, "step": 127 }, { "epoch": 0.8152866242038217, "grad_norm": 0.12175115942955017, "learning_rate": 0.00018482913971175737, "loss": 1.0776, "step": 128 }, { "epoch": 0.821656050955414, "grad_norm": 0.1237318217754364, "learning_rate": 0.00018455439558015115, "loss": 1.0977, "step": 129 }, { "epoch": 0.8280254777070064, "grad_norm": 0.12041562050580978, "learning_rate": 0.00018427739394827973, "loss": 1.0477, "step": 130 }, { "epoch": 0.8343949044585988, "grad_norm": 0.11855332553386688, "learning_rate": 0.00018399814221175227, "loss": 1.1026, "step": 131 }, { "epoch": 0.8407643312101911, "grad_norm": 0.12020997703075409, "learning_rate": 0.00018371664782625287, "loss": 1.0484, "step": 132 }, { "epoch": 0.8471337579617835, "grad_norm": 0.1116231232881546, "learning_rate": 0.00018343291830734176, "loss": 1.0772, "step": 133 }, { "epoch": 0.8535031847133758, "grad_norm": 0.12280379235744476, "learning_rate": 0.00018314696123025454, "loss": 1.0829, "step": 134 }, { "epoch": 0.8598726114649682, "grad_norm": 0.11589805781841278, "learning_rate": 0.00018285878422969983, "loss": 1.0636, "step": 135 }, { "epoch": 0.8662420382165605, "grad_norm": 0.11667989194393158, "learning_rate": 0.0001825683949996556, "loss": 1.0783, "step": 136 }, { "epoch": 0.8726114649681529, "grad_norm": 0.11666262894868851, "learning_rate": 0.00018227580129316366, "loss": 1.0587, "step": 137 }, { "epoch": 0.8789808917197452, "grad_norm": 0.11791834235191345, "learning_rate": 0.00018198101092212267, "loss": 1.0955, "step": 138 }, { "epoch": 0.8853503184713376, "grad_norm": 0.12023093551397324, "learning_rate": 0.00018168403175707954, "loss": 1.1133, "step": 139 }, { "epoch": 0.89171974522293, "grad_norm": 0.12082846462726593, "learning_rate": 0.0001813848717270195, "loss": 1.1083, "step": 140 }, { "epoch": 0.8980891719745223, "grad_norm": 0.1259888857603073, "learning_rate": 0.00018108353881915402, "loss": 1.0931, "step": 141 }, { "epoch": 0.9044585987261147, "grad_norm": 0.11900565028190613, "learning_rate": 0.00018078004107870797, "loss": 1.0955, "step": 142 }, { "epoch": 0.910828025477707, "grad_norm": 0.11422552168369293, "learning_rate": 0.00018047438660870446, "loss": 1.0473, "step": 143 }, { "epoch": 0.9171974522292994, "grad_norm": 0.13001863658428192, "learning_rate": 0.00018016658356974884, "loss": 1.0273, "step": 144 }, { "epoch": 0.9235668789808917, "grad_norm": 0.11941977590322495, "learning_rate": 0.0001798566401798106, "loss": 1.0774, "step": 145 }, { "epoch": 0.9299363057324841, "grad_norm": 0.12032714486122131, "learning_rate": 0.00017954456471400393, "loss": 1.1162, "step": 146 }, { "epoch": 0.9363057324840764, "grad_norm": 0.13784518837928772, "learning_rate": 0.00017923036550436704, "loss": 1.095, "step": 147 }, { "epoch": 0.9426751592356688, "grad_norm": 0.12085068970918655, "learning_rate": 0.00017891405093963938, "loss": 1.1024, "step": 148 }, { "epoch": 0.9490445859872612, "grad_norm": 0.11120469868183136, "learning_rate": 0.00017859562946503788, "loss": 1.0502, "step": 149 }, { "epoch": 0.9554140127388535, "grad_norm": 0.1275676190853119, "learning_rate": 0.00017827510958203147, "loss": 1.0875, "step": 150 }, { "epoch": 0.9617834394904459, "grad_norm": 0.13544359803199768, "learning_rate": 0.00017795249984811396, "loss": 1.0985, "step": 151 }, { "epoch": 0.9681528662420382, "grad_norm": 0.11840228736400604, "learning_rate": 0.00017762780887657574, "loss": 1.059, "step": 152 }, { "epoch": 0.9745222929936306, "grad_norm": 0.12622268497943878, "learning_rate": 0.0001773010453362737, "loss": 1.1034, "step": 153 }, { "epoch": 0.9808917197452229, "grad_norm": 0.11485569179058075, "learning_rate": 0.0001769722179513998, "loss": 1.0639, "step": 154 }, { "epoch": 0.9872611464968153, "grad_norm": 0.11948831379413605, "learning_rate": 0.00017664133550124815, "loss": 1.0635, "step": 155 }, { "epoch": 0.9936305732484076, "grad_norm": 0.1214427575469017, "learning_rate": 0.00017630840681998066, "loss": 1.1361, "step": 156 }, { "epoch": 1.0, "grad_norm": 0.11713624000549316, "learning_rate": 0.00017597344079639112, "loss": 1.0619, "step": 157 } ], "logging_steps": 1, "max_steps": 628, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 157, "total_flos": 2.0567076783046656e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }