{ "best_metric": 0.014392802491784096, "best_model_checkpoint": "/home/paperspace/Data/models/dbischof_premise_aea/llm3br256/checkpoint-500", "epoch": 3.872216844143272, "eval_steps": 5, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007744433688286544, "grad_norm": 0.28707125782966614, "learning_rate": 1.5384615384615387e-06, "loss": 0.0847, "step": 1 }, { "epoch": 0.015488867376573089, "grad_norm": 0.34009915590286255, "learning_rate": 3.0769230769230774e-06, "loss": 0.0928, "step": 2 }, { "epoch": 0.023233301064859633, "grad_norm": 0.29313409328460693, "learning_rate": 4.615384615384616e-06, "loss": 0.0934, "step": 3 }, { "epoch": 0.030977734753146177, "grad_norm": 0.2913404107093811, "learning_rate": 6.153846153846155e-06, "loss": 0.0913, "step": 4 }, { "epoch": 0.03872216844143272, "grad_norm": 0.29106780886650085, "learning_rate": 7.692307692307694e-06, "loss": 0.095, "step": 5 }, { "epoch": 0.03872216844143272, "eval_loss": 0.07727333903312683, "eval_runtime": 5.9343, "eval_samples_per_second": 8.426, "eval_steps_per_second": 2.191, "step": 5 }, { "epoch": 0.046466602129719266, "grad_norm": 0.23025450110435486, "learning_rate": 9.230769230769232e-06, "loss": 0.0948, "step": 6 }, { "epoch": 0.05421103581800581, "grad_norm": 0.21704453229904175, "learning_rate": 1.0769230769230771e-05, "loss": 0.0727, "step": 7 }, { "epoch": 0.061955469506292354, "grad_norm": 0.17385561764240265, "learning_rate": 1.230769230769231e-05, "loss": 0.0689, "step": 8 }, { "epoch": 0.0696999031945789, "grad_norm": 0.15649482607841492, "learning_rate": 1.3846153846153847e-05, "loss": 0.0604, "step": 9 }, { "epoch": 0.07744433688286544, "grad_norm": 0.11710207164287567, "learning_rate": 1.5384615384615387e-05, "loss": 0.0562, "step": 10 }, { "epoch": 0.07744433688286544, "eval_loss": 0.04053657874464989, "eval_runtime": 4.8954, "eval_samples_per_second": 10.214, "eval_steps_per_second": 2.656, "step": 10 }, { "epoch": 0.08518877057115198, "grad_norm": 0.09721983969211578, "learning_rate": 1.6923076923076924e-05, "loss": 0.0393, "step": 11 }, { "epoch": 0.09293320425943853, "grad_norm": 0.09856045991182327, "learning_rate": 1.8461538461538465e-05, "loss": 0.0404, "step": 12 }, { "epoch": 0.10067763794772508, "grad_norm": 0.11793606728315353, "learning_rate": 2e-05, "loss": 0.0455, "step": 13 }, { "epoch": 0.10842207163601161, "grad_norm": 0.11285863816738129, "learning_rate": 2.1538461538461542e-05, "loss": 0.059, "step": 14 }, { "epoch": 0.11616650532429816, "grad_norm": 0.08813278377056122, "learning_rate": 2.307692307692308e-05, "loss": 0.032, "step": 15 }, { "epoch": 0.11616650532429816, "eval_loss": 0.03360835462808609, "eval_runtime": 4.8812, "eval_samples_per_second": 10.243, "eval_steps_per_second": 2.663, "step": 15 }, { "epoch": 0.12391093901258471, "grad_norm": 0.06082022562623024, "learning_rate": 2.461538461538462e-05, "loss": 0.0419, "step": 16 }, { "epoch": 0.13165537270087124, "grad_norm": 0.055546533316373825, "learning_rate": 2.6153846153846157e-05, "loss": 0.0452, "step": 17 }, { "epoch": 0.1393998063891578, "grad_norm": 0.0525379441678524, "learning_rate": 2.7692307692307694e-05, "loss": 0.0329, "step": 18 }, { "epoch": 0.14714424007744434, "grad_norm": 0.058248624205589294, "learning_rate": 2.9230769230769234e-05, "loss": 0.0338, "step": 19 }, { "epoch": 0.15488867376573087, "grad_norm": 0.057563405483961105, "learning_rate": 3.0769230769230774e-05, "loss": 0.0488, "step": 20 }, { "epoch": 0.15488867376573087, "eval_loss": 0.031162459403276443, "eval_runtime": 4.9017, "eval_samples_per_second": 10.201, "eval_steps_per_second": 2.652, "step": 20 }, { "epoch": 0.16263310745401743, "grad_norm": 0.04852646589279175, "learning_rate": 3.230769230769231e-05, "loss": 0.0364, "step": 21 }, { "epoch": 0.17037754114230397, "grad_norm": 0.05401140823960304, "learning_rate": 3.384615384615385e-05, "loss": 0.0446, "step": 22 }, { "epoch": 0.1781219748305905, "grad_norm": 0.0492316372692585, "learning_rate": 3.538461538461539e-05, "loss": 0.0407, "step": 23 }, { "epoch": 0.18586640851887706, "grad_norm": 0.037774790078401566, "learning_rate": 3.692307692307693e-05, "loss": 0.0315, "step": 24 }, { "epoch": 0.1936108422071636, "grad_norm": 0.04360613971948624, "learning_rate": 3.846153846153846e-05, "loss": 0.0331, "step": 25 }, { "epoch": 0.1936108422071636, "eval_loss": 0.02766346000134945, "eval_runtime": 4.8772, "eval_samples_per_second": 10.252, "eval_steps_per_second": 2.665, "step": 25 }, { "epoch": 0.20135527589545016, "grad_norm": 0.037237901240587234, "learning_rate": 4e-05, "loss": 0.0259, "step": 26 }, { "epoch": 0.2090997095837367, "grad_norm": 0.03505983576178551, "learning_rate": 4.1538461538461544e-05, "loss": 0.0303, "step": 27 }, { "epoch": 0.21684414327202323, "grad_norm": 0.041253913193941116, "learning_rate": 4.3076923076923084e-05, "loss": 0.0453, "step": 28 }, { "epoch": 0.2245885769603098, "grad_norm": 0.04072079062461853, "learning_rate": 4.461538461538462e-05, "loss": 0.0316, "step": 29 }, { "epoch": 0.23233301064859632, "grad_norm": 0.03738202154636383, "learning_rate": 4.615384615384616e-05, "loss": 0.0377, "step": 30 }, { "epoch": 0.23233301064859632, "eval_loss": 0.025424109771847725, "eval_runtime": 4.8765, "eval_samples_per_second": 10.253, "eval_steps_per_second": 2.666, "step": 30 }, { "epoch": 0.24007744433688286, "grad_norm": 0.03633822873234749, "learning_rate": 4.76923076923077e-05, "loss": 0.0369, "step": 31 }, { "epoch": 0.24782187802516942, "grad_norm": 0.03256253898143768, "learning_rate": 4.923076923076924e-05, "loss": 0.0349, "step": 32 }, { "epoch": 0.25556631171345595, "grad_norm": 0.031838804483413696, "learning_rate": 5.0769230769230766e-05, "loss": 0.0283, "step": 33 }, { "epoch": 0.2633107454017425, "grad_norm": 0.026707077398896217, "learning_rate": 5.230769230769231e-05, "loss": 0.0283, "step": 34 }, { "epoch": 0.271055179090029, "grad_norm": 0.03254338726401329, "learning_rate": 5.384615384615385e-05, "loss": 0.0316, "step": 35 }, { "epoch": 0.271055179090029, "eval_loss": 0.024270590394735336, "eval_runtime": 4.8832, "eval_samples_per_second": 10.239, "eval_steps_per_second": 2.662, "step": 35 }, { "epoch": 0.2787996127783156, "grad_norm": 0.030620776116847992, "learning_rate": 5.538461538461539e-05, "loss": 0.0306, "step": 36 }, { "epoch": 0.28654404646660214, "grad_norm": 0.03317311033606529, "learning_rate": 5.692307692307692e-05, "loss": 0.0293, "step": 37 }, { "epoch": 0.2942884801548887, "grad_norm": 0.026506489142775536, "learning_rate": 5.846153846153847e-05, "loss": 0.0293, "step": 38 }, { "epoch": 0.3020329138431752, "grad_norm": 0.023665621876716614, "learning_rate": 6e-05, "loss": 0.0166, "step": 39 }, { "epoch": 0.30977734753146174, "grad_norm": 0.03278828039765358, "learning_rate": 6.153846153846155e-05, "loss": 0.0374, "step": 40 }, { "epoch": 0.30977734753146174, "eval_loss": 0.023048410192131996, "eval_runtime": 4.885, "eval_samples_per_second": 10.235, "eval_steps_per_second": 2.661, "step": 40 }, { "epoch": 0.31752178121974833, "grad_norm": 0.03030160255730152, "learning_rate": 6.307692307692308e-05, "loss": 0.0334, "step": 41 }, { "epoch": 0.32526621490803487, "grad_norm": 0.03384114429354668, "learning_rate": 6.461538461538462e-05, "loss": 0.0212, "step": 42 }, { "epoch": 0.3330106485963214, "grad_norm": 0.02560395933687687, "learning_rate": 6.615384615384616e-05, "loss": 0.0363, "step": 43 }, { "epoch": 0.34075508228460794, "grad_norm": 0.026470044627785683, "learning_rate": 6.76923076923077e-05, "loss": 0.024, "step": 44 }, { "epoch": 0.34849951597289447, "grad_norm": 0.023488877341151237, "learning_rate": 6.923076923076924e-05, "loss": 0.0208, "step": 45 }, { "epoch": 0.34849951597289447, "eval_loss": 0.022530335932970047, "eval_runtime": 4.8759, "eval_samples_per_second": 10.255, "eval_steps_per_second": 2.666, "step": 45 }, { "epoch": 0.356243949661181, "grad_norm": 0.029532263055443764, "learning_rate": 7.076923076923078e-05, "loss": 0.0399, "step": 46 }, { "epoch": 0.3639883833494676, "grad_norm": 0.025283565744757652, "learning_rate": 7.23076923076923e-05, "loss": 0.033, "step": 47 }, { "epoch": 0.3717328170377541, "grad_norm": 0.024645334109663963, "learning_rate": 7.384615384615386e-05, "loss": 0.0431, "step": 48 }, { "epoch": 0.37947725072604066, "grad_norm": 0.025530191138386726, "learning_rate": 7.538461538461539e-05, "loss": 0.0321, "step": 49 }, { "epoch": 0.3872216844143272, "grad_norm": 0.02383197844028473, "learning_rate": 7.692307692307693e-05, "loss": 0.0305, "step": 50 }, { "epoch": 0.3872216844143272, "eval_loss": 0.021847765892744064, "eval_runtime": 4.8901, "eval_samples_per_second": 10.225, "eval_steps_per_second": 2.658, "step": 50 }, { "epoch": 0.39496611810261373, "grad_norm": 0.02661319635808468, "learning_rate": 7.846153846153847e-05, "loss": 0.0312, "step": 51 }, { "epoch": 0.4027105517909003, "grad_norm": 0.029026813805103302, "learning_rate": 8e-05, "loss": 0.0202, "step": 52 }, { "epoch": 0.41045498547918685, "grad_norm": 0.03153839334845543, "learning_rate": 8.153846153846155e-05, "loss": 0.0322, "step": 53 }, { "epoch": 0.4181994191674734, "grad_norm": 0.027100125327706337, "learning_rate": 8.307692307692309e-05, "loss": 0.0217, "step": 54 }, { "epoch": 0.4259438528557599, "grad_norm": 0.034204043447971344, "learning_rate": 8.461538461538461e-05, "loss": 0.0238, "step": 55 }, { "epoch": 0.4259438528557599, "eval_loss": 0.021218011155724525, "eval_runtime": 4.895, "eval_samples_per_second": 10.215, "eval_steps_per_second": 2.656, "step": 55 }, { "epoch": 0.43368828654404645, "grad_norm": 0.026411807164549828, "learning_rate": 8.615384615384617e-05, "loss": 0.0264, "step": 56 }, { "epoch": 0.441432720232333, "grad_norm": 0.025747094303369522, "learning_rate": 8.76923076923077e-05, "loss": 0.0231, "step": 57 }, { "epoch": 0.4491771539206196, "grad_norm": 0.028047436848282814, "learning_rate": 8.923076923076924e-05, "loss": 0.0269, "step": 58 }, { "epoch": 0.4569215876089061, "grad_norm": 0.03033887967467308, "learning_rate": 9.076923076923078e-05, "loss": 0.0286, "step": 59 }, { "epoch": 0.46466602129719264, "grad_norm": 0.024372393265366554, "learning_rate": 9.230769230769232e-05, "loss": 0.0278, "step": 60 }, { "epoch": 0.46466602129719264, "eval_loss": 0.020728331059217453, "eval_runtime": 4.8702, "eval_samples_per_second": 10.266, "eval_steps_per_second": 2.669, "step": 60 }, { "epoch": 0.4724104549854792, "grad_norm": 0.028278978541493416, "learning_rate": 9.384615384615386e-05, "loss": 0.0247, "step": 61 }, { "epoch": 0.4801548886737657, "grad_norm": 0.03280925378203392, "learning_rate": 9.53846153846154e-05, "loss": 0.026, "step": 62 }, { "epoch": 0.4878993223620523, "grad_norm": 0.023919392377138138, "learning_rate": 9.692307692307692e-05, "loss": 0.0312, "step": 63 }, { "epoch": 0.49564375605033884, "grad_norm": 0.0364394448697567, "learning_rate": 9.846153846153848e-05, "loss": 0.0219, "step": 64 }, { "epoch": 0.5033881897386253, "grad_norm": 0.02771547995507717, "learning_rate": 0.0001, "loss": 0.0199, "step": 65 }, { "epoch": 0.5033881897386253, "eval_loss": 0.02000207081437111, "eval_runtime": 4.8908, "eval_samples_per_second": 10.223, "eval_steps_per_second": 2.658, "step": 65 }, { "epoch": 0.5111326234269119, "grad_norm": 0.02505766600370407, "learning_rate": 9.999926652940913e-05, "loss": 0.0206, "step": 66 }, { "epoch": 0.5188770571151985, "grad_norm": 0.037389349192380905, "learning_rate": 9.999706613915566e-05, "loss": 0.0265, "step": 67 }, { "epoch": 0.526621490803485, "grad_norm": 0.03750506415963173, "learning_rate": 9.999339889379647e-05, "loss": 0.0236, "step": 68 }, { "epoch": 0.5343659244917716, "grad_norm": 0.028572333976626396, "learning_rate": 9.998826490092421e-05, "loss": 0.0236, "step": 69 }, { "epoch": 0.542110358180058, "grad_norm": 0.024309856817126274, "learning_rate": 9.99816643111642e-05, "loss": 0.0235, "step": 70 }, { "epoch": 0.542110358180058, "eval_loss": 0.02025166153907776, "eval_runtime": 4.8811, "eval_samples_per_second": 10.244, "eval_steps_per_second": 2.663, "step": 70 }, { "epoch": 0.5498547918683446, "grad_norm": 0.035883497446775436, "learning_rate": 9.997359731816998e-05, "loss": 0.0289, "step": 71 }, { "epoch": 0.5575992255566312, "grad_norm": 0.034139424562454224, "learning_rate": 9.996406415861763e-05, "loss": 0.0366, "step": 72 }, { "epoch": 0.5653436592449177, "grad_norm": 0.02562110312283039, "learning_rate": 9.995306511219885e-05, "loss": 0.0336, "step": 73 }, { "epoch": 0.5730880929332043, "grad_norm": 0.026915963739156723, "learning_rate": 9.994060050161269e-05, "loss": 0.0193, "step": 74 }, { "epoch": 0.5808325266214908, "grad_norm": 0.02748969756066799, "learning_rate": 9.992667069255619e-05, "loss": 0.0213, "step": 75 }, { "epoch": 0.5808325266214908, "eval_loss": 0.019886016845703125, "eval_runtime": 4.8762, "eval_samples_per_second": 10.254, "eval_steps_per_second": 2.666, "step": 75 }, { "epoch": 0.5885769603097774, "grad_norm": 0.0281902477145195, "learning_rate": 9.991127609371356e-05, "loss": 0.0333, "step": 76 }, { "epoch": 0.5963213939980639, "grad_norm": 0.032518427819013596, "learning_rate": 9.989441715674422e-05, "loss": 0.0296, "step": 77 }, { "epoch": 0.6040658276863504, "grad_norm": 0.0259566493332386, "learning_rate": 9.987609437626955e-05, "loss": 0.0282, "step": 78 }, { "epoch": 0.611810261374637, "grad_norm": 0.029854053631424904, "learning_rate": 9.985630828985835e-05, "loss": 0.0205, "step": 79 }, { "epoch": 0.6195546950629235, "grad_norm": 0.03595299273729324, "learning_rate": 9.983505947801115e-05, "loss": 0.044, "step": 80 }, { "epoch": 0.6195546950629235, "eval_loss": 0.01953260228037834, "eval_runtime": 4.8809, "eval_samples_per_second": 10.244, "eval_steps_per_second": 2.663, "step": 80 }, { "epoch": 0.6272991287512101, "grad_norm": 0.02581968903541565, "learning_rate": 9.981234856414307e-05, "loss": 0.0265, "step": 81 }, { "epoch": 0.6350435624394967, "grad_norm": 0.02523561753332615, "learning_rate": 9.978817621456562e-05, "loss": 0.0232, "step": 82 }, { "epoch": 0.6427879961277831, "grad_norm": 0.022955749183893204, "learning_rate": 9.97625431384671e-05, "loss": 0.0267, "step": 83 }, { "epoch": 0.6505324298160697, "grad_norm": 0.0209239199757576, "learning_rate": 9.973545008789181e-05, "loss": 0.0303, "step": 84 }, { "epoch": 0.6582768635043562, "grad_norm": 0.028582807630300522, "learning_rate": 9.970689785771798e-05, "loss": 0.021, "step": 85 }, { "epoch": 0.6582768635043562, "eval_loss": 0.019236262887716293, "eval_runtime": 4.874, "eval_samples_per_second": 10.258, "eval_steps_per_second": 2.667, "step": 85 }, { "epoch": 0.6660212971926428, "grad_norm": 0.02616284228861332, "learning_rate": 9.967688728563446e-05, "loss": 0.0176, "step": 86 }, { "epoch": 0.6737657308809293, "grad_norm": 0.029908856377005577, "learning_rate": 9.964541925211612e-05, "loss": 0.0206, "step": 87 }, { "epoch": 0.6815101645692159, "grad_norm": 0.03139350563287735, "learning_rate": 9.961249468039807e-05, "loss": 0.0301, "step": 88 }, { "epoch": 0.6892545982575025, "grad_norm": 0.025906842201948166, "learning_rate": 9.957811453644847e-05, "loss": 0.0192, "step": 89 }, { "epoch": 0.6969990319457889, "grad_norm": 0.0281496811658144, "learning_rate": 9.954227982894034e-05, "loss": 0.0296, "step": 90 }, { "epoch": 0.6969990319457889, "eval_loss": 0.019074302166700363, "eval_runtime": 4.8832, "eval_samples_per_second": 10.239, "eval_steps_per_second": 2.662, "step": 90 }, { "epoch": 0.7047434656340755, "grad_norm": 0.027965204790234566, "learning_rate": 9.950499160922183e-05, "loss": 0.0213, "step": 91 }, { "epoch": 0.712487899322362, "grad_norm": 0.02602163329720497, "learning_rate": 9.946625097128543e-05, "loss": 0.0269, "step": 92 }, { "epoch": 0.7202323330106486, "grad_norm": 0.028190776705741882, "learning_rate": 9.942605905173592e-05, "loss": 0.0207, "step": 93 }, { "epoch": 0.7279767666989352, "grad_norm": 0.025893300771713257, "learning_rate": 9.938441702975689e-05, "loss": 0.0265, "step": 94 }, { "epoch": 0.7357212003872217, "grad_norm": 0.0202568881213665, "learning_rate": 9.934132612707632e-05, "loss": 0.0141, "step": 95 }, { "epoch": 0.7357212003872217, "eval_loss": 0.018998095765709877, "eval_runtime": 4.8865, "eval_samples_per_second": 10.232, "eval_steps_per_second": 2.66, "step": 95 }, { "epoch": 0.7434656340755083, "grad_norm": 0.03151071444153786, "learning_rate": 9.929678760793057e-05, "loss": 0.028, "step": 96 }, { "epoch": 0.7512100677637947, "grad_norm": 0.037441398948431015, "learning_rate": 9.925080277902743e-05, "loss": 0.0275, "step": 97 }, { "epoch": 0.7589545014520813, "grad_norm": 0.022733572870492935, "learning_rate": 9.920337298950765e-05, "loss": 0.0227, "step": 98 }, { "epoch": 0.7666989351403679, "grad_norm": 0.021637218073010445, "learning_rate": 9.91544996309055e-05, "loss": 0.0179, "step": 99 }, { "epoch": 0.7744433688286544, "grad_norm": 0.023374751210212708, "learning_rate": 9.91041841371078e-05, "loss": 0.0289, "step": 100 }, { "epoch": 0.7744433688286544, "eval_loss": 0.01871725358068943, "eval_runtime": 4.9046, "eval_samples_per_second": 10.195, "eval_steps_per_second": 2.651, "step": 100 }, { "epoch": 0.782187802516941, "grad_norm": 0.021633530035614967, "learning_rate": 9.905242798431196e-05, "loss": 0.0267, "step": 101 }, { "epoch": 0.7899322362052275, "grad_norm": 0.024837492033839226, "learning_rate": 9.899923269098262e-05, "loss": 0.0341, "step": 102 }, { "epoch": 0.797676669893514, "grad_norm": 0.023348737508058548, "learning_rate": 9.894459981780711e-05, "loss": 0.0263, "step": 103 }, { "epoch": 0.8054211035818006, "grad_norm": 0.02404264733195305, "learning_rate": 9.888853096764964e-05, "loss": 0.0214, "step": 104 }, { "epoch": 0.8131655372700871, "grad_norm": 0.02434077486395836, "learning_rate": 9.883102778550434e-05, "loss": 0.0159, "step": 105 }, { "epoch": 0.8131655372700871, "eval_loss": 0.01875956915318966, "eval_runtime": 4.887, "eval_samples_per_second": 10.231, "eval_steps_per_second": 2.66, "step": 105 }, { "epoch": 0.8209099709583737, "grad_norm": 0.023013584315776825, "learning_rate": 9.877209195844692e-05, "loss": 0.0266, "step": 106 }, { "epoch": 0.8286544046466602, "grad_norm": 0.03137190267443657, "learning_rate": 9.871172521558523e-05, "loss": 0.0242, "step": 107 }, { "epoch": 0.8363988383349468, "grad_norm": 0.023217204958200455, "learning_rate": 9.864992932800845e-05, "loss": 0.0254, "step": 108 }, { "epoch": 0.8441432720232332, "grad_norm": 0.027811044827103615, "learning_rate": 9.858670610873528e-05, "loss": 0.0173, "step": 109 }, { "epoch": 0.8518877057115198, "grad_norm": 0.027365995571017265, "learning_rate": 9.852205741266058e-05, "loss": 0.0275, "step": 110 }, { "epoch": 0.8518877057115198, "eval_loss": 0.01876773312687874, "eval_runtime": 4.8844, "eval_samples_per_second": 10.237, "eval_steps_per_second": 2.662, "step": 110 }, { "epoch": 0.8596321393998064, "grad_norm": 0.022870220243930817, "learning_rate": 9.845598513650103e-05, "loss": 0.0175, "step": 111 }, { "epoch": 0.8673765730880929, "grad_norm": 0.021480288356542587, "learning_rate": 9.838849121873949e-05, "loss": 0.0179, "step": 112 }, { "epoch": 0.8751210067763795, "grad_norm": 0.025231841951608658, "learning_rate": 9.831957763956813e-05, "loss": 0.0182, "step": 113 }, { "epoch": 0.882865440464666, "grad_norm": 0.023175878450274467, "learning_rate": 9.824924642083026e-05, "loss": 0.0167, "step": 114 }, { "epoch": 0.8906098741529526, "grad_norm": 0.02536984719336033, "learning_rate": 9.817749962596115e-05, "loss": 0.0271, "step": 115 }, { "epoch": 0.8906098741529526, "eval_loss": 0.018538037315011024, "eval_runtime": 4.8812, "eval_samples_per_second": 10.243, "eval_steps_per_second": 2.663, "step": 115 }, { "epoch": 0.8983543078412392, "grad_norm": 0.02080857753753662, "learning_rate": 9.810433935992733e-05, "loss": 0.0254, "step": 116 }, { "epoch": 0.9060987415295256, "grad_norm": 0.026430707424879074, "learning_rate": 9.802976776916494e-05, "loss": 0.0185, "step": 117 }, { "epoch": 0.9138431752178122, "grad_norm": 0.02291349321603775, "learning_rate": 9.795378704151675e-05, "loss": 0.0164, "step": 118 }, { "epoch": 0.9215876089060987, "grad_norm": 0.02319083735346794, "learning_rate": 9.787639940616788e-05, "loss": 0.0237, "step": 119 }, { "epoch": 0.9293320425943853, "grad_norm": 0.027965422719717026, "learning_rate": 9.779760713358059e-05, "loss": 0.0262, "step": 120 }, { "epoch": 0.9293320425943853, "eval_loss": 0.018477478995919228, "eval_runtime": 4.8802, "eval_samples_per_second": 10.246, "eval_steps_per_second": 2.664, "step": 120 }, { "epoch": 0.9370764762826719, "grad_norm": 0.023768456652760506, "learning_rate": 9.771741253542741e-05, "loss": 0.0186, "step": 121 }, { "epoch": 0.9448209099709584, "grad_norm": 0.01906961388885975, "learning_rate": 9.763581796452353e-05, "loss": 0.0163, "step": 122 }, { "epoch": 0.952565343659245, "grad_norm": 0.022706998512148857, "learning_rate": 9.755282581475769e-05, "loss": 0.0253, "step": 123 }, { "epoch": 0.9603097773475314, "grad_norm": 0.02551465854048729, "learning_rate": 9.74684385210219e-05, "loss": 0.0163, "step": 124 }, { "epoch": 0.968054211035818, "grad_norm": 0.02145274542272091, "learning_rate": 9.738265855914013e-05, "loss": 0.0299, "step": 125 }, { "epoch": 0.968054211035818, "eval_loss": 0.01828974299132824, "eval_runtime": 4.8759, "eval_samples_per_second": 10.254, "eval_steps_per_second": 2.666, "step": 125 }, { "epoch": 0.9757986447241046, "grad_norm": 0.023152988404035568, "learning_rate": 9.729548844579552e-05, "loss": 0.0178, "step": 126 }, { "epoch": 0.9835430784123911, "grad_norm": 0.026649784296751022, "learning_rate": 9.720693073845667e-05, "loss": 0.024, "step": 127 }, { "epoch": 0.9912875121006777, "grad_norm": 0.020236071199178696, "learning_rate": 9.711698803530254e-05, "loss": 0.0301, "step": 128 }, { "epoch": 0.9990319457889641, "grad_norm": 0.027533914893865585, "learning_rate": 9.70256629751462e-05, "loss": 0.0195, "step": 129 }, { "epoch": 1.0067763794772506, "grad_norm": 0.053280122578144073, "learning_rate": 9.693295823735753e-05, "loss": 0.0315, "step": 130 }, { "epoch": 1.0067763794772506, "eval_loss": 0.018024258315563202, "eval_runtime": 4.8931, "eval_samples_per_second": 10.219, "eval_steps_per_second": 2.657, "step": 130 }, { "epoch": 1.0145208131655372, "grad_norm": 0.01893387921154499, "learning_rate": 9.683887654178445e-05, "loss": 0.0226, "step": 131 }, { "epoch": 1.0222652468538238, "grad_norm": 0.029532097280025482, "learning_rate": 9.674342064867326e-05, "loss": 0.0145, "step": 132 }, { "epoch": 1.0300096805421104, "grad_norm": 0.028108367696404457, "learning_rate": 9.664659335858755e-05, "loss": 0.0148, "step": 133 }, { "epoch": 1.037754114230397, "grad_norm": 0.025696909055113792, "learning_rate": 9.654839751232611e-05, "loss": 0.0198, "step": 134 }, { "epoch": 1.0454985479186834, "grad_norm": 0.02809828147292137, "learning_rate": 9.644883599083958e-05, "loss": 0.0212, "step": 135 }, { "epoch": 1.0454985479186834, "eval_loss": 0.017997030168771744, "eval_runtime": 4.8817, "eval_samples_per_second": 10.242, "eval_steps_per_second": 2.663, "step": 135 }, { "epoch": 1.05324298160697, "grad_norm": 0.023596247658133507, "learning_rate": 9.634791171514585e-05, "loss": 0.027, "step": 136 }, { "epoch": 1.0609874152952565, "grad_norm": 0.032478995621204376, "learning_rate": 9.624562764624445e-05, "loss": 0.0231, "step": 137 }, { "epoch": 1.0687318489835431, "grad_norm": 0.029977047815918922, "learning_rate": 9.614198678502965e-05, "loss": 0.0139, "step": 138 }, { "epoch": 1.0764762826718297, "grad_norm": 0.03173111006617546, "learning_rate": 9.603699217220239e-05, "loss": 0.0188, "step": 139 }, { "epoch": 1.084220716360116, "grad_norm": 0.02266346476972103, "learning_rate": 9.59306468881811e-05, "loss": 0.0172, "step": 140 }, { "epoch": 1.084220716360116, "eval_loss": 0.018361272290349007, "eval_runtime": 4.8948, "eval_samples_per_second": 10.215, "eval_steps_per_second": 2.656, "step": 140 }, { "epoch": 1.0919651500484027, "grad_norm": 0.03363156318664551, "learning_rate": 9.582295405301131e-05, "loss": 0.0202, "step": 141 }, { "epoch": 1.0997095837366893, "grad_norm": 0.03840557113289833, "learning_rate": 9.571391682627412e-05, "loss": 0.0222, "step": 142 }, { "epoch": 1.1074540174249758, "grad_norm": 0.023486673831939697, "learning_rate": 9.56035384069935e-05, "loss": 0.0396, "step": 143 }, { "epoch": 1.1151984511132624, "grad_norm": 0.030952000990509987, "learning_rate": 9.549182203354242e-05, "loss": 0.0225, "step": 144 }, { "epoch": 1.1229428848015488, "grad_norm": 0.030439218506217003, "learning_rate": 9.537877098354786e-05, "loss": 0.0277, "step": 145 }, { "epoch": 1.1229428848015488, "eval_loss": 0.01816246099770069, "eval_runtime": 4.8899, "eval_samples_per_second": 10.225, "eval_steps_per_second": 2.659, "step": 145 }, { "epoch": 1.1306873184898354, "grad_norm": 0.024195371195673943, "learning_rate": 9.526438857379463e-05, "loss": 0.0116, "step": 146 }, { "epoch": 1.138431752178122, "grad_norm": 0.02799941971898079, "learning_rate": 9.514867816012809e-05, "loss": 0.0195, "step": 147 }, { "epoch": 1.1461761858664086, "grad_norm": 0.030233675613999367, "learning_rate": 9.503164313735566e-05, "loss": 0.0182, "step": 148 }, { "epoch": 1.1539206195546952, "grad_norm": 0.024903280660510063, "learning_rate": 9.491328693914722e-05, "loss": 0.0222, "step": 149 }, { "epoch": 1.1616650532429815, "grad_norm": 0.023587804287672043, "learning_rate": 9.47936130379344e-05, "loss": 0.0166, "step": 150 }, { "epoch": 1.1616650532429815, "eval_loss": 0.017931492999196053, "eval_runtime": 4.8826, "eval_samples_per_second": 10.24, "eval_steps_per_second": 2.663, "step": 150 }, { "epoch": 1.1694094869312681, "grad_norm": 0.024121137335896492, "learning_rate": 9.467262494480869e-05, "loss": 0.0216, "step": 151 }, { "epoch": 1.1771539206195547, "grad_norm": 0.02379632741212845, "learning_rate": 9.45503262094184e-05, "loss": 0.023, "step": 152 }, { "epoch": 1.1848983543078413, "grad_norm": 0.02161642163991928, "learning_rate": 9.442672041986457e-05, "loss": 0.0349, "step": 153 }, { "epoch": 1.1926427879961277, "grad_norm": 0.019304990768432617, "learning_rate": 9.430181120259565e-05, "loss": 0.0193, "step": 154 }, { "epoch": 1.2003872216844143, "grad_norm": 0.022498024627566338, "learning_rate": 9.417560222230115e-05, "loss": 0.0272, "step": 155 }, { "epoch": 1.2003872216844143, "eval_loss": 0.018144290894269943, "eval_runtime": 4.8768, "eval_samples_per_second": 10.253, "eval_steps_per_second": 2.666, "step": 155 }, { "epoch": 1.2081316553727008, "grad_norm": 0.03062877058982849, "learning_rate": 9.404809718180407e-05, "loss": 0.0215, "step": 156 }, { "epoch": 1.2158760890609874, "grad_norm": 0.023427944630384445, "learning_rate": 9.391929982195232e-05, "loss": 0.0301, "step": 157 }, { "epoch": 1.223620522749274, "grad_norm": 0.02246953919529915, "learning_rate": 9.378921392150892e-05, "loss": 0.0212, "step": 158 }, { "epoch": 1.2313649564375604, "grad_norm": 0.02264482155442238, "learning_rate": 9.365784329704115e-05, "loss": 0.0164, "step": 159 }, { "epoch": 1.239109390125847, "grad_norm": 0.025367658585309982, "learning_rate": 9.35251918028086e-05, "loss": 0.0193, "step": 160 }, { "epoch": 1.239109390125847, "eval_loss": 0.017837481573224068, "eval_runtime": 4.8761, "eval_samples_per_second": 10.254, "eval_steps_per_second": 2.666, "step": 160 }, { "epoch": 1.2468538238141336, "grad_norm": 0.02131119929254055, "learning_rate": 9.339126333065007e-05, "loss": 0.0207, "step": 161 }, { "epoch": 1.2545982575024202, "grad_norm": 0.019136667251586914, "learning_rate": 9.325606180986939e-05, "loss": 0.0147, "step": 162 }, { "epoch": 1.2623426911907067, "grad_norm": 0.024482635781168938, "learning_rate": 9.31195912071201e-05, "loss": 0.0299, "step": 163 }, { "epoch": 1.2700871248789931, "grad_norm": 0.02487838640809059, "learning_rate": 9.298185552628917e-05, "loss": 0.0232, "step": 164 }, { "epoch": 1.2778315585672797, "grad_norm": 0.025261854752898216, "learning_rate": 9.284285880837946e-05, "loss": 0.0121, "step": 165 }, { "epoch": 1.2778315585672797, "eval_loss": 0.017772378399968147, "eval_runtime": 4.8807, "eval_samples_per_second": 10.245, "eval_steps_per_second": 2.664, "step": 165 }, { "epoch": 1.2855759922555663, "grad_norm": 0.02148056961596012, "learning_rate": 9.270260513139116e-05, "loss": 0.0347, "step": 166 }, { "epoch": 1.2933204259438529, "grad_norm": 0.02021237276494503, "learning_rate": 9.256109861020213e-05, "loss": 0.02, "step": 167 }, { "epoch": 1.3010648596321395, "grad_norm": 0.017359554767608643, "learning_rate": 9.241834339644726e-05, "loss": 0.0168, "step": 168 }, { "epoch": 1.3088092933204258, "grad_norm": 0.02310781180858612, "learning_rate": 9.22743436783966e-05, "loss": 0.0192, "step": 169 }, { "epoch": 1.3165537270087124, "grad_norm": 0.020348088815808296, "learning_rate": 9.212910368083245e-05, "loss": 0.0218, "step": 170 }, { "epoch": 1.3165537270087124, "eval_loss": 0.0177312009036541, "eval_runtime": 4.8794, "eval_samples_per_second": 10.247, "eval_steps_per_second": 2.664, "step": 170 }, { "epoch": 1.324298160696999, "grad_norm": 0.019140997901558876, "learning_rate": 9.198262766492554e-05, "loss": 0.0217, "step": 171 }, { "epoch": 1.3320425943852856, "grad_norm": 0.023120978847146034, "learning_rate": 9.183491992810979e-05, "loss": 0.0275, "step": 172 }, { "epoch": 1.3397870280735722, "grad_norm": 0.024684559553861618, "learning_rate": 9.168598480395651e-05, "loss": 0.0201, "step": 173 }, { "epoch": 1.3475314617618586, "grad_norm": 0.024830348789691925, "learning_rate": 9.153582666204701e-05, "loss": 0.0234, "step": 174 }, { "epoch": 1.3552758954501452, "grad_norm": 0.023022592067718506, "learning_rate": 9.138444990784453e-05, "loss": 0.016, "step": 175 }, { "epoch": 1.3552758954501452, "eval_loss": 0.017486225813627243, "eval_runtime": 4.8842, "eval_samples_per_second": 10.237, "eval_steps_per_second": 2.662, "step": 175 }, { "epoch": 1.3630203291384317, "grad_norm": 0.02616291493177414, "learning_rate": 9.123185898256496e-05, "loss": 0.0261, "step": 176 }, { "epoch": 1.3707647628267183, "grad_norm": 0.02299882471561432, "learning_rate": 9.107805836304658e-05, "loss": 0.0254, "step": 177 }, { "epoch": 1.378509196515005, "grad_norm": 0.018913911655545235, "learning_rate": 9.092305256161859e-05, "loss": 0.0124, "step": 178 }, { "epoch": 1.3862536302032913, "grad_norm": 0.02167947217822075, "learning_rate": 9.076684612596891e-05, "loss": 0.0232, "step": 179 }, { "epoch": 1.3939980638915779, "grad_norm": 0.02304757945239544, "learning_rate": 9.060944363901056e-05, "loss": 0.0268, "step": 180 }, { "epoch": 1.3939980638915779, "eval_loss": 0.01751082018017769, "eval_runtime": 4.8781, "eval_samples_per_second": 10.25, "eval_steps_per_second": 2.665, "step": 180 }, { "epoch": 1.4017424975798645, "grad_norm": 0.02488349750638008, "learning_rate": 9.045084971874738e-05, "loss": 0.0128, "step": 181 }, { "epoch": 1.409486931268151, "grad_norm": 0.025742027908563614, "learning_rate": 9.029106901813839e-05, "loss": 0.0243, "step": 182 }, { "epoch": 1.4172313649564376, "grad_norm": 0.020051000639796257, "learning_rate": 9.013010622496144e-05, "loss": 0.0106, "step": 183 }, { "epoch": 1.424975798644724, "grad_norm": 0.021976549178361893, "learning_rate": 8.996796606167548e-05, "loss": 0.0183, "step": 184 }, { "epoch": 1.4327202323330106, "grad_norm": 0.0210378710180521, "learning_rate": 8.980465328528219e-05, "loss": 0.0152, "step": 185 }, { "epoch": 1.4327202323330106, "eval_loss": 0.017743976786732674, "eval_runtime": 4.8802, "eval_samples_per_second": 10.246, "eval_steps_per_second": 2.664, "step": 185 }, { "epoch": 1.4404646660212972, "grad_norm": 0.02365756221115589, "learning_rate": 8.96401726871863e-05, "loss": 0.0114, "step": 186 }, { "epoch": 1.4482090997095838, "grad_norm": 0.025590112432837486, "learning_rate": 8.94745290930551e-05, "loss": 0.0189, "step": 187 }, { "epoch": 1.4559535333978704, "grad_norm": 0.029832618311047554, "learning_rate": 8.930772736267674e-05, "loss": 0.0324, "step": 188 }, { "epoch": 1.4636979670861567, "grad_norm": 0.025901637971401215, "learning_rate": 8.913977238981778e-05, "loss": 0.0186, "step": 189 }, { "epoch": 1.4714424007744433, "grad_norm": 0.01908070780336857, "learning_rate": 8.897066910207958e-05, "loss": 0.0279, "step": 190 }, { "epoch": 1.4714424007744433, "eval_loss": 0.017557693645358086, "eval_runtime": 4.8877, "eval_samples_per_second": 10.23, "eval_steps_per_second": 2.66, "step": 190 }, { "epoch": 1.47918683446273, "grad_norm": 0.025517305359244347, "learning_rate": 8.880042246075365e-05, "loss": 0.0279, "step": 191 }, { "epoch": 1.4869312681510165, "grad_norm": 0.019936546683311462, "learning_rate": 8.862903746067618e-05, "loss": 0.0172, "step": 192 }, { "epoch": 1.494675701839303, "grad_norm": 0.019224194809794426, "learning_rate": 8.845651913008145e-05, "loss": 0.0138, "step": 193 }, { "epoch": 1.5024201355275895, "grad_norm": 0.017969885841012, "learning_rate": 8.828287253045435e-05, "loss": 0.0151, "step": 194 }, { "epoch": 1.510164569215876, "grad_norm": 0.02093169093132019, "learning_rate": 8.810810275638183e-05, "loss": 0.0206, "step": 195 }, { "epoch": 1.510164569215876, "eval_loss": 0.017626546323299408, "eval_runtime": 4.9032, "eval_samples_per_second": 10.197, "eval_steps_per_second": 2.651, "step": 195 }, { "epoch": 1.5179090029041626, "grad_norm": 0.027407390996813774, "learning_rate": 8.793221493540347e-05, "loss": 0.0151, "step": 196 }, { "epoch": 1.5256534365924492, "grad_norm": 0.022155404090881348, "learning_rate": 8.775521422786104e-05, "loss": 0.0187, "step": 197 }, { "epoch": 1.5333978702807358, "grad_norm": 0.02126327157020569, "learning_rate": 8.757710582674707e-05, "loss": 0.0168, "step": 198 }, { "epoch": 1.5411423039690222, "grad_norm": 0.02067979797720909, "learning_rate": 8.739789495755253e-05, "loss": 0.015, "step": 199 }, { "epoch": 1.5488867376573088, "grad_norm": 0.023581981658935547, "learning_rate": 8.721758687811352e-05, "loss": 0.0196, "step": 200 }, { "epoch": 1.5488867376573088, "eval_loss": 0.017185786738991737, "eval_runtime": 4.8793, "eval_samples_per_second": 10.247, "eval_steps_per_second": 2.664, "step": 200 }, { "epoch": 1.5566311713455954, "grad_norm": 0.0208896417170763, "learning_rate": 8.703618687845696e-05, "loss": 0.0176, "step": 201 }, { "epoch": 1.5643756050338817, "grad_norm": 0.02558140642940998, "learning_rate": 8.685370028064546e-05, "loss": 0.0224, "step": 202 }, { "epoch": 1.5721200387221685, "grad_norm": 0.01860946975648403, "learning_rate": 8.667013243862113e-05, "loss": 0.0189, "step": 203 }, { "epoch": 1.579864472410455, "grad_norm": 0.024494647979736328, "learning_rate": 8.64854887380485e-05, "loss": 0.0204, "step": 204 }, { "epoch": 1.5876089060987415, "grad_norm": 0.028290973976254463, "learning_rate": 8.629977459615655e-05, "loss": 0.0262, "step": 205 }, { "epoch": 1.5876089060987415, "eval_loss": 0.016824763268232346, "eval_runtime": 4.88, "eval_samples_per_second": 10.246, "eval_steps_per_second": 2.664, "step": 205 }, { "epoch": 1.595353339787028, "grad_norm": 0.020388493314385414, "learning_rate": 8.611299546157974e-05, "loss": 0.0287, "step": 206 }, { "epoch": 1.6030977734753145, "grad_norm": 0.022215668112039566, "learning_rate": 8.592515681419813e-05, "loss": 0.0249, "step": 207 }, { "epoch": 1.6108422071636013, "grad_norm": 0.028934534639120102, "learning_rate": 8.573626416497668e-05, "loss": 0.0217, "step": 208 }, { "epoch": 1.6185866408518876, "grad_norm": 0.022588912397623062, "learning_rate": 8.554632305580354e-05, "loss": 0.0207, "step": 209 }, { "epoch": 1.6263310745401742, "grad_norm": 0.02324405126273632, "learning_rate": 8.535533905932738e-05, "loss": 0.0178, "step": 210 }, { "epoch": 1.6263310745401742, "eval_loss": 0.016888294368982315, "eval_runtime": 4.8771, "eval_samples_per_second": 10.252, "eval_steps_per_second": 2.665, "step": 210 }, { "epoch": 1.6340755082284608, "grad_norm": 0.023379050195217133, "learning_rate": 8.5163317778794e-05, "loss": 0.0227, "step": 211 }, { "epoch": 1.6418199419167472, "grad_norm": 0.024302620440721512, "learning_rate": 8.497026484788189e-05, "loss": 0.0279, "step": 212 }, { "epoch": 1.649564375605034, "grad_norm": 0.02425311878323555, "learning_rate": 8.477618593053693e-05, "loss": 0.02, "step": 213 }, { "epoch": 1.6573088092933204, "grad_norm": 0.0243984404951334, "learning_rate": 8.458108672080624e-05, "loss": 0.0255, "step": 214 }, { "epoch": 1.665053242981607, "grad_norm": 0.018734309822320938, "learning_rate": 8.438497294267117e-05, "loss": 0.011, "step": 215 }, { "epoch": 1.665053242981607, "eval_loss": 0.01664450205862522, "eval_runtime": 4.8808, "eval_samples_per_second": 10.244, "eval_steps_per_second": 2.663, "step": 215 }, { "epoch": 1.6727976766698935, "grad_norm": 0.019455671310424805, "learning_rate": 8.418785034987921e-05, "loss": 0.0175, "step": 216 }, { "epoch": 1.68054211035818, "grad_norm": 0.021629663184285164, "learning_rate": 8.39897247257754e-05, "loss": 0.0156, "step": 217 }, { "epoch": 1.6882865440464667, "grad_norm": 0.022207748144865036, "learning_rate": 8.379060188313244e-05, "loss": 0.0271, "step": 218 }, { "epoch": 1.696030977734753, "grad_norm": 0.021333666518330574, "learning_rate": 8.359048766398031e-05, "loss": 0.0223, "step": 219 }, { "epoch": 1.7037754114230397, "grad_norm": 0.021991191431879997, "learning_rate": 8.338938793943478e-05, "loss": 0.0128, "step": 220 }, { "epoch": 1.7037754114230397, "eval_loss": 0.016610655933618546, "eval_runtime": 4.8782, "eval_samples_per_second": 10.25, "eval_steps_per_second": 2.665, "step": 220 }, { "epoch": 1.7115198451113263, "grad_norm": 0.01750914379954338, "learning_rate": 8.318730860952522e-05, "loss": 0.0217, "step": 221 }, { "epoch": 1.7192642787996126, "grad_norm": 0.022801555693149567, "learning_rate": 8.298425560302146e-05, "loss": 0.0229, "step": 222 }, { "epoch": 1.7270087124878994, "grad_norm": 0.028667643666267395, "learning_rate": 8.278023487725982e-05, "loss": 0.0317, "step": 223 }, { "epoch": 1.7347531461761858, "grad_norm": 0.0247921384871006, "learning_rate": 8.257525241796838e-05, "loss": 0.0177, "step": 224 }, { "epoch": 1.7424975798644724, "grad_norm": 0.02079445868730545, "learning_rate": 8.236931423909138e-05, "loss": 0.0223, "step": 225 }, { "epoch": 1.7424975798644724, "eval_loss": 0.016715094447135925, "eval_runtime": 4.8828, "eval_samples_per_second": 10.24, "eval_steps_per_second": 2.662, "step": 225 }, { "epoch": 1.750242013552759, "grad_norm": 0.023619551211595535, "learning_rate": 8.216242638261276e-05, "loss": 0.0237, "step": 226 }, { "epoch": 1.7579864472410454, "grad_norm": 0.020713407546281815, "learning_rate": 8.19545949183788e-05, "loss": 0.0167, "step": 227 }, { "epoch": 1.7657308809293322, "grad_norm": 0.024574102833867073, "learning_rate": 8.17458259439202e-05, "loss": 0.0281, "step": 228 }, { "epoch": 1.7734753146176185, "grad_norm": 0.01983151212334633, "learning_rate": 8.153612558427311e-05, "loss": 0.0217, "step": 229 }, { "epoch": 1.7812197483059051, "grad_norm": 0.027135249227285385, "learning_rate": 8.132549999179933e-05, "loss": 0.0201, "step": 230 }, { "epoch": 1.7812197483059051, "eval_loss": 0.016661785542964935, "eval_runtime": 4.8807, "eval_samples_per_second": 10.244, "eval_steps_per_second": 2.664, "step": 230 }, { "epoch": 1.7889641819941917, "grad_norm": 0.021816475316882133, "learning_rate": 8.111395534600603e-05, "loss": 0.0166, "step": 231 }, { "epoch": 1.796708615682478, "grad_norm": 0.019049836322665215, "learning_rate": 8.090149785336425e-05, "loss": 0.0125, "step": 232 }, { "epoch": 1.804453049370765, "grad_norm": 0.023273281753063202, "learning_rate": 8.068813374712688e-05, "loss": 0.0295, "step": 233 }, { "epoch": 1.8121974830590513, "grad_norm": 0.02431442402303219, "learning_rate": 8.047386928714582e-05, "loss": 0.0193, "step": 234 }, { "epoch": 1.8199419167473379, "grad_norm": 0.02583279088139534, "learning_rate": 8.025871075968828e-05, "loss": 0.0239, "step": 235 }, { "epoch": 1.8199419167473379, "eval_loss": 0.016347970813512802, "eval_runtime": 4.883, "eval_samples_per_second": 10.24, "eval_steps_per_second": 2.662, "step": 235 }, { "epoch": 1.8276863504356244, "grad_norm": 0.021466901525855064, "learning_rate": 8.00426644772523e-05, "loss": 0.0226, "step": 236 }, { "epoch": 1.8354307841239108, "grad_norm": 0.02583594247698784, "learning_rate": 7.982573677838172e-05, "loss": 0.0113, "step": 237 }, { "epoch": 1.8431752178121976, "grad_norm": 0.02358117513358593, "learning_rate": 7.960793402748002e-05, "loss": 0.0292, "step": 238 }, { "epoch": 1.850919651500484, "grad_norm": 0.025698702782392502, "learning_rate": 7.938926261462366e-05, "loss": 0.0269, "step": 239 }, { "epoch": 1.8586640851887706, "grad_norm": 0.021297315135598183, "learning_rate": 7.916972895537471e-05, "loss": 0.0206, "step": 240 }, { "epoch": 1.8586640851887706, "eval_loss": 0.016880055889487267, "eval_runtime": 4.8849, "eval_samples_per_second": 10.236, "eval_steps_per_second": 2.661, "step": 240 }, { "epoch": 1.8664085188770572, "grad_norm": 0.02742616832256317, "learning_rate": 7.894933949059245e-05, "loss": 0.0266, "step": 241 }, { "epoch": 1.8741529525653435, "grad_norm": 0.029985694214701653, "learning_rate": 7.872810068624451e-05, "loss": 0.0209, "step": 242 }, { "epoch": 1.8818973862536303, "grad_norm": 0.01984225958585739, "learning_rate": 7.850601903321716e-05, "loss": 0.0112, "step": 243 }, { "epoch": 1.8896418199419167, "grad_norm": 0.028832539916038513, "learning_rate": 7.828310104712489e-05, "loss": 0.0176, "step": 244 }, { "epoch": 1.8973862536302033, "grad_norm": 0.025244107469916344, "learning_rate": 7.805935326811912e-05, "loss": 0.0209, "step": 245 }, { "epoch": 1.8973862536302033, "eval_loss": 0.016251368448138237, "eval_runtime": 4.8854, "eval_samples_per_second": 10.235, "eval_steps_per_second": 2.661, "step": 245 }, { "epoch": 1.90513068731849, "grad_norm": 0.019776510074734688, "learning_rate": 7.783478226069651e-05, "loss": 0.0146, "step": 246 }, { "epoch": 1.9128751210067763, "grad_norm": 0.030150357633829117, "learning_rate": 7.760939461350623e-05, "loss": 0.0205, "step": 247 }, { "epoch": 1.920619554695063, "grad_norm": 0.02409055270254612, "learning_rate": 7.738319693915672e-05, "loss": 0.0209, "step": 248 }, { "epoch": 1.9283639883833494, "grad_norm": 0.02473391965031624, "learning_rate": 7.715619587402164e-05, "loss": 0.0169, "step": 249 }, { "epoch": 1.936108422071636, "grad_norm": 0.028100404888391495, "learning_rate": 7.692839807804521e-05, "loss": 0.0171, "step": 250 }, { "epoch": 1.936108422071636, "eval_loss": 0.016126085072755814, "eval_runtime": 4.8878, "eval_samples_per_second": 10.23, "eval_steps_per_second": 2.66, "step": 250 }, { "epoch": 1.9438528557599226, "grad_norm": 0.031069206073880196, "learning_rate": 7.669981023454682e-05, "loss": 0.0346, "step": 251 }, { "epoch": 1.951597289448209, "grad_norm": 0.020763061940670013, "learning_rate": 7.647043905002484e-05, "loss": 0.0168, "step": 252 }, { "epoch": 1.9593417231364958, "grad_norm": 0.021877290681004524, "learning_rate": 7.624029125396004e-05, "loss": 0.0276, "step": 253 }, { "epoch": 1.9670861568247822, "grad_norm": 0.023641012609004974, "learning_rate": 7.6009373598618e-05, "loss": 0.0182, "step": 254 }, { "epoch": 1.9748305905130688, "grad_norm": 0.025783414021134377, "learning_rate": 7.577769285885109e-05, "loss": 0.022, "step": 255 }, { "epoch": 1.9748305905130688, "eval_loss": 0.015866845846176147, "eval_runtime": 4.8957, "eval_samples_per_second": 10.213, "eval_steps_per_second": 2.655, "step": 255 }, { "epoch": 1.9825750242013553, "grad_norm": 0.022825462743639946, "learning_rate": 7.554525583189969e-05, "loss": 0.0184, "step": 256 }, { "epoch": 1.9903194578896417, "grad_norm": 0.024429945275187492, "learning_rate": 7.53120693371927e-05, "loss": 0.0196, "step": 257 }, { "epoch": 1.9980638915779285, "grad_norm": 0.0280454121530056, "learning_rate": 7.507814021614761e-05, "loss": 0.0297, "step": 258 }, { "epoch": 2.005808325266215, "grad_norm": 0.04602880775928497, "learning_rate": 7.484347533196961e-05, "loss": 0.0211, "step": 259 }, { "epoch": 2.0135527589545013, "grad_norm": 0.01826930046081543, "learning_rate": 7.460808156945036e-05, "loss": 0.0162, "step": 260 }, { "epoch": 2.0135527589545013, "eval_loss": 0.015875546261668205, "eval_runtime": 4.8819, "eval_samples_per_second": 10.242, "eval_steps_per_second": 2.663, "step": 260 }, { "epoch": 2.021297192642788, "grad_norm": 0.018936650827527046, "learning_rate": 7.437196583476596e-05, "loss": 0.0169, "step": 261 }, { "epoch": 2.0290416263310744, "grad_norm": 0.02147481217980385, "learning_rate": 7.413513505527429e-05, "loss": 0.0142, "step": 262 }, { "epoch": 2.0367860600193612, "grad_norm": 0.020604653283953667, "learning_rate": 7.389759617931182e-05, "loss": 0.0115, "step": 263 }, { "epoch": 2.0445304937076476, "grad_norm": 0.021933911368250847, "learning_rate": 7.365935617598975e-05, "loss": 0.0134, "step": 264 }, { "epoch": 2.052274927395934, "grad_norm": 0.02122250571846962, "learning_rate": 7.342042203498951e-05, "loss": 0.0185, "step": 265 }, { "epoch": 2.052274927395934, "eval_loss": 0.01603526994585991, "eval_runtime": 4.9059, "eval_samples_per_second": 10.192, "eval_steps_per_second": 2.65, "step": 265 }, { "epoch": 2.060019361084221, "grad_norm": 0.018767178058624268, "learning_rate": 7.318080076635772e-05, "loss": 0.0087, "step": 266 }, { "epoch": 2.067763794772507, "grad_norm": 0.01828618347644806, "learning_rate": 7.294049940030055e-05, "loss": 0.0088, "step": 267 }, { "epoch": 2.075508228460794, "grad_norm": 0.029488379135727882, "learning_rate": 7.269952498697734e-05, "loss": 0.0148, "step": 268 }, { "epoch": 2.0832526621490803, "grad_norm": 0.028726164251565933, "learning_rate": 7.245788459629396e-05, "loss": 0.0226, "step": 269 }, { "epoch": 2.0909970958373667, "grad_norm": 0.03607122600078583, "learning_rate": 7.221558531769519e-05, "loss": 0.0185, "step": 270 }, { "epoch": 2.0909970958373667, "eval_loss": 0.01613912731409073, "eval_runtime": 4.8866, "eval_samples_per_second": 10.232, "eval_steps_per_second": 2.66, "step": 270 }, { "epoch": 2.0987415295256535, "grad_norm": 0.02318711020052433, "learning_rate": 7.197263425995682e-05, "loss": 0.0187, "step": 271 }, { "epoch": 2.10648596321394, "grad_norm": 0.027442490682005882, "learning_rate": 7.172903855097711e-05, "loss": 0.0185, "step": 272 }, { "epoch": 2.1142303969022267, "grad_norm": 0.02113383449614048, "learning_rate": 7.14848053375676e-05, "loss": 0.0162, "step": 273 }, { "epoch": 2.121974830590513, "grad_norm": 0.02109163999557495, "learning_rate": 7.123994178524345e-05, "loss": 0.0189, "step": 274 }, { "epoch": 2.1297192642787994, "grad_norm": 0.018890704959630966, "learning_rate": 7.099445507801323e-05, "loss": 0.0196, "step": 275 }, { "epoch": 2.1297192642787994, "eval_loss": 0.016141431406140327, "eval_runtime": 4.8795, "eval_samples_per_second": 10.247, "eval_steps_per_second": 2.664, "step": 275 }, { "epoch": 2.1374636979670862, "grad_norm": 0.026332931593060493, "learning_rate": 7.074835241816817e-05, "loss": 0.029, "step": 276 }, { "epoch": 2.1452081316553726, "grad_norm": 0.02275455929338932, "learning_rate": 7.05016410260708e-05, "loss": 0.0156, "step": 277 }, { "epoch": 2.1529525653436594, "grad_norm": 0.022596005350351334, "learning_rate": 7.025432813994315e-05, "loss": 0.0184, "step": 278 }, { "epoch": 2.160696999031946, "grad_norm": 0.020018640905618668, "learning_rate": 7.000642101565434e-05, "loss": 0.0107, "step": 279 }, { "epoch": 2.168441432720232, "grad_norm": 0.025625359266996384, "learning_rate": 6.975792692650777e-05, "loss": 0.0146, "step": 280 }, { "epoch": 2.168441432720232, "eval_loss": 0.015940353274345398, "eval_runtime": 4.9128, "eval_samples_per_second": 10.178, "eval_steps_per_second": 2.646, "step": 280 }, { "epoch": 2.176185866408519, "grad_norm": 0.026554979383945465, "learning_rate": 6.950885316302773e-05, "loss": 0.0213, "step": 281 }, { "epoch": 2.1839303000968053, "grad_norm": 0.023344026878476143, "learning_rate": 6.925920703274541e-05, "loss": 0.0176, "step": 282 }, { "epoch": 2.191674733785092, "grad_norm": 0.03146139904856682, "learning_rate": 6.90089958599846e-05, "loss": 0.0243, "step": 283 }, { "epoch": 2.1994191674733785, "grad_norm": 0.02688729763031006, "learning_rate": 6.875822698564679e-05, "loss": 0.0235, "step": 284 }, { "epoch": 2.207163601161665, "grad_norm": 0.017707915976643562, "learning_rate": 6.850690776699573e-05, "loss": 0.0091, "step": 285 }, { "epoch": 2.207163601161665, "eval_loss": 0.015938647091388702, "eval_runtime": 4.8821, "eval_samples_per_second": 10.241, "eval_steps_per_second": 2.663, "step": 285 }, { "epoch": 2.2149080348499517, "grad_norm": 0.02426217496395111, "learning_rate": 6.825504557744167e-05, "loss": 0.0222, "step": 286 }, { "epoch": 2.222652468538238, "grad_norm": 0.017933079972863197, "learning_rate": 6.800264780632494e-05, "loss": 0.0127, "step": 287 }, { "epoch": 2.230396902226525, "grad_norm": 0.02196042612195015, "learning_rate": 6.774972185869927e-05, "loss": 0.013, "step": 288 }, { "epoch": 2.2381413359148112, "grad_norm": 0.02711823582649231, "learning_rate": 6.749627515511442e-05, "loss": 0.0198, "step": 289 }, { "epoch": 2.2458857696030976, "grad_norm": 0.01899660937488079, "learning_rate": 6.724231513139852e-05, "loss": 0.0106, "step": 290 }, { "epoch": 2.2458857696030976, "eval_loss": 0.015821926295757294, "eval_runtime": 4.8849, "eval_samples_per_second": 10.236, "eval_steps_per_second": 2.661, "step": 290 }, { "epoch": 2.2536302032913844, "grad_norm": 0.02587137557566166, "learning_rate": 6.698784923843992e-05, "loss": 0.0204, "step": 291 }, { "epoch": 2.261374636979671, "grad_norm": 0.02532321773469448, "learning_rate": 6.673288494196858e-05, "loss": 0.0191, "step": 292 }, { "epoch": 2.2691190706679576, "grad_norm": 0.03079635463654995, "learning_rate": 6.647742972233703e-05, "loss": 0.0205, "step": 293 }, { "epoch": 2.276863504356244, "grad_norm": 0.023865051567554474, "learning_rate": 6.622149107430088e-05, "loss": 0.0151, "step": 294 }, { "epoch": 2.2846079380445303, "grad_norm": 0.02512257918715477, "learning_rate": 6.5965076506799e-05, "loss": 0.014, "step": 295 }, { "epoch": 2.2846079380445303, "eval_loss": 0.015925200656056404, "eval_runtime": 4.8773, "eval_samples_per_second": 10.251, "eval_steps_per_second": 2.665, "step": 295 }, { "epoch": 2.292352371732817, "grad_norm": 0.026422763243317604, "learning_rate": 6.570819354273317e-05, "loss": 0.0173, "step": 296 }, { "epoch": 2.3000968054211035, "grad_norm": 0.02848372980952263, "learning_rate": 6.545084971874738e-05, "loss": 0.0251, "step": 297 }, { "epoch": 2.3078412391093903, "grad_norm": 0.02191309630870819, "learning_rate": 6.519305258500666e-05, "loss": 0.0104, "step": 298 }, { "epoch": 2.3155856727976767, "grad_norm": 0.025703053921461105, "learning_rate": 6.493480970497569e-05, "loss": 0.0311, "step": 299 }, { "epoch": 2.323330106485963, "grad_norm": 0.021763848140835762, "learning_rate": 6.467612865519674e-05, "loss": 0.0168, "step": 300 }, { "epoch": 2.323330106485963, "eval_loss": 0.01583768054842949, "eval_runtime": 4.8796, "eval_samples_per_second": 10.247, "eval_steps_per_second": 2.664, "step": 300 }, { "epoch": 2.33107454017425, "grad_norm": 0.01981600932776928, "learning_rate": 6.441701702506754e-05, "loss": 0.0174, "step": 301 }, { "epoch": 2.3388189738625362, "grad_norm": 0.021816400811076164, "learning_rate": 6.415748241661851e-05, "loss": 0.0222, "step": 302 }, { "epoch": 2.346563407550823, "grad_norm": 0.028364678844809532, "learning_rate": 6.389753244428972e-05, "loss": 0.0222, "step": 303 }, { "epoch": 2.3543078412391094, "grad_norm": 0.03110797517001629, "learning_rate": 6.363717473470759e-05, "loss": 0.0194, "step": 304 }, { "epoch": 2.362052274927396, "grad_norm": 0.03083011880517006, "learning_rate": 6.337641692646106e-05, "loss": 0.0217, "step": 305 }, { "epoch": 2.362052274927396, "eval_loss": 0.01598162204027176, "eval_runtime": 4.8805, "eval_samples_per_second": 10.245, "eval_steps_per_second": 2.664, "step": 305 }, { "epoch": 2.3697967086156826, "grad_norm": 0.027600981295108795, "learning_rate": 6.311526666987743e-05, "loss": 0.0168, "step": 306 }, { "epoch": 2.377541142303969, "grad_norm": 0.050711363554000854, "learning_rate": 6.285373162679803e-05, "loss": 0.027, "step": 307 }, { "epoch": 2.3852855759922553, "grad_norm": 0.0258706696331501, "learning_rate": 6.259181947035342e-05, "loss": 0.014, "step": 308 }, { "epoch": 2.393030009680542, "grad_norm": 0.022878140211105347, "learning_rate": 6.232953788473811e-05, "loss": 0.0125, "step": 309 }, { "epoch": 2.4007744433688285, "grad_norm": 0.02646121010184288, "learning_rate": 6.206689456498529e-05, "loss": 0.0225, "step": 310 }, { "epoch": 2.4007744433688285, "eval_loss": 0.015688462182879448, "eval_runtime": 4.8894, "eval_samples_per_second": 10.226, "eval_steps_per_second": 2.659, "step": 310 }, { "epoch": 2.4085188770571153, "grad_norm": 0.01907186210155487, "learning_rate": 6.1803897216741e-05, "loss": 0.0105, "step": 311 }, { "epoch": 2.4162633107454017, "grad_norm": 0.025598157197237015, "learning_rate": 6.154055355603807e-05, "loss": 0.0195, "step": 312 }, { "epoch": 2.4240077444336885, "grad_norm": 0.021488605067133904, "learning_rate": 6.127687130906972e-05, "loss": 0.0171, "step": 313 }, { "epoch": 2.431752178121975, "grad_norm": 0.023560060188174248, "learning_rate": 6.101285821196285e-05, "loss": 0.0234, "step": 314 }, { "epoch": 2.4394966118102612, "grad_norm": 0.020358163863420486, "learning_rate": 6.0748522010551215e-05, "loss": 0.0158, "step": 315 }, { "epoch": 2.4394966118102612, "eval_loss": 0.015287678688764572, "eval_runtime": 4.884, "eval_samples_per_second": 10.237, "eval_steps_per_second": 2.662, "step": 315 }, { "epoch": 2.447241045498548, "grad_norm": 0.04023784399032593, "learning_rate": 6.048387046014795e-05, "loss": 0.0195, "step": 316 }, { "epoch": 2.4549854791868344, "grad_norm": 0.018253512680530548, "learning_rate": 6.021891132531825e-05, "loss": 0.0172, "step": 317 }, { "epoch": 2.4627299128751208, "grad_norm": 0.020507492125034332, "learning_rate": 5.995365237965144e-05, "loss": 0.0234, "step": 318 }, { "epoch": 2.4704743465634076, "grad_norm": 0.025176333263516426, "learning_rate": 5.9688101405532925e-05, "loss": 0.0196, "step": 319 }, { "epoch": 2.478218780251694, "grad_norm": 0.022779326885938644, "learning_rate": 5.9422266193915924e-05, "loss": 0.0122, "step": 320 }, { "epoch": 2.478218780251694, "eval_loss": 0.015223703347146511, "eval_runtime": 4.8811, "eval_samples_per_second": 10.244, "eval_steps_per_second": 2.663, "step": 320 }, { "epoch": 2.4859632139399808, "grad_norm": 0.019654158502817154, "learning_rate": 5.9156154544092815e-05, "loss": 0.0191, "step": 321 }, { "epoch": 2.493707647628267, "grad_norm": 0.01823735609650612, "learning_rate": 5.8889774263466355e-05, "loss": 0.0128, "step": 322 }, { "epoch": 2.501452081316554, "grad_norm": 0.022733347490429878, "learning_rate": 5.862313316732063e-05, "loss": 0.0095, "step": 323 }, { "epoch": 2.5091965150048403, "grad_norm": 0.019566858187317848, "learning_rate": 5.8356239078591724e-05, "loss": 0.012, "step": 324 }, { "epoch": 2.5169409486931267, "grad_norm": 0.023728664964437485, "learning_rate": 5.808909982763825e-05, "loss": 0.0152, "step": 325 }, { "epoch": 2.5169409486931267, "eval_loss": 0.01537258829921484, "eval_runtime": 4.8868, "eval_samples_per_second": 10.232, "eval_steps_per_second": 2.66, "step": 325 }, { "epoch": 2.5246853823814135, "grad_norm": 0.026009773835539818, "learning_rate": 5.782172325201155e-05, "loss": 0.0158, "step": 326 }, { "epoch": 2.5324298160697, "grad_norm": 0.045942921191453934, "learning_rate": 5.7554117196225846e-05, "loss": 0.0304, "step": 327 }, { "epoch": 2.5401742497579862, "grad_norm": 0.017686696723103523, "learning_rate": 5.728628951152799e-05, "loss": 0.0157, "step": 328 }, { "epoch": 2.547918683446273, "grad_norm": 0.020913394168019295, "learning_rate": 5.701824805566722e-05, "loss": 0.0162, "step": 329 }, { "epoch": 2.5556631171345594, "grad_norm": 0.025631655007600784, "learning_rate": 5.675000069266451e-05, "loss": 0.0268, "step": 330 }, { "epoch": 2.5556631171345594, "eval_loss": 0.015396489761769772, "eval_runtime": 4.8797, "eval_samples_per_second": 10.246, "eval_steps_per_second": 2.664, "step": 330 }, { "epoch": 2.563407550822846, "grad_norm": 0.021823951974511147, "learning_rate": 5.6481555292581946e-05, "loss": 0.0116, "step": 331 }, { "epoch": 2.5711519845111326, "grad_norm": 0.023217862471938133, "learning_rate": 5.621291973129177e-05, "loss": 0.0151, "step": 332 }, { "epoch": 2.5788964181994194, "grad_norm": 0.03442602604627609, "learning_rate": 5.5944101890245324e-05, "loss": 0.0202, "step": 333 }, { "epoch": 2.5866408518877058, "grad_norm": 0.023536914959549904, "learning_rate": 5.5675109656241876e-05, "loss": 0.014, "step": 334 }, { "epoch": 2.594385285575992, "grad_norm": 0.026387905701994896, "learning_rate": 5.540595092119709e-05, "loss": 0.0174, "step": 335 }, { "epoch": 2.594385285575992, "eval_loss": 0.01569586619734764, "eval_runtime": 4.8895, "eval_samples_per_second": 10.226, "eval_steps_per_second": 2.659, "step": 335 }, { "epoch": 2.602129719264279, "grad_norm": 0.02376389689743519, "learning_rate": 5.5136633581911655e-05, "loss": 0.0232, "step": 336 }, { "epoch": 2.6098741529525653, "grad_norm": 0.022475535050034523, "learning_rate": 5.486716553983951e-05, "loss": 0.0176, "step": 337 }, { "epoch": 2.6176185866408517, "grad_norm": 0.026273801922798157, "learning_rate": 5.4597554700855946e-05, "loss": 0.0099, "step": 338 }, { "epoch": 2.6253630203291385, "grad_norm": 0.0252407044172287, "learning_rate": 5.432780897502589e-05, "loss": 0.0169, "step": 339 }, { "epoch": 2.633107454017425, "grad_norm": 0.025699293240904808, "learning_rate": 5.4057936276371565e-05, "loss": 0.0147, "step": 340 }, { "epoch": 2.633107454017425, "eval_loss": 0.015603473410010338, "eval_runtime": 4.8875, "eval_samples_per_second": 10.23, "eval_steps_per_second": 2.66, "step": 340 }, { "epoch": 2.6408518877057117, "grad_norm": 0.02292807772755623, "learning_rate": 5.378794452264053e-05, "loss": 0.0112, "step": 341 }, { "epoch": 2.648596321393998, "grad_norm": 0.02671566605567932, "learning_rate": 5.351784163507319e-05, "loss": 0.0157, "step": 342 }, { "epoch": 2.656340755082285, "grad_norm": 0.024869635701179504, "learning_rate": 5.324763553817054e-05, "loss": 0.0183, "step": 343 }, { "epoch": 2.664085188770571, "grad_norm": 0.030287204310297966, "learning_rate": 5.2977334159461614e-05, "loss": 0.0235, "step": 344 }, { "epoch": 2.6718296224588576, "grad_norm": 0.021120263263583183, "learning_rate": 5.270694542927088e-05, "loss": 0.0191, "step": 345 }, { "epoch": 2.6718296224588576, "eval_loss": 0.015455065295100212, "eval_runtime": 4.8759, "eval_samples_per_second": 10.254, "eval_steps_per_second": 2.666, "step": 345 }, { "epoch": 2.6795740561471444, "grad_norm": 0.022198256105184555, "learning_rate": 5.2436477280485605e-05, "loss": 0.017, "step": 346 }, { "epoch": 2.6873184898354308, "grad_norm": 0.02474604733288288, "learning_rate": 5.216593764832311e-05, "loss": 0.0182, "step": 347 }, { "epoch": 2.695062923523717, "grad_norm": 0.022626683115959167, "learning_rate": 5.189533447009794e-05, "loss": 0.0235, "step": 348 }, { "epoch": 2.702807357212004, "grad_norm": 0.025306569412350655, "learning_rate": 5.162467568498903e-05, "loss": 0.0185, "step": 349 }, { "epoch": 2.7105517909002903, "grad_norm": 0.01958346739411354, "learning_rate": 5.135396923380673e-05, "loss": 0.0139, "step": 350 }, { "epoch": 2.7105517909002903, "eval_loss": 0.015315129421651363, "eval_runtime": 4.8828, "eval_samples_per_second": 10.24, "eval_steps_per_second": 2.662, "step": 350 }, { "epoch": 2.718296224588577, "grad_norm": 0.028099266812205315, "learning_rate": 5.108322305875988e-05, "loss": 0.0151, "step": 351 }, { "epoch": 2.7260406582768635, "grad_norm": 0.026032108813524246, "learning_rate": 5.081244510322274e-05, "loss": 0.0143, "step": 352 }, { "epoch": 2.7337850919651503, "grad_norm": 0.030373040586709976, "learning_rate": 5.0541643311502e-05, "loss": 0.0177, "step": 353 }, { "epoch": 2.7415295256534367, "grad_norm": 0.026800263673067093, "learning_rate": 5.027082562860368e-05, "loss": 0.0146, "step": 354 }, { "epoch": 2.749273959341723, "grad_norm": 0.028782140463590622, "learning_rate": 5e-05, "loss": 0.026, "step": 355 }, { "epoch": 2.749273959341723, "eval_loss": 0.015001767314970493, "eval_runtime": 4.8958, "eval_samples_per_second": 10.213, "eval_steps_per_second": 2.655, "step": 355 }, { "epoch": 2.75701839303001, "grad_norm": 0.023763621225953102, "learning_rate": 4.9729174371396334e-05, "loss": 0.0138, "step": 356 }, { "epoch": 2.764762826718296, "grad_norm": 0.02057846635580063, "learning_rate": 4.945835668849801e-05, "loss": 0.0101, "step": 357 }, { "epoch": 2.7725072604065826, "grad_norm": 0.026699546724557877, "learning_rate": 4.9187554896777285e-05, "loss": 0.0185, "step": 358 }, { "epoch": 2.7802516940948694, "grad_norm": 0.025631215423345566, "learning_rate": 4.8916776941240135e-05, "loss": 0.0177, "step": 359 }, { "epoch": 2.7879961277831558, "grad_norm": 0.020701708272099495, "learning_rate": 4.8646030766193285e-05, "loss": 0.0162, "step": 360 }, { "epoch": 2.7879961277831558, "eval_loss": 0.014788495376706123, "eval_runtime": 4.885, "eval_samples_per_second": 10.235, "eval_steps_per_second": 2.661, "step": 360 }, { "epoch": 2.7957405614714426, "grad_norm": 0.018802624195814133, "learning_rate": 4.837532431501098e-05, "loss": 0.0195, "step": 361 }, { "epoch": 2.803484995159729, "grad_norm": 0.024294838309288025, "learning_rate": 4.8104665529902075e-05, "loss": 0.0172, "step": 362 }, { "epoch": 2.8112294288480157, "grad_norm": 0.02249518595635891, "learning_rate": 4.78340623516769e-05, "loss": 0.0157, "step": 363 }, { "epoch": 2.818973862536302, "grad_norm": 0.022549943998456, "learning_rate": 4.756352271951441e-05, "loss": 0.0167, "step": 364 }, { "epoch": 2.8267182962245885, "grad_norm": 0.03274448588490486, "learning_rate": 4.729305457072913e-05, "loss": 0.0258, "step": 365 }, { "epoch": 2.8267182962245885, "eval_loss": 0.014879841357469559, "eval_runtime": 4.8948, "eval_samples_per_second": 10.215, "eval_steps_per_second": 2.656, "step": 365 }, { "epoch": 2.8344627299128753, "grad_norm": 0.031107768416404724, "learning_rate": 4.70226658405384e-05, "loss": 0.0167, "step": 366 }, { "epoch": 2.8422071636011617, "grad_norm": 0.023017307743430138, "learning_rate": 4.675236446182946e-05, "loss": 0.0126, "step": 367 }, { "epoch": 2.849951597289448, "grad_norm": 0.03121495246887207, "learning_rate": 4.648215836492682e-05, "loss": 0.0139, "step": 368 }, { "epoch": 2.857696030977735, "grad_norm": 0.026987893506884575, "learning_rate": 4.6212055477359486e-05, "loss": 0.0147, "step": 369 }, { "epoch": 2.865440464666021, "grad_norm": 0.024263298138976097, "learning_rate": 4.594206372362845e-05, "loss": 0.0154, "step": 370 }, { "epoch": 2.865440464666021, "eval_loss": 0.014814168214797974, "eval_runtime": 4.8923, "eval_samples_per_second": 10.22, "eval_steps_per_second": 2.657, "step": 370 }, { "epoch": 2.8731848983543076, "grad_norm": 0.022974541410803795, "learning_rate": 4.567219102497412e-05, "loss": 0.0136, "step": 371 }, { "epoch": 2.8809293320425944, "grad_norm": 0.025871610268950462, "learning_rate": 4.540244529914406e-05, "loss": 0.0126, "step": 372 }, { "epoch": 2.888673765730881, "grad_norm": 0.026091424748301506, "learning_rate": 4.5132834460160524e-05, "loss": 0.023, "step": 373 }, { "epoch": 2.8964181994191676, "grad_norm": 0.024125855416059494, "learning_rate": 4.486336641808835e-05, "loss": 0.0129, "step": 374 }, { "epoch": 2.904162633107454, "grad_norm": 0.01973029226064682, "learning_rate": 4.4594049078802925e-05, "loss": 0.0166, "step": 375 }, { "epoch": 2.904162633107454, "eval_loss": 0.01432761363685131, "eval_runtime": 4.8944, "eval_samples_per_second": 10.216, "eval_steps_per_second": 2.656, "step": 375 }, { "epoch": 2.9119070667957407, "grad_norm": 0.022474128752946854, "learning_rate": 4.4324890343758136e-05, "loss": 0.0115, "step": 376 }, { "epoch": 2.919651500484027, "grad_norm": 0.022197918966412544, "learning_rate": 4.405589810975468e-05, "loss": 0.0108, "step": 377 }, { "epoch": 2.9273959341723135, "grad_norm": 0.023376472294330597, "learning_rate": 4.3787080268708244e-05, "loss": 0.0105, "step": 378 }, { "epoch": 2.9351403678606003, "grad_norm": 0.01602279581129551, "learning_rate": 4.351844470741808e-05, "loss": 0.0094, "step": 379 }, { "epoch": 2.9428848015488867, "grad_norm": 0.02684823051095009, "learning_rate": 4.3249999307335495e-05, "loss": 0.0189, "step": 380 }, { "epoch": 2.9428848015488867, "eval_loss": 0.014240576885640621, "eval_runtime": 4.8817, "eval_samples_per_second": 10.242, "eval_steps_per_second": 2.663, "step": 380 }, { "epoch": 2.950629235237173, "grad_norm": 0.0212652999907732, "learning_rate": 4.298175194433279e-05, "loss": 0.0154, "step": 381 }, { "epoch": 2.95837366892546, "grad_norm": 0.019883181899785995, "learning_rate": 4.2713710488472006e-05, "loss": 0.0087, "step": 382 }, { "epoch": 2.9661181026137466, "grad_norm": 0.02650902420282364, "learning_rate": 4.244588280377417e-05, "loss": 0.0164, "step": 383 }, { "epoch": 2.973862536302033, "grad_norm": 0.02401239052414894, "learning_rate": 4.2178276747988446e-05, "loss": 0.0139, "step": 384 }, { "epoch": 2.9816069699903194, "grad_norm": 0.022838260978460312, "learning_rate": 4.1910900172361764e-05, "loss": 0.0155, "step": 385 }, { "epoch": 2.9816069699903194, "eval_loss": 0.0144858593121171, "eval_runtime": 4.8906, "eval_samples_per_second": 10.224, "eval_steps_per_second": 2.658, "step": 385 }, { "epoch": 2.989351403678606, "grad_norm": 0.03657938912510872, "learning_rate": 4.164376092140828e-05, "loss": 0.0286, "step": 386 }, { "epoch": 2.9970958373668926, "grad_norm": 0.02792074717581272, "learning_rate": 4.1376866832679385e-05, "loss": 0.014, "step": 387 }, { "epoch": 3.004840271055179, "grad_norm": 0.05196017026901245, "learning_rate": 4.1110225736533664e-05, "loss": 0.0222, "step": 388 }, { "epoch": 3.0125847047434657, "grad_norm": 0.0229202788323164, "learning_rate": 4.084384545590719e-05, "loss": 0.007, "step": 389 }, { "epoch": 3.020329138431752, "grad_norm": 0.021996086463332176, "learning_rate": 4.057773380608411e-05, "loss": 0.0121, "step": 390 }, { "epoch": 3.020329138431752, "eval_loss": 0.014621075242757797, "eval_runtime": 4.8766, "eval_samples_per_second": 10.253, "eval_steps_per_second": 2.666, "step": 390 }, { "epoch": 3.028073572120039, "grad_norm": 0.024300433695316315, "learning_rate": 4.0311898594467086e-05, "loss": 0.0119, "step": 391 }, { "epoch": 3.0358180058083253, "grad_norm": 0.023426620289683342, "learning_rate": 4.0046347620348586e-05, "loss": 0.0123, "step": 392 }, { "epoch": 3.0435624394966116, "grad_norm": 0.024129556491971016, "learning_rate": 3.9781088674681764e-05, "loss": 0.0124, "step": 393 }, { "epoch": 3.0513068731848985, "grad_norm": 0.04510955512523651, "learning_rate": 3.951612953985207e-05, "loss": 0.0174, "step": 394 }, { "epoch": 3.059051306873185, "grad_norm": 0.02260909229516983, "learning_rate": 3.92514779894488e-05, "loss": 0.0122, "step": 395 }, { "epoch": 3.059051306873185, "eval_loss": 0.014701277017593384, "eval_runtime": 4.8794, "eval_samples_per_second": 10.247, "eval_steps_per_second": 2.664, "step": 395 }, { "epoch": 3.0667957405614716, "grad_norm": 0.020229579880833626, "learning_rate": 3.8987141788037154e-05, "loss": 0.0063, "step": 396 }, { "epoch": 3.074540174249758, "grad_norm": 0.024916259571909904, "learning_rate": 3.8723128690930296e-05, "loss": 0.0099, "step": 397 }, { "epoch": 3.0822846079380444, "grad_norm": 0.017238672822713852, "learning_rate": 3.8459446443961944e-05, "loss": 0.0071, "step": 398 }, { "epoch": 3.090029041626331, "grad_norm": 0.028883591294288635, "learning_rate": 3.8196102783258994e-05, "loss": 0.0181, "step": 399 }, { "epoch": 3.0977734753146176, "grad_norm": 0.025792468339204788, "learning_rate": 3.793310543501473e-05, "loss": 0.0136, "step": 400 }, { "epoch": 3.0977734753146176, "eval_loss": 0.014834250323474407, "eval_runtime": 4.8859, "eval_samples_per_second": 10.234, "eval_steps_per_second": 2.661, "step": 400 }, { "epoch": 3.1055179090029044, "grad_norm": 0.03113100863993168, "learning_rate": 3.7670462115261906e-05, "loss": 0.0193, "step": 401 }, { "epoch": 3.1132623426911907, "grad_norm": 0.02263321541249752, "learning_rate": 3.7408180529646596e-05, "loss": 0.0123, "step": 402 }, { "epoch": 3.121006776379477, "grad_norm": 0.023540707305073738, "learning_rate": 3.714626837320195e-05, "loss": 0.0119, "step": 403 }, { "epoch": 3.128751210067764, "grad_norm": 0.031784623861312866, "learning_rate": 3.688473333012259e-05, "loss": 0.0175, "step": 404 }, { "epoch": 3.1364956437560503, "grad_norm": 0.022701062262058258, "learning_rate": 3.6623583073538966e-05, "loss": 0.0107, "step": 405 }, { "epoch": 3.1364956437560503, "eval_loss": 0.01486950647085905, "eval_runtime": 4.892, "eval_samples_per_second": 10.221, "eval_steps_per_second": 2.657, "step": 405 }, { "epoch": 3.144240077444337, "grad_norm": 0.026784732937812805, "learning_rate": 3.636282526529242e-05, "loss": 0.0125, "step": 406 }, { "epoch": 3.1519845111326235, "grad_norm": 0.026719210669398308, "learning_rate": 3.6102467555710295e-05, "loss": 0.0103, "step": 407 }, { "epoch": 3.15972894482091, "grad_norm": 0.03489716723561287, "learning_rate": 3.584251758338151e-05, "loss": 0.0134, "step": 408 }, { "epoch": 3.1674733785091966, "grad_norm": 0.02056041732430458, "learning_rate": 3.558298297493247e-05, "loss": 0.0073, "step": 409 }, { "epoch": 3.175217812197483, "grad_norm": 0.030753985047340393, "learning_rate": 3.5323871344803263e-05, "loss": 0.0164, "step": 410 }, { "epoch": 3.175217812197483, "eval_loss": 0.01476968638598919, "eval_runtime": 4.8875, "eval_samples_per_second": 10.23, "eval_steps_per_second": 2.66, "step": 410 }, { "epoch": 3.1829622458857694, "grad_norm": 0.025167269632220268, "learning_rate": 3.506519029502433e-05, "loss": 0.0121, "step": 411 }, { "epoch": 3.190706679574056, "grad_norm": 0.03184746950864792, "learning_rate": 3.480694741499334e-05, "loss": 0.0174, "step": 412 }, { "epoch": 3.1984511132623425, "grad_norm": 0.014001097530126572, "learning_rate": 3.4549150281252636e-05, "loss": 0.0057, "step": 413 }, { "epoch": 3.2061955469506294, "grad_norm": 0.027478694915771484, "learning_rate": 3.4291806457266826e-05, "loss": 0.0138, "step": 414 }, { "epoch": 3.2139399806389157, "grad_norm": 0.02516726590692997, "learning_rate": 3.403492349320101e-05, "loss": 0.0112, "step": 415 }, { "epoch": 3.2139399806389157, "eval_loss": 0.014760926365852356, "eval_runtime": 4.876, "eval_samples_per_second": 10.254, "eval_steps_per_second": 2.666, "step": 415 }, { "epoch": 3.2216844143272025, "grad_norm": 0.03305725008249283, "learning_rate": 3.3778508925699124e-05, "loss": 0.0256, "step": 416 }, { "epoch": 3.229428848015489, "grad_norm": 0.024431169033050537, "learning_rate": 3.3522570277662985e-05, "loss": 0.0083, "step": 417 }, { "epoch": 3.2371732817037753, "grad_norm": 0.03031334839761257, "learning_rate": 3.326711505803142e-05, "loss": 0.0107, "step": 418 }, { "epoch": 3.244917715392062, "grad_norm": 0.033758629113435745, "learning_rate": 3.3012150761560085e-05, "loss": 0.0186, "step": 419 }, { "epoch": 3.2526621490803485, "grad_norm": 0.02770036645233631, "learning_rate": 3.275768486860149e-05, "loss": 0.0097, "step": 420 }, { "epoch": 3.2526621490803485, "eval_loss": 0.015088791027665138, "eval_runtime": 4.8982, "eval_samples_per_second": 10.208, "eval_steps_per_second": 2.654, "step": 420 }, { "epoch": 3.260406582768635, "grad_norm": 0.02369946427643299, "learning_rate": 3.250372484488558e-05, "loss": 0.0094, "step": 421 }, { "epoch": 3.2681510164569216, "grad_norm": 0.03576388210058212, "learning_rate": 3.225027814130074e-05, "loss": 0.0125, "step": 422 }, { "epoch": 3.275895450145208, "grad_norm": 0.025971444323658943, "learning_rate": 3.199735219367507e-05, "loss": 0.0118, "step": 423 }, { "epoch": 3.283639883833495, "grad_norm": 0.028038574382662773, "learning_rate": 3.174495442255836e-05, "loss": 0.0099, "step": 424 }, { "epoch": 3.291384317521781, "grad_norm": 0.027834760025143623, "learning_rate": 3.149309223300428e-05, "loss": 0.0113, "step": 425 }, { "epoch": 3.291384317521781, "eval_loss": 0.014965364709496498, "eval_runtime": 4.8786, "eval_samples_per_second": 10.249, "eval_steps_per_second": 2.665, "step": 425 }, { "epoch": 3.299128751210068, "grad_norm": 0.023443985730409622, "learning_rate": 3.124177301435324e-05, "loss": 0.0132, "step": 426 }, { "epoch": 3.3068731848983544, "grad_norm": 0.024410808458924294, "learning_rate": 3.09910041400154e-05, "loss": 0.0102, "step": 427 }, { "epoch": 3.3146176185866407, "grad_norm": 0.032607510685920715, "learning_rate": 3.0740792967254604e-05, "loss": 0.0168, "step": 428 }, { "epoch": 3.3223620522749275, "grad_norm": 0.03291484713554382, "learning_rate": 3.0491146836972272e-05, "loss": 0.019, "step": 429 }, { "epoch": 3.330106485963214, "grad_norm": 0.03559967130422592, "learning_rate": 3.024207307349224e-05, "loss": 0.0303, "step": 430 }, { "epoch": 3.330106485963214, "eval_loss": 0.014858649112284184, "eval_runtime": 4.8834, "eval_samples_per_second": 10.239, "eval_steps_per_second": 2.662, "step": 430 }, { "epoch": 3.3378509196515003, "grad_norm": 0.02721838466823101, "learning_rate": 2.9993578984345672e-05, "loss": 0.0111, "step": 431 }, { "epoch": 3.345595353339787, "grad_norm": 0.028012285009026527, "learning_rate": 2.9745671860056868e-05, "loss": 0.0136, "step": 432 }, { "epoch": 3.3533397870280734, "grad_norm": 0.029208144173026085, "learning_rate": 2.9498358973929196e-05, "loss": 0.013, "step": 433 }, { "epoch": 3.3610842207163603, "grad_norm": 0.031169850379228592, "learning_rate": 2.9251647581831836e-05, "loss": 0.0187, "step": 434 }, { "epoch": 3.3688286544046466, "grad_norm": 0.03211589530110359, "learning_rate": 2.900554492198677e-05, "loss": 0.0161, "step": 435 }, { "epoch": 3.3688286544046466, "eval_loss": 0.014618839137256145, "eval_runtime": 4.887, "eval_samples_per_second": 10.231, "eval_steps_per_second": 2.66, "step": 435 }, { "epoch": 3.3765730880929334, "grad_norm": 0.0314168706536293, "learning_rate": 2.876005821475657e-05, "loss": 0.0106, "step": 436 }, { "epoch": 3.38431752178122, "grad_norm": 0.03567107021808624, "learning_rate": 2.851519466243242e-05, "loss": 0.0173, "step": 437 }, { "epoch": 3.392061955469506, "grad_norm": 0.031098151579499245, "learning_rate": 2.8270961449022893e-05, "loss": 0.0185, "step": 438 }, { "epoch": 3.399806389157793, "grad_norm": 0.028943657875061035, "learning_rate": 2.802736574004319e-05, "loss": 0.0159, "step": 439 }, { "epoch": 3.4075508228460794, "grad_norm": 0.023004574701189995, "learning_rate": 2.7784414682304832e-05, "loss": 0.011, "step": 440 }, { "epoch": 3.4075508228460794, "eval_loss": 0.014386112801730633, "eval_runtime": 4.8818, "eval_samples_per_second": 10.242, "eval_steps_per_second": 2.663, "step": 440 }, { "epoch": 3.4152952565343657, "grad_norm": 0.027619289234280586, "learning_rate": 2.7542115403706063e-05, "loss": 0.0089, "step": 441 }, { "epoch": 3.4230396902226525, "grad_norm": 0.025844210758805275, "learning_rate": 2.7300475013022663e-05, "loss": 0.0127, "step": 442 }, { "epoch": 3.430784123910939, "grad_norm": 0.01797422766685486, "learning_rate": 2.7059500599699476e-05, "loss": 0.0068, "step": 443 }, { "epoch": 3.4385285575992257, "grad_norm": 0.031139735132455826, "learning_rate": 2.6819199233642278e-05, "loss": 0.0135, "step": 444 }, { "epoch": 3.446272991287512, "grad_norm": 0.03126378357410431, "learning_rate": 2.65795779650105e-05, "loss": 0.0084, "step": 445 }, { "epoch": 3.446272991287512, "eval_loss": 0.014389649964869022, "eval_runtime": 4.8893, "eval_samples_per_second": 10.226, "eval_steps_per_second": 2.659, "step": 445 }, { "epoch": 3.454017424975799, "grad_norm": 0.019535277038812637, "learning_rate": 2.6340643824010247e-05, "loss": 0.0099, "step": 446 }, { "epoch": 3.4617618586640853, "grad_norm": 0.029923155903816223, "learning_rate": 2.6102403820688177e-05, "loss": 0.0158, "step": 447 }, { "epoch": 3.4695062923523716, "grad_norm": 0.023479627445340157, "learning_rate": 2.586486494472572e-05, "loss": 0.0066, "step": 448 }, { "epoch": 3.4772507260406584, "grad_norm": 0.03173988685011864, "learning_rate": 2.562803416523405e-05, "loss": 0.01, "step": 449 }, { "epoch": 3.484995159728945, "grad_norm": 0.03306049853563309, "learning_rate": 2.539191843054963e-05, "loss": 0.0127, "step": 450 }, { "epoch": 3.484995159728945, "eval_loss": 0.014806166291236877, "eval_runtime": 4.9121, "eval_samples_per_second": 10.179, "eval_steps_per_second": 2.647, "step": 450 }, { "epoch": 3.492739593417231, "grad_norm": 0.02089696377515793, "learning_rate": 2.51565246680304e-05, "loss": 0.0062, "step": 451 }, { "epoch": 3.500484027105518, "grad_norm": 0.03812693804502487, "learning_rate": 2.4921859783852408e-05, "loss": 0.0116, "step": 452 }, { "epoch": 3.5082284607938043, "grad_norm": 0.02929401397705078, "learning_rate": 2.4687930662807303e-05, "loss": 0.0136, "step": 453 }, { "epoch": 3.515972894482091, "grad_norm": 0.024923592805862427, "learning_rate": 2.445474416810033e-05, "loss": 0.0094, "step": 454 }, { "epoch": 3.5237173281703775, "grad_norm": 0.02743164636194706, "learning_rate": 2.422230714114891e-05, "loss": 0.0134, "step": 455 }, { "epoch": 3.5237173281703775, "eval_loss": 0.01469426229596138, "eval_runtime": 4.8924, "eval_samples_per_second": 10.22, "eval_steps_per_second": 2.657, "step": 455 }, { "epoch": 3.5314617618586643, "grad_norm": 0.04384300857782364, "learning_rate": 2.399062640138201e-05, "loss": 0.0233, "step": 456 }, { "epoch": 3.5392061955469507, "grad_norm": 0.03357204422354698, "learning_rate": 2.3759708746039976e-05, "loss": 0.0177, "step": 457 }, { "epoch": 3.546950629235237, "grad_norm": 0.03177043795585632, "learning_rate": 2.3529560949975182e-05, "loss": 0.0087, "step": 458 }, { "epoch": 3.554695062923524, "grad_norm": 0.02979344129562378, "learning_rate": 2.3300189765453196e-05, "loss": 0.0082, "step": 459 }, { "epoch": 3.5624394966118103, "grad_norm": 0.021871499717235565, "learning_rate": 2.3071601921954794e-05, "loss": 0.0092, "step": 460 }, { "epoch": 3.5624394966118103, "eval_loss": 0.014372522011399269, "eval_runtime": 4.8873, "eval_samples_per_second": 10.231, "eval_steps_per_second": 2.66, "step": 460 }, { "epoch": 3.5701839303000966, "grad_norm": 0.027945492416620255, "learning_rate": 2.2843804125978357e-05, "loss": 0.0164, "step": 461 }, { "epoch": 3.5779283639883834, "grad_norm": 0.024697836488485336, "learning_rate": 2.2616803060843283e-05, "loss": 0.0074, "step": 462 }, { "epoch": 3.58567279767667, "grad_norm": 0.024211924523115158, "learning_rate": 2.2390605386493757e-05, "loss": 0.0087, "step": 463 }, { "epoch": 3.593417231364956, "grad_norm": 0.025920916348695755, "learning_rate": 2.2165217739303508e-05, "loss": 0.0128, "step": 464 }, { "epoch": 3.601161665053243, "grad_norm": 0.027798939496278763, "learning_rate": 2.194064673188089e-05, "loss": 0.0205, "step": 465 }, { "epoch": 3.601161665053243, "eval_loss": 0.014178312383592129, "eval_runtime": 4.8802, "eval_samples_per_second": 10.246, "eval_steps_per_second": 2.664, "step": 465 }, { "epoch": 3.60890609874153, "grad_norm": 0.0249908696860075, "learning_rate": 2.171689895287513e-05, "loss": 0.0098, "step": 466 }, { "epoch": 3.616650532429816, "grad_norm": 0.023805009201169014, "learning_rate": 2.149398096678283e-05, "loss": 0.0099, "step": 467 }, { "epoch": 3.6243949661181025, "grad_norm": 0.030275024473667145, "learning_rate": 2.12718993137555e-05, "loss": 0.0201, "step": 468 }, { "epoch": 3.6321393998063893, "grad_norm": 0.025657106190919876, "learning_rate": 2.105066050940758e-05, "loss": 0.0102, "step": 469 }, { "epoch": 3.6398838334946757, "grad_norm": 0.02271328866481781, "learning_rate": 2.08302710446253e-05, "loss": 0.0097, "step": 470 }, { "epoch": 3.6398838334946757, "eval_loss": 0.014142417348921299, "eval_runtime": 4.8856, "eval_samples_per_second": 10.234, "eval_steps_per_second": 2.661, "step": 470 }, { "epoch": 3.647628267182962, "grad_norm": 0.026042208075523376, "learning_rate": 2.061073738537635e-05, "loss": 0.0177, "step": 471 }, { "epoch": 3.655372700871249, "grad_norm": 0.021258225664496422, "learning_rate": 2.039206597252001e-05, "loss": 0.0065, "step": 472 }, { "epoch": 3.6631171345595352, "grad_norm": 0.027606485411524773, "learning_rate": 2.0174263221618307e-05, "loss": 0.0127, "step": 473 }, { "epoch": 3.6708615682478216, "grad_norm": 0.02728329971432686, "learning_rate": 1.9957335522747707e-05, "loss": 0.0123, "step": 474 }, { "epoch": 3.6786060019361084, "grad_norm": 0.03719132021069527, "learning_rate": 1.9741289240311755e-05, "loss": 0.0158, "step": 475 }, { "epoch": 3.6786060019361084, "eval_loss": 0.014227832667529583, "eval_runtime": 4.8832, "eval_samples_per_second": 10.239, "eval_steps_per_second": 2.662, "step": 475 }, { "epoch": 3.6863504356243952, "grad_norm": 0.029825210571289062, "learning_rate": 1.9526130712854185e-05, "loss": 0.0128, "step": 476 }, { "epoch": 3.6940948693126816, "grad_norm": 0.09481414407491684, "learning_rate": 1.931186625287313e-05, "loss": 0.0202, "step": 477 }, { "epoch": 3.701839303000968, "grad_norm": 0.027814751490950584, "learning_rate": 1.909850214663575e-05, "loss": 0.0121, "step": 478 }, { "epoch": 3.709583736689255, "grad_norm": 0.03036467730998993, "learning_rate": 1.8886044653993968e-05, "loss": 0.0163, "step": 479 }, { "epoch": 3.717328170377541, "grad_norm": 0.023233845829963684, "learning_rate": 1.8674500008200674e-05, "loss": 0.0095, "step": 480 }, { "epoch": 3.717328170377541, "eval_loss": 0.014037776738405228, "eval_runtime": 4.8827, "eval_samples_per_second": 10.24, "eval_steps_per_second": 2.662, "step": 480 }, { "epoch": 3.7250726040658275, "grad_norm": 0.02636660821735859, "learning_rate": 1.8463874415726918e-05, "loss": 0.0125, "step": 481 }, { "epoch": 3.7328170377541143, "grad_norm": 0.022603245452046394, "learning_rate": 1.82541740560798e-05, "loss": 0.0072, "step": 482 }, { "epoch": 3.7405614714424007, "grad_norm": 0.019264785572886467, "learning_rate": 1.8045405081621215e-05, "loss": 0.0068, "step": 483 }, { "epoch": 3.748305905130687, "grad_norm": 0.02744339220225811, "learning_rate": 1.7837573617387265e-05, "loss": 0.0139, "step": 484 }, { "epoch": 3.756050338818974, "grad_norm": 0.032306037843227386, "learning_rate": 1.7630685760908622e-05, "loss": 0.0163, "step": 485 }, { "epoch": 3.756050338818974, "eval_loss": 0.014070287346839905, "eval_runtime": 4.8787, "eval_samples_per_second": 10.249, "eval_steps_per_second": 2.665, "step": 485 }, { "epoch": 3.7637947725072602, "grad_norm": 0.034267835319042206, "learning_rate": 1.7424747582031637e-05, "loss": 0.0145, "step": 486 }, { "epoch": 3.771539206195547, "grad_norm": 0.02139255404472351, "learning_rate": 1.72197651227402e-05, "loss": 0.0084, "step": 487 }, { "epoch": 3.7792836398838334, "grad_norm": 0.020995331928133965, "learning_rate": 1.7015744396978556e-05, "loss": 0.0065, "step": 488 }, { "epoch": 3.7870280735721202, "grad_norm": 0.03288980573415756, "learning_rate": 1.6812691390474787e-05, "loss": 0.0175, "step": 489 }, { "epoch": 3.7947725072604066, "grad_norm": 0.021166102960705757, "learning_rate": 1.6610612060565234e-05, "loss": 0.007, "step": 490 }, { "epoch": 3.7947725072604066, "eval_loss": 0.014264380559325218, "eval_runtime": 4.8993, "eval_samples_per_second": 10.206, "eval_steps_per_second": 2.653, "step": 490 }, { "epoch": 3.802516940948693, "grad_norm": 0.02033647708594799, "learning_rate": 1.64095123360197e-05, "loss": 0.0081, "step": 491 }, { "epoch": 3.81026137463698, "grad_norm": 0.01951659470796585, "learning_rate": 1.6209398116867574e-05, "loss": 0.008, "step": 492 }, { "epoch": 3.818005808325266, "grad_norm": 0.028182433918118477, "learning_rate": 1.6010275274224606e-05, "loss": 0.0143, "step": 493 }, { "epoch": 3.8257502420135525, "grad_norm": 0.03811497241258621, "learning_rate": 1.5812149650120784e-05, "loss": 0.0139, "step": 494 }, { "epoch": 3.8334946757018393, "grad_norm": 0.02721046842634678, "learning_rate": 1.561502705732883e-05, "loss": 0.0069, "step": 495 }, { "epoch": 3.8334946757018393, "eval_loss": 0.014395428821444511, "eval_runtime": 4.885, "eval_samples_per_second": 10.235, "eval_steps_per_second": 2.661, "step": 495 }, { "epoch": 3.8412391093901257, "grad_norm": 0.03506116569042206, "learning_rate": 1.5418913279193746e-05, "loss": 0.0154, "step": 496 }, { "epoch": 3.8489835430784125, "grad_norm": 0.029712386429309845, "learning_rate": 1.5223814069463078e-05, "loss": 0.0074, "step": 497 }, { "epoch": 3.856727976766699, "grad_norm": 0.021429866552352905, "learning_rate": 1.5029735152118124e-05, "loss": 0.0067, "step": 498 }, { "epoch": 3.8644724104549857, "grad_norm": 0.024990901350975037, "learning_rate": 1.4836682221206e-05, "loss": 0.0089, "step": 499 }, { "epoch": 3.872216844143272, "grad_norm": 0.0315503366291523, "learning_rate": 1.4644660940672627e-05, "loss": 0.012, "step": 500 }, { "epoch": 3.872216844143272, "eval_loss": 0.014392802491784096, "eval_runtime": 4.8814, "eval_samples_per_second": 10.243, "eval_steps_per_second": 2.663, "step": 500 } ], "logging_steps": 1, "max_steps": 645, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.720511304678769e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }