|
{ |
|
"best_metric": 0.014392802491784096, |
|
"best_model_checkpoint": "/home/paperspace/Data/models/dbischof_premise_aea/llm3br256/checkpoint-500", |
|
"epoch": 3.872216844143272, |
|
"eval_steps": 5, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007744433688286544, |
|
"grad_norm": 0.28707125782966614, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.0847, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015488867376573089, |
|
"grad_norm": 0.34009915590286255, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 0.0928, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.023233301064859633, |
|
"grad_norm": 0.29313409328460693, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.0934, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.030977734753146177, |
|
"grad_norm": 0.2913404107093811, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 0.0913, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03872216844143272, |
|
"grad_norm": 0.29106780886650085, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.095, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03872216844143272, |
|
"eval_loss": 0.07727333903312683, |
|
"eval_runtime": 5.9343, |
|
"eval_samples_per_second": 8.426, |
|
"eval_steps_per_second": 2.191, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.046466602129719266, |
|
"grad_norm": 0.23025450110435486, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 0.0948, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.05421103581800581, |
|
"grad_norm": 0.21704453229904175, |
|
"learning_rate": 1.0769230769230771e-05, |
|
"loss": 0.0727, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.061955469506292354, |
|
"grad_norm": 0.17385561764240265, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.0689, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0696999031945789, |
|
"grad_norm": 0.15649482607841492, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.0604, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07744433688286544, |
|
"grad_norm": 0.11710207164287567, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.0562, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07744433688286544, |
|
"eval_loss": 0.04053657874464989, |
|
"eval_runtime": 4.8954, |
|
"eval_samples_per_second": 10.214, |
|
"eval_steps_per_second": 2.656, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.08518877057115198, |
|
"grad_norm": 0.09721983969211578, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 0.0393, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.09293320425943853, |
|
"grad_norm": 0.09856045991182327, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.0404, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.10067763794772508, |
|
"grad_norm": 0.11793606728315353, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0455, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.10842207163601161, |
|
"grad_norm": 0.11285863816738129, |
|
"learning_rate": 2.1538461538461542e-05, |
|
"loss": 0.059, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.11616650532429816, |
|
"grad_norm": 0.08813278377056122, |
|
"learning_rate": 2.307692307692308e-05, |
|
"loss": 0.032, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11616650532429816, |
|
"eval_loss": 0.03360835462808609, |
|
"eval_runtime": 4.8812, |
|
"eval_samples_per_second": 10.243, |
|
"eval_steps_per_second": 2.663, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.12391093901258471, |
|
"grad_norm": 0.06082022562623024, |
|
"learning_rate": 2.461538461538462e-05, |
|
"loss": 0.0419, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.13165537270087124, |
|
"grad_norm": 0.055546533316373825, |
|
"learning_rate": 2.6153846153846157e-05, |
|
"loss": 0.0452, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.1393998063891578, |
|
"grad_norm": 0.0525379441678524, |
|
"learning_rate": 2.7692307692307694e-05, |
|
"loss": 0.0329, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.14714424007744434, |
|
"grad_norm": 0.058248624205589294, |
|
"learning_rate": 2.9230769230769234e-05, |
|
"loss": 0.0338, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.15488867376573087, |
|
"grad_norm": 0.057563405483961105, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 0.0488, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15488867376573087, |
|
"eval_loss": 0.031162459403276443, |
|
"eval_runtime": 4.9017, |
|
"eval_samples_per_second": 10.201, |
|
"eval_steps_per_second": 2.652, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.16263310745401743, |
|
"grad_norm": 0.04852646589279175, |
|
"learning_rate": 3.230769230769231e-05, |
|
"loss": 0.0364, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.17037754114230397, |
|
"grad_norm": 0.05401140823960304, |
|
"learning_rate": 3.384615384615385e-05, |
|
"loss": 0.0446, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1781219748305905, |
|
"grad_norm": 0.0492316372692585, |
|
"learning_rate": 3.538461538461539e-05, |
|
"loss": 0.0407, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.18586640851887706, |
|
"grad_norm": 0.037774790078401566, |
|
"learning_rate": 3.692307692307693e-05, |
|
"loss": 0.0315, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1936108422071636, |
|
"grad_norm": 0.04360613971948624, |
|
"learning_rate": 3.846153846153846e-05, |
|
"loss": 0.0331, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1936108422071636, |
|
"eval_loss": 0.02766346000134945, |
|
"eval_runtime": 4.8772, |
|
"eval_samples_per_second": 10.252, |
|
"eval_steps_per_second": 2.665, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.20135527589545016, |
|
"grad_norm": 0.037237901240587234, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0259, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2090997095837367, |
|
"grad_norm": 0.03505983576178551, |
|
"learning_rate": 4.1538461538461544e-05, |
|
"loss": 0.0303, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.21684414327202323, |
|
"grad_norm": 0.041253913193941116, |
|
"learning_rate": 4.3076923076923084e-05, |
|
"loss": 0.0453, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2245885769603098, |
|
"grad_norm": 0.04072079062461853, |
|
"learning_rate": 4.461538461538462e-05, |
|
"loss": 0.0316, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.23233301064859632, |
|
"grad_norm": 0.03738202154636383, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 0.0377, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23233301064859632, |
|
"eval_loss": 0.025424109771847725, |
|
"eval_runtime": 4.8765, |
|
"eval_samples_per_second": 10.253, |
|
"eval_steps_per_second": 2.666, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.24007744433688286, |
|
"grad_norm": 0.03633822873234749, |
|
"learning_rate": 4.76923076923077e-05, |
|
"loss": 0.0369, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.24782187802516942, |
|
"grad_norm": 0.03256253898143768, |
|
"learning_rate": 4.923076923076924e-05, |
|
"loss": 0.0349, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.25556631171345595, |
|
"grad_norm": 0.031838804483413696, |
|
"learning_rate": 5.0769230769230766e-05, |
|
"loss": 0.0283, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.2633107454017425, |
|
"grad_norm": 0.026707077398896217, |
|
"learning_rate": 5.230769230769231e-05, |
|
"loss": 0.0283, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.271055179090029, |
|
"grad_norm": 0.03254338726401329, |
|
"learning_rate": 5.384615384615385e-05, |
|
"loss": 0.0316, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.271055179090029, |
|
"eval_loss": 0.024270590394735336, |
|
"eval_runtime": 4.8832, |
|
"eval_samples_per_second": 10.239, |
|
"eval_steps_per_second": 2.662, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.2787996127783156, |
|
"grad_norm": 0.030620776116847992, |
|
"learning_rate": 5.538461538461539e-05, |
|
"loss": 0.0306, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.28654404646660214, |
|
"grad_norm": 0.03317311033606529, |
|
"learning_rate": 5.692307692307692e-05, |
|
"loss": 0.0293, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2942884801548887, |
|
"grad_norm": 0.026506489142775536, |
|
"learning_rate": 5.846153846153847e-05, |
|
"loss": 0.0293, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3020329138431752, |
|
"grad_norm": 0.023665621876716614, |
|
"learning_rate": 6e-05, |
|
"loss": 0.0166, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.30977734753146174, |
|
"grad_norm": 0.03278828039765358, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 0.0374, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.30977734753146174, |
|
"eval_loss": 0.023048410192131996, |
|
"eval_runtime": 4.885, |
|
"eval_samples_per_second": 10.235, |
|
"eval_steps_per_second": 2.661, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.31752178121974833, |
|
"grad_norm": 0.03030160255730152, |
|
"learning_rate": 6.307692307692308e-05, |
|
"loss": 0.0334, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.32526621490803487, |
|
"grad_norm": 0.03384114429354668, |
|
"learning_rate": 6.461538461538462e-05, |
|
"loss": 0.0212, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3330106485963214, |
|
"grad_norm": 0.02560395933687687, |
|
"learning_rate": 6.615384615384616e-05, |
|
"loss": 0.0363, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.34075508228460794, |
|
"grad_norm": 0.026470044627785683, |
|
"learning_rate": 6.76923076923077e-05, |
|
"loss": 0.024, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.34849951597289447, |
|
"grad_norm": 0.023488877341151237, |
|
"learning_rate": 6.923076923076924e-05, |
|
"loss": 0.0208, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.34849951597289447, |
|
"eval_loss": 0.022530335932970047, |
|
"eval_runtime": 4.8759, |
|
"eval_samples_per_second": 10.255, |
|
"eval_steps_per_second": 2.666, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.356243949661181, |
|
"grad_norm": 0.029532263055443764, |
|
"learning_rate": 7.076923076923078e-05, |
|
"loss": 0.0399, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3639883833494676, |
|
"grad_norm": 0.025283565744757652, |
|
"learning_rate": 7.23076923076923e-05, |
|
"loss": 0.033, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3717328170377541, |
|
"grad_norm": 0.024645334109663963, |
|
"learning_rate": 7.384615384615386e-05, |
|
"loss": 0.0431, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.37947725072604066, |
|
"grad_norm": 0.025530191138386726, |
|
"learning_rate": 7.538461538461539e-05, |
|
"loss": 0.0321, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3872216844143272, |
|
"grad_norm": 0.02383197844028473, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 0.0305, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3872216844143272, |
|
"eval_loss": 0.021847765892744064, |
|
"eval_runtime": 4.8901, |
|
"eval_samples_per_second": 10.225, |
|
"eval_steps_per_second": 2.658, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.39496611810261373, |
|
"grad_norm": 0.02661319635808468, |
|
"learning_rate": 7.846153846153847e-05, |
|
"loss": 0.0312, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4027105517909003, |
|
"grad_norm": 0.029026813805103302, |
|
"learning_rate": 8e-05, |
|
"loss": 0.0202, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.41045498547918685, |
|
"grad_norm": 0.03153839334845543, |
|
"learning_rate": 8.153846153846155e-05, |
|
"loss": 0.0322, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.4181994191674734, |
|
"grad_norm": 0.027100125327706337, |
|
"learning_rate": 8.307692307692309e-05, |
|
"loss": 0.0217, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.4259438528557599, |
|
"grad_norm": 0.034204043447971344, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 0.0238, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.4259438528557599, |
|
"eval_loss": 0.021218011155724525, |
|
"eval_runtime": 4.895, |
|
"eval_samples_per_second": 10.215, |
|
"eval_steps_per_second": 2.656, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.43368828654404645, |
|
"grad_norm": 0.026411807164549828, |
|
"learning_rate": 8.615384615384617e-05, |
|
"loss": 0.0264, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.441432720232333, |
|
"grad_norm": 0.025747094303369522, |
|
"learning_rate": 8.76923076923077e-05, |
|
"loss": 0.0231, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.4491771539206196, |
|
"grad_norm": 0.028047436848282814, |
|
"learning_rate": 8.923076923076924e-05, |
|
"loss": 0.0269, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.4569215876089061, |
|
"grad_norm": 0.03033887967467308, |
|
"learning_rate": 9.076923076923078e-05, |
|
"loss": 0.0286, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.46466602129719264, |
|
"grad_norm": 0.024372393265366554, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 0.0278, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.46466602129719264, |
|
"eval_loss": 0.020728331059217453, |
|
"eval_runtime": 4.8702, |
|
"eval_samples_per_second": 10.266, |
|
"eval_steps_per_second": 2.669, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4724104549854792, |
|
"grad_norm": 0.028278978541493416, |
|
"learning_rate": 9.384615384615386e-05, |
|
"loss": 0.0247, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4801548886737657, |
|
"grad_norm": 0.03280925378203392, |
|
"learning_rate": 9.53846153846154e-05, |
|
"loss": 0.026, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4878993223620523, |
|
"grad_norm": 0.023919392377138138, |
|
"learning_rate": 9.692307692307692e-05, |
|
"loss": 0.0312, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.49564375605033884, |
|
"grad_norm": 0.0364394448697567, |
|
"learning_rate": 9.846153846153848e-05, |
|
"loss": 0.0219, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5033881897386253, |
|
"grad_norm": 0.02771547995507717, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0199, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5033881897386253, |
|
"eval_loss": 0.02000207081437111, |
|
"eval_runtime": 4.8908, |
|
"eval_samples_per_second": 10.223, |
|
"eval_steps_per_second": 2.658, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5111326234269119, |
|
"grad_norm": 0.02505766600370407, |
|
"learning_rate": 9.999926652940913e-05, |
|
"loss": 0.0206, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5188770571151985, |
|
"grad_norm": 0.037389349192380905, |
|
"learning_rate": 9.999706613915566e-05, |
|
"loss": 0.0265, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.526621490803485, |
|
"grad_norm": 0.03750506415963173, |
|
"learning_rate": 9.999339889379647e-05, |
|
"loss": 0.0236, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.5343659244917716, |
|
"grad_norm": 0.028572333976626396, |
|
"learning_rate": 9.998826490092421e-05, |
|
"loss": 0.0236, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.542110358180058, |
|
"grad_norm": 0.024309856817126274, |
|
"learning_rate": 9.99816643111642e-05, |
|
"loss": 0.0235, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.542110358180058, |
|
"eval_loss": 0.02025166153907776, |
|
"eval_runtime": 4.8811, |
|
"eval_samples_per_second": 10.244, |
|
"eval_steps_per_second": 2.663, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5498547918683446, |
|
"grad_norm": 0.035883497446775436, |
|
"learning_rate": 9.997359731816998e-05, |
|
"loss": 0.0289, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5575992255566312, |
|
"grad_norm": 0.034139424562454224, |
|
"learning_rate": 9.996406415861763e-05, |
|
"loss": 0.0366, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5653436592449177, |
|
"grad_norm": 0.02562110312283039, |
|
"learning_rate": 9.995306511219885e-05, |
|
"loss": 0.0336, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5730880929332043, |
|
"grad_norm": 0.026915963739156723, |
|
"learning_rate": 9.994060050161269e-05, |
|
"loss": 0.0193, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5808325266214908, |
|
"grad_norm": 0.02748969756066799, |
|
"learning_rate": 9.992667069255619e-05, |
|
"loss": 0.0213, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5808325266214908, |
|
"eval_loss": 0.019886016845703125, |
|
"eval_runtime": 4.8762, |
|
"eval_samples_per_second": 10.254, |
|
"eval_steps_per_second": 2.666, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5885769603097774, |
|
"grad_norm": 0.0281902477145195, |
|
"learning_rate": 9.991127609371356e-05, |
|
"loss": 0.0333, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5963213939980639, |
|
"grad_norm": 0.032518427819013596, |
|
"learning_rate": 9.989441715674422e-05, |
|
"loss": 0.0296, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.6040658276863504, |
|
"grad_norm": 0.0259566493332386, |
|
"learning_rate": 9.987609437626955e-05, |
|
"loss": 0.0282, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.611810261374637, |
|
"grad_norm": 0.029854053631424904, |
|
"learning_rate": 9.985630828985835e-05, |
|
"loss": 0.0205, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.6195546950629235, |
|
"grad_norm": 0.03595299273729324, |
|
"learning_rate": 9.983505947801115e-05, |
|
"loss": 0.044, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6195546950629235, |
|
"eval_loss": 0.01953260228037834, |
|
"eval_runtime": 4.8809, |
|
"eval_samples_per_second": 10.244, |
|
"eval_steps_per_second": 2.663, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.6272991287512101, |
|
"grad_norm": 0.02581968903541565, |
|
"learning_rate": 9.981234856414307e-05, |
|
"loss": 0.0265, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.6350435624394967, |
|
"grad_norm": 0.02523561753332615, |
|
"learning_rate": 9.978817621456562e-05, |
|
"loss": 0.0232, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.6427879961277831, |
|
"grad_norm": 0.022955749183893204, |
|
"learning_rate": 9.97625431384671e-05, |
|
"loss": 0.0267, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6505324298160697, |
|
"grad_norm": 0.0209239199757576, |
|
"learning_rate": 9.973545008789181e-05, |
|
"loss": 0.0303, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6582768635043562, |
|
"grad_norm": 0.028582807630300522, |
|
"learning_rate": 9.970689785771798e-05, |
|
"loss": 0.021, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6582768635043562, |
|
"eval_loss": 0.019236262887716293, |
|
"eval_runtime": 4.874, |
|
"eval_samples_per_second": 10.258, |
|
"eval_steps_per_second": 2.667, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6660212971926428, |
|
"grad_norm": 0.02616284228861332, |
|
"learning_rate": 9.967688728563446e-05, |
|
"loss": 0.0176, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6737657308809293, |
|
"grad_norm": 0.029908856377005577, |
|
"learning_rate": 9.964541925211612e-05, |
|
"loss": 0.0206, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6815101645692159, |
|
"grad_norm": 0.03139350563287735, |
|
"learning_rate": 9.961249468039807e-05, |
|
"loss": 0.0301, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.6892545982575025, |
|
"grad_norm": 0.025906842201948166, |
|
"learning_rate": 9.957811453644847e-05, |
|
"loss": 0.0192, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6969990319457889, |
|
"grad_norm": 0.0281496811658144, |
|
"learning_rate": 9.954227982894034e-05, |
|
"loss": 0.0296, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6969990319457889, |
|
"eval_loss": 0.019074302166700363, |
|
"eval_runtime": 4.8832, |
|
"eval_samples_per_second": 10.239, |
|
"eval_steps_per_second": 2.662, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.7047434656340755, |
|
"grad_norm": 0.027965204790234566, |
|
"learning_rate": 9.950499160922183e-05, |
|
"loss": 0.0213, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.712487899322362, |
|
"grad_norm": 0.02602163329720497, |
|
"learning_rate": 9.946625097128543e-05, |
|
"loss": 0.0269, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.7202323330106486, |
|
"grad_norm": 0.028190776705741882, |
|
"learning_rate": 9.942605905173592e-05, |
|
"loss": 0.0207, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.7279767666989352, |
|
"grad_norm": 0.025893300771713257, |
|
"learning_rate": 9.938441702975689e-05, |
|
"loss": 0.0265, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.7357212003872217, |
|
"grad_norm": 0.0202568881213665, |
|
"learning_rate": 9.934132612707632e-05, |
|
"loss": 0.0141, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7357212003872217, |
|
"eval_loss": 0.018998095765709877, |
|
"eval_runtime": 4.8865, |
|
"eval_samples_per_second": 10.232, |
|
"eval_steps_per_second": 2.66, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.7434656340755083, |
|
"grad_norm": 0.03151071444153786, |
|
"learning_rate": 9.929678760793057e-05, |
|
"loss": 0.028, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7512100677637947, |
|
"grad_norm": 0.037441398948431015, |
|
"learning_rate": 9.925080277902743e-05, |
|
"loss": 0.0275, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7589545014520813, |
|
"grad_norm": 0.022733572870492935, |
|
"learning_rate": 9.920337298950765e-05, |
|
"loss": 0.0227, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.7666989351403679, |
|
"grad_norm": 0.021637218073010445, |
|
"learning_rate": 9.91544996309055e-05, |
|
"loss": 0.0179, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.7744433688286544, |
|
"grad_norm": 0.023374751210212708, |
|
"learning_rate": 9.91041841371078e-05, |
|
"loss": 0.0289, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7744433688286544, |
|
"eval_loss": 0.01871725358068943, |
|
"eval_runtime": 4.9046, |
|
"eval_samples_per_second": 10.195, |
|
"eval_steps_per_second": 2.651, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.782187802516941, |
|
"grad_norm": 0.021633530035614967, |
|
"learning_rate": 9.905242798431196e-05, |
|
"loss": 0.0267, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7899322362052275, |
|
"grad_norm": 0.024837492033839226, |
|
"learning_rate": 9.899923269098262e-05, |
|
"loss": 0.0341, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.797676669893514, |
|
"grad_norm": 0.023348737508058548, |
|
"learning_rate": 9.894459981780711e-05, |
|
"loss": 0.0263, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.8054211035818006, |
|
"grad_norm": 0.02404264733195305, |
|
"learning_rate": 9.888853096764964e-05, |
|
"loss": 0.0214, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.8131655372700871, |
|
"grad_norm": 0.02434077486395836, |
|
"learning_rate": 9.883102778550434e-05, |
|
"loss": 0.0159, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8131655372700871, |
|
"eval_loss": 0.01875956915318966, |
|
"eval_runtime": 4.887, |
|
"eval_samples_per_second": 10.231, |
|
"eval_steps_per_second": 2.66, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.8209099709583737, |
|
"grad_norm": 0.023013584315776825, |
|
"learning_rate": 9.877209195844692e-05, |
|
"loss": 0.0266, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.8286544046466602, |
|
"grad_norm": 0.03137190267443657, |
|
"learning_rate": 9.871172521558523e-05, |
|
"loss": 0.0242, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.8363988383349468, |
|
"grad_norm": 0.023217204958200455, |
|
"learning_rate": 9.864992932800845e-05, |
|
"loss": 0.0254, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.8441432720232332, |
|
"grad_norm": 0.027811044827103615, |
|
"learning_rate": 9.858670610873528e-05, |
|
"loss": 0.0173, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.8518877057115198, |
|
"grad_norm": 0.027365995571017265, |
|
"learning_rate": 9.852205741266058e-05, |
|
"loss": 0.0275, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8518877057115198, |
|
"eval_loss": 0.01876773312687874, |
|
"eval_runtime": 4.8844, |
|
"eval_samples_per_second": 10.237, |
|
"eval_steps_per_second": 2.662, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8596321393998064, |
|
"grad_norm": 0.022870220243930817, |
|
"learning_rate": 9.845598513650103e-05, |
|
"loss": 0.0175, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8673765730880929, |
|
"grad_norm": 0.021480288356542587, |
|
"learning_rate": 9.838849121873949e-05, |
|
"loss": 0.0179, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.8751210067763795, |
|
"grad_norm": 0.025231841951608658, |
|
"learning_rate": 9.831957763956813e-05, |
|
"loss": 0.0182, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.882865440464666, |
|
"grad_norm": 0.023175878450274467, |
|
"learning_rate": 9.824924642083026e-05, |
|
"loss": 0.0167, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8906098741529526, |
|
"grad_norm": 0.02536984719336033, |
|
"learning_rate": 9.817749962596115e-05, |
|
"loss": 0.0271, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8906098741529526, |
|
"eval_loss": 0.018538037315011024, |
|
"eval_runtime": 4.8812, |
|
"eval_samples_per_second": 10.243, |
|
"eval_steps_per_second": 2.663, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8983543078412392, |
|
"grad_norm": 0.02080857753753662, |
|
"learning_rate": 9.810433935992733e-05, |
|
"loss": 0.0254, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.9060987415295256, |
|
"grad_norm": 0.026430707424879074, |
|
"learning_rate": 9.802976776916494e-05, |
|
"loss": 0.0185, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.9138431752178122, |
|
"grad_norm": 0.02291349321603775, |
|
"learning_rate": 9.795378704151675e-05, |
|
"loss": 0.0164, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.9215876089060987, |
|
"grad_norm": 0.02319083735346794, |
|
"learning_rate": 9.787639940616788e-05, |
|
"loss": 0.0237, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.9293320425943853, |
|
"grad_norm": 0.027965422719717026, |
|
"learning_rate": 9.779760713358059e-05, |
|
"loss": 0.0262, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9293320425943853, |
|
"eval_loss": 0.018477478995919228, |
|
"eval_runtime": 4.8802, |
|
"eval_samples_per_second": 10.246, |
|
"eval_steps_per_second": 2.664, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.9370764762826719, |
|
"grad_norm": 0.023768456652760506, |
|
"learning_rate": 9.771741253542741e-05, |
|
"loss": 0.0186, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.9448209099709584, |
|
"grad_norm": 0.01906961388885975, |
|
"learning_rate": 9.763581796452353e-05, |
|
"loss": 0.0163, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.952565343659245, |
|
"grad_norm": 0.022706998512148857, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.0253, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.9603097773475314, |
|
"grad_norm": 0.02551465854048729, |
|
"learning_rate": 9.74684385210219e-05, |
|
"loss": 0.0163, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.968054211035818, |
|
"grad_norm": 0.02145274542272091, |
|
"learning_rate": 9.738265855914013e-05, |
|
"loss": 0.0299, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.968054211035818, |
|
"eval_loss": 0.01828974299132824, |
|
"eval_runtime": 4.8759, |
|
"eval_samples_per_second": 10.254, |
|
"eval_steps_per_second": 2.666, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9757986447241046, |
|
"grad_norm": 0.023152988404035568, |
|
"learning_rate": 9.729548844579552e-05, |
|
"loss": 0.0178, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9835430784123911, |
|
"grad_norm": 0.026649784296751022, |
|
"learning_rate": 9.720693073845667e-05, |
|
"loss": 0.024, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9912875121006777, |
|
"grad_norm": 0.020236071199178696, |
|
"learning_rate": 9.711698803530254e-05, |
|
"loss": 0.0301, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.9990319457889641, |
|
"grad_norm": 0.027533914893865585, |
|
"learning_rate": 9.70256629751462e-05, |
|
"loss": 0.0195, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.0067763794772506, |
|
"grad_norm": 0.053280122578144073, |
|
"learning_rate": 9.693295823735753e-05, |
|
"loss": 0.0315, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0067763794772506, |
|
"eval_loss": 0.018024258315563202, |
|
"eval_runtime": 4.8931, |
|
"eval_samples_per_second": 10.219, |
|
"eval_steps_per_second": 2.657, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.0145208131655372, |
|
"grad_norm": 0.01893387921154499, |
|
"learning_rate": 9.683887654178445e-05, |
|
"loss": 0.0226, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.0222652468538238, |
|
"grad_norm": 0.029532097280025482, |
|
"learning_rate": 9.674342064867326e-05, |
|
"loss": 0.0145, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.0300096805421104, |
|
"grad_norm": 0.028108367696404457, |
|
"learning_rate": 9.664659335858755e-05, |
|
"loss": 0.0148, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.037754114230397, |
|
"grad_norm": 0.025696909055113792, |
|
"learning_rate": 9.654839751232611e-05, |
|
"loss": 0.0198, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.0454985479186834, |
|
"grad_norm": 0.02809828147292137, |
|
"learning_rate": 9.644883599083958e-05, |
|
"loss": 0.0212, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.0454985479186834, |
|
"eval_loss": 0.017997030168771744, |
|
"eval_runtime": 4.8817, |
|
"eval_samples_per_second": 10.242, |
|
"eval_steps_per_second": 2.663, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.05324298160697, |
|
"grad_norm": 0.023596247658133507, |
|
"learning_rate": 9.634791171514585e-05, |
|
"loss": 0.027, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.0609874152952565, |
|
"grad_norm": 0.032478995621204376, |
|
"learning_rate": 9.624562764624445e-05, |
|
"loss": 0.0231, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.0687318489835431, |
|
"grad_norm": 0.029977047815918922, |
|
"learning_rate": 9.614198678502965e-05, |
|
"loss": 0.0139, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.0764762826718297, |
|
"grad_norm": 0.03173111006617546, |
|
"learning_rate": 9.603699217220239e-05, |
|
"loss": 0.0188, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.084220716360116, |
|
"grad_norm": 0.02266346476972103, |
|
"learning_rate": 9.59306468881811e-05, |
|
"loss": 0.0172, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.084220716360116, |
|
"eval_loss": 0.018361272290349007, |
|
"eval_runtime": 4.8948, |
|
"eval_samples_per_second": 10.215, |
|
"eval_steps_per_second": 2.656, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.0919651500484027, |
|
"grad_norm": 0.03363156318664551, |
|
"learning_rate": 9.582295405301131e-05, |
|
"loss": 0.0202, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0997095837366893, |
|
"grad_norm": 0.03840557113289833, |
|
"learning_rate": 9.571391682627412e-05, |
|
"loss": 0.0222, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.1074540174249758, |
|
"grad_norm": 0.023486673831939697, |
|
"learning_rate": 9.56035384069935e-05, |
|
"loss": 0.0396, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.1151984511132624, |
|
"grad_norm": 0.030952000990509987, |
|
"learning_rate": 9.549182203354242e-05, |
|
"loss": 0.0225, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.1229428848015488, |
|
"grad_norm": 0.030439218506217003, |
|
"learning_rate": 9.537877098354786e-05, |
|
"loss": 0.0277, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.1229428848015488, |
|
"eval_loss": 0.01816246099770069, |
|
"eval_runtime": 4.8899, |
|
"eval_samples_per_second": 10.225, |
|
"eval_steps_per_second": 2.659, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.1306873184898354, |
|
"grad_norm": 0.024195371195673943, |
|
"learning_rate": 9.526438857379463e-05, |
|
"loss": 0.0116, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.138431752178122, |
|
"grad_norm": 0.02799941971898079, |
|
"learning_rate": 9.514867816012809e-05, |
|
"loss": 0.0195, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.1461761858664086, |
|
"grad_norm": 0.030233675613999367, |
|
"learning_rate": 9.503164313735566e-05, |
|
"loss": 0.0182, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.1539206195546952, |
|
"grad_norm": 0.024903280660510063, |
|
"learning_rate": 9.491328693914722e-05, |
|
"loss": 0.0222, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.1616650532429815, |
|
"grad_norm": 0.023587804287672043, |
|
"learning_rate": 9.47936130379344e-05, |
|
"loss": 0.0166, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1616650532429815, |
|
"eval_loss": 0.017931492999196053, |
|
"eval_runtime": 4.8826, |
|
"eval_samples_per_second": 10.24, |
|
"eval_steps_per_second": 2.663, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.1694094869312681, |
|
"grad_norm": 0.024121137335896492, |
|
"learning_rate": 9.467262494480869e-05, |
|
"loss": 0.0216, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.1771539206195547, |
|
"grad_norm": 0.02379632741212845, |
|
"learning_rate": 9.45503262094184e-05, |
|
"loss": 0.023, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.1848983543078413, |
|
"grad_norm": 0.02161642163991928, |
|
"learning_rate": 9.442672041986457e-05, |
|
"loss": 0.0349, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.1926427879961277, |
|
"grad_norm": 0.019304990768432617, |
|
"learning_rate": 9.430181120259565e-05, |
|
"loss": 0.0193, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.2003872216844143, |
|
"grad_norm": 0.022498024627566338, |
|
"learning_rate": 9.417560222230115e-05, |
|
"loss": 0.0272, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.2003872216844143, |
|
"eval_loss": 0.018144290894269943, |
|
"eval_runtime": 4.8768, |
|
"eval_samples_per_second": 10.253, |
|
"eval_steps_per_second": 2.666, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.2081316553727008, |
|
"grad_norm": 0.03062877058982849, |
|
"learning_rate": 9.404809718180407e-05, |
|
"loss": 0.0215, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.2158760890609874, |
|
"grad_norm": 0.023427944630384445, |
|
"learning_rate": 9.391929982195232e-05, |
|
"loss": 0.0301, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.223620522749274, |
|
"grad_norm": 0.02246953919529915, |
|
"learning_rate": 9.378921392150892e-05, |
|
"loss": 0.0212, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.2313649564375604, |
|
"grad_norm": 0.02264482155442238, |
|
"learning_rate": 9.365784329704115e-05, |
|
"loss": 0.0164, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.239109390125847, |
|
"grad_norm": 0.025367658585309982, |
|
"learning_rate": 9.35251918028086e-05, |
|
"loss": 0.0193, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.239109390125847, |
|
"eval_loss": 0.017837481573224068, |
|
"eval_runtime": 4.8761, |
|
"eval_samples_per_second": 10.254, |
|
"eval_steps_per_second": 2.666, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.2468538238141336, |
|
"grad_norm": 0.02131119929254055, |
|
"learning_rate": 9.339126333065007e-05, |
|
"loss": 0.0207, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.2545982575024202, |
|
"grad_norm": 0.019136667251586914, |
|
"learning_rate": 9.325606180986939e-05, |
|
"loss": 0.0147, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.2623426911907067, |
|
"grad_norm": 0.024482635781168938, |
|
"learning_rate": 9.31195912071201e-05, |
|
"loss": 0.0299, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.2700871248789931, |
|
"grad_norm": 0.02487838640809059, |
|
"learning_rate": 9.298185552628917e-05, |
|
"loss": 0.0232, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.2778315585672797, |
|
"grad_norm": 0.025261854752898216, |
|
"learning_rate": 9.284285880837946e-05, |
|
"loss": 0.0121, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.2778315585672797, |
|
"eval_loss": 0.017772378399968147, |
|
"eval_runtime": 4.8807, |
|
"eval_samples_per_second": 10.245, |
|
"eval_steps_per_second": 2.664, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.2855759922555663, |
|
"grad_norm": 0.02148056961596012, |
|
"learning_rate": 9.270260513139116e-05, |
|
"loss": 0.0347, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.2933204259438529, |
|
"grad_norm": 0.02021237276494503, |
|
"learning_rate": 9.256109861020213e-05, |
|
"loss": 0.02, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.3010648596321395, |
|
"grad_norm": 0.017359554767608643, |
|
"learning_rate": 9.241834339644726e-05, |
|
"loss": 0.0168, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.3088092933204258, |
|
"grad_norm": 0.02310781180858612, |
|
"learning_rate": 9.22743436783966e-05, |
|
"loss": 0.0192, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.3165537270087124, |
|
"grad_norm": 0.020348088815808296, |
|
"learning_rate": 9.212910368083245e-05, |
|
"loss": 0.0218, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.3165537270087124, |
|
"eval_loss": 0.0177312009036541, |
|
"eval_runtime": 4.8794, |
|
"eval_samples_per_second": 10.247, |
|
"eval_steps_per_second": 2.664, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.324298160696999, |
|
"grad_norm": 0.019140997901558876, |
|
"learning_rate": 9.198262766492554e-05, |
|
"loss": 0.0217, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.3320425943852856, |
|
"grad_norm": 0.023120978847146034, |
|
"learning_rate": 9.183491992810979e-05, |
|
"loss": 0.0275, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.3397870280735722, |
|
"grad_norm": 0.024684559553861618, |
|
"learning_rate": 9.168598480395651e-05, |
|
"loss": 0.0201, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.3475314617618586, |
|
"grad_norm": 0.024830348789691925, |
|
"learning_rate": 9.153582666204701e-05, |
|
"loss": 0.0234, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.3552758954501452, |
|
"grad_norm": 0.023022592067718506, |
|
"learning_rate": 9.138444990784453e-05, |
|
"loss": 0.016, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.3552758954501452, |
|
"eval_loss": 0.017486225813627243, |
|
"eval_runtime": 4.8842, |
|
"eval_samples_per_second": 10.237, |
|
"eval_steps_per_second": 2.662, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.3630203291384317, |
|
"grad_norm": 0.02616291493177414, |
|
"learning_rate": 9.123185898256496e-05, |
|
"loss": 0.0261, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.3707647628267183, |
|
"grad_norm": 0.02299882471561432, |
|
"learning_rate": 9.107805836304658e-05, |
|
"loss": 0.0254, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.378509196515005, |
|
"grad_norm": 0.018913911655545235, |
|
"learning_rate": 9.092305256161859e-05, |
|
"loss": 0.0124, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.3862536302032913, |
|
"grad_norm": 0.02167947217822075, |
|
"learning_rate": 9.076684612596891e-05, |
|
"loss": 0.0232, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3939980638915779, |
|
"grad_norm": 0.02304757945239544, |
|
"learning_rate": 9.060944363901056e-05, |
|
"loss": 0.0268, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3939980638915779, |
|
"eval_loss": 0.01751082018017769, |
|
"eval_runtime": 4.8781, |
|
"eval_samples_per_second": 10.25, |
|
"eval_steps_per_second": 2.665, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.4017424975798645, |
|
"grad_norm": 0.02488349750638008, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0128, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.409486931268151, |
|
"grad_norm": 0.025742027908563614, |
|
"learning_rate": 9.029106901813839e-05, |
|
"loss": 0.0243, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.4172313649564376, |
|
"grad_norm": 0.020051000639796257, |
|
"learning_rate": 9.013010622496144e-05, |
|
"loss": 0.0106, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.424975798644724, |
|
"grad_norm": 0.021976549178361893, |
|
"learning_rate": 8.996796606167548e-05, |
|
"loss": 0.0183, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.4327202323330106, |
|
"grad_norm": 0.0210378710180521, |
|
"learning_rate": 8.980465328528219e-05, |
|
"loss": 0.0152, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.4327202323330106, |
|
"eval_loss": 0.017743976786732674, |
|
"eval_runtime": 4.8802, |
|
"eval_samples_per_second": 10.246, |
|
"eval_steps_per_second": 2.664, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.4404646660212972, |
|
"grad_norm": 0.02365756221115589, |
|
"learning_rate": 8.96401726871863e-05, |
|
"loss": 0.0114, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.4482090997095838, |
|
"grad_norm": 0.025590112432837486, |
|
"learning_rate": 8.94745290930551e-05, |
|
"loss": 0.0189, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.4559535333978704, |
|
"grad_norm": 0.029832618311047554, |
|
"learning_rate": 8.930772736267674e-05, |
|
"loss": 0.0324, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.4636979670861567, |
|
"grad_norm": 0.025901637971401215, |
|
"learning_rate": 8.913977238981778e-05, |
|
"loss": 0.0186, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.4714424007744433, |
|
"grad_norm": 0.01908070780336857, |
|
"learning_rate": 8.897066910207958e-05, |
|
"loss": 0.0279, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.4714424007744433, |
|
"eval_loss": 0.017557693645358086, |
|
"eval_runtime": 4.8877, |
|
"eval_samples_per_second": 10.23, |
|
"eval_steps_per_second": 2.66, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.47918683446273, |
|
"grad_norm": 0.025517305359244347, |
|
"learning_rate": 8.880042246075365e-05, |
|
"loss": 0.0279, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.4869312681510165, |
|
"grad_norm": 0.019936546683311462, |
|
"learning_rate": 8.862903746067618e-05, |
|
"loss": 0.0172, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.494675701839303, |
|
"grad_norm": 0.019224194809794426, |
|
"learning_rate": 8.845651913008145e-05, |
|
"loss": 0.0138, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.5024201355275895, |
|
"grad_norm": 0.017969885841012, |
|
"learning_rate": 8.828287253045435e-05, |
|
"loss": 0.0151, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.510164569215876, |
|
"grad_norm": 0.02093169093132019, |
|
"learning_rate": 8.810810275638183e-05, |
|
"loss": 0.0206, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.510164569215876, |
|
"eval_loss": 0.017626546323299408, |
|
"eval_runtime": 4.9032, |
|
"eval_samples_per_second": 10.197, |
|
"eval_steps_per_second": 2.651, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.5179090029041626, |
|
"grad_norm": 0.027407390996813774, |
|
"learning_rate": 8.793221493540347e-05, |
|
"loss": 0.0151, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.5256534365924492, |
|
"grad_norm": 0.022155404090881348, |
|
"learning_rate": 8.775521422786104e-05, |
|
"loss": 0.0187, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.5333978702807358, |
|
"grad_norm": 0.02126327157020569, |
|
"learning_rate": 8.757710582674707e-05, |
|
"loss": 0.0168, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.5411423039690222, |
|
"grad_norm": 0.02067979797720909, |
|
"learning_rate": 8.739789495755253e-05, |
|
"loss": 0.015, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.5488867376573088, |
|
"grad_norm": 0.023581981658935547, |
|
"learning_rate": 8.721758687811352e-05, |
|
"loss": 0.0196, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5488867376573088, |
|
"eval_loss": 0.017185786738991737, |
|
"eval_runtime": 4.8793, |
|
"eval_samples_per_second": 10.247, |
|
"eval_steps_per_second": 2.664, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.5566311713455954, |
|
"grad_norm": 0.0208896417170763, |
|
"learning_rate": 8.703618687845696e-05, |
|
"loss": 0.0176, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.5643756050338817, |
|
"grad_norm": 0.02558140642940998, |
|
"learning_rate": 8.685370028064546e-05, |
|
"loss": 0.0224, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.5721200387221685, |
|
"grad_norm": 0.01860946975648403, |
|
"learning_rate": 8.667013243862113e-05, |
|
"loss": 0.0189, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.579864472410455, |
|
"grad_norm": 0.024494647979736328, |
|
"learning_rate": 8.64854887380485e-05, |
|
"loss": 0.0204, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.5876089060987415, |
|
"grad_norm": 0.028290973976254463, |
|
"learning_rate": 8.629977459615655e-05, |
|
"loss": 0.0262, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.5876089060987415, |
|
"eval_loss": 0.016824763268232346, |
|
"eval_runtime": 4.88, |
|
"eval_samples_per_second": 10.246, |
|
"eval_steps_per_second": 2.664, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.595353339787028, |
|
"grad_norm": 0.020388493314385414, |
|
"learning_rate": 8.611299546157974e-05, |
|
"loss": 0.0287, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.6030977734753145, |
|
"grad_norm": 0.022215668112039566, |
|
"learning_rate": 8.592515681419813e-05, |
|
"loss": 0.0249, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.6108422071636013, |
|
"grad_norm": 0.028934534639120102, |
|
"learning_rate": 8.573626416497668e-05, |
|
"loss": 0.0217, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.6185866408518876, |
|
"grad_norm": 0.022588912397623062, |
|
"learning_rate": 8.554632305580354e-05, |
|
"loss": 0.0207, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.6263310745401742, |
|
"grad_norm": 0.02324405126273632, |
|
"learning_rate": 8.535533905932738e-05, |
|
"loss": 0.0178, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6263310745401742, |
|
"eval_loss": 0.016888294368982315, |
|
"eval_runtime": 4.8771, |
|
"eval_samples_per_second": 10.252, |
|
"eval_steps_per_second": 2.665, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.6340755082284608, |
|
"grad_norm": 0.023379050195217133, |
|
"learning_rate": 8.5163317778794e-05, |
|
"loss": 0.0227, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.6418199419167472, |
|
"grad_norm": 0.024302620440721512, |
|
"learning_rate": 8.497026484788189e-05, |
|
"loss": 0.0279, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.649564375605034, |
|
"grad_norm": 0.02425311878323555, |
|
"learning_rate": 8.477618593053693e-05, |
|
"loss": 0.02, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.6573088092933204, |
|
"grad_norm": 0.0243984404951334, |
|
"learning_rate": 8.458108672080624e-05, |
|
"loss": 0.0255, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.665053242981607, |
|
"grad_norm": 0.018734309822320938, |
|
"learning_rate": 8.438497294267117e-05, |
|
"loss": 0.011, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.665053242981607, |
|
"eval_loss": 0.01664450205862522, |
|
"eval_runtime": 4.8808, |
|
"eval_samples_per_second": 10.244, |
|
"eval_steps_per_second": 2.663, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.6727976766698935, |
|
"grad_norm": 0.019455671310424805, |
|
"learning_rate": 8.418785034987921e-05, |
|
"loss": 0.0175, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.68054211035818, |
|
"grad_norm": 0.021629663184285164, |
|
"learning_rate": 8.39897247257754e-05, |
|
"loss": 0.0156, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.6882865440464667, |
|
"grad_norm": 0.022207748144865036, |
|
"learning_rate": 8.379060188313244e-05, |
|
"loss": 0.0271, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.696030977734753, |
|
"grad_norm": 0.021333666518330574, |
|
"learning_rate": 8.359048766398031e-05, |
|
"loss": 0.0223, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.7037754114230397, |
|
"grad_norm": 0.021991191431879997, |
|
"learning_rate": 8.338938793943478e-05, |
|
"loss": 0.0128, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.7037754114230397, |
|
"eval_loss": 0.016610655933618546, |
|
"eval_runtime": 4.8782, |
|
"eval_samples_per_second": 10.25, |
|
"eval_steps_per_second": 2.665, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.7115198451113263, |
|
"grad_norm": 0.01750914379954338, |
|
"learning_rate": 8.318730860952522e-05, |
|
"loss": 0.0217, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.7192642787996126, |
|
"grad_norm": 0.022801555693149567, |
|
"learning_rate": 8.298425560302146e-05, |
|
"loss": 0.0229, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.7270087124878994, |
|
"grad_norm": 0.028667643666267395, |
|
"learning_rate": 8.278023487725982e-05, |
|
"loss": 0.0317, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.7347531461761858, |
|
"grad_norm": 0.0247921384871006, |
|
"learning_rate": 8.257525241796838e-05, |
|
"loss": 0.0177, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.7424975798644724, |
|
"grad_norm": 0.02079445868730545, |
|
"learning_rate": 8.236931423909138e-05, |
|
"loss": 0.0223, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.7424975798644724, |
|
"eval_loss": 0.016715094447135925, |
|
"eval_runtime": 4.8828, |
|
"eval_samples_per_second": 10.24, |
|
"eval_steps_per_second": 2.662, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.750242013552759, |
|
"grad_norm": 0.023619551211595535, |
|
"learning_rate": 8.216242638261276e-05, |
|
"loss": 0.0237, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.7579864472410454, |
|
"grad_norm": 0.020713407546281815, |
|
"learning_rate": 8.19545949183788e-05, |
|
"loss": 0.0167, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.7657308809293322, |
|
"grad_norm": 0.024574102833867073, |
|
"learning_rate": 8.17458259439202e-05, |
|
"loss": 0.0281, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.7734753146176185, |
|
"grad_norm": 0.01983151212334633, |
|
"learning_rate": 8.153612558427311e-05, |
|
"loss": 0.0217, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.7812197483059051, |
|
"grad_norm": 0.027135249227285385, |
|
"learning_rate": 8.132549999179933e-05, |
|
"loss": 0.0201, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7812197483059051, |
|
"eval_loss": 0.016661785542964935, |
|
"eval_runtime": 4.8807, |
|
"eval_samples_per_second": 10.244, |
|
"eval_steps_per_second": 2.664, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.7889641819941917, |
|
"grad_norm": 0.021816475316882133, |
|
"learning_rate": 8.111395534600603e-05, |
|
"loss": 0.0166, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.796708615682478, |
|
"grad_norm": 0.019049836322665215, |
|
"learning_rate": 8.090149785336425e-05, |
|
"loss": 0.0125, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.804453049370765, |
|
"grad_norm": 0.023273281753063202, |
|
"learning_rate": 8.068813374712688e-05, |
|
"loss": 0.0295, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.8121974830590513, |
|
"grad_norm": 0.02431442402303219, |
|
"learning_rate": 8.047386928714582e-05, |
|
"loss": 0.0193, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.8199419167473379, |
|
"grad_norm": 0.02583279088139534, |
|
"learning_rate": 8.025871075968828e-05, |
|
"loss": 0.0239, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.8199419167473379, |
|
"eval_loss": 0.016347970813512802, |
|
"eval_runtime": 4.883, |
|
"eval_samples_per_second": 10.24, |
|
"eval_steps_per_second": 2.662, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.8276863504356244, |
|
"grad_norm": 0.021466901525855064, |
|
"learning_rate": 8.00426644772523e-05, |
|
"loss": 0.0226, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.8354307841239108, |
|
"grad_norm": 0.02583594247698784, |
|
"learning_rate": 7.982573677838172e-05, |
|
"loss": 0.0113, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.8431752178121976, |
|
"grad_norm": 0.02358117513358593, |
|
"learning_rate": 7.960793402748002e-05, |
|
"loss": 0.0292, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.850919651500484, |
|
"grad_norm": 0.025698702782392502, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.0269, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.8586640851887706, |
|
"grad_norm": 0.021297315135598183, |
|
"learning_rate": 7.916972895537471e-05, |
|
"loss": 0.0206, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.8586640851887706, |
|
"eval_loss": 0.016880055889487267, |
|
"eval_runtime": 4.8849, |
|
"eval_samples_per_second": 10.236, |
|
"eval_steps_per_second": 2.661, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.8664085188770572, |
|
"grad_norm": 0.02742616832256317, |
|
"learning_rate": 7.894933949059245e-05, |
|
"loss": 0.0266, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.8741529525653435, |
|
"grad_norm": 0.029985694214701653, |
|
"learning_rate": 7.872810068624451e-05, |
|
"loss": 0.0209, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.8818973862536303, |
|
"grad_norm": 0.01984225958585739, |
|
"learning_rate": 7.850601903321716e-05, |
|
"loss": 0.0112, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.8896418199419167, |
|
"grad_norm": 0.028832539916038513, |
|
"learning_rate": 7.828310104712489e-05, |
|
"loss": 0.0176, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.8973862536302033, |
|
"grad_norm": 0.025244107469916344, |
|
"learning_rate": 7.805935326811912e-05, |
|
"loss": 0.0209, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.8973862536302033, |
|
"eval_loss": 0.016251368448138237, |
|
"eval_runtime": 4.8854, |
|
"eval_samples_per_second": 10.235, |
|
"eval_steps_per_second": 2.661, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.90513068731849, |
|
"grad_norm": 0.019776510074734688, |
|
"learning_rate": 7.783478226069651e-05, |
|
"loss": 0.0146, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.9128751210067763, |
|
"grad_norm": 0.030150357633829117, |
|
"learning_rate": 7.760939461350623e-05, |
|
"loss": 0.0205, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.920619554695063, |
|
"grad_norm": 0.02409055270254612, |
|
"learning_rate": 7.738319693915672e-05, |
|
"loss": 0.0209, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.9283639883833494, |
|
"grad_norm": 0.02473391965031624, |
|
"learning_rate": 7.715619587402164e-05, |
|
"loss": 0.0169, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.936108422071636, |
|
"grad_norm": 0.028100404888391495, |
|
"learning_rate": 7.692839807804521e-05, |
|
"loss": 0.0171, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.936108422071636, |
|
"eval_loss": 0.016126085072755814, |
|
"eval_runtime": 4.8878, |
|
"eval_samples_per_second": 10.23, |
|
"eval_steps_per_second": 2.66, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.9438528557599226, |
|
"grad_norm": 0.031069206073880196, |
|
"learning_rate": 7.669981023454682e-05, |
|
"loss": 0.0346, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.951597289448209, |
|
"grad_norm": 0.020763061940670013, |
|
"learning_rate": 7.647043905002484e-05, |
|
"loss": 0.0168, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.9593417231364958, |
|
"grad_norm": 0.021877290681004524, |
|
"learning_rate": 7.624029125396004e-05, |
|
"loss": 0.0276, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.9670861568247822, |
|
"grad_norm": 0.023641012609004974, |
|
"learning_rate": 7.6009373598618e-05, |
|
"loss": 0.0182, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.9748305905130688, |
|
"grad_norm": 0.025783414021134377, |
|
"learning_rate": 7.577769285885109e-05, |
|
"loss": 0.022, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.9748305905130688, |
|
"eval_loss": 0.015866845846176147, |
|
"eval_runtime": 4.8957, |
|
"eval_samples_per_second": 10.213, |
|
"eval_steps_per_second": 2.655, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.9825750242013553, |
|
"grad_norm": 0.022825462743639946, |
|
"learning_rate": 7.554525583189969e-05, |
|
"loss": 0.0184, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.9903194578896417, |
|
"grad_norm": 0.024429945275187492, |
|
"learning_rate": 7.53120693371927e-05, |
|
"loss": 0.0196, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.9980638915779285, |
|
"grad_norm": 0.0280454121530056, |
|
"learning_rate": 7.507814021614761e-05, |
|
"loss": 0.0297, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.005808325266215, |
|
"grad_norm": 0.04602880775928497, |
|
"learning_rate": 7.484347533196961e-05, |
|
"loss": 0.0211, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 2.0135527589545013, |
|
"grad_norm": 0.01826930046081543, |
|
"learning_rate": 7.460808156945036e-05, |
|
"loss": 0.0162, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.0135527589545013, |
|
"eval_loss": 0.015875546261668205, |
|
"eval_runtime": 4.8819, |
|
"eval_samples_per_second": 10.242, |
|
"eval_steps_per_second": 2.663, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.021297192642788, |
|
"grad_norm": 0.018936650827527046, |
|
"learning_rate": 7.437196583476596e-05, |
|
"loss": 0.0169, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.0290416263310744, |
|
"grad_norm": 0.02147481217980385, |
|
"learning_rate": 7.413513505527429e-05, |
|
"loss": 0.0142, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.0367860600193612, |
|
"grad_norm": 0.020604653283953667, |
|
"learning_rate": 7.389759617931182e-05, |
|
"loss": 0.0115, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 2.0445304937076476, |
|
"grad_norm": 0.021933911368250847, |
|
"learning_rate": 7.365935617598975e-05, |
|
"loss": 0.0134, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.052274927395934, |
|
"grad_norm": 0.02122250571846962, |
|
"learning_rate": 7.342042203498951e-05, |
|
"loss": 0.0185, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.052274927395934, |
|
"eval_loss": 0.01603526994585991, |
|
"eval_runtime": 4.9059, |
|
"eval_samples_per_second": 10.192, |
|
"eval_steps_per_second": 2.65, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.060019361084221, |
|
"grad_norm": 0.018767178058624268, |
|
"learning_rate": 7.318080076635772e-05, |
|
"loss": 0.0087, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.067763794772507, |
|
"grad_norm": 0.01828618347644806, |
|
"learning_rate": 7.294049940030055e-05, |
|
"loss": 0.0088, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.075508228460794, |
|
"grad_norm": 0.029488379135727882, |
|
"learning_rate": 7.269952498697734e-05, |
|
"loss": 0.0148, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.0832526621490803, |
|
"grad_norm": 0.028726164251565933, |
|
"learning_rate": 7.245788459629396e-05, |
|
"loss": 0.0226, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 2.0909970958373667, |
|
"grad_norm": 0.03607122600078583, |
|
"learning_rate": 7.221558531769519e-05, |
|
"loss": 0.0185, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.0909970958373667, |
|
"eval_loss": 0.01613912731409073, |
|
"eval_runtime": 4.8866, |
|
"eval_samples_per_second": 10.232, |
|
"eval_steps_per_second": 2.66, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.0987415295256535, |
|
"grad_norm": 0.02318711020052433, |
|
"learning_rate": 7.197263425995682e-05, |
|
"loss": 0.0187, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 2.10648596321394, |
|
"grad_norm": 0.027442490682005882, |
|
"learning_rate": 7.172903855097711e-05, |
|
"loss": 0.0185, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.1142303969022267, |
|
"grad_norm": 0.02113383449614048, |
|
"learning_rate": 7.14848053375676e-05, |
|
"loss": 0.0162, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.121974830590513, |
|
"grad_norm": 0.02109163999557495, |
|
"learning_rate": 7.123994178524345e-05, |
|
"loss": 0.0189, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.1297192642787994, |
|
"grad_norm": 0.018890704959630966, |
|
"learning_rate": 7.099445507801323e-05, |
|
"loss": 0.0196, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.1297192642787994, |
|
"eval_loss": 0.016141431406140327, |
|
"eval_runtime": 4.8795, |
|
"eval_samples_per_second": 10.247, |
|
"eval_steps_per_second": 2.664, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.1374636979670862, |
|
"grad_norm": 0.026332931593060493, |
|
"learning_rate": 7.074835241816817e-05, |
|
"loss": 0.029, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.1452081316553726, |
|
"grad_norm": 0.02275455929338932, |
|
"learning_rate": 7.05016410260708e-05, |
|
"loss": 0.0156, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.1529525653436594, |
|
"grad_norm": 0.022596005350351334, |
|
"learning_rate": 7.025432813994315e-05, |
|
"loss": 0.0184, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.160696999031946, |
|
"grad_norm": 0.020018640905618668, |
|
"learning_rate": 7.000642101565434e-05, |
|
"loss": 0.0107, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.168441432720232, |
|
"grad_norm": 0.025625359266996384, |
|
"learning_rate": 6.975792692650777e-05, |
|
"loss": 0.0146, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.168441432720232, |
|
"eval_loss": 0.015940353274345398, |
|
"eval_runtime": 4.9128, |
|
"eval_samples_per_second": 10.178, |
|
"eval_steps_per_second": 2.646, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.176185866408519, |
|
"grad_norm": 0.026554979383945465, |
|
"learning_rate": 6.950885316302773e-05, |
|
"loss": 0.0213, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.1839303000968053, |
|
"grad_norm": 0.023344026878476143, |
|
"learning_rate": 6.925920703274541e-05, |
|
"loss": 0.0176, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.191674733785092, |
|
"grad_norm": 0.03146139904856682, |
|
"learning_rate": 6.90089958599846e-05, |
|
"loss": 0.0243, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.1994191674733785, |
|
"grad_norm": 0.02688729763031006, |
|
"learning_rate": 6.875822698564679e-05, |
|
"loss": 0.0235, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.207163601161665, |
|
"grad_norm": 0.017707915976643562, |
|
"learning_rate": 6.850690776699573e-05, |
|
"loss": 0.0091, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.207163601161665, |
|
"eval_loss": 0.015938647091388702, |
|
"eval_runtime": 4.8821, |
|
"eval_samples_per_second": 10.241, |
|
"eval_steps_per_second": 2.663, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.2149080348499517, |
|
"grad_norm": 0.02426217496395111, |
|
"learning_rate": 6.825504557744167e-05, |
|
"loss": 0.0222, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.222652468538238, |
|
"grad_norm": 0.017933079972863197, |
|
"learning_rate": 6.800264780632494e-05, |
|
"loss": 0.0127, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.230396902226525, |
|
"grad_norm": 0.02196042612195015, |
|
"learning_rate": 6.774972185869927e-05, |
|
"loss": 0.013, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.2381413359148112, |
|
"grad_norm": 0.02711823582649231, |
|
"learning_rate": 6.749627515511442e-05, |
|
"loss": 0.0198, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.2458857696030976, |
|
"grad_norm": 0.01899660937488079, |
|
"learning_rate": 6.724231513139852e-05, |
|
"loss": 0.0106, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.2458857696030976, |
|
"eval_loss": 0.015821926295757294, |
|
"eval_runtime": 4.8849, |
|
"eval_samples_per_second": 10.236, |
|
"eval_steps_per_second": 2.661, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.2536302032913844, |
|
"grad_norm": 0.02587137557566166, |
|
"learning_rate": 6.698784923843992e-05, |
|
"loss": 0.0204, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.261374636979671, |
|
"grad_norm": 0.02532321773469448, |
|
"learning_rate": 6.673288494196858e-05, |
|
"loss": 0.0191, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.2691190706679576, |
|
"grad_norm": 0.03079635463654995, |
|
"learning_rate": 6.647742972233703e-05, |
|
"loss": 0.0205, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.276863504356244, |
|
"grad_norm": 0.023865051567554474, |
|
"learning_rate": 6.622149107430088e-05, |
|
"loss": 0.0151, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.2846079380445303, |
|
"grad_norm": 0.02512257918715477, |
|
"learning_rate": 6.5965076506799e-05, |
|
"loss": 0.014, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.2846079380445303, |
|
"eval_loss": 0.015925200656056404, |
|
"eval_runtime": 4.8773, |
|
"eval_samples_per_second": 10.251, |
|
"eval_steps_per_second": 2.665, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.292352371732817, |
|
"grad_norm": 0.026422763243317604, |
|
"learning_rate": 6.570819354273317e-05, |
|
"loss": 0.0173, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.3000968054211035, |
|
"grad_norm": 0.02848372980952263, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.0251, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.3078412391093903, |
|
"grad_norm": 0.02191309630870819, |
|
"learning_rate": 6.519305258500666e-05, |
|
"loss": 0.0104, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.3155856727976767, |
|
"grad_norm": 0.025703053921461105, |
|
"learning_rate": 6.493480970497569e-05, |
|
"loss": 0.0311, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.323330106485963, |
|
"grad_norm": 0.021763848140835762, |
|
"learning_rate": 6.467612865519674e-05, |
|
"loss": 0.0168, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.323330106485963, |
|
"eval_loss": 0.01583768054842949, |
|
"eval_runtime": 4.8796, |
|
"eval_samples_per_second": 10.247, |
|
"eval_steps_per_second": 2.664, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.33107454017425, |
|
"grad_norm": 0.01981600932776928, |
|
"learning_rate": 6.441701702506754e-05, |
|
"loss": 0.0174, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.3388189738625362, |
|
"grad_norm": 0.021816400811076164, |
|
"learning_rate": 6.415748241661851e-05, |
|
"loss": 0.0222, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.346563407550823, |
|
"grad_norm": 0.028364678844809532, |
|
"learning_rate": 6.389753244428972e-05, |
|
"loss": 0.0222, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.3543078412391094, |
|
"grad_norm": 0.03110797517001629, |
|
"learning_rate": 6.363717473470759e-05, |
|
"loss": 0.0194, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.362052274927396, |
|
"grad_norm": 0.03083011880517006, |
|
"learning_rate": 6.337641692646106e-05, |
|
"loss": 0.0217, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.362052274927396, |
|
"eval_loss": 0.01598162204027176, |
|
"eval_runtime": 4.8805, |
|
"eval_samples_per_second": 10.245, |
|
"eval_steps_per_second": 2.664, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.3697967086156826, |
|
"grad_norm": 0.027600981295108795, |
|
"learning_rate": 6.311526666987743e-05, |
|
"loss": 0.0168, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.377541142303969, |
|
"grad_norm": 0.050711363554000854, |
|
"learning_rate": 6.285373162679803e-05, |
|
"loss": 0.027, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.3852855759922553, |
|
"grad_norm": 0.0258706696331501, |
|
"learning_rate": 6.259181947035342e-05, |
|
"loss": 0.014, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.393030009680542, |
|
"grad_norm": 0.022878140211105347, |
|
"learning_rate": 6.232953788473811e-05, |
|
"loss": 0.0125, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.4007744433688285, |
|
"grad_norm": 0.02646121010184288, |
|
"learning_rate": 6.206689456498529e-05, |
|
"loss": 0.0225, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.4007744433688285, |
|
"eval_loss": 0.015688462182879448, |
|
"eval_runtime": 4.8894, |
|
"eval_samples_per_second": 10.226, |
|
"eval_steps_per_second": 2.659, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.4085188770571153, |
|
"grad_norm": 0.01907186210155487, |
|
"learning_rate": 6.1803897216741e-05, |
|
"loss": 0.0105, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.4162633107454017, |
|
"grad_norm": 0.025598157197237015, |
|
"learning_rate": 6.154055355603807e-05, |
|
"loss": 0.0195, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.4240077444336885, |
|
"grad_norm": 0.021488605067133904, |
|
"learning_rate": 6.127687130906972e-05, |
|
"loss": 0.0171, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.431752178121975, |
|
"grad_norm": 0.023560060188174248, |
|
"learning_rate": 6.101285821196285e-05, |
|
"loss": 0.0234, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.4394966118102612, |
|
"grad_norm": 0.020358163863420486, |
|
"learning_rate": 6.0748522010551215e-05, |
|
"loss": 0.0158, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.4394966118102612, |
|
"eval_loss": 0.015287678688764572, |
|
"eval_runtime": 4.884, |
|
"eval_samples_per_second": 10.237, |
|
"eval_steps_per_second": 2.662, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.447241045498548, |
|
"grad_norm": 0.04023784399032593, |
|
"learning_rate": 6.048387046014795e-05, |
|
"loss": 0.0195, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.4549854791868344, |
|
"grad_norm": 0.018253512680530548, |
|
"learning_rate": 6.021891132531825e-05, |
|
"loss": 0.0172, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.4627299128751208, |
|
"grad_norm": 0.020507492125034332, |
|
"learning_rate": 5.995365237965144e-05, |
|
"loss": 0.0234, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.4704743465634076, |
|
"grad_norm": 0.025176333263516426, |
|
"learning_rate": 5.9688101405532925e-05, |
|
"loss": 0.0196, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.478218780251694, |
|
"grad_norm": 0.022779326885938644, |
|
"learning_rate": 5.9422266193915924e-05, |
|
"loss": 0.0122, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.478218780251694, |
|
"eval_loss": 0.015223703347146511, |
|
"eval_runtime": 4.8811, |
|
"eval_samples_per_second": 10.244, |
|
"eval_steps_per_second": 2.663, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.4859632139399808, |
|
"grad_norm": 0.019654158502817154, |
|
"learning_rate": 5.9156154544092815e-05, |
|
"loss": 0.0191, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.493707647628267, |
|
"grad_norm": 0.01823735609650612, |
|
"learning_rate": 5.8889774263466355e-05, |
|
"loss": 0.0128, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.501452081316554, |
|
"grad_norm": 0.022733347490429878, |
|
"learning_rate": 5.862313316732063e-05, |
|
"loss": 0.0095, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.5091965150048403, |
|
"grad_norm": 0.019566858187317848, |
|
"learning_rate": 5.8356239078591724e-05, |
|
"loss": 0.012, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.5169409486931267, |
|
"grad_norm": 0.023728664964437485, |
|
"learning_rate": 5.808909982763825e-05, |
|
"loss": 0.0152, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.5169409486931267, |
|
"eval_loss": 0.01537258829921484, |
|
"eval_runtime": 4.8868, |
|
"eval_samples_per_second": 10.232, |
|
"eval_steps_per_second": 2.66, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.5246853823814135, |
|
"grad_norm": 0.026009773835539818, |
|
"learning_rate": 5.782172325201155e-05, |
|
"loss": 0.0158, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.5324298160697, |
|
"grad_norm": 0.045942921191453934, |
|
"learning_rate": 5.7554117196225846e-05, |
|
"loss": 0.0304, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.5401742497579862, |
|
"grad_norm": 0.017686696723103523, |
|
"learning_rate": 5.728628951152799e-05, |
|
"loss": 0.0157, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.547918683446273, |
|
"grad_norm": 0.020913394168019295, |
|
"learning_rate": 5.701824805566722e-05, |
|
"loss": 0.0162, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.5556631171345594, |
|
"grad_norm": 0.025631655007600784, |
|
"learning_rate": 5.675000069266451e-05, |
|
"loss": 0.0268, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.5556631171345594, |
|
"eval_loss": 0.015396489761769772, |
|
"eval_runtime": 4.8797, |
|
"eval_samples_per_second": 10.246, |
|
"eval_steps_per_second": 2.664, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.563407550822846, |
|
"grad_norm": 0.021823951974511147, |
|
"learning_rate": 5.6481555292581946e-05, |
|
"loss": 0.0116, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.5711519845111326, |
|
"grad_norm": 0.023217862471938133, |
|
"learning_rate": 5.621291973129177e-05, |
|
"loss": 0.0151, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.5788964181994194, |
|
"grad_norm": 0.03442602604627609, |
|
"learning_rate": 5.5944101890245324e-05, |
|
"loss": 0.0202, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.5866408518877058, |
|
"grad_norm": 0.023536914959549904, |
|
"learning_rate": 5.5675109656241876e-05, |
|
"loss": 0.014, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.594385285575992, |
|
"grad_norm": 0.026387905701994896, |
|
"learning_rate": 5.540595092119709e-05, |
|
"loss": 0.0174, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.594385285575992, |
|
"eval_loss": 0.01569586619734764, |
|
"eval_runtime": 4.8895, |
|
"eval_samples_per_second": 10.226, |
|
"eval_steps_per_second": 2.659, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.602129719264279, |
|
"grad_norm": 0.02376389689743519, |
|
"learning_rate": 5.5136633581911655e-05, |
|
"loss": 0.0232, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.6098741529525653, |
|
"grad_norm": 0.022475535050034523, |
|
"learning_rate": 5.486716553983951e-05, |
|
"loss": 0.0176, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.6176185866408517, |
|
"grad_norm": 0.026273801922798157, |
|
"learning_rate": 5.4597554700855946e-05, |
|
"loss": 0.0099, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.6253630203291385, |
|
"grad_norm": 0.0252407044172287, |
|
"learning_rate": 5.432780897502589e-05, |
|
"loss": 0.0169, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.633107454017425, |
|
"grad_norm": 0.025699293240904808, |
|
"learning_rate": 5.4057936276371565e-05, |
|
"loss": 0.0147, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.633107454017425, |
|
"eval_loss": 0.015603473410010338, |
|
"eval_runtime": 4.8875, |
|
"eval_samples_per_second": 10.23, |
|
"eval_steps_per_second": 2.66, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.6408518877057117, |
|
"grad_norm": 0.02292807772755623, |
|
"learning_rate": 5.378794452264053e-05, |
|
"loss": 0.0112, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.648596321393998, |
|
"grad_norm": 0.02671566605567932, |
|
"learning_rate": 5.351784163507319e-05, |
|
"loss": 0.0157, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.656340755082285, |
|
"grad_norm": 0.024869635701179504, |
|
"learning_rate": 5.324763553817054e-05, |
|
"loss": 0.0183, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.664085188770571, |
|
"grad_norm": 0.030287204310297966, |
|
"learning_rate": 5.2977334159461614e-05, |
|
"loss": 0.0235, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.6718296224588576, |
|
"grad_norm": 0.021120263263583183, |
|
"learning_rate": 5.270694542927088e-05, |
|
"loss": 0.0191, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.6718296224588576, |
|
"eval_loss": 0.015455065295100212, |
|
"eval_runtime": 4.8759, |
|
"eval_samples_per_second": 10.254, |
|
"eval_steps_per_second": 2.666, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.6795740561471444, |
|
"grad_norm": 0.022198256105184555, |
|
"learning_rate": 5.2436477280485605e-05, |
|
"loss": 0.017, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.6873184898354308, |
|
"grad_norm": 0.02474604733288288, |
|
"learning_rate": 5.216593764832311e-05, |
|
"loss": 0.0182, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.695062923523717, |
|
"grad_norm": 0.022626683115959167, |
|
"learning_rate": 5.189533447009794e-05, |
|
"loss": 0.0235, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.702807357212004, |
|
"grad_norm": 0.025306569412350655, |
|
"learning_rate": 5.162467568498903e-05, |
|
"loss": 0.0185, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.7105517909002903, |
|
"grad_norm": 0.01958346739411354, |
|
"learning_rate": 5.135396923380673e-05, |
|
"loss": 0.0139, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.7105517909002903, |
|
"eval_loss": 0.015315129421651363, |
|
"eval_runtime": 4.8828, |
|
"eval_samples_per_second": 10.24, |
|
"eval_steps_per_second": 2.662, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.718296224588577, |
|
"grad_norm": 0.028099266812205315, |
|
"learning_rate": 5.108322305875988e-05, |
|
"loss": 0.0151, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.7260406582768635, |
|
"grad_norm": 0.026032108813524246, |
|
"learning_rate": 5.081244510322274e-05, |
|
"loss": 0.0143, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.7337850919651503, |
|
"grad_norm": 0.030373040586709976, |
|
"learning_rate": 5.0541643311502e-05, |
|
"loss": 0.0177, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.7415295256534367, |
|
"grad_norm": 0.026800263673067093, |
|
"learning_rate": 5.027082562860368e-05, |
|
"loss": 0.0146, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.749273959341723, |
|
"grad_norm": 0.028782140463590622, |
|
"learning_rate": 5e-05, |
|
"loss": 0.026, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.749273959341723, |
|
"eval_loss": 0.015001767314970493, |
|
"eval_runtime": 4.8958, |
|
"eval_samples_per_second": 10.213, |
|
"eval_steps_per_second": 2.655, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.75701839303001, |
|
"grad_norm": 0.023763621225953102, |
|
"learning_rate": 4.9729174371396334e-05, |
|
"loss": 0.0138, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.764762826718296, |
|
"grad_norm": 0.02057846635580063, |
|
"learning_rate": 4.945835668849801e-05, |
|
"loss": 0.0101, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.7725072604065826, |
|
"grad_norm": 0.026699546724557877, |
|
"learning_rate": 4.9187554896777285e-05, |
|
"loss": 0.0185, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.7802516940948694, |
|
"grad_norm": 0.025631215423345566, |
|
"learning_rate": 4.8916776941240135e-05, |
|
"loss": 0.0177, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.7879961277831558, |
|
"grad_norm": 0.020701708272099495, |
|
"learning_rate": 4.8646030766193285e-05, |
|
"loss": 0.0162, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.7879961277831558, |
|
"eval_loss": 0.014788495376706123, |
|
"eval_runtime": 4.885, |
|
"eval_samples_per_second": 10.235, |
|
"eval_steps_per_second": 2.661, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.7957405614714426, |
|
"grad_norm": 0.018802624195814133, |
|
"learning_rate": 4.837532431501098e-05, |
|
"loss": 0.0195, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.803484995159729, |
|
"grad_norm": 0.024294838309288025, |
|
"learning_rate": 4.8104665529902075e-05, |
|
"loss": 0.0172, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.8112294288480157, |
|
"grad_norm": 0.02249518595635891, |
|
"learning_rate": 4.78340623516769e-05, |
|
"loss": 0.0157, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.818973862536302, |
|
"grad_norm": 0.022549943998456, |
|
"learning_rate": 4.756352271951441e-05, |
|
"loss": 0.0167, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.8267182962245885, |
|
"grad_norm": 0.03274448588490486, |
|
"learning_rate": 4.729305457072913e-05, |
|
"loss": 0.0258, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.8267182962245885, |
|
"eval_loss": 0.014879841357469559, |
|
"eval_runtime": 4.8948, |
|
"eval_samples_per_second": 10.215, |
|
"eval_steps_per_second": 2.656, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.8344627299128753, |
|
"grad_norm": 0.031107768416404724, |
|
"learning_rate": 4.70226658405384e-05, |
|
"loss": 0.0167, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.8422071636011617, |
|
"grad_norm": 0.023017307743430138, |
|
"learning_rate": 4.675236446182946e-05, |
|
"loss": 0.0126, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.849951597289448, |
|
"grad_norm": 0.03121495246887207, |
|
"learning_rate": 4.648215836492682e-05, |
|
"loss": 0.0139, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.857696030977735, |
|
"grad_norm": 0.026987893506884575, |
|
"learning_rate": 4.6212055477359486e-05, |
|
"loss": 0.0147, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.865440464666021, |
|
"grad_norm": 0.024263298138976097, |
|
"learning_rate": 4.594206372362845e-05, |
|
"loss": 0.0154, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.865440464666021, |
|
"eval_loss": 0.014814168214797974, |
|
"eval_runtime": 4.8923, |
|
"eval_samples_per_second": 10.22, |
|
"eval_steps_per_second": 2.657, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.8731848983543076, |
|
"grad_norm": 0.022974541410803795, |
|
"learning_rate": 4.567219102497412e-05, |
|
"loss": 0.0136, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.8809293320425944, |
|
"grad_norm": 0.025871610268950462, |
|
"learning_rate": 4.540244529914406e-05, |
|
"loss": 0.0126, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.888673765730881, |
|
"grad_norm": 0.026091424748301506, |
|
"learning_rate": 4.5132834460160524e-05, |
|
"loss": 0.023, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.8964181994191676, |
|
"grad_norm": 0.024125855416059494, |
|
"learning_rate": 4.486336641808835e-05, |
|
"loss": 0.0129, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.904162633107454, |
|
"grad_norm": 0.01973029226064682, |
|
"learning_rate": 4.4594049078802925e-05, |
|
"loss": 0.0166, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.904162633107454, |
|
"eval_loss": 0.01432761363685131, |
|
"eval_runtime": 4.8944, |
|
"eval_samples_per_second": 10.216, |
|
"eval_steps_per_second": 2.656, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.9119070667957407, |
|
"grad_norm": 0.022474128752946854, |
|
"learning_rate": 4.4324890343758136e-05, |
|
"loss": 0.0115, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.919651500484027, |
|
"grad_norm": 0.022197918966412544, |
|
"learning_rate": 4.405589810975468e-05, |
|
"loss": 0.0108, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.9273959341723135, |
|
"grad_norm": 0.023376472294330597, |
|
"learning_rate": 4.3787080268708244e-05, |
|
"loss": 0.0105, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.9351403678606003, |
|
"grad_norm": 0.01602279581129551, |
|
"learning_rate": 4.351844470741808e-05, |
|
"loss": 0.0094, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.9428848015488867, |
|
"grad_norm": 0.02684823051095009, |
|
"learning_rate": 4.3249999307335495e-05, |
|
"loss": 0.0189, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.9428848015488867, |
|
"eval_loss": 0.014240576885640621, |
|
"eval_runtime": 4.8817, |
|
"eval_samples_per_second": 10.242, |
|
"eval_steps_per_second": 2.663, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.950629235237173, |
|
"grad_norm": 0.0212652999907732, |
|
"learning_rate": 4.298175194433279e-05, |
|
"loss": 0.0154, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.95837366892546, |
|
"grad_norm": 0.019883181899785995, |
|
"learning_rate": 4.2713710488472006e-05, |
|
"loss": 0.0087, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.9661181026137466, |
|
"grad_norm": 0.02650902420282364, |
|
"learning_rate": 4.244588280377417e-05, |
|
"loss": 0.0164, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.973862536302033, |
|
"grad_norm": 0.02401239052414894, |
|
"learning_rate": 4.2178276747988446e-05, |
|
"loss": 0.0139, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.9816069699903194, |
|
"grad_norm": 0.022838260978460312, |
|
"learning_rate": 4.1910900172361764e-05, |
|
"loss": 0.0155, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.9816069699903194, |
|
"eval_loss": 0.0144858593121171, |
|
"eval_runtime": 4.8906, |
|
"eval_samples_per_second": 10.224, |
|
"eval_steps_per_second": 2.658, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.989351403678606, |
|
"grad_norm": 0.03657938912510872, |
|
"learning_rate": 4.164376092140828e-05, |
|
"loss": 0.0286, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.9970958373668926, |
|
"grad_norm": 0.02792074717581272, |
|
"learning_rate": 4.1376866832679385e-05, |
|
"loss": 0.014, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 3.004840271055179, |
|
"grad_norm": 0.05196017026901245, |
|
"learning_rate": 4.1110225736533664e-05, |
|
"loss": 0.0222, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 3.0125847047434657, |
|
"grad_norm": 0.0229202788323164, |
|
"learning_rate": 4.084384545590719e-05, |
|
"loss": 0.007, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 3.020329138431752, |
|
"grad_norm": 0.021996086463332176, |
|
"learning_rate": 4.057773380608411e-05, |
|
"loss": 0.0121, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.020329138431752, |
|
"eval_loss": 0.014621075242757797, |
|
"eval_runtime": 4.8766, |
|
"eval_samples_per_second": 10.253, |
|
"eval_steps_per_second": 2.666, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.028073572120039, |
|
"grad_norm": 0.024300433695316315, |
|
"learning_rate": 4.0311898594467086e-05, |
|
"loss": 0.0119, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 3.0358180058083253, |
|
"grad_norm": 0.023426620289683342, |
|
"learning_rate": 4.0046347620348586e-05, |
|
"loss": 0.0123, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 3.0435624394966116, |
|
"grad_norm": 0.024129556491971016, |
|
"learning_rate": 3.9781088674681764e-05, |
|
"loss": 0.0124, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 3.0513068731848985, |
|
"grad_norm": 0.04510955512523651, |
|
"learning_rate": 3.951612953985207e-05, |
|
"loss": 0.0174, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 3.059051306873185, |
|
"grad_norm": 0.02260909229516983, |
|
"learning_rate": 3.92514779894488e-05, |
|
"loss": 0.0122, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.059051306873185, |
|
"eval_loss": 0.014701277017593384, |
|
"eval_runtime": 4.8794, |
|
"eval_samples_per_second": 10.247, |
|
"eval_steps_per_second": 2.664, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.0667957405614716, |
|
"grad_norm": 0.020229579880833626, |
|
"learning_rate": 3.8987141788037154e-05, |
|
"loss": 0.0063, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 3.074540174249758, |
|
"grad_norm": 0.024916259571909904, |
|
"learning_rate": 3.8723128690930296e-05, |
|
"loss": 0.0099, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 3.0822846079380444, |
|
"grad_norm": 0.017238672822713852, |
|
"learning_rate": 3.8459446443961944e-05, |
|
"loss": 0.0071, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 3.090029041626331, |
|
"grad_norm": 0.028883591294288635, |
|
"learning_rate": 3.8196102783258994e-05, |
|
"loss": 0.0181, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 3.0977734753146176, |
|
"grad_norm": 0.025792468339204788, |
|
"learning_rate": 3.793310543501473e-05, |
|
"loss": 0.0136, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.0977734753146176, |
|
"eval_loss": 0.014834250323474407, |
|
"eval_runtime": 4.8859, |
|
"eval_samples_per_second": 10.234, |
|
"eval_steps_per_second": 2.661, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.1055179090029044, |
|
"grad_norm": 0.03113100863993168, |
|
"learning_rate": 3.7670462115261906e-05, |
|
"loss": 0.0193, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 3.1132623426911907, |
|
"grad_norm": 0.02263321541249752, |
|
"learning_rate": 3.7408180529646596e-05, |
|
"loss": 0.0123, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 3.121006776379477, |
|
"grad_norm": 0.023540707305073738, |
|
"learning_rate": 3.714626837320195e-05, |
|
"loss": 0.0119, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 3.128751210067764, |
|
"grad_norm": 0.031784623861312866, |
|
"learning_rate": 3.688473333012259e-05, |
|
"loss": 0.0175, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 3.1364956437560503, |
|
"grad_norm": 0.022701062262058258, |
|
"learning_rate": 3.6623583073538966e-05, |
|
"loss": 0.0107, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.1364956437560503, |
|
"eval_loss": 0.01486950647085905, |
|
"eval_runtime": 4.892, |
|
"eval_samples_per_second": 10.221, |
|
"eval_steps_per_second": 2.657, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.144240077444337, |
|
"grad_norm": 0.026784732937812805, |
|
"learning_rate": 3.636282526529242e-05, |
|
"loss": 0.0125, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 3.1519845111326235, |
|
"grad_norm": 0.026719210669398308, |
|
"learning_rate": 3.6102467555710295e-05, |
|
"loss": 0.0103, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 3.15972894482091, |
|
"grad_norm": 0.03489716723561287, |
|
"learning_rate": 3.584251758338151e-05, |
|
"loss": 0.0134, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 3.1674733785091966, |
|
"grad_norm": 0.02056041732430458, |
|
"learning_rate": 3.558298297493247e-05, |
|
"loss": 0.0073, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 3.175217812197483, |
|
"grad_norm": 0.030753985047340393, |
|
"learning_rate": 3.5323871344803263e-05, |
|
"loss": 0.0164, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.175217812197483, |
|
"eval_loss": 0.01476968638598919, |
|
"eval_runtime": 4.8875, |
|
"eval_samples_per_second": 10.23, |
|
"eval_steps_per_second": 2.66, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.1829622458857694, |
|
"grad_norm": 0.025167269632220268, |
|
"learning_rate": 3.506519029502433e-05, |
|
"loss": 0.0121, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 3.190706679574056, |
|
"grad_norm": 0.03184746950864792, |
|
"learning_rate": 3.480694741499334e-05, |
|
"loss": 0.0174, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 3.1984511132623425, |
|
"grad_norm": 0.014001097530126572, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.0057, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 3.2061955469506294, |
|
"grad_norm": 0.027478694915771484, |
|
"learning_rate": 3.4291806457266826e-05, |
|
"loss": 0.0138, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 3.2139399806389157, |
|
"grad_norm": 0.02516726590692997, |
|
"learning_rate": 3.403492349320101e-05, |
|
"loss": 0.0112, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.2139399806389157, |
|
"eval_loss": 0.014760926365852356, |
|
"eval_runtime": 4.876, |
|
"eval_samples_per_second": 10.254, |
|
"eval_steps_per_second": 2.666, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.2216844143272025, |
|
"grad_norm": 0.03305725008249283, |
|
"learning_rate": 3.3778508925699124e-05, |
|
"loss": 0.0256, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 3.229428848015489, |
|
"grad_norm": 0.024431169033050537, |
|
"learning_rate": 3.3522570277662985e-05, |
|
"loss": 0.0083, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 3.2371732817037753, |
|
"grad_norm": 0.03031334839761257, |
|
"learning_rate": 3.326711505803142e-05, |
|
"loss": 0.0107, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 3.244917715392062, |
|
"grad_norm": 0.033758629113435745, |
|
"learning_rate": 3.3012150761560085e-05, |
|
"loss": 0.0186, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 3.2526621490803485, |
|
"grad_norm": 0.02770036645233631, |
|
"learning_rate": 3.275768486860149e-05, |
|
"loss": 0.0097, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.2526621490803485, |
|
"eval_loss": 0.015088791027665138, |
|
"eval_runtime": 4.8982, |
|
"eval_samples_per_second": 10.208, |
|
"eval_steps_per_second": 2.654, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.260406582768635, |
|
"grad_norm": 0.02369946427643299, |
|
"learning_rate": 3.250372484488558e-05, |
|
"loss": 0.0094, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 3.2681510164569216, |
|
"grad_norm": 0.03576388210058212, |
|
"learning_rate": 3.225027814130074e-05, |
|
"loss": 0.0125, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 3.275895450145208, |
|
"grad_norm": 0.025971444323658943, |
|
"learning_rate": 3.199735219367507e-05, |
|
"loss": 0.0118, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 3.283639883833495, |
|
"grad_norm": 0.028038574382662773, |
|
"learning_rate": 3.174495442255836e-05, |
|
"loss": 0.0099, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 3.291384317521781, |
|
"grad_norm": 0.027834760025143623, |
|
"learning_rate": 3.149309223300428e-05, |
|
"loss": 0.0113, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.291384317521781, |
|
"eval_loss": 0.014965364709496498, |
|
"eval_runtime": 4.8786, |
|
"eval_samples_per_second": 10.249, |
|
"eval_steps_per_second": 2.665, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.299128751210068, |
|
"grad_norm": 0.023443985730409622, |
|
"learning_rate": 3.124177301435324e-05, |
|
"loss": 0.0132, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 3.3068731848983544, |
|
"grad_norm": 0.024410808458924294, |
|
"learning_rate": 3.09910041400154e-05, |
|
"loss": 0.0102, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 3.3146176185866407, |
|
"grad_norm": 0.032607510685920715, |
|
"learning_rate": 3.0740792967254604e-05, |
|
"loss": 0.0168, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 3.3223620522749275, |
|
"grad_norm": 0.03291484713554382, |
|
"learning_rate": 3.0491146836972272e-05, |
|
"loss": 0.019, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 3.330106485963214, |
|
"grad_norm": 0.03559967130422592, |
|
"learning_rate": 3.024207307349224e-05, |
|
"loss": 0.0303, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.330106485963214, |
|
"eval_loss": 0.014858649112284184, |
|
"eval_runtime": 4.8834, |
|
"eval_samples_per_second": 10.239, |
|
"eval_steps_per_second": 2.662, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.3378509196515003, |
|
"grad_norm": 0.02721838466823101, |
|
"learning_rate": 2.9993578984345672e-05, |
|
"loss": 0.0111, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 3.345595353339787, |
|
"grad_norm": 0.028012285009026527, |
|
"learning_rate": 2.9745671860056868e-05, |
|
"loss": 0.0136, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 3.3533397870280734, |
|
"grad_norm": 0.029208144173026085, |
|
"learning_rate": 2.9498358973929196e-05, |
|
"loss": 0.013, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 3.3610842207163603, |
|
"grad_norm": 0.031169850379228592, |
|
"learning_rate": 2.9251647581831836e-05, |
|
"loss": 0.0187, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 3.3688286544046466, |
|
"grad_norm": 0.03211589530110359, |
|
"learning_rate": 2.900554492198677e-05, |
|
"loss": 0.0161, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.3688286544046466, |
|
"eval_loss": 0.014618839137256145, |
|
"eval_runtime": 4.887, |
|
"eval_samples_per_second": 10.231, |
|
"eval_steps_per_second": 2.66, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.3765730880929334, |
|
"grad_norm": 0.0314168706536293, |
|
"learning_rate": 2.876005821475657e-05, |
|
"loss": 0.0106, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 3.38431752178122, |
|
"grad_norm": 0.03567107021808624, |
|
"learning_rate": 2.851519466243242e-05, |
|
"loss": 0.0173, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 3.392061955469506, |
|
"grad_norm": 0.031098151579499245, |
|
"learning_rate": 2.8270961449022893e-05, |
|
"loss": 0.0185, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 3.399806389157793, |
|
"grad_norm": 0.028943657875061035, |
|
"learning_rate": 2.802736574004319e-05, |
|
"loss": 0.0159, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 3.4075508228460794, |
|
"grad_norm": 0.023004574701189995, |
|
"learning_rate": 2.7784414682304832e-05, |
|
"loss": 0.011, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.4075508228460794, |
|
"eval_loss": 0.014386112801730633, |
|
"eval_runtime": 4.8818, |
|
"eval_samples_per_second": 10.242, |
|
"eval_steps_per_second": 2.663, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.4152952565343657, |
|
"grad_norm": 0.027619289234280586, |
|
"learning_rate": 2.7542115403706063e-05, |
|
"loss": 0.0089, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 3.4230396902226525, |
|
"grad_norm": 0.025844210758805275, |
|
"learning_rate": 2.7300475013022663e-05, |
|
"loss": 0.0127, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 3.430784123910939, |
|
"grad_norm": 0.01797422766685486, |
|
"learning_rate": 2.7059500599699476e-05, |
|
"loss": 0.0068, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 3.4385285575992257, |
|
"grad_norm": 0.031139735132455826, |
|
"learning_rate": 2.6819199233642278e-05, |
|
"loss": 0.0135, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 3.446272991287512, |
|
"grad_norm": 0.03126378357410431, |
|
"learning_rate": 2.65795779650105e-05, |
|
"loss": 0.0084, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.446272991287512, |
|
"eval_loss": 0.014389649964869022, |
|
"eval_runtime": 4.8893, |
|
"eval_samples_per_second": 10.226, |
|
"eval_steps_per_second": 2.659, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.454017424975799, |
|
"grad_norm": 0.019535277038812637, |
|
"learning_rate": 2.6340643824010247e-05, |
|
"loss": 0.0099, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 3.4617618586640853, |
|
"grad_norm": 0.029923155903816223, |
|
"learning_rate": 2.6102403820688177e-05, |
|
"loss": 0.0158, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 3.4695062923523716, |
|
"grad_norm": 0.023479627445340157, |
|
"learning_rate": 2.586486494472572e-05, |
|
"loss": 0.0066, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 3.4772507260406584, |
|
"grad_norm": 0.03173988685011864, |
|
"learning_rate": 2.562803416523405e-05, |
|
"loss": 0.01, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 3.484995159728945, |
|
"grad_norm": 0.03306049853563309, |
|
"learning_rate": 2.539191843054963e-05, |
|
"loss": 0.0127, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.484995159728945, |
|
"eval_loss": 0.014806166291236877, |
|
"eval_runtime": 4.9121, |
|
"eval_samples_per_second": 10.179, |
|
"eval_steps_per_second": 2.647, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.492739593417231, |
|
"grad_norm": 0.02089696377515793, |
|
"learning_rate": 2.51565246680304e-05, |
|
"loss": 0.0062, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 3.500484027105518, |
|
"grad_norm": 0.03812693804502487, |
|
"learning_rate": 2.4921859783852408e-05, |
|
"loss": 0.0116, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 3.5082284607938043, |
|
"grad_norm": 0.02929401397705078, |
|
"learning_rate": 2.4687930662807303e-05, |
|
"loss": 0.0136, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 3.515972894482091, |
|
"grad_norm": 0.024923592805862427, |
|
"learning_rate": 2.445474416810033e-05, |
|
"loss": 0.0094, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 3.5237173281703775, |
|
"grad_norm": 0.02743164636194706, |
|
"learning_rate": 2.422230714114891e-05, |
|
"loss": 0.0134, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.5237173281703775, |
|
"eval_loss": 0.01469426229596138, |
|
"eval_runtime": 4.8924, |
|
"eval_samples_per_second": 10.22, |
|
"eval_steps_per_second": 2.657, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.5314617618586643, |
|
"grad_norm": 0.04384300857782364, |
|
"learning_rate": 2.399062640138201e-05, |
|
"loss": 0.0233, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 3.5392061955469507, |
|
"grad_norm": 0.03357204422354698, |
|
"learning_rate": 2.3759708746039976e-05, |
|
"loss": 0.0177, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 3.546950629235237, |
|
"grad_norm": 0.03177043795585632, |
|
"learning_rate": 2.3529560949975182e-05, |
|
"loss": 0.0087, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 3.554695062923524, |
|
"grad_norm": 0.02979344129562378, |
|
"learning_rate": 2.3300189765453196e-05, |
|
"loss": 0.0082, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 3.5624394966118103, |
|
"grad_norm": 0.021871499717235565, |
|
"learning_rate": 2.3071601921954794e-05, |
|
"loss": 0.0092, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.5624394966118103, |
|
"eval_loss": 0.014372522011399269, |
|
"eval_runtime": 4.8873, |
|
"eval_samples_per_second": 10.231, |
|
"eval_steps_per_second": 2.66, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.5701839303000966, |
|
"grad_norm": 0.027945492416620255, |
|
"learning_rate": 2.2843804125978357e-05, |
|
"loss": 0.0164, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 3.5779283639883834, |
|
"grad_norm": 0.024697836488485336, |
|
"learning_rate": 2.2616803060843283e-05, |
|
"loss": 0.0074, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 3.58567279767667, |
|
"grad_norm": 0.024211924523115158, |
|
"learning_rate": 2.2390605386493757e-05, |
|
"loss": 0.0087, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 3.593417231364956, |
|
"grad_norm": 0.025920916348695755, |
|
"learning_rate": 2.2165217739303508e-05, |
|
"loss": 0.0128, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.601161665053243, |
|
"grad_norm": 0.027798939496278763, |
|
"learning_rate": 2.194064673188089e-05, |
|
"loss": 0.0205, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.601161665053243, |
|
"eval_loss": 0.014178312383592129, |
|
"eval_runtime": 4.8802, |
|
"eval_samples_per_second": 10.246, |
|
"eval_steps_per_second": 2.664, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.60890609874153, |
|
"grad_norm": 0.0249908696860075, |
|
"learning_rate": 2.171689895287513e-05, |
|
"loss": 0.0098, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 3.616650532429816, |
|
"grad_norm": 0.023805009201169014, |
|
"learning_rate": 2.149398096678283e-05, |
|
"loss": 0.0099, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 3.6243949661181025, |
|
"grad_norm": 0.030275024473667145, |
|
"learning_rate": 2.12718993137555e-05, |
|
"loss": 0.0201, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 3.6321393998063893, |
|
"grad_norm": 0.025657106190919876, |
|
"learning_rate": 2.105066050940758e-05, |
|
"loss": 0.0102, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 3.6398838334946757, |
|
"grad_norm": 0.02271328866481781, |
|
"learning_rate": 2.08302710446253e-05, |
|
"loss": 0.0097, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.6398838334946757, |
|
"eval_loss": 0.014142417348921299, |
|
"eval_runtime": 4.8856, |
|
"eval_samples_per_second": 10.234, |
|
"eval_steps_per_second": 2.661, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.647628267182962, |
|
"grad_norm": 0.026042208075523376, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.0177, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 3.655372700871249, |
|
"grad_norm": 0.021258225664496422, |
|
"learning_rate": 2.039206597252001e-05, |
|
"loss": 0.0065, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 3.6631171345595352, |
|
"grad_norm": 0.027606485411524773, |
|
"learning_rate": 2.0174263221618307e-05, |
|
"loss": 0.0127, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 3.6708615682478216, |
|
"grad_norm": 0.02728329971432686, |
|
"learning_rate": 1.9957335522747707e-05, |
|
"loss": 0.0123, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 3.6786060019361084, |
|
"grad_norm": 0.03719132021069527, |
|
"learning_rate": 1.9741289240311755e-05, |
|
"loss": 0.0158, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.6786060019361084, |
|
"eval_loss": 0.014227832667529583, |
|
"eval_runtime": 4.8832, |
|
"eval_samples_per_second": 10.239, |
|
"eval_steps_per_second": 2.662, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.6863504356243952, |
|
"grad_norm": 0.029825210571289062, |
|
"learning_rate": 1.9526130712854185e-05, |
|
"loss": 0.0128, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 3.6940948693126816, |
|
"grad_norm": 0.09481414407491684, |
|
"learning_rate": 1.931186625287313e-05, |
|
"loss": 0.0202, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 3.701839303000968, |
|
"grad_norm": 0.027814751490950584, |
|
"learning_rate": 1.909850214663575e-05, |
|
"loss": 0.0121, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 3.709583736689255, |
|
"grad_norm": 0.03036467730998993, |
|
"learning_rate": 1.8886044653993968e-05, |
|
"loss": 0.0163, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 3.717328170377541, |
|
"grad_norm": 0.023233845829963684, |
|
"learning_rate": 1.8674500008200674e-05, |
|
"loss": 0.0095, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.717328170377541, |
|
"eval_loss": 0.014037776738405228, |
|
"eval_runtime": 4.8827, |
|
"eval_samples_per_second": 10.24, |
|
"eval_steps_per_second": 2.662, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.7250726040658275, |
|
"grad_norm": 0.02636660821735859, |
|
"learning_rate": 1.8463874415726918e-05, |
|
"loss": 0.0125, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 3.7328170377541143, |
|
"grad_norm": 0.022603245452046394, |
|
"learning_rate": 1.82541740560798e-05, |
|
"loss": 0.0072, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 3.7405614714424007, |
|
"grad_norm": 0.019264785572886467, |
|
"learning_rate": 1.8045405081621215e-05, |
|
"loss": 0.0068, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 3.748305905130687, |
|
"grad_norm": 0.02744339220225811, |
|
"learning_rate": 1.7837573617387265e-05, |
|
"loss": 0.0139, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 3.756050338818974, |
|
"grad_norm": 0.032306037843227386, |
|
"learning_rate": 1.7630685760908622e-05, |
|
"loss": 0.0163, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.756050338818974, |
|
"eval_loss": 0.014070287346839905, |
|
"eval_runtime": 4.8787, |
|
"eval_samples_per_second": 10.249, |
|
"eval_steps_per_second": 2.665, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.7637947725072602, |
|
"grad_norm": 0.034267835319042206, |
|
"learning_rate": 1.7424747582031637e-05, |
|
"loss": 0.0145, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 3.771539206195547, |
|
"grad_norm": 0.02139255404472351, |
|
"learning_rate": 1.72197651227402e-05, |
|
"loss": 0.0084, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 3.7792836398838334, |
|
"grad_norm": 0.020995331928133965, |
|
"learning_rate": 1.7015744396978556e-05, |
|
"loss": 0.0065, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 3.7870280735721202, |
|
"grad_norm": 0.03288980573415756, |
|
"learning_rate": 1.6812691390474787e-05, |
|
"loss": 0.0175, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 3.7947725072604066, |
|
"grad_norm": 0.021166102960705757, |
|
"learning_rate": 1.6610612060565234e-05, |
|
"loss": 0.007, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.7947725072604066, |
|
"eval_loss": 0.014264380559325218, |
|
"eval_runtime": 4.8993, |
|
"eval_samples_per_second": 10.206, |
|
"eval_steps_per_second": 2.653, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.802516940948693, |
|
"grad_norm": 0.02033647708594799, |
|
"learning_rate": 1.64095123360197e-05, |
|
"loss": 0.0081, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 3.81026137463698, |
|
"grad_norm": 0.01951659470796585, |
|
"learning_rate": 1.6209398116867574e-05, |
|
"loss": 0.008, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 3.818005808325266, |
|
"grad_norm": 0.028182433918118477, |
|
"learning_rate": 1.6010275274224606e-05, |
|
"loss": 0.0143, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 3.8257502420135525, |
|
"grad_norm": 0.03811497241258621, |
|
"learning_rate": 1.5812149650120784e-05, |
|
"loss": 0.0139, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 3.8334946757018393, |
|
"grad_norm": 0.02721046842634678, |
|
"learning_rate": 1.561502705732883e-05, |
|
"loss": 0.0069, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.8334946757018393, |
|
"eval_loss": 0.014395428821444511, |
|
"eval_runtime": 4.885, |
|
"eval_samples_per_second": 10.235, |
|
"eval_steps_per_second": 2.661, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 3.8412391093901257, |
|
"grad_norm": 0.03506116569042206, |
|
"learning_rate": 1.5418913279193746e-05, |
|
"loss": 0.0154, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 3.8489835430784125, |
|
"grad_norm": 0.029712386429309845, |
|
"learning_rate": 1.5223814069463078e-05, |
|
"loss": 0.0074, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 3.856727976766699, |
|
"grad_norm": 0.021429866552352905, |
|
"learning_rate": 1.5029735152118124e-05, |
|
"loss": 0.0067, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 3.8644724104549857, |
|
"grad_norm": 0.024990901350975037, |
|
"learning_rate": 1.4836682221206e-05, |
|
"loss": 0.0089, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 3.872216844143272, |
|
"grad_norm": 0.0315503366291523, |
|
"learning_rate": 1.4644660940672627e-05, |
|
"loss": 0.012, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.872216844143272, |
|
"eval_loss": 0.014392802491784096, |
|
"eval_runtime": 4.8814, |
|
"eval_samples_per_second": 10.243, |
|
"eval_steps_per_second": 2.663, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 645, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.720511304678769e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|