{
"best_metric": 0.014392802491784096,
"best_model_checkpoint": "/home/paperspace/Data/models/dbischof_premise_aea/llm3br256/checkpoint-500",
"epoch": 3.872216844143272,
"eval_steps": 5,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007744433688286544,
"grad_norm": 0.28707125782966614,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.0847,
"step": 1
},
{
"epoch": 0.015488867376573089,
"grad_norm": 0.34009915590286255,
"learning_rate": 3.0769230769230774e-06,
"loss": 0.0928,
"step": 2
},
{
"epoch": 0.023233301064859633,
"grad_norm": 0.29313409328460693,
"learning_rate": 4.615384615384616e-06,
"loss": 0.0934,
"step": 3
},
{
"epoch": 0.030977734753146177,
"grad_norm": 0.2913404107093811,
"learning_rate": 6.153846153846155e-06,
"loss": 0.0913,
"step": 4
},
{
"epoch": 0.03872216844143272,
"grad_norm": 0.29106780886650085,
"learning_rate": 7.692307692307694e-06,
"loss": 0.095,
"step": 5
},
{
"epoch": 0.03872216844143272,
"eval_loss": 0.07727333903312683,
"eval_runtime": 5.9343,
"eval_samples_per_second": 8.426,
"eval_steps_per_second": 2.191,
"step": 5
},
{
"epoch": 0.046466602129719266,
"grad_norm": 0.23025450110435486,
"learning_rate": 9.230769230769232e-06,
"loss": 0.0948,
"step": 6
},
{
"epoch": 0.05421103581800581,
"grad_norm": 0.21704453229904175,
"learning_rate": 1.0769230769230771e-05,
"loss": 0.0727,
"step": 7
},
{
"epoch": 0.061955469506292354,
"grad_norm": 0.17385561764240265,
"learning_rate": 1.230769230769231e-05,
"loss": 0.0689,
"step": 8
},
{
"epoch": 0.0696999031945789,
"grad_norm": 0.15649482607841492,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.0604,
"step": 9
},
{
"epoch": 0.07744433688286544,
"grad_norm": 0.11710207164287567,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.0562,
"step": 10
},
{
"epoch": 0.07744433688286544,
"eval_loss": 0.04053657874464989,
"eval_runtime": 4.8954,
"eval_samples_per_second": 10.214,
"eval_steps_per_second": 2.656,
"step": 10
},
{
"epoch": 0.08518877057115198,
"grad_norm": 0.09721983969211578,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.0393,
"step": 11
},
{
"epoch": 0.09293320425943853,
"grad_norm": 0.09856045991182327,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.0404,
"step": 12
},
{
"epoch": 0.10067763794772508,
"grad_norm": 0.11793606728315353,
"learning_rate": 2e-05,
"loss": 0.0455,
"step": 13
},
{
"epoch": 0.10842207163601161,
"grad_norm": 0.11285863816738129,
"learning_rate": 2.1538461538461542e-05,
"loss": 0.059,
"step": 14
},
{
"epoch": 0.11616650532429816,
"grad_norm": 0.08813278377056122,
"learning_rate": 2.307692307692308e-05,
"loss": 0.032,
"step": 15
},
{
"epoch": 0.11616650532429816,
"eval_loss": 0.03360835462808609,
"eval_runtime": 4.8812,
"eval_samples_per_second": 10.243,
"eval_steps_per_second": 2.663,
"step": 15
},
{
"epoch": 0.12391093901258471,
"grad_norm": 0.06082022562623024,
"learning_rate": 2.461538461538462e-05,
"loss": 0.0419,
"step": 16
},
{
"epoch": 0.13165537270087124,
"grad_norm": 0.055546533316373825,
"learning_rate": 2.6153846153846157e-05,
"loss": 0.0452,
"step": 17
},
{
"epoch": 0.1393998063891578,
"grad_norm": 0.0525379441678524,
"learning_rate": 2.7692307692307694e-05,
"loss": 0.0329,
"step": 18
},
{
"epoch": 0.14714424007744434,
"grad_norm": 0.058248624205589294,
"learning_rate": 2.9230769230769234e-05,
"loss": 0.0338,
"step": 19
},
{
"epoch": 0.15488867376573087,
"grad_norm": 0.057563405483961105,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.0488,
"step": 20
},
{
"epoch": 0.15488867376573087,
"eval_loss": 0.031162459403276443,
"eval_runtime": 4.9017,
"eval_samples_per_second": 10.201,
"eval_steps_per_second": 2.652,
"step": 20
},
{
"epoch": 0.16263310745401743,
"grad_norm": 0.04852646589279175,
"learning_rate": 3.230769230769231e-05,
"loss": 0.0364,
"step": 21
},
{
"epoch": 0.17037754114230397,
"grad_norm": 0.05401140823960304,
"learning_rate": 3.384615384615385e-05,
"loss": 0.0446,
"step": 22
},
{
"epoch": 0.1781219748305905,
"grad_norm": 0.0492316372692585,
"learning_rate": 3.538461538461539e-05,
"loss": 0.0407,
"step": 23
},
{
"epoch": 0.18586640851887706,
"grad_norm": 0.037774790078401566,
"learning_rate": 3.692307692307693e-05,
"loss": 0.0315,
"step": 24
},
{
"epoch": 0.1936108422071636,
"grad_norm": 0.04360613971948624,
"learning_rate": 3.846153846153846e-05,
"loss": 0.0331,
"step": 25
},
{
"epoch": 0.1936108422071636,
"eval_loss": 0.02766346000134945,
"eval_runtime": 4.8772,
"eval_samples_per_second": 10.252,
"eval_steps_per_second": 2.665,
"step": 25
},
{
"epoch": 0.20135527589545016,
"grad_norm": 0.037237901240587234,
"learning_rate": 4e-05,
"loss": 0.0259,
"step": 26
},
{
"epoch": 0.2090997095837367,
"grad_norm": 0.03505983576178551,
"learning_rate": 4.1538461538461544e-05,
"loss": 0.0303,
"step": 27
},
{
"epoch": 0.21684414327202323,
"grad_norm": 0.041253913193941116,
"learning_rate": 4.3076923076923084e-05,
"loss": 0.0453,
"step": 28
},
{
"epoch": 0.2245885769603098,
"grad_norm": 0.04072079062461853,
"learning_rate": 4.461538461538462e-05,
"loss": 0.0316,
"step": 29
},
{
"epoch": 0.23233301064859632,
"grad_norm": 0.03738202154636383,
"learning_rate": 4.615384615384616e-05,
"loss": 0.0377,
"step": 30
},
{
"epoch": 0.23233301064859632,
"eval_loss": 0.025424109771847725,
"eval_runtime": 4.8765,
"eval_samples_per_second": 10.253,
"eval_steps_per_second": 2.666,
"step": 30
},
{
"epoch": 0.24007744433688286,
"grad_norm": 0.03633822873234749,
"learning_rate": 4.76923076923077e-05,
"loss": 0.0369,
"step": 31
},
{
"epoch": 0.24782187802516942,
"grad_norm": 0.03256253898143768,
"learning_rate": 4.923076923076924e-05,
"loss": 0.0349,
"step": 32
},
{
"epoch": 0.25556631171345595,
"grad_norm": 0.031838804483413696,
"learning_rate": 5.0769230769230766e-05,
"loss": 0.0283,
"step": 33
},
{
"epoch": 0.2633107454017425,
"grad_norm": 0.026707077398896217,
"learning_rate": 5.230769230769231e-05,
"loss": 0.0283,
"step": 34
},
{
"epoch": 0.271055179090029,
"grad_norm": 0.03254338726401329,
"learning_rate": 5.384615384615385e-05,
"loss": 0.0316,
"step": 35
},
{
"epoch": 0.271055179090029,
"eval_loss": 0.024270590394735336,
"eval_runtime": 4.8832,
"eval_samples_per_second": 10.239,
"eval_steps_per_second": 2.662,
"step": 35
},
{
"epoch": 0.2787996127783156,
"grad_norm": 0.030620776116847992,
"learning_rate": 5.538461538461539e-05,
"loss": 0.0306,
"step": 36
},
{
"epoch": 0.28654404646660214,
"grad_norm": 0.03317311033606529,
"learning_rate": 5.692307692307692e-05,
"loss": 0.0293,
"step": 37
},
{
"epoch": 0.2942884801548887,
"grad_norm": 0.026506489142775536,
"learning_rate": 5.846153846153847e-05,
"loss": 0.0293,
"step": 38
},
{
"epoch": 0.3020329138431752,
"grad_norm": 0.023665621876716614,
"learning_rate": 6e-05,
"loss": 0.0166,
"step": 39
},
{
"epoch": 0.30977734753146174,
"grad_norm": 0.03278828039765358,
"learning_rate": 6.153846153846155e-05,
"loss": 0.0374,
"step": 40
},
{
"epoch": 0.30977734753146174,
"eval_loss": 0.023048410192131996,
"eval_runtime": 4.885,
"eval_samples_per_second": 10.235,
"eval_steps_per_second": 2.661,
"step": 40
},
{
"epoch": 0.31752178121974833,
"grad_norm": 0.03030160255730152,
"learning_rate": 6.307692307692308e-05,
"loss": 0.0334,
"step": 41
},
{
"epoch": 0.32526621490803487,
"grad_norm": 0.03384114429354668,
"learning_rate": 6.461538461538462e-05,
"loss": 0.0212,
"step": 42
},
{
"epoch": 0.3330106485963214,
"grad_norm": 0.02560395933687687,
"learning_rate": 6.615384615384616e-05,
"loss": 0.0363,
"step": 43
},
{
"epoch": 0.34075508228460794,
"grad_norm": 0.026470044627785683,
"learning_rate": 6.76923076923077e-05,
"loss": 0.024,
"step": 44
},
{
"epoch": 0.34849951597289447,
"grad_norm": 0.023488877341151237,
"learning_rate": 6.923076923076924e-05,
"loss": 0.0208,
"step": 45
},
{
"epoch": 0.34849951597289447,
"eval_loss": 0.022530335932970047,
"eval_runtime": 4.8759,
"eval_samples_per_second": 10.255,
"eval_steps_per_second": 2.666,
"step": 45
},
{
"epoch": 0.356243949661181,
"grad_norm": 0.029532263055443764,
"learning_rate": 7.076923076923078e-05,
"loss": 0.0399,
"step": 46
},
{
"epoch": 0.3639883833494676,
"grad_norm": 0.025283565744757652,
"learning_rate": 7.23076923076923e-05,
"loss": 0.033,
"step": 47
},
{
"epoch": 0.3717328170377541,
"grad_norm": 0.024645334109663963,
"learning_rate": 7.384615384615386e-05,
"loss": 0.0431,
"step": 48
},
{
"epoch": 0.37947725072604066,
"grad_norm": 0.025530191138386726,
"learning_rate": 7.538461538461539e-05,
"loss": 0.0321,
"step": 49
},
{
"epoch": 0.3872216844143272,
"grad_norm": 0.02383197844028473,
"learning_rate": 7.692307692307693e-05,
"loss": 0.0305,
"step": 50
},
{
"epoch": 0.3872216844143272,
"eval_loss": 0.021847765892744064,
"eval_runtime": 4.8901,
"eval_samples_per_second": 10.225,
"eval_steps_per_second": 2.658,
"step": 50
},
{
"epoch": 0.39496611810261373,
"grad_norm": 0.02661319635808468,
"learning_rate": 7.846153846153847e-05,
"loss": 0.0312,
"step": 51
},
{
"epoch": 0.4027105517909003,
"grad_norm": 0.029026813805103302,
"learning_rate": 8e-05,
"loss": 0.0202,
"step": 52
},
{
"epoch": 0.41045498547918685,
"grad_norm": 0.03153839334845543,
"learning_rate": 8.153846153846155e-05,
"loss": 0.0322,
"step": 53
},
{
"epoch": 0.4181994191674734,
"grad_norm": 0.027100125327706337,
"learning_rate": 8.307692307692309e-05,
"loss": 0.0217,
"step": 54
},
{
"epoch": 0.4259438528557599,
"grad_norm": 0.034204043447971344,
"learning_rate": 8.461538461538461e-05,
"loss": 0.0238,
"step": 55
},
{
"epoch": 0.4259438528557599,
"eval_loss": 0.021218011155724525,
"eval_runtime": 4.895,
"eval_samples_per_second": 10.215,
"eval_steps_per_second": 2.656,
"step": 55
},
{
"epoch": 0.43368828654404645,
"grad_norm": 0.026411807164549828,
"learning_rate": 8.615384615384617e-05,
"loss": 0.0264,
"step": 56
},
{
"epoch": 0.441432720232333,
"grad_norm": 0.025747094303369522,
"learning_rate": 8.76923076923077e-05,
"loss": 0.0231,
"step": 57
},
{
"epoch": 0.4491771539206196,
"grad_norm": 0.028047436848282814,
"learning_rate": 8.923076923076924e-05,
"loss": 0.0269,
"step": 58
},
{
"epoch": 0.4569215876089061,
"grad_norm": 0.03033887967467308,
"learning_rate": 9.076923076923078e-05,
"loss": 0.0286,
"step": 59
},
{
"epoch": 0.46466602129719264,
"grad_norm": 0.024372393265366554,
"learning_rate": 9.230769230769232e-05,
"loss": 0.0278,
"step": 60
},
{
"epoch": 0.46466602129719264,
"eval_loss": 0.020728331059217453,
"eval_runtime": 4.8702,
"eval_samples_per_second": 10.266,
"eval_steps_per_second": 2.669,
"step": 60
},
{
"epoch": 0.4724104549854792,
"grad_norm": 0.028278978541493416,
"learning_rate": 9.384615384615386e-05,
"loss": 0.0247,
"step": 61
},
{
"epoch": 0.4801548886737657,
"grad_norm": 0.03280925378203392,
"learning_rate": 9.53846153846154e-05,
"loss": 0.026,
"step": 62
},
{
"epoch": 0.4878993223620523,
"grad_norm": 0.023919392377138138,
"learning_rate": 9.692307692307692e-05,
"loss": 0.0312,
"step": 63
},
{
"epoch": 0.49564375605033884,
"grad_norm": 0.0364394448697567,
"learning_rate": 9.846153846153848e-05,
"loss": 0.0219,
"step": 64
},
{
"epoch": 0.5033881897386253,
"grad_norm": 0.02771547995507717,
"learning_rate": 0.0001,
"loss": 0.0199,
"step": 65
},
{
"epoch": 0.5033881897386253,
"eval_loss": 0.02000207081437111,
"eval_runtime": 4.8908,
"eval_samples_per_second": 10.223,
"eval_steps_per_second": 2.658,
"step": 65
},
{
"epoch": 0.5111326234269119,
"grad_norm": 0.02505766600370407,
"learning_rate": 9.999926652940913e-05,
"loss": 0.0206,
"step": 66
},
{
"epoch": 0.5188770571151985,
"grad_norm": 0.037389349192380905,
"learning_rate": 9.999706613915566e-05,
"loss": 0.0265,
"step": 67
},
{
"epoch": 0.526621490803485,
"grad_norm": 0.03750506415963173,
"learning_rate": 9.999339889379647e-05,
"loss": 0.0236,
"step": 68
},
{
"epoch": 0.5343659244917716,
"grad_norm": 0.028572333976626396,
"learning_rate": 9.998826490092421e-05,
"loss": 0.0236,
"step": 69
},
{
"epoch": 0.542110358180058,
"grad_norm": 0.024309856817126274,
"learning_rate": 9.99816643111642e-05,
"loss": 0.0235,
"step": 70
},
{
"epoch": 0.542110358180058,
"eval_loss": 0.02025166153907776,
"eval_runtime": 4.8811,
"eval_samples_per_second": 10.244,
"eval_steps_per_second": 2.663,
"step": 70
},
{
"epoch": 0.5498547918683446,
"grad_norm": 0.035883497446775436,
"learning_rate": 9.997359731816998e-05,
"loss": 0.0289,
"step": 71
},
{
"epoch": 0.5575992255566312,
"grad_norm": 0.034139424562454224,
"learning_rate": 9.996406415861763e-05,
"loss": 0.0366,
"step": 72
},
{
"epoch": 0.5653436592449177,
"grad_norm": 0.02562110312283039,
"learning_rate": 9.995306511219885e-05,
"loss": 0.0336,
"step": 73
},
{
"epoch": 0.5730880929332043,
"grad_norm": 0.026915963739156723,
"learning_rate": 9.994060050161269e-05,
"loss": 0.0193,
"step": 74
},
{
"epoch": 0.5808325266214908,
"grad_norm": 0.02748969756066799,
"learning_rate": 9.992667069255619e-05,
"loss": 0.0213,
"step": 75
},
{
"epoch": 0.5808325266214908,
"eval_loss": 0.019886016845703125,
"eval_runtime": 4.8762,
"eval_samples_per_second": 10.254,
"eval_steps_per_second": 2.666,
"step": 75
},
{
"epoch": 0.5885769603097774,
"grad_norm": 0.0281902477145195,
"learning_rate": 9.991127609371356e-05,
"loss": 0.0333,
"step": 76
},
{
"epoch": 0.5963213939980639,
"grad_norm": 0.032518427819013596,
"learning_rate": 9.989441715674422e-05,
"loss": 0.0296,
"step": 77
},
{
"epoch": 0.6040658276863504,
"grad_norm": 0.0259566493332386,
"learning_rate": 9.987609437626955e-05,
"loss": 0.0282,
"step": 78
},
{
"epoch": 0.611810261374637,
"grad_norm": 0.029854053631424904,
"learning_rate": 9.985630828985835e-05,
"loss": 0.0205,
"step": 79
},
{
"epoch": 0.6195546950629235,
"grad_norm": 0.03595299273729324,
"learning_rate": 9.983505947801115e-05,
"loss": 0.044,
"step": 80
},
{
"epoch": 0.6195546950629235,
"eval_loss": 0.01953260228037834,
"eval_runtime": 4.8809,
"eval_samples_per_second": 10.244,
"eval_steps_per_second": 2.663,
"step": 80
},
{
"epoch": 0.6272991287512101,
"grad_norm": 0.02581968903541565,
"learning_rate": 9.981234856414307e-05,
"loss": 0.0265,
"step": 81
},
{
"epoch": 0.6350435624394967,
"grad_norm": 0.02523561753332615,
"learning_rate": 9.978817621456562e-05,
"loss": 0.0232,
"step": 82
},
{
"epoch": 0.6427879961277831,
"grad_norm": 0.022955749183893204,
"learning_rate": 9.97625431384671e-05,
"loss": 0.0267,
"step": 83
},
{
"epoch": 0.6505324298160697,
"grad_norm": 0.0209239199757576,
"learning_rate": 9.973545008789181e-05,
"loss": 0.0303,
"step": 84
},
{
"epoch": 0.6582768635043562,
"grad_norm": 0.028582807630300522,
"learning_rate": 9.970689785771798e-05,
"loss": 0.021,
"step": 85
},
{
"epoch": 0.6582768635043562,
"eval_loss": 0.019236262887716293,
"eval_runtime": 4.874,
"eval_samples_per_second": 10.258,
"eval_steps_per_second": 2.667,
"step": 85
},
{
"epoch": 0.6660212971926428,
"grad_norm": 0.02616284228861332,
"learning_rate": 9.967688728563446e-05,
"loss": 0.0176,
"step": 86
},
{
"epoch": 0.6737657308809293,
"grad_norm": 0.029908856377005577,
"learning_rate": 9.964541925211612e-05,
"loss": 0.0206,
"step": 87
},
{
"epoch": 0.6815101645692159,
"grad_norm": 0.03139350563287735,
"learning_rate": 9.961249468039807e-05,
"loss": 0.0301,
"step": 88
},
{
"epoch": 0.6892545982575025,
"grad_norm": 0.025906842201948166,
"learning_rate": 9.957811453644847e-05,
"loss": 0.0192,
"step": 89
},
{
"epoch": 0.6969990319457889,
"grad_norm": 0.0281496811658144,
"learning_rate": 9.954227982894034e-05,
"loss": 0.0296,
"step": 90
},
{
"epoch": 0.6969990319457889,
"eval_loss": 0.019074302166700363,
"eval_runtime": 4.8832,
"eval_samples_per_second": 10.239,
"eval_steps_per_second": 2.662,
"step": 90
},
{
"epoch": 0.7047434656340755,
"grad_norm": 0.027965204790234566,
"learning_rate": 9.950499160922183e-05,
"loss": 0.0213,
"step": 91
},
{
"epoch": 0.712487899322362,
"grad_norm": 0.02602163329720497,
"learning_rate": 9.946625097128543e-05,
"loss": 0.0269,
"step": 92
},
{
"epoch": 0.7202323330106486,
"grad_norm": 0.028190776705741882,
"learning_rate": 9.942605905173592e-05,
"loss": 0.0207,
"step": 93
},
{
"epoch": 0.7279767666989352,
"grad_norm": 0.025893300771713257,
"learning_rate": 9.938441702975689e-05,
"loss": 0.0265,
"step": 94
},
{
"epoch": 0.7357212003872217,
"grad_norm": 0.0202568881213665,
"learning_rate": 9.934132612707632e-05,
"loss": 0.0141,
"step": 95
},
{
"epoch": 0.7357212003872217,
"eval_loss": 0.018998095765709877,
"eval_runtime": 4.8865,
"eval_samples_per_second": 10.232,
"eval_steps_per_second": 2.66,
"step": 95
},
{
"epoch": 0.7434656340755083,
"grad_norm": 0.03151071444153786,
"learning_rate": 9.929678760793057e-05,
"loss": 0.028,
"step": 96
},
{
"epoch": 0.7512100677637947,
"grad_norm": 0.037441398948431015,
"learning_rate": 9.925080277902743e-05,
"loss": 0.0275,
"step": 97
},
{
"epoch": 0.7589545014520813,
"grad_norm": 0.022733572870492935,
"learning_rate": 9.920337298950765e-05,
"loss": 0.0227,
"step": 98
},
{
"epoch": 0.7666989351403679,
"grad_norm": 0.021637218073010445,
"learning_rate": 9.91544996309055e-05,
"loss": 0.0179,
"step": 99
},
{
"epoch": 0.7744433688286544,
"grad_norm": 0.023374751210212708,
"learning_rate": 9.91041841371078e-05,
"loss": 0.0289,
"step": 100
},
{
"epoch": 0.7744433688286544,
"eval_loss": 0.01871725358068943,
"eval_runtime": 4.9046,
"eval_samples_per_second": 10.195,
"eval_steps_per_second": 2.651,
"step": 100
},
{
"epoch": 0.782187802516941,
"grad_norm": 0.021633530035614967,
"learning_rate": 9.905242798431196e-05,
"loss": 0.0267,
"step": 101
},
{
"epoch": 0.7899322362052275,
"grad_norm": 0.024837492033839226,
"learning_rate": 9.899923269098262e-05,
"loss": 0.0341,
"step": 102
},
{
"epoch": 0.797676669893514,
"grad_norm": 0.023348737508058548,
"learning_rate": 9.894459981780711e-05,
"loss": 0.0263,
"step": 103
},
{
"epoch": 0.8054211035818006,
"grad_norm": 0.02404264733195305,
"learning_rate": 9.888853096764964e-05,
"loss": 0.0214,
"step": 104
},
{
"epoch": 0.8131655372700871,
"grad_norm": 0.02434077486395836,
"learning_rate": 9.883102778550434e-05,
"loss": 0.0159,
"step": 105
},
{
"epoch": 0.8131655372700871,
"eval_loss": 0.01875956915318966,
"eval_runtime": 4.887,
"eval_samples_per_second": 10.231,
"eval_steps_per_second": 2.66,
"step": 105
},
{
"epoch": 0.8209099709583737,
"grad_norm": 0.023013584315776825,
"learning_rate": 9.877209195844692e-05,
"loss": 0.0266,
"step": 106
},
{
"epoch": 0.8286544046466602,
"grad_norm": 0.03137190267443657,
"learning_rate": 9.871172521558523e-05,
"loss": 0.0242,
"step": 107
},
{
"epoch": 0.8363988383349468,
"grad_norm": 0.023217204958200455,
"learning_rate": 9.864992932800845e-05,
"loss": 0.0254,
"step": 108
},
{
"epoch": 0.8441432720232332,
"grad_norm": 0.027811044827103615,
"learning_rate": 9.858670610873528e-05,
"loss": 0.0173,
"step": 109
},
{
"epoch": 0.8518877057115198,
"grad_norm": 0.027365995571017265,
"learning_rate": 9.852205741266058e-05,
"loss": 0.0275,
"step": 110
},
{
"epoch": 0.8518877057115198,
"eval_loss": 0.01876773312687874,
"eval_runtime": 4.8844,
"eval_samples_per_second": 10.237,
"eval_steps_per_second": 2.662,
"step": 110
},
{
"epoch": 0.8596321393998064,
"grad_norm": 0.022870220243930817,
"learning_rate": 9.845598513650103e-05,
"loss": 0.0175,
"step": 111
},
{
"epoch": 0.8673765730880929,
"grad_norm": 0.021480288356542587,
"learning_rate": 9.838849121873949e-05,
"loss": 0.0179,
"step": 112
},
{
"epoch": 0.8751210067763795,
"grad_norm": 0.025231841951608658,
"learning_rate": 9.831957763956813e-05,
"loss": 0.0182,
"step": 113
},
{
"epoch": 0.882865440464666,
"grad_norm": 0.023175878450274467,
"learning_rate": 9.824924642083026e-05,
"loss": 0.0167,
"step": 114
},
{
"epoch": 0.8906098741529526,
"grad_norm": 0.02536984719336033,
"learning_rate": 9.817749962596115e-05,
"loss": 0.0271,
"step": 115
},
{
"epoch": 0.8906098741529526,
"eval_loss": 0.018538037315011024,
"eval_runtime": 4.8812,
"eval_samples_per_second": 10.243,
"eval_steps_per_second": 2.663,
"step": 115
},
{
"epoch": 0.8983543078412392,
"grad_norm": 0.02080857753753662,
"learning_rate": 9.810433935992733e-05,
"loss": 0.0254,
"step": 116
},
{
"epoch": 0.9060987415295256,
"grad_norm": 0.026430707424879074,
"learning_rate": 9.802976776916494e-05,
"loss": 0.0185,
"step": 117
},
{
"epoch": 0.9138431752178122,
"grad_norm": 0.02291349321603775,
"learning_rate": 9.795378704151675e-05,
"loss": 0.0164,
"step": 118
},
{
"epoch": 0.9215876089060987,
"grad_norm": 0.02319083735346794,
"learning_rate": 9.787639940616788e-05,
"loss": 0.0237,
"step": 119
},
{
"epoch": 0.9293320425943853,
"grad_norm": 0.027965422719717026,
"learning_rate": 9.779760713358059e-05,
"loss": 0.0262,
"step": 120
},
{
"epoch": 0.9293320425943853,
"eval_loss": 0.018477478995919228,
"eval_runtime": 4.8802,
"eval_samples_per_second": 10.246,
"eval_steps_per_second": 2.664,
"step": 120
},
{
"epoch": 0.9370764762826719,
"grad_norm": 0.023768456652760506,
"learning_rate": 9.771741253542741e-05,
"loss": 0.0186,
"step": 121
},
{
"epoch": 0.9448209099709584,
"grad_norm": 0.01906961388885975,
"learning_rate": 9.763581796452353e-05,
"loss": 0.0163,
"step": 122
},
{
"epoch": 0.952565343659245,
"grad_norm": 0.022706998512148857,
"learning_rate": 9.755282581475769e-05,
"loss": 0.0253,
"step": 123
},
{
"epoch": 0.9603097773475314,
"grad_norm": 0.02551465854048729,
"learning_rate": 9.74684385210219e-05,
"loss": 0.0163,
"step": 124
},
{
"epoch": 0.968054211035818,
"grad_norm": 0.02145274542272091,
"learning_rate": 9.738265855914013e-05,
"loss": 0.0299,
"step": 125
},
{
"epoch": 0.968054211035818,
"eval_loss": 0.01828974299132824,
"eval_runtime": 4.8759,
"eval_samples_per_second": 10.254,
"eval_steps_per_second": 2.666,
"step": 125
},
{
"epoch": 0.9757986447241046,
"grad_norm": 0.023152988404035568,
"learning_rate": 9.729548844579552e-05,
"loss": 0.0178,
"step": 126
},
{
"epoch": 0.9835430784123911,
"grad_norm": 0.026649784296751022,
"learning_rate": 9.720693073845667e-05,
"loss": 0.024,
"step": 127
},
{
"epoch": 0.9912875121006777,
"grad_norm": 0.020236071199178696,
"learning_rate": 9.711698803530254e-05,
"loss": 0.0301,
"step": 128
},
{
"epoch": 0.9990319457889641,
"grad_norm": 0.027533914893865585,
"learning_rate": 9.70256629751462e-05,
"loss": 0.0195,
"step": 129
},
{
"epoch": 1.0067763794772506,
"grad_norm": 0.053280122578144073,
"learning_rate": 9.693295823735753e-05,
"loss": 0.0315,
"step": 130
},
{
"epoch": 1.0067763794772506,
"eval_loss": 0.018024258315563202,
"eval_runtime": 4.8931,
"eval_samples_per_second": 10.219,
"eval_steps_per_second": 2.657,
"step": 130
},
{
"epoch": 1.0145208131655372,
"grad_norm": 0.01893387921154499,
"learning_rate": 9.683887654178445e-05,
"loss": 0.0226,
"step": 131
},
{
"epoch": 1.0222652468538238,
"grad_norm": 0.029532097280025482,
"learning_rate": 9.674342064867326e-05,
"loss": 0.0145,
"step": 132
},
{
"epoch": 1.0300096805421104,
"grad_norm": 0.028108367696404457,
"learning_rate": 9.664659335858755e-05,
"loss": 0.0148,
"step": 133
},
{
"epoch": 1.037754114230397,
"grad_norm": 0.025696909055113792,
"learning_rate": 9.654839751232611e-05,
"loss": 0.0198,
"step": 134
},
{
"epoch": 1.0454985479186834,
"grad_norm": 0.02809828147292137,
"learning_rate": 9.644883599083958e-05,
"loss": 0.0212,
"step": 135
},
{
"epoch": 1.0454985479186834,
"eval_loss": 0.017997030168771744,
"eval_runtime": 4.8817,
"eval_samples_per_second": 10.242,
"eval_steps_per_second": 2.663,
"step": 135
},
{
"epoch": 1.05324298160697,
"grad_norm": 0.023596247658133507,
"learning_rate": 9.634791171514585e-05,
"loss": 0.027,
"step": 136
},
{
"epoch": 1.0609874152952565,
"grad_norm": 0.032478995621204376,
"learning_rate": 9.624562764624445e-05,
"loss": 0.0231,
"step": 137
},
{
"epoch": 1.0687318489835431,
"grad_norm": 0.029977047815918922,
"learning_rate": 9.614198678502965e-05,
"loss": 0.0139,
"step": 138
},
{
"epoch": 1.0764762826718297,
"grad_norm": 0.03173111006617546,
"learning_rate": 9.603699217220239e-05,
"loss": 0.0188,
"step": 139
},
{
"epoch": 1.084220716360116,
"grad_norm": 0.02266346476972103,
"learning_rate": 9.59306468881811e-05,
"loss": 0.0172,
"step": 140
},
{
"epoch": 1.084220716360116,
"eval_loss": 0.018361272290349007,
"eval_runtime": 4.8948,
"eval_samples_per_second": 10.215,
"eval_steps_per_second": 2.656,
"step": 140
},
{
"epoch": 1.0919651500484027,
"grad_norm": 0.03363156318664551,
"learning_rate": 9.582295405301131e-05,
"loss": 0.0202,
"step": 141
},
{
"epoch": 1.0997095837366893,
"grad_norm": 0.03840557113289833,
"learning_rate": 9.571391682627412e-05,
"loss": 0.0222,
"step": 142
},
{
"epoch": 1.1074540174249758,
"grad_norm": 0.023486673831939697,
"learning_rate": 9.56035384069935e-05,
"loss": 0.0396,
"step": 143
},
{
"epoch": 1.1151984511132624,
"grad_norm": 0.030952000990509987,
"learning_rate": 9.549182203354242e-05,
"loss": 0.0225,
"step": 144
},
{
"epoch": 1.1229428848015488,
"grad_norm": 0.030439218506217003,
"learning_rate": 9.537877098354786e-05,
"loss": 0.0277,
"step": 145
},
{
"epoch": 1.1229428848015488,
"eval_loss": 0.01816246099770069,
"eval_runtime": 4.8899,
"eval_samples_per_second": 10.225,
"eval_steps_per_second": 2.659,
"step": 145
},
{
"epoch": 1.1306873184898354,
"grad_norm": 0.024195371195673943,
"learning_rate": 9.526438857379463e-05,
"loss": 0.0116,
"step": 146
},
{
"epoch": 1.138431752178122,
"grad_norm": 0.02799941971898079,
"learning_rate": 9.514867816012809e-05,
"loss": 0.0195,
"step": 147
},
{
"epoch": 1.1461761858664086,
"grad_norm": 0.030233675613999367,
"learning_rate": 9.503164313735566e-05,
"loss": 0.0182,
"step": 148
},
{
"epoch": 1.1539206195546952,
"grad_norm": 0.024903280660510063,
"learning_rate": 9.491328693914722e-05,
"loss": 0.0222,
"step": 149
},
{
"epoch": 1.1616650532429815,
"grad_norm": 0.023587804287672043,
"learning_rate": 9.47936130379344e-05,
"loss": 0.0166,
"step": 150
},
{
"epoch": 1.1616650532429815,
"eval_loss": 0.017931492999196053,
"eval_runtime": 4.8826,
"eval_samples_per_second": 10.24,
"eval_steps_per_second": 2.663,
"step": 150
},
{
"epoch": 1.1694094869312681,
"grad_norm": 0.024121137335896492,
"learning_rate": 9.467262494480869e-05,
"loss": 0.0216,
"step": 151
},
{
"epoch": 1.1771539206195547,
"grad_norm": 0.02379632741212845,
"learning_rate": 9.45503262094184e-05,
"loss": 0.023,
"step": 152
},
{
"epoch": 1.1848983543078413,
"grad_norm": 0.02161642163991928,
"learning_rate": 9.442672041986457e-05,
"loss": 0.0349,
"step": 153
},
{
"epoch": 1.1926427879961277,
"grad_norm": 0.019304990768432617,
"learning_rate": 9.430181120259565e-05,
"loss": 0.0193,
"step": 154
},
{
"epoch": 1.2003872216844143,
"grad_norm": 0.022498024627566338,
"learning_rate": 9.417560222230115e-05,
"loss": 0.0272,
"step": 155
},
{
"epoch": 1.2003872216844143,
"eval_loss": 0.018144290894269943,
"eval_runtime": 4.8768,
"eval_samples_per_second": 10.253,
"eval_steps_per_second": 2.666,
"step": 155
},
{
"epoch": 1.2081316553727008,
"grad_norm": 0.03062877058982849,
"learning_rate": 9.404809718180407e-05,
"loss": 0.0215,
"step": 156
},
{
"epoch": 1.2158760890609874,
"grad_norm": 0.023427944630384445,
"learning_rate": 9.391929982195232e-05,
"loss": 0.0301,
"step": 157
},
{
"epoch": 1.223620522749274,
"grad_norm": 0.02246953919529915,
"learning_rate": 9.378921392150892e-05,
"loss": 0.0212,
"step": 158
},
{
"epoch": 1.2313649564375604,
"grad_norm": 0.02264482155442238,
"learning_rate": 9.365784329704115e-05,
"loss": 0.0164,
"step": 159
},
{
"epoch": 1.239109390125847,
"grad_norm": 0.025367658585309982,
"learning_rate": 9.35251918028086e-05,
"loss": 0.0193,
"step": 160
},
{
"epoch": 1.239109390125847,
"eval_loss": 0.017837481573224068,
"eval_runtime": 4.8761,
"eval_samples_per_second": 10.254,
"eval_steps_per_second": 2.666,
"step": 160
},
{
"epoch": 1.2468538238141336,
"grad_norm": 0.02131119929254055,
"learning_rate": 9.339126333065007e-05,
"loss": 0.0207,
"step": 161
},
{
"epoch": 1.2545982575024202,
"grad_norm": 0.019136667251586914,
"learning_rate": 9.325606180986939e-05,
"loss": 0.0147,
"step": 162
},
{
"epoch": 1.2623426911907067,
"grad_norm": 0.024482635781168938,
"learning_rate": 9.31195912071201e-05,
"loss": 0.0299,
"step": 163
},
{
"epoch": 1.2700871248789931,
"grad_norm": 0.02487838640809059,
"learning_rate": 9.298185552628917e-05,
"loss": 0.0232,
"step": 164
},
{
"epoch": 1.2778315585672797,
"grad_norm": 0.025261854752898216,
"learning_rate": 9.284285880837946e-05,
"loss": 0.0121,
"step": 165
},
{
"epoch": 1.2778315585672797,
"eval_loss": 0.017772378399968147,
"eval_runtime": 4.8807,
"eval_samples_per_second": 10.245,
"eval_steps_per_second": 2.664,
"step": 165
},
{
"epoch": 1.2855759922555663,
"grad_norm": 0.02148056961596012,
"learning_rate": 9.270260513139116e-05,
"loss": 0.0347,
"step": 166
},
{
"epoch": 1.2933204259438529,
"grad_norm": 0.02021237276494503,
"learning_rate": 9.256109861020213e-05,
"loss": 0.02,
"step": 167
},
{
"epoch": 1.3010648596321395,
"grad_norm": 0.017359554767608643,
"learning_rate": 9.241834339644726e-05,
"loss": 0.0168,
"step": 168
},
{
"epoch": 1.3088092933204258,
"grad_norm": 0.02310781180858612,
"learning_rate": 9.22743436783966e-05,
"loss": 0.0192,
"step": 169
},
{
"epoch": 1.3165537270087124,
"grad_norm": 0.020348088815808296,
"learning_rate": 9.212910368083245e-05,
"loss": 0.0218,
"step": 170
},
{
"epoch": 1.3165537270087124,
"eval_loss": 0.0177312009036541,
"eval_runtime": 4.8794,
"eval_samples_per_second": 10.247,
"eval_steps_per_second": 2.664,
"step": 170
},
{
"epoch": 1.324298160696999,
"grad_norm": 0.019140997901558876,
"learning_rate": 9.198262766492554e-05,
"loss": 0.0217,
"step": 171
},
{
"epoch": 1.3320425943852856,
"grad_norm": 0.023120978847146034,
"learning_rate": 9.183491992810979e-05,
"loss": 0.0275,
"step": 172
},
{
"epoch": 1.3397870280735722,
"grad_norm": 0.024684559553861618,
"learning_rate": 9.168598480395651e-05,
"loss": 0.0201,
"step": 173
},
{
"epoch": 1.3475314617618586,
"grad_norm": 0.024830348789691925,
"learning_rate": 9.153582666204701e-05,
"loss": 0.0234,
"step": 174
},
{
"epoch": 1.3552758954501452,
"grad_norm": 0.023022592067718506,
"learning_rate": 9.138444990784453e-05,
"loss": 0.016,
"step": 175
},
{
"epoch": 1.3552758954501452,
"eval_loss": 0.017486225813627243,
"eval_runtime": 4.8842,
"eval_samples_per_second": 10.237,
"eval_steps_per_second": 2.662,
"step": 175
},
{
"epoch": 1.3630203291384317,
"grad_norm": 0.02616291493177414,
"learning_rate": 9.123185898256496e-05,
"loss": 0.0261,
"step": 176
},
{
"epoch": 1.3707647628267183,
"grad_norm": 0.02299882471561432,
"learning_rate": 9.107805836304658e-05,
"loss": 0.0254,
"step": 177
},
{
"epoch": 1.378509196515005,
"grad_norm": 0.018913911655545235,
"learning_rate": 9.092305256161859e-05,
"loss": 0.0124,
"step": 178
},
{
"epoch": 1.3862536302032913,
"grad_norm": 0.02167947217822075,
"learning_rate": 9.076684612596891e-05,
"loss": 0.0232,
"step": 179
},
{
"epoch": 1.3939980638915779,
"grad_norm": 0.02304757945239544,
"learning_rate": 9.060944363901056e-05,
"loss": 0.0268,
"step": 180
},
{
"epoch": 1.3939980638915779,
"eval_loss": 0.01751082018017769,
"eval_runtime": 4.8781,
"eval_samples_per_second": 10.25,
"eval_steps_per_second": 2.665,
"step": 180
},
{
"epoch": 1.4017424975798645,
"grad_norm": 0.02488349750638008,
"learning_rate": 9.045084971874738e-05,
"loss": 0.0128,
"step": 181
},
{
"epoch": 1.409486931268151,
"grad_norm": 0.025742027908563614,
"learning_rate": 9.029106901813839e-05,
"loss": 0.0243,
"step": 182
},
{
"epoch": 1.4172313649564376,
"grad_norm": 0.020051000639796257,
"learning_rate": 9.013010622496144e-05,
"loss": 0.0106,
"step": 183
},
{
"epoch": 1.424975798644724,
"grad_norm": 0.021976549178361893,
"learning_rate": 8.996796606167548e-05,
"loss": 0.0183,
"step": 184
},
{
"epoch": 1.4327202323330106,
"grad_norm": 0.0210378710180521,
"learning_rate": 8.980465328528219e-05,
"loss": 0.0152,
"step": 185
},
{
"epoch": 1.4327202323330106,
"eval_loss": 0.017743976786732674,
"eval_runtime": 4.8802,
"eval_samples_per_second": 10.246,
"eval_steps_per_second": 2.664,
"step": 185
},
{
"epoch": 1.4404646660212972,
"grad_norm": 0.02365756221115589,
"learning_rate": 8.96401726871863e-05,
"loss": 0.0114,
"step": 186
},
{
"epoch": 1.4482090997095838,
"grad_norm": 0.025590112432837486,
"learning_rate": 8.94745290930551e-05,
"loss": 0.0189,
"step": 187
},
{
"epoch": 1.4559535333978704,
"grad_norm": 0.029832618311047554,
"learning_rate": 8.930772736267674e-05,
"loss": 0.0324,
"step": 188
},
{
"epoch": 1.4636979670861567,
"grad_norm": 0.025901637971401215,
"learning_rate": 8.913977238981778e-05,
"loss": 0.0186,
"step": 189
},
{
"epoch": 1.4714424007744433,
"grad_norm": 0.01908070780336857,
"learning_rate": 8.897066910207958e-05,
"loss": 0.0279,
"step": 190
},
{
"epoch": 1.4714424007744433,
"eval_loss": 0.017557693645358086,
"eval_runtime": 4.8877,
"eval_samples_per_second": 10.23,
"eval_steps_per_second": 2.66,
"step": 190
},
{
"epoch": 1.47918683446273,
"grad_norm": 0.025517305359244347,
"learning_rate": 8.880042246075365e-05,
"loss": 0.0279,
"step": 191
},
{
"epoch": 1.4869312681510165,
"grad_norm": 0.019936546683311462,
"learning_rate": 8.862903746067618e-05,
"loss": 0.0172,
"step": 192
},
{
"epoch": 1.494675701839303,
"grad_norm": 0.019224194809794426,
"learning_rate": 8.845651913008145e-05,
"loss": 0.0138,
"step": 193
},
{
"epoch": 1.5024201355275895,
"grad_norm": 0.017969885841012,
"learning_rate": 8.828287253045435e-05,
"loss": 0.0151,
"step": 194
},
{
"epoch": 1.510164569215876,
"grad_norm": 0.02093169093132019,
"learning_rate": 8.810810275638183e-05,
"loss": 0.0206,
"step": 195
},
{
"epoch": 1.510164569215876,
"eval_loss": 0.017626546323299408,
"eval_runtime": 4.9032,
"eval_samples_per_second": 10.197,
"eval_steps_per_second": 2.651,
"step": 195
},
{
"epoch": 1.5179090029041626,
"grad_norm": 0.027407390996813774,
"learning_rate": 8.793221493540347e-05,
"loss": 0.0151,
"step": 196
},
{
"epoch": 1.5256534365924492,
"grad_norm": 0.022155404090881348,
"learning_rate": 8.775521422786104e-05,
"loss": 0.0187,
"step": 197
},
{
"epoch": 1.5333978702807358,
"grad_norm": 0.02126327157020569,
"learning_rate": 8.757710582674707e-05,
"loss": 0.0168,
"step": 198
},
{
"epoch": 1.5411423039690222,
"grad_norm": 0.02067979797720909,
"learning_rate": 8.739789495755253e-05,
"loss": 0.015,
"step": 199
},
{
"epoch": 1.5488867376573088,
"grad_norm": 0.023581981658935547,
"learning_rate": 8.721758687811352e-05,
"loss": 0.0196,
"step": 200
},
{
"epoch": 1.5488867376573088,
"eval_loss": 0.017185786738991737,
"eval_runtime": 4.8793,
"eval_samples_per_second": 10.247,
"eval_steps_per_second": 2.664,
"step": 200
},
{
"epoch": 1.5566311713455954,
"grad_norm": 0.0208896417170763,
"learning_rate": 8.703618687845696e-05,
"loss": 0.0176,
"step": 201
},
{
"epoch": 1.5643756050338817,
"grad_norm": 0.02558140642940998,
"learning_rate": 8.685370028064546e-05,
"loss": 0.0224,
"step": 202
},
{
"epoch": 1.5721200387221685,
"grad_norm": 0.01860946975648403,
"learning_rate": 8.667013243862113e-05,
"loss": 0.0189,
"step": 203
},
{
"epoch": 1.579864472410455,
"grad_norm": 0.024494647979736328,
"learning_rate": 8.64854887380485e-05,
"loss": 0.0204,
"step": 204
},
{
"epoch": 1.5876089060987415,
"grad_norm": 0.028290973976254463,
"learning_rate": 8.629977459615655e-05,
"loss": 0.0262,
"step": 205
},
{
"epoch": 1.5876089060987415,
"eval_loss": 0.016824763268232346,
"eval_runtime": 4.88,
"eval_samples_per_second": 10.246,
"eval_steps_per_second": 2.664,
"step": 205
},
{
"epoch": 1.595353339787028,
"grad_norm": 0.020388493314385414,
"learning_rate": 8.611299546157974e-05,
"loss": 0.0287,
"step": 206
},
{
"epoch": 1.6030977734753145,
"grad_norm": 0.022215668112039566,
"learning_rate": 8.592515681419813e-05,
"loss": 0.0249,
"step": 207
},
{
"epoch": 1.6108422071636013,
"grad_norm": 0.028934534639120102,
"learning_rate": 8.573626416497668e-05,
"loss": 0.0217,
"step": 208
},
{
"epoch": 1.6185866408518876,
"grad_norm": 0.022588912397623062,
"learning_rate": 8.554632305580354e-05,
"loss": 0.0207,
"step": 209
},
{
"epoch": 1.6263310745401742,
"grad_norm": 0.02324405126273632,
"learning_rate": 8.535533905932738e-05,
"loss": 0.0178,
"step": 210
},
{
"epoch": 1.6263310745401742,
"eval_loss": 0.016888294368982315,
"eval_runtime": 4.8771,
"eval_samples_per_second": 10.252,
"eval_steps_per_second": 2.665,
"step": 210
},
{
"epoch": 1.6340755082284608,
"grad_norm": 0.023379050195217133,
"learning_rate": 8.5163317778794e-05,
"loss": 0.0227,
"step": 211
},
{
"epoch": 1.6418199419167472,
"grad_norm": 0.024302620440721512,
"learning_rate": 8.497026484788189e-05,
"loss": 0.0279,
"step": 212
},
{
"epoch": 1.649564375605034,
"grad_norm": 0.02425311878323555,
"learning_rate": 8.477618593053693e-05,
"loss": 0.02,
"step": 213
},
{
"epoch": 1.6573088092933204,
"grad_norm": 0.0243984404951334,
"learning_rate": 8.458108672080624e-05,
"loss": 0.0255,
"step": 214
},
{
"epoch": 1.665053242981607,
"grad_norm": 0.018734309822320938,
"learning_rate": 8.438497294267117e-05,
"loss": 0.011,
"step": 215
},
{
"epoch": 1.665053242981607,
"eval_loss": 0.01664450205862522,
"eval_runtime": 4.8808,
"eval_samples_per_second": 10.244,
"eval_steps_per_second": 2.663,
"step": 215
},
{
"epoch": 1.6727976766698935,
"grad_norm": 0.019455671310424805,
"learning_rate": 8.418785034987921e-05,
"loss": 0.0175,
"step": 216
},
{
"epoch": 1.68054211035818,
"grad_norm": 0.021629663184285164,
"learning_rate": 8.39897247257754e-05,
"loss": 0.0156,
"step": 217
},
{
"epoch": 1.6882865440464667,
"grad_norm": 0.022207748144865036,
"learning_rate": 8.379060188313244e-05,
"loss": 0.0271,
"step": 218
},
{
"epoch": 1.696030977734753,
"grad_norm": 0.021333666518330574,
"learning_rate": 8.359048766398031e-05,
"loss": 0.0223,
"step": 219
},
{
"epoch": 1.7037754114230397,
"grad_norm": 0.021991191431879997,
"learning_rate": 8.338938793943478e-05,
"loss": 0.0128,
"step": 220
},
{
"epoch": 1.7037754114230397,
"eval_loss": 0.016610655933618546,
"eval_runtime": 4.8782,
"eval_samples_per_second": 10.25,
"eval_steps_per_second": 2.665,
"step": 220
},
{
"epoch": 1.7115198451113263,
"grad_norm": 0.01750914379954338,
"learning_rate": 8.318730860952522e-05,
"loss": 0.0217,
"step": 221
},
{
"epoch": 1.7192642787996126,
"grad_norm": 0.022801555693149567,
"learning_rate": 8.298425560302146e-05,
"loss": 0.0229,
"step": 222
},
{
"epoch": 1.7270087124878994,
"grad_norm": 0.028667643666267395,
"learning_rate": 8.278023487725982e-05,
"loss": 0.0317,
"step": 223
},
{
"epoch": 1.7347531461761858,
"grad_norm": 0.0247921384871006,
"learning_rate": 8.257525241796838e-05,
"loss": 0.0177,
"step": 224
},
{
"epoch": 1.7424975798644724,
"grad_norm": 0.02079445868730545,
"learning_rate": 8.236931423909138e-05,
"loss": 0.0223,
"step": 225
},
{
"epoch": 1.7424975798644724,
"eval_loss": 0.016715094447135925,
"eval_runtime": 4.8828,
"eval_samples_per_second": 10.24,
"eval_steps_per_second": 2.662,
"step": 225
},
{
"epoch": 1.750242013552759,
"grad_norm": 0.023619551211595535,
"learning_rate": 8.216242638261276e-05,
"loss": 0.0237,
"step": 226
},
{
"epoch": 1.7579864472410454,
"grad_norm": 0.020713407546281815,
"learning_rate": 8.19545949183788e-05,
"loss": 0.0167,
"step": 227
},
{
"epoch": 1.7657308809293322,
"grad_norm": 0.024574102833867073,
"learning_rate": 8.17458259439202e-05,
"loss": 0.0281,
"step": 228
},
{
"epoch": 1.7734753146176185,
"grad_norm": 0.01983151212334633,
"learning_rate": 8.153612558427311e-05,
"loss": 0.0217,
"step": 229
},
{
"epoch": 1.7812197483059051,
"grad_norm": 0.027135249227285385,
"learning_rate": 8.132549999179933e-05,
"loss": 0.0201,
"step": 230
},
{
"epoch": 1.7812197483059051,
"eval_loss": 0.016661785542964935,
"eval_runtime": 4.8807,
"eval_samples_per_second": 10.244,
"eval_steps_per_second": 2.664,
"step": 230
},
{
"epoch": 1.7889641819941917,
"grad_norm": 0.021816475316882133,
"learning_rate": 8.111395534600603e-05,
"loss": 0.0166,
"step": 231
},
{
"epoch": 1.796708615682478,
"grad_norm": 0.019049836322665215,
"learning_rate": 8.090149785336425e-05,
"loss": 0.0125,
"step": 232
},
{
"epoch": 1.804453049370765,
"grad_norm": 0.023273281753063202,
"learning_rate": 8.068813374712688e-05,
"loss": 0.0295,
"step": 233
},
{
"epoch": 1.8121974830590513,
"grad_norm": 0.02431442402303219,
"learning_rate": 8.047386928714582e-05,
"loss": 0.0193,
"step": 234
},
{
"epoch": 1.8199419167473379,
"grad_norm": 0.02583279088139534,
"learning_rate": 8.025871075968828e-05,
"loss": 0.0239,
"step": 235
},
{
"epoch": 1.8199419167473379,
"eval_loss": 0.016347970813512802,
"eval_runtime": 4.883,
"eval_samples_per_second": 10.24,
"eval_steps_per_second": 2.662,
"step": 235
},
{
"epoch": 1.8276863504356244,
"grad_norm": 0.021466901525855064,
"learning_rate": 8.00426644772523e-05,
"loss": 0.0226,
"step": 236
},
{
"epoch": 1.8354307841239108,
"grad_norm": 0.02583594247698784,
"learning_rate": 7.982573677838172e-05,
"loss": 0.0113,
"step": 237
},
{
"epoch": 1.8431752178121976,
"grad_norm": 0.02358117513358593,
"learning_rate": 7.960793402748002e-05,
"loss": 0.0292,
"step": 238
},
{
"epoch": 1.850919651500484,
"grad_norm": 0.025698702782392502,
"learning_rate": 7.938926261462366e-05,
"loss": 0.0269,
"step": 239
},
{
"epoch": 1.8586640851887706,
"grad_norm": 0.021297315135598183,
"learning_rate": 7.916972895537471e-05,
"loss": 0.0206,
"step": 240
},
{
"epoch": 1.8586640851887706,
"eval_loss": 0.016880055889487267,
"eval_runtime": 4.8849,
"eval_samples_per_second": 10.236,
"eval_steps_per_second": 2.661,
"step": 240
},
{
"epoch": 1.8664085188770572,
"grad_norm": 0.02742616832256317,
"learning_rate": 7.894933949059245e-05,
"loss": 0.0266,
"step": 241
},
{
"epoch": 1.8741529525653435,
"grad_norm": 0.029985694214701653,
"learning_rate": 7.872810068624451e-05,
"loss": 0.0209,
"step": 242
},
{
"epoch": 1.8818973862536303,
"grad_norm": 0.01984225958585739,
"learning_rate": 7.850601903321716e-05,
"loss": 0.0112,
"step": 243
},
{
"epoch": 1.8896418199419167,
"grad_norm": 0.028832539916038513,
"learning_rate": 7.828310104712489e-05,
"loss": 0.0176,
"step": 244
},
{
"epoch": 1.8973862536302033,
"grad_norm": 0.025244107469916344,
"learning_rate": 7.805935326811912e-05,
"loss": 0.0209,
"step": 245
},
{
"epoch": 1.8973862536302033,
"eval_loss": 0.016251368448138237,
"eval_runtime": 4.8854,
"eval_samples_per_second": 10.235,
"eval_steps_per_second": 2.661,
"step": 245
},
{
"epoch": 1.90513068731849,
"grad_norm": 0.019776510074734688,
"learning_rate": 7.783478226069651e-05,
"loss": 0.0146,
"step": 246
},
{
"epoch": 1.9128751210067763,
"grad_norm": 0.030150357633829117,
"learning_rate": 7.760939461350623e-05,
"loss": 0.0205,
"step": 247
},
{
"epoch": 1.920619554695063,
"grad_norm": 0.02409055270254612,
"learning_rate": 7.738319693915672e-05,
"loss": 0.0209,
"step": 248
},
{
"epoch": 1.9283639883833494,
"grad_norm": 0.02473391965031624,
"learning_rate": 7.715619587402164e-05,
"loss": 0.0169,
"step": 249
},
{
"epoch": 1.936108422071636,
"grad_norm": 0.028100404888391495,
"learning_rate": 7.692839807804521e-05,
"loss": 0.0171,
"step": 250
},
{
"epoch": 1.936108422071636,
"eval_loss": 0.016126085072755814,
"eval_runtime": 4.8878,
"eval_samples_per_second": 10.23,
"eval_steps_per_second": 2.66,
"step": 250
},
{
"epoch": 1.9438528557599226,
"grad_norm": 0.031069206073880196,
"learning_rate": 7.669981023454682e-05,
"loss": 0.0346,
"step": 251
},
{
"epoch": 1.951597289448209,
"grad_norm": 0.020763061940670013,
"learning_rate": 7.647043905002484e-05,
"loss": 0.0168,
"step": 252
},
{
"epoch": 1.9593417231364958,
"grad_norm": 0.021877290681004524,
"learning_rate": 7.624029125396004e-05,
"loss": 0.0276,
"step": 253
},
{
"epoch": 1.9670861568247822,
"grad_norm": 0.023641012609004974,
"learning_rate": 7.6009373598618e-05,
"loss": 0.0182,
"step": 254
},
{
"epoch": 1.9748305905130688,
"grad_norm": 0.025783414021134377,
"learning_rate": 7.577769285885109e-05,
"loss": 0.022,
"step": 255
},
{
"epoch": 1.9748305905130688,
"eval_loss": 0.015866845846176147,
"eval_runtime": 4.8957,
"eval_samples_per_second": 10.213,
"eval_steps_per_second": 2.655,
"step": 255
},
{
"epoch": 1.9825750242013553,
"grad_norm": 0.022825462743639946,
"learning_rate": 7.554525583189969e-05,
"loss": 0.0184,
"step": 256
},
{
"epoch": 1.9903194578896417,
"grad_norm": 0.024429945275187492,
"learning_rate": 7.53120693371927e-05,
"loss": 0.0196,
"step": 257
},
{
"epoch": 1.9980638915779285,
"grad_norm": 0.0280454121530056,
"learning_rate": 7.507814021614761e-05,
"loss": 0.0297,
"step": 258
},
{
"epoch": 2.005808325266215,
"grad_norm": 0.04602880775928497,
"learning_rate": 7.484347533196961e-05,
"loss": 0.0211,
"step": 259
},
{
"epoch": 2.0135527589545013,
"grad_norm": 0.01826930046081543,
"learning_rate": 7.460808156945036e-05,
"loss": 0.0162,
"step": 260
},
{
"epoch": 2.0135527589545013,
"eval_loss": 0.015875546261668205,
"eval_runtime": 4.8819,
"eval_samples_per_second": 10.242,
"eval_steps_per_second": 2.663,
"step": 260
},
{
"epoch": 2.021297192642788,
"grad_norm": 0.018936650827527046,
"learning_rate": 7.437196583476596e-05,
"loss": 0.0169,
"step": 261
},
{
"epoch": 2.0290416263310744,
"grad_norm": 0.02147481217980385,
"learning_rate": 7.413513505527429e-05,
"loss": 0.0142,
"step": 262
},
{
"epoch": 2.0367860600193612,
"grad_norm": 0.020604653283953667,
"learning_rate": 7.389759617931182e-05,
"loss": 0.0115,
"step": 263
},
{
"epoch": 2.0445304937076476,
"grad_norm": 0.021933911368250847,
"learning_rate": 7.365935617598975e-05,
"loss": 0.0134,
"step": 264
},
{
"epoch": 2.052274927395934,
"grad_norm": 0.02122250571846962,
"learning_rate": 7.342042203498951e-05,
"loss": 0.0185,
"step": 265
},
{
"epoch": 2.052274927395934,
"eval_loss": 0.01603526994585991,
"eval_runtime": 4.9059,
"eval_samples_per_second": 10.192,
"eval_steps_per_second": 2.65,
"step": 265
},
{
"epoch": 2.060019361084221,
"grad_norm": 0.018767178058624268,
"learning_rate": 7.318080076635772e-05,
"loss": 0.0087,
"step": 266
},
{
"epoch": 2.067763794772507,
"grad_norm": 0.01828618347644806,
"learning_rate": 7.294049940030055e-05,
"loss": 0.0088,
"step": 267
},
{
"epoch": 2.075508228460794,
"grad_norm": 0.029488379135727882,
"learning_rate": 7.269952498697734e-05,
"loss": 0.0148,
"step": 268
},
{
"epoch": 2.0832526621490803,
"grad_norm": 0.028726164251565933,
"learning_rate": 7.245788459629396e-05,
"loss": 0.0226,
"step": 269
},
{
"epoch": 2.0909970958373667,
"grad_norm": 0.03607122600078583,
"learning_rate": 7.221558531769519e-05,
"loss": 0.0185,
"step": 270
},
{
"epoch": 2.0909970958373667,
"eval_loss": 0.01613912731409073,
"eval_runtime": 4.8866,
"eval_samples_per_second": 10.232,
"eval_steps_per_second": 2.66,
"step": 270
},
{
"epoch": 2.0987415295256535,
"grad_norm": 0.02318711020052433,
"learning_rate": 7.197263425995682e-05,
"loss": 0.0187,
"step": 271
},
{
"epoch": 2.10648596321394,
"grad_norm": 0.027442490682005882,
"learning_rate": 7.172903855097711e-05,
"loss": 0.0185,
"step": 272
},
{
"epoch": 2.1142303969022267,
"grad_norm": 0.02113383449614048,
"learning_rate": 7.14848053375676e-05,
"loss": 0.0162,
"step": 273
},
{
"epoch": 2.121974830590513,
"grad_norm": 0.02109163999557495,
"learning_rate": 7.123994178524345e-05,
"loss": 0.0189,
"step": 274
},
{
"epoch": 2.1297192642787994,
"grad_norm": 0.018890704959630966,
"learning_rate": 7.099445507801323e-05,
"loss": 0.0196,
"step": 275
},
{
"epoch": 2.1297192642787994,
"eval_loss": 0.016141431406140327,
"eval_runtime": 4.8795,
"eval_samples_per_second": 10.247,
"eval_steps_per_second": 2.664,
"step": 275
},
{
"epoch": 2.1374636979670862,
"grad_norm": 0.026332931593060493,
"learning_rate": 7.074835241816817e-05,
"loss": 0.029,
"step": 276
},
{
"epoch": 2.1452081316553726,
"grad_norm": 0.02275455929338932,
"learning_rate": 7.05016410260708e-05,
"loss": 0.0156,
"step": 277
},
{
"epoch": 2.1529525653436594,
"grad_norm": 0.022596005350351334,
"learning_rate": 7.025432813994315e-05,
"loss": 0.0184,
"step": 278
},
{
"epoch": 2.160696999031946,
"grad_norm": 0.020018640905618668,
"learning_rate": 7.000642101565434e-05,
"loss": 0.0107,
"step": 279
},
{
"epoch": 2.168441432720232,
"grad_norm": 0.025625359266996384,
"learning_rate": 6.975792692650777e-05,
"loss": 0.0146,
"step": 280
},
{
"epoch": 2.168441432720232,
"eval_loss": 0.015940353274345398,
"eval_runtime": 4.9128,
"eval_samples_per_second": 10.178,
"eval_steps_per_second": 2.646,
"step": 280
},
{
"epoch": 2.176185866408519,
"grad_norm": 0.026554979383945465,
"learning_rate": 6.950885316302773e-05,
"loss": 0.0213,
"step": 281
},
{
"epoch": 2.1839303000968053,
"grad_norm": 0.023344026878476143,
"learning_rate": 6.925920703274541e-05,
"loss": 0.0176,
"step": 282
},
{
"epoch": 2.191674733785092,
"grad_norm": 0.03146139904856682,
"learning_rate": 6.90089958599846e-05,
"loss": 0.0243,
"step": 283
},
{
"epoch": 2.1994191674733785,
"grad_norm": 0.02688729763031006,
"learning_rate": 6.875822698564679e-05,
"loss": 0.0235,
"step": 284
},
{
"epoch": 2.207163601161665,
"grad_norm": 0.017707915976643562,
"learning_rate": 6.850690776699573e-05,
"loss": 0.0091,
"step": 285
},
{
"epoch": 2.207163601161665,
"eval_loss": 0.015938647091388702,
"eval_runtime": 4.8821,
"eval_samples_per_second": 10.241,
"eval_steps_per_second": 2.663,
"step": 285
},
{
"epoch": 2.2149080348499517,
"grad_norm": 0.02426217496395111,
"learning_rate": 6.825504557744167e-05,
"loss": 0.0222,
"step": 286
},
{
"epoch": 2.222652468538238,
"grad_norm": 0.017933079972863197,
"learning_rate": 6.800264780632494e-05,
"loss": 0.0127,
"step": 287
},
{
"epoch": 2.230396902226525,
"grad_norm": 0.02196042612195015,
"learning_rate": 6.774972185869927e-05,
"loss": 0.013,
"step": 288
},
{
"epoch": 2.2381413359148112,
"grad_norm": 0.02711823582649231,
"learning_rate": 6.749627515511442e-05,
"loss": 0.0198,
"step": 289
},
{
"epoch": 2.2458857696030976,
"grad_norm": 0.01899660937488079,
"learning_rate": 6.724231513139852e-05,
"loss": 0.0106,
"step": 290
},
{
"epoch": 2.2458857696030976,
"eval_loss": 0.015821926295757294,
"eval_runtime": 4.8849,
"eval_samples_per_second": 10.236,
"eval_steps_per_second": 2.661,
"step": 290
},
{
"epoch": 2.2536302032913844,
"grad_norm": 0.02587137557566166,
"learning_rate": 6.698784923843992e-05,
"loss": 0.0204,
"step": 291
},
{
"epoch": 2.261374636979671,
"grad_norm": 0.02532321773469448,
"learning_rate": 6.673288494196858e-05,
"loss": 0.0191,
"step": 292
},
{
"epoch": 2.2691190706679576,
"grad_norm": 0.03079635463654995,
"learning_rate": 6.647742972233703e-05,
"loss": 0.0205,
"step": 293
},
{
"epoch": 2.276863504356244,
"grad_norm": 0.023865051567554474,
"learning_rate": 6.622149107430088e-05,
"loss": 0.0151,
"step": 294
},
{
"epoch": 2.2846079380445303,
"grad_norm": 0.02512257918715477,
"learning_rate": 6.5965076506799e-05,
"loss": 0.014,
"step": 295
},
{
"epoch": 2.2846079380445303,
"eval_loss": 0.015925200656056404,
"eval_runtime": 4.8773,
"eval_samples_per_second": 10.251,
"eval_steps_per_second": 2.665,
"step": 295
},
{
"epoch": 2.292352371732817,
"grad_norm": 0.026422763243317604,
"learning_rate": 6.570819354273317e-05,
"loss": 0.0173,
"step": 296
},
{
"epoch": 2.3000968054211035,
"grad_norm": 0.02848372980952263,
"learning_rate": 6.545084971874738e-05,
"loss": 0.0251,
"step": 297
},
{
"epoch": 2.3078412391093903,
"grad_norm": 0.02191309630870819,
"learning_rate": 6.519305258500666e-05,
"loss": 0.0104,
"step": 298
},
{
"epoch": 2.3155856727976767,
"grad_norm": 0.025703053921461105,
"learning_rate": 6.493480970497569e-05,
"loss": 0.0311,
"step": 299
},
{
"epoch": 2.323330106485963,
"grad_norm": 0.021763848140835762,
"learning_rate": 6.467612865519674e-05,
"loss": 0.0168,
"step": 300
},
{
"epoch": 2.323330106485963,
"eval_loss": 0.01583768054842949,
"eval_runtime": 4.8796,
"eval_samples_per_second": 10.247,
"eval_steps_per_second": 2.664,
"step": 300
},
{
"epoch": 2.33107454017425,
"grad_norm": 0.01981600932776928,
"learning_rate": 6.441701702506754e-05,
"loss": 0.0174,
"step": 301
},
{
"epoch": 2.3388189738625362,
"grad_norm": 0.021816400811076164,
"learning_rate": 6.415748241661851e-05,
"loss": 0.0222,
"step": 302
},
{
"epoch": 2.346563407550823,
"grad_norm": 0.028364678844809532,
"learning_rate": 6.389753244428972e-05,
"loss": 0.0222,
"step": 303
},
{
"epoch": 2.3543078412391094,
"grad_norm": 0.03110797517001629,
"learning_rate": 6.363717473470759e-05,
"loss": 0.0194,
"step": 304
},
{
"epoch": 2.362052274927396,
"grad_norm": 0.03083011880517006,
"learning_rate": 6.337641692646106e-05,
"loss": 0.0217,
"step": 305
},
{
"epoch": 2.362052274927396,
"eval_loss": 0.01598162204027176,
"eval_runtime": 4.8805,
"eval_samples_per_second": 10.245,
"eval_steps_per_second": 2.664,
"step": 305
},
{
"epoch": 2.3697967086156826,
"grad_norm": 0.027600981295108795,
"learning_rate": 6.311526666987743e-05,
"loss": 0.0168,
"step": 306
},
{
"epoch": 2.377541142303969,
"grad_norm": 0.050711363554000854,
"learning_rate": 6.285373162679803e-05,
"loss": 0.027,
"step": 307
},
{
"epoch": 2.3852855759922553,
"grad_norm": 0.0258706696331501,
"learning_rate": 6.259181947035342e-05,
"loss": 0.014,
"step": 308
},
{
"epoch": 2.393030009680542,
"grad_norm": 0.022878140211105347,
"learning_rate": 6.232953788473811e-05,
"loss": 0.0125,
"step": 309
},
{
"epoch": 2.4007744433688285,
"grad_norm": 0.02646121010184288,
"learning_rate": 6.206689456498529e-05,
"loss": 0.0225,
"step": 310
},
{
"epoch": 2.4007744433688285,
"eval_loss": 0.015688462182879448,
"eval_runtime": 4.8894,
"eval_samples_per_second": 10.226,
"eval_steps_per_second": 2.659,
"step": 310
},
{
"epoch": 2.4085188770571153,
"grad_norm": 0.01907186210155487,
"learning_rate": 6.1803897216741e-05,
"loss": 0.0105,
"step": 311
},
{
"epoch": 2.4162633107454017,
"grad_norm": 0.025598157197237015,
"learning_rate": 6.154055355603807e-05,
"loss": 0.0195,
"step": 312
},
{
"epoch": 2.4240077444336885,
"grad_norm": 0.021488605067133904,
"learning_rate": 6.127687130906972e-05,
"loss": 0.0171,
"step": 313
},
{
"epoch": 2.431752178121975,
"grad_norm": 0.023560060188174248,
"learning_rate": 6.101285821196285e-05,
"loss": 0.0234,
"step": 314
},
{
"epoch": 2.4394966118102612,
"grad_norm": 0.020358163863420486,
"learning_rate": 6.0748522010551215e-05,
"loss": 0.0158,
"step": 315
},
{
"epoch": 2.4394966118102612,
"eval_loss": 0.015287678688764572,
"eval_runtime": 4.884,
"eval_samples_per_second": 10.237,
"eval_steps_per_second": 2.662,
"step": 315
},
{
"epoch": 2.447241045498548,
"grad_norm": 0.04023784399032593,
"learning_rate": 6.048387046014795e-05,
"loss": 0.0195,
"step": 316
},
{
"epoch": 2.4549854791868344,
"grad_norm": 0.018253512680530548,
"learning_rate": 6.021891132531825e-05,
"loss": 0.0172,
"step": 317
},
{
"epoch": 2.4627299128751208,
"grad_norm": 0.020507492125034332,
"learning_rate": 5.995365237965144e-05,
"loss": 0.0234,
"step": 318
},
{
"epoch": 2.4704743465634076,
"grad_norm": 0.025176333263516426,
"learning_rate": 5.9688101405532925e-05,
"loss": 0.0196,
"step": 319
},
{
"epoch": 2.478218780251694,
"grad_norm": 0.022779326885938644,
"learning_rate": 5.9422266193915924e-05,
"loss": 0.0122,
"step": 320
},
{
"epoch": 2.478218780251694,
"eval_loss": 0.015223703347146511,
"eval_runtime": 4.8811,
"eval_samples_per_second": 10.244,
"eval_steps_per_second": 2.663,
"step": 320
},
{
"epoch": 2.4859632139399808,
"grad_norm": 0.019654158502817154,
"learning_rate": 5.9156154544092815e-05,
"loss": 0.0191,
"step": 321
},
{
"epoch": 2.493707647628267,
"grad_norm": 0.01823735609650612,
"learning_rate": 5.8889774263466355e-05,
"loss": 0.0128,
"step": 322
},
{
"epoch": 2.501452081316554,
"grad_norm": 0.022733347490429878,
"learning_rate": 5.862313316732063e-05,
"loss": 0.0095,
"step": 323
},
{
"epoch": 2.5091965150048403,
"grad_norm": 0.019566858187317848,
"learning_rate": 5.8356239078591724e-05,
"loss": 0.012,
"step": 324
},
{
"epoch": 2.5169409486931267,
"grad_norm": 0.023728664964437485,
"learning_rate": 5.808909982763825e-05,
"loss": 0.0152,
"step": 325
},
{
"epoch": 2.5169409486931267,
"eval_loss": 0.01537258829921484,
"eval_runtime": 4.8868,
"eval_samples_per_second": 10.232,
"eval_steps_per_second": 2.66,
"step": 325
},
{
"epoch": 2.5246853823814135,
"grad_norm": 0.026009773835539818,
"learning_rate": 5.782172325201155e-05,
"loss": 0.0158,
"step": 326
},
{
"epoch": 2.5324298160697,
"grad_norm": 0.045942921191453934,
"learning_rate": 5.7554117196225846e-05,
"loss": 0.0304,
"step": 327
},
{
"epoch": 2.5401742497579862,
"grad_norm": 0.017686696723103523,
"learning_rate": 5.728628951152799e-05,
"loss": 0.0157,
"step": 328
},
{
"epoch": 2.547918683446273,
"grad_norm": 0.020913394168019295,
"learning_rate": 5.701824805566722e-05,
"loss": 0.0162,
"step": 329
},
{
"epoch": 2.5556631171345594,
"grad_norm": 0.025631655007600784,
"learning_rate": 5.675000069266451e-05,
"loss": 0.0268,
"step": 330
},
{
"epoch": 2.5556631171345594,
"eval_loss": 0.015396489761769772,
"eval_runtime": 4.8797,
"eval_samples_per_second": 10.246,
"eval_steps_per_second": 2.664,
"step": 330
},
{
"epoch": 2.563407550822846,
"grad_norm": 0.021823951974511147,
"learning_rate": 5.6481555292581946e-05,
"loss": 0.0116,
"step": 331
},
{
"epoch": 2.5711519845111326,
"grad_norm": 0.023217862471938133,
"learning_rate": 5.621291973129177e-05,
"loss": 0.0151,
"step": 332
},
{
"epoch": 2.5788964181994194,
"grad_norm": 0.03442602604627609,
"learning_rate": 5.5944101890245324e-05,
"loss": 0.0202,
"step": 333
},
{
"epoch": 2.5866408518877058,
"grad_norm": 0.023536914959549904,
"learning_rate": 5.5675109656241876e-05,
"loss": 0.014,
"step": 334
},
{
"epoch": 2.594385285575992,
"grad_norm": 0.026387905701994896,
"learning_rate": 5.540595092119709e-05,
"loss": 0.0174,
"step": 335
},
{
"epoch": 2.594385285575992,
"eval_loss": 0.01569586619734764,
"eval_runtime": 4.8895,
"eval_samples_per_second": 10.226,
"eval_steps_per_second": 2.659,
"step": 335
},
{
"epoch": 2.602129719264279,
"grad_norm": 0.02376389689743519,
"learning_rate": 5.5136633581911655e-05,
"loss": 0.0232,
"step": 336
},
{
"epoch": 2.6098741529525653,
"grad_norm": 0.022475535050034523,
"learning_rate": 5.486716553983951e-05,
"loss": 0.0176,
"step": 337
},
{
"epoch": 2.6176185866408517,
"grad_norm": 0.026273801922798157,
"learning_rate": 5.4597554700855946e-05,
"loss": 0.0099,
"step": 338
},
{
"epoch": 2.6253630203291385,
"grad_norm": 0.0252407044172287,
"learning_rate": 5.432780897502589e-05,
"loss": 0.0169,
"step": 339
},
{
"epoch": 2.633107454017425,
"grad_norm": 0.025699293240904808,
"learning_rate": 5.4057936276371565e-05,
"loss": 0.0147,
"step": 340
},
{
"epoch": 2.633107454017425,
"eval_loss": 0.015603473410010338,
"eval_runtime": 4.8875,
"eval_samples_per_second": 10.23,
"eval_steps_per_second": 2.66,
"step": 340
},
{
"epoch": 2.6408518877057117,
"grad_norm": 0.02292807772755623,
"learning_rate": 5.378794452264053e-05,
"loss": 0.0112,
"step": 341
},
{
"epoch": 2.648596321393998,
"grad_norm": 0.02671566605567932,
"learning_rate": 5.351784163507319e-05,
"loss": 0.0157,
"step": 342
},
{
"epoch": 2.656340755082285,
"grad_norm": 0.024869635701179504,
"learning_rate": 5.324763553817054e-05,
"loss": 0.0183,
"step": 343
},
{
"epoch": 2.664085188770571,
"grad_norm": 0.030287204310297966,
"learning_rate": 5.2977334159461614e-05,
"loss": 0.0235,
"step": 344
},
{
"epoch": 2.6718296224588576,
"grad_norm": 0.021120263263583183,
"learning_rate": 5.270694542927088e-05,
"loss": 0.0191,
"step": 345
},
{
"epoch": 2.6718296224588576,
"eval_loss": 0.015455065295100212,
"eval_runtime": 4.8759,
"eval_samples_per_second": 10.254,
"eval_steps_per_second": 2.666,
"step": 345
},
{
"epoch": 2.6795740561471444,
"grad_norm": 0.022198256105184555,
"learning_rate": 5.2436477280485605e-05,
"loss": 0.017,
"step": 346
},
{
"epoch": 2.6873184898354308,
"grad_norm": 0.02474604733288288,
"learning_rate": 5.216593764832311e-05,
"loss": 0.0182,
"step": 347
},
{
"epoch": 2.695062923523717,
"grad_norm": 0.022626683115959167,
"learning_rate": 5.189533447009794e-05,
"loss": 0.0235,
"step": 348
},
{
"epoch": 2.702807357212004,
"grad_norm": 0.025306569412350655,
"learning_rate": 5.162467568498903e-05,
"loss": 0.0185,
"step": 349
},
{
"epoch": 2.7105517909002903,
"grad_norm": 0.01958346739411354,
"learning_rate": 5.135396923380673e-05,
"loss": 0.0139,
"step": 350
},
{
"epoch": 2.7105517909002903,
"eval_loss": 0.015315129421651363,
"eval_runtime": 4.8828,
"eval_samples_per_second": 10.24,
"eval_steps_per_second": 2.662,
"step": 350
},
{
"epoch": 2.718296224588577,
"grad_norm": 0.028099266812205315,
"learning_rate": 5.108322305875988e-05,
"loss": 0.0151,
"step": 351
},
{
"epoch": 2.7260406582768635,
"grad_norm": 0.026032108813524246,
"learning_rate": 5.081244510322274e-05,
"loss": 0.0143,
"step": 352
},
{
"epoch": 2.7337850919651503,
"grad_norm": 0.030373040586709976,
"learning_rate": 5.0541643311502e-05,
"loss": 0.0177,
"step": 353
},
{
"epoch": 2.7415295256534367,
"grad_norm": 0.026800263673067093,
"learning_rate": 5.027082562860368e-05,
"loss": 0.0146,
"step": 354
},
{
"epoch": 2.749273959341723,
"grad_norm": 0.028782140463590622,
"learning_rate": 5e-05,
"loss": 0.026,
"step": 355
},
{
"epoch": 2.749273959341723,
"eval_loss": 0.015001767314970493,
"eval_runtime": 4.8958,
"eval_samples_per_second": 10.213,
"eval_steps_per_second": 2.655,
"step": 355
},
{
"epoch": 2.75701839303001,
"grad_norm": 0.023763621225953102,
"learning_rate": 4.9729174371396334e-05,
"loss": 0.0138,
"step": 356
},
{
"epoch": 2.764762826718296,
"grad_norm": 0.02057846635580063,
"learning_rate": 4.945835668849801e-05,
"loss": 0.0101,
"step": 357
},
{
"epoch": 2.7725072604065826,
"grad_norm": 0.026699546724557877,
"learning_rate": 4.9187554896777285e-05,
"loss": 0.0185,
"step": 358
},
{
"epoch": 2.7802516940948694,
"grad_norm": 0.025631215423345566,
"learning_rate": 4.8916776941240135e-05,
"loss": 0.0177,
"step": 359
},
{
"epoch": 2.7879961277831558,
"grad_norm": 0.020701708272099495,
"learning_rate": 4.8646030766193285e-05,
"loss": 0.0162,
"step": 360
},
{
"epoch": 2.7879961277831558,
"eval_loss": 0.014788495376706123,
"eval_runtime": 4.885,
"eval_samples_per_second": 10.235,
"eval_steps_per_second": 2.661,
"step": 360
},
{
"epoch": 2.7957405614714426,
"grad_norm": 0.018802624195814133,
"learning_rate": 4.837532431501098e-05,
"loss": 0.0195,
"step": 361
},
{
"epoch": 2.803484995159729,
"grad_norm": 0.024294838309288025,
"learning_rate": 4.8104665529902075e-05,
"loss": 0.0172,
"step": 362
},
{
"epoch": 2.8112294288480157,
"grad_norm": 0.02249518595635891,
"learning_rate": 4.78340623516769e-05,
"loss": 0.0157,
"step": 363
},
{
"epoch": 2.818973862536302,
"grad_norm": 0.022549943998456,
"learning_rate": 4.756352271951441e-05,
"loss": 0.0167,
"step": 364
},
{
"epoch": 2.8267182962245885,
"grad_norm": 0.03274448588490486,
"learning_rate": 4.729305457072913e-05,
"loss": 0.0258,
"step": 365
},
{
"epoch": 2.8267182962245885,
"eval_loss": 0.014879841357469559,
"eval_runtime": 4.8948,
"eval_samples_per_second": 10.215,
"eval_steps_per_second": 2.656,
"step": 365
},
{
"epoch": 2.8344627299128753,
"grad_norm": 0.031107768416404724,
"learning_rate": 4.70226658405384e-05,
"loss": 0.0167,
"step": 366
},
{
"epoch": 2.8422071636011617,
"grad_norm": 0.023017307743430138,
"learning_rate": 4.675236446182946e-05,
"loss": 0.0126,
"step": 367
},
{
"epoch": 2.849951597289448,
"grad_norm": 0.03121495246887207,
"learning_rate": 4.648215836492682e-05,
"loss": 0.0139,
"step": 368
},
{
"epoch": 2.857696030977735,
"grad_norm": 0.026987893506884575,
"learning_rate": 4.6212055477359486e-05,
"loss": 0.0147,
"step": 369
},
{
"epoch": 2.865440464666021,
"grad_norm": 0.024263298138976097,
"learning_rate": 4.594206372362845e-05,
"loss": 0.0154,
"step": 370
},
{
"epoch": 2.865440464666021,
"eval_loss": 0.014814168214797974,
"eval_runtime": 4.8923,
"eval_samples_per_second": 10.22,
"eval_steps_per_second": 2.657,
"step": 370
},
{
"epoch": 2.8731848983543076,
"grad_norm": 0.022974541410803795,
"learning_rate": 4.567219102497412e-05,
"loss": 0.0136,
"step": 371
},
{
"epoch": 2.8809293320425944,
"grad_norm": 0.025871610268950462,
"learning_rate": 4.540244529914406e-05,
"loss": 0.0126,
"step": 372
},
{
"epoch": 2.888673765730881,
"grad_norm": 0.026091424748301506,
"learning_rate": 4.5132834460160524e-05,
"loss": 0.023,
"step": 373
},
{
"epoch": 2.8964181994191676,
"grad_norm": 0.024125855416059494,
"learning_rate": 4.486336641808835e-05,
"loss": 0.0129,
"step": 374
},
{
"epoch": 2.904162633107454,
"grad_norm": 0.01973029226064682,
"learning_rate": 4.4594049078802925e-05,
"loss": 0.0166,
"step": 375
},
{
"epoch": 2.904162633107454,
"eval_loss": 0.01432761363685131,
"eval_runtime": 4.8944,
"eval_samples_per_second": 10.216,
"eval_steps_per_second": 2.656,
"step": 375
},
{
"epoch": 2.9119070667957407,
"grad_norm": 0.022474128752946854,
"learning_rate": 4.4324890343758136e-05,
"loss": 0.0115,
"step": 376
},
{
"epoch": 2.919651500484027,
"grad_norm": 0.022197918966412544,
"learning_rate": 4.405589810975468e-05,
"loss": 0.0108,
"step": 377
},
{
"epoch": 2.9273959341723135,
"grad_norm": 0.023376472294330597,
"learning_rate": 4.3787080268708244e-05,
"loss": 0.0105,
"step": 378
},
{
"epoch": 2.9351403678606003,
"grad_norm": 0.01602279581129551,
"learning_rate": 4.351844470741808e-05,
"loss": 0.0094,
"step": 379
},
{
"epoch": 2.9428848015488867,
"grad_norm": 0.02684823051095009,
"learning_rate": 4.3249999307335495e-05,
"loss": 0.0189,
"step": 380
},
{
"epoch": 2.9428848015488867,
"eval_loss": 0.014240576885640621,
"eval_runtime": 4.8817,
"eval_samples_per_second": 10.242,
"eval_steps_per_second": 2.663,
"step": 380
},
{
"epoch": 2.950629235237173,
"grad_norm": 0.0212652999907732,
"learning_rate": 4.298175194433279e-05,
"loss": 0.0154,
"step": 381
},
{
"epoch": 2.95837366892546,
"grad_norm": 0.019883181899785995,
"learning_rate": 4.2713710488472006e-05,
"loss": 0.0087,
"step": 382
},
{
"epoch": 2.9661181026137466,
"grad_norm": 0.02650902420282364,
"learning_rate": 4.244588280377417e-05,
"loss": 0.0164,
"step": 383
},
{
"epoch": 2.973862536302033,
"grad_norm": 0.02401239052414894,
"learning_rate": 4.2178276747988446e-05,
"loss": 0.0139,
"step": 384
},
{
"epoch": 2.9816069699903194,
"grad_norm": 0.022838260978460312,
"learning_rate": 4.1910900172361764e-05,
"loss": 0.0155,
"step": 385
},
{
"epoch": 2.9816069699903194,
"eval_loss": 0.0144858593121171,
"eval_runtime": 4.8906,
"eval_samples_per_second": 10.224,
"eval_steps_per_second": 2.658,
"step": 385
},
{
"epoch": 2.989351403678606,
"grad_norm": 0.03657938912510872,
"learning_rate": 4.164376092140828e-05,
"loss": 0.0286,
"step": 386
},
{
"epoch": 2.9970958373668926,
"grad_norm": 0.02792074717581272,
"learning_rate": 4.1376866832679385e-05,
"loss": 0.014,
"step": 387
},
{
"epoch": 3.004840271055179,
"grad_norm": 0.05196017026901245,
"learning_rate": 4.1110225736533664e-05,
"loss": 0.0222,
"step": 388
},
{
"epoch": 3.0125847047434657,
"grad_norm": 0.0229202788323164,
"learning_rate": 4.084384545590719e-05,
"loss": 0.007,
"step": 389
},
{
"epoch": 3.020329138431752,
"grad_norm": 0.021996086463332176,
"learning_rate": 4.057773380608411e-05,
"loss": 0.0121,
"step": 390
},
{
"epoch": 3.020329138431752,
"eval_loss": 0.014621075242757797,
"eval_runtime": 4.8766,
"eval_samples_per_second": 10.253,
"eval_steps_per_second": 2.666,
"step": 390
},
{
"epoch": 3.028073572120039,
"grad_norm": 0.024300433695316315,
"learning_rate": 4.0311898594467086e-05,
"loss": 0.0119,
"step": 391
},
{
"epoch": 3.0358180058083253,
"grad_norm": 0.023426620289683342,
"learning_rate": 4.0046347620348586e-05,
"loss": 0.0123,
"step": 392
},
{
"epoch": 3.0435624394966116,
"grad_norm": 0.024129556491971016,
"learning_rate": 3.9781088674681764e-05,
"loss": 0.0124,
"step": 393
},
{
"epoch": 3.0513068731848985,
"grad_norm": 0.04510955512523651,
"learning_rate": 3.951612953985207e-05,
"loss": 0.0174,
"step": 394
},
{
"epoch": 3.059051306873185,
"grad_norm": 0.02260909229516983,
"learning_rate": 3.92514779894488e-05,
"loss": 0.0122,
"step": 395
},
{
"epoch": 3.059051306873185,
"eval_loss": 0.014701277017593384,
"eval_runtime": 4.8794,
"eval_samples_per_second": 10.247,
"eval_steps_per_second": 2.664,
"step": 395
},
{
"epoch": 3.0667957405614716,
"grad_norm": 0.020229579880833626,
"learning_rate": 3.8987141788037154e-05,
"loss": 0.0063,
"step": 396
},
{
"epoch": 3.074540174249758,
"grad_norm": 0.024916259571909904,
"learning_rate": 3.8723128690930296e-05,
"loss": 0.0099,
"step": 397
},
{
"epoch": 3.0822846079380444,
"grad_norm": 0.017238672822713852,
"learning_rate": 3.8459446443961944e-05,
"loss": 0.0071,
"step": 398
},
{
"epoch": 3.090029041626331,
"grad_norm": 0.028883591294288635,
"learning_rate": 3.8196102783258994e-05,
"loss": 0.0181,
"step": 399
},
{
"epoch": 3.0977734753146176,
"grad_norm": 0.025792468339204788,
"learning_rate": 3.793310543501473e-05,
"loss": 0.0136,
"step": 400
},
{
"epoch": 3.0977734753146176,
"eval_loss": 0.014834250323474407,
"eval_runtime": 4.8859,
"eval_samples_per_second": 10.234,
"eval_steps_per_second": 2.661,
"step": 400
},
{
"epoch": 3.1055179090029044,
"grad_norm": 0.03113100863993168,
"learning_rate": 3.7670462115261906e-05,
"loss": 0.0193,
"step": 401
},
{
"epoch": 3.1132623426911907,
"grad_norm": 0.02263321541249752,
"learning_rate": 3.7408180529646596e-05,
"loss": 0.0123,
"step": 402
},
{
"epoch": 3.121006776379477,
"grad_norm": 0.023540707305073738,
"learning_rate": 3.714626837320195e-05,
"loss": 0.0119,
"step": 403
},
{
"epoch": 3.128751210067764,
"grad_norm": 0.031784623861312866,
"learning_rate": 3.688473333012259e-05,
"loss": 0.0175,
"step": 404
},
{
"epoch": 3.1364956437560503,
"grad_norm": 0.022701062262058258,
"learning_rate": 3.6623583073538966e-05,
"loss": 0.0107,
"step": 405
},
{
"epoch": 3.1364956437560503,
"eval_loss": 0.01486950647085905,
"eval_runtime": 4.892,
"eval_samples_per_second": 10.221,
"eval_steps_per_second": 2.657,
"step": 405
},
{
"epoch": 3.144240077444337,
"grad_norm": 0.026784732937812805,
"learning_rate": 3.636282526529242e-05,
"loss": 0.0125,
"step": 406
},
{
"epoch": 3.1519845111326235,
"grad_norm": 0.026719210669398308,
"learning_rate": 3.6102467555710295e-05,
"loss": 0.0103,
"step": 407
},
{
"epoch": 3.15972894482091,
"grad_norm": 0.03489716723561287,
"learning_rate": 3.584251758338151e-05,
"loss": 0.0134,
"step": 408
},
{
"epoch": 3.1674733785091966,
"grad_norm": 0.02056041732430458,
"learning_rate": 3.558298297493247e-05,
"loss": 0.0073,
"step": 409
},
{
"epoch": 3.175217812197483,
"grad_norm": 0.030753985047340393,
"learning_rate": 3.5323871344803263e-05,
"loss": 0.0164,
"step": 410
},
{
"epoch": 3.175217812197483,
"eval_loss": 0.01476968638598919,
"eval_runtime": 4.8875,
"eval_samples_per_second": 10.23,
"eval_steps_per_second": 2.66,
"step": 410
},
{
"epoch": 3.1829622458857694,
"grad_norm": 0.025167269632220268,
"learning_rate": 3.506519029502433e-05,
"loss": 0.0121,
"step": 411
},
{
"epoch": 3.190706679574056,
"grad_norm": 0.03184746950864792,
"learning_rate": 3.480694741499334e-05,
"loss": 0.0174,
"step": 412
},
{
"epoch": 3.1984511132623425,
"grad_norm": 0.014001097530126572,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.0057,
"step": 413
},
{
"epoch": 3.2061955469506294,
"grad_norm": 0.027478694915771484,
"learning_rate": 3.4291806457266826e-05,
"loss": 0.0138,
"step": 414
},
{
"epoch": 3.2139399806389157,
"grad_norm": 0.02516726590692997,
"learning_rate": 3.403492349320101e-05,
"loss": 0.0112,
"step": 415
},
{
"epoch": 3.2139399806389157,
"eval_loss": 0.014760926365852356,
"eval_runtime": 4.876,
"eval_samples_per_second": 10.254,
"eval_steps_per_second": 2.666,
"step": 415
},
{
"epoch": 3.2216844143272025,
"grad_norm": 0.03305725008249283,
"learning_rate": 3.3778508925699124e-05,
"loss": 0.0256,
"step": 416
},
{
"epoch": 3.229428848015489,
"grad_norm": 0.024431169033050537,
"learning_rate": 3.3522570277662985e-05,
"loss": 0.0083,
"step": 417
},
{
"epoch": 3.2371732817037753,
"grad_norm": 0.03031334839761257,
"learning_rate": 3.326711505803142e-05,
"loss": 0.0107,
"step": 418
},
{
"epoch": 3.244917715392062,
"grad_norm": 0.033758629113435745,
"learning_rate": 3.3012150761560085e-05,
"loss": 0.0186,
"step": 419
},
{
"epoch": 3.2526621490803485,
"grad_norm": 0.02770036645233631,
"learning_rate": 3.275768486860149e-05,
"loss": 0.0097,
"step": 420
},
{
"epoch": 3.2526621490803485,
"eval_loss": 0.015088791027665138,
"eval_runtime": 4.8982,
"eval_samples_per_second": 10.208,
"eval_steps_per_second": 2.654,
"step": 420
},
{
"epoch": 3.260406582768635,
"grad_norm": 0.02369946427643299,
"learning_rate": 3.250372484488558e-05,
"loss": 0.0094,
"step": 421
},
{
"epoch": 3.2681510164569216,
"grad_norm": 0.03576388210058212,
"learning_rate": 3.225027814130074e-05,
"loss": 0.0125,
"step": 422
},
{
"epoch": 3.275895450145208,
"grad_norm": 0.025971444323658943,
"learning_rate": 3.199735219367507e-05,
"loss": 0.0118,
"step": 423
},
{
"epoch": 3.283639883833495,
"grad_norm": 0.028038574382662773,
"learning_rate": 3.174495442255836e-05,
"loss": 0.0099,
"step": 424
},
{
"epoch": 3.291384317521781,
"grad_norm": 0.027834760025143623,
"learning_rate": 3.149309223300428e-05,
"loss": 0.0113,
"step": 425
},
{
"epoch": 3.291384317521781,
"eval_loss": 0.014965364709496498,
"eval_runtime": 4.8786,
"eval_samples_per_second": 10.249,
"eval_steps_per_second": 2.665,
"step": 425
},
{
"epoch": 3.299128751210068,
"grad_norm": 0.023443985730409622,
"learning_rate": 3.124177301435324e-05,
"loss": 0.0132,
"step": 426
},
{
"epoch": 3.3068731848983544,
"grad_norm": 0.024410808458924294,
"learning_rate": 3.09910041400154e-05,
"loss": 0.0102,
"step": 427
},
{
"epoch": 3.3146176185866407,
"grad_norm": 0.032607510685920715,
"learning_rate": 3.0740792967254604e-05,
"loss": 0.0168,
"step": 428
},
{
"epoch": 3.3223620522749275,
"grad_norm": 0.03291484713554382,
"learning_rate": 3.0491146836972272e-05,
"loss": 0.019,
"step": 429
},
{
"epoch": 3.330106485963214,
"grad_norm": 0.03559967130422592,
"learning_rate": 3.024207307349224e-05,
"loss": 0.0303,
"step": 430
},
{
"epoch": 3.330106485963214,
"eval_loss": 0.014858649112284184,
"eval_runtime": 4.8834,
"eval_samples_per_second": 10.239,
"eval_steps_per_second": 2.662,
"step": 430
},
{
"epoch": 3.3378509196515003,
"grad_norm": 0.02721838466823101,
"learning_rate": 2.9993578984345672e-05,
"loss": 0.0111,
"step": 431
},
{
"epoch": 3.345595353339787,
"grad_norm": 0.028012285009026527,
"learning_rate": 2.9745671860056868e-05,
"loss": 0.0136,
"step": 432
},
{
"epoch": 3.3533397870280734,
"grad_norm": 0.029208144173026085,
"learning_rate": 2.9498358973929196e-05,
"loss": 0.013,
"step": 433
},
{
"epoch": 3.3610842207163603,
"grad_norm": 0.031169850379228592,
"learning_rate": 2.9251647581831836e-05,
"loss": 0.0187,
"step": 434
},
{
"epoch": 3.3688286544046466,
"grad_norm": 0.03211589530110359,
"learning_rate": 2.900554492198677e-05,
"loss": 0.0161,
"step": 435
},
{
"epoch": 3.3688286544046466,
"eval_loss": 0.014618839137256145,
"eval_runtime": 4.887,
"eval_samples_per_second": 10.231,
"eval_steps_per_second": 2.66,
"step": 435
},
{
"epoch": 3.3765730880929334,
"grad_norm": 0.0314168706536293,
"learning_rate": 2.876005821475657e-05,
"loss": 0.0106,
"step": 436
},
{
"epoch": 3.38431752178122,
"grad_norm": 0.03567107021808624,
"learning_rate": 2.851519466243242e-05,
"loss": 0.0173,
"step": 437
},
{
"epoch": 3.392061955469506,
"grad_norm": 0.031098151579499245,
"learning_rate": 2.8270961449022893e-05,
"loss": 0.0185,
"step": 438
},
{
"epoch": 3.399806389157793,
"grad_norm": 0.028943657875061035,
"learning_rate": 2.802736574004319e-05,
"loss": 0.0159,
"step": 439
},
{
"epoch": 3.4075508228460794,
"grad_norm": 0.023004574701189995,
"learning_rate": 2.7784414682304832e-05,
"loss": 0.011,
"step": 440
},
{
"epoch": 3.4075508228460794,
"eval_loss": 0.014386112801730633,
"eval_runtime": 4.8818,
"eval_samples_per_second": 10.242,
"eval_steps_per_second": 2.663,
"step": 440
},
{
"epoch": 3.4152952565343657,
"grad_norm": 0.027619289234280586,
"learning_rate": 2.7542115403706063e-05,
"loss": 0.0089,
"step": 441
},
{
"epoch": 3.4230396902226525,
"grad_norm": 0.025844210758805275,
"learning_rate": 2.7300475013022663e-05,
"loss": 0.0127,
"step": 442
},
{
"epoch": 3.430784123910939,
"grad_norm": 0.01797422766685486,
"learning_rate": 2.7059500599699476e-05,
"loss": 0.0068,
"step": 443
},
{
"epoch": 3.4385285575992257,
"grad_norm": 0.031139735132455826,
"learning_rate": 2.6819199233642278e-05,
"loss": 0.0135,
"step": 444
},
{
"epoch": 3.446272991287512,
"grad_norm": 0.03126378357410431,
"learning_rate": 2.65795779650105e-05,
"loss": 0.0084,
"step": 445
},
{
"epoch": 3.446272991287512,
"eval_loss": 0.014389649964869022,
"eval_runtime": 4.8893,
"eval_samples_per_second": 10.226,
"eval_steps_per_second": 2.659,
"step": 445
},
{
"epoch": 3.454017424975799,
"grad_norm": 0.019535277038812637,
"learning_rate": 2.6340643824010247e-05,
"loss": 0.0099,
"step": 446
},
{
"epoch": 3.4617618586640853,
"grad_norm": 0.029923155903816223,
"learning_rate": 2.6102403820688177e-05,
"loss": 0.0158,
"step": 447
},
{
"epoch": 3.4695062923523716,
"grad_norm": 0.023479627445340157,
"learning_rate": 2.586486494472572e-05,
"loss": 0.0066,
"step": 448
},
{
"epoch": 3.4772507260406584,
"grad_norm": 0.03173988685011864,
"learning_rate": 2.562803416523405e-05,
"loss": 0.01,
"step": 449
},
{
"epoch": 3.484995159728945,
"grad_norm": 0.03306049853563309,
"learning_rate": 2.539191843054963e-05,
"loss": 0.0127,
"step": 450
},
{
"epoch": 3.484995159728945,
"eval_loss": 0.014806166291236877,
"eval_runtime": 4.9121,
"eval_samples_per_second": 10.179,
"eval_steps_per_second": 2.647,
"step": 450
},
{
"epoch": 3.492739593417231,
"grad_norm": 0.02089696377515793,
"learning_rate": 2.51565246680304e-05,
"loss": 0.0062,
"step": 451
},
{
"epoch": 3.500484027105518,
"grad_norm": 0.03812693804502487,
"learning_rate": 2.4921859783852408e-05,
"loss": 0.0116,
"step": 452
},
{
"epoch": 3.5082284607938043,
"grad_norm": 0.02929401397705078,
"learning_rate": 2.4687930662807303e-05,
"loss": 0.0136,
"step": 453
},
{
"epoch": 3.515972894482091,
"grad_norm": 0.024923592805862427,
"learning_rate": 2.445474416810033e-05,
"loss": 0.0094,
"step": 454
},
{
"epoch": 3.5237173281703775,
"grad_norm": 0.02743164636194706,
"learning_rate": 2.422230714114891e-05,
"loss": 0.0134,
"step": 455
},
{
"epoch": 3.5237173281703775,
"eval_loss": 0.01469426229596138,
"eval_runtime": 4.8924,
"eval_samples_per_second": 10.22,
"eval_steps_per_second": 2.657,
"step": 455
},
{
"epoch": 3.5314617618586643,
"grad_norm": 0.04384300857782364,
"learning_rate": 2.399062640138201e-05,
"loss": 0.0233,
"step": 456
},
{
"epoch": 3.5392061955469507,
"grad_norm": 0.03357204422354698,
"learning_rate": 2.3759708746039976e-05,
"loss": 0.0177,
"step": 457
},
{
"epoch": 3.546950629235237,
"grad_norm": 0.03177043795585632,
"learning_rate": 2.3529560949975182e-05,
"loss": 0.0087,
"step": 458
},
{
"epoch": 3.554695062923524,
"grad_norm": 0.02979344129562378,
"learning_rate": 2.3300189765453196e-05,
"loss": 0.0082,
"step": 459
},
{
"epoch": 3.5624394966118103,
"grad_norm": 0.021871499717235565,
"learning_rate": 2.3071601921954794e-05,
"loss": 0.0092,
"step": 460
},
{
"epoch": 3.5624394966118103,
"eval_loss": 0.014372522011399269,
"eval_runtime": 4.8873,
"eval_samples_per_second": 10.231,
"eval_steps_per_second": 2.66,
"step": 460
},
{
"epoch": 3.5701839303000966,
"grad_norm": 0.027945492416620255,
"learning_rate": 2.2843804125978357e-05,
"loss": 0.0164,
"step": 461
},
{
"epoch": 3.5779283639883834,
"grad_norm": 0.024697836488485336,
"learning_rate": 2.2616803060843283e-05,
"loss": 0.0074,
"step": 462
},
{
"epoch": 3.58567279767667,
"grad_norm": 0.024211924523115158,
"learning_rate": 2.2390605386493757e-05,
"loss": 0.0087,
"step": 463
},
{
"epoch": 3.593417231364956,
"grad_norm": 0.025920916348695755,
"learning_rate": 2.2165217739303508e-05,
"loss": 0.0128,
"step": 464
},
{
"epoch": 3.601161665053243,
"grad_norm": 0.027798939496278763,
"learning_rate": 2.194064673188089e-05,
"loss": 0.0205,
"step": 465
},
{
"epoch": 3.601161665053243,
"eval_loss": 0.014178312383592129,
"eval_runtime": 4.8802,
"eval_samples_per_second": 10.246,
"eval_steps_per_second": 2.664,
"step": 465
},
{
"epoch": 3.60890609874153,
"grad_norm": 0.0249908696860075,
"learning_rate": 2.171689895287513e-05,
"loss": 0.0098,
"step": 466
},
{
"epoch": 3.616650532429816,
"grad_norm": 0.023805009201169014,
"learning_rate": 2.149398096678283e-05,
"loss": 0.0099,
"step": 467
},
{
"epoch": 3.6243949661181025,
"grad_norm": 0.030275024473667145,
"learning_rate": 2.12718993137555e-05,
"loss": 0.0201,
"step": 468
},
{
"epoch": 3.6321393998063893,
"grad_norm": 0.025657106190919876,
"learning_rate": 2.105066050940758e-05,
"loss": 0.0102,
"step": 469
},
{
"epoch": 3.6398838334946757,
"grad_norm": 0.02271328866481781,
"learning_rate": 2.08302710446253e-05,
"loss": 0.0097,
"step": 470
},
{
"epoch": 3.6398838334946757,
"eval_loss": 0.014142417348921299,
"eval_runtime": 4.8856,
"eval_samples_per_second": 10.234,
"eval_steps_per_second": 2.661,
"step": 470
},
{
"epoch": 3.647628267182962,
"grad_norm": 0.026042208075523376,
"learning_rate": 2.061073738537635e-05,
"loss": 0.0177,
"step": 471
},
{
"epoch": 3.655372700871249,
"grad_norm": 0.021258225664496422,
"learning_rate": 2.039206597252001e-05,
"loss": 0.0065,
"step": 472
},
{
"epoch": 3.6631171345595352,
"grad_norm": 0.027606485411524773,
"learning_rate": 2.0174263221618307e-05,
"loss": 0.0127,
"step": 473
},
{
"epoch": 3.6708615682478216,
"grad_norm": 0.02728329971432686,
"learning_rate": 1.9957335522747707e-05,
"loss": 0.0123,
"step": 474
},
{
"epoch": 3.6786060019361084,
"grad_norm": 0.03719132021069527,
"learning_rate": 1.9741289240311755e-05,
"loss": 0.0158,
"step": 475
},
{
"epoch": 3.6786060019361084,
"eval_loss": 0.014227832667529583,
"eval_runtime": 4.8832,
"eval_samples_per_second": 10.239,
"eval_steps_per_second": 2.662,
"step": 475
},
{
"epoch": 3.6863504356243952,
"grad_norm": 0.029825210571289062,
"learning_rate": 1.9526130712854185e-05,
"loss": 0.0128,
"step": 476
},
{
"epoch": 3.6940948693126816,
"grad_norm": 0.09481414407491684,
"learning_rate": 1.931186625287313e-05,
"loss": 0.0202,
"step": 477
},
{
"epoch": 3.701839303000968,
"grad_norm": 0.027814751490950584,
"learning_rate": 1.909850214663575e-05,
"loss": 0.0121,
"step": 478
},
{
"epoch": 3.709583736689255,
"grad_norm": 0.03036467730998993,
"learning_rate": 1.8886044653993968e-05,
"loss": 0.0163,
"step": 479
},
{
"epoch": 3.717328170377541,
"grad_norm": 0.023233845829963684,
"learning_rate": 1.8674500008200674e-05,
"loss": 0.0095,
"step": 480
},
{
"epoch": 3.717328170377541,
"eval_loss": 0.014037776738405228,
"eval_runtime": 4.8827,
"eval_samples_per_second": 10.24,
"eval_steps_per_second": 2.662,
"step": 480
},
{
"epoch": 3.7250726040658275,
"grad_norm": 0.02636660821735859,
"learning_rate": 1.8463874415726918e-05,
"loss": 0.0125,
"step": 481
},
{
"epoch": 3.7328170377541143,
"grad_norm": 0.022603245452046394,
"learning_rate": 1.82541740560798e-05,
"loss": 0.0072,
"step": 482
},
{
"epoch": 3.7405614714424007,
"grad_norm": 0.019264785572886467,
"learning_rate": 1.8045405081621215e-05,
"loss": 0.0068,
"step": 483
},
{
"epoch": 3.748305905130687,
"grad_norm": 0.02744339220225811,
"learning_rate": 1.7837573617387265e-05,
"loss": 0.0139,
"step": 484
},
{
"epoch": 3.756050338818974,
"grad_norm": 0.032306037843227386,
"learning_rate": 1.7630685760908622e-05,
"loss": 0.0163,
"step": 485
},
{
"epoch": 3.756050338818974,
"eval_loss": 0.014070287346839905,
"eval_runtime": 4.8787,
"eval_samples_per_second": 10.249,
"eval_steps_per_second": 2.665,
"step": 485
},
{
"epoch": 3.7637947725072602,
"grad_norm": 0.034267835319042206,
"learning_rate": 1.7424747582031637e-05,
"loss": 0.0145,
"step": 486
},
{
"epoch": 3.771539206195547,
"grad_norm": 0.02139255404472351,
"learning_rate": 1.72197651227402e-05,
"loss": 0.0084,
"step": 487
},
{
"epoch": 3.7792836398838334,
"grad_norm": 0.020995331928133965,
"learning_rate": 1.7015744396978556e-05,
"loss": 0.0065,
"step": 488
},
{
"epoch": 3.7870280735721202,
"grad_norm": 0.03288980573415756,
"learning_rate": 1.6812691390474787e-05,
"loss": 0.0175,
"step": 489
},
{
"epoch": 3.7947725072604066,
"grad_norm": 0.021166102960705757,
"learning_rate": 1.6610612060565234e-05,
"loss": 0.007,
"step": 490
},
{
"epoch": 3.7947725072604066,
"eval_loss": 0.014264380559325218,
"eval_runtime": 4.8993,
"eval_samples_per_second": 10.206,
"eval_steps_per_second": 2.653,
"step": 490
},
{
"epoch": 3.802516940948693,
"grad_norm": 0.02033647708594799,
"learning_rate": 1.64095123360197e-05,
"loss": 0.0081,
"step": 491
},
{
"epoch": 3.81026137463698,
"grad_norm": 0.01951659470796585,
"learning_rate": 1.6209398116867574e-05,
"loss": 0.008,
"step": 492
},
{
"epoch": 3.818005808325266,
"grad_norm": 0.028182433918118477,
"learning_rate": 1.6010275274224606e-05,
"loss": 0.0143,
"step": 493
},
{
"epoch": 3.8257502420135525,
"grad_norm": 0.03811497241258621,
"learning_rate": 1.5812149650120784e-05,
"loss": 0.0139,
"step": 494
},
{
"epoch": 3.8334946757018393,
"grad_norm": 0.02721046842634678,
"learning_rate": 1.561502705732883e-05,
"loss": 0.0069,
"step": 495
},
{
"epoch": 3.8334946757018393,
"eval_loss": 0.014395428821444511,
"eval_runtime": 4.885,
"eval_samples_per_second": 10.235,
"eval_steps_per_second": 2.661,
"step": 495
},
{
"epoch": 3.8412391093901257,
"grad_norm": 0.03506116569042206,
"learning_rate": 1.5418913279193746e-05,
"loss": 0.0154,
"step": 496
},
{
"epoch": 3.8489835430784125,
"grad_norm": 0.029712386429309845,
"learning_rate": 1.5223814069463078e-05,
"loss": 0.0074,
"step": 497
},
{
"epoch": 3.856727976766699,
"grad_norm": 0.021429866552352905,
"learning_rate": 1.5029735152118124e-05,
"loss": 0.0067,
"step": 498
},
{
"epoch": 3.8644724104549857,
"grad_norm": 0.024990901350975037,
"learning_rate": 1.4836682221206e-05,
"loss": 0.0089,
"step": 499
},
{
"epoch": 3.872216844143272,
"grad_norm": 0.0315503366291523,
"learning_rate": 1.4644660940672627e-05,
"loss": 0.012,
"step": 500
},
{
"epoch": 3.872216844143272,
"eval_loss": 0.014392802491784096,
"eval_runtime": 4.8814,
"eval_samples_per_second": 10.243,
"eval_steps_per_second": 2.663,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 645,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.720511304678769e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}