Qwen2.5-half / trainer_state.json
Uploaded by H4nwei via huggingface_hub (commit 4257f15, verified)
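The JSON below is the Trainer state saved with this checkpoint: each entry in "log_history" records the epoch, global step, training loss, learning rate, and gradient norm at one logging step. A minimal sketch (assuming the file has been downloaded locally and matplotlib is installed; not part of the checkpoint itself) for loading it and plotting the loss and learning-rate curves:

```python
import json

import matplotlib.pyplot as plt  # assumed available; any plotting library works

# Load the trainer state and keep only the entries that carry a training loss
# (evaluation entries, if any, use different keys).
with open("trainer_state.json") as f:
    state = json.load(f)

logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]

# Plot training loss and learning-rate schedule against the global step.
fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, [e["loss"] for e in logs])
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, [e["learning_rate"] for e in logs])
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("global step")
plt.tight_layout()
plt.show()
```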
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.48828125,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00048828125,
"grad_norm": 11.68676471710205,
"learning_rate": 1.6129032258064518e-07,
"loss": 0.6894,
"step": 1
},
{
"epoch": 0.0009765625,
"grad_norm": 16.37053871154785,
"learning_rate": 3.2258064516129035e-07,
"loss": 0.7171,
"step": 2
},
{
"epoch": 0.00146484375,
"grad_norm": 23.564491271972656,
"learning_rate": 4.838709677419355e-07,
"loss": 0.7123,
"step": 3
},
{
"epoch": 0.001953125,
"grad_norm": 16.051462173461914,
"learning_rate": 6.451612903225807e-07,
"loss": 0.7445,
"step": 4
},
{
"epoch": 0.00244140625,
"grad_norm": 13.484965324401855,
"learning_rate": 8.064516129032258e-07,
"loss": 0.7697,
"step": 5
},
{
"epoch": 0.0029296875,
"grad_norm": 12.733880043029785,
"learning_rate": 9.67741935483871e-07,
"loss": 0.6796,
"step": 6
},
{
"epoch": 0.00341796875,
"grad_norm": 11.081924438476562,
"learning_rate": 1.1290322580645162e-06,
"loss": 0.6711,
"step": 7
},
{
"epoch": 0.00390625,
"grad_norm": 11.96164321899414,
"learning_rate": 1.2903225806451614e-06,
"loss": 0.6916,
"step": 8
},
{
"epoch": 0.00439453125,
"grad_norm": 8.69968318939209,
"learning_rate": 1.4516129032258066e-06,
"loss": 0.6125,
"step": 9
},
{
"epoch": 0.0048828125,
"grad_norm": 8.749759674072266,
"learning_rate": 1.6129032258064516e-06,
"loss": 0.5684,
"step": 10
},
{
"epoch": 0.00537109375,
"grad_norm": 9.206546783447266,
"learning_rate": 1.774193548387097e-06,
"loss": 0.5901,
"step": 11
},
{
"epoch": 0.005859375,
"grad_norm": 6.172158718109131,
"learning_rate": 1.935483870967742e-06,
"loss": 0.5147,
"step": 12
},
{
"epoch": 0.00634765625,
"grad_norm": 5.583189010620117,
"learning_rate": 2.096774193548387e-06,
"loss": 0.5078,
"step": 13
},
{
"epoch": 0.0068359375,
"grad_norm": 8.174113273620605,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.5151,
"step": 14
},
{
"epoch": 0.00732421875,
"grad_norm": 11.44507122039795,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.5215,
"step": 15
},
{
"epoch": 0.0078125,
"grad_norm": 4.763265132904053,
"learning_rate": 2.580645161290323e-06,
"loss": 0.5062,
"step": 16
},
{
"epoch": 0.00830078125,
"grad_norm": 7.144759178161621,
"learning_rate": 2.7419354838709676e-06,
"loss": 0.5313,
"step": 17
},
{
"epoch": 0.0087890625,
"grad_norm": 4.595753192901611,
"learning_rate": 2.903225806451613e-06,
"loss": 0.4514,
"step": 18
},
{
"epoch": 0.00927734375,
"grad_norm": 5.988632678985596,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.468,
"step": 19
},
{
"epoch": 0.009765625,
"grad_norm": 5.993471145629883,
"learning_rate": 3.225806451612903e-06,
"loss": 0.4231,
"step": 20
},
{
"epoch": 0.01025390625,
"grad_norm": 5.629610538482666,
"learning_rate": 3.3870967741935484e-06,
"loss": 0.4748,
"step": 21
},
{
"epoch": 0.0107421875,
"grad_norm": 5.070748329162598,
"learning_rate": 3.548387096774194e-06,
"loss": 0.4851,
"step": 22
},
{
"epoch": 0.01123046875,
"grad_norm": 5.008419990539551,
"learning_rate": 3.7096774193548392e-06,
"loss": 0.4251,
"step": 23
},
{
"epoch": 0.01171875,
"grad_norm": 5.048961162567139,
"learning_rate": 3.870967741935484e-06,
"loss": 0.4423,
"step": 24
},
{
"epoch": 0.01220703125,
"grad_norm": 3.505443811416626,
"learning_rate": 4.032258064516129e-06,
"loss": 0.4165,
"step": 25
},
{
"epoch": 0.0126953125,
"grad_norm": 4.471498966217041,
"learning_rate": 4.193548387096774e-06,
"loss": 0.4132,
"step": 26
},
{
"epoch": 0.01318359375,
"grad_norm": 3.593733310699463,
"learning_rate": 4.35483870967742e-06,
"loss": 0.38,
"step": 27
},
{
"epoch": 0.013671875,
"grad_norm": 7.17294979095459,
"learning_rate": 4.516129032258065e-06,
"loss": 0.3956,
"step": 28
},
{
"epoch": 0.01416015625,
"grad_norm": 15.088685989379883,
"learning_rate": 4.67741935483871e-06,
"loss": 0.4425,
"step": 29
},
{
"epoch": 0.0146484375,
"grad_norm": 4.4346113204956055,
"learning_rate": 4.838709677419355e-06,
"loss": 0.3911,
"step": 30
},
{
"epoch": 0.01513671875,
"grad_norm": 4.740771293640137,
"learning_rate": 5e-06,
"loss": 0.423,
"step": 31
},
{
"epoch": 0.015625,
"grad_norm": 3.4211642742156982,
"learning_rate": 5.161290322580646e-06,
"loss": 0.4183,
"step": 32
},
{
"epoch": 0.01611328125,
"grad_norm": 5.500433444976807,
"learning_rate": 5.322580645161291e-06,
"loss": 0.3956,
"step": 33
},
{
"epoch": 0.0166015625,
"grad_norm": 4.092607021331787,
"learning_rate": 5.483870967741935e-06,
"loss": 0.4028,
"step": 34
},
{
"epoch": 0.01708984375,
"grad_norm": 12.963457107543945,
"learning_rate": 5.645161290322582e-06,
"loss": 0.3862,
"step": 35
},
{
"epoch": 0.017578125,
"grad_norm": 4.550689697265625,
"learning_rate": 5.806451612903226e-06,
"loss": 0.4078,
"step": 36
},
{
"epoch": 0.01806640625,
"grad_norm": 3.3017280101776123,
"learning_rate": 5.967741935483872e-06,
"loss": 0.4334,
"step": 37
},
{
"epoch": 0.0185546875,
"grad_norm": 4.2097954750061035,
"learning_rate": 6.129032258064517e-06,
"loss": 0.342,
"step": 38
},
{
"epoch": 0.01904296875,
"grad_norm": 2.9576752185821533,
"learning_rate": 6.290322580645162e-06,
"loss": 0.3824,
"step": 39
},
{
"epoch": 0.01953125,
"grad_norm": 6.747947692871094,
"learning_rate": 6.451612903225806e-06,
"loss": 0.3952,
"step": 40
},
{
"epoch": 0.02001953125,
"grad_norm": 2.851712942123413,
"learning_rate": 6.612903225806452e-06,
"loss": 0.4143,
"step": 41
},
{
"epoch": 0.0205078125,
"grad_norm": 3.3788578510284424,
"learning_rate": 6.774193548387097e-06,
"loss": 0.3733,
"step": 42
},
{
"epoch": 0.02099609375,
"grad_norm": 4.708284378051758,
"learning_rate": 6.935483870967743e-06,
"loss": 0.3955,
"step": 43
},
{
"epoch": 0.021484375,
"grad_norm": 3.0566701889038086,
"learning_rate": 7.096774193548388e-06,
"loss": 0.402,
"step": 44
},
{
"epoch": 0.02197265625,
"grad_norm": 4.440851211547852,
"learning_rate": 7.258064516129033e-06,
"loss": 0.361,
"step": 45
},
{
"epoch": 0.0224609375,
"grad_norm": 2.7747905254364014,
"learning_rate": 7.4193548387096784e-06,
"loss": 0.3896,
"step": 46
},
{
"epoch": 0.02294921875,
"grad_norm": 3.510695695877075,
"learning_rate": 7.580645161290323e-06,
"loss": 0.364,
"step": 47
},
{
"epoch": 0.0234375,
"grad_norm": 20.806020736694336,
"learning_rate": 7.741935483870968e-06,
"loss": 0.3849,
"step": 48
},
{
"epoch": 0.02392578125,
"grad_norm": 3.569124698638916,
"learning_rate": 7.903225806451613e-06,
"loss": 0.3569,
"step": 49
},
{
"epoch": 0.0244140625,
"grad_norm": 2.8412413597106934,
"learning_rate": 8.064516129032258e-06,
"loss": 0.362,
"step": 50
},
{
"epoch": 0.02490234375,
"grad_norm": 3.287231683731079,
"learning_rate": 8.225806451612904e-06,
"loss": 0.3941,
"step": 51
},
{
"epoch": 0.025390625,
"grad_norm": 2.849888563156128,
"learning_rate": 8.387096774193549e-06,
"loss": 0.3906,
"step": 52
},
{
"epoch": 0.02587890625,
"grad_norm": 6.925948619842529,
"learning_rate": 8.548387096774194e-06,
"loss": 0.3783,
"step": 53
},
{
"epoch": 0.0263671875,
"grad_norm": 2.9347381591796875,
"learning_rate": 8.70967741935484e-06,
"loss": 0.4156,
"step": 54
},
{
"epoch": 0.02685546875,
"grad_norm": 3.695150375366211,
"learning_rate": 8.870967741935484e-06,
"loss": 0.3586,
"step": 55
},
{
"epoch": 0.02734375,
"grad_norm": 7.241847038269043,
"learning_rate": 9.03225806451613e-06,
"loss": 0.3693,
"step": 56
},
{
"epoch": 0.02783203125,
"grad_norm": 2.603956699371338,
"learning_rate": 9.193548387096775e-06,
"loss": 0.4109,
"step": 57
},
{
"epoch": 0.0283203125,
"grad_norm": 3.0118958950042725,
"learning_rate": 9.35483870967742e-06,
"loss": 0.4096,
"step": 58
},
{
"epoch": 0.02880859375,
"grad_norm": 5.108702182769775,
"learning_rate": 9.516129032258065e-06,
"loss": 0.3786,
"step": 59
},
{
"epoch": 0.029296875,
"grad_norm": 3.0591766834259033,
"learning_rate": 9.67741935483871e-06,
"loss": 0.3979,
"step": 60
},
{
"epoch": 0.02978515625,
"grad_norm": 3.5517218112945557,
"learning_rate": 9.838709677419356e-06,
"loss": 0.3847,
"step": 61
},
{
"epoch": 0.0302734375,
"grad_norm": 3.091423988342285,
"learning_rate": 1e-05,
"loss": 0.35,
"step": 62
},
{
"epoch": 0.03076171875,
"grad_norm": 2.7133779525756836,
"learning_rate": 9.999993744224208e-06,
"loss": 0.3592,
"step": 63
},
{
"epoch": 0.03125,
"grad_norm": 2.4324684143066406,
"learning_rate": 9.999974976912485e-06,
"loss": 0.3616,
"step": 64
},
{
"epoch": 0.03173828125,
"grad_norm": 2.872821807861328,
"learning_rate": 9.999943698111792e-06,
"loss": 0.3741,
"step": 65
},
{
"epoch": 0.0322265625,
"grad_norm": 2.9383156299591064,
"learning_rate": 9.999899907900399e-06,
"loss": 0.3732,
"step": 66
},
{
"epoch": 0.03271484375,
"grad_norm": 3.5359489917755127,
"learning_rate": 9.999843606387883e-06,
"loss": 0.4053,
"step": 67
},
{
"epoch": 0.033203125,
"grad_norm": 3.5608558654785156,
"learning_rate": 9.999774793715126e-06,
"loss": 0.4197,
"step": 68
},
{
"epoch": 0.03369140625,
"grad_norm": 2.5407004356384277,
"learning_rate": 9.999693470054321e-06,
"loss": 0.354,
"step": 69
},
{
"epoch": 0.0341796875,
"grad_norm": 3.4264254570007324,
"learning_rate": 9.999599635608964e-06,
"loss": 0.3936,
"step": 70
},
{
"epoch": 0.03466796875,
"grad_norm": 3.0363235473632812,
"learning_rate": 9.999493290613859e-06,
"loss": 0.3753,
"step": 71
},
{
"epoch": 0.03515625,
"grad_norm": 2.2824833393096924,
"learning_rate": 9.999374435335113e-06,
"loss": 0.3813,
"step": 72
},
{
"epoch": 0.03564453125,
"grad_norm": 2.445328712463379,
"learning_rate": 9.999243070070137e-06,
"loss": 0.4237,
"step": 73
},
{
"epoch": 0.0361328125,
"grad_norm": 5.150700092315674,
"learning_rate": 9.99909919514765e-06,
"loss": 0.3892,
"step": 74
},
{
"epoch": 0.03662109375,
"grad_norm": 4.1412272453308105,
"learning_rate": 9.998942810927673e-06,
"loss": 0.3675,
"step": 75
},
{
"epoch": 0.037109375,
"grad_norm": 5.456881999969482,
"learning_rate": 9.998773917801526e-06,
"loss": 0.3955,
"step": 76
},
{
"epoch": 0.03759765625,
"grad_norm": 2.2837321758270264,
"learning_rate": 9.998592516191832e-06,
"loss": 0.3477,
"step": 77
},
{
"epoch": 0.0380859375,
"grad_norm": 2.237900972366333,
"learning_rate": 9.998398606552513e-06,
"loss": 0.3771,
"step": 78
},
{
"epoch": 0.03857421875,
"grad_norm": 2.6276211738586426,
"learning_rate": 9.998192189368795e-06,
"loss": 0.3989,
"step": 79
},
{
"epoch": 0.0390625,
"grad_norm": 3.5280210971832275,
"learning_rate": 9.997973265157192e-06,
"loss": 0.3726,
"step": 80
},
{
"epoch": 0.03955078125,
"grad_norm": 8.555140495300293,
"learning_rate": 9.997741834465526e-06,
"loss": 0.397,
"step": 81
},
{
"epoch": 0.0400390625,
"grad_norm": 2.1885085105895996,
"learning_rate": 9.997497897872904e-06,
"loss": 0.4058,
"step": 82
},
{
"epoch": 0.04052734375,
"grad_norm": 3.0636098384857178,
"learning_rate": 9.997241455989735e-06,
"loss": 0.3866,
"step": 83
},
{
"epoch": 0.041015625,
"grad_norm": 3.7982375621795654,
"learning_rate": 9.996972509457711e-06,
"loss": 0.3877,
"step": 84
},
{
"epoch": 0.04150390625,
"grad_norm": 2.4791505336761475,
"learning_rate": 9.996691058949826e-06,
"loss": 0.3789,
"step": 85
},
{
"epoch": 0.0419921875,
"grad_norm": 3.917693614959717,
"learning_rate": 9.996397105170353e-06,
"loss": 0.3737,
"step": 86
},
{
"epoch": 0.04248046875,
"grad_norm": 2.3083252906799316,
"learning_rate": 9.996090648854856e-06,
"loss": 0.3658,
"step": 87
},
{
"epoch": 0.04296875,
"grad_norm": 1.9872547388076782,
"learning_rate": 9.995771690770184e-06,
"loss": 0.3819,
"step": 88
},
{
"epoch": 0.04345703125,
"grad_norm": 1.8703923225402832,
"learning_rate": 9.995440231714469e-06,
"loss": 0.37,
"step": 89
},
{
"epoch": 0.0439453125,
"grad_norm": 2.7573578357696533,
"learning_rate": 9.995096272517122e-06,
"loss": 0.3876,
"step": 90
},
{
"epoch": 0.04443359375,
"grad_norm": 2.177542209625244,
"learning_rate": 9.99473981403884e-06,
"loss": 0.434,
"step": 91
},
{
"epoch": 0.044921875,
"grad_norm": 1.9339114427566528,
"learning_rate": 9.99437085717159e-06,
"loss": 0.333,
"step": 92
},
{
"epoch": 0.04541015625,
"grad_norm": 2.9820590019226074,
"learning_rate": 9.993989402838618e-06,
"loss": 0.3321,
"step": 93
},
{
"epoch": 0.0458984375,
"grad_norm": 2.0244717597961426,
"learning_rate": 9.99359545199444e-06,
"loss": 0.3153,
"step": 94
},
{
"epoch": 0.04638671875,
"grad_norm": 2.0268101692199707,
"learning_rate": 9.993189005624842e-06,
"loss": 0.3663,
"step": 95
},
{
"epoch": 0.046875,
"grad_norm": 1.920785903930664,
"learning_rate": 9.992770064746882e-06,
"loss": 0.3419,
"step": 96
},
{
"epoch": 0.04736328125,
"grad_norm": 3.2875781059265137,
"learning_rate": 9.992338630408877e-06,
"loss": 0.3406,
"step": 97
},
{
"epoch": 0.0478515625,
"grad_norm": 3.7749016284942627,
"learning_rate": 9.991894703690414e-06,
"loss": 0.3555,
"step": 98
},
{
"epoch": 0.04833984375,
"grad_norm": 4.618077754974365,
"learning_rate": 9.991438285702332e-06,
"loss": 0.4001,
"step": 99
},
{
"epoch": 0.048828125,
"grad_norm": 2.468576192855835,
"learning_rate": 9.99096937758673e-06,
"loss": 0.4258,
"step": 100
},
{
"epoch": 0.04931640625,
"grad_norm": 5.204842567443848,
"learning_rate": 9.990487980516962e-06,
"loss": 0.4107,
"step": 101
},
{
"epoch": 0.0498046875,
"grad_norm": 3.3488011360168457,
"learning_rate": 9.989994095697636e-06,
"loss": 0.3658,
"step": 102
},
{
"epoch": 0.05029296875,
"grad_norm": 4.41386079788208,
"learning_rate": 9.989487724364602e-06,
"loss": 0.3705,
"step": 103
},
{
"epoch": 0.05078125,
"grad_norm": 2.9542033672332764,
"learning_rate": 9.988968867784958e-06,
"loss": 0.3955,
"step": 104
},
{
"epoch": 0.05126953125,
"grad_norm": 2.3820998668670654,
"learning_rate": 9.988437527257044e-06,
"loss": 0.3652,
"step": 105
},
{
"epoch": 0.0517578125,
"grad_norm": 2.352477550506592,
"learning_rate": 9.987893704110441e-06,
"loss": 0.3545,
"step": 106
},
{
"epoch": 0.05224609375,
"grad_norm": 7.221553802490234,
"learning_rate": 9.987337399705964e-06,
"loss": 0.3616,
"step": 107
},
{
"epoch": 0.052734375,
"grad_norm": 2.3267176151275635,
"learning_rate": 9.986768615435655e-06,
"loss": 0.3868,
"step": 108
},
{
"epoch": 0.05322265625,
"grad_norm": 1.9337338209152222,
"learning_rate": 9.986187352722792e-06,
"loss": 0.3664,
"step": 109
},
{
"epoch": 0.0537109375,
"grad_norm": 2.2121877670288086,
"learning_rate": 9.985593613021873e-06,
"loss": 0.3731,
"step": 110
},
{
"epoch": 0.05419921875,
"grad_norm": 1.9584633111953735,
"learning_rate": 9.98498739781862e-06,
"loss": 0.3805,
"step": 111
},
{
"epoch": 0.0546875,
"grad_norm": 2.3601884841918945,
"learning_rate": 9.984368708629972e-06,
"loss": 0.3328,
"step": 112
},
{
"epoch": 0.05517578125,
"grad_norm": 2.705298662185669,
"learning_rate": 9.98373754700408e-06,
"loss": 0.3573,
"step": 113
},
{
"epoch": 0.0556640625,
"grad_norm": 4.535929203033447,
"learning_rate": 9.98309391452031e-06,
"loss": 0.3853,
"step": 114
},
{
"epoch": 0.05615234375,
"grad_norm": 2.4388949871063232,
"learning_rate": 9.982437812789224e-06,
"loss": 0.3389,
"step": 115
},
{
"epoch": 0.056640625,
"grad_norm": 3.7873549461364746,
"learning_rate": 9.981769243452595e-06,
"loss": 0.3745,
"step": 116
},
{
"epoch": 0.05712890625,
"grad_norm": 2.1249921321868896,
"learning_rate": 9.981088208183392e-06,
"loss": 0.3854,
"step": 117
},
{
"epoch": 0.0576171875,
"grad_norm": 3.2426087856292725,
"learning_rate": 9.980394708685777e-06,
"loss": 0.3743,
"step": 118
},
{
"epoch": 0.05810546875,
"grad_norm": 2.349886178970337,
"learning_rate": 9.979688746695099e-06,
"loss": 0.3477,
"step": 119
},
{
"epoch": 0.05859375,
"grad_norm": 2.6616315841674805,
"learning_rate": 9.978970323977895e-06,
"loss": 0.3497,
"step": 120
},
{
"epoch": 0.05908203125,
"grad_norm": 2.284364938735962,
"learning_rate": 9.978239442331881e-06,
"loss": 0.3987,
"step": 121
},
{
"epoch": 0.0595703125,
"grad_norm": 2.347794532775879,
"learning_rate": 9.977496103585949e-06,
"loss": 0.3375,
"step": 122
},
{
"epoch": 0.06005859375,
"grad_norm": 2.8935320377349854,
"learning_rate": 9.976740309600166e-06,
"loss": 0.3943,
"step": 123
},
{
"epoch": 0.060546875,
"grad_norm": 2.3763160705566406,
"learning_rate": 9.97597206226576e-06,
"loss": 0.3703,
"step": 124
},
{
"epoch": 0.06103515625,
"grad_norm": 2.1485118865966797,
"learning_rate": 9.975191363505127e-06,
"loss": 0.3604,
"step": 125
},
{
"epoch": 0.0615234375,
"grad_norm": 4.019608020782471,
"learning_rate": 9.974398215271814e-06,
"loss": 0.3345,
"step": 126
},
{
"epoch": 0.06201171875,
"grad_norm": 4.793520450592041,
"learning_rate": 9.973592619550528e-06,
"loss": 0.3583,
"step": 127
},
{
"epoch": 0.0625,
"grad_norm": 2.3743088245391846,
"learning_rate": 9.972774578357118e-06,
"loss": 0.3612,
"step": 128
},
{
"epoch": 0.06298828125,
"grad_norm": 2.3221397399902344,
"learning_rate": 9.971944093738575e-06,
"loss": 0.3759,
"step": 129
},
{
"epoch": 0.0634765625,
"grad_norm": 2.639760971069336,
"learning_rate": 9.971101167773032e-06,
"loss": 0.3749,
"step": 130
},
{
"epoch": 0.06396484375,
"grad_norm": 2.3176326751708984,
"learning_rate": 9.97024580256975e-06,
"loss": 0.3324,
"step": 131
},
{
"epoch": 0.064453125,
"grad_norm": 2.5662341117858887,
"learning_rate": 9.969378000269117e-06,
"loss": 0.3956,
"step": 132
},
{
"epoch": 0.06494140625,
"grad_norm": 3.271336793899536,
"learning_rate": 9.968497763042644e-06,
"loss": 0.3702,
"step": 133
},
{
"epoch": 0.0654296875,
"grad_norm": 2.0121848583221436,
"learning_rate": 9.96760509309296e-06,
"loss": 0.3644,
"step": 134
},
{
"epoch": 0.06591796875,
"grad_norm": 2.1467254161834717,
"learning_rate": 9.9666999926538e-06,
"loss": 0.3444,
"step": 135
},
{
"epoch": 0.06640625,
"grad_norm": 2.985793113708496,
"learning_rate": 9.96578246399001e-06,
"loss": 0.4015,
"step": 136
},
{
"epoch": 0.06689453125,
"grad_norm": 2.158658504486084,
"learning_rate": 9.964852509397527e-06,
"loss": 0.3809,
"step": 137
},
{
"epoch": 0.0673828125,
"grad_norm": 4.1197919845581055,
"learning_rate": 9.963910131203386e-06,
"loss": 0.3874,
"step": 138
},
{
"epoch": 0.06787109375,
"grad_norm": 2.2979846000671387,
"learning_rate": 9.962955331765712e-06,
"loss": 0.342,
"step": 139
},
{
"epoch": 0.068359375,
"grad_norm": 2.2568418979644775,
"learning_rate": 9.961988113473708e-06,
"loss": 0.3223,
"step": 140
},
{
"epoch": 0.06884765625,
"grad_norm": 2.358520030975342,
"learning_rate": 9.961008478747655e-06,
"loss": 0.374,
"step": 141
},
{
"epoch": 0.0693359375,
"grad_norm": 2.6409096717834473,
"learning_rate": 9.960016430038903e-06,
"loss": 0.3705,
"step": 142
},
{
"epoch": 0.06982421875,
"grad_norm": 2.167280673980713,
"learning_rate": 9.959011969829867e-06,
"loss": 0.3302,
"step": 143
},
{
"epoch": 0.0703125,
"grad_norm": 2.3867969512939453,
"learning_rate": 9.957995100634016e-06,
"loss": 0.3251,
"step": 144
},
{
"epoch": 0.07080078125,
"grad_norm": 2.305117130279541,
"learning_rate": 9.956965824995873e-06,
"loss": 0.3593,
"step": 145
},
{
"epoch": 0.0712890625,
"grad_norm": 2.1817824840545654,
"learning_rate": 9.955924145491005e-06,
"loss": 0.3371,
"step": 146
},
{
"epoch": 0.07177734375,
"grad_norm": 4.12109375,
"learning_rate": 9.954870064726017e-06,
"loss": 0.3771,
"step": 147
},
{
"epoch": 0.072265625,
"grad_norm": 3.0079329013824463,
"learning_rate": 9.953803585338548e-06,
"loss": 0.3636,
"step": 148
},
{
"epoch": 0.07275390625,
"grad_norm": 2.473532199859619,
"learning_rate": 9.95272470999726e-06,
"loss": 0.3692,
"step": 149
},
{
"epoch": 0.0732421875,
"grad_norm": 3.1922385692596436,
"learning_rate": 9.95163344140183e-06,
"loss": 0.3773,
"step": 150
},
{
"epoch": 0.07373046875,
"grad_norm": 6.991460800170898,
"learning_rate": 9.950529782282955e-06,
"loss": 0.2813,
"step": 151
},
{
"epoch": 0.07421875,
"grad_norm": 2.9967305660247803,
"learning_rate": 9.949413735402332e-06,
"loss": 0.3565,
"step": 152
},
{
"epoch": 0.07470703125,
"grad_norm": 1.8642289638519287,
"learning_rate": 9.948285303552654e-06,
"loss": 0.3715,
"step": 153
},
{
"epoch": 0.0751953125,
"grad_norm": 2.169416904449463,
"learning_rate": 9.947144489557612e-06,
"loss": 0.3507,
"step": 154
},
{
"epoch": 0.07568359375,
"grad_norm": 2.5897326469421387,
"learning_rate": 9.945991296271874e-06,
"loss": 0.3508,
"step": 155
},
{
"epoch": 0.076171875,
"grad_norm": 1.8967130184173584,
"learning_rate": 9.944825726581085e-06,
"loss": 0.318,
"step": 156
},
{
"epoch": 0.07666015625,
"grad_norm": 1.998544454574585,
"learning_rate": 9.943647783401867e-06,
"loss": 0.3757,
"step": 157
},
{
"epoch": 0.0771484375,
"grad_norm": 2.5188403129577637,
"learning_rate": 9.942457469681794e-06,
"loss": 0.3551,
"step": 158
},
{
"epoch": 0.07763671875,
"grad_norm": 2.2102835178375244,
"learning_rate": 9.941254788399406e-06,
"loss": 0.3499,
"step": 159
},
{
"epoch": 0.078125,
"grad_norm": 3.3190438747406006,
"learning_rate": 9.940039742564182e-06,
"loss": 0.3586,
"step": 160
},
{
"epoch": 0.07861328125,
"grad_norm": 6.675033092498779,
"learning_rate": 9.938812335216543e-06,
"loss": 0.3892,
"step": 161
},
{
"epoch": 0.0791015625,
"grad_norm": 3.091517925262451,
"learning_rate": 9.937572569427844e-06,
"loss": 0.3434,
"step": 162
},
{
"epoch": 0.07958984375,
"grad_norm": 2.7739408016204834,
"learning_rate": 9.936320448300364e-06,
"loss": 0.3366,
"step": 163
},
{
"epoch": 0.080078125,
"grad_norm": 4.218409538269043,
"learning_rate": 9.935055974967299e-06,
"loss": 0.3129,
"step": 164
},
{
"epoch": 0.08056640625,
"grad_norm": 2.2632052898406982,
"learning_rate": 9.933779152592752e-06,
"loss": 0.3507,
"step": 165
},
{
"epoch": 0.0810546875,
"grad_norm": 2.3607664108276367,
"learning_rate": 9.93248998437173e-06,
"loss": 0.3598,
"step": 166
},
{
"epoch": 0.08154296875,
"grad_norm": 2.2539124488830566,
"learning_rate": 9.931188473530132e-06,
"loss": 0.404,
"step": 167
},
{
"epoch": 0.08203125,
"grad_norm": 2.049994945526123,
"learning_rate": 9.929874623324741e-06,
"loss": 0.3534,
"step": 168
},
{
"epoch": 0.08251953125,
"grad_norm": 4.720448017120361,
"learning_rate": 9.92854843704322e-06,
"loss": 0.3492,
"step": 169
},
{
"epoch": 0.0830078125,
"grad_norm": 2.1875171661376953,
"learning_rate": 9.927209918004095e-06,
"loss": 0.3765,
"step": 170
},
{
"epoch": 0.08349609375,
"grad_norm": 6.087578773498535,
"learning_rate": 9.92585906955676e-06,
"loss": 0.3519,
"step": 171
},
{
"epoch": 0.083984375,
"grad_norm": 6.033719539642334,
"learning_rate": 9.924495895081455e-06,
"loss": 0.3493,
"step": 172
},
{
"epoch": 0.08447265625,
"grad_norm": 4.239842414855957,
"learning_rate": 9.923120397989265e-06,
"loss": 0.3566,
"step": 173
},
{
"epoch": 0.0849609375,
"grad_norm": 3.4344899654388428,
"learning_rate": 9.92173258172211e-06,
"loss": 0.3291,
"step": 174
},
{
"epoch": 0.08544921875,
"grad_norm": 2.5044116973876953,
"learning_rate": 9.920332449752741e-06,
"loss": 0.368,
"step": 175
},
{
"epoch": 0.0859375,
"grad_norm": 2.5513086318969727,
"learning_rate": 9.91892000558472e-06,
"loss": 0.3715,
"step": 176
},
{
"epoch": 0.08642578125,
"grad_norm": 3.1087024211883545,
"learning_rate": 9.917495252752418e-06,
"loss": 0.3421,
"step": 177
},
{
"epoch": 0.0869140625,
"grad_norm": 4.5129194259643555,
"learning_rate": 9.916058194821013e-06,
"loss": 0.3348,
"step": 178
},
{
"epoch": 0.08740234375,
"grad_norm": 2.54546856880188,
"learning_rate": 9.914608835386468e-06,
"loss": 0.3741,
"step": 179
},
{
"epoch": 0.087890625,
"grad_norm": 3.379059314727783,
"learning_rate": 9.913147178075531e-06,
"loss": 0.3633,
"step": 180
},
{
"epoch": 0.08837890625,
"grad_norm": 2.6582908630371094,
"learning_rate": 9.911673226545721e-06,
"loss": 0.3626,
"step": 181
},
{
"epoch": 0.0888671875,
"grad_norm": 2.116603374481201,
"learning_rate": 9.910186984485321e-06,
"loss": 0.3627,
"step": 182
},
{
"epoch": 0.08935546875,
"grad_norm": 3.2947633266448975,
"learning_rate": 9.908688455613374e-06,
"loss": 0.3264,
"step": 183
},
{
"epoch": 0.08984375,
"grad_norm": 2.313702344894409,
"learning_rate": 9.90717764367966e-06,
"loss": 0.3285,
"step": 184
},
{
"epoch": 0.09033203125,
"grad_norm": 2.2801687717437744,
"learning_rate": 9.9056545524647e-06,
"loss": 0.3573,
"step": 185
},
{
"epoch": 0.0908203125,
"grad_norm": 3.657966375350952,
"learning_rate": 9.904119185779744e-06,
"loss": 0.3711,
"step": 186
},
{
"epoch": 0.09130859375,
"grad_norm": 22.30857276916504,
"learning_rate": 9.902571547466753e-06,
"loss": 0.3995,
"step": 187
},
{
"epoch": 0.091796875,
"grad_norm": 2.184039831161499,
"learning_rate": 9.901011641398398e-06,
"loss": 0.3654,
"step": 188
},
{
"epoch": 0.09228515625,
"grad_norm": 4.786393165588379,
"learning_rate": 9.89943947147805e-06,
"loss": 0.3859,
"step": 189
},
{
"epoch": 0.0927734375,
"grad_norm": 2.666750431060791,
"learning_rate": 9.897855041639764e-06,
"loss": 0.3888,
"step": 190
},
{
"epoch": 0.09326171875,
"grad_norm": 2.0390570163726807,
"learning_rate": 9.896258355848277e-06,
"loss": 0.3488,
"step": 191
},
{
"epoch": 0.09375,
"grad_norm": 2.618748188018799,
"learning_rate": 9.894649418098992e-06,
"loss": 0.3513,
"step": 192
},
{
"epoch": 0.09423828125,
"grad_norm": 2.525346040725708,
"learning_rate": 9.89302823241797e-06,
"loss": 0.3689,
"step": 193
},
{
"epoch": 0.0947265625,
"grad_norm": 2.0813663005828857,
"learning_rate": 9.89139480286192e-06,
"loss": 0.3718,
"step": 194
},
{
"epoch": 0.09521484375,
"grad_norm": 3.025359630584717,
"learning_rate": 9.88974913351819e-06,
"loss": 0.3786,
"step": 195
},
{
"epoch": 0.095703125,
"grad_norm": 2.8500590324401855,
"learning_rate": 9.888091228504757e-06,
"loss": 0.3481,
"step": 196
},
{
"epoch": 0.09619140625,
"grad_norm": 2.450500249862671,
"learning_rate": 9.88642109197021e-06,
"loss": 0.383,
"step": 197
},
{
"epoch": 0.0966796875,
"grad_norm": 1.9162877798080444,
"learning_rate": 9.884738728093754e-06,
"loss": 0.3698,
"step": 198
},
{
"epoch": 0.09716796875,
"grad_norm": 14.184158325195312,
"learning_rate": 9.883044141085183e-06,
"loss": 0.3327,
"step": 199
},
{
"epoch": 0.09765625,
"grad_norm": 3.0886130332946777,
"learning_rate": 9.881337335184879e-06,
"loss": 0.3767,
"step": 200
},
{
"epoch": 0.09814453125,
"grad_norm": 2.5864577293395996,
"learning_rate": 9.879618314663799e-06,
"loss": 0.3498,
"step": 201
},
{
"epoch": 0.0986328125,
"grad_norm": 3.3661086559295654,
"learning_rate": 9.87788708382347e-06,
"loss": 0.3487,
"step": 202
},
{
"epoch": 0.09912109375,
"grad_norm": 2.543836832046509,
"learning_rate": 9.876143646995964e-06,
"loss": 0.3611,
"step": 203
},
{
"epoch": 0.099609375,
"grad_norm": 2.209348201751709,
"learning_rate": 9.874388008543903e-06,
"loss": 0.3303,
"step": 204
},
{
"epoch": 0.10009765625,
"grad_norm": 8.464391708374023,
"learning_rate": 9.87262017286044e-06,
"loss": 0.3915,
"step": 205
},
{
"epoch": 0.1005859375,
"grad_norm": 2.339383125305176,
"learning_rate": 9.870840144369247e-06,
"loss": 0.3386,
"step": 206
},
{
"epoch": 0.10107421875,
"grad_norm": 4.952784538269043,
"learning_rate": 9.869047927524508e-06,
"loss": 0.3189,
"step": 207
},
{
"epoch": 0.1015625,
"grad_norm": 2.147639036178589,
"learning_rate": 9.867243526810909e-06,
"loss": 0.325,
"step": 208
},
{
"epoch": 0.10205078125,
"grad_norm": 2.364194393157959,
"learning_rate": 9.865426946743614e-06,
"loss": 0.3728,
"step": 209
},
{
"epoch": 0.1025390625,
"grad_norm": 2.0875487327575684,
"learning_rate": 9.863598191868275e-06,
"loss": 0.3493,
"step": 210
},
{
"epoch": 0.10302734375,
"grad_norm": 3.100674629211426,
"learning_rate": 9.861757266761002e-06,
"loss": 0.3503,
"step": 211
},
{
"epoch": 0.103515625,
"grad_norm": 3.1530754566192627,
"learning_rate": 9.859904176028364e-06,
"loss": 0.3635,
"step": 212
},
{
"epoch": 0.10400390625,
"grad_norm": 2.373269557952881,
"learning_rate": 9.858038924307363e-06,
"loss": 0.316,
"step": 213
},
{
"epoch": 0.1044921875,
"grad_norm": 2.517578125,
"learning_rate": 9.856161516265445e-06,
"loss": 0.3729,
"step": 214
},
{
"epoch": 0.10498046875,
"grad_norm": 3.9366421699523926,
"learning_rate": 9.854271956600463e-06,
"loss": 0.3119,
"step": 215
},
{
"epoch": 0.10546875,
"grad_norm": 3.0418357849121094,
"learning_rate": 9.852370250040682e-06,
"loss": 0.3799,
"step": 216
},
{
"epoch": 0.10595703125,
"grad_norm": 2.486046314239502,
"learning_rate": 9.85045640134476e-06,
"loss": 0.3761,
"step": 217
},
{
"epoch": 0.1064453125,
"grad_norm": 3.757772207260132,
"learning_rate": 9.848530415301748e-06,
"loss": 0.3281,
"step": 218
},
{
"epoch": 0.10693359375,
"grad_norm": 5.470198631286621,
"learning_rate": 9.846592296731052e-06,
"loss": 0.3626,
"step": 219
},
{
"epoch": 0.107421875,
"grad_norm": 2.6514899730682373,
"learning_rate": 9.84464205048245e-06,
"loss": 0.3312,
"step": 220
},
{
"epoch": 0.10791015625,
"grad_norm": 2.359720230102539,
"learning_rate": 9.842679681436062e-06,
"loss": 0.3332,
"step": 221
},
{
"epoch": 0.1083984375,
"grad_norm": 2.7306034564971924,
"learning_rate": 9.840705194502349e-06,
"loss": 0.3623,
"step": 222
},
{
"epoch": 0.10888671875,
"grad_norm": 2.2408559322357178,
"learning_rate": 9.838718594622083e-06,
"loss": 0.3579,
"step": 223
},
{
"epoch": 0.109375,
"grad_norm": 1.9728875160217285,
"learning_rate": 9.836719886766357e-06,
"loss": 0.3411,
"step": 224
},
{
"epoch": 0.10986328125,
"grad_norm": 2.826547861099243,
"learning_rate": 9.83470907593656e-06,
"loss": 0.2803,
"step": 225
},
{
"epoch": 0.1103515625,
"grad_norm": 2.5550942420959473,
"learning_rate": 9.832686167164361e-06,
"loss": 0.3537,
"step": 226
},
{
"epoch": 0.11083984375,
"grad_norm": 2.6079165935516357,
"learning_rate": 9.830651165511707e-06,
"loss": 0.3527,
"step": 227
},
{
"epoch": 0.111328125,
"grad_norm": 2.2585561275482178,
"learning_rate": 9.828604076070805e-06,
"loss": 0.3741,
"step": 228
},
{
"epoch": 0.11181640625,
"grad_norm": 2.335930585861206,
"learning_rate": 9.826544903964105e-06,
"loss": 0.34,
"step": 229
},
{
"epoch": 0.1123046875,
"grad_norm": 2.3235063552856445,
"learning_rate": 9.824473654344297e-06,
"loss": 0.3691,
"step": 230
},
{
"epoch": 0.11279296875,
"grad_norm": 3.584376811981201,
"learning_rate": 9.82239033239429e-06,
"loss": 0.3548,
"step": 231
},
{
"epoch": 0.11328125,
"grad_norm": 3.483834743499756,
"learning_rate": 9.820294943327202e-06,
"loss": 0.3905,
"step": 232
},
{
"epoch": 0.11376953125,
"grad_norm": 2.4160964488983154,
"learning_rate": 9.818187492386346e-06,
"loss": 0.3723,
"step": 233
},
{
"epoch": 0.1142578125,
"grad_norm": 2.206505298614502,
"learning_rate": 9.816067984845218e-06,
"loss": 0.3572,
"step": 234
},
{
"epoch": 0.11474609375,
"grad_norm": 2.8877620697021484,
"learning_rate": 9.813936426007487e-06,
"loss": 0.3486,
"step": 235
},
{
"epoch": 0.115234375,
"grad_norm": 2.2150516510009766,
"learning_rate": 9.81179282120697e-06,
"loss": 0.3431,
"step": 236
},
{
"epoch": 0.11572265625,
"grad_norm": 4.500147819519043,
"learning_rate": 9.809637175807634e-06,
"loss": 0.3465,
"step": 237
},
{
"epoch": 0.1162109375,
"grad_norm": 2.428119659423828,
"learning_rate": 9.80746949520357e-06,
"loss": 0.3193,
"step": 238
},
{
"epoch": 0.11669921875,
"grad_norm": 4.387357711791992,
"learning_rate": 9.805289784818991e-06,
"loss": 0.3789,
"step": 239
},
{
"epoch": 0.1171875,
"grad_norm": 2.6022865772247314,
"learning_rate": 9.803098050108206e-06,
"loss": 0.3744,
"step": 240
},
{
"epoch": 0.11767578125,
"grad_norm": 2.3189945220947266,
"learning_rate": 9.800894296555618e-06,
"loss": 0.3542,
"step": 241
},
{
"epoch": 0.1181640625,
"grad_norm": 2.428673505783081,
"learning_rate": 9.798678529675702e-06,
"loss": 0.354,
"step": 242
},
{
"epoch": 0.11865234375,
"grad_norm": 2.112927198410034,
"learning_rate": 9.796450755012992e-06,
"loss": 0.3541,
"step": 243
},
{
"epoch": 0.119140625,
"grad_norm": 3.9023051261901855,
"learning_rate": 9.794210978142073e-06,
"loss": 0.3902,
"step": 244
},
{
"epoch": 0.11962890625,
"grad_norm": 2.621843099594116,
"learning_rate": 9.79195920466756e-06,
"loss": 0.35,
"step": 245
},
{
"epoch": 0.1201171875,
"grad_norm": 2.8156723976135254,
"learning_rate": 9.789695440224094e-06,
"loss": 0.3562,
"step": 246
},
{
"epoch": 0.12060546875,
"grad_norm": 4.237185001373291,
"learning_rate": 9.78741969047631e-06,
"loss": 0.3596,
"step": 247
},
{
"epoch": 0.12109375,
"grad_norm": 2.050010919570923,
"learning_rate": 9.785131961118843e-06,
"loss": 0.3562,
"step": 248
},
{
"epoch": 0.12158203125,
"grad_norm": 2.1943752765655518,
"learning_rate": 9.782832257876302e-06,
"loss": 0.3147,
"step": 249
},
{
"epoch": 0.1220703125,
"grad_norm": 3.3409993648529053,
"learning_rate": 9.780520586503258e-06,
"loss": 0.4023,
"step": 250
},
{
"epoch": 0.12255859375,
"grad_norm": 2.073791027069092,
"learning_rate": 9.77819695278423e-06,
"loss": 0.3323,
"step": 251
},
{
"epoch": 0.123046875,
"grad_norm": 2.773463010787964,
"learning_rate": 9.77586136253367e-06,
"loss": 0.3461,
"step": 252
},
{
"epoch": 0.12353515625,
"grad_norm": 2.2921154499053955,
"learning_rate": 9.773513821595951e-06,
"loss": 0.3344,
"step": 253
},
{
"epoch": 0.1240234375,
"grad_norm": 2.6613571643829346,
"learning_rate": 9.771154335845345e-06,
"loss": 0.348,
"step": 254
},
{
"epoch": 0.12451171875,
"grad_norm": 8.336869239807129,
"learning_rate": 9.768782911186023e-06,
"loss": 0.3726,
"step": 255
},
{
"epoch": 0.125,
"grad_norm": 2.428882360458374,
"learning_rate": 9.766399553552022e-06,
"loss": 0.3765,
"step": 256
},
{
"epoch": 0.12548828125,
"grad_norm": 1.8940154314041138,
"learning_rate": 9.764004268907244e-06,
"loss": 0.3407,
"step": 257
},
{
"epoch": 0.1259765625,
"grad_norm": 2.5715792179107666,
"learning_rate": 9.761597063245434e-06,
"loss": 0.3679,
"step": 258
},
{
"epoch": 0.12646484375,
"grad_norm": 2.1206367015838623,
"learning_rate": 9.759177942590166e-06,
"loss": 0.3409,
"step": 259
},
{
"epoch": 0.126953125,
"grad_norm": 2.5495412349700928,
"learning_rate": 9.756746912994832e-06,
"loss": 0.3499,
"step": 260
},
{
"epoch": 0.12744140625,
"grad_norm": 2.9602348804473877,
"learning_rate": 9.754303980542623e-06,
"loss": 0.3706,
"step": 261
},
{
"epoch": 0.1279296875,
"grad_norm": 2.7507028579711914,
"learning_rate": 9.751849151346513e-06,
"loss": 0.3767,
"step": 262
},
{
"epoch": 0.12841796875,
"grad_norm": 2.539034843444824,
"learning_rate": 9.749382431549247e-06,
"loss": 0.3406,
"step": 263
},
{
"epoch": 0.12890625,
"grad_norm": 2.833279848098755,
"learning_rate": 9.746903827323324e-06,
"loss": 0.3522,
"step": 264
},
{
"epoch": 0.12939453125,
"grad_norm": 2.5430469512939453,
"learning_rate": 9.74441334487098e-06,
"loss": 0.3406,
"step": 265
},
{
"epoch": 0.1298828125,
"grad_norm": 2.858895778656006,
"learning_rate": 9.741910990424173e-06,
"loss": 0.3396,
"step": 266
},
{
"epoch": 0.13037109375,
"grad_norm": 3.113898515701294,
"learning_rate": 9.739396770244575e-06,
"loss": 0.3779,
"step": 267
},
{
"epoch": 0.130859375,
"grad_norm": 2.812479257583618,
"learning_rate": 9.736870690623541e-06,
"loss": 0.3581,
"step": 268
},
{
"epoch": 0.13134765625,
"grad_norm": 4.137664318084717,
"learning_rate": 9.734332757882108e-06,
"loss": 0.3731,
"step": 269
},
{
"epoch": 0.1318359375,
"grad_norm": 2.346695899963379,
"learning_rate": 9.73178297837097e-06,
"loss": 0.3499,
"step": 270
},
{
"epoch": 0.13232421875,
"grad_norm": 3.5724024772644043,
"learning_rate": 9.729221358470468e-06,
"loss": 0.346,
"step": 271
},
{
"epoch": 0.1328125,
"grad_norm": 2.5001883506774902,
"learning_rate": 9.726647904590572e-06,
"loss": 0.3371,
"step": 272
},
{
"epoch": 0.13330078125,
"grad_norm": 1.8020128011703491,
"learning_rate": 9.724062623170855e-06,
"loss": 0.3632,
"step": 273
},
{
"epoch": 0.1337890625,
"grad_norm": 2.486666679382324,
"learning_rate": 9.721465520680501e-06,
"loss": 0.3505,
"step": 274
},
{
"epoch": 0.13427734375,
"grad_norm": 2.269751787185669,
"learning_rate": 9.718856603618263e-06,
"loss": 0.3718,
"step": 275
},
{
"epoch": 0.134765625,
"grad_norm": 2.7286322116851807,
"learning_rate": 9.716235878512462e-06,
"loss": 0.3462,
"step": 276
},
{
"epoch": 0.13525390625,
"grad_norm": 2.535698175430298,
"learning_rate": 9.713603351920964e-06,
"loss": 0.3451,
"step": 277
},
{
"epoch": 0.1357421875,
"grad_norm": 1.9008198976516724,
"learning_rate": 9.710959030431167e-06,
"loss": 0.3924,
"step": 278
},
{
"epoch": 0.13623046875,
"grad_norm": 2.339395046234131,
"learning_rate": 9.708302920659987e-06,
"loss": 0.3331,
"step": 279
},
{
"epoch": 0.13671875,
"grad_norm": 2.376002550125122,
"learning_rate": 9.705635029253833e-06,
"loss": 0.3815,
"step": 280
},
{
"epoch": 0.13720703125,
"grad_norm": 2.245027780532837,
"learning_rate": 9.702955362888595e-06,
"loss": 0.3548,
"step": 281
},
{
"epoch": 0.1376953125,
"grad_norm": 2.206878900527954,
"learning_rate": 9.700263928269636e-06,
"loss": 0.3204,
"step": 282
},
{
"epoch": 0.13818359375,
"grad_norm": 2.0215516090393066,
"learning_rate": 9.697560732131753e-06,
"loss": 0.3387,
"step": 283
},
{
"epoch": 0.138671875,
"grad_norm": 2.9142580032348633,
"learning_rate": 9.694845781239188e-06,
"loss": 0.3336,
"step": 284
},
{
"epoch": 0.13916015625,
"grad_norm": 2.0387048721313477,
"learning_rate": 9.692119082385588e-06,
"loss": 0.3342,
"step": 285
},
{
"epoch": 0.1396484375,
"grad_norm": 2.3236615657806396,
"learning_rate": 9.689380642393998e-06,
"loss": 0.3773,
"step": 286
},
{
"epoch": 0.14013671875,
"grad_norm": 3.4590189456939697,
"learning_rate": 9.686630468116846e-06,
"loss": 0.3358,
"step": 287
},
{
"epoch": 0.140625,
"grad_norm": 1.6319761276245117,
"learning_rate": 9.683868566435922e-06,
"loss": 0.2913,
"step": 288
},
{
"epoch": 0.14111328125,
"grad_norm": 6.874841690063477,
"learning_rate": 9.681094944262361e-06,
"loss": 0.3259,
"step": 289
},
{
"epoch": 0.1416015625,
"grad_norm": 4.962515830993652,
"learning_rate": 9.678309608536626e-06,
"loss": 0.3455,
"step": 290
},
{
"epoch": 0.14208984375,
"grad_norm": 3.334455966949463,
"learning_rate": 9.675512566228493e-06,
"loss": 0.3561,
"step": 291
},
{
"epoch": 0.142578125,
"grad_norm": 3.891530990600586,
"learning_rate": 9.672703824337026e-06,
"loss": 0.3627,
"step": 292
},
{
"epoch": 0.14306640625,
"grad_norm": 2.2160141468048096,
"learning_rate": 9.669883389890572e-06,
"loss": 0.312,
"step": 293
},
{
"epoch": 0.1435546875,
"grad_norm": 3.7108445167541504,
"learning_rate": 9.667051269946734e-06,
"loss": 0.338,
"step": 294
},
{
"epoch": 0.14404296875,
"grad_norm": 2.138221025466919,
"learning_rate": 9.664207471592353e-06,
"loss": 0.3767,
"step": 295
},
{
"epoch": 0.14453125,
"grad_norm": 11.57601547241211,
"learning_rate": 9.661352001943494e-06,
"loss": 0.3481,
"step": 296
},
{
"epoch": 0.14501953125,
"grad_norm": 2.1737406253814697,
"learning_rate": 9.658484868145428e-06,
"loss": 0.3319,
"step": 297
},
{
"epoch": 0.1455078125,
"grad_norm": 4.048387050628662,
"learning_rate": 9.655606077372619e-06,
"loss": 0.3061,
"step": 298
},
{
"epoch": 0.14599609375,
"grad_norm": 2.4968268871307373,
"learning_rate": 9.652715636828687e-06,
"loss": 0.333,
"step": 299
},
{
"epoch": 0.146484375,
"grad_norm": 2.2704763412475586,
"learning_rate": 9.649813553746416e-06,
"loss": 0.3307,
"step": 300
},
{
"epoch": 0.14697265625,
"grad_norm": 1.9303852319717407,
"learning_rate": 9.646899835387718e-06,
"loss": 0.3342,
"step": 301
},
{
"epoch": 0.1474609375,
"grad_norm": 2.8917553424835205,
"learning_rate": 9.64397448904362e-06,
"loss": 0.3595,
"step": 302
},
{
"epoch": 0.14794921875,
"grad_norm": 2.193105697631836,
"learning_rate": 9.641037522034246e-06,
"loss": 0.3675,
"step": 303
},
{
"epoch": 0.1484375,
"grad_norm": 1.9201539754867554,
"learning_rate": 9.638088941708799e-06,
"loss": 0.353,
"step": 304
},
{
"epoch": 0.14892578125,
"grad_norm": 2.513864517211914,
"learning_rate": 9.635128755445542e-06,
"loss": 0.3669,
"step": 305
},
{
"epoch": 0.1494140625,
"grad_norm": 2.397608518600464,
"learning_rate": 9.63215697065178e-06,
"loss": 0.3439,
"step": 306
},
{
"epoch": 0.14990234375,
"grad_norm": 2.335594654083252,
"learning_rate": 9.62917359476384e-06,
"loss": 0.3558,
"step": 307
},
{
"epoch": 0.150390625,
"grad_norm": 2.5134353637695312,
"learning_rate": 9.626178635247054e-06,
"loss": 0.3923,
"step": 308
},
{
"epoch": 0.15087890625,
"grad_norm": 2.9013524055480957,
"learning_rate": 9.623172099595743e-06,
"loss": 0.3748,
"step": 309
},
{
"epoch": 0.1513671875,
"grad_norm": 3.2646868228912354,
"learning_rate": 9.620153995333188e-06,
"loss": 0.3268,
"step": 310
},
{
"epoch": 0.15185546875,
"grad_norm": 2.843632459640503,
"learning_rate": 9.617124330011624e-06,
"loss": 0.3392,
"step": 311
},
{
"epoch": 0.15234375,
"grad_norm": 2.5182275772094727,
"learning_rate": 9.614083111212216e-06,
"loss": 0.3849,
"step": 312
},
{
"epoch": 0.15283203125,
"grad_norm": 2.9543368816375732,
"learning_rate": 9.611030346545035e-06,
"loss": 0.3784,
"step": 313
},
{
"epoch": 0.1533203125,
"grad_norm": 3.7902252674102783,
"learning_rate": 9.607966043649047e-06,
"loss": 0.3466,
"step": 314
},
{
"epoch": 0.15380859375,
"grad_norm": 2.4927687644958496,
"learning_rate": 9.604890210192084e-06,
"loss": 0.3638,
"step": 315
},
{
"epoch": 0.154296875,
"grad_norm": 4.722542762756348,
"learning_rate": 9.601802853870843e-06,
"loss": 0.3439,
"step": 316
},
{
"epoch": 0.15478515625,
"grad_norm": 2.0797646045684814,
"learning_rate": 9.598703982410842e-06,
"loss": 0.373,
"step": 317
},
{
"epoch": 0.1552734375,
"grad_norm": 2.1771399974823,
"learning_rate": 9.595593603566423e-06,
"loss": 0.3112,
"step": 318
},
{
"epoch": 0.15576171875,
"grad_norm": 2.621591091156006,
"learning_rate": 9.592471725120714e-06,
"loss": 0.3384,
"step": 319
},
{
"epoch": 0.15625,
"grad_norm": 4.34113883972168,
"learning_rate": 9.58933835488563e-06,
"loss": 0.3488,
"step": 320
},
{
"epoch": 0.15673828125,
"grad_norm": 3.58477783203125,
"learning_rate": 9.58619350070183e-06,
"loss": 0.3329,
"step": 321
},
{
"epoch": 0.1572265625,
"grad_norm": 2.657738208770752,
"learning_rate": 9.583037170438719e-06,
"loss": 0.3371,
"step": 322
},
{
"epoch": 0.15771484375,
"grad_norm": 2.3004322052001953,
"learning_rate": 9.579869371994412e-06,
"loss": 0.3658,
"step": 323
},
{
"epoch": 0.158203125,
"grad_norm": 3.4922330379486084,
"learning_rate": 9.576690113295726e-06,
"loss": 0.3713,
"step": 324
},
{
"epoch": 0.15869140625,
"grad_norm": 4.173436641693115,
"learning_rate": 9.573499402298152e-06,
"loss": 0.3349,
"step": 325
},
{
"epoch": 0.1591796875,
"grad_norm": 12.521305084228516,
"learning_rate": 9.570297246985838e-06,
"loss": 0.3411,
"step": 326
},
{
"epoch": 0.15966796875,
"grad_norm": 3.122694253921509,
"learning_rate": 9.567083655371572e-06,
"loss": 0.3644,
"step": 327
},
{
"epoch": 0.16015625,
"grad_norm": 1.6851651668548584,
"learning_rate": 9.563858635496755e-06,
"loss": 0.3567,
"step": 328
},
{
"epoch": 0.16064453125,
"grad_norm": 2.407923698425293,
"learning_rate": 9.56062219543139e-06,
"loss": 0.3298,
"step": 329
},
{
"epoch": 0.1611328125,
"grad_norm": 1.9536917209625244,
"learning_rate": 9.557374343274056e-06,
"loss": 0.352,
"step": 330
},
{
"epoch": 0.16162109375,
"grad_norm": 2.042382001876831,
"learning_rate": 9.55411508715188e-06,
"loss": 0.3249,
"step": 331
},
{
"epoch": 0.162109375,
"grad_norm": 1.9811147451400757,
"learning_rate": 9.55084443522054e-06,
"loss": 0.3341,
"step": 332
},
{
"epoch": 0.16259765625,
"grad_norm": 2.6401963233947754,
"learning_rate": 9.547562395664219e-06,
"loss": 0.3296,
"step": 333
},
{
"epoch": 0.1630859375,
"grad_norm": 2.3292157649993896,
"learning_rate": 9.544268976695596e-06,
"loss": 0.3446,
"step": 334
},
{
"epoch": 0.16357421875,
"grad_norm": 3.5120034217834473,
"learning_rate": 9.54096418655583e-06,
"loss": 0.3796,
"step": 335
},
{
"epoch": 0.1640625,
"grad_norm": 2.3993301391601562,
"learning_rate": 9.53764803351453e-06,
"loss": 0.3544,
"step": 336
},
{
"epoch": 0.16455078125,
"grad_norm": 2.403285264968872,
"learning_rate": 9.534320525869742e-06,
"loss": 0.3734,
"step": 337
},
{
"epoch": 0.1650390625,
"grad_norm": 1.878564476966858,
"learning_rate": 9.530981671947924e-06,
"loss": 0.3334,
"step": 338
},
{
"epoch": 0.16552734375,
"grad_norm": 3.3280200958251953,
"learning_rate": 9.527631480103919e-06,
"loss": 0.3282,
"step": 339
},
{
"epoch": 0.166015625,
"grad_norm": 2.304945230484009,
"learning_rate": 9.524269958720951e-06,
"loss": 0.3422,
"step": 340
},
{
"epoch": 0.16650390625,
"grad_norm": 2.0590991973876953,
"learning_rate": 9.520897116210588e-06,
"loss": 0.355,
"step": 341
},
{
"epoch": 0.1669921875,
"grad_norm": 1.660049557685852,
"learning_rate": 9.517512961012729e-06,
"loss": 0.3499,
"step": 342
},
{
"epoch": 0.16748046875,
"grad_norm": 1.8652247190475464,
"learning_rate": 9.514117501595582e-06,
"loss": 0.3594,
"step": 343
},
{
"epoch": 0.16796875,
"grad_norm": 1.7373839616775513,
"learning_rate": 9.510710746455636e-06,
"loss": 0.3447,
"step": 344
},
{
"epoch": 0.16845703125,
"grad_norm": 2.8204782009124756,
"learning_rate": 9.507292704117655e-06,
"loss": 0.362,
"step": 345
},
{
"epoch": 0.1689453125,
"grad_norm": 1.6446189880371094,
"learning_rate": 9.503863383134636e-06,
"loss": 0.3752,
"step": 346
},
{
"epoch": 0.16943359375,
"grad_norm": 3.4714109897613525,
"learning_rate": 9.500422792087809e-06,
"loss": 0.3358,
"step": 347
},
{
"epoch": 0.169921875,
"grad_norm": 2.125108003616333,
"learning_rate": 9.496970939586598e-06,
"loss": 0.3822,
"step": 348
},
{
"epoch": 0.17041015625,
"grad_norm": 2.7372467517852783,
"learning_rate": 9.493507834268609e-06,
"loss": 0.3513,
"step": 349
},
{
"epoch": 0.1708984375,
"grad_norm": 2.562140941619873,
"learning_rate": 9.490033484799608e-06,
"loss": 0.3727,
"step": 350
},
{
"epoch": 0.17138671875,
"grad_norm": 2.868966817855835,
"learning_rate": 9.486547899873495e-06,
"loss": 0.3309,
"step": 351
},
{
"epoch": 0.171875,
"grad_norm": 2.5418648719787598,
"learning_rate": 9.483051088212283e-06,
"loss": 0.3826,
"step": 352
},
{
"epoch": 0.17236328125,
"grad_norm": 1.7842854261398315,
"learning_rate": 9.479543058566081e-06,
"loss": 0.3404,
"step": 353
},
{
"epoch": 0.1728515625,
"grad_norm": 1.8991374969482422,
"learning_rate": 9.47602381971307e-06,
"loss": 0.3946,
"step": 354
},
{
"epoch": 0.17333984375,
"grad_norm": 1.9261831045150757,
"learning_rate": 9.472493380459474e-06,
"loss": 0.3579,
"step": 355
},
{
"epoch": 0.173828125,
"grad_norm": 1.6657100915908813,
"learning_rate": 9.468951749639552e-06,
"loss": 0.3405,
"step": 356
},
{
"epoch": 0.17431640625,
"grad_norm": 2.1538491249084473,
"learning_rate": 9.465398936115557e-06,
"loss": 0.3657,
"step": 357
},
{
"epoch": 0.1748046875,
"grad_norm": 1.8424322605133057,
"learning_rate": 9.461834948777738e-06,
"loss": 0.3685,
"step": 358
},
{
"epoch": 0.17529296875,
"grad_norm": 3.16018009185791,
"learning_rate": 9.458259796544293e-06,
"loss": 0.3225,
"step": 359
},
{
"epoch": 0.17578125,
"grad_norm": 1.7529760599136353,
"learning_rate": 9.454673488361363e-06,
"loss": 0.3428,
"step": 360
},
{
"epoch": 0.17626953125,
"grad_norm": 1.6713848114013672,
"learning_rate": 9.451076033203003e-06,
"loss": 0.3383,
"step": 361
},
{
"epoch": 0.1767578125,
"grad_norm": 2.688614845275879,
"learning_rate": 9.447467440071165e-06,
"loss": 0.3553,
"step": 362
},
{
"epoch": 0.17724609375,
"grad_norm": 2.0093319416046143,
"learning_rate": 9.443847717995666e-06,
"loss": 0.3689,
"step": 363
},
{
"epoch": 0.177734375,
"grad_norm": 5.026141166687012,
"learning_rate": 9.440216876034177e-06,
"loss": 0.3072,
"step": 364
},
{
"epoch": 0.17822265625,
"grad_norm": 2.687075138092041,
"learning_rate": 9.436574923272188e-06,
"loss": 0.3624,
"step": 365
},
{
"epoch": 0.1787109375,
"grad_norm": 1.9798976182937622,
"learning_rate": 9.432921868822997e-06,
"loss": 0.3355,
"step": 366
},
{
"epoch": 0.17919921875,
"grad_norm": 2.060910701751709,
"learning_rate": 9.42925772182768e-06,
"loss": 0.3435,
"step": 367
},
{
"epoch": 0.1796875,
"grad_norm": 1.7003917694091797,
"learning_rate": 9.425582491455068e-06,
"loss": 0.3659,
"step": 368
},
{
"epoch": 0.18017578125,
"grad_norm": 2.026036262512207,
"learning_rate": 9.421896186901729e-06,
"loss": 0.3523,
"step": 369
},
{
"epoch": 0.1806640625,
"grad_norm": 1.9931825399398804,
"learning_rate": 9.418198817391941e-06,
"loss": 0.3654,
"step": 370
},
{
"epoch": 0.18115234375,
"grad_norm": 2.7290432453155518,
"learning_rate": 9.41449039217767e-06,
"loss": 0.3599,
"step": 371
},
{
"epoch": 0.181640625,
"grad_norm": 1.5444127321243286,
"learning_rate": 9.410770920538545e-06,
"loss": 0.2991,
"step": 372
},
{
"epoch": 0.18212890625,
"grad_norm": 2.319566011428833,
"learning_rate": 9.407040411781843e-06,
"loss": 0.3724,
"step": 373
},
{
"epoch": 0.1826171875,
"grad_norm": 1.9856535196304321,
"learning_rate": 9.403298875242448e-06,
"loss": 0.348,
"step": 374
},
{
"epoch": 0.18310546875,
"grad_norm": 1.9270925521850586,
"learning_rate": 9.39954632028285e-06,
"loss": 0.3766,
"step": 375
},
{
"epoch": 0.18359375,
"grad_norm": 2.2769391536712646,
"learning_rate": 9.395782756293104e-06,
"loss": 0.3563,
"step": 376
},
{
"epoch": 0.18408203125,
"grad_norm": 2.2026526927948,
"learning_rate": 9.392008192690816e-06,
"loss": 0.3213,
"step": 377
},
{
"epoch": 0.1845703125,
"grad_norm": 2.3757741451263428,
"learning_rate": 9.388222638921116e-06,
"loss": 0.3595,
"step": 378
},
{
"epoch": 0.18505859375,
"grad_norm": 1.9485424757003784,
"learning_rate": 9.384426104456632e-06,
"loss": 0.3561,
"step": 379
},
{
"epoch": 0.185546875,
"grad_norm": 2.7337324619293213,
"learning_rate": 9.380618598797473e-06,
"loss": 0.38,
"step": 380
},
{
"epoch": 0.18603515625,
"grad_norm": 2.1130242347717285,
"learning_rate": 9.3768001314712e-06,
"loss": 0.3533,
"step": 381
},
{
"epoch": 0.1865234375,
"grad_norm": 1.831874966621399,
"learning_rate": 9.372970712032803e-06,
"loss": 0.332,
"step": 382
},
{
"epoch": 0.18701171875,
"grad_norm": 2.3811991214752197,
"learning_rate": 9.369130350064677e-06,
"loss": 0.3798,
"step": 383
},
{
"epoch": 0.1875,
"grad_norm": 1.8242988586425781,
"learning_rate": 9.3652790551766e-06,
"loss": 0.3634,
"step": 384
},
{
"epoch": 0.18798828125,
"grad_norm": 3.14345645904541,
"learning_rate": 9.361416837005705e-06,
"loss": 0.3513,
"step": 385
},
{
"epoch": 0.1884765625,
"grad_norm": 1.9473716020584106,
"learning_rate": 9.357543705216465e-06,
"loss": 0.3687,
"step": 386
},
{
"epoch": 0.18896484375,
"grad_norm": 1.982612133026123,
"learning_rate": 9.353659669500652e-06,
"loss": 0.3803,
"step": 387
},
{
"epoch": 0.189453125,
"grad_norm": 1.774999976158142,
"learning_rate": 9.349764739577334e-06,
"loss": 0.3331,
"step": 388
},
{
"epoch": 0.18994140625,
"grad_norm": 1.5273141860961914,
"learning_rate": 9.34585892519283e-06,
"loss": 0.3599,
"step": 389
},
{
"epoch": 0.1904296875,
"grad_norm": 1.8035123348236084,
"learning_rate": 9.3419422361207e-06,
"loss": 0.3771,
"step": 390
},
{
"epoch": 0.19091796875,
"grad_norm": 1.789610505104065,
"learning_rate": 9.338014682161719e-06,
"loss": 0.3236,
"step": 391
},
{
"epoch": 0.19140625,
"grad_norm": 1.9845644235610962,
"learning_rate": 9.334076273143843e-06,
"loss": 0.3274,
"step": 392
},
{
"epoch": 0.19189453125,
"grad_norm": 2.072159767150879,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3416,
"step": 393
},
{
"epoch": 0.1923828125,
"grad_norm": 1.8441466093063354,
"learning_rate": 9.326166929379032e-06,
"loss": 0.3352,
"step": 394
},
{
"epoch": 0.19287109375,
"grad_norm": 2.479971170425415,
"learning_rate": 9.322196014423729e-06,
"loss": 0.3472,
"step": 395
},
{
"epoch": 0.193359375,
"grad_norm": 2.514597177505493,
"learning_rate": 9.318214283992747e-06,
"loss": 0.3544,
"step": 396
},
{
"epoch": 0.19384765625,
"grad_norm": 2.048144578933716,
"learning_rate": 9.314221748049613e-06,
"loss": 0.3869,
"step": 397
},
{
"epoch": 0.1943359375,
"grad_norm": 2.8453140258789062,
"learning_rate": 9.310218416584887e-06,
"loss": 0.3734,
"step": 398
},
{
"epoch": 0.19482421875,
"grad_norm": 1.6406381130218506,
"learning_rate": 9.306204299616148e-06,
"loss": 0.3507,
"step": 399
},
{
"epoch": 0.1953125,
"grad_norm": 2.275040626525879,
"learning_rate": 9.302179407187965e-06,
"loss": 0.3787,
"step": 400
},
{
"epoch": 0.19580078125,
"grad_norm": 1.522905945777893,
"learning_rate": 9.298143749371865e-06,
"loss": 0.341,
"step": 401
},
{
"epoch": 0.1962890625,
"grad_norm": 2.3068466186523438,
"learning_rate": 9.294097336266317e-06,
"loss": 0.3686,
"step": 402
},
{
"epoch": 0.19677734375,
"grad_norm": 2.8621833324432373,
"learning_rate": 9.290040177996703e-06,
"loss": 0.3331,
"step": 403
},
{
"epoch": 0.197265625,
"grad_norm": 2.339892864227295,
"learning_rate": 9.285972284715291e-06,
"loss": 0.3889,
"step": 404
},
{
"epoch": 0.19775390625,
"grad_norm": 1.7295536994934082,
"learning_rate": 9.281893666601214e-06,
"loss": 0.3692,
"step": 405
},
{
"epoch": 0.1982421875,
"grad_norm": 4.145984649658203,
"learning_rate": 9.277804333860435e-06,
"loss": 0.3387,
"step": 406
},
{
"epoch": 0.19873046875,
"grad_norm": 1.866166114807129,
"learning_rate": 9.273704296725741e-06,
"loss": 0.3503,
"step": 407
},
{
"epoch": 0.19921875,
"grad_norm": 1.8600391149520874,
"learning_rate": 9.269593565456691e-06,
"loss": 0.347,
"step": 408
},
{
"epoch": 0.19970703125,
"grad_norm": 1.990860104560852,
"learning_rate": 9.265472150339615e-06,
"loss": 0.3642,
"step": 409
},
{
"epoch": 0.2001953125,
"grad_norm": 1.4612618684768677,
"learning_rate": 9.26134006168757e-06,
"loss": 0.3624,
"step": 410
},
{
"epoch": 0.20068359375,
"grad_norm": 1.4518144130706787,
"learning_rate": 9.257197309840322e-06,
"loss": 0.3374,
"step": 411
},
{
"epoch": 0.201171875,
"grad_norm": 1.5550000667572021,
"learning_rate": 9.253043905164327e-06,
"loss": 0.3651,
"step": 412
},
{
"epoch": 0.20166015625,
"grad_norm": 1.9353028535842896,
"learning_rate": 9.248879858052688e-06,
"loss": 0.3111,
"step": 413
},
{
"epoch": 0.2021484375,
"grad_norm": 1.5865511894226074,
"learning_rate": 9.244705178925146e-06,
"loss": 0.3734,
"step": 414
},
{
"epoch": 0.20263671875,
"grad_norm": 1.9505976438522339,
"learning_rate": 9.24051987822804e-06,
"loss": 0.3294,
"step": 415
},
{
"epoch": 0.203125,
"grad_norm": 1.7402981519699097,
"learning_rate": 9.236323966434296e-06,
"loss": 0.3664,
"step": 416
},
{
"epoch": 0.20361328125,
"grad_norm": 2.2276546955108643,
"learning_rate": 9.232117454043383e-06,
"loss": 0.3943,
"step": 417
},
{
"epoch": 0.2041015625,
"grad_norm": 2.5883917808532715,
"learning_rate": 9.227900351581303e-06,
"loss": 0.3759,
"step": 418
},
{
"epoch": 0.20458984375,
"grad_norm": 2.116527795791626,
"learning_rate": 9.223672669600552e-06,
"loss": 0.371,
"step": 419
},
{
"epoch": 0.205078125,
"grad_norm": 1.890336036682129,
"learning_rate": 9.219434418680107e-06,
"loss": 0.3208,
"step": 420
},
{
"epoch": 0.20556640625,
"grad_norm": 2.831151247024536,
"learning_rate": 9.215185609425383e-06,
"loss": 0.3283,
"step": 421
},
{
"epoch": 0.2060546875,
"grad_norm": 1.890857458114624,
"learning_rate": 9.21092625246822e-06,
"loss": 0.3634,
"step": 422
},
{
"epoch": 0.20654296875,
"grad_norm": 1.4543401002883911,
"learning_rate": 9.206656358466851e-06,
"loss": 0.3615,
"step": 423
},
{
"epoch": 0.20703125,
"grad_norm": 1.9577465057373047,
"learning_rate": 9.202375938105876e-06,
"loss": 0.364,
"step": 424
},
{
"epoch": 0.20751953125,
"grad_norm": 1.5794016122817993,
"learning_rate": 9.198085002096237e-06,
"loss": 0.34,
"step": 425
},
{
"epoch": 0.2080078125,
"grad_norm": 1.8114027976989746,
"learning_rate": 9.193783561175184e-06,
"loss": 0.3413,
"step": 426
},
{
"epoch": 0.20849609375,
"grad_norm": 1.5112391710281372,
"learning_rate": 9.189471626106261e-06,
"loss": 0.3558,
"step": 427
},
{
"epoch": 0.208984375,
"grad_norm": 1.5750012397766113,
"learning_rate": 9.185149207679263e-06,
"loss": 0.3211,
"step": 428
},
{
"epoch": 0.20947265625,
"grad_norm": 1.5355925559997559,
"learning_rate": 9.180816316710226e-06,
"loss": 0.316,
"step": 429
},
{
"epoch": 0.2099609375,
"grad_norm": 1.7540535926818848,
"learning_rate": 9.176472964041385e-06,
"loss": 0.3446,
"step": 430
},
{
"epoch": 0.21044921875,
"grad_norm": 1.94683837890625,
"learning_rate": 9.172119160541158e-06,
"loss": 0.3894,
"step": 431
},
{
"epoch": 0.2109375,
"grad_norm": 2.1505014896392822,
"learning_rate": 9.167754917104112e-06,
"loss": 0.3516,
"step": 432
},
{
"epoch": 0.21142578125,
"grad_norm": 3.6382253170013428,
"learning_rate": 9.163380244650938e-06,
"loss": 0.3766,
"step": 433
},
{
"epoch": 0.2119140625,
"grad_norm": 1.4218906164169312,
"learning_rate": 9.158995154128425e-06,
"loss": 0.3377,
"step": 434
},
{
"epoch": 0.21240234375,
"grad_norm": 1.6487233638763428,
"learning_rate": 9.15459965650943e-06,
"loss": 0.3198,
"step": 435
},
{
"epoch": 0.212890625,
"grad_norm": 6.333557605743408,
"learning_rate": 9.15019376279285e-06,
"loss": 0.3336,
"step": 436
},
{
"epoch": 0.21337890625,
"grad_norm": 1.746251106262207,
"learning_rate": 9.1457774840036e-06,
"loss": 0.3434,
"step": 437
},
{
"epoch": 0.2138671875,
"grad_norm": 2.1596200466156006,
"learning_rate": 9.14135083119258e-06,
"loss": 0.3496,
"step": 438
},
{
"epoch": 0.21435546875,
"grad_norm": 1.7951174974441528,
"learning_rate": 9.13691381543665e-06,
"loss": 0.3589,
"step": 439
},
{
"epoch": 0.21484375,
"grad_norm": 1.7067686319351196,
"learning_rate": 9.132466447838598e-06,
"loss": 0.3367,
"step": 440
},
{
"epoch": 0.21533203125,
"grad_norm": 2.095935344696045,
"learning_rate": 9.128008739527119e-06,
"loss": 0.3305,
"step": 441
},
{
"epoch": 0.2158203125,
"grad_norm": 2.011528968811035,
"learning_rate": 9.123540701656782e-06,
"loss": 0.368,
"step": 442
},
{
"epoch": 0.21630859375,
"grad_norm": 1.4319236278533936,
"learning_rate": 9.119062345408005e-06,
"loss": 0.3288,
"step": 443
},
{
"epoch": 0.216796875,
"grad_norm": 1.8978536128997803,
"learning_rate": 9.114573681987024e-06,
"loss": 0.3222,
"step": 444
},
{
"epoch": 0.21728515625,
"grad_norm": 1.8402870893478394,
"learning_rate": 9.11007472262587e-06,
"loss": 0.3286,
"step": 445
},
{
"epoch": 0.2177734375,
"grad_norm": 1.8938474655151367,
"learning_rate": 9.105565478582335e-06,
"loss": 0.3725,
"step": 446
},
{
"epoch": 0.21826171875,
"grad_norm": 1.723388433456421,
"learning_rate": 9.101045961139945e-06,
"loss": 0.3634,
"step": 447
},
{
"epoch": 0.21875,
"grad_norm": 1.8326998949050903,
"learning_rate": 9.096516181607935e-06,
"loss": 0.3276,
"step": 448
},
{
"epoch": 0.21923828125,
"grad_norm": 1.6433813571929932,
"learning_rate": 9.09197615132122e-06,
"loss": 0.3637,
"step": 449
},
{
"epoch": 0.2197265625,
"grad_norm": 1.482116460800171,
"learning_rate": 9.087425881640366e-06,
"loss": 0.3413,
"step": 450
},
{
"epoch": 0.22021484375,
"grad_norm": 5.252507209777832,
"learning_rate": 9.082865383951558e-06,
"loss": 0.35,
"step": 451
},
{
"epoch": 0.220703125,
"grad_norm": 1.4982550144195557,
"learning_rate": 9.078294669666577e-06,
"loss": 0.3354,
"step": 452
},
{
"epoch": 0.22119140625,
"grad_norm": 2.408413887023926,
"learning_rate": 9.073713750222766e-06,
"loss": 0.3376,
"step": 453
},
{
"epoch": 0.2216796875,
"grad_norm": 1.682771921157837,
"learning_rate": 9.069122637083012e-06,
"loss": 0.3131,
"step": 454
},
{
"epoch": 0.22216796875,
"grad_norm": 1.6665334701538086,
"learning_rate": 9.064521341735702e-06,
"loss": 0.3348,
"step": 455
},
{
"epoch": 0.22265625,
"grad_norm": 1.3198261260986328,
"learning_rate": 9.059909875694703e-06,
"loss": 0.3087,
"step": 456
},
{
"epoch": 0.22314453125,
"grad_norm": 2.0489742755889893,
"learning_rate": 9.055288250499339e-06,
"loss": 0.3549,
"step": 457
},
{
"epoch": 0.2236328125,
"grad_norm": 1.4335616827011108,
"learning_rate": 9.050656477714345e-06,
"loss": 0.3859,
"step": 458
},
{
"epoch": 0.22412109375,
"grad_norm": 1.9734736680984497,
"learning_rate": 9.046014568929856e-06,
"loss": 0.358,
"step": 459
},
{
"epoch": 0.224609375,
"grad_norm": 1.8493421077728271,
"learning_rate": 9.04136253576137e-06,
"loss": 0.3306,
"step": 460
},
{
"epoch": 0.22509765625,
"grad_norm": 2.6172261238098145,
"learning_rate": 9.036700389849717e-06,
"loss": 0.3481,
"step": 461
},
{
"epoch": 0.2255859375,
"grad_norm": 1.538042664527893,
"learning_rate": 9.03202814286103e-06,
"loss": 0.3154,
"step": 462
},
{
"epoch": 0.22607421875,
"grad_norm": 2.418534278869629,
"learning_rate": 9.027345806486722e-06,
"loss": 0.3247,
"step": 463
},
{
"epoch": 0.2265625,
"grad_norm": 1.7823346853256226,
"learning_rate": 9.022653392443455e-06,
"loss": 0.338,
"step": 464
},
{
"epoch": 0.22705078125,
"grad_norm": 1.9469126462936401,
"learning_rate": 9.0179509124731e-06,
"loss": 0.3377,
"step": 465
},
{
"epoch": 0.2275390625,
"grad_norm": 1.985723614692688,
"learning_rate": 9.013238378342725e-06,
"loss": 0.3438,
"step": 466
},
{
"epoch": 0.22802734375,
"grad_norm": 1.5227419137954712,
"learning_rate": 9.008515801844552e-06,
"loss": 0.3392,
"step": 467
},
{
"epoch": 0.228515625,
"grad_norm": 2.764451026916504,
"learning_rate": 9.003783194795931e-06,
"loss": 0.3439,
"step": 468
},
{
"epoch": 0.22900390625,
"grad_norm": 1.489700198173523,
"learning_rate": 8.999040569039315e-06,
"loss": 0.3654,
"step": 469
},
{
"epoch": 0.2294921875,
"grad_norm": 2.0311126708984375,
"learning_rate": 8.994287936442226e-06,
"loss": 0.3312,
"step": 470
},
{
"epoch": 0.22998046875,
"grad_norm": 1.7580716609954834,
"learning_rate": 8.989525308897223e-06,
"loss": 0.3573,
"step": 471
},
{
"epoch": 0.23046875,
"grad_norm": 1.7429345846176147,
"learning_rate": 8.98475269832188e-06,
"loss": 0.3757,
"step": 472
},
{
"epoch": 0.23095703125,
"grad_norm": 1.544498085975647,
"learning_rate": 8.97997011665875e-06,
"loss": 0.2787,
"step": 473
},
{
"epoch": 0.2314453125,
"grad_norm": 1.6220890283584595,
"learning_rate": 8.975177575875335e-06,
"loss": 0.3597,
"step": 474
},
{
"epoch": 0.23193359375,
"grad_norm": 1.598620057106018,
"learning_rate": 8.97037508796406e-06,
"loss": 0.3615,
"step": 475
},
{
"epoch": 0.232421875,
"grad_norm": 1.567460298538208,
"learning_rate": 8.96556266494224e-06,
"loss": 0.3613,
"step": 476
},
{
"epoch": 0.23291015625,
"grad_norm": 1.5737589597702026,
"learning_rate": 8.960740318852051e-06,
"loss": 0.3699,
"step": 477
},
{
"epoch": 0.2333984375,
"grad_norm": 1.9563899040222168,
"learning_rate": 8.9559080617605e-06,
"loss": 0.3578,
"step": 478
},
{
"epoch": 0.23388671875,
"grad_norm": 2.225196599960327,
"learning_rate": 8.951065905759392e-06,
"loss": 0.3346,
"step": 479
},
{
"epoch": 0.234375,
"grad_norm": 1.5860683917999268,
"learning_rate": 8.946213862965306e-06,
"loss": 0.3741,
"step": 480
},
{
"epoch": 0.23486328125,
"grad_norm": 1.289207935333252,
"learning_rate": 8.941351945519557e-06,
"loss": 0.3434,
"step": 481
},
{
"epoch": 0.2353515625,
"grad_norm": 1.576648235321045,
"learning_rate": 8.936480165588174e-06,
"loss": 0.3513,
"step": 482
},
{
"epoch": 0.23583984375,
"grad_norm": 1.5328677892684937,
"learning_rate": 8.931598535361855e-06,
"loss": 0.3299,
"step": 483
},
{
"epoch": 0.236328125,
"grad_norm": 1.439266562461853,
"learning_rate": 8.926707067055963e-06,
"loss": 0.3077,
"step": 484
},
{
"epoch": 0.23681640625,
"grad_norm": 1.6571671962738037,
"learning_rate": 8.921805772910463e-06,
"loss": 0.3666,
"step": 485
},
{
"epoch": 0.2373046875,
"grad_norm": 2.0075385570526123,
"learning_rate": 8.916894665189918e-06,
"loss": 0.3695,
"step": 486
},
{
"epoch": 0.23779296875,
"grad_norm": 1.3680145740509033,
"learning_rate": 8.91197375618344e-06,
"loss": 0.3393,
"step": 487
},
{
"epoch": 0.23828125,
"grad_norm": 1.9149501323699951,
"learning_rate": 8.907043058204674e-06,
"loss": 0.3374,
"step": 488
},
{
"epoch": 0.23876953125,
"grad_norm": 1.5481083393096924,
"learning_rate": 8.902102583591755e-06,
"loss": 0.3263,
"step": 489
},
{
"epoch": 0.2392578125,
"grad_norm": 1.8688881397247314,
"learning_rate": 8.89715234470728e-06,
"loss": 0.3207,
"step": 490
},
{
"epoch": 0.23974609375,
"grad_norm": 1.846941351890564,
"learning_rate": 8.892192353938288e-06,
"loss": 0.3677,
"step": 491
},
{
"epoch": 0.240234375,
"grad_norm": 1.4003583192825317,
"learning_rate": 8.887222623696213e-06,
"loss": 0.3281,
"step": 492
},
{
"epoch": 0.24072265625,
"grad_norm": 1.9100502729415894,
"learning_rate": 8.882243166416862e-06,
"loss": 0.3685,
"step": 493
},
{
"epoch": 0.2412109375,
"grad_norm": 1.6730045080184937,
"learning_rate": 8.877253994560381e-06,
"loss": 0.3482,
"step": 494
},
{
"epoch": 0.24169921875,
"grad_norm": 1.4065086841583252,
"learning_rate": 8.87225512061123e-06,
"loss": 0.3404,
"step": 495
},
{
"epoch": 0.2421875,
"grad_norm": 1.5349781513214111,
"learning_rate": 8.867246557078141e-06,
"loss": 0.3279,
"step": 496
},
{
"epoch": 0.24267578125,
"grad_norm": 1.376725196838379,
"learning_rate": 8.862228316494094e-06,
"loss": 0.3384,
"step": 497
},
{
"epoch": 0.2431640625,
"grad_norm": 1.5585695505142212,
"learning_rate": 8.857200411416283e-06,
"loss": 0.3638,
"step": 498
},
{
"epoch": 0.24365234375,
"grad_norm": 3.5493311882019043,
"learning_rate": 8.852162854426087e-06,
"loss": 0.3561,
"step": 499
},
{
"epoch": 0.244140625,
"grad_norm": 2.1406612396240234,
"learning_rate": 8.84711565812904e-06,
"loss": 0.3097,
"step": 500
},
{
"epoch": 0.24462890625,
"grad_norm": 1.5322456359863281,
"learning_rate": 8.842058835154789e-06,
"loss": 0.36,
"step": 501
},
{
"epoch": 0.2451171875,
"grad_norm": 2.3245677947998047,
"learning_rate": 8.836992398157076e-06,
"loss": 0.3479,
"step": 502
},
{
"epoch": 0.24560546875,
"grad_norm": 1.8092581033706665,
"learning_rate": 8.831916359813702e-06,
"loss": 0.3292,
"step": 503
},
{
"epoch": 0.24609375,
"grad_norm": 1.6669384241104126,
"learning_rate": 8.826830732826484e-06,
"loss": 0.357,
"step": 504
},
{
"epoch": 0.24658203125,
"grad_norm": 1.3617286682128906,
"learning_rate": 8.821735529921243e-06,
"loss": 0.3434,
"step": 505
},
{
"epoch": 0.2470703125,
"grad_norm": 5.754039287567139,
"learning_rate": 8.816630763847756e-06,
"loss": 0.3677,
"step": 506
},
{
"epoch": 0.24755859375,
"grad_norm": 1.2652654647827148,
"learning_rate": 8.811516447379734e-06,
"loss": 0.3573,
"step": 507
},
{
"epoch": 0.248046875,
"grad_norm": 1.6732009649276733,
"learning_rate": 8.806392593314781e-06,
"loss": 0.3398,
"step": 508
},
{
"epoch": 0.24853515625,
"grad_norm": 1.280765175819397,
"learning_rate": 8.801259214474371e-06,
"loss": 0.3371,
"step": 509
},
{
"epoch": 0.2490234375,
"grad_norm": 1.2774041891098022,
"learning_rate": 8.796116323703811e-06,
"loss": 0.3466,
"step": 510
},
{
"epoch": 0.24951171875,
"grad_norm": 1.4741958379745483,
"learning_rate": 8.790963933872212e-06,
"loss": 0.3506,
"step": 511
},
{
"epoch": 0.25,
"grad_norm": 1.4504543542861938,
"learning_rate": 8.785802057872447e-06,
"loss": 0.4083,
"step": 512
},
{
"epoch": 0.25048828125,
"grad_norm": 1.4813644886016846,
"learning_rate": 8.780630708621135e-06,
"loss": 0.382,
"step": 513
},
{
"epoch": 0.2509765625,
"grad_norm": 1.6617738008499146,
"learning_rate": 8.775449899058597e-06,
"loss": 0.3387,
"step": 514
},
{
"epoch": 0.25146484375,
"grad_norm": 1.8677629232406616,
"learning_rate": 8.770259642148826e-06,
"loss": 0.3422,
"step": 515
},
{
"epoch": 0.251953125,
"grad_norm": 1.4123599529266357,
"learning_rate": 8.765059950879454e-06,
"loss": 0.3621,
"step": 516
},
{
"epoch": 0.25244140625,
"grad_norm": 1.966430902481079,
"learning_rate": 8.759850838261723e-06,
"loss": 0.3475,
"step": 517
},
{
"epoch": 0.2529296875,
"grad_norm": 1.3296693563461304,
"learning_rate": 8.754632317330448e-06,
"loss": 0.3938,
"step": 518
},
{
"epoch": 0.25341796875,
"grad_norm": 1.4010918140411377,
"learning_rate": 8.749404401143991e-06,
"loss": 0.3474,
"step": 519
},
{
"epoch": 0.25390625,
"grad_norm": 1.5129917860031128,
"learning_rate": 8.744167102784216e-06,
"loss": 0.3783,
"step": 520
},
{
"epoch": 0.25439453125,
"grad_norm": 1.7624212503433228,
"learning_rate": 8.738920435356473e-06,
"loss": 0.3272,
"step": 521
},
{
"epoch": 0.2548828125,
"grad_norm": 1.4559099674224854,
"learning_rate": 8.733664411989548e-06,
"loss": 0.3526,
"step": 522
},
{
"epoch": 0.25537109375,
"grad_norm": 1.8239963054656982,
"learning_rate": 8.728399045835648e-06,
"loss": 0.3385,
"step": 523
},
{
"epoch": 0.255859375,
"grad_norm": 1.4369486570358276,
"learning_rate": 8.723124350070347e-06,
"loss": 0.3193,
"step": 524
},
{
"epoch": 0.25634765625,
"grad_norm": 4.341763496398926,
"learning_rate": 8.717840337892575e-06,
"loss": 0.3256,
"step": 525
},
{
"epoch": 0.2568359375,
"grad_norm": 2.0711512565612793,
"learning_rate": 8.712547022524566e-06,
"loss": 0.3639,
"step": 526
},
{
"epoch": 0.25732421875,
"grad_norm": 1.4793862104415894,
"learning_rate": 8.707244417211844e-06,
"loss": 0.3166,
"step": 527
},
{
"epoch": 0.2578125,
"grad_norm": 1.742661476135254,
"learning_rate": 8.701932535223168e-06,
"loss": 0.3533,
"step": 528
},
{
"epoch": 0.25830078125,
"grad_norm": 1.4166213274002075,
"learning_rate": 8.696611389850516e-06,
"loss": 0.3436,
"step": 529
},
{
"epoch": 0.2587890625,
"grad_norm": 1.362882137298584,
"learning_rate": 8.691280994409044e-06,
"loss": 0.3165,
"step": 530
},
{
"epoch": 0.25927734375,
"grad_norm": 2.5286190509796143,
"learning_rate": 8.685941362237058e-06,
"loss": 0.3438,
"step": 531
},
{
"epoch": 0.259765625,
"grad_norm": 2.232900381088257,
"learning_rate": 8.680592506695972e-06,
"loss": 0.3389,
"step": 532
},
{
"epoch": 0.26025390625,
"grad_norm": 1.2126928567886353,
"learning_rate": 8.675234441170286e-06,
"loss": 0.306,
"step": 533
},
{
"epoch": 0.2607421875,
"grad_norm": 1.480934977531433,
"learning_rate": 8.669867179067538e-06,
"loss": 0.3696,
"step": 534
},
{
"epoch": 0.26123046875,
"grad_norm": 2.439810037612915,
"learning_rate": 8.664490733818289e-06,
"loss": 0.3628,
"step": 535
},
{
"epoch": 0.26171875,
"grad_norm": 1.3664276599884033,
"learning_rate": 8.659105118876068e-06,
"loss": 0.3534,
"step": 536
},
{
"epoch": 0.26220703125,
"grad_norm": 1.8439381122589111,
"learning_rate": 8.65371034771736e-06,
"loss": 0.3539,
"step": 537
},
{
"epoch": 0.2626953125,
"grad_norm": 2.1068308353424072,
"learning_rate": 8.64830643384155e-06,
"loss": 0.4281,
"step": 538
},
{
"epoch": 0.26318359375,
"grad_norm": 1.847388505935669,
"learning_rate": 8.642893390770912e-06,
"loss": 0.3624,
"step": 539
},
{
"epoch": 0.263671875,
"grad_norm": 2.783621311187744,
"learning_rate": 8.63747123205056e-06,
"loss": 0.3501,
"step": 540
},
{
"epoch": 0.26416015625,
"grad_norm": 5.078010559082031,
"learning_rate": 8.632039971248416e-06,
"loss": 0.3423,
"step": 541
},
{
"epoch": 0.2646484375,
"grad_norm": 1.461103916168213,
"learning_rate": 8.626599621955179e-06,
"loss": 0.3505,
"step": 542
},
{
"epoch": 0.26513671875,
"grad_norm": 1.512221336364746,
"learning_rate": 8.621150197784293e-06,
"loss": 0.344,
"step": 543
},
{
"epoch": 0.265625,
"grad_norm": 2.6210267543792725,
"learning_rate": 8.615691712371907e-06,
"loss": 0.3192,
"step": 544
},
{
"epoch": 0.26611328125,
"grad_norm": 1.5492252111434937,
"learning_rate": 8.610224179376847e-06,
"loss": 0.3217,
"step": 545
},
{
"epoch": 0.2666015625,
"grad_norm": 1.4719685316085815,
"learning_rate": 8.604747612480577e-06,
"loss": 0.3251,
"step": 546
},
{
"epoch": 0.26708984375,
"grad_norm": 1.9413729906082153,
"learning_rate": 8.599262025387165e-06,
"loss": 0.3658,
"step": 547
},
{
"epoch": 0.267578125,
"grad_norm": 1.8121291399002075,
"learning_rate": 8.593767431823255e-06,
"loss": 0.3274,
"step": 548
},
{
"epoch": 0.26806640625,
"grad_norm": 1.7863436937332153,
"learning_rate": 8.588263845538021e-06,
"loss": 0.3586,
"step": 549
},
{
"epoch": 0.2685546875,
"grad_norm": 2.253500461578369,
"learning_rate": 8.582751280303148e-06,
"loss": 0.383,
"step": 550
},
{
"epoch": 0.26904296875,
"grad_norm": 1.9108343124389648,
"learning_rate": 8.577229749912782e-06,
"loss": 0.3188,
"step": 551
},
{
"epoch": 0.26953125,
"grad_norm": 1.4474389553070068,
"learning_rate": 8.571699268183506e-06,
"loss": 0.3239,
"step": 552
},
{
"epoch": 0.27001953125,
"grad_norm": 1.6433511972427368,
"learning_rate": 8.566159848954305e-06,
"loss": 0.3565,
"step": 553
},
{
"epoch": 0.2705078125,
"grad_norm": 2.9185471534729004,
"learning_rate": 8.560611506086518e-06,
"loss": 0.3916,
"step": 554
},
{
"epoch": 0.27099609375,
"grad_norm": 1.6128103733062744,
"learning_rate": 8.555054253463828e-06,
"loss": 0.3518,
"step": 555
},
{
"epoch": 0.271484375,
"grad_norm": 1.3888630867004395,
"learning_rate": 8.549488104992201e-06,
"loss": 0.3772,
"step": 556
},
{
"epoch": 0.27197265625,
"grad_norm": 1.7909587621688843,
"learning_rate": 8.543913074599867e-06,
"loss": 0.3313,
"step": 557
},
{
"epoch": 0.2724609375,
"grad_norm": 1.6241544485092163,
"learning_rate": 8.538329176237287e-06,
"loss": 0.3535,
"step": 558
},
{
"epoch": 0.27294921875,
"grad_norm": 1.4434620141983032,
"learning_rate": 8.532736423877102e-06,
"loss": 0.3329,
"step": 559
},
{
"epoch": 0.2734375,
"grad_norm": 1.8953794240951538,
"learning_rate": 8.527134831514116e-06,
"loss": 0.3318,
"step": 560
},
{
"epoch": 0.27392578125,
"grad_norm": 1.287680983543396,
"learning_rate": 8.521524413165254e-06,
"loss": 0.3187,
"step": 561
},
{
"epoch": 0.2744140625,
"grad_norm": 1.6521981954574585,
"learning_rate": 8.51590518286952e-06,
"loss": 0.3509,
"step": 562
},
{
"epoch": 0.27490234375,
"grad_norm": 1.4679384231567383,
"learning_rate": 8.510277154687973e-06,
"loss": 0.3598,
"step": 563
},
{
"epoch": 0.275390625,
"grad_norm": 2.19455885887146,
"learning_rate": 8.504640342703687e-06,
"loss": 0.3371,
"step": 564
},
{
"epoch": 0.27587890625,
"grad_norm": 1.4917466640472412,
"learning_rate": 8.498994761021715e-06,
"loss": 0.3086,
"step": 565
},
{
"epoch": 0.2763671875,
"grad_norm": 2.3828556537628174,
"learning_rate": 8.493340423769054e-06,
"loss": 0.328,
"step": 566
},
{
"epoch": 0.27685546875,
"grad_norm": 2.0100631713867188,
"learning_rate": 8.487677345094606e-06,
"loss": 0.3497,
"step": 567
},
{
"epoch": 0.27734375,
"grad_norm": 2.037872552871704,
"learning_rate": 8.482005539169158e-06,
"loss": 0.3649,
"step": 568
},
{
"epoch": 0.27783203125,
"grad_norm": 1.3535383939743042,
"learning_rate": 8.476325020185326e-06,
"loss": 0.3321,
"step": 569
},
{
"epoch": 0.2783203125,
"grad_norm": 1.4872392416000366,
"learning_rate": 8.47063580235753e-06,
"loss": 0.3775,
"step": 570
},
{
"epoch": 0.27880859375,
"grad_norm": 2.482274293899536,
"learning_rate": 8.46493789992196e-06,
"loss": 0.3518,
"step": 571
},
{
"epoch": 0.279296875,
"grad_norm": 1.4444823265075684,
"learning_rate": 8.459231327136532e-06,
"loss": 0.3503,
"step": 572
},
{
"epoch": 0.27978515625,
"grad_norm": 1.3315978050231934,
"learning_rate": 8.453516098280869e-06,
"loss": 0.3408,
"step": 573
},
{
"epoch": 0.2802734375,
"grad_norm": 2.0306880474090576,
"learning_rate": 8.447792227656241e-06,
"loss": 0.3751,
"step": 574
},
{
"epoch": 0.28076171875,
"grad_norm": 1.3674098253250122,
"learning_rate": 8.442059729585552e-06,
"loss": 0.3307,
"step": 575
},
{
"epoch": 0.28125,
"grad_norm": 2.2325830459594727,
"learning_rate": 8.43631861841329e-06,
"loss": 0.3168,
"step": 576
},
{
"epoch": 0.28173828125,
"grad_norm": 1.956121802330017,
"learning_rate": 8.430568908505497e-06,
"loss": 0.3317,
"step": 577
},
{
"epoch": 0.2822265625,
"grad_norm": 2.0539493560791016,
"learning_rate": 8.42481061424973e-06,
"loss": 0.3172,
"step": 578
},
{
"epoch": 0.28271484375,
"grad_norm": 1.3269410133361816,
"learning_rate": 8.41904375005503e-06,
"loss": 0.3726,
"step": 579
},
{
"epoch": 0.283203125,
"grad_norm": 2.887756586074829,
"learning_rate": 8.413268330351881e-06,
"loss": 0.342,
"step": 580
},
{
"epoch": 0.28369140625,
"grad_norm": 1.640519618988037,
"learning_rate": 8.40748436959217e-06,
"loss": 0.3418,
"step": 581
},
{
"epoch": 0.2841796875,
"grad_norm": 2.179222583770752,
"learning_rate": 8.40169188224917e-06,
"loss": 0.368,
"step": 582
},
{
"epoch": 0.28466796875,
"grad_norm": 2.25158429145813,
"learning_rate": 8.395890882817478e-06,
"loss": 0.3555,
"step": 583
},
{
"epoch": 0.28515625,
"grad_norm": 1.5757050514221191,
"learning_rate": 8.390081385812993e-06,
"loss": 0.3453,
"step": 584
},
{
"epoch": 0.28564453125,
"grad_norm": 1.5802643299102783,
"learning_rate": 8.38426340577288e-06,
"loss": 0.3635,
"step": 585
},
{
"epoch": 0.2861328125,
"grad_norm": 1.5654072761535645,
"learning_rate": 8.378436957255535e-06,
"loss": 0.3304,
"step": 586
},
{
"epoch": 0.28662109375,
"grad_norm": 1.2622393369674683,
"learning_rate": 8.372602054840532e-06,
"loss": 0.3468,
"step": 587
},
{
"epoch": 0.287109375,
"grad_norm": 2.9419167041778564,
"learning_rate": 8.366758713128617e-06,
"loss": 0.3286,
"step": 588
},
{
"epoch": 0.28759765625,
"grad_norm": 1.6033565998077393,
"learning_rate": 8.360906946741635e-06,
"loss": 0.3375,
"step": 589
},
{
"epoch": 0.2880859375,
"grad_norm": 1.5381578207015991,
"learning_rate": 8.355046770322528e-06,
"loss": 0.3531,
"step": 590
},
{
"epoch": 0.28857421875,
"grad_norm": 1.7467304468154907,
"learning_rate": 8.349178198535273e-06,
"loss": 0.305,
"step": 591
},
{
"epoch": 0.2890625,
"grad_norm": 1.3759098052978516,
"learning_rate": 8.343301246064858e-06,
"loss": 0.3643,
"step": 592
},
{
"epoch": 0.28955078125,
"grad_norm": 1.3180525302886963,
"learning_rate": 8.337415927617243e-06,
"loss": 0.3468,
"step": 593
},
{
"epoch": 0.2900390625,
"grad_norm": 1.3249021768569946,
"learning_rate": 8.33152225791932e-06,
"loss": 0.3502,
"step": 594
},
{
"epoch": 0.29052734375,
"grad_norm": 1.9022133350372314,
"learning_rate": 8.32562025171888e-06,
"loss": 0.3842,
"step": 595
},
{
"epoch": 0.291015625,
"grad_norm": 1.4465323686599731,
"learning_rate": 8.319709923784573e-06,
"loss": 0.3247,
"step": 596
},
{
"epoch": 0.29150390625,
"grad_norm": 2.4993956089019775,
"learning_rate": 8.313791288905874e-06,
"loss": 0.3826,
"step": 597
},
{
"epoch": 0.2919921875,
"grad_norm": 1.842347264289856,
"learning_rate": 8.307864361893045e-06,
"loss": 0.329,
"step": 598
},
{
"epoch": 0.29248046875,
"grad_norm": 1.5460954904556274,
"learning_rate": 8.301929157577097e-06,
"loss": 0.3453,
"step": 599
},
{
"epoch": 0.29296875,
"grad_norm": 3.255307912826538,
"learning_rate": 8.295985690809752e-06,
"loss": 0.3358,
"step": 600
},
{
"epoch": 0.29345703125,
"grad_norm": 1.4224542379379272,
"learning_rate": 8.290033976463407e-06,
"loss": 0.3683,
"step": 601
},
{
"epoch": 0.2939453125,
"grad_norm": 1.4209293127059937,
"learning_rate": 8.2840740294311e-06,
"loss": 0.315,
"step": 602
},
{
"epoch": 0.29443359375,
"grad_norm": 2.0559093952178955,
"learning_rate": 8.278105864626467e-06,
"loss": 0.3801,
"step": 603
},
{
"epoch": 0.294921875,
"grad_norm": 1.880486249923706,
"learning_rate": 8.27212949698371e-06,
"loss": 0.3713,
"step": 604
},
{
"epoch": 0.29541015625,
"grad_norm": 3.0988686084747314,
"learning_rate": 8.266144941457552e-06,
"loss": 0.3917,
"step": 605
},
{
"epoch": 0.2958984375,
"grad_norm": 1.6043518781661987,
"learning_rate": 8.26015221302321e-06,
"loss": 0.3678,
"step": 606
},
{
"epoch": 0.29638671875,
"grad_norm": 1.520564079284668,
"learning_rate": 8.254151326676354e-06,
"loss": 0.3259,
"step": 607
},
{
"epoch": 0.296875,
"grad_norm": 1.9146232604980469,
"learning_rate": 8.248142297433058e-06,
"loss": 0.3291,
"step": 608
},
{
"epoch": 0.29736328125,
"grad_norm": 2.2928895950317383,
"learning_rate": 8.24212514032978e-06,
"loss": 0.3828,
"step": 609
},
{
"epoch": 0.2978515625,
"grad_norm": 1.9419975280761719,
"learning_rate": 8.236099870423314e-06,
"loss": 0.3287,
"step": 610
},
{
"epoch": 0.29833984375,
"grad_norm": 1.7183066606521606,
"learning_rate": 8.230066502790756e-06,
"loss": 0.3121,
"step": 611
},
{
"epoch": 0.298828125,
"grad_norm": 1.5658105611801147,
"learning_rate": 8.224025052529463e-06,
"loss": 0.3501,
"step": 612
},
{
"epoch": 0.29931640625,
"grad_norm": 1.9759196043014526,
"learning_rate": 8.21797553475702e-06,
"loss": 0.3345,
"step": 613
},
{
"epoch": 0.2998046875,
"grad_norm": 2.0763461589813232,
"learning_rate": 8.211917964611197e-06,
"loss": 0.3187,
"step": 614
},
{
"epoch": 0.30029296875,
"grad_norm": 1.4480257034301758,
"learning_rate": 8.205852357249912e-06,
"loss": 0.2866,
"step": 615
},
{
"epoch": 0.30078125,
"grad_norm": 1.9418996572494507,
"learning_rate": 8.1997787278512e-06,
"loss": 0.3125,
"step": 616
},
{
"epoch": 0.30126953125,
"grad_norm": 1.726302146911621,
"learning_rate": 8.193697091613163e-06,
"loss": 0.3663,
"step": 617
},
{
"epoch": 0.3017578125,
"grad_norm": 1.622819423675537,
"learning_rate": 8.187607463753946e-06,
"loss": 0.3385,
"step": 618
},
{
"epoch": 0.30224609375,
"grad_norm": 2.375453472137451,
"learning_rate": 8.181509859511686e-06,
"loss": 0.3314,
"step": 619
},
{
"epoch": 0.302734375,
"grad_norm": 1.6941611766815186,
"learning_rate": 8.175404294144482e-06,
"loss": 0.3152,
"step": 620
},
{
"epoch": 0.30322265625,
"grad_norm": 1.6905850172042847,
"learning_rate": 8.16929078293035e-06,
"loss": 0.3352,
"step": 621
},
{
"epoch": 0.3037109375,
"grad_norm": 1.9776393175125122,
"learning_rate": 8.163169341167196e-06,
"loss": 0.39,
"step": 622
},
{
"epoch": 0.30419921875,
"grad_norm": 1.4409841299057007,
"learning_rate": 8.157039984172764e-06,
"loss": 0.3445,
"step": 623
},
{
"epoch": 0.3046875,
"grad_norm": 1.7097798585891724,
"learning_rate": 8.150902727284609e-06,
"loss": 0.3583,
"step": 624
},
{
"epoch": 0.30517578125,
"grad_norm": 1.5705921649932861,
"learning_rate": 8.144757585860053e-06,
"loss": 0.355,
"step": 625
},
{
"epoch": 0.3056640625,
"grad_norm": 1.5804706811904907,
"learning_rate": 8.138604575276143e-06,
"loss": 0.3615,
"step": 626
},
{
"epoch": 0.30615234375,
"grad_norm": 1.7296881675720215,
"learning_rate": 8.132443710929624e-06,
"loss": 0.381,
"step": 627
},
{
"epoch": 0.306640625,
"grad_norm": 1.3139718770980835,
"learning_rate": 8.126275008236891e-06,
"loss": 0.3296,
"step": 628
},
{
"epoch": 0.30712890625,
"grad_norm": 1.339277744293213,
"learning_rate": 8.12009848263395e-06,
"loss": 0.3262,
"step": 629
},
{
"epoch": 0.3076171875,
"grad_norm": 5.439074516296387,
"learning_rate": 8.113914149576388e-06,
"loss": 0.361,
"step": 630
},
{
"epoch": 0.30810546875,
"grad_norm": 1.8875752687454224,
"learning_rate": 8.107722024539321e-06,
"loss": 0.3419,
"step": 631
},
{
"epoch": 0.30859375,
"grad_norm": 1.3780957460403442,
"learning_rate": 8.10152212301737e-06,
"loss": 0.3398,
"step": 632
},
{
"epoch": 0.30908203125,
"grad_norm": 2.1425485610961914,
"learning_rate": 8.095314460524612e-06,
"loss": 0.3473,
"step": 633
},
{
"epoch": 0.3095703125,
"grad_norm": 2.3225300312042236,
"learning_rate": 8.089099052594545e-06,
"loss": 0.3757,
"step": 634
},
{
"epoch": 0.31005859375,
"grad_norm": 1.4518051147460938,
"learning_rate": 8.08287591478005e-06,
"loss": 0.3112,
"step": 635
},
{
"epoch": 0.310546875,
"grad_norm": 2.2762012481689453,
"learning_rate": 8.076645062653346e-06,
"loss": 0.3642,
"step": 636
},
{
"epoch": 0.31103515625,
"grad_norm": 1.6947425603866577,
"learning_rate": 8.070406511805961e-06,
"loss": 0.35,
"step": 637
},
{
"epoch": 0.3115234375,
"grad_norm": 1.5694466829299927,
"learning_rate": 8.064160277848683e-06,
"loss": 0.3458,
"step": 638
},
{
"epoch": 0.31201171875,
"grad_norm": 1.9441496133804321,
"learning_rate": 8.05790637641153e-06,
"loss": 0.3698,
"step": 639
},
{
"epoch": 0.3125,
"grad_norm": 1.6394853591918945,
"learning_rate": 8.051644823143702e-06,
"loss": 0.3515,
"step": 640
},
{
"epoch": 0.31298828125,
"grad_norm": 1.8157254457473755,
"learning_rate": 8.04537563371355e-06,
"loss": 0.3278,
"step": 641
},
{
"epoch": 0.3134765625,
"grad_norm": 1.6162160634994507,
"learning_rate": 8.03909882380853e-06,
"loss": 0.3586,
"step": 642
},
{
"epoch": 0.31396484375,
"grad_norm": 1.7346367835998535,
"learning_rate": 8.03281440913517e-06,
"loss": 0.3194,
"step": 643
},
{
"epoch": 0.314453125,
"grad_norm": 1.593997836112976,
"learning_rate": 8.026522405419024e-06,
"loss": 0.3205,
"step": 644
},
{
"epoch": 0.31494140625,
"grad_norm": 1.3535056114196777,
"learning_rate": 8.020222828404638e-06,
"loss": 0.3382,
"step": 645
},
{
"epoch": 0.3154296875,
"grad_norm": 2.354459524154663,
"learning_rate": 8.01391569385551e-06,
"loss": 0.3041,
"step": 646
},
{
"epoch": 0.31591796875,
"grad_norm": 1.6168910264968872,
"learning_rate": 8.007601017554045e-06,
"loss": 0.392,
"step": 647
},
{
"epoch": 0.31640625,
"grad_norm": 1.7411466836929321,
"learning_rate": 8.001278815301525e-06,
"loss": 0.319,
"step": 648
},
{
"epoch": 0.31689453125,
"grad_norm": 2.3402931690216064,
"learning_rate": 7.994949102918062e-06,
"loss": 0.3657,
"step": 649
},
{
"epoch": 0.3173828125,
"grad_norm": 1.2933272123336792,
"learning_rate": 7.98861189624256e-06,
"loss": 0.3049,
"step": 650
},
{
"epoch": 0.31787109375,
"grad_norm": 1.6581286191940308,
"learning_rate": 7.982267211132675e-06,
"loss": 0.354,
"step": 651
},
{
"epoch": 0.318359375,
"grad_norm": 2.0283968448638916,
"learning_rate": 7.97591506346478e-06,
"loss": 0.3521,
"step": 652
},
{
"epoch": 0.31884765625,
"grad_norm": 1.6676313877105713,
"learning_rate": 7.96955546913392e-06,
"loss": 0.3237,
"step": 653
},
{
"epoch": 0.3193359375,
"grad_norm": 1.548922061920166,
"learning_rate": 7.963188444053772e-06,
"loss": 0.3145,
"step": 654
},
{
"epoch": 0.31982421875,
"grad_norm": 2.61688232421875,
"learning_rate": 7.95681400415661e-06,
"loss": 0.3159,
"step": 655
},
{
"epoch": 0.3203125,
"grad_norm": 2.0864787101745605,
"learning_rate": 7.95043216539326e-06,
"loss": 0.3394,
"step": 656
},
{
"epoch": 0.32080078125,
"grad_norm": 1.82245934009552,
"learning_rate": 7.944042943733061e-06,
"loss": 0.355,
"step": 657
},
{
"epoch": 0.3212890625,
"grad_norm": 1.6342824697494507,
"learning_rate": 7.937646355163833e-06,
"loss": 0.3407,
"step": 658
},
{
"epoch": 0.32177734375,
"grad_norm": 1.7688589096069336,
"learning_rate": 7.931242415691822e-06,
"loss": 0.3936,
"step": 659
},
{
"epoch": 0.322265625,
"grad_norm": 1.5749949216842651,
"learning_rate": 7.924831141341671e-06,
"loss": 0.3226,
"step": 660
},
{
"epoch": 0.32275390625,
"grad_norm": 4.079642295837402,
"learning_rate": 7.918412548156382e-06,
"loss": 0.3478,
"step": 661
},
{
"epoch": 0.3232421875,
"grad_norm": 1.564584732055664,
"learning_rate": 7.911986652197263e-06,
"loss": 0.345,
"step": 662
},
{
"epoch": 0.32373046875,
"grad_norm": 1.9359629154205322,
"learning_rate": 7.905553469543903e-06,
"loss": 0.3478,
"step": 663
},
{
"epoch": 0.32421875,
"grad_norm": 1.3265938758850098,
"learning_rate": 7.899113016294118e-06,
"loss": 0.3789,
"step": 664
},
{
"epoch": 0.32470703125,
"grad_norm": 1.617301106452942,
"learning_rate": 7.892665308563922e-06,
"loss": 0.3182,
"step": 665
},
{
"epoch": 0.3251953125,
"grad_norm": 2.50874924659729,
"learning_rate": 7.88621036248748e-06,
"loss": 0.3269,
"step": 666
},
{
"epoch": 0.32568359375,
"grad_norm": 2.0309231281280518,
"learning_rate": 7.879748194217074e-06,
"loss": 0.3294,
"step": 667
},
{
"epoch": 0.326171875,
"grad_norm": 1.6182068586349487,
"learning_rate": 7.873278819923047e-06,
"loss": 0.3269,
"step": 668
},
{
"epoch": 0.32666015625,
"grad_norm": 2.3924951553344727,
"learning_rate": 7.866802255793788e-06,
"loss": 0.3498,
"step": 669
},
{
"epoch": 0.3271484375,
"grad_norm": 2.816044330596924,
"learning_rate": 7.860318518035668e-06,
"loss": 0.3231,
"step": 670
},
{
"epoch": 0.32763671875,
"grad_norm": 1.9277939796447754,
"learning_rate": 7.853827622873011e-06,
"loss": 0.3236,
"step": 671
},
{
"epoch": 0.328125,
"grad_norm": 1.364225149154663,
"learning_rate": 7.847329586548049e-06,
"loss": 0.3807,
"step": 672
},
{
"epoch": 0.32861328125,
"grad_norm": 1.443907380104065,
"learning_rate": 7.840824425320888e-06,
"loss": 0.4092,
"step": 673
},
{
"epoch": 0.3291015625,
"grad_norm": 1.670778512954712,
"learning_rate": 7.834312155469457e-06,
"loss": 0.3653,
"step": 674
},
{
"epoch": 0.32958984375,
"grad_norm": 1.510043740272522,
"learning_rate": 7.827792793289477e-06,
"loss": 0.3463,
"step": 675
},
{
"epoch": 0.330078125,
"grad_norm": 2.1872780323028564,
"learning_rate": 7.821266355094419e-06,
"loss": 0.3479,
"step": 676
},
{
"epoch": 0.33056640625,
"grad_norm": 1.6790423393249512,
"learning_rate": 7.814732857215453e-06,
"loss": 0.3476,
"step": 677
},
{
"epoch": 0.3310546875,
"grad_norm": 1.3476860523223877,
"learning_rate": 7.808192316001417e-06,
"loss": 0.3333,
"step": 678
},
{
"epoch": 0.33154296875,
"grad_norm": 1.752164602279663,
"learning_rate": 7.801644747818777e-06,
"loss": 0.3341,
"step": 679
},
{
"epoch": 0.33203125,
"grad_norm": 2.4022326469421387,
"learning_rate": 7.79509016905158e-06,
"loss": 0.357,
"step": 680
},
{
"epoch": 0.33251953125,
"grad_norm": 1.3659697771072388,
"learning_rate": 7.788528596101419e-06,
"loss": 0.3073,
"step": 681
},
{
"epoch": 0.3330078125,
"grad_norm": 1.4519615173339844,
"learning_rate": 7.78196004538738e-06,
"loss": 0.3052,
"step": 682
},
{
"epoch": 0.33349609375,
"grad_norm": 2.08927583694458,
"learning_rate": 7.775384533346018e-06,
"loss": 0.3242,
"step": 683
},
{
"epoch": 0.333984375,
"grad_norm": 1.4538501501083374,
"learning_rate": 7.768802076431304e-06,
"loss": 0.3495,
"step": 684
},
{
"epoch": 0.33447265625,
"grad_norm": 2.239643096923828,
"learning_rate": 7.76221269111459e-06,
"loss": 0.3554,
"step": 685
},
{
"epoch": 0.3349609375,
"grad_norm": 1.8009265661239624,
"learning_rate": 7.755616393884562e-06,
"loss": 0.3652,
"step": 686
},
{
"epoch": 0.33544921875,
"grad_norm": 1.5794439315795898,
"learning_rate": 7.7490132012472e-06,
"loss": 0.3321,
"step": 687
},
{
"epoch": 0.3359375,
"grad_norm": 1.737437129020691,
"learning_rate": 7.742403129725742e-06,
"loss": 0.3138,
"step": 688
},
{
"epoch": 0.33642578125,
"grad_norm": 1.7152299880981445,
"learning_rate": 7.735786195860641e-06,
"loss": 0.3582,
"step": 689
},
{
"epoch": 0.3369140625,
"grad_norm": 1.3847858905792236,
"learning_rate": 7.729162416209518e-06,
"loss": 0.3396,
"step": 690
},
{
"epoch": 0.33740234375,
"grad_norm": 1.6747031211853027,
"learning_rate": 7.722531807347122e-06,
"loss": 0.3474,
"step": 691
},
{
"epoch": 0.337890625,
"grad_norm": 1.3016866445541382,
"learning_rate": 7.715894385865299e-06,
"loss": 0.3391,
"step": 692
},
{
"epoch": 0.33837890625,
"grad_norm": 1.3648223876953125,
"learning_rate": 7.709250168372932e-06,
"loss": 0.3298,
"step": 693
},
{
"epoch": 0.3388671875,
"grad_norm": 1.5124351978302002,
"learning_rate": 7.702599171495919e-06,
"loss": 0.3334,
"step": 694
},
{
"epoch": 0.33935546875,
"grad_norm": 37.46984100341797,
"learning_rate": 7.695941411877115e-06,
"loss": 0.3342,
"step": 695
},
{
"epoch": 0.33984375,
"grad_norm": 1.4970625638961792,
"learning_rate": 7.689276906176302e-06,
"loss": 0.3436,
"step": 696
},
{
"epoch": 0.34033203125,
"grad_norm": 3.098925828933716,
"learning_rate": 7.682605671070142e-06,
"loss": 0.3437,
"step": 697
},
{
"epoch": 0.3408203125,
"grad_norm": 1.7555867433547974,
"learning_rate": 7.675927723252134e-06,
"loss": 0.322,
"step": 698
},
{
"epoch": 0.34130859375,
"grad_norm": 1.5935651063919067,
"learning_rate": 7.669243079432578e-06,
"loss": 0.2998,
"step": 699
},
{
"epoch": 0.341796875,
"grad_norm": 1.506208896636963,
"learning_rate": 7.662551756338525e-06,
"loss": 0.3612,
"step": 700
},
{
"epoch": 0.34228515625,
"grad_norm": 1.923596978187561,
"learning_rate": 7.655853770713744e-06,
"loss": 0.3593,
"step": 701
},
{
"epoch": 0.3427734375,
"grad_norm": 1.9344090223312378,
"learning_rate": 7.64914913931867e-06,
"loss": 0.3156,
"step": 702
},
{
"epoch": 0.34326171875,
"grad_norm": 1.7808047533035278,
"learning_rate": 7.642437878930376e-06,
"loss": 0.3419,
"step": 703
},
{
"epoch": 0.34375,
"grad_norm": 1.5053675174713135,
"learning_rate": 7.635720006342513e-06,
"loss": 0.3539,
"step": 704
},
{
"epoch": 0.34423828125,
"grad_norm": 1.5963175296783447,
"learning_rate": 7.628995538365287e-06,
"loss": 0.3562,
"step": 705
},
{
"epoch": 0.3447265625,
"grad_norm": 1.4388726949691772,
"learning_rate": 7.6222644918254005e-06,
"loss": 0.3413,
"step": 706
},
{
"epoch": 0.34521484375,
"grad_norm": 3.6217451095581055,
"learning_rate": 7.615526883566023e-06,
"loss": 0.3584,
"step": 707
},
{
"epoch": 0.345703125,
"grad_norm": 1.6617943048477173,
"learning_rate": 7.608782730446741e-06,
"loss": 0.3675,
"step": 708
},
{
"epoch": 0.34619140625,
"grad_norm": 3.6505870819091797,
"learning_rate": 7.6020320493435175e-06,
"loss": 0.3028,
"step": 709
},
{
"epoch": 0.3466796875,
"grad_norm": 1.5057923793792725,
"learning_rate": 7.595274857148651e-06,
"loss": 0.3601,
"step": 710
},
{
"epoch": 0.34716796875,
"grad_norm": 1.775791049003601,
"learning_rate": 7.588511170770736e-06,
"loss": 0.3561,
"step": 711
},
{
"epoch": 0.34765625,
"grad_norm": 2.0912845134735107,
"learning_rate": 7.581741007134611e-06,
"loss": 0.3211,
"step": 712
},
{
"epoch": 0.34814453125,
"grad_norm": 1.4719021320343018,
"learning_rate": 7.574964383181329e-06,
"loss": 0.3571,
"step": 713
},
{
"epoch": 0.3486328125,
"grad_norm": 1.5099034309387207,
"learning_rate": 7.568181315868104e-06,
"loss": 0.3773,
"step": 714
},
{
"epoch": 0.34912109375,
"grad_norm": 1.797803282737732,
"learning_rate": 7.561391822168277e-06,
"loss": 0.3305,
"step": 715
},
{
"epoch": 0.349609375,
"grad_norm": 1.5316636562347412,
"learning_rate": 7.554595919071268e-06,
"loss": 0.3692,
"step": 716
},
{
"epoch": 0.35009765625,
"grad_norm": 1.332055926322937,
"learning_rate": 7.5477936235825344e-06,
"loss": 0.2998,
"step": 717
},
{
"epoch": 0.3505859375,
"grad_norm": 1.538785457611084,
"learning_rate": 7.540984952723531e-06,
"loss": 0.3325,
"step": 718
},
{
"epoch": 0.35107421875,
"grad_norm": 2.884404420852661,
"learning_rate": 7.534169923531665e-06,
"loss": 0.3036,
"step": 719
},
{
"epoch": 0.3515625,
"grad_norm": 1.7468745708465576,
"learning_rate": 7.527348553060254e-06,
"loss": 0.3199,
"step": 720
},
{
"epoch": 0.35205078125,
"grad_norm": 2.015227794647217,
"learning_rate": 7.520520858378486e-06,
"loss": 0.3884,
"step": 721
},
{
"epoch": 0.3525390625,
"grad_norm": 1.3880223035812378,
"learning_rate": 7.513686856571367e-06,
"loss": 0.336,
"step": 722
},
{
"epoch": 0.35302734375,
"grad_norm": 1.297411561012268,
"learning_rate": 7.506846564739694e-06,
"loss": 0.3306,
"step": 723
},
{
"epoch": 0.353515625,
"grad_norm": 1.55870521068573,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3056,
"step": 724
},
{
"epoch": 0.35400390625,
"grad_norm": 2.036909818649292,
"learning_rate": 7.493147179484514e-06,
"loss": 0.3273,
"step": 725
},
{
"epoch": 0.3544921875,
"grad_norm": 1.3678783178329468,
"learning_rate": 7.486288120341118e-06,
"loss": 0.345,
"step": 726
},
{
"epoch": 0.35498046875,
"grad_norm": 2.0894579887390137,
"learning_rate": 7.479422839733307e-06,
"loss": 0.359,
"step": 727
},
{
"epoch": 0.35546875,
"grad_norm": 1.6823246479034424,
"learning_rate": 7.4725513548401455e-06,
"loss": 0.3563,
"step": 728
},
{
"epoch": 0.35595703125,
"grad_norm": 1.351969838142395,
"learning_rate": 7.4656736828562186e-06,
"loss": 0.3017,
"step": 729
},
{
"epoch": 0.3564453125,
"grad_norm": 1.6686972379684448,
"learning_rate": 7.458789840991596e-06,
"loss": 0.3478,
"step": 730
},
{
"epoch": 0.35693359375,
"grad_norm": 1.3534908294677734,
"learning_rate": 7.4518998464717874e-06,
"loss": 0.3244,
"step": 731
},
{
"epoch": 0.357421875,
"grad_norm": 1.4082777500152588,
"learning_rate": 7.445003716537698e-06,
"loss": 0.3251,
"step": 732
},
{
"epoch": 0.35791015625,
"grad_norm": 2.0288498401641846,
"learning_rate": 7.438101468445582e-06,
"loss": 0.3379,
"step": 733
},
{
"epoch": 0.3583984375,
"grad_norm": 1.6891510486602783,
"learning_rate": 7.4311931194670085e-06,
"loss": 0.3576,
"step": 734
},
{
"epoch": 0.35888671875,
"grad_norm": 1.3616983890533447,
"learning_rate": 7.42427868688881e-06,
"loss": 0.3439,
"step": 735
},
{
"epoch": 0.359375,
"grad_norm": 1.5869650840759277,
"learning_rate": 7.417358188013042e-06,
"loss": 0.3389,
"step": 736
},
{
"epoch": 0.35986328125,
"grad_norm": 1.3705356121063232,
"learning_rate": 7.410431640156937e-06,
"loss": 0.346,
"step": 737
},
{
"epoch": 0.3603515625,
"grad_norm": 2.2622792720794678,
"learning_rate": 7.403499060652874e-06,
"loss": 0.3535,
"step": 738
},
{
"epoch": 0.36083984375,
"grad_norm": 1.719897747039795,
"learning_rate": 7.3965604668483145e-06,
"loss": 0.382,
"step": 739
},
{
"epoch": 0.361328125,
"grad_norm": 1.3844950199127197,
"learning_rate": 7.389615876105773e-06,
"loss": 0.3481,
"step": 740
},
{
"epoch": 0.36181640625,
"grad_norm": 1.6294703483581543,
"learning_rate": 7.38266530580277e-06,
"loss": 0.3656,
"step": 741
},
{
"epoch": 0.3623046875,
"grad_norm": 2.908967971801758,
"learning_rate": 7.375708773331791e-06,
"loss": 0.3457,
"step": 742
},
{
"epoch": 0.36279296875,
"grad_norm": 1.473132848739624,
"learning_rate": 7.36874629610024e-06,
"loss": 0.3385,
"step": 743
},
{
"epoch": 0.36328125,
"grad_norm": 2.919328451156616,
"learning_rate": 7.361777891530392e-06,
"loss": 0.3336,
"step": 744
},
{
"epoch": 0.36376953125,
"grad_norm": 2.563336133956909,
"learning_rate": 7.354803577059359e-06,
"loss": 0.3357,
"step": 745
},
{
"epoch": 0.3642578125,
"grad_norm": 1.4097625017166138,
"learning_rate": 7.347823370139042e-06,
"loss": 0.3559,
"step": 746
},
{
"epoch": 0.36474609375,
"grad_norm": 1.3321950435638428,
"learning_rate": 7.340837288236085e-06,
"loss": 0.3626,
"step": 747
},
{
"epoch": 0.365234375,
"grad_norm": 1.6507295370101929,
"learning_rate": 7.3338453488318284e-06,
"loss": 0.3095,
"step": 748
},
{
"epoch": 0.36572265625,
"grad_norm": 1.8008859157562256,
"learning_rate": 7.326847569422278e-06,
"loss": 0.3193,
"step": 749
},
{
"epoch": 0.3662109375,
"grad_norm": 1.4755789041519165,
"learning_rate": 7.3198439675180484e-06,
"loss": 0.2986,
"step": 750
},
{
"epoch": 0.36669921875,
"grad_norm": 1.7474323511123657,
"learning_rate": 7.312834560644327e-06,
"loss": 0.3936,
"step": 751
},
{
"epoch": 0.3671875,
"grad_norm": 1.6639896631240845,
"learning_rate": 7.30581936634082e-06,
"loss": 0.3673,
"step": 752
},
{
"epoch": 0.36767578125,
"grad_norm": 1.3790712356567383,
"learning_rate": 7.298798402161725e-06,
"loss": 0.3639,
"step": 753
},
{
"epoch": 0.3681640625,
"grad_norm": 1.9777040481567383,
"learning_rate": 7.291771685675673e-06,
"loss": 0.3299,
"step": 754
},
{
"epoch": 0.36865234375,
"grad_norm": 1.7995957136154175,
"learning_rate": 7.284739234465686e-06,
"loss": 0.3605,
"step": 755
},
{
"epoch": 0.369140625,
"grad_norm": 1.9671039581298828,
"learning_rate": 7.277701066129141e-06,
"loss": 0.3792,
"step": 756
},
{
"epoch": 0.36962890625,
"grad_norm": 2.719590187072754,
"learning_rate": 7.27065719827772e-06,
"loss": 0.3318,
"step": 757
},
{
"epoch": 0.3701171875,
"grad_norm": 1.9835278987884521,
"learning_rate": 7.2636076485373645e-06,
"loss": 0.3286,
"step": 758
},
{
"epoch": 0.37060546875,
"grad_norm": 1.2610225677490234,
"learning_rate": 7.256552434548236e-06,
"loss": 0.3274,
"step": 759
},
{
"epoch": 0.37109375,
"grad_norm": 1.2788983583450317,
"learning_rate": 7.249491573964671e-06,
"loss": 0.3622,
"step": 760
},
{
"epoch": 0.37158203125,
"grad_norm": 1.2974728345870972,
"learning_rate": 7.242425084455132e-06,
"loss": 0.3253,
"step": 761
},
{
"epoch": 0.3720703125,
"grad_norm": 1.8051031827926636,
"learning_rate": 7.23535298370217e-06,
"loss": 0.3486,
"step": 762
},
{
"epoch": 0.37255859375,
"grad_norm": 1.7785935401916504,
"learning_rate": 7.228275289402373e-06,
"loss": 0.3195,
"step": 763
},
{
"epoch": 0.373046875,
"grad_norm": 1.2360249757766724,
"learning_rate": 7.221192019266332e-06,
"loss": 0.3005,
"step": 764
},
{
"epoch": 0.37353515625,
"grad_norm": 1.5772784948349,
"learning_rate": 7.214103191018584e-06,
"loss": 0.3319,
"step": 765
},
{
"epoch": 0.3740234375,
"grad_norm": 1.5777393579483032,
"learning_rate": 7.2070088223975784e-06,
"loss": 0.3412,
"step": 766
},
{
"epoch": 0.37451171875,
"grad_norm": 1.2442673444747925,
"learning_rate": 7.199908931155628e-06,
"loss": 0.3236,
"step": 767
},
{
"epoch": 0.375,
"grad_norm": 1.1323033571243286,
"learning_rate": 7.192803535058861e-06,
"loss": 0.3236,
"step": 768
},
{
"epoch": 0.37548828125,
"grad_norm": 1.316483974456787,
"learning_rate": 7.185692651887186e-06,
"loss": 0.3295,
"step": 769
},
{
"epoch": 0.3759765625,
"grad_norm": 1.5371990203857422,
"learning_rate": 7.178576299434239e-06,
"loss": 0.3711,
"step": 770
},
{
"epoch": 0.37646484375,
"grad_norm": 1.7177865505218506,
"learning_rate": 7.171454495507341e-06,
"loss": 0.3294,
"step": 771
},
{
"epoch": 0.376953125,
"grad_norm": 1.4074996709823608,
"learning_rate": 7.164327257927456e-06,
"loss": 0.3472,
"step": 772
},
{
"epoch": 0.37744140625,
"grad_norm": 1.3459590673446655,
"learning_rate": 7.157194604529143e-06,
"loss": 0.3268,
"step": 773
},
{
"epoch": 0.3779296875,
"grad_norm": 1.3509142398834229,
"learning_rate": 7.150056553160517e-06,
"loss": 0.3258,
"step": 774
},
{
"epoch": 0.37841796875,
"grad_norm": 1.3562768697738647,
"learning_rate": 7.142913121683195e-06,
"loss": 0.3301,
"step": 775
},
{
"epoch": 0.37890625,
"grad_norm": 1.815333604812622,
"learning_rate": 7.135764327972261e-06,
"loss": 0.3653,
"step": 776
},
{
"epoch": 0.37939453125,
"grad_norm": 1.3162930011749268,
"learning_rate": 7.128610189916213e-06,
"loss": 0.376,
"step": 777
},
{
"epoch": 0.3798828125,
"grad_norm": 1.7800266742706299,
"learning_rate": 7.121450725416928e-06,
"loss": 0.3662,
"step": 778
},
{
"epoch": 0.38037109375,
"grad_norm": 1.5096458196640015,
"learning_rate": 7.114285952389604e-06,
"loss": 0.3588,
"step": 779
},
{
"epoch": 0.380859375,
"grad_norm": 2.538273334503174,
"learning_rate": 7.1071158887627304e-06,
"loss": 0.3312,
"step": 780
},
{
"epoch": 0.38134765625,
"grad_norm": 1.3077067136764526,
"learning_rate": 7.0999405524780266e-06,
"loss": 0.3344,
"step": 781
},
{
"epoch": 0.3818359375,
"grad_norm": 1.3059022426605225,
"learning_rate": 7.092759961490415e-06,
"loss": 0.3259,
"step": 782
},
{
"epoch": 0.38232421875,
"grad_norm": 2.276553153991699,
"learning_rate": 7.08557413376796e-06,
"loss": 0.3331,
"step": 783
},
{
"epoch": 0.3828125,
"grad_norm": 1.3777782917022705,
"learning_rate": 7.078383087291833e-06,
"loss": 0.3211,
"step": 784
},
{
"epoch": 0.38330078125,
"grad_norm": 1.3232738971710205,
"learning_rate": 7.071186840056264e-06,
"loss": 0.2928,
"step": 785
},
{
"epoch": 0.3837890625,
"grad_norm": 1.1360565423965454,
"learning_rate": 7.063985410068499e-06,
"loss": 0.3291,
"step": 786
},
{
"epoch": 0.38427734375,
"grad_norm": 1.5104074478149414,
"learning_rate": 7.056778815348746e-06,
"loss": 0.3388,
"step": 787
},
{
"epoch": 0.384765625,
"grad_norm": 1.3837941884994507,
"learning_rate": 7.0495670739301435e-06,
"loss": 0.3802,
"step": 788
},
{
"epoch": 0.38525390625,
"grad_norm": 2.0784964561462402,
"learning_rate": 7.042350203858706e-06,
"loss": 0.3153,
"step": 789
},
{
"epoch": 0.3857421875,
"grad_norm": 1.4472565650939941,
"learning_rate": 7.035128223193286e-06,
"loss": 0.3145,
"step": 790
},
{
"epoch": 0.38623046875,
"grad_norm": 1.729691505432129,
"learning_rate": 7.0279011500055136e-06,
"loss": 0.393,
"step": 791
},
{
"epoch": 0.38671875,
"grad_norm": 1.4967801570892334,
"learning_rate": 7.020669002379772e-06,
"loss": 0.3344,
"step": 792
},
{
"epoch": 0.38720703125,
"grad_norm": 1.322029948234558,
"learning_rate": 7.0134317984131395e-06,
"loss": 0.3319,
"step": 793
},
{
"epoch": 0.3876953125,
"grad_norm": 2.8917009830474854,
"learning_rate": 7.006189556215346e-06,
"loss": 0.3152,
"step": 794
},
{
"epoch": 0.38818359375,
"grad_norm": 1.581947922706604,
"learning_rate": 6.998942293908725e-06,
"loss": 0.3606,
"step": 795
},
{
"epoch": 0.388671875,
"grad_norm": 2.658916711807251,
"learning_rate": 6.991690029628181e-06,
"loss": 0.3451,
"step": 796
},
{
"epoch": 0.38916015625,
"grad_norm": 2.3201754093170166,
"learning_rate": 6.9844327815211275e-06,
"loss": 0.333,
"step": 797
},
{
"epoch": 0.3896484375,
"grad_norm": 1.4934650659561157,
"learning_rate": 6.977170567747452e-06,
"loss": 0.3336,
"step": 798
},
{
"epoch": 0.39013671875,
"grad_norm": 1.4863629341125488,
"learning_rate": 6.969903406479465e-06,
"loss": 0.3347,
"step": 799
},
{
"epoch": 0.390625,
"grad_norm": 1.3552590608596802,
"learning_rate": 6.962631315901861e-06,
"loss": 0.3623,
"step": 800
},
{
"epoch": 0.39111328125,
"grad_norm": 2.2949376106262207,
"learning_rate": 6.955354314211669e-06,
"loss": 0.2987,
"step": 801
},
{
"epoch": 0.3916015625,
"grad_norm": 1.3013123273849487,
"learning_rate": 6.948072419618201e-06,
"loss": 0.3307,
"step": 802
},
{
"epoch": 0.39208984375,
"grad_norm": 1.4084373712539673,
"learning_rate": 6.940785650343019e-06,
"loss": 0.3119,
"step": 803
},
{
"epoch": 0.392578125,
"grad_norm": 2.596653461456299,
"learning_rate": 6.93349402461988e-06,
"loss": 0.3228,
"step": 804
},
{
"epoch": 0.39306640625,
"grad_norm": 1.5036858320236206,
"learning_rate": 6.926197560694699e-06,
"loss": 0.3463,
"step": 805
},
{
"epoch": 0.3935546875,
"grad_norm": 1.8642725944519043,
"learning_rate": 6.918896276825485e-06,
"loss": 0.368,
"step": 806
},
{
"epoch": 0.39404296875,
"grad_norm": 1.289711356163025,
"learning_rate": 6.9115901912823226e-06,
"loss": 0.3582,
"step": 807
},
{
"epoch": 0.39453125,
"grad_norm": 1.507915735244751,
"learning_rate": 6.9042793223473024e-06,
"loss": 0.3829,
"step": 808
},
{
"epoch": 0.39501953125,
"grad_norm": 1.7021656036376953,
"learning_rate": 6.896963688314489e-06,
"loss": 0.3668,
"step": 809
},
{
"epoch": 0.3955078125,
"grad_norm": 1.2955149412155151,
"learning_rate": 6.889643307489865e-06,
"loss": 0.3344,
"step": 810
},
{
"epoch": 0.39599609375,
"grad_norm": 1.183563232421875,
"learning_rate": 6.882318198191298e-06,
"loss": 0.3191,
"step": 811
},
{
"epoch": 0.396484375,
"grad_norm": 1.458882451057434,
"learning_rate": 6.874988378748484e-06,
"loss": 0.3531,
"step": 812
},
{
"epoch": 0.39697265625,
"grad_norm": 1.6540387868881226,
"learning_rate": 6.8676538675029054e-06,
"loss": 0.3399,
"step": 813
},
{
"epoch": 0.3974609375,
"grad_norm": 1.2130305767059326,
"learning_rate": 6.860314682807786e-06,
"loss": 0.3387,
"step": 814
},
{
"epoch": 0.39794921875,
"grad_norm": 1.3185558319091797,
"learning_rate": 6.852970843028043e-06,
"loss": 0.3389,
"step": 815
},
{
"epoch": 0.3984375,
"grad_norm": 1.6620187759399414,
"learning_rate": 6.845622366540242e-06,
"loss": 0.3041,
"step": 816
},
{
"epoch": 0.39892578125,
"grad_norm": 1.1920667886734009,
"learning_rate": 6.8382692717325525e-06,
"loss": 0.3047,
"step": 817
},
{
"epoch": 0.3994140625,
"grad_norm": 1.4352617263793945,
"learning_rate": 6.8309115770046986e-06,
"loss": 0.3276,
"step": 818
},
{
"epoch": 0.39990234375,
"grad_norm": 1.6452810764312744,
"learning_rate": 6.8235493007679155e-06,
"loss": 0.3243,
"step": 819
},
{
"epoch": 0.400390625,
"grad_norm": 1.6612956523895264,
"learning_rate": 6.816182461444905e-06,
"loss": 0.342,
"step": 820
},
{
"epoch": 0.40087890625,
"grad_norm": 1.2954360246658325,
"learning_rate": 6.8088110774697825e-06,
"loss": 0.3117,
"step": 821
},
{
"epoch": 0.4013671875,
"grad_norm": 2.189624786376953,
"learning_rate": 6.8014351672880395e-06,
"loss": 0.3069,
"step": 822
},
{
"epoch": 0.40185546875,
"grad_norm": 1.4809291362762451,
"learning_rate": 6.794054749356492e-06,
"loss": 0.3355,
"step": 823
},
{
"epoch": 0.40234375,
"grad_norm": 1.6851189136505127,
"learning_rate": 6.786669842143236e-06,
"loss": 0.3435,
"step": 824
},
{
"epoch": 0.40283203125,
"grad_norm": 1.401813268661499,
"learning_rate": 6.779280464127601e-06,
"loss": 0.326,
"step": 825
},
{
"epoch": 0.4033203125,
"grad_norm": 1.7311843633651733,
"learning_rate": 6.771886633800104e-06,
"loss": 0.3281,
"step": 826
},
{
"epoch": 0.40380859375,
"grad_norm": 2.936901092529297,
"learning_rate": 6.764488369662403e-06,
"loss": 0.3727,
"step": 827
},
{
"epoch": 0.404296875,
"grad_norm": 1.319385051727295,
"learning_rate": 6.75708569022725e-06,
"loss": 0.344,
"step": 828
},
{
"epoch": 0.40478515625,
"grad_norm": 1.9358359575271606,
"learning_rate": 6.749678614018446e-06,
"loss": 0.3622,
"step": 829
},
{
"epoch": 0.4052734375,
"grad_norm": 1.1188249588012695,
"learning_rate": 6.742267159570796e-06,
"loss": 0.3299,
"step": 830
},
{
"epoch": 0.40576171875,
"grad_norm": 1.3562527894973755,
"learning_rate": 6.734851345430057e-06,
"loss": 0.319,
"step": 831
},
{
"epoch": 0.40625,
"grad_norm": 1.2941495180130005,
"learning_rate": 6.727431190152898e-06,
"loss": 0.3323,
"step": 832
},
{
"epoch": 0.40673828125,
"grad_norm": 2.1621103286743164,
"learning_rate": 6.720006712306849e-06,
"loss": 0.3409,
"step": 833
},
{
"epoch": 0.4072265625,
"grad_norm": 1.3561265468597412,
"learning_rate": 6.712577930470258e-06,
"loss": 0.3549,
"step": 834
},
{
"epoch": 0.40771484375,
"grad_norm": 1.2518807649612427,
"learning_rate": 6.705144863232246e-06,
"loss": 0.3279,
"step": 835
},
{
"epoch": 0.408203125,
"grad_norm": 1.1951934099197388,
"learning_rate": 6.697707529192648e-06,
"loss": 0.3146,
"step": 836
},
{
"epoch": 0.40869140625,
"grad_norm": 1.2976142168045044,
"learning_rate": 6.6902659469619855e-06,
"loss": 0.3151,
"step": 837
},
{
"epoch": 0.4091796875,
"grad_norm": 1.554851770401001,
"learning_rate": 6.682820135161405e-06,
"loss": 0.2972,
"step": 838
},
{
"epoch": 0.40966796875,
"grad_norm": 1.467674732208252,
"learning_rate": 6.675370112422639e-06,
"loss": 0.3538,
"step": 839
},
{
"epoch": 0.41015625,
"grad_norm": 2.0394184589385986,
"learning_rate": 6.667915897387957e-06,
"loss": 0.3124,
"step": 840
},
{
"epoch": 0.41064453125,
"grad_norm": 1.458815097808838,
"learning_rate": 6.6604575087101165e-06,
"loss": 0.3073,
"step": 841
},
{
"epoch": 0.4111328125,
"grad_norm": 1.2343790531158447,
"learning_rate": 6.6529949650523195e-06,
"loss": 0.3224,
"step": 842
},
{
"epoch": 0.41162109375,
"grad_norm": 1.307780385017395,
"learning_rate": 6.645528285088169e-06,
"loss": 0.3139,
"step": 843
},
{
"epoch": 0.412109375,
"grad_norm": 1.187071681022644,
"learning_rate": 6.638057487501613e-06,
"loss": 0.3316,
"step": 844
},
{
"epoch": 0.41259765625,
"grad_norm": 1.9509886503219604,
"learning_rate": 6.630582590986907e-06,
"loss": 0.3381,
"step": 845
},
{
"epoch": 0.4130859375,
"grad_norm": 1.5562846660614014,
"learning_rate": 6.623103614248561e-06,
"loss": 0.3648,
"step": 846
},
{
"epoch": 0.41357421875,
"grad_norm": 1.423948049545288,
"learning_rate": 6.615620576001293e-06,
"loss": 0.3163,
"step": 847
},
{
"epoch": 0.4140625,
"grad_norm": 1.5273832082748413,
"learning_rate": 6.608133494969993e-06,
"loss": 0.3002,
"step": 848
},
{
"epoch": 0.41455078125,
"grad_norm": 1.2620773315429688,
"learning_rate": 6.600642389889657e-06,
"loss": 0.3599,
"step": 849
},
{
"epoch": 0.4150390625,
"grad_norm": 1.283124566078186,
"learning_rate": 6.593147279505352e-06,
"loss": 0.3348,
"step": 850
},
{
"epoch": 0.41552734375,
"grad_norm": 1.2876836061477661,
"learning_rate": 6.585648182572176e-06,
"loss": 0.347,
"step": 851
},
{
"epoch": 0.416015625,
"grad_norm": 2.6049535274505615,
"learning_rate": 6.578145117855192e-06,
"loss": 0.3305,
"step": 852
},
{
"epoch": 0.41650390625,
"grad_norm": 1.7834153175354004,
"learning_rate": 6.570638104129399e-06,
"loss": 0.323,
"step": 853
},
{
"epoch": 0.4169921875,
"grad_norm": 1.3892278671264648,
"learning_rate": 6.563127160179672e-06,
"loss": 0.3475,
"step": 854
},
{
"epoch": 0.41748046875,
"grad_norm": 1.4540331363677979,
"learning_rate": 6.555612304800727e-06,
"loss": 0.3442,
"step": 855
},
{
"epoch": 0.41796875,
"grad_norm": 1.058359146118164,
"learning_rate": 6.548093556797063e-06,
"loss": 0.3398,
"step": 856
},
{
"epoch": 0.41845703125,
"grad_norm": 1.587546706199646,
"learning_rate": 6.540570934982917e-06,
"loss": 0.3261,
"step": 857
},
{
"epoch": 0.4189453125,
"grad_norm": 2.1293222904205322,
"learning_rate": 6.533044458182229e-06,
"loss": 0.3755,
"step": 858
},
{
"epoch": 0.41943359375,
"grad_norm": 1.2648324966430664,
"learning_rate": 6.5255141452285765e-06,
"loss": 0.3001,
"step": 859
},
{
"epoch": 0.419921875,
"grad_norm": 1.4118512868881226,
"learning_rate": 6.51798001496514e-06,
"loss": 0.3376,
"step": 860
},
{
"epoch": 0.42041015625,
"grad_norm": 1.4707554578781128,
"learning_rate": 6.510442086244649e-06,
"loss": 0.3247,
"step": 861
},
{
"epoch": 0.4208984375,
"grad_norm": 1.3729053735733032,
"learning_rate": 6.502900377929344e-06,
"loss": 0.3039,
"step": 862
},
{
"epoch": 0.42138671875,
"grad_norm": 3.840740442276001,
"learning_rate": 6.4953549088909194e-06,
"loss": 0.3567,
"step": 863
},
{
"epoch": 0.421875,
"grad_norm": 1.3986668586730957,
"learning_rate": 6.487805698010476e-06,
"loss": 0.3313,
"step": 864
},
{
"epoch": 0.42236328125,
"grad_norm": 3.7465996742248535,
"learning_rate": 6.4802527641784866e-06,
"loss": 0.3357,
"step": 865
},
{
"epoch": 0.4228515625,
"grad_norm": 1.7644517421722412,
"learning_rate": 6.472696126294733e-06,
"loss": 0.3662,
"step": 866
},
{
"epoch": 0.42333984375,
"grad_norm": 1.2544833421707153,
"learning_rate": 6.4651358032682694e-06,
"loss": 0.3371,
"step": 867
},
{
"epoch": 0.423828125,
"grad_norm": 1.500871181488037,
"learning_rate": 6.457571814017368e-06,
"loss": 0.3224,
"step": 868
},
{
"epoch": 0.42431640625,
"grad_norm": 1.3260788917541504,
"learning_rate": 6.45000417746948e-06,
"loss": 0.3161,
"step": 869
},
{
"epoch": 0.4248046875,
"grad_norm": 1.334038257598877,
"learning_rate": 6.442432912561178e-06,
"loss": 0.3423,
"step": 870
},
{
"epoch": 0.42529296875,
"grad_norm": 1.378933310508728,
"learning_rate": 6.434858038238118e-06,
"loss": 0.3492,
"step": 871
},
{
"epoch": 0.42578125,
"grad_norm": 1.5512367486953735,
"learning_rate": 6.427279573454985e-06,
"loss": 0.3731,
"step": 872
},
{
"epoch": 0.42626953125,
"grad_norm": 1.4665623903274536,
"learning_rate": 6.4196975371754514e-06,
"loss": 0.3481,
"step": 873
},
{
"epoch": 0.4267578125,
"grad_norm": 1.5259501934051514,
"learning_rate": 6.412111948372122e-06,
"loss": 0.3439,
"step": 874
},
{
"epoch": 0.42724609375,
"grad_norm": 1.465909719467163,
"learning_rate": 6.404522826026496e-06,
"loss": 0.33,
"step": 875
},
{
"epoch": 0.427734375,
"grad_norm": 1.357045292854309,
"learning_rate": 6.396930189128912e-06,
"loss": 0.344,
"step": 876
},
{
"epoch": 0.42822265625,
"grad_norm": 1.352899193763733,
"learning_rate": 6.3893340566785046e-06,
"loss": 0.3021,
"step": 877
},
{
"epoch": 0.4287109375,
"grad_norm": 1.3821226358413696,
"learning_rate": 6.381734447683152e-06,
"loss": 0.3326,
"step": 878
},
{
"epoch": 0.42919921875,
"grad_norm": 1.675229787826538,
"learning_rate": 6.374131381159436e-06,
"loss": 0.4357,
"step": 879
},
{
"epoch": 0.4296875,
"grad_norm": 1.7067149877548218,
"learning_rate": 6.366524876132589e-06,
"loss": 0.3018,
"step": 880
},
{
"epoch": 0.43017578125,
"grad_norm": 1.4271488189697266,
"learning_rate": 6.358914951636444e-06,
"loss": 0.3468,
"step": 881
},
{
"epoch": 0.4306640625,
"grad_norm": 1.3299568891525269,
"learning_rate": 6.351301626713398e-06,
"loss": 0.3466,
"step": 882
},
{
"epoch": 0.43115234375,
"grad_norm": 1.6695646047592163,
"learning_rate": 6.343684920414348e-06,
"loss": 0.3214,
"step": 883
},
{
"epoch": 0.431640625,
"grad_norm": 1.3570027351379395,
"learning_rate": 6.3360648517986605e-06,
"loss": 0.3382,
"step": 884
},
{
"epoch": 0.43212890625,
"grad_norm": 1.385907769203186,
"learning_rate": 6.32844143993411e-06,
"loss": 0.3092,
"step": 885
},
{
"epoch": 0.4326171875,
"grad_norm": 1.5601329803466797,
"learning_rate": 6.320814703896838e-06,
"loss": 0.3587,
"step": 886
},
{
"epoch": 0.43310546875,
"grad_norm": 1.39394211769104,
"learning_rate": 6.313184662771305e-06,
"loss": 0.3404,
"step": 887
},
{
"epoch": 0.43359375,
"grad_norm": 1.2028573751449585,
"learning_rate": 6.305551335650244e-06,
"loss": 0.3548,
"step": 888
},
{
"epoch": 0.43408203125,
"grad_norm": 4.250852108001709,
"learning_rate": 6.297914741634605e-06,
"loss": 0.3454,
"step": 889
},
{
"epoch": 0.4345703125,
"grad_norm": 1.5344691276550293,
"learning_rate": 6.290274899833517e-06,
"loss": 0.3176,
"step": 890
},
{
"epoch": 0.43505859375,
"grad_norm": 1.7602498531341553,
"learning_rate": 6.2826318293642385e-06,
"loss": 0.339,
"step": 891
},
{
"epoch": 0.435546875,
"grad_norm": 1.1949964761734009,
"learning_rate": 6.274985549352098e-06,
"loss": 0.304,
"step": 892
},
{
"epoch": 0.43603515625,
"grad_norm": 1.1564438343048096,
"learning_rate": 6.267336078930464e-06,
"loss": 0.3145,
"step": 893
},
{
"epoch": 0.4365234375,
"grad_norm": 1.3757606744766235,
"learning_rate": 6.259683437240683e-06,
"loss": 0.3385,
"step": 894
},
{
"epoch": 0.43701171875,
"grad_norm": 1.8371174335479736,
"learning_rate": 6.252027643432044e-06,
"loss": 0.3355,
"step": 895
},
{
"epoch": 0.4375,
"grad_norm": 1.334598422050476,
"learning_rate": 6.244368716661714e-06,
"loss": 0.3276,
"step": 896
},
{
"epoch": 0.43798828125,
"grad_norm": 1.5038282871246338,
"learning_rate": 6.236706676094705e-06,
"loss": 0.3522,
"step": 897
},
{
"epoch": 0.4384765625,
"grad_norm": 3.6733760833740234,
"learning_rate": 6.229041540903823e-06,
"loss": 0.3431,
"step": 898
},
{
"epoch": 0.43896484375,
"grad_norm": 1.5863288640975952,
"learning_rate": 6.221373330269613e-06,
"loss": 0.3324,
"step": 899
},
{
"epoch": 0.439453125,
"grad_norm": 1.4606237411499023,
"learning_rate": 6.213702063380317e-06,
"loss": 0.3226,
"step": 900
},
{
"epoch": 0.43994140625,
"grad_norm": 1.8370083570480347,
"learning_rate": 6.206027759431825e-06,
"loss": 0.3294,
"step": 901
},
{
"epoch": 0.4404296875,
"grad_norm": 1.6841802597045898,
"learning_rate": 6.198350437627631e-06,
"loss": 0.3238,
"step": 902
},
{
"epoch": 0.44091796875,
"grad_norm": 1.9791240692138672,
"learning_rate": 6.190670117178772e-06,
"loss": 0.3326,
"step": 903
},
{
"epoch": 0.44140625,
"grad_norm": 1.4503194093704224,
"learning_rate": 6.182986817303794e-06,
"loss": 0.3544,
"step": 904
},
{
"epoch": 0.44189453125,
"grad_norm": 1.9381232261657715,
"learning_rate": 6.175300557228698e-06,
"loss": 0.3278,
"step": 905
},
{
"epoch": 0.4423828125,
"grad_norm": 4.399080753326416,
"learning_rate": 6.167611356186895e-06,
"loss": 0.3367,
"step": 906
},
{
"epoch": 0.44287109375,
"grad_norm": 1.4784455299377441,
"learning_rate": 6.159919233419147e-06,
"loss": 0.3559,
"step": 907
},
{
"epoch": 0.443359375,
"grad_norm": 1.9754478931427002,
"learning_rate": 6.152224208173533e-06,
"loss": 0.3311,
"step": 908
},
{
"epoch": 0.44384765625,
"grad_norm": 1.5615670680999756,
"learning_rate": 6.144526299705396e-06,
"loss": 0.4023,
"step": 909
},
{
"epoch": 0.4443359375,
"grad_norm": 1.461332082748413,
"learning_rate": 6.136825527277295e-06,
"loss": 0.3026,
"step": 910
},
{
"epoch": 0.44482421875,
"grad_norm": 1.4366703033447266,
"learning_rate": 6.129121910158945e-06,
"loss": 0.336,
"step": 911
},
{
"epoch": 0.4453125,
"grad_norm": 2.06691575050354,
"learning_rate": 6.12141546762719e-06,
"loss": 0.342,
"step": 912
},
{
"epoch": 0.44580078125,
"grad_norm": 1.7794272899627686,
"learning_rate": 6.11370621896594e-06,
"loss": 0.3532,
"step": 913
},
{
"epoch": 0.4462890625,
"grad_norm": 1.4335381984710693,
"learning_rate": 6.105994183466131e-06,
"loss": 0.3471,
"step": 914
},
{
"epoch": 0.44677734375,
"grad_norm": 5.071071147918701,
"learning_rate": 6.0982793804256636e-06,
"loss": 0.336,
"step": 915
},
{
"epoch": 0.447265625,
"grad_norm": 1.2241181135177612,
"learning_rate": 6.090561829149373e-06,
"loss": 0.3232,
"step": 916
},
{
"epoch": 0.44775390625,
"grad_norm": 1.267858624458313,
"learning_rate": 6.082841548948966e-06,
"loss": 0.3556,
"step": 917
},
{
"epoch": 0.4482421875,
"grad_norm": 1.1905056238174438,
"learning_rate": 6.07511855914298e-06,
"loss": 0.2941,
"step": 918
},
{
"epoch": 0.44873046875,
"grad_norm": 1.2715431451797485,
"learning_rate": 6.067392879056729e-06,
"loss": 0.3159,
"step": 919
},
{
"epoch": 0.44921875,
"grad_norm": 1.2241966724395752,
"learning_rate": 6.059664528022267e-06,
"loss": 0.3141,
"step": 920
},
{
"epoch": 0.44970703125,
"grad_norm": 1.6341863870620728,
"learning_rate": 6.051933525378323e-06,
"loss": 0.3319,
"step": 921
},
{
"epoch": 0.4501953125,
"grad_norm": 3.6661813259124756,
"learning_rate": 6.044199890470267e-06,
"loss": 0.3482,
"step": 922
},
{
"epoch": 0.45068359375,
"grad_norm": 1.4551990032196045,
"learning_rate": 6.036463642650049e-06,
"loss": 0.3899,
"step": 923
},
{
"epoch": 0.451171875,
"grad_norm": 1.8738077878952026,
"learning_rate": 6.028724801276167e-06,
"loss": 0.3412,
"step": 924
},
{
"epoch": 0.45166015625,
"grad_norm": 1.3348729610443115,
"learning_rate": 6.020983385713601e-06,
"loss": 0.3194,
"step": 925
},
{
"epoch": 0.4521484375,
"grad_norm": 1.675868034362793,
"learning_rate": 6.013239415333776e-06,
"loss": 0.338,
"step": 926
},
{
"epoch": 0.45263671875,
"grad_norm": 1.5089606046676636,
"learning_rate": 6.005492909514507e-06,
"loss": 0.3502,
"step": 927
},
{
"epoch": 0.453125,
"grad_norm": 1.6367465257644653,
"learning_rate": 5.997743887639959e-06,
"loss": 0.3356,
"step": 928
},
{
"epoch": 0.45361328125,
"grad_norm": 1.5445111989974976,
"learning_rate": 5.989992369100586e-06,
"loss": 0.3192,
"step": 929
},
{
"epoch": 0.4541015625,
"grad_norm": 1.2671817541122437,
"learning_rate": 5.982238373293093e-06,
"loss": 0.3282,
"step": 930
},
{
"epoch": 0.45458984375,
"grad_norm": 1.2266660928726196,
"learning_rate": 5.974481919620386e-06,
"loss": 0.3202,
"step": 931
},
{
"epoch": 0.455078125,
"grad_norm": 1.5652544498443604,
"learning_rate": 5.966723027491518e-06,
"loss": 0.3502,
"step": 932
},
{
"epoch": 0.45556640625,
"grad_norm": 1.2947496175765991,
"learning_rate": 5.958961716321644e-06,
"loss": 0.317,
"step": 933
},
{
"epoch": 0.4560546875,
"grad_norm": 2.053834915161133,
"learning_rate": 5.951198005531974e-06,
"loss": 0.308,
"step": 934
},
{
"epoch": 0.45654296875,
"grad_norm": 2.342907428741455,
"learning_rate": 5.943431914549721e-06,
"loss": 0.3314,
"step": 935
},
{
"epoch": 0.45703125,
"grad_norm": 1.5535999536514282,
"learning_rate": 5.9356634628080555e-06,
"loss": 0.3362,
"step": 936
},
{
"epoch": 0.45751953125,
"grad_norm": 1.607968807220459,
"learning_rate": 5.927892669746054e-06,
"loss": 0.317,
"step": 937
},
{
"epoch": 0.4580078125,
"grad_norm": 1.268129825592041,
"learning_rate": 5.920119554808651e-06,
"loss": 0.3278,
"step": 938
},
{
"epoch": 0.45849609375,
"grad_norm": 4.848256587982178,
"learning_rate": 5.912344137446593e-06,
"loss": 0.3448,
"step": 939
},
{
"epoch": 0.458984375,
"grad_norm": 1.1670955419540405,
"learning_rate": 5.904566437116388e-06,
"loss": 0.2967,
"step": 940
},
{
"epoch": 0.45947265625,
"grad_norm": 2.250368595123291,
"learning_rate": 5.896786473280255e-06,
"loss": 0.32,
"step": 941
},
{
"epoch": 0.4599609375,
"grad_norm": 1.5156008005142212,
"learning_rate": 5.889004265406077e-06,
"loss": 0.2914,
"step": 942
},
{
"epoch": 0.46044921875,
"grad_norm": 1.0980958938598633,
"learning_rate": 5.8812198329673545e-06,
"loss": 0.304,
"step": 943
},
{
"epoch": 0.4609375,
"grad_norm": 1.7652188539505005,
"learning_rate": 5.873433195443152e-06,
"loss": 0.3497,
"step": 944
},
{
"epoch": 0.46142578125,
"grad_norm": 1.977793574333191,
"learning_rate": 5.865644372318053e-06,
"loss": 0.3598,
"step": 945
},
{
"epoch": 0.4619140625,
"grad_norm": 1.490369200706482,
"learning_rate": 5.857853383082112e-06,
"loss": 0.3433,
"step": 946
},
{
"epoch": 0.46240234375,
"grad_norm": 5.214506149291992,
"learning_rate": 5.8500602472307974e-06,
"loss": 0.3506,
"step": 947
},
{
"epoch": 0.462890625,
"grad_norm": 1.304093837738037,
"learning_rate": 5.842264984264958e-06,
"loss": 0.3035,
"step": 948
},
{
"epoch": 0.46337890625,
"grad_norm": 1.2441211938858032,
"learning_rate": 5.834467613690759e-06,
"loss": 0.3308,
"step": 949
},
{
"epoch": 0.4638671875,
"grad_norm": 1.0881738662719727,
"learning_rate": 5.82666815501964e-06,
"loss": 0.3163,
"step": 950
},
{
"epoch": 0.46435546875,
"grad_norm": 1.4398066997528076,
"learning_rate": 5.8188666277682695e-06,
"loss": 0.327,
"step": 951
},
{
"epoch": 0.46484375,
"grad_norm": 1.81572425365448,
"learning_rate": 5.8110630514584854e-06,
"loss": 0.3328,
"step": 952
},
{
"epoch": 0.46533203125,
"grad_norm": 1.5575212240219116,
"learning_rate": 5.803257445617263e-06,
"loss": 0.3495,
"step": 953
},
{
"epoch": 0.4658203125,
"grad_norm": 1.3975605964660645,
"learning_rate": 5.795449829776645e-06,
"loss": 0.3448,
"step": 954
},
{
"epoch": 0.46630859375,
"grad_norm": 1.2950125932693481,
"learning_rate": 5.787640223473713e-06,
"loss": 0.3617,
"step": 955
},
{
"epoch": 0.466796875,
"grad_norm": 1.3984689712524414,
"learning_rate": 5.779828646250522e-06,
"loss": 0.3608,
"step": 956
},
{
"epoch": 0.46728515625,
"grad_norm": 1.0765591859817505,
"learning_rate": 5.772015117654065e-06,
"loss": 0.3093,
"step": 957
},
{
"epoch": 0.4677734375,
"grad_norm": 1.5954604148864746,
"learning_rate": 5.764199657236214e-06,
"loss": 0.3504,
"step": 958
},
{
"epoch": 0.46826171875,
"grad_norm": 1.6604746580123901,
"learning_rate": 5.756382284553675e-06,
"loss": 0.3096,
"step": 959
},
{
"epoch": 0.46875,
"grad_norm": 1.3618206977844238,
"learning_rate": 5.7485630191679456e-06,
"loss": 0.3057,
"step": 960
},
{
"epoch": 0.46923828125,
"grad_norm": 1.217523217201233,
"learning_rate": 5.740741880645248e-06,
"loss": 0.3708,
"step": 961
},
{
"epoch": 0.4697265625,
"grad_norm": 1.2130963802337646,
"learning_rate": 5.7329188885565e-06,
"loss": 0.321,
"step": 962
},
{
"epoch": 0.47021484375,
"grad_norm": 1.3064903020858765,
"learning_rate": 5.725094062477256e-06,
"loss": 0.3211,
"step": 963
},
{
"epoch": 0.470703125,
"grad_norm": 1.5063132047653198,
"learning_rate": 5.717267421987659e-06,
"loss": 0.3307,
"step": 964
},
{
"epoch": 0.47119140625,
"grad_norm": 1.3585816621780396,
"learning_rate": 5.7094389866723905e-06,
"loss": 0.3631,
"step": 965
},
{
"epoch": 0.4716796875,
"grad_norm": 1.5815399885177612,
"learning_rate": 5.701608776120627e-06,
"loss": 0.352,
"step": 966
},
{
"epoch": 0.47216796875,
"grad_norm": 1.4560235738754272,
"learning_rate": 5.6937768099259845e-06,
"loss": 0.3109,
"step": 967
},
{
"epoch": 0.47265625,
"grad_norm": 1.8057149648666382,
"learning_rate": 5.685943107686476e-06,
"loss": 0.3218,
"step": 968
},
{
"epoch": 0.47314453125,
"grad_norm": 1.4362132549285889,
"learning_rate": 5.678107689004449e-06,
"loss": 0.3293,
"step": 969
},
{
"epoch": 0.4736328125,
"grad_norm": 2.0112991333007812,
"learning_rate": 5.670270573486555e-06,
"loss": 0.356,
"step": 970
},
{
"epoch": 0.47412109375,
"grad_norm": 1.2395293712615967,
"learning_rate": 5.662431780743691e-06,
"loss": 0.3439,
"step": 971
},
{
"epoch": 0.474609375,
"grad_norm": 1.4867768287658691,
"learning_rate": 5.6545913303909495e-06,
"loss": 0.3767,
"step": 972
},
{
"epoch": 0.47509765625,
"grad_norm": 1.210928201675415,
"learning_rate": 5.646749242047567e-06,
"loss": 0.3259,
"step": 973
},
{
"epoch": 0.4755859375,
"grad_norm": 1.157676100730896,
"learning_rate": 5.6389055353368826e-06,
"loss": 0.336,
"step": 974
},
{
"epoch": 0.47607421875,
"grad_norm": 1.485719919204712,
"learning_rate": 5.631060229886287e-06,
"loss": 0.3121,
"step": 975
},
{
"epoch": 0.4765625,
"grad_norm": 1.1137949228286743,
"learning_rate": 5.6232133453271676e-06,
"loss": 0.3362,
"step": 976
},
{
"epoch": 0.47705078125,
"grad_norm": 1.213346004486084,
"learning_rate": 5.615364901294863e-06,
"loss": 0.3194,
"step": 977
},
{
"epoch": 0.4775390625,
"grad_norm": 1.3590606451034546,
"learning_rate": 5.607514917428618e-06,
"loss": 0.3484,
"step": 978
},
{
"epoch": 0.47802734375,
"grad_norm": 2.0311455726623535,
"learning_rate": 5.599663413371527e-06,
"loss": 0.3419,
"step": 979
},
{
"epoch": 0.478515625,
"grad_norm": 1.195672869682312,
"learning_rate": 5.5918104087704925e-06,
"loss": 0.339,
"step": 980
},
{
"epoch": 0.47900390625,
"grad_norm": 1.8912562131881714,
"learning_rate": 5.583955923276163e-06,
"loss": 0.3427,
"step": 981
},
{
"epoch": 0.4794921875,
"grad_norm": 2.002305030822754,
"learning_rate": 5.576099976542904e-06,
"loss": 0.3595,
"step": 982
},
{
"epoch": 0.47998046875,
"grad_norm": 1.4438331127166748,
"learning_rate": 5.56824258822873e-06,
"loss": 0.3632,
"step": 983
},
{
"epoch": 0.48046875,
"grad_norm": 1.366222620010376,
"learning_rate": 5.560383777995264e-06,
"loss": 0.3188,
"step": 984
},
{
"epoch": 0.48095703125,
"grad_norm": 1.3330532312393188,
"learning_rate": 5.552523565507689e-06,
"loss": 0.3262,
"step": 985
},
{
"epoch": 0.4814453125,
"grad_norm": 1.5084117650985718,
"learning_rate": 5.544661970434696e-06,
"loss": 0.325,
"step": 986
},
{
"epoch": 0.48193359375,
"grad_norm": 1.0425949096679688,
"learning_rate": 5.536799012448435e-06,
"loss": 0.315,
"step": 987
},
{
"epoch": 0.482421875,
"grad_norm": 2.695110559463501,
"learning_rate": 5.528934711224467e-06,
"loss": 0.3166,
"step": 988
},
{
"epoch": 0.48291015625,
"grad_norm": 1.3446696996688843,
"learning_rate": 5.521069086441715e-06,
"loss": 0.3437,
"step": 989
},
{
"epoch": 0.4833984375,
"grad_norm": 1.360203742980957,
"learning_rate": 5.513202157782411e-06,
"loss": 0.3472,
"step": 990
},
{
"epoch": 0.48388671875,
"grad_norm": 1.3492072820663452,
"learning_rate": 5.505333944932053e-06,
"loss": 0.3363,
"step": 991
},
{
"epoch": 0.484375,
"grad_norm": 1.1588752269744873,
"learning_rate": 5.497464467579351e-06,
"loss": 0.338,
"step": 992
},
{
"epoch": 0.48486328125,
"grad_norm": 1.4233770370483398,
"learning_rate": 5.48959374541618e-06,
"loss": 0.336,
"step": 993
},
{
"epoch": 0.4853515625,
"grad_norm": 1.3421063423156738,
"learning_rate": 5.4817217981375286e-06,
"loss": 0.324,
"step": 994
},
{
"epoch": 0.48583984375,
"grad_norm": 1.6678565740585327,
"learning_rate": 5.473848645441452e-06,
"loss": 0.3189,
"step": 995
},
{
"epoch": 0.486328125,
"grad_norm": 1.912955641746521,
"learning_rate": 5.465974307029021e-06,
"loss": 0.3643,
"step": 996
},
{
"epoch": 0.48681640625,
"grad_norm": 2.0670387744903564,
"learning_rate": 5.458098802604273e-06,
"loss": 0.332,
"step": 997
},
{
"epoch": 0.4873046875,
"grad_norm": 2.6159446239471436,
"learning_rate": 5.450222151874166e-06,
"loss": 0.3674,
"step": 998
},
{
"epoch": 0.48779296875,
"grad_norm": 1.3627862930297852,
"learning_rate": 5.442344374548524e-06,
"loss": 0.3496,
"step": 999
},
{
"epoch": 0.48828125,
"grad_norm": 1.4907851219177246,
"learning_rate": 5.43446549033999e-06,
"loss": 0.3475,
"step": 1000
}
],
"logging_steps": 1.0,
"max_steps": 2048,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.000488202654974e+18,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}