diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.48828125, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00048828125, + "grad_norm": 11.68676471710205, + "learning_rate": 1.6129032258064518e-07, + "loss": 0.6894, + "step": 1 + }, + { + "epoch": 0.0009765625, + "grad_norm": 16.37053871154785, + "learning_rate": 3.2258064516129035e-07, + "loss": 0.7171, + "step": 2 + }, + { + "epoch": 0.00146484375, + "grad_norm": 23.564491271972656, + "learning_rate": 4.838709677419355e-07, + "loss": 0.7123, + "step": 3 + }, + { + "epoch": 0.001953125, + "grad_norm": 16.051462173461914, + "learning_rate": 6.451612903225807e-07, + "loss": 0.7445, + "step": 4 + }, + { + "epoch": 0.00244140625, + "grad_norm": 13.484965324401855, + "learning_rate": 8.064516129032258e-07, + "loss": 0.7697, + "step": 5 + }, + { + "epoch": 0.0029296875, + "grad_norm": 12.733880043029785, + "learning_rate": 9.67741935483871e-07, + "loss": 0.6796, + "step": 6 + }, + { + "epoch": 0.00341796875, + "grad_norm": 11.081924438476562, + "learning_rate": 1.1290322580645162e-06, + "loss": 0.6711, + "step": 7 + }, + { + "epoch": 0.00390625, + "grad_norm": 11.96164321899414, + "learning_rate": 1.2903225806451614e-06, + "loss": 0.6916, + "step": 8 + }, + { + "epoch": 0.00439453125, + "grad_norm": 8.69968318939209, + "learning_rate": 1.4516129032258066e-06, + "loss": 0.6125, + "step": 9 + }, + { + "epoch": 0.0048828125, + "grad_norm": 8.749759674072266, + "learning_rate": 1.6129032258064516e-06, + "loss": 0.5684, + "step": 10 + }, + { + "epoch": 0.00537109375, + "grad_norm": 9.206546783447266, + "learning_rate": 1.774193548387097e-06, + "loss": 0.5901, + "step": 11 + }, + { + "epoch": 0.005859375, + "grad_norm": 6.172158718109131, + "learning_rate": 1.935483870967742e-06, + "loss": 0.5147, + "step": 12 + }, + { + "epoch": 0.00634765625, + "grad_norm": 5.583189010620117, + "learning_rate": 2.096774193548387e-06, + "loss": 0.5078, + "step": 13 + }, + { + "epoch": 0.0068359375, + "grad_norm": 8.174113273620605, + "learning_rate": 2.2580645161290324e-06, + "loss": 0.5151, + "step": 14 + }, + { + "epoch": 0.00732421875, + "grad_norm": 11.44507122039795, + "learning_rate": 2.4193548387096776e-06, + "loss": 0.5215, + "step": 15 + }, + { + "epoch": 0.0078125, + "grad_norm": 4.763265132904053, + "learning_rate": 2.580645161290323e-06, + "loss": 0.5062, + "step": 16 + }, + { + "epoch": 0.00830078125, + "grad_norm": 7.144759178161621, + "learning_rate": 2.7419354838709676e-06, + "loss": 0.5313, + "step": 17 + }, + { + "epoch": 0.0087890625, + "grad_norm": 4.595753192901611, + "learning_rate": 2.903225806451613e-06, + "loss": 0.4514, + "step": 18 + }, + { + "epoch": 0.00927734375, + "grad_norm": 5.988632678985596, + "learning_rate": 3.0645161290322584e-06, + "loss": 0.468, + "step": 19 + }, + { + "epoch": 0.009765625, + "grad_norm": 5.993471145629883, + "learning_rate": 3.225806451612903e-06, + "loss": 0.4231, + "step": 20 + }, + { + "epoch": 0.01025390625, + "grad_norm": 5.629610538482666, + "learning_rate": 3.3870967741935484e-06, + "loss": 0.4748, + "step": 21 + }, + { + "epoch": 0.0107421875, + "grad_norm": 5.070748329162598, + "learning_rate": 3.548387096774194e-06, + "loss": 0.4851, + "step": 22 + }, + { + "epoch": 0.01123046875, + "grad_norm": 5.008419990539551, + "learning_rate": 3.7096774193548392e-06, + "loss": 0.4251, + "step": 23 + }, + { + "epoch": 0.01171875, + "grad_norm": 5.048961162567139, + "learning_rate": 3.870967741935484e-06, + "loss": 0.4423, + "step": 24 + }, + { + "epoch": 0.01220703125, + "grad_norm": 3.505443811416626, + "learning_rate": 4.032258064516129e-06, + "loss": 0.4165, + "step": 25 + }, + { + "epoch": 0.0126953125, + "grad_norm": 4.471498966217041, + "learning_rate": 4.193548387096774e-06, + "loss": 0.4132, + "step": 26 + }, + { + "epoch": 0.01318359375, + "grad_norm": 3.593733310699463, + "learning_rate": 4.35483870967742e-06, + "loss": 0.38, + "step": 27 + }, + { + "epoch": 0.013671875, + "grad_norm": 7.17294979095459, + "learning_rate": 4.516129032258065e-06, + "loss": 0.3956, + "step": 28 + }, + { + "epoch": 0.01416015625, + "grad_norm": 15.088685989379883, + "learning_rate": 4.67741935483871e-06, + "loss": 0.4425, + "step": 29 + }, + { + "epoch": 0.0146484375, + "grad_norm": 4.4346113204956055, + "learning_rate": 4.838709677419355e-06, + "loss": 0.3911, + "step": 30 + }, + { + "epoch": 0.01513671875, + "grad_norm": 4.740771293640137, + "learning_rate": 5e-06, + "loss": 0.423, + "step": 31 + }, + { + "epoch": 0.015625, + "grad_norm": 3.4211642742156982, + "learning_rate": 5.161290322580646e-06, + "loss": 0.4183, + "step": 32 + }, + { + "epoch": 0.01611328125, + "grad_norm": 5.500433444976807, + "learning_rate": 5.322580645161291e-06, + "loss": 0.3956, + "step": 33 + }, + { + "epoch": 0.0166015625, + "grad_norm": 4.092607021331787, + "learning_rate": 5.483870967741935e-06, + "loss": 0.4028, + "step": 34 + }, + { + "epoch": 0.01708984375, + "grad_norm": 12.963457107543945, + "learning_rate": 5.645161290322582e-06, + "loss": 0.3862, + "step": 35 + }, + { + "epoch": 0.017578125, + "grad_norm": 4.550689697265625, + "learning_rate": 5.806451612903226e-06, + "loss": 0.4078, + "step": 36 + }, + { + "epoch": 0.01806640625, + "grad_norm": 3.3017280101776123, + "learning_rate": 5.967741935483872e-06, + "loss": 0.4334, + "step": 37 + }, + { + "epoch": 0.0185546875, + "grad_norm": 4.2097954750061035, + "learning_rate": 6.129032258064517e-06, + "loss": 0.342, + "step": 38 + }, + { + "epoch": 0.01904296875, + "grad_norm": 2.9576752185821533, + "learning_rate": 6.290322580645162e-06, + "loss": 0.3824, + "step": 39 + }, + { + "epoch": 0.01953125, + "grad_norm": 6.747947692871094, + "learning_rate": 6.451612903225806e-06, + "loss": 0.3952, + "step": 40 + }, + { + "epoch": 0.02001953125, + "grad_norm": 2.851712942123413, + "learning_rate": 6.612903225806452e-06, + "loss": 0.4143, + "step": 41 + }, + { + "epoch": 0.0205078125, + "grad_norm": 3.3788578510284424, + "learning_rate": 6.774193548387097e-06, + "loss": 0.3733, + "step": 42 + }, + { + "epoch": 0.02099609375, + "grad_norm": 4.708284378051758, + "learning_rate": 6.935483870967743e-06, + "loss": 0.3955, + "step": 43 + }, + { + "epoch": 0.021484375, + "grad_norm": 3.0566701889038086, + "learning_rate": 7.096774193548388e-06, + "loss": 0.402, + "step": 44 + }, + { + "epoch": 0.02197265625, + "grad_norm": 4.440851211547852, + "learning_rate": 7.258064516129033e-06, + "loss": 0.361, + "step": 45 + }, + { + "epoch": 0.0224609375, + "grad_norm": 2.7747905254364014, + "learning_rate": 7.4193548387096784e-06, + "loss": 0.3896, + "step": 46 + }, + { + "epoch": 0.02294921875, + "grad_norm": 3.510695695877075, + "learning_rate": 7.580645161290323e-06, + "loss": 0.364, + "step": 47 + }, + { + "epoch": 0.0234375, + "grad_norm": 20.806020736694336, + "learning_rate": 7.741935483870968e-06, + "loss": 0.3849, + "step": 48 + }, + { + "epoch": 0.02392578125, + "grad_norm": 3.569124698638916, + "learning_rate": 7.903225806451613e-06, + "loss": 0.3569, + "step": 49 + }, + { + "epoch": 0.0244140625, + "grad_norm": 2.8412413597106934, + "learning_rate": 8.064516129032258e-06, + "loss": 0.362, + "step": 50 + }, + { + "epoch": 0.02490234375, + "grad_norm": 3.287231683731079, + "learning_rate": 8.225806451612904e-06, + "loss": 0.3941, + "step": 51 + }, + { + "epoch": 0.025390625, + "grad_norm": 2.849888563156128, + "learning_rate": 8.387096774193549e-06, + "loss": 0.3906, + "step": 52 + }, + { + "epoch": 0.02587890625, + "grad_norm": 6.925948619842529, + "learning_rate": 8.548387096774194e-06, + "loss": 0.3783, + "step": 53 + }, + { + "epoch": 0.0263671875, + "grad_norm": 2.9347381591796875, + "learning_rate": 8.70967741935484e-06, + "loss": 0.4156, + "step": 54 + }, + { + "epoch": 0.02685546875, + "grad_norm": 3.695150375366211, + "learning_rate": 8.870967741935484e-06, + "loss": 0.3586, + "step": 55 + }, + { + "epoch": 0.02734375, + "grad_norm": 7.241847038269043, + "learning_rate": 9.03225806451613e-06, + "loss": 0.3693, + "step": 56 + }, + { + "epoch": 0.02783203125, + "grad_norm": 2.603956699371338, + "learning_rate": 9.193548387096775e-06, + "loss": 0.4109, + "step": 57 + }, + { + "epoch": 0.0283203125, + "grad_norm": 3.0118958950042725, + "learning_rate": 9.35483870967742e-06, + "loss": 0.4096, + "step": 58 + }, + { + "epoch": 0.02880859375, + "grad_norm": 5.108702182769775, + "learning_rate": 9.516129032258065e-06, + "loss": 0.3786, + "step": 59 + }, + { + "epoch": 0.029296875, + "grad_norm": 3.0591766834259033, + "learning_rate": 9.67741935483871e-06, + "loss": 0.3979, + "step": 60 + }, + { + "epoch": 0.02978515625, + "grad_norm": 3.5517218112945557, + "learning_rate": 9.838709677419356e-06, + "loss": 0.3847, + "step": 61 + }, + { + "epoch": 0.0302734375, + "grad_norm": 3.091423988342285, + "learning_rate": 1e-05, + "loss": 0.35, + "step": 62 + }, + { + "epoch": 0.03076171875, + "grad_norm": 2.7133779525756836, + "learning_rate": 9.999993744224208e-06, + "loss": 0.3592, + "step": 63 + }, + { + "epoch": 0.03125, + "grad_norm": 2.4324684143066406, + "learning_rate": 9.999974976912485e-06, + "loss": 0.3616, + "step": 64 + }, + { + "epoch": 0.03173828125, + "grad_norm": 2.872821807861328, + "learning_rate": 9.999943698111792e-06, + "loss": 0.3741, + "step": 65 + }, + { + "epoch": 0.0322265625, + "grad_norm": 2.9383156299591064, + "learning_rate": 9.999899907900399e-06, + "loss": 0.3732, + "step": 66 + }, + { + "epoch": 0.03271484375, + "grad_norm": 3.5359489917755127, + "learning_rate": 9.999843606387883e-06, + "loss": 0.4053, + "step": 67 + }, + { + "epoch": 0.033203125, + "grad_norm": 3.5608558654785156, + "learning_rate": 9.999774793715126e-06, + "loss": 0.4197, + "step": 68 + }, + { + "epoch": 0.03369140625, + "grad_norm": 2.5407004356384277, + "learning_rate": 9.999693470054321e-06, + "loss": 0.354, + "step": 69 + }, + { + "epoch": 0.0341796875, + "grad_norm": 3.4264254570007324, + "learning_rate": 9.999599635608964e-06, + "loss": 0.3936, + "step": 70 + }, + { + "epoch": 0.03466796875, + "grad_norm": 3.0363235473632812, + "learning_rate": 9.999493290613859e-06, + "loss": 0.3753, + "step": 71 + }, + { + "epoch": 0.03515625, + "grad_norm": 2.2824833393096924, + "learning_rate": 9.999374435335113e-06, + "loss": 0.3813, + "step": 72 + }, + { + "epoch": 0.03564453125, + "grad_norm": 2.445328712463379, + "learning_rate": 9.999243070070137e-06, + "loss": 0.4237, + "step": 73 + }, + { + "epoch": 0.0361328125, + "grad_norm": 5.150700092315674, + "learning_rate": 9.99909919514765e-06, + "loss": 0.3892, + "step": 74 + }, + { + "epoch": 0.03662109375, + "grad_norm": 4.1412272453308105, + "learning_rate": 9.998942810927673e-06, + "loss": 0.3675, + "step": 75 + }, + { + "epoch": 0.037109375, + "grad_norm": 5.456881999969482, + "learning_rate": 9.998773917801526e-06, + "loss": 0.3955, + "step": 76 + }, + { + "epoch": 0.03759765625, + "grad_norm": 2.2837321758270264, + "learning_rate": 9.998592516191832e-06, + "loss": 0.3477, + "step": 77 + }, + { + "epoch": 0.0380859375, + "grad_norm": 2.237900972366333, + "learning_rate": 9.998398606552513e-06, + "loss": 0.3771, + "step": 78 + }, + { + "epoch": 0.03857421875, + "grad_norm": 2.6276211738586426, + "learning_rate": 9.998192189368795e-06, + "loss": 0.3989, + "step": 79 + }, + { + "epoch": 0.0390625, + "grad_norm": 3.5280210971832275, + "learning_rate": 9.997973265157192e-06, + "loss": 0.3726, + "step": 80 + }, + { + "epoch": 0.03955078125, + "grad_norm": 8.555140495300293, + "learning_rate": 9.997741834465526e-06, + "loss": 0.397, + "step": 81 + }, + { + "epoch": 0.0400390625, + "grad_norm": 2.1885085105895996, + "learning_rate": 9.997497897872904e-06, + "loss": 0.4058, + "step": 82 + }, + { + "epoch": 0.04052734375, + "grad_norm": 3.0636098384857178, + "learning_rate": 9.997241455989735e-06, + "loss": 0.3866, + "step": 83 + }, + { + "epoch": 0.041015625, + "grad_norm": 3.7982375621795654, + "learning_rate": 9.996972509457711e-06, + "loss": 0.3877, + "step": 84 + }, + { + "epoch": 0.04150390625, + "grad_norm": 2.4791505336761475, + "learning_rate": 9.996691058949826e-06, + "loss": 0.3789, + "step": 85 + }, + { + "epoch": 0.0419921875, + "grad_norm": 3.917693614959717, + "learning_rate": 9.996397105170353e-06, + "loss": 0.3737, + "step": 86 + }, + { + "epoch": 0.04248046875, + "grad_norm": 2.3083252906799316, + "learning_rate": 9.996090648854856e-06, + "loss": 0.3658, + "step": 87 + }, + { + "epoch": 0.04296875, + "grad_norm": 1.9872547388076782, + "learning_rate": 9.995771690770184e-06, + "loss": 0.3819, + "step": 88 + }, + { + "epoch": 0.04345703125, + "grad_norm": 1.8703923225402832, + "learning_rate": 9.995440231714469e-06, + "loss": 0.37, + "step": 89 + }, + { + "epoch": 0.0439453125, + "grad_norm": 2.7573578357696533, + "learning_rate": 9.995096272517122e-06, + "loss": 0.3876, + "step": 90 + }, + { + "epoch": 0.04443359375, + "grad_norm": 2.177542209625244, + "learning_rate": 9.99473981403884e-06, + "loss": 0.434, + "step": 91 + }, + { + "epoch": 0.044921875, + "grad_norm": 1.9339114427566528, + "learning_rate": 9.99437085717159e-06, + "loss": 0.333, + "step": 92 + }, + { + "epoch": 0.04541015625, + "grad_norm": 2.9820590019226074, + "learning_rate": 9.993989402838618e-06, + "loss": 0.3321, + "step": 93 + }, + { + "epoch": 0.0458984375, + "grad_norm": 2.0244717597961426, + "learning_rate": 9.99359545199444e-06, + "loss": 0.3153, + "step": 94 + }, + { + "epoch": 0.04638671875, + "grad_norm": 2.0268101692199707, + "learning_rate": 9.993189005624842e-06, + "loss": 0.3663, + "step": 95 + }, + { + "epoch": 0.046875, + "grad_norm": 1.920785903930664, + "learning_rate": 9.992770064746882e-06, + "loss": 0.3419, + "step": 96 + }, + { + "epoch": 0.04736328125, + "grad_norm": 3.2875781059265137, + "learning_rate": 9.992338630408877e-06, + "loss": 0.3406, + "step": 97 + }, + { + "epoch": 0.0478515625, + "grad_norm": 3.7749016284942627, + "learning_rate": 9.991894703690414e-06, + "loss": 0.3555, + "step": 98 + }, + { + "epoch": 0.04833984375, + "grad_norm": 4.618077754974365, + "learning_rate": 9.991438285702332e-06, + "loss": 0.4001, + "step": 99 + }, + { + "epoch": 0.048828125, + "grad_norm": 2.468576192855835, + "learning_rate": 9.99096937758673e-06, + "loss": 0.4258, + "step": 100 + }, + { + "epoch": 0.04931640625, + "grad_norm": 5.204842567443848, + "learning_rate": 9.990487980516962e-06, + "loss": 0.4107, + "step": 101 + }, + { + "epoch": 0.0498046875, + "grad_norm": 3.3488011360168457, + "learning_rate": 9.989994095697636e-06, + "loss": 0.3658, + "step": 102 + }, + { + "epoch": 0.05029296875, + "grad_norm": 4.41386079788208, + "learning_rate": 9.989487724364602e-06, + "loss": 0.3705, + "step": 103 + }, + { + "epoch": 0.05078125, + "grad_norm": 2.9542033672332764, + "learning_rate": 9.988968867784958e-06, + "loss": 0.3955, + "step": 104 + }, + { + "epoch": 0.05126953125, + "grad_norm": 2.3820998668670654, + "learning_rate": 9.988437527257044e-06, + "loss": 0.3652, + "step": 105 + }, + { + "epoch": 0.0517578125, + "grad_norm": 2.352477550506592, + "learning_rate": 9.987893704110441e-06, + "loss": 0.3545, + "step": 106 + }, + { + "epoch": 0.05224609375, + "grad_norm": 7.221553802490234, + "learning_rate": 9.987337399705964e-06, + "loss": 0.3616, + "step": 107 + }, + { + "epoch": 0.052734375, + "grad_norm": 2.3267176151275635, + "learning_rate": 9.986768615435655e-06, + "loss": 0.3868, + "step": 108 + }, + { + "epoch": 0.05322265625, + "grad_norm": 1.9337338209152222, + "learning_rate": 9.986187352722792e-06, + "loss": 0.3664, + "step": 109 + }, + { + "epoch": 0.0537109375, + "grad_norm": 2.2121877670288086, + "learning_rate": 9.985593613021873e-06, + "loss": 0.3731, + "step": 110 + }, + { + "epoch": 0.05419921875, + "grad_norm": 1.9584633111953735, + "learning_rate": 9.98498739781862e-06, + "loss": 0.3805, + "step": 111 + }, + { + "epoch": 0.0546875, + "grad_norm": 2.3601884841918945, + "learning_rate": 9.984368708629972e-06, + "loss": 0.3328, + "step": 112 + }, + { + "epoch": 0.05517578125, + "grad_norm": 2.705298662185669, + "learning_rate": 9.98373754700408e-06, + "loss": 0.3573, + "step": 113 + }, + { + "epoch": 0.0556640625, + "grad_norm": 4.535929203033447, + "learning_rate": 9.98309391452031e-06, + "loss": 0.3853, + "step": 114 + }, + { + "epoch": 0.05615234375, + "grad_norm": 2.4388949871063232, + "learning_rate": 9.982437812789224e-06, + "loss": 0.3389, + "step": 115 + }, + { + "epoch": 0.056640625, + "grad_norm": 3.7873549461364746, + "learning_rate": 9.981769243452595e-06, + "loss": 0.3745, + "step": 116 + }, + { + "epoch": 0.05712890625, + "grad_norm": 2.1249921321868896, + "learning_rate": 9.981088208183392e-06, + "loss": 0.3854, + "step": 117 + }, + { + "epoch": 0.0576171875, + "grad_norm": 3.2426087856292725, + "learning_rate": 9.980394708685777e-06, + "loss": 0.3743, + "step": 118 + }, + { + "epoch": 0.05810546875, + "grad_norm": 2.349886178970337, + "learning_rate": 9.979688746695099e-06, + "loss": 0.3477, + "step": 119 + }, + { + "epoch": 0.05859375, + "grad_norm": 2.6616315841674805, + "learning_rate": 9.978970323977895e-06, + "loss": 0.3497, + "step": 120 + }, + { + "epoch": 0.05908203125, + "grad_norm": 2.284364938735962, + "learning_rate": 9.978239442331881e-06, + "loss": 0.3987, + "step": 121 + }, + { + "epoch": 0.0595703125, + "grad_norm": 2.347794532775879, + "learning_rate": 9.977496103585949e-06, + "loss": 0.3375, + "step": 122 + }, + { + "epoch": 0.06005859375, + "grad_norm": 2.8935320377349854, + "learning_rate": 9.976740309600166e-06, + "loss": 0.3943, + "step": 123 + }, + { + "epoch": 0.060546875, + "grad_norm": 2.3763160705566406, + "learning_rate": 9.97597206226576e-06, + "loss": 0.3703, + "step": 124 + }, + { + "epoch": 0.06103515625, + "grad_norm": 2.1485118865966797, + "learning_rate": 9.975191363505127e-06, + "loss": 0.3604, + "step": 125 + }, + { + "epoch": 0.0615234375, + "grad_norm": 4.019608020782471, + "learning_rate": 9.974398215271814e-06, + "loss": 0.3345, + "step": 126 + }, + { + "epoch": 0.06201171875, + "grad_norm": 4.793520450592041, + "learning_rate": 9.973592619550528e-06, + "loss": 0.3583, + "step": 127 + }, + { + "epoch": 0.0625, + "grad_norm": 2.3743088245391846, + "learning_rate": 9.972774578357118e-06, + "loss": 0.3612, + "step": 128 + }, + { + "epoch": 0.06298828125, + "grad_norm": 2.3221397399902344, + "learning_rate": 9.971944093738575e-06, + "loss": 0.3759, + "step": 129 + }, + { + "epoch": 0.0634765625, + "grad_norm": 2.639760971069336, + "learning_rate": 9.971101167773032e-06, + "loss": 0.3749, + "step": 130 + }, + { + "epoch": 0.06396484375, + "grad_norm": 2.3176326751708984, + "learning_rate": 9.97024580256975e-06, + "loss": 0.3324, + "step": 131 + }, + { + "epoch": 0.064453125, + "grad_norm": 2.5662341117858887, + "learning_rate": 9.969378000269117e-06, + "loss": 0.3956, + "step": 132 + }, + { + "epoch": 0.06494140625, + "grad_norm": 3.271336793899536, + "learning_rate": 9.968497763042644e-06, + "loss": 0.3702, + "step": 133 + }, + { + "epoch": 0.0654296875, + "grad_norm": 2.0121848583221436, + "learning_rate": 9.96760509309296e-06, + "loss": 0.3644, + "step": 134 + }, + { + "epoch": 0.06591796875, + "grad_norm": 2.1467254161834717, + "learning_rate": 9.9666999926538e-06, + "loss": 0.3444, + "step": 135 + }, + { + "epoch": 0.06640625, + "grad_norm": 2.985793113708496, + "learning_rate": 9.96578246399001e-06, + "loss": 0.4015, + "step": 136 + }, + { + "epoch": 0.06689453125, + "grad_norm": 2.158658504486084, + "learning_rate": 9.964852509397527e-06, + "loss": 0.3809, + "step": 137 + }, + { + "epoch": 0.0673828125, + "grad_norm": 4.1197919845581055, + "learning_rate": 9.963910131203386e-06, + "loss": 0.3874, + "step": 138 + }, + { + "epoch": 0.06787109375, + "grad_norm": 2.2979846000671387, + "learning_rate": 9.962955331765712e-06, + "loss": 0.342, + "step": 139 + }, + { + "epoch": 0.068359375, + "grad_norm": 2.2568418979644775, + "learning_rate": 9.961988113473708e-06, + "loss": 0.3223, + "step": 140 + }, + { + "epoch": 0.06884765625, + "grad_norm": 2.358520030975342, + "learning_rate": 9.961008478747655e-06, + "loss": 0.374, + "step": 141 + }, + { + "epoch": 0.0693359375, + "grad_norm": 2.6409096717834473, + "learning_rate": 9.960016430038903e-06, + "loss": 0.3705, + "step": 142 + }, + { + "epoch": 0.06982421875, + "grad_norm": 2.167280673980713, + "learning_rate": 9.959011969829867e-06, + "loss": 0.3302, + "step": 143 + }, + { + "epoch": 0.0703125, + "grad_norm": 2.3867969512939453, + "learning_rate": 9.957995100634016e-06, + "loss": 0.3251, + "step": 144 + }, + { + "epoch": 0.07080078125, + "grad_norm": 2.305117130279541, + "learning_rate": 9.956965824995873e-06, + "loss": 0.3593, + "step": 145 + }, + { + "epoch": 0.0712890625, + "grad_norm": 2.1817824840545654, + "learning_rate": 9.955924145491005e-06, + "loss": 0.3371, + "step": 146 + }, + { + "epoch": 0.07177734375, + "grad_norm": 4.12109375, + "learning_rate": 9.954870064726017e-06, + "loss": 0.3771, + "step": 147 + }, + { + "epoch": 0.072265625, + "grad_norm": 3.0079329013824463, + "learning_rate": 9.953803585338548e-06, + "loss": 0.3636, + "step": 148 + }, + { + "epoch": 0.07275390625, + "grad_norm": 2.473532199859619, + "learning_rate": 9.95272470999726e-06, + "loss": 0.3692, + "step": 149 + }, + { + "epoch": 0.0732421875, + "grad_norm": 3.1922385692596436, + "learning_rate": 9.95163344140183e-06, + "loss": 0.3773, + "step": 150 + }, + { + "epoch": 0.07373046875, + "grad_norm": 6.991460800170898, + "learning_rate": 9.950529782282955e-06, + "loss": 0.2813, + "step": 151 + }, + { + "epoch": 0.07421875, + "grad_norm": 2.9967305660247803, + "learning_rate": 9.949413735402332e-06, + "loss": 0.3565, + "step": 152 + }, + { + "epoch": 0.07470703125, + "grad_norm": 1.8642289638519287, + "learning_rate": 9.948285303552654e-06, + "loss": 0.3715, + "step": 153 + }, + { + "epoch": 0.0751953125, + "grad_norm": 2.169416904449463, + "learning_rate": 9.947144489557612e-06, + "loss": 0.3507, + "step": 154 + }, + { + "epoch": 0.07568359375, + "grad_norm": 2.5897326469421387, + "learning_rate": 9.945991296271874e-06, + "loss": 0.3508, + "step": 155 + }, + { + "epoch": 0.076171875, + "grad_norm": 1.8967130184173584, + "learning_rate": 9.944825726581085e-06, + "loss": 0.318, + "step": 156 + }, + { + "epoch": 0.07666015625, + "grad_norm": 1.998544454574585, + "learning_rate": 9.943647783401867e-06, + "loss": 0.3757, + "step": 157 + }, + { + "epoch": 0.0771484375, + "grad_norm": 2.5188403129577637, + "learning_rate": 9.942457469681794e-06, + "loss": 0.3551, + "step": 158 + }, + { + "epoch": 0.07763671875, + "grad_norm": 2.2102835178375244, + "learning_rate": 9.941254788399406e-06, + "loss": 0.3499, + "step": 159 + }, + { + "epoch": 0.078125, + "grad_norm": 3.3190438747406006, + "learning_rate": 9.940039742564182e-06, + "loss": 0.3586, + "step": 160 + }, + { + "epoch": 0.07861328125, + "grad_norm": 6.675033092498779, + "learning_rate": 9.938812335216543e-06, + "loss": 0.3892, + "step": 161 + }, + { + "epoch": 0.0791015625, + "grad_norm": 3.091517925262451, + "learning_rate": 9.937572569427844e-06, + "loss": 0.3434, + "step": 162 + }, + { + "epoch": 0.07958984375, + "grad_norm": 2.7739408016204834, + "learning_rate": 9.936320448300364e-06, + "loss": 0.3366, + "step": 163 + }, + { + "epoch": 0.080078125, + "grad_norm": 4.218409538269043, + "learning_rate": 9.935055974967299e-06, + "loss": 0.3129, + "step": 164 + }, + { + "epoch": 0.08056640625, + "grad_norm": 2.2632052898406982, + "learning_rate": 9.933779152592752e-06, + "loss": 0.3507, + "step": 165 + }, + { + "epoch": 0.0810546875, + "grad_norm": 2.3607664108276367, + "learning_rate": 9.93248998437173e-06, + "loss": 0.3598, + "step": 166 + }, + { + "epoch": 0.08154296875, + "grad_norm": 2.2539124488830566, + "learning_rate": 9.931188473530132e-06, + "loss": 0.404, + "step": 167 + }, + { + "epoch": 0.08203125, + "grad_norm": 2.049994945526123, + "learning_rate": 9.929874623324741e-06, + "loss": 0.3534, + "step": 168 + }, + { + "epoch": 0.08251953125, + "grad_norm": 4.720448017120361, + "learning_rate": 9.92854843704322e-06, + "loss": 0.3492, + "step": 169 + }, + { + "epoch": 0.0830078125, + "grad_norm": 2.1875171661376953, + "learning_rate": 9.927209918004095e-06, + "loss": 0.3765, + "step": 170 + }, + { + "epoch": 0.08349609375, + "grad_norm": 6.087578773498535, + "learning_rate": 9.92585906955676e-06, + "loss": 0.3519, + "step": 171 + }, + { + "epoch": 0.083984375, + "grad_norm": 6.033719539642334, + "learning_rate": 9.924495895081455e-06, + "loss": 0.3493, + "step": 172 + }, + { + "epoch": 0.08447265625, + "grad_norm": 4.239842414855957, + "learning_rate": 9.923120397989265e-06, + "loss": 0.3566, + "step": 173 + }, + { + "epoch": 0.0849609375, + "grad_norm": 3.4344899654388428, + "learning_rate": 9.92173258172211e-06, + "loss": 0.3291, + "step": 174 + }, + { + "epoch": 0.08544921875, + "grad_norm": 2.5044116973876953, + "learning_rate": 9.920332449752741e-06, + "loss": 0.368, + "step": 175 + }, + { + "epoch": 0.0859375, + "grad_norm": 2.5513086318969727, + "learning_rate": 9.91892000558472e-06, + "loss": 0.3715, + "step": 176 + }, + { + "epoch": 0.08642578125, + "grad_norm": 3.1087024211883545, + "learning_rate": 9.917495252752418e-06, + "loss": 0.3421, + "step": 177 + }, + { + "epoch": 0.0869140625, + "grad_norm": 4.5129194259643555, + "learning_rate": 9.916058194821013e-06, + "loss": 0.3348, + "step": 178 + }, + { + "epoch": 0.08740234375, + "grad_norm": 2.54546856880188, + "learning_rate": 9.914608835386468e-06, + "loss": 0.3741, + "step": 179 + }, + { + "epoch": 0.087890625, + "grad_norm": 3.379059314727783, + "learning_rate": 9.913147178075531e-06, + "loss": 0.3633, + "step": 180 + }, + { + "epoch": 0.08837890625, + "grad_norm": 2.6582908630371094, + "learning_rate": 9.911673226545721e-06, + "loss": 0.3626, + "step": 181 + }, + { + "epoch": 0.0888671875, + "grad_norm": 2.116603374481201, + "learning_rate": 9.910186984485321e-06, + "loss": 0.3627, + "step": 182 + }, + { + "epoch": 0.08935546875, + "grad_norm": 3.2947633266448975, + "learning_rate": 9.908688455613374e-06, + "loss": 0.3264, + "step": 183 + }, + { + "epoch": 0.08984375, + "grad_norm": 2.313702344894409, + "learning_rate": 9.90717764367966e-06, + "loss": 0.3285, + "step": 184 + }, + { + "epoch": 0.09033203125, + "grad_norm": 2.2801687717437744, + "learning_rate": 9.9056545524647e-06, + "loss": 0.3573, + "step": 185 + }, + { + "epoch": 0.0908203125, + "grad_norm": 3.657966375350952, + "learning_rate": 9.904119185779744e-06, + "loss": 0.3711, + "step": 186 + }, + { + "epoch": 0.09130859375, + "grad_norm": 22.30857276916504, + "learning_rate": 9.902571547466753e-06, + "loss": 0.3995, + "step": 187 + }, + { + "epoch": 0.091796875, + "grad_norm": 2.184039831161499, + "learning_rate": 9.901011641398398e-06, + "loss": 0.3654, + "step": 188 + }, + { + "epoch": 0.09228515625, + "grad_norm": 4.786393165588379, + "learning_rate": 9.89943947147805e-06, + "loss": 0.3859, + "step": 189 + }, + { + "epoch": 0.0927734375, + "grad_norm": 2.666750431060791, + "learning_rate": 9.897855041639764e-06, + "loss": 0.3888, + "step": 190 + }, + { + "epoch": 0.09326171875, + "grad_norm": 2.0390570163726807, + "learning_rate": 9.896258355848277e-06, + "loss": 0.3488, + "step": 191 + }, + { + "epoch": 0.09375, + "grad_norm": 2.618748188018799, + "learning_rate": 9.894649418098992e-06, + "loss": 0.3513, + "step": 192 + }, + { + "epoch": 0.09423828125, + "grad_norm": 2.525346040725708, + "learning_rate": 9.89302823241797e-06, + "loss": 0.3689, + "step": 193 + }, + { + "epoch": 0.0947265625, + "grad_norm": 2.0813663005828857, + "learning_rate": 9.89139480286192e-06, + "loss": 0.3718, + "step": 194 + }, + { + "epoch": 0.09521484375, + "grad_norm": 3.025359630584717, + "learning_rate": 9.88974913351819e-06, + "loss": 0.3786, + "step": 195 + }, + { + "epoch": 0.095703125, + "grad_norm": 2.8500590324401855, + "learning_rate": 9.888091228504757e-06, + "loss": 0.3481, + "step": 196 + }, + { + "epoch": 0.09619140625, + "grad_norm": 2.450500249862671, + "learning_rate": 9.88642109197021e-06, + "loss": 0.383, + "step": 197 + }, + { + "epoch": 0.0966796875, + "grad_norm": 1.9162877798080444, + "learning_rate": 9.884738728093754e-06, + "loss": 0.3698, + "step": 198 + }, + { + "epoch": 0.09716796875, + "grad_norm": 14.184158325195312, + "learning_rate": 9.883044141085183e-06, + "loss": 0.3327, + "step": 199 + }, + { + "epoch": 0.09765625, + "grad_norm": 3.0886130332946777, + "learning_rate": 9.881337335184879e-06, + "loss": 0.3767, + "step": 200 + }, + { + "epoch": 0.09814453125, + "grad_norm": 2.5864577293395996, + "learning_rate": 9.879618314663799e-06, + "loss": 0.3498, + "step": 201 + }, + { + "epoch": 0.0986328125, + "grad_norm": 3.3661086559295654, + "learning_rate": 9.87788708382347e-06, + "loss": 0.3487, + "step": 202 + }, + { + "epoch": 0.09912109375, + "grad_norm": 2.543836832046509, + "learning_rate": 9.876143646995964e-06, + "loss": 0.3611, + "step": 203 + }, + { + "epoch": 0.099609375, + "grad_norm": 2.209348201751709, + "learning_rate": 9.874388008543903e-06, + "loss": 0.3303, + "step": 204 + }, + { + "epoch": 0.10009765625, + "grad_norm": 8.464391708374023, + "learning_rate": 9.87262017286044e-06, + "loss": 0.3915, + "step": 205 + }, + { + "epoch": 0.1005859375, + "grad_norm": 2.339383125305176, + "learning_rate": 9.870840144369247e-06, + "loss": 0.3386, + "step": 206 + }, + { + "epoch": 0.10107421875, + "grad_norm": 4.952784538269043, + "learning_rate": 9.869047927524508e-06, + "loss": 0.3189, + "step": 207 + }, + { + "epoch": 0.1015625, + "grad_norm": 2.147639036178589, + "learning_rate": 9.867243526810909e-06, + "loss": 0.325, + "step": 208 + }, + { + "epoch": 0.10205078125, + "grad_norm": 2.364194393157959, + "learning_rate": 9.865426946743614e-06, + "loss": 0.3728, + "step": 209 + }, + { + "epoch": 0.1025390625, + "grad_norm": 2.0875487327575684, + "learning_rate": 9.863598191868275e-06, + "loss": 0.3493, + "step": 210 + }, + { + "epoch": 0.10302734375, + "grad_norm": 3.100674629211426, + "learning_rate": 9.861757266761002e-06, + "loss": 0.3503, + "step": 211 + }, + { + "epoch": 0.103515625, + "grad_norm": 3.1530754566192627, + "learning_rate": 9.859904176028364e-06, + "loss": 0.3635, + "step": 212 + }, + { + "epoch": 0.10400390625, + "grad_norm": 2.373269557952881, + "learning_rate": 9.858038924307363e-06, + "loss": 0.316, + "step": 213 + }, + { + "epoch": 0.1044921875, + "grad_norm": 2.517578125, + "learning_rate": 9.856161516265445e-06, + "loss": 0.3729, + "step": 214 + }, + { + "epoch": 0.10498046875, + "grad_norm": 3.9366421699523926, + "learning_rate": 9.854271956600463e-06, + "loss": 0.3119, + "step": 215 + }, + { + "epoch": 0.10546875, + "grad_norm": 3.0418357849121094, + "learning_rate": 9.852370250040682e-06, + "loss": 0.3799, + "step": 216 + }, + { + "epoch": 0.10595703125, + "grad_norm": 2.486046314239502, + "learning_rate": 9.85045640134476e-06, + "loss": 0.3761, + "step": 217 + }, + { + "epoch": 0.1064453125, + "grad_norm": 3.757772207260132, + "learning_rate": 9.848530415301748e-06, + "loss": 0.3281, + "step": 218 + }, + { + "epoch": 0.10693359375, + "grad_norm": 5.470198631286621, + "learning_rate": 9.846592296731052e-06, + "loss": 0.3626, + "step": 219 + }, + { + "epoch": 0.107421875, + "grad_norm": 2.6514899730682373, + "learning_rate": 9.84464205048245e-06, + "loss": 0.3312, + "step": 220 + }, + { + "epoch": 0.10791015625, + "grad_norm": 2.359720230102539, + "learning_rate": 9.842679681436062e-06, + "loss": 0.3332, + "step": 221 + }, + { + "epoch": 0.1083984375, + "grad_norm": 2.7306034564971924, + "learning_rate": 9.840705194502349e-06, + "loss": 0.3623, + "step": 222 + }, + { + "epoch": 0.10888671875, + "grad_norm": 2.2408559322357178, + "learning_rate": 9.838718594622083e-06, + "loss": 0.3579, + "step": 223 + }, + { + "epoch": 0.109375, + "grad_norm": 1.9728875160217285, + "learning_rate": 9.836719886766357e-06, + "loss": 0.3411, + "step": 224 + }, + { + "epoch": 0.10986328125, + "grad_norm": 2.826547861099243, + "learning_rate": 9.83470907593656e-06, + "loss": 0.2803, + "step": 225 + }, + { + "epoch": 0.1103515625, + "grad_norm": 2.5550942420959473, + "learning_rate": 9.832686167164361e-06, + "loss": 0.3537, + "step": 226 + }, + { + "epoch": 0.11083984375, + "grad_norm": 2.6079165935516357, + "learning_rate": 9.830651165511707e-06, + "loss": 0.3527, + "step": 227 + }, + { + "epoch": 0.111328125, + "grad_norm": 2.2585561275482178, + "learning_rate": 9.828604076070805e-06, + "loss": 0.3741, + "step": 228 + }, + { + "epoch": 0.11181640625, + "grad_norm": 2.335930585861206, + "learning_rate": 9.826544903964105e-06, + "loss": 0.34, + "step": 229 + }, + { + "epoch": 0.1123046875, + "grad_norm": 2.3235063552856445, + "learning_rate": 9.824473654344297e-06, + "loss": 0.3691, + "step": 230 + }, + { + "epoch": 0.11279296875, + "grad_norm": 3.584376811981201, + "learning_rate": 9.82239033239429e-06, + "loss": 0.3548, + "step": 231 + }, + { + "epoch": 0.11328125, + "grad_norm": 3.483834743499756, + "learning_rate": 9.820294943327202e-06, + "loss": 0.3905, + "step": 232 + }, + { + "epoch": 0.11376953125, + "grad_norm": 2.4160964488983154, + "learning_rate": 9.818187492386346e-06, + "loss": 0.3723, + "step": 233 + }, + { + "epoch": 0.1142578125, + "grad_norm": 2.206505298614502, + "learning_rate": 9.816067984845218e-06, + "loss": 0.3572, + "step": 234 + }, + { + "epoch": 0.11474609375, + "grad_norm": 2.8877620697021484, + "learning_rate": 9.813936426007487e-06, + "loss": 0.3486, + "step": 235 + }, + { + "epoch": 0.115234375, + "grad_norm": 2.2150516510009766, + "learning_rate": 9.81179282120697e-06, + "loss": 0.3431, + "step": 236 + }, + { + "epoch": 0.11572265625, + "grad_norm": 4.500147819519043, + "learning_rate": 9.809637175807634e-06, + "loss": 0.3465, + "step": 237 + }, + { + "epoch": 0.1162109375, + "grad_norm": 2.428119659423828, + "learning_rate": 9.80746949520357e-06, + "loss": 0.3193, + "step": 238 + }, + { + "epoch": 0.11669921875, + "grad_norm": 4.387357711791992, + "learning_rate": 9.805289784818991e-06, + "loss": 0.3789, + "step": 239 + }, + { + "epoch": 0.1171875, + "grad_norm": 2.6022865772247314, + "learning_rate": 9.803098050108206e-06, + "loss": 0.3744, + "step": 240 + }, + { + "epoch": 0.11767578125, + "grad_norm": 2.3189945220947266, + "learning_rate": 9.800894296555618e-06, + "loss": 0.3542, + "step": 241 + }, + { + "epoch": 0.1181640625, + "grad_norm": 2.428673505783081, + "learning_rate": 9.798678529675702e-06, + "loss": 0.354, + "step": 242 + }, + { + "epoch": 0.11865234375, + "grad_norm": 2.112927198410034, + "learning_rate": 9.796450755012992e-06, + "loss": 0.3541, + "step": 243 + }, + { + "epoch": 0.119140625, + "grad_norm": 3.9023051261901855, + "learning_rate": 9.794210978142073e-06, + "loss": 0.3902, + "step": 244 + }, + { + "epoch": 0.11962890625, + "grad_norm": 2.621843099594116, + "learning_rate": 9.79195920466756e-06, + "loss": 0.35, + "step": 245 + }, + { + "epoch": 0.1201171875, + "grad_norm": 2.8156723976135254, + "learning_rate": 9.789695440224094e-06, + "loss": 0.3562, + "step": 246 + }, + { + "epoch": 0.12060546875, + "grad_norm": 4.237185001373291, + "learning_rate": 9.78741969047631e-06, + "loss": 0.3596, + "step": 247 + }, + { + "epoch": 0.12109375, + "grad_norm": 2.050010919570923, + "learning_rate": 9.785131961118843e-06, + "loss": 0.3562, + "step": 248 + }, + { + "epoch": 0.12158203125, + "grad_norm": 2.1943752765655518, + "learning_rate": 9.782832257876302e-06, + "loss": 0.3147, + "step": 249 + }, + { + "epoch": 0.1220703125, + "grad_norm": 3.3409993648529053, + "learning_rate": 9.780520586503258e-06, + "loss": 0.4023, + "step": 250 + }, + { + "epoch": 0.12255859375, + "grad_norm": 2.073791027069092, + "learning_rate": 9.77819695278423e-06, + "loss": 0.3323, + "step": 251 + }, + { + "epoch": 0.123046875, + "grad_norm": 2.773463010787964, + "learning_rate": 9.77586136253367e-06, + "loss": 0.3461, + "step": 252 + }, + { + "epoch": 0.12353515625, + "grad_norm": 2.2921154499053955, + "learning_rate": 9.773513821595951e-06, + "loss": 0.3344, + "step": 253 + }, + { + "epoch": 0.1240234375, + "grad_norm": 2.6613571643829346, + "learning_rate": 9.771154335845345e-06, + "loss": 0.348, + "step": 254 + }, + { + "epoch": 0.12451171875, + "grad_norm": 8.336869239807129, + "learning_rate": 9.768782911186023e-06, + "loss": 0.3726, + "step": 255 + }, + { + "epoch": 0.125, + "grad_norm": 2.428882360458374, + "learning_rate": 9.766399553552022e-06, + "loss": 0.3765, + "step": 256 + }, + { + "epoch": 0.12548828125, + "grad_norm": 1.8940154314041138, + "learning_rate": 9.764004268907244e-06, + "loss": 0.3407, + "step": 257 + }, + { + "epoch": 0.1259765625, + "grad_norm": 2.5715792179107666, + "learning_rate": 9.761597063245434e-06, + "loss": 0.3679, + "step": 258 + }, + { + "epoch": 0.12646484375, + "grad_norm": 2.1206367015838623, + "learning_rate": 9.759177942590166e-06, + "loss": 0.3409, + "step": 259 + }, + { + "epoch": 0.126953125, + "grad_norm": 2.5495412349700928, + "learning_rate": 9.756746912994832e-06, + "loss": 0.3499, + "step": 260 + }, + { + "epoch": 0.12744140625, + "grad_norm": 2.9602348804473877, + "learning_rate": 9.754303980542623e-06, + "loss": 0.3706, + "step": 261 + }, + { + "epoch": 0.1279296875, + "grad_norm": 2.7507028579711914, + "learning_rate": 9.751849151346513e-06, + "loss": 0.3767, + "step": 262 + }, + { + "epoch": 0.12841796875, + "grad_norm": 2.539034843444824, + "learning_rate": 9.749382431549247e-06, + "loss": 0.3406, + "step": 263 + }, + { + "epoch": 0.12890625, + "grad_norm": 2.833279848098755, + "learning_rate": 9.746903827323324e-06, + "loss": 0.3522, + "step": 264 + }, + { + "epoch": 0.12939453125, + "grad_norm": 2.5430469512939453, + "learning_rate": 9.74441334487098e-06, + "loss": 0.3406, + "step": 265 + }, + { + "epoch": 0.1298828125, + "grad_norm": 2.858895778656006, + "learning_rate": 9.741910990424173e-06, + "loss": 0.3396, + "step": 266 + }, + { + "epoch": 0.13037109375, + "grad_norm": 3.113898515701294, + "learning_rate": 9.739396770244575e-06, + "loss": 0.3779, + "step": 267 + }, + { + "epoch": 0.130859375, + "grad_norm": 2.812479257583618, + "learning_rate": 9.736870690623541e-06, + "loss": 0.3581, + "step": 268 + }, + { + "epoch": 0.13134765625, + "grad_norm": 4.137664318084717, + "learning_rate": 9.734332757882108e-06, + "loss": 0.3731, + "step": 269 + }, + { + "epoch": 0.1318359375, + "grad_norm": 2.346695899963379, + "learning_rate": 9.73178297837097e-06, + "loss": 0.3499, + "step": 270 + }, + { + "epoch": 0.13232421875, + "grad_norm": 3.5724024772644043, + "learning_rate": 9.729221358470468e-06, + "loss": 0.346, + "step": 271 + }, + { + "epoch": 0.1328125, + "grad_norm": 2.5001883506774902, + "learning_rate": 9.726647904590572e-06, + "loss": 0.3371, + "step": 272 + }, + { + "epoch": 0.13330078125, + "grad_norm": 1.8020128011703491, + "learning_rate": 9.724062623170855e-06, + "loss": 0.3632, + "step": 273 + }, + { + "epoch": 0.1337890625, + "grad_norm": 2.486666679382324, + "learning_rate": 9.721465520680501e-06, + "loss": 0.3505, + "step": 274 + }, + { + "epoch": 0.13427734375, + "grad_norm": 2.269751787185669, + "learning_rate": 9.718856603618263e-06, + "loss": 0.3718, + "step": 275 + }, + { + "epoch": 0.134765625, + "grad_norm": 2.7286322116851807, + "learning_rate": 9.716235878512462e-06, + "loss": 0.3462, + "step": 276 + }, + { + "epoch": 0.13525390625, + "grad_norm": 2.535698175430298, + "learning_rate": 9.713603351920964e-06, + "loss": 0.3451, + "step": 277 + }, + { + "epoch": 0.1357421875, + "grad_norm": 1.9008198976516724, + "learning_rate": 9.710959030431167e-06, + "loss": 0.3924, + "step": 278 + }, + { + "epoch": 0.13623046875, + "grad_norm": 2.339395046234131, + "learning_rate": 9.708302920659987e-06, + "loss": 0.3331, + "step": 279 + }, + { + "epoch": 0.13671875, + "grad_norm": 2.376002550125122, + "learning_rate": 9.705635029253833e-06, + "loss": 0.3815, + "step": 280 + }, + { + "epoch": 0.13720703125, + "grad_norm": 2.245027780532837, + "learning_rate": 9.702955362888595e-06, + "loss": 0.3548, + "step": 281 + }, + { + "epoch": 0.1376953125, + "grad_norm": 2.206878900527954, + "learning_rate": 9.700263928269636e-06, + "loss": 0.3204, + "step": 282 + }, + { + "epoch": 0.13818359375, + "grad_norm": 2.0215516090393066, + "learning_rate": 9.697560732131753e-06, + "loss": 0.3387, + "step": 283 + }, + { + "epoch": 0.138671875, + "grad_norm": 2.9142580032348633, + "learning_rate": 9.694845781239188e-06, + "loss": 0.3336, + "step": 284 + }, + { + "epoch": 0.13916015625, + "grad_norm": 2.0387048721313477, + "learning_rate": 9.692119082385588e-06, + "loss": 0.3342, + "step": 285 + }, + { + "epoch": 0.1396484375, + "grad_norm": 2.3236615657806396, + "learning_rate": 9.689380642393998e-06, + "loss": 0.3773, + "step": 286 + }, + { + "epoch": 0.14013671875, + "grad_norm": 3.4590189456939697, + "learning_rate": 9.686630468116846e-06, + "loss": 0.3358, + "step": 287 + }, + { + "epoch": 0.140625, + "grad_norm": 1.6319761276245117, + "learning_rate": 9.683868566435922e-06, + "loss": 0.2913, + "step": 288 + }, + { + "epoch": 0.14111328125, + "grad_norm": 6.874841690063477, + "learning_rate": 9.681094944262361e-06, + "loss": 0.3259, + "step": 289 + }, + { + "epoch": 0.1416015625, + "grad_norm": 4.962515830993652, + "learning_rate": 9.678309608536626e-06, + "loss": 0.3455, + "step": 290 + }, + { + "epoch": 0.14208984375, + "grad_norm": 3.334455966949463, + "learning_rate": 9.675512566228493e-06, + "loss": 0.3561, + "step": 291 + }, + { + "epoch": 0.142578125, + "grad_norm": 3.891530990600586, + "learning_rate": 9.672703824337026e-06, + "loss": 0.3627, + "step": 292 + }, + { + "epoch": 0.14306640625, + "grad_norm": 2.2160141468048096, + "learning_rate": 9.669883389890572e-06, + "loss": 0.312, + "step": 293 + }, + { + "epoch": 0.1435546875, + "grad_norm": 3.7108445167541504, + "learning_rate": 9.667051269946734e-06, + "loss": 0.338, + "step": 294 + }, + { + "epoch": 0.14404296875, + "grad_norm": 2.138221025466919, + "learning_rate": 9.664207471592353e-06, + "loss": 0.3767, + "step": 295 + }, + { + "epoch": 0.14453125, + "grad_norm": 11.57601547241211, + "learning_rate": 9.661352001943494e-06, + "loss": 0.3481, + "step": 296 + }, + { + "epoch": 0.14501953125, + "grad_norm": 2.1737406253814697, + "learning_rate": 9.658484868145428e-06, + "loss": 0.3319, + "step": 297 + }, + { + "epoch": 0.1455078125, + "grad_norm": 4.048387050628662, + "learning_rate": 9.655606077372619e-06, + "loss": 0.3061, + "step": 298 + }, + { + "epoch": 0.14599609375, + "grad_norm": 2.4968268871307373, + "learning_rate": 9.652715636828687e-06, + "loss": 0.333, + "step": 299 + }, + { + "epoch": 0.146484375, + "grad_norm": 2.2704763412475586, + "learning_rate": 9.649813553746416e-06, + "loss": 0.3307, + "step": 300 + }, + { + "epoch": 0.14697265625, + "grad_norm": 1.9303852319717407, + "learning_rate": 9.646899835387718e-06, + "loss": 0.3342, + "step": 301 + }, + { + "epoch": 0.1474609375, + "grad_norm": 2.8917553424835205, + "learning_rate": 9.64397448904362e-06, + "loss": 0.3595, + "step": 302 + }, + { + "epoch": 0.14794921875, + "grad_norm": 2.193105697631836, + "learning_rate": 9.641037522034246e-06, + "loss": 0.3675, + "step": 303 + }, + { + "epoch": 0.1484375, + "grad_norm": 1.9201539754867554, + "learning_rate": 9.638088941708799e-06, + "loss": 0.353, + "step": 304 + }, + { + "epoch": 0.14892578125, + "grad_norm": 2.513864517211914, + "learning_rate": 9.635128755445542e-06, + "loss": 0.3669, + "step": 305 + }, + { + "epoch": 0.1494140625, + "grad_norm": 2.397608518600464, + "learning_rate": 9.63215697065178e-06, + "loss": 0.3439, + "step": 306 + }, + { + "epoch": 0.14990234375, + "grad_norm": 2.335594654083252, + "learning_rate": 9.62917359476384e-06, + "loss": 0.3558, + "step": 307 + }, + { + "epoch": 0.150390625, + "grad_norm": 2.5134353637695312, + "learning_rate": 9.626178635247054e-06, + "loss": 0.3923, + "step": 308 + }, + { + "epoch": 0.15087890625, + "grad_norm": 2.9013524055480957, + "learning_rate": 9.623172099595743e-06, + "loss": 0.3748, + "step": 309 + }, + { + "epoch": 0.1513671875, + "grad_norm": 3.2646868228912354, + "learning_rate": 9.620153995333188e-06, + "loss": 0.3268, + "step": 310 + }, + { + "epoch": 0.15185546875, + "grad_norm": 2.843632459640503, + "learning_rate": 9.617124330011624e-06, + "loss": 0.3392, + "step": 311 + }, + { + "epoch": 0.15234375, + "grad_norm": 2.5182275772094727, + "learning_rate": 9.614083111212216e-06, + "loss": 0.3849, + "step": 312 + }, + { + "epoch": 0.15283203125, + "grad_norm": 2.9543368816375732, + "learning_rate": 9.611030346545035e-06, + "loss": 0.3784, + "step": 313 + }, + { + "epoch": 0.1533203125, + "grad_norm": 3.7902252674102783, + "learning_rate": 9.607966043649047e-06, + "loss": 0.3466, + "step": 314 + }, + { + "epoch": 0.15380859375, + "grad_norm": 2.4927687644958496, + "learning_rate": 9.604890210192084e-06, + "loss": 0.3638, + "step": 315 + }, + { + "epoch": 0.154296875, + "grad_norm": 4.722542762756348, + "learning_rate": 9.601802853870843e-06, + "loss": 0.3439, + "step": 316 + }, + { + "epoch": 0.15478515625, + "grad_norm": 2.0797646045684814, + "learning_rate": 9.598703982410842e-06, + "loss": 0.373, + "step": 317 + }, + { + "epoch": 0.1552734375, + "grad_norm": 2.1771399974823, + "learning_rate": 9.595593603566423e-06, + "loss": 0.3112, + "step": 318 + }, + { + "epoch": 0.15576171875, + "grad_norm": 2.621591091156006, + "learning_rate": 9.592471725120714e-06, + "loss": 0.3384, + "step": 319 + }, + { + "epoch": 0.15625, + "grad_norm": 4.34113883972168, + "learning_rate": 9.58933835488563e-06, + "loss": 0.3488, + "step": 320 + }, + { + "epoch": 0.15673828125, + "grad_norm": 3.58477783203125, + "learning_rate": 9.58619350070183e-06, + "loss": 0.3329, + "step": 321 + }, + { + "epoch": 0.1572265625, + "grad_norm": 2.657738208770752, + "learning_rate": 9.583037170438719e-06, + "loss": 0.3371, + "step": 322 + }, + { + "epoch": 0.15771484375, + "grad_norm": 2.3004322052001953, + "learning_rate": 9.579869371994412e-06, + "loss": 0.3658, + "step": 323 + }, + { + "epoch": 0.158203125, + "grad_norm": 3.4922330379486084, + "learning_rate": 9.576690113295726e-06, + "loss": 0.3713, + "step": 324 + }, + { + "epoch": 0.15869140625, + "grad_norm": 4.173436641693115, + "learning_rate": 9.573499402298152e-06, + "loss": 0.3349, + "step": 325 + }, + { + "epoch": 0.1591796875, + "grad_norm": 12.521305084228516, + "learning_rate": 9.570297246985838e-06, + "loss": 0.3411, + "step": 326 + }, + { + "epoch": 0.15966796875, + "grad_norm": 3.122694253921509, + "learning_rate": 9.567083655371572e-06, + "loss": 0.3644, + "step": 327 + }, + { + "epoch": 0.16015625, + "grad_norm": 1.6851651668548584, + "learning_rate": 9.563858635496755e-06, + "loss": 0.3567, + "step": 328 + }, + { + "epoch": 0.16064453125, + "grad_norm": 2.407923698425293, + "learning_rate": 9.56062219543139e-06, + "loss": 0.3298, + "step": 329 + }, + { + "epoch": 0.1611328125, + "grad_norm": 1.9536917209625244, + "learning_rate": 9.557374343274056e-06, + "loss": 0.352, + "step": 330 + }, + { + "epoch": 0.16162109375, + "grad_norm": 2.042382001876831, + "learning_rate": 9.55411508715188e-06, + "loss": 0.3249, + "step": 331 + }, + { + "epoch": 0.162109375, + "grad_norm": 1.9811147451400757, + "learning_rate": 9.55084443522054e-06, + "loss": 0.3341, + "step": 332 + }, + { + "epoch": 0.16259765625, + "grad_norm": 2.6401963233947754, + "learning_rate": 9.547562395664219e-06, + "loss": 0.3296, + "step": 333 + }, + { + "epoch": 0.1630859375, + "grad_norm": 2.3292157649993896, + "learning_rate": 9.544268976695596e-06, + "loss": 0.3446, + "step": 334 + }, + { + "epoch": 0.16357421875, + "grad_norm": 3.5120034217834473, + "learning_rate": 9.54096418655583e-06, + "loss": 0.3796, + "step": 335 + }, + { + "epoch": 0.1640625, + "grad_norm": 2.3993301391601562, + "learning_rate": 9.53764803351453e-06, + "loss": 0.3544, + "step": 336 + }, + { + "epoch": 0.16455078125, + "grad_norm": 2.403285264968872, + "learning_rate": 9.534320525869742e-06, + "loss": 0.3734, + "step": 337 + }, + { + "epoch": 0.1650390625, + "grad_norm": 1.878564476966858, + "learning_rate": 9.530981671947924e-06, + "loss": 0.3334, + "step": 338 + }, + { + "epoch": 0.16552734375, + "grad_norm": 3.3280200958251953, + "learning_rate": 9.527631480103919e-06, + "loss": 0.3282, + "step": 339 + }, + { + "epoch": 0.166015625, + "grad_norm": 2.304945230484009, + "learning_rate": 9.524269958720951e-06, + "loss": 0.3422, + "step": 340 + }, + { + "epoch": 0.16650390625, + "grad_norm": 2.0590991973876953, + "learning_rate": 9.520897116210588e-06, + "loss": 0.355, + "step": 341 + }, + { + "epoch": 0.1669921875, + "grad_norm": 1.660049557685852, + "learning_rate": 9.517512961012729e-06, + "loss": 0.3499, + "step": 342 + }, + { + "epoch": 0.16748046875, + "grad_norm": 1.8652247190475464, + "learning_rate": 9.514117501595582e-06, + "loss": 0.3594, + "step": 343 + }, + { + "epoch": 0.16796875, + "grad_norm": 1.7373839616775513, + "learning_rate": 9.510710746455636e-06, + "loss": 0.3447, + "step": 344 + }, + { + "epoch": 0.16845703125, + "grad_norm": 2.8204782009124756, + "learning_rate": 9.507292704117655e-06, + "loss": 0.362, + "step": 345 + }, + { + "epoch": 0.1689453125, + "grad_norm": 1.6446189880371094, + "learning_rate": 9.503863383134636e-06, + "loss": 0.3752, + "step": 346 + }, + { + "epoch": 0.16943359375, + "grad_norm": 3.4714109897613525, + "learning_rate": 9.500422792087809e-06, + "loss": 0.3358, + "step": 347 + }, + { + "epoch": 0.169921875, + "grad_norm": 2.125108003616333, + "learning_rate": 9.496970939586598e-06, + "loss": 0.3822, + "step": 348 + }, + { + "epoch": 0.17041015625, + "grad_norm": 2.7372467517852783, + "learning_rate": 9.493507834268609e-06, + "loss": 0.3513, + "step": 349 + }, + { + "epoch": 0.1708984375, + "grad_norm": 2.562140941619873, + "learning_rate": 9.490033484799608e-06, + "loss": 0.3727, + "step": 350 + }, + { + "epoch": 0.17138671875, + "grad_norm": 2.868966817855835, + "learning_rate": 9.486547899873495e-06, + "loss": 0.3309, + "step": 351 + }, + { + "epoch": 0.171875, + "grad_norm": 2.5418648719787598, + "learning_rate": 9.483051088212283e-06, + "loss": 0.3826, + "step": 352 + }, + { + "epoch": 0.17236328125, + "grad_norm": 1.7842854261398315, + "learning_rate": 9.479543058566081e-06, + "loss": 0.3404, + "step": 353 + }, + { + "epoch": 0.1728515625, + "grad_norm": 1.8991374969482422, + "learning_rate": 9.47602381971307e-06, + "loss": 0.3946, + "step": 354 + }, + { + "epoch": 0.17333984375, + "grad_norm": 1.9261831045150757, + "learning_rate": 9.472493380459474e-06, + "loss": 0.3579, + "step": 355 + }, + { + "epoch": 0.173828125, + "grad_norm": 1.6657100915908813, + "learning_rate": 9.468951749639552e-06, + "loss": 0.3405, + "step": 356 + }, + { + "epoch": 0.17431640625, + "grad_norm": 2.1538491249084473, + "learning_rate": 9.465398936115557e-06, + "loss": 0.3657, + "step": 357 + }, + { + "epoch": 0.1748046875, + "grad_norm": 1.8424322605133057, + "learning_rate": 9.461834948777738e-06, + "loss": 0.3685, + "step": 358 + }, + { + "epoch": 0.17529296875, + "grad_norm": 3.16018009185791, + "learning_rate": 9.458259796544293e-06, + "loss": 0.3225, + "step": 359 + }, + { + "epoch": 0.17578125, + "grad_norm": 1.7529760599136353, + "learning_rate": 9.454673488361363e-06, + "loss": 0.3428, + "step": 360 + }, + { + "epoch": 0.17626953125, + "grad_norm": 1.6713848114013672, + "learning_rate": 9.451076033203003e-06, + "loss": 0.3383, + "step": 361 + }, + { + "epoch": 0.1767578125, + "grad_norm": 2.688614845275879, + "learning_rate": 9.447467440071165e-06, + "loss": 0.3553, + "step": 362 + }, + { + "epoch": 0.17724609375, + "grad_norm": 2.0093319416046143, + "learning_rate": 9.443847717995666e-06, + "loss": 0.3689, + "step": 363 + }, + { + "epoch": 0.177734375, + "grad_norm": 5.026141166687012, + "learning_rate": 9.440216876034177e-06, + "loss": 0.3072, + "step": 364 + }, + { + "epoch": 0.17822265625, + "grad_norm": 2.687075138092041, + "learning_rate": 9.436574923272188e-06, + "loss": 0.3624, + "step": 365 + }, + { + "epoch": 0.1787109375, + "grad_norm": 1.9798976182937622, + "learning_rate": 9.432921868822997e-06, + "loss": 0.3355, + "step": 366 + }, + { + "epoch": 0.17919921875, + "grad_norm": 2.060910701751709, + "learning_rate": 9.42925772182768e-06, + "loss": 0.3435, + "step": 367 + }, + { + "epoch": 0.1796875, + "grad_norm": 1.7003917694091797, + "learning_rate": 9.425582491455068e-06, + "loss": 0.3659, + "step": 368 + }, + { + "epoch": 0.18017578125, + "grad_norm": 2.026036262512207, + "learning_rate": 9.421896186901729e-06, + "loss": 0.3523, + "step": 369 + }, + { + "epoch": 0.1806640625, + "grad_norm": 1.9931825399398804, + "learning_rate": 9.418198817391941e-06, + "loss": 0.3654, + "step": 370 + }, + { + "epoch": 0.18115234375, + "grad_norm": 2.7290432453155518, + "learning_rate": 9.41449039217767e-06, + "loss": 0.3599, + "step": 371 + }, + { + "epoch": 0.181640625, + "grad_norm": 1.5444127321243286, + "learning_rate": 9.410770920538545e-06, + "loss": 0.2991, + "step": 372 + }, + { + "epoch": 0.18212890625, + "grad_norm": 2.319566011428833, + "learning_rate": 9.407040411781843e-06, + "loss": 0.3724, + "step": 373 + }, + { + "epoch": 0.1826171875, + "grad_norm": 1.9856535196304321, + "learning_rate": 9.403298875242448e-06, + "loss": 0.348, + "step": 374 + }, + { + "epoch": 0.18310546875, + "grad_norm": 1.9270925521850586, + "learning_rate": 9.39954632028285e-06, + "loss": 0.3766, + "step": 375 + }, + { + "epoch": 0.18359375, + "grad_norm": 2.2769391536712646, + "learning_rate": 9.395782756293104e-06, + "loss": 0.3563, + "step": 376 + }, + { + "epoch": 0.18408203125, + "grad_norm": 2.2026526927948, + "learning_rate": 9.392008192690816e-06, + "loss": 0.3213, + "step": 377 + }, + { + "epoch": 0.1845703125, + "grad_norm": 2.3757741451263428, + "learning_rate": 9.388222638921116e-06, + "loss": 0.3595, + "step": 378 + }, + { + "epoch": 0.18505859375, + "grad_norm": 1.9485424757003784, + "learning_rate": 9.384426104456632e-06, + "loss": 0.3561, + "step": 379 + }, + { + "epoch": 0.185546875, + "grad_norm": 2.7337324619293213, + "learning_rate": 9.380618598797473e-06, + "loss": 0.38, + "step": 380 + }, + { + "epoch": 0.18603515625, + "grad_norm": 2.1130242347717285, + "learning_rate": 9.3768001314712e-06, + "loss": 0.3533, + "step": 381 + }, + { + "epoch": 0.1865234375, + "grad_norm": 1.831874966621399, + "learning_rate": 9.372970712032803e-06, + "loss": 0.332, + "step": 382 + }, + { + "epoch": 0.18701171875, + "grad_norm": 2.3811991214752197, + "learning_rate": 9.369130350064677e-06, + "loss": 0.3798, + "step": 383 + }, + { + "epoch": 0.1875, + "grad_norm": 1.8242988586425781, + "learning_rate": 9.3652790551766e-06, + "loss": 0.3634, + "step": 384 + }, + { + "epoch": 0.18798828125, + "grad_norm": 3.14345645904541, + "learning_rate": 9.361416837005705e-06, + "loss": 0.3513, + "step": 385 + }, + { + "epoch": 0.1884765625, + "grad_norm": 1.9473716020584106, + "learning_rate": 9.357543705216465e-06, + "loss": 0.3687, + "step": 386 + }, + { + "epoch": 0.18896484375, + "grad_norm": 1.982612133026123, + "learning_rate": 9.353659669500652e-06, + "loss": 0.3803, + "step": 387 + }, + { + "epoch": 0.189453125, + "grad_norm": 1.774999976158142, + "learning_rate": 9.349764739577334e-06, + "loss": 0.3331, + "step": 388 + }, + { + "epoch": 0.18994140625, + "grad_norm": 1.5273141860961914, + "learning_rate": 9.34585892519283e-06, + "loss": 0.3599, + "step": 389 + }, + { + "epoch": 0.1904296875, + "grad_norm": 1.8035123348236084, + "learning_rate": 9.3419422361207e-06, + "loss": 0.3771, + "step": 390 + }, + { + "epoch": 0.19091796875, + "grad_norm": 1.789610505104065, + "learning_rate": 9.338014682161719e-06, + "loss": 0.3236, + "step": 391 + }, + { + "epoch": 0.19140625, + "grad_norm": 1.9845644235610962, + "learning_rate": 9.334076273143843e-06, + "loss": 0.3274, + "step": 392 + }, + { + "epoch": 0.19189453125, + "grad_norm": 2.072159767150879, + "learning_rate": 9.330127018922195e-06, + "loss": 0.3416, + "step": 393 + }, + { + "epoch": 0.1923828125, + "grad_norm": 1.8441466093063354, + "learning_rate": 9.326166929379032e-06, + "loss": 0.3352, + "step": 394 + }, + { + "epoch": 0.19287109375, + "grad_norm": 2.479971170425415, + "learning_rate": 9.322196014423729e-06, + "loss": 0.3472, + "step": 395 + }, + { + "epoch": 0.193359375, + "grad_norm": 2.514597177505493, + "learning_rate": 9.318214283992747e-06, + "loss": 0.3544, + "step": 396 + }, + { + "epoch": 0.19384765625, + "grad_norm": 2.048144578933716, + "learning_rate": 9.314221748049613e-06, + "loss": 0.3869, + "step": 397 + }, + { + "epoch": 0.1943359375, + "grad_norm": 2.8453140258789062, + "learning_rate": 9.310218416584887e-06, + "loss": 0.3734, + "step": 398 + }, + { + "epoch": 0.19482421875, + "grad_norm": 1.6406381130218506, + "learning_rate": 9.306204299616148e-06, + "loss": 0.3507, + "step": 399 + }, + { + "epoch": 0.1953125, + "grad_norm": 2.275040626525879, + "learning_rate": 9.302179407187965e-06, + "loss": 0.3787, + "step": 400 + }, + { + "epoch": 0.19580078125, + "grad_norm": 1.522905945777893, + "learning_rate": 9.298143749371865e-06, + "loss": 0.341, + "step": 401 + }, + { + "epoch": 0.1962890625, + "grad_norm": 2.3068466186523438, + "learning_rate": 9.294097336266317e-06, + "loss": 0.3686, + "step": 402 + }, + { + "epoch": 0.19677734375, + "grad_norm": 2.8621833324432373, + "learning_rate": 9.290040177996703e-06, + "loss": 0.3331, + "step": 403 + }, + { + "epoch": 0.197265625, + "grad_norm": 2.339892864227295, + "learning_rate": 9.285972284715291e-06, + "loss": 0.3889, + "step": 404 + }, + { + "epoch": 0.19775390625, + "grad_norm": 1.7295536994934082, + "learning_rate": 9.281893666601214e-06, + "loss": 0.3692, + "step": 405 + }, + { + "epoch": 0.1982421875, + "grad_norm": 4.145984649658203, + "learning_rate": 9.277804333860435e-06, + "loss": 0.3387, + "step": 406 + }, + { + "epoch": 0.19873046875, + "grad_norm": 1.866166114807129, + "learning_rate": 9.273704296725741e-06, + "loss": 0.3503, + "step": 407 + }, + { + "epoch": 0.19921875, + "grad_norm": 1.8600391149520874, + "learning_rate": 9.269593565456691e-06, + "loss": 0.347, + "step": 408 + }, + { + "epoch": 0.19970703125, + "grad_norm": 1.990860104560852, + "learning_rate": 9.265472150339615e-06, + "loss": 0.3642, + "step": 409 + }, + { + "epoch": 0.2001953125, + "grad_norm": 1.4612618684768677, + "learning_rate": 9.26134006168757e-06, + "loss": 0.3624, + "step": 410 + }, + { + "epoch": 0.20068359375, + "grad_norm": 1.4518144130706787, + "learning_rate": 9.257197309840322e-06, + "loss": 0.3374, + "step": 411 + }, + { + "epoch": 0.201171875, + "grad_norm": 1.5550000667572021, + "learning_rate": 9.253043905164327e-06, + "loss": 0.3651, + "step": 412 + }, + { + "epoch": 0.20166015625, + "grad_norm": 1.9353028535842896, + "learning_rate": 9.248879858052688e-06, + "loss": 0.3111, + "step": 413 + }, + { + "epoch": 0.2021484375, + "grad_norm": 1.5865511894226074, + "learning_rate": 9.244705178925146e-06, + "loss": 0.3734, + "step": 414 + }, + { + "epoch": 0.20263671875, + "grad_norm": 1.9505976438522339, + "learning_rate": 9.24051987822804e-06, + "loss": 0.3294, + "step": 415 + }, + { + "epoch": 0.203125, + "grad_norm": 1.7402981519699097, + "learning_rate": 9.236323966434296e-06, + "loss": 0.3664, + "step": 416 + }, + { + "epoch": 0.20361328125, + "grad_norm": 2.2276546955108643, + "learning_rate": 9.232117454043383e-06, + "loss": 0.3943, + "step": 417 + }, + { + "epoch": 0.2041015625, + "grad_norm": 2.5883917808532715, + "learning_rate": 9.227900351581303e-06, + "loss": 0.3759, + "step": 418 + }, + { + "epoch": 0.20458984375, + "grad_norm": 2.116527795791626, + "learning_rate": 9.223672669600552e-06, + "loss": 0.371, + "step": 419 + }, + { + "epoch": 0.205078125, + "grad_norm": 1.890336036682129, + "learning_rate": 9.219434418680107e-06, + "loss": 0.3208, + "step": 420 + }, + { + "epoch": 0.20556640625, + "grad_norm": 2.831151247024536, + "learning_rate": 9.215185609425383e-06, + "loss": 0.3283, + "step": 421 + }, + { + "epoch": 0.2060546875, + "grad_norm": 1.890857458114624, + "learning_rate": 9.21092625246822e-06, + "loss": 0.3634, + "step": 422 + }, + { + "epoch": 0.20654296875, + "grad_norm": 1.4543401002883911, + "learning_rate": 9.206656358466851e-06, + "loss": 0.3615, + "step": 423 + }, + { + "epoch": 0.20703125, + "grad_norm": 1.9577465057373047, + "learning_rate": 9.202375938105876e-06, + "loss": 0.364, + "step": 424 + }, + { + "epoch": 0.20751953125, + "grad_norm": 1.5794016122817993, + "learning_rate": 9.198085002096237e-06, + "loss": 0.34, + "step": 425 + }, + { + "epoch": 0.2080078125, + "grad_norm": 1.8114027976989746, + "learning_rate": 9.193783561175184e-06, + "loss": 0.3413, + "step": 426 + }, + { + "epoch": 0.20849609375, + "grad_norm": 1.5112391710281372, + "learning_rate": 9.189471626106261e-06, + "loss": 0.3558, + "step": 427 + }, + { + "epoch": 0.208984375, + "grad_norm": 1.5750012397766113, + "learning_rate": 9.185149207679263e-06, + "loss": 0.3211, + "step": 428 + }, + { + "epoch": 0.20947265625, + "grad_norm": 1.5355925559997559, + "learning_rate": 9.180816316710226e-06, + "loss": 0.316, + "step": 429 + }, + { + "epoch": 0.2099609375, + "grad_norm": 1.7540535926818848, + "learning_rate": 9.176472964041385e-06, + "loss": 0.3446, + "step": 430 + }, + { + "epoch": 0.21044921875, + "grad_norm": 1.94683837890625, + "learning_rate": 9.172119160541158e-06, + "loss": 0.3894, + "step": 431 + }, + { + "epoch": 0.2109375, + "grad_norm": 2.1505014896392822, + "learning_rate": 9.167754917104112e-06, + "loss": 0.3516, + "step": 432 + }, + { + "epoch": 0.21142578125, + "grad_norm": 3.6382253170013428, + "learning_rate": 9.163380244650938e-06, + "loss": 0.3766, + "step": 433 + }, + { + "epoch": 0.2119140625, + "grad_norm": 1.4218906164169312, + "learning_rate": 9.158995154128425e-06, + "loss": 0.3377, + "step": 434 + }, + { + "epoch": 0.21240234375, + "grad_norm": 1.6487233638763428, + "learning_rate": 9.15459965650943e-06, + "loss": 0.3198, + "step": 435 + }, + { + "epoch": 0.212890625, + "grad_norm": 6.333557605743408, + "learning_rate": 9.15019376279285e-06, + "loss": 0.3336, + "step": 436 + }, + { + "epoch": 0.21337890625, + "grad_norm": 1.746251106262207, + "learning_rate": 9.1457774840036e-06, + "loss": 0.3434, + "step": 437 + }, + { + "epoch": 0.2138671875, + "grad_norm": 2.1596200466156006, + "learning_rate": 9.14135083119258e-06, + "loss": 0.3496, + "step": 438 + }, + { + "epoch": 0.21435546875, + "grad_norm": 1.7951174974441528, + "learning_rate": 9.13691381543665e-06, + "loss": 0.3589, + "step": 439 + }, + { + "epoch": 0.21484375, + "grad_norm": 1.7067686319351196, + "learning_rate": 9.132466447838598e-06, + "loss": 0.3367, + "step": 440 + }, + { + "epoch": 0.21533203125, + "grad_norm": 2.095935344696045, + "learning_rate": 9.128008739527119e-06, + "loss": 0.3305, + "step": 441 + }, + { + "epoch": 0.2158203125, + "grad_norm": 2.011528968811035, + "learning_rate": 9.123540701656782e-06, + "loss": 0.368, + "step": 442 + }, + { + "epoch": 0.21630859375, + "grad_norm": 1.4319236278533936, + "learning_rate": 9.119062345408005e-06, + "loss": 0.3288, + "step": 443 + }, + { + "epoch": 0.216796875, + "grad_norm": 1.8978536128997803, + "learning_rate": 9.114573681987024e-06, + "loss": 0.3222, + "step": 444 + }, + { + "epoch": 0.21728515625, + "grad_norm": 1.8402870893478394, + "learning_rate": 9.11007472262587e-06, + "loss": 0.3286, + "step": 445 + }, + { + "epoch": 0.2177734375, + "grad_norm": 1.8938474655151367, + "learning_rate": 9.105565478582335e-06, + "loss": 0.3725, + "step": 446 + }, + { + "epoch": 0.21826171875, + "grad_norm": 1.723388433456421, + "learning_rate": 9.101045961139945e-06, + "loss": 0.3634, + "step": 447 + }, + { + "epoch": 0.21875, + "grad_norm": 1.8326998949050903, + "learning_rate": 9.096516181607935e-06, + "loss": 0.3276, + "step": 448 + }, + { + "epoch": 0.21923828125, + "grad_norm": 1.6433813571929932, + "learning_rate": 9.09197615132122e-06, + "loss": 0.3637, + "step": 449 + }, + { + "epoch": 0.2197265625, + "grad_norm": 1.482116460800171, + "learning_rate": 9.087425881640366e-06, + "loss": 0.3413, + "step": 450 + }, + { + "epoch": 0.22021484375, + "grad_norm": 5.252507209777832, + "learning_rate": 9.082865383951558e-06, + "loss": 0.35, + "step": 451 + }, + { + "epoch": 0.220703125, + "grad_norm": 1.4982550144195557, + "learning_rate": 9.078294669666577e-06, + "loss": 0.3354, + "step": 452 + }, + { + "epoch": 0.22119140625, + "grad_norm": 2.408413887023926, + "learning_rate": 9.073713750222766e-06, + "loss": 0.3376, + "step": 453 + }, + { + "epoch": 0.2216796875, + "grad_norm": 1.682771921157837, + "learning_rate": 9.069122637083012e-06, + "loss": 0.3131, + "step": 454 + }, + { + "epoch": 0.22216796875, + "grad_norm": 1.6665334701538086, + "learning_rate": 9.064521341735702e-06, + "loss": 0.3348, + "step": 455 + }, + { + "epoch": 0.22265625, + "grad_norm": 1.3198261260986328, + "learning_rate": 9.059909875694703e-06, + "loss": 0.3087, + "step": 456 + }, + { + "epoch": 0.22314453125, + "grad_norm": 2.0489742755889893, + "learning_rate": 9.055288250499339e-06, + "loss": 0.3549, + "step": 457 + }, + { + "epoch": 0.2236328125, + "grad_norm": 1.4335616827011108, + "learning_rate": 9.050656477714345e-06, + "loss": 0.3859, + "step": 458 + }, + { + "epoch": 0.22412109375, + "grad_norm": 1.9734736680984497, + "learning_rate": 9.046014568929856e-06, + "loss": 0.358, + "step": 459 + }, + { + "epoch": 0.224609375, + "grad_norm": 1.8493421077728271, + "learning_rate": 9.04136253576137e-06, + "loss": 0.3306, + "step": 460 + }, + { + "epoch": 0.22509765625, + "grad_norm": 2.6172261238098145, + "learning_rate": 9.036700389849717e-06, + "loss": 0.3481, + "step": 461 + }, + { + "epoch": 0.2255859375, + "grad_norm": 1.538042664527893, + "learning_rate": 9.03202814286103e-06, + "loss": 0.3154, + "step": 462 + }, + { + "epoch": 0.22607421875, + "grad_norm": 2.418534278869629, + "learning_rate": 9.027345806486722e-06, + "loss": 0.3247, + "step": 463 + }, + { + "epoch": 0.2265625, + "grad_norm": 1.7823346853256226, + "learning_rate": 9.022653392443455e-06, + "loss": 0.338, + "step": 464 + }, + { + "epoch": 0.22705078125, + "grad_norm": 1.9469126462936401, + "learning_rate": 9.0179509124731e-06, + "loss": 0.3377, + "step": 465 + }, + { + "epoch": 0.2275390625, + "grad_norm": 1.985723614692688, + "learning_rate": 9.013238378342725e-06, + "loss": 0.3438, + "step": 466 + }, + { + "epoch": 0.22802734375, + "grad_norm": 1.5227419137954712, + "learning_rate": 9.008515801844552e-06, + "loss": 0.3392, + "step": 467 + }, + { + "epoch": 0.228515625, + "grad_norm": 2.764451026916504, + "learning_rate": 9.003783194795931e-06, + "loss": 0.3439, + "step": 468 + }, + { + "epoch": 0.22900390625, + "grad_norm": 1.489700198173523, + "learning_rate": 8.999040569039315e-06, + "loss": 0.3654, + "step": 469 + }, + { + "epoch": 0.2294921875, + "grad_norm": 2.0311126708984375, + "learning_rate": 8.994287936442226e-06, + "loss": 0.3312, + "step": 470 + }, + { + "epoch": 0.22998046875, + "grad_norm": 1.7580716609954834, + "learning_rate": 8.989525308897223e-06, + "loss": 0.3573, + "step": 471 + }, + { + "epoch": 0.23046875, + "grad_norm": 1.7429345846176147, + "learning_rate": 8.98475269832188e-06, + "loss": 0.3757, + "step": 472 + }, + { + "epoch": 0.23095703125, + "grad_norm": 1.544498085975647, + "learning_rate": 8.97997011665875e-06, + "loss": 0.2787, + "step": 473 + }, + { + "epoch": 0.2314453125, + "grad_norm": 1.6220890283584595, + "learning_rate": 8.975177575875335e-06, + "loss": 0.3597, + "step": 474 + }, + { + "epoch": 0.23193359375, + "grad_norm": 1.598620057106018, + "learning_rate": 8.97037508796406e-06, + "loss": 0.3615, + "step": 475 + }, + { + "epoch": 0.232421875, + "grad_norm": 1.567460298538208, + "learning_rate": 8.96556266494224e-06, + "loss": 0.3613, + "step": 476 + }, + { + "epoch": 0.23291015625, + "grad_norm": 1.5737589597702026, + "learning_rate": 8.960740318852051e-06, + "loss": 0.3699, + "step": 477 + }, + { + "epoch": 0.2333984375, + "grad_norm": 1.9563899040222168, + "learning_rate": 8.9559080617605e-06, + "loss": 0.3578, + "step": 478 + }, + { + "epoch": 0.23388671875, + "grad_norm": 2.225196599960327, + "learning_rate": 8.951065905759392e-06, + "loss": 0.3346, + "step": 479 + }, + { + "epoch": 0.234375, + "grad_norm": 1.5860683917999268, + "learning_rate": 8.946213862965306e-06, + "loss": 0.3741, + "step": 480 + }, + { + "epoch": 0.23486328125, + "grad_norm": 1.289207935333252, + "learning_rate": 8.941351945519557e-06, + "loss": 0.3434, + "step": 481 + }, + { + "epoch": 0.2353515625, + "grad_norm": 1.576648235321045, + "learning_rate": 8.936480165588174e-06, + "loss": 0.3513, + "step": 482 + }, + { + "epoch": 0.23583984375, + "grad_norm": 1.5328677892684937, + "learning_rate": 8.931598535361855e-06, + "loss": 0.3299, + "step": 483 + }, + { + "epoch": 0.236328125, + "grad_norm": 1.439266562461853, + "learning_rate": 8.926707067055963e-06, + "loss": 0.3077, + "step": 484 + }, + { + "epoch": 0.23681640625, + "grad_norm": 1.6571671962738037, + "learning_rate": 8.921805772910463e-06, + "loss": 0.3666, + "step": 485 + }, + { + "epoch": 0.2373046875, + "grad_norm": 2.0075385570526123, + "learning_rate": 8.916894665189918e-06, + "loss": 0.3695, + "step": 486 + }, + { + "epoch": 0.23779296875, + "grad_norm": 1.3680145740509033, + "learning_rate": 8.91197375618344e-06, + "loss": 0.3393, + "step": 487 + }, + { + "epoch": 0.23828125, + "grad_norm": 1.9149501323699951, + "learning_rate": 8.907043058204674e-06, + "loss": 0.3374, + "step": 488 + }, + { + "epoch": 0.23876953125, + "grad_norm": 1.5481083393096924, + "learning_rate": 8.902102583591755e-06, + "loss": 0.3263, + "step": 489 + }, + { + "epoch": 0.2392578125, + "grad_norm": 1.8688881397247314, + "learning_rate": 8.89715234470728e-06, + "loss": 0.3207, + "step": 490 + }, + { + "epoch": 0.23974609375, + "grad_norm": 1.846941351890564, + "learning_rate": 8.892192353938288e-06, + "loss": 0.3677, + "step": 491 + }, + { + "epoch": 0.240234375, + "grad_norm": 1.4003583192825317, + "learning_rate": 8.887222623696213e-06, + "loss": 0.3281, + "step": 492 + }, + { + "epoch": 0.24072265625, + "grad_norm": 1.9100502729415894, + "learning_rate": 8.882243166416862e-06, + "loss": 0.3685, + "step": 493 + }, + { + "epoch": 0.2412109375, + "grad_norm": 1.6730045080184937, + "learning_rate": 8.877253994560381e-06, + "loss": 0.3482, + "step": 494 + }, + { + "epoch": 0.24169921875, + "grad_norm": 1.4065086841583252, + "learning_rate": 8.87225512061123e-06, + "loss": 0.3404, + "step": 495 + }, + { + "epoch": 0.2421875, + "grad_norm": 1.5349781513214111, + "learning_rate": 8.867246557078141e-06, + "loss": 0.3279, + "step": 496 + }, + { + "epoch": 0.24267578125, + "grad_norm": 1.376725196838379, + "learning_rate": 8.862228316494094e-06, + "loss": 0.3384, + "step": 497 + }, + { + "epoch": 0.2431640625, + "grad_norm": 1.5585695505142212, + "learning_rate": 8.857200411416283e-06, + "loss": 0.3638, + "step": 498 + }, + { + "epoch": 0.24365234375, + "grad_norm": 3.5493311882019043, + "learning_rate": 8.852162854426087e-06, + "loss": 0.3561, + "step": 499 + }, + { + "epoch": 0.244140625, + "grad_norm": 2.1406612396240234, + "learning_rate": 8.84711565812904e-06, + "loss": 0.3097, + "step": 500 + }, + { + "epoch": 0.24462890625, + "grad_norm": 1.5322456359863281, + "learning_rate": 8.842058835154789e-06, + "loss": 0.36, + "step": 501 + }, + { + "epoch": 0.2451171875, + "grad_norm": 2.3245677947998047, + "learning_rate": 8.836992398157076e-06, + "loss": 0.3479, + "step": 502 + }, + { + "epoch": 0.24560546875, + "grad_norm": 1.8092581033706665, + "learning_rate": 8.831916359813702e-06, + "loss": 0.3292, + "step": 503 + }, + { + "epoch": 0.24609375, + "grad_norm": 1.6669384241104126, + "learning_rate": 8.826830732826484e-06, + "loss": 0.357, + "step": 504 + }, + { + "epoch": 0.24658203125, + "grad_norm": 1.3617286682128906, + "learning_rate": 8.821735529921243e-06, + "loss": 0.3434, + "step": 505 + }, + { + "epoch": 0.2470703125, + "grad_norm": 5.754039287567139, + "learning_rate": 8.816630763847756e-06, + "loss": 0.3677, + "step": 506 + }, + { + "epoch": 0.24755859375, + "grad_norm": 1.2652654647827148, + "learning_rate": 8.811516447379734e-06, + "loss": 0.3573, + "step": 507 + }, + { + "epoch": 0.248046875, + "grad_norm": 1.6732009649276733, + "learning_rate": 8.806392593314781e-06, + "loss": 0.3398, + "step": 508 + }, + { + "epoch": 0.24853515625, + "grad_norm": 1.280765175819397, + "learning_rate": 8.801259214474371e-06, + "loss": 0.3371, + "step": 509 + }, + { + "epoch": 0.2490234375, + "grad_norm": 1.2774041891098022, + "learning_rate": 8.796116323703811e-06, + "loss": 0.3466, + "step": 510 + }, + { + "epoch": 0.24951171875, + "grad_norm": 1.4741958379745483, + "learning_rate": 8.790963933872212e-06, + "loss": 0.3506, + "step": 511 + }, + { + "epoch": 0.25, + "grad_norm": 1.4504543542861938, + "learning_rate": 8.785802057872447e-06, + "loss": 0.4083, + "step": 512 + }, + { + "epoch": 0.25048828125, + "grad_norm": 1.4813644886016846, + "learning_rate": 8.780630708621135e-06, + "loss": 0.382, + "step": 513 + }, + { + "epoch": 0.2509765625, + "grad_norm": 1.6617738008499146, + "learning_rate": 8.775449899058597e-06, + "loss": 0.3387, + "step": 514 + }, + { + "epoch": 0.25146484375, + "grad_norm": 1.8677629232406616, + "learning_rate": 8.770259642148826e-06, + "loss": 0.3422, + "step": 515 + }, + { + "epoch": 0.251953125, + "grad_norm": 1.4123599529266357, + "learning_rate": 8.765059950879454e-06, + "loss": 0.3621, + "step": 516 + }, + { + "epoch": 0.25244140625, + "grad_norm": 1.966430902481079, + "learning_rate": 8.759850838261723e-06, + "loss": 0.3475, + "step": 517 + }, + { + "epoch": 0.2529296875, + "grad_norm": 1.3296693563461304, + "learning_rate": 8.754632317330448e-06, + "loss": 0.3938, + "step": 518 + }, + { + "epoch": 0.25341796875, + "grad_norm": 1.4010918140411377, + "learning_rate": 8.749404401143991e-06, + "loss": 0.3474, + "step": 519 + }, + { + "epoch": 0.25390625, + "grad_norm": 1.5129917860031128, + "learning_rate": 8.744167102784216e-06, + "loss": 0.3783, + "step": 520 + }, + { + "epoch": 0.25439453125, + "grad_norm": 1.7624212503433228, + "learning_rate": 8.738920435356473e-06, + "loss": 0.3272, + "step": 521 + }, + { + "epoch": 0.2548828125, + "grad_norm": 1.4559099674224854, + "learning_rate": 8.733664411989548e-06, + "loss": 0.3526, + "step": 522 + }, + { + "epoch": 0.25537109375, + "grad_norm": 1.8239963054656982, + "learning_rate": 8.728399045835648e-06, + "loss": 0.3385, + "step": 523 + }, + { + "epoch": 0.255859375, + "grad_norm": 1.4369486570358276, + "learning_rate": 8.723124350070347e-06, + "loss": 0.3193, + "step": 524 + }, + { + "epoch": 0.25634765625, + "grad_norm": 4.341763496398926, + "learning_rate": 8.717840337892575e-06, + "loss": 0.3256, + "step": 525 + }, + { + "epoch": 0.2568359375, + "grad_norm": 2.0711512565612793, + "learning_rate": 8.712547022524566e-06, + "loss": 0.3639, + "step": 526 + }, + { + "epoch": 0.25732421875, + "grad_norm": 1.4793862104415894, + "learning_rate": 8.707244417211844e-06, + "loss": 0.3166, + "step": 527 + }, + { + "epoch": 0.2578125, + "grad_norm": 1.742661476135254, + "learning_rate": 8.701932535223168e-06, + "loss": 0.3533, + "step": 528 + }, + { + "epoch": 0.25830078125, + "grad_norm": 1.4166213274002075, + "learning_rate": 8.696611389850516e-06, + "loss": 0.3436, + "step": 529 + }, + { + "epoch": 0.2587890625, + "grad_norm": 1.362882137298584, + "learning_rate": 8.691280994409044e-06, + "loss": 0.3165, + "step": 530 + }, + { + "epoch": 0.25927734375, + "grad_norm": 2.5286190509796143, + "learning_rate": 8.685941362237058e-06, + "loss": 0.3438, + "step": 531 + }, + { + "epoch": 0.259765625, + "grad_norm": 2.232900381088257, + "learning_rate": 8.680592506695972e-06, + "loss": 0.3389, + "step": 532 + }, + { + "epoch": 0.26025390625, + "grad_norm": 1.2126928567886353, + "learning_rate": 8.675234441170286e-06, + "loss": 0.306, + "step": 533 + }, + { + "epoch": 0.2607421875, + "grad_norm": 1.480934977531433, + "learning_rate": 8.669867179067538e-06, + "loss": 0.3696, + "step": 534 + }, + { + "epoch": 0.26123046875, + "grad_norm": 2.439810037612915, + "learning_rate": 8.664490733818289e-06, + "loss": 0.3628, + "step": 535 + }, + { + "epoch": 0.26171875, + "grad_norm": 1.3664276599884033, + "learning_rate": 8.659105118876068e-06, + "loss": 0.3534, + "step": 536 + }, + { + "epoch": 0.26220703125, + "grad_norm": 1.8439381122589111, + "learning_rate": 8.65371034771736e-06, + "loss": 0.3539, + "step": 537 + }, + { + "epoch": 0.2626953125, + "grad_norm": 2.1068308353424072, + "learning_rate": 8.64830643384155e-06, + "loss": 0.4281, + "step": 538 + }, + { + "epoch": 0.26318359375, + "grad_norm": 1.847388505935669, + "learning_rate": 8.642893390770912e-06, + "loss": 0.3624, + "step": 539 + }, + { + "epoch": 0.263671875, + "grad_norm": 2.783621311187744, + "learning_rate": 8.63747123205056e-06, + "loss": 0.3501, + "step": 540 + }, + { + "epoch": 0.26416015625, + "grad_norm": 5.078010559082031, + "learning_rate": 8.632039971248416e-06, + "loss": 0.3423, + "step": 541 + }, + { + "epoch": 0.2646484375, + "grad_norm": 1.461103916168213, + "learning_rate": 8.626599621955179e-06, + "loss": 0.3505, + "step": 542 + }, + { + "epoch": 0.26513671875, + "grad_norm": 1.512221336364746, + "learning_rate": 8.621150197784293e-06, + "loss": 0.344, + "step": 543 + }, + { + "epoch": 0.265625, + "grad_norm": 2.6210267543792725, + "learning_rate": 8.615691712371907e-06, + "loss": 0.3192, + "step": 544 + }, + { + "epoch": 0.26611328125, + "grad_norm": 1.5492252111434937, + "learning_rate": 8.610224179376847e-06, + "loss": 0.3217, + "step": 545 + }, + { + "epoch": 0.2666015625, + "grad_norm": 1.4719685316085815, + "learning_rate": 8.604747612480577e-06, + "loss": 0.3251, + "step": 546 + }, + { + "epoch": 0.26708984375, + "grad_norm": 1.9413729906082153, + "learning_rate": 8.599262025387165e-06, + "loss": 0.3658, + "step": 547 + }, + { + "epoch": 0.267578125, + "grad_norm": 1.8121291399002075, + "learning_rate": 8.593767431823255e-06, + "loss": 0.3274, + "step": 548 + }, + { + "epoch": 0.26806640625, + "grad_norm": 1.7863436937332153, + "learning_rate": 8.588263845538021e-06, + "loss": 0.3586, + "step": 549 + }, + { + "epoch": 0.2685546875, + "grad_norm": 2.253500461578369, + "learning_rate": 8.582751280303148e-06, + "loss": 0.383, + "step": 550 + }, + { + "epoch": 0.26904296875, + "grad_norm": 1.9108343124389648, + "learning_rate": 8.577229749912782e-06, + "loss": 0.3188, + "step": 551 + }, + { + "epoch": 0.26953125, + "grad_norm": 1.4474389553070068, + "learning_rate": 8.571699268183506e-06, + "loss": 0.3239, + "step": 552 + }, + { + "epoch": 0.27001953125, + "grad_norm": 1.6433511972427368, + "learning_rate": 8.566159848954305e-06, + "loss": 0.3565, + "step": 553 + }, + { + "epoch": 0.2705078125, + "grad_norm": 2.9185471534729004, + "learning_rate": 8.560611506086518e-06, + "loss": 0.3916, + "step": 554 + }, + { + "epoch": 0.27099609375, + "grad_norm": 1.6128103733062744, + "learning_rate": 8.555054253463828e-06, + "loss": 0.3518, + "step": 555 + }, + { + "epoch": 0.271484375, + "grad_norm": 1.3888630867004395, + "learning_rate": 8.549488104992201e-06, + "loss": 0.3772, + "step": 556 + }, + { + "epoch": 0.27197265625, + "grad_norm": 1.7909587621688843, + "learning_rate": 8.543913074599867e-06, + "loss": 0.3313, + "step": 557 + }, + { + "epoch": 0.2724609375, + "grad_norm": 1.6241544485092163, + "learning_rate": 8.538329176237287e-06, + "loss": 0.3535, + "step": 558 + }, + { + "epoch": 0.27294921875, + "grad_norm": 1.4434620141983032, + "learning_rate": 8.532736423877102e-06, + "loss": 0.3329, + "step": 559 + }, + { + "epoch": 0.2734375, + "grad_norm": 1.8953794240951538, + "learning_rate": 8.527134831514116e-06, + "loss": 0.3318, + "step": 560 + }, + { + "epoch": 0.27392578125, + "grad_norm": 1.287680983543396, + "learning_rate": 8.521524413165254e-06, + "loss": 0.3187, + "step": 561 + }, + { + "epoch": 0.2744140625, + "grad_norm": 1.6521981954574585, + "learning_rate": 8.51590518286952e-06, + "loss": 0.3509, + "step": 562 + }, + { + "epoch": 0.27490234375, + "grad_norm": 1.4679384231567383, + "learning_rate": 8.510277154687973e-06, + "loss": 0.3598, + "step": 563 + }, + { + "epoch": 0.275390625, + "grad_norm": 2.19455885887146, + "learning_rate": 8.504640342703687e-06, + "loss": 0.3371, + "step": 564 + }, + { + "epoch": 0.27587890625, + "grad_norm": 1.4917466640472412, + "learning_rate": 8.498994761021715e-06, + "loss": 0.3086, + "step": 565 + }, + { + "epoch": 0.2763671875, + "grad_norm": 2.3828556537628174, + "learning_rate": 8.493340423769054e-06, + "loss": 0.328, + "step": 566 + }, + { + "epoch": 0.27685546875, + "grad_norm": 2.0100631713867188, + "learning_rate": 8.487677345094606e-06, + "loss": 0.3497, + "step": 567 + }, + { + "epoch": 0.27734375, + "grad_norm": 2.037872552871704, + "learning_rate": 8.482005539169158e-06, + "loss": 0.3649, + "step": 568 + }, + { + "epoch": 0.27783203125, + "grad_norm": 1.3535383939743042, + "learning_rate": 8.476325020185326e-06, + "loss": 0.3321, + "step": 569 + }, + { + "epoch": 0.2783203125, + "grad_norm": 1.4872392416000366, + "learning_rate": 8.47063580235753e-06, + "loss": 0.3775, + "step": 570 + }, + { + "epoch": 0.27880859375, + "grad_norm": 2.482274293899536, + "learning_rate": 8.46493789992196e-06, + "loss": 0.3518, + "step": 571 + }, + { + "epoch": 0.279296875, + "grad_norm": 1.4444823265075684, + "learning_rate": 8.459231327136532e-06, + "loss": 0.3503, + "step": 572 + }, + { + "epoch": 0.27978515625, + "grad_norm": 1.3315978050231934, + "learning_rate": 8.453516098280869e-06, + "loss": 0.3408, + "step": 573 + }, + { + "epoch": 0.2802734375, + "grad_norm": 2.0306880474090576, + "learning_rate": 8.447792227656241e-06, + "loss": 0.3751, + "step": 574 + }, + { + "epoch": 0.28076171875, + "grad_norm": 1.3674098253250122, + "learning_rate": 8.442059729585552e-06, + "loss": 0.3307, + "step": 575 + }, + { + "epoch": 0.28125, + "grad_norm": 2.2325830459594727, + "learning_rate": 8.43631861841329e-06, + "loss": 0.3168, + "step": 576 + }, + { + "epoch": 0.28173828125, + "grad_norm": 1.956121802330017, + "learning_rate": 8.430568908505497e-06, + "loss": 0.3317, + "step": 577 + }, + { + "epoch": 0.2822265625, + "grad_norm": 2.0539493560791016, + "learning_rate": 8.42481061424973e-06, + "loss": 0.3172, + "step": 578 + }, + { + "epoch": 0.28271484375, + "grad_norm": 1.3269410133361816, + "learning_rate": 8.41904375005503e-06, + "loss": 0.3726, + "step": 579 + }, + { + "epoch": 0.283203125, + "grad_norm": 2.887756586074829, + "learning_rate": 8.413268330351881e-06, + "loss": 0.342, + "step": 580 + }, + { + "epoch": 0.28369140625, + "grad_norm": 1.640519618988037, + "learning_rate": 8.40748436959217e-06, + "loss": 0.3418, + "step": 581 + }, + { + "epoch": 0.2841796875, + "grad_norm": 2.179222583770752, + "learning_rate": 8.40169188224917e-06, + "loss": 0.368, + "step": 582 + }, + { + "epoch": 0.28466796875, + "grad_norm": 2.25158429145813, + "learning_rate": 8.395890882817478e-06, + "loss": 0.3555, + "step": 583 + }, + { + "epoch": 0.28515625, + "grad_norm": 1.5757050514221191, + "learning_rate": 8.390081385812993e-06, + "loss": 0.3453, + "step": 584 + }, + { + "epoch": 0.28564453125, + "grad_norm": 1.5802643299102783, + "learning_rate": 8.38426340577288e-06, + "loss": 0.3635, + "step": 585 + }, + { + "epoch": 0.2861328125, + "grad_norm": 1.5654072761535645, + "learning_rate": 8.378436957255535e-06, + "loss": 0.3304, + "step": 586 + }, + { + "epoch": 0.28662109375, + "grad_norm": 1.2622393369674683, + "learning_rate": 8.372602054840532e-06, + "loss": 0.3468, + "step": 587 + }, + { + "epoch": 0.287109375, + "grad_norm": 2.9419167041778564, + "learning_rate": 8.366758713128617e-06, + "loss": 0.3286, + "step": 588 + }, + { + "epoch": 0.28759765625, + "grad_norm": 1.6033565998077393, + "learning_rate": 8.360906946741635e-06, + "loss": 0.3375, + "step": 589 + }, + { + "epoch": 0.2880859375, + "grad_norm": 1.5381578207015991, + "learning_rate": 8.355046770322528e-06, + "loss": 0.3531, + "step": 590 + }, + { + "epoch": 0.28857421875, + "grad_norm": 1.7467304468154907, + "learning_rate": 8.349178198535273e-06, + "loss": 0.305, + "step": 591 + }, + { + "epoch": 0.2890625, + "grad_norm": 1.3759098052978516, + "learning_rate": 8.343301246064858e-06, + "loss": 0.3643, + "step": 592 + }, + { + "epoch": 0.28955078125, + "grad_norm": 1.3180525302886963, + "learning_rate": 8.337415927617243e-06, + "loss": 0.3468, + "step": 593 + }, + { + "epoch": 0.2900390625, + "grad_norm": 1.3249021768569946, + "learning_rate": 8.33152225791932e-06, + "loss": 0.3502, + "step": 594 + }, + { + "epoch": 0.29052734375, + "grad_norm": 1.9022133350372314, + "learning_rate": 8.32562025171888e-06, + "loss": 0.3842, + "step": 595 + }, + { + "epoch": 0.291015625, + "grad_norm": 1.4465323686599731, + "learning_rate": 8.319709923784573e-06, + "loss": 0.3247, + "step": 596 + }, + { + "epoch": 0.29150390625, + "grad_norm": 2.4993956089019775, + "learning_rate": 8.313791288905874e-06, + "loss": 0.3826, + "step": 597 + }, + { + "epoch": 0.2919921875, + "grad_norm": 1.842347264289856, + "learning_rate": 8.307864361893045e-06, + "loss": 0.329, + "step": 598 + }, + { + "epoch": 0.29248046875, + "grad_norm": 1.5460954904556274, + "learning_rate": 8.301929157577097e-06, + "loss": 0.3453, + "step": 599 + }, + { + "epoch": 0.29296875, + "grad_norm": 3.255307912826538, + "learning_rate": 8.295985690809752e-06, + "loss": 0.3358, + "step": 600 + }, + { + "epoch": 0.29345703125, + "grad_norm": 1.4224542379379272, + "learning_rate": 8.290033976463407e-06, + "loss": 0.3683, + "step": 601 + }, + { + "epoch": 0.2939453125, + "grad_norm": 1.4209293127059937, + "learning_rate": 8.2840740294311e-06, + "loss": 0.315, + "step": 602 + }, + { + "epoch": 0.29443359375, + "grad_norm": 2.0559093952178955, + "learning_rate": 8.278105864626467e-06, + "loss": 0.3801, + "step": 603 + }, + { + "epoch": 0.294921875, + "grad_norm": 1.880486249923706, + "learning_rate": 8.27212949698371e-06, + "loss": 0.3713, + "step": 604 + }, + { + "epoch": 0.29541015625, + "grad_norm": 3.0988686084747314, + "learning_rate": 8.266144941457552e-06, + "loss": 0.3917, + "step": 605 + }, + { + "epoch": 0.2958984375, + "grad_norm": 1.6043518781661987, + "learning_rate": 8.26015221302321e-06, + "loss": 0.3678, + "step": 606 + }, + { + "epoch": 0.29638671875, + "grad_norm": 1.520564079284668, + "learning_rate": 8.254151326676354e-06, + "loss": 0.3259, + "step": 607 + }, + { + "epoch": 0.296875, + "grad_norm": 1.9146232604980469, + "learning_rate": 8.248142297433058e-06, + "loss": 0.3291, + "step": 608 + }, + { + "epoch": 0.29736328125, + "grad_norm": 2.2928895950317383, + "learning_rate": 8.24212514032978e-06, + "loss": 0.3828, + "step": 609 + }, + { + "epoch": 0.2978515625, + "grad_norm": 1.9419975280761719, + "learning_rate": 8.236099870423314e-06, + "loss": 0.3287, + "step": 610 + }, + { + "epoch": 0.29833984375, + "grad_norm": 1.7183066606521606, + "learning_rate": 8.230066502790756e-06, + "loss": 0.3121, + "step": 611 + }, + { + "epoch": 0.298828125, + "grad_norm": 1.5658105611801147, + "learning_rate": 8.224025052529463e-06, + "loss": 0.3501, + "step": 612 + }, + { + "epoch": 0.29931640625, + "grad_norm": 1.9759196043014526, + "learning_rate": 8.21797553475702e-06, + "loss": 0.3345, + "step": 613 + }, + { + "epoch": 0.2998046875, + "grad_norm": 2.0763461589813232, + "learning_rate": 8.211917964611197e-06, + "loss": 0.3187, + "step": 614 + }, + { + "epoch": 0.30029296875, + "grad_norm": 1.4480257034301758, + "learning_rate": 8.205852357249912e-06, + "loss": 0.2866, + "step": 615 + }, + { + "epoch": 0.30078125, + "grad_norm": 1.9418996572494507, + "learning_rate": 8.1997787278512e-06, + "loss": 0.3125, + "step": 616 + }, + { + "epoch": 0.30126953125, + "grad_norm": 1.726302146911621, + "learning_rate": 8.193697091613163e-06, + "loss": 0.3663, + "step": 617 + }, + { + "epoch": 0.3017578125, + "grad_norm": 1.622819423675537, + "learning_rate": 8.187607463753946e-06, + "loss": 0.3385, + "step": 618 + }, + { + "epoch": 0.30224609375, + "grad_norm": 2.375453472137451, + "learning_rate": 8.181509859511686e-06, + "loss": 0.3314, + "step": 619 + }, + { + "epoch": 0.302734375, + "grad_norm": 1.6941611766815186, + "learning_rate": 8.175404294144482e-06, + "loss": 0.3152, + "step": 620 + }, + { + "epoch": 0.30322265625, + "grad_norm": 1.6905850172042847, + "learning_rate": 8.16929078293035e-06, + "loss": 0.3352, + "step": 621 + }, + { + "epoch": 0.3037109375, + "grad_norm": 1.9776393175125122, + "learning_rate": 8.163169341167196e-06, + "loss": 0.39, + "step": 622 + }, + { + "epoch": 0.30419921875, + "grad_norm": 1.4409841299057007, + "learning_rate": 8.157039984172764e-06, + "loss": 0.3445, + "step": 623 + }, + { + "epoch": 0.3046875, + "grad_norm": 1.7097798585891724, + "learning_rate": 8.150902727284609e-06, + "loss": 0.3583, + "step": 624 + }, + { + "epoch": 0.30517578125, + "grad_norm": 1.5705921649932861, + "learning_rate": 8.144757585860053e-06, + "loss": 0.355, + "step": 625 + }, + { + "epoch": 0.3056640625, + "grad_norm": 1.5804706811904907, + "learning_rate": 8.138604575276143e-06, + "loss": 0.3615, + "step": 626 + }, + { + "epoch": 0.30615234375, + "grad_norm": 1.7296881675720215, + "learning_rate": 8.132443710929624e-06, + "loss": 0.381, + "step": 627 + }, + { + "epoch": 0.306640625, + "grad_norm": 1.3139718770980835, + "learning_rate": 8.126275008236891e-06, + "loss": 0.3296, + "step": 628 + }, + { + "epoch": 0.30712890625, + "grad_norm": 1.339277744293213, + "learning_rate": 8.12009848263395e-06, + "loss": 0.3262, + "step": 629 + }, + { + "epoch": 0.3076171875, + "grad_norm": 5.439074516296387, + "learning_rate": 8.113914149576388e-06, + "loss": 0.361, + "step": 630 + }, + { + "epoch": 0.30810546875, + "grad_norm": 1.8875752687454224, + "learning_rate": 8.107722024539321e-06, + "loss": 0.3419, + "step": 631 + }, + { + "epoch": 0.30859375, + "grad_norm": 1.3780957460403442, + "learning_rate": 8.10152212301737e-06, + "loss": 0.3398, + "step": 632 + }, + { + "epoch": 0.30908203125, + "grad_norm": 2.1425485610961914, + "learning_rate": 8.095314460524612e-06, + "loss": 0.3473, + "step": 633 + }, + { + "epoch": 0.3095703125, + "grad_norm": 2.3225300312042236, + "learning_rate": 8.089099052594545e-06, + "loss": 0.3757, + "step": 634 + }, + { + "epoch": 0.31005859375, + "grad_norm": 1.4518051147460938, + "learning_rate": 8.08287591478005e-06, + "loss": 0.3112, + "step": 635 + }, + { + "epoch": 0.310546875, + "grad_norm": 2.2762012481689453, + "learning_rate": 8.076645062653346e-06, + "loss": 0.3642, + "step": 636 + }, + { + "epoch": 0.31103515625, + "grad_norm": 1.6947425603866577, + "learning_rate": 8.070406511805961e-06, + "loss": 0.35, + "step": 637 + }, + { + "epoch": 0.3115234375, + "grad_norm": 1.5694466829299927, + "learning_rate": 8.064160277848683e-06, + "loss": 0.3458, + "step": 638 + }, + { + "epoch": 0.31201171875, + "grad_norm": 1.9441496133804321, + "learning_rate": 8.05790637641153e-06, + "loss": 0.3698, + "step": 639 + }, + { + "epoch": 0.3125, + "grad_norm": 1.6394853591918945, + "learning_rate": 8.051644823143702e-06, + "loss": 0.3515, + "step": 640 + }, + { + "epoch": 0.31298828125, + "grad_norm": 1.8157254457473755, + "learning_rate": 8.04537563371355e-06, + "loss": 0.3278, + "step": 641 + }, + { + "epoch": 0.3134765625, + "grad_norm": 1.6162160634994507, + "learning_rate": 8.03909882380853e-06, + "loss": 0.3586, + "step": 642 + }, + { + "epoch": 0.31396484375, + "grad_norm": 1.7346367835998535, + "learning_rate": 8.03281440913517e-06, + "loss": 0.3194, + "step": 643 + }, + { + "epoch": 0.314453125, + "grad_norm": 1.593997836112976, + "learning_rate": 8.026522405419024e-06, + "loss": 0.3205, + "step": 644 + }, + { + "epoch": 0.31494140625, + "grad_norm": 1.3535056114196777, + "learning_rate": 8.020222828404638e-06, + "loss": 0.3382, + "step": 645 + }, + { + "epoch": 0.3154296875, + "grad_norm": 2.354459524154663, + "learning_rate": 8.01391569385551e-06, + "loss": 0.3041, + "step": 646 + }, + { + "epoch": 0.31591796875, + "grad_norm": 1.6168910264968872, + "learning_rate": 8.007601017554045e-06, + "loss": 0.392, + "step": 647 + }, + { + "epoch": 0.31640625, + "grad_norm": 1.7411466836929321, + "learning_rate": 8.001278815301525e-06, + "loss": 0.319, + "step": 648 + }, + { + "epoch": 0.31689453125, + "grad_norm": 2.3402931690216064, + "learning_rate": 7.994949102918062e-06, + "loss": 0.3657, + "step": 649 + }, + { + "epoch": 0.3173828125, + "grad_norm": 1.2933272123336792, + "learning_rate": 7.98861189624256e-06, + "loss": 0.3049, + "step": 650 + }, + { + "epoch": 0.31787109375, + "grad_norm": 1.6581286191940308, + "learning_rate": 7.982267211132675e-06, + "loss": 0.354, + "step": 651 + }, + { + "epoch": 0.318359375, + "grad_norm": 2.0283968448638916, + "learning_rate": 7.97591506346478e-06, + "loss": 0.3521, + "step": 652 + }, + { + "epoch": 0.31884765625, + "grad_norm": 1.6676313877105713, + "learning_rate": 7.96955546913392e-06, + "loss": 0.3237, + "step": 653 + }, + { + "epoch": 0.3193359375, + "grad_norm": 1.548922061920166, + "learning_rate": 7.963188444053772e-06, + "loss": 0.3145, + "step": 654 + }, + { + "epoch": 0.31982421875, + "grad_norm": 2.61688232421875, + "learning_rate": 7.95681400415661e-06, + "loss": 0.3159, + "step": 655 + }, + { + "epoch": 0.3203125, + "grad_norm": 2.0864787101745605, + "learning_rate": 7.95043216539326e-06, + "loss": 0.3394, + "step": 656 + }, + { + "epoch": 0.32080078125, + "grad_norm": 1.82245934009552, + "learning_rate": 7.944042943733061e-06, + "loss": 0.355, + "step": 657 + }, + { + "epoch": 0.3212890625, + "grad_norm": 1.6342824697494507, + "learning_rate": 7.937646355163833e-06, + "loss": 0.3407, + "step": 658 + }, + { + "epoch": 0.32177734375, + "grad_norm": 1.7688589096069336, + "learning_rate": 7.931242415691822e-06, + "loss": 0.3936, + "step": 659 + }, + { + "epoch": 0.322265625, + "grad_norm": 1.5749949216842651, + "learning_rate": 7.924831141341671e-06, + "loss": 0.3226, + "step": 660 + }, + { + "epoch": 0.32275390625, + "grad_norm": 4.079642295837402, + "learning_rate": 7.918412548156382e-06, + "loss": 0.3478, + "step": 661 + }, + { + "epoch": 0.3232421875, + "grad_norm": 1.564584732055664, + "learning_rate": 7.911986652197263e-06, + "loss": 0.345, + "step": 662 + }, + { + "epoch": 0.32373046875, + "grad_norm": 1.9359629154205322, + "learning_rate": 7.905553469543903e-06, + "loss": 0.3478, + "step": 663 + }, + { + "epoch": 0.32421875, + "grad_norm": 1.3265938758850098, + "learning_rate": 7.899113016294118e-06, + "loss": 0.3789, + "step": 664 + }, + { + "epoch": 0.32470703125, + "grad_norm": 1.617301106452942, + "learning_rate": 7.892665308563922e-06, + "loss": 0.3182, + "step": 665 + }, + { + "epoch": 0.3251953125, + "grad_norm": 2.50874924659729, + "learning_rate": 7.88621036248748e-06, + "loss": 0.3269, + "step": 666 + }, + { + "epoch": 0.32568359375, + "grad_norm": 2.0309231281280518, + "learning_rate": 7.879748194217074e-06, + "loss": 0.3294, + "step": 667 + }, + { + "epoch": 0.326171875, + "grad_norm": 1.6182068586349487, + "learning_rate": 7.873278819923047e-06, + "loss": 0.3269, + "step": 668 + }, + { + "epoch": 0.32666015625, + "grad_norm": 2.3924951553344727, + "learning_rate": 7.866802255793788e-06, + "loss": 0.3498, + "step": 669 + }, + { + "epoch": 0.3271484375, + "grad_norm": 2.816044330596924, + "learning_rate": 7.860318518035668e-06, + "loss": 0.3231, + "step": 670 + }, + { + "epoch": 0.32763671875, + "grad_norm": 1.9277939796447754, + "learning_rate": 7.853827622873011e-06, + "loss": 0.3236, + "step": 671 + }, + { + "epoch": 0.328125, + "grad_norm": 1.364225149154663, + "learning_rate": 7.847329586548049e-06, + "loss": 0.3807, + "step": 672 + }, + { + "epoch": 0.32861328125, + "grad_norm": 1.443907380104065, + "learning_rate": 7.840824425320888e-06, + "loss": 0.4092, + "step": 673 + }, + { + "epoch": 0.3291015625, + "grad_norm": 1.670778512954712, + "learning_rate": 7.834312155469457e-06, + "loss": 0.3653, + "step": 674 + }, + { + "epoch": 0.32958984375, + "grad_norm": 1.510043740272522, + "learning_rate": 7.827792793289477e-06, + "loss": 0.3463, + "step": 675 + }, + { + "epoch": 0.330078125, + "grad_norm": 2.1872780323028564, + "learning_rate": 7.821266355094419e-06, + "loss": 0.3479, + "step": 676 + }, + { + "epoch": 0.33056640625, + "grad_norm": 1.6790423393249512, + "learning_rate": 7.814732857215453e-06, + "loss": 0.3476, + "step": 677 + }, + { + "epoch": 0.3310546875, + "grad_norm": 1.3476860523223877, + "learning_rate": 7.808192316001417e-06, + "loss": 0.3333, + "step": 678 + }, + { + "epoch": 0.33154296875, + "grad_norm": 1.752164602279663, + "learning_rate": 7.801644747818777e-06, + "loss": 0.3341, + "step": 679 + }, + { + "epoch": 0.33203125, + "grad_norm": 2.4022326469421387, + "learning_rate": 7.79509016905158e-06, + "loss": 0.357, + "step": 680 + }, + { + "epoch": 0.33251953125, + "grad_norm": 1.3659697771072388, + "learning_rate": 7.788528596101419e-06, + "loss": 0.3073, + "step": 681 + }, + { + "epoch": 0.3330078125, + "grad_norm": 1.4519615173339844, + "learning_rate": 7.78196004538738e-06, + "loss": 0.3052, + "step": 682 + }, + { + "epoch": 0.33349609375, + "grad_norm": 2.08927583694458, + "learning_rate": 7.775384533346018e-06, + "loss": 0.3242, + "step": 683 + }, + { + "epoch": 0.333984375, + "grad_norm": 1.4538501501083374, + "learning_rate": 7.768802076431304e-06, + "loss": 0.3495, + "step": 684 + }, + { + "epoch": 0.33447265625, + "grad_norm": 2.239643096923828, + "learning_rate": 7.76221269111459e-06, + "loss": 0.3554, + "step": 685 + }, + { + "epoch": 0.3349609375, + "grad_norm": 1.8009265661239624, + "learning_rate": 7.755616393884562e-06, + "loss": 0.3652, + "step": 686 + }, + { + "epoch": 0.33544921875, + "grad_norm": 1.5794439315795898, + "learning_rate": 7.7490132012472e-06, + "loss": 0.3321, + "step": 687 + }, + { + "epoch": 0.3359375, + "grad_norm": 1.737437129020691, + "learning_rate": 7.742403129725742e-06, + "loss": 0.3138, + "step": 688 + }, + { + "epoch": 0.33642578125, + "grad_norm": 1.7152299880981445, + "learning_rate": 7.735786195860641e-06, + "loss": 0.3582, + "step": 689 + }, + { + "epoch": 0.3369140625, + "grad_norm": 1.3847858905792236, + "learning_rate": 7.729162416209518e-06, + "loss": 0.3396, + "step": 690 + }, + { + "epoch": 0.33740234375, + "grad_norm": 1.6747031211853027, + "learning_rate": 7.722531807347122e-06, + "loss": 0.3474, + "step": 691 + }, + { + "epoch": 0.337890625, + "grad_norm": 1.3016866445541382, + "learning_rate": 7.715894385865299e-06, + "loss": 0.3391, + "step": 692 + }, + { + "epoch": 0.33837890625, + "grad_norm": 1.3648223876953125, + "learning_rate": 7.709250168372932e-06, + "loss": 0.3298, + "step": 693 + }, + { + "epoch": 0.3388671875, + "grad_norm": 1.5124351978302002, + "learning_rate": 7.702599171495919e-06, + "loss": 0.3334, + "step": 694 + }, + { + "epoch": 0.33935546875, + "grad_norm": 37.46984100341797, + "learning_rate": 7.695941411877115e-06, + "loss": 0.3342, + "step": 695 + }, + { + "epoch": 0.33984375, + "grad_norm": 1.4970625638961792, + "learning_rate": 7.689276906176302e-06, + "loss": 0.3436, + "step": 696 + }, + { + "epoch": 0.34033203125, + "grad_norm": 3.098925828933716, + "learning_rate": 7.682605671070142e-06, + "loss": 0.3437, + "step": 697 + }, + { + "epoch": 0.3408203125, + "grad_norm": 1.7555867433547974, + "learning_rate": 7.675927723252134e-06, + "loss": 0.322, + "step": 698 + }, + { + "epoch": 0.34130859375, + "grad_norm": 1.5935651063919067, + "learning_rate": 7.669243079432578e-06, + "loss": 0.2998, + "step": 699 + }, + { + "epoch": 0.341796875, + "grad_norm": 1.506208896636963, + "learning_rate": 7.662551756338525e-06, + "loss": 0.3612, + "step": 700 + }, + { + "epoch": 0.34228515625, + "grad_norm": 1.923596978187561, + "learning_rate": 7.655853770713744e-06, + "loss": 0.3593, + "step": 701 + }, + { + "epoch": 0.3427734375, + "grad_norm": 1.9344090223312378, + "learning_rate": 7.64914913931867e-06, + "loss": 0.3156, + "step": 702 + }, + { + "epoch": 0.34326171875, + "grad_norm": 1.7808047533035278, + "learning_rate": 7.642437878930376e-06, + "loss": 0.3419, + "step": 703 + }, + { + "epoch": 0.34375, + "grad_norm": 1.5053675174713135, + "learning_rate": 7.635720006342513e-06, + "loss": 0.3539, + "step": 704 + }, + { + "epoch": 0.34423828125, + "grad_norm": 1.5963175296783447, + "learning_rate": 7.628995538365287e-06, + "loss": 0.3562, + "step": 705 + }, + { + "epoch": 0.3447265625, + "grad_norm": 1.4388726949691772, + "learning_rate": 7.6222644918254005e-06, + "loss": 0.3413, + "step": 706 + }, + { + "epoch": 0.34521484375, + "grad_norm": 3.6217451095581055, + "learning_rate": 7.615526883566023e-06, + "loss": 0.3584, + "step": 707 + }, + { + "epoch": 0.345703125, + "grad_norm": 1.6617943048477173, + "learning_rate": 7.608782730446741e-06, + "loss": 0.3675, + "step": 708 + }, + { + "epoch": 0.34619140625, + "grad_norm": 3.6505870819091797, + "learning_rate": 7.6020320493435175e-06, + "loss": 0.3028, + "step": 709 + }, + { + "epoch": 0.3466796875, + "grad_norm": 1.5057923793792725, + "learning_rate": 7.595274857148651e-06, + "loss": 0.3601, + "step": 710 + }, + { + "epoch": 0.34716796875, + "grad_norm": 1.775791049003601, + "learning_rate": 7.588511170770736e-06, + "loss": 0.3561, + "step": 711 + }, + { + "epoch": 0.34765625, + "grad_norm": 2.0912845134735107, + "learning_rate": 7.581741007134611e-06, + "loss": 0.3211, + "step": 712 + }, + { + "epoch": 0.34814453125, + "grad_norm": 1.4719021320343018, + "learning_rate": 7.574964383181329e-06, + "loss": 0.3571, + "step": 713 + }, + { + "epoch": 0.3486328125, + "grad_norm": 1.5099034309387207, + "learning_rate": 7.568181315868104e-06, + "loss": 0.3773, + "step": 714 + }, + { + "epoch": 0.34912109375, + "grad_norm": 1.797803282737732, + "learning_rate": 7.561391822168277e-06, + "loss": 0.3305, + "step": 715 + }, + { + "epoch": 0.349609375, + "grad_norm": 1.5316636562347412, + "learning_rate": 7.554595919071268e-06, + "loss": 0.3692, + "step": 716 + }, + { + "epoch": 0.35009765625, + "grad_norm": 1.332055926322937, + "learning_rate": 7.5477936235825344e-06, + "loss": 0.2998, + "step": 717 + }, + { + "epoch": 0.3505859375, + "grad_norm": 1.538785457611084, + "learning_rate": 7.540984952723531e-06, + "loss": 0.3325, + "step": 718 + }, + { + "epoch": 0.35107421875, + "grad_norm": 2.884404420852661, + "learning_rate": 7.534169923531665e-06, + "loss": 0.3036, + "step": 719 + }, + { + "epoch": 0.3515625, + "grad_norm": 1.7468745708465576, + "learning_rate": 7.527348553060254e-06, + "loss": 0.3199, + "step": 720 + }, + { + "epoch": 0.35205078125, + "grad_norm": 2.015227794647217, + "learning_rate": 7.520520858378486e-06, + "loss": 0.3884, + "step": 721 + }, + { + "epoch": 0.3525390625, + "grad_norm": 1.3880223035812378, + "learning_rate": 7.513686856571367e-06, + "loss": 0.336, + "step": 722 + }, + { + "epoch": 0.35302734375, + "grad_norm": 1.297411561012268, + "learning_rate": 7.506846564739694e-06, + "loss": 0.3306, + "step": 723 + }, + { + "epoch": 0.353515625, + "grad_norm": 1.55870521068573, + "learning_rate": 7.500000000000001e-06, + "loss": 0.3056, + "step": 724 + }, + { + "epoch": 0.35400390625, + "grad_norm": 2.036909818649292, + "learning_rate": 7.493147179484514e-06, + "loss": 0.3273, + "step": 725 + }, + { + "epoch": 0.3544921875, + "grad_norm": 1.3678783178329468, + "learning_rate": 7.486288120341118e-06, + "loss": 0.345, + "step": 726 + }, + { + "epoch": 0.35498046875, + "grad_norm": 2.0894579887390137, + "learning_rate": 7.479422839733307e-06, + "loss": 0.359, + "step": 727 + }, + { + "epoch": 0.35546875, + "grad_norm": 1.6823246479034424, + "learning_rate": 7.4725513548401455e-06, + "loss": 0.3563, + "step": 728 + }, + { + "epoch": 0.35595703125, + "grad_norm": 1.351969838142395, + "learning_rate": 7.4656736828562186e-06, + "loss": 0.3017, + "step": 729 + }, + { + "epoch": 0.3564453125, + "grad_norm": 1.6686972379684448, + "learning_rate": 7.458789840991596e-06, + "loss": 0.3478, + "step": 730 + }, + { + "epoch": 0.35693359375, + "grad_norm": 1.3534908294677734, + "learning_rate": 7.4518998464717874e-06, + "loss": 0.3244, + "step": 731 + }, + { + "epoch": 0.357421875, + "grad_norm": 1.4082777500152588, + "learning_rate": 7.445003716537698e-06, + "loss": 0.3251, + "step": 732 + }, + { + "epoch": 0.35791015625, + "grad_norm": 2.0288498401641846, + "learning_rate": 7.438101468445582e-06, + "loss": 0.3379, + "step": 733 + }, + { + "epoch": 0.3583984375, + "grad_norm": 1.6891510486602783, + "learning_rate": 7.4311931194670085e-06, + "loss": 0.3576, + "step": 734 + }, + { + "epoch": 0.35888671875, + "grad_norm": 1.3616983890533447, + "learning_rate": 7.42427868688881e-06, + "loss": 0.3439, + "step": 735 + }, + { + "epoch": 0.359375, + "grad_norm": 1.5869650840759277, + "learning_rate": 7.417358188013042e-06, + "loss": 0.3389, + "step": 736 + }, + { + "epoch": 0.35986328125, + "grad_norm": 1.3705356121063232, + "learning_rate": 7.410431640156937e-06, + "loss": 0.346, + "step": 737 + }, + { + "epoch": 0.3603515625, + "grad_norm": 2.2622792720794678, + "learning_rate": 7.403499060652874e-06, + "loss": 0.3535, + "step": 738 + }, + { + "epoch": 0.36083984375, + "grad_norm": 1.719897747039795, + "learning_rate": 7.3965604668483145e-06, + "loss": 0.382, + "step": 739 + }, + { + "epoch": 0.361328125, + "grad_norm": 1.3844950199127197, + "learning_rate": 7.389615876105773e-06, + "loss": 0.3481, + "step": 740 + }, + { + "epoch": 0.36181640625, + "grad_norm": 1.6294703483581543, + "learning_rate": 7.38266530580277e-06, + "loss": 0.3656, + "step": 741 + }, + { + "epoch": 0.3623046875, + "grad_norm": 2.908967971801758, + "learning_rate": 7.375708773331791e-06, + "loss": 0.3457, + "step": 742 + }, + { + "epoch": 0.36279296875, + "grad_norm": 1.473132848739624, + "learning_rate": 7.36874629610024e-06, + "loss": 0.3385, + "step": 743 + }, + { + "epoch": 0.36328125, + "grad_norm": 2.919328451156616, + "learning_rate": 7.361777891530392e-06, + "loss": 0.3336, + "step": 744 + }, + { + "epoch": 0.36376953125, + "grad_norm": 2.563336133956909, + "learning_rate": 7.354803577059359e-06, + "loss": 0.3357, + "step": 745 + }, + { + "epoch": 0.3642578125, + "grad_norm": 1.4097625017166138, + "learning_rate": 7.347823370139042e-06, + "loss": 0.3559, + "step": 746 + }, + { + "epoch": 0.36474609375, + "grad_norm": 1.3321950435638428, + "learning_rate": 7.340837288236085e-06, + "loss": 0.3626, + "step": 747 + }, + { + "epoch": 0.365234375, + "grad_norm": 1.6507295370101929, + "learning_rate": 7.3338453488318284e-06, + "loss": 0.3095, + "step": 748 + }, + { + "epoch": 0.36572265625, + "grad_norm": 1.8008859157562256, + "learning_rate": 7.326847569422278e-06, + "loss": 0.3193, + "step": 749 + }, + { + "epoch": 0.3662109375, + "grad_norm": 1.4755789041519165, + "learning_rate": 7.3198439675180484e-06, + "loss": 0.2986, + "step": 750 + }, + { + "epoch": 0.36669921875, + "grad_norm": 1.7474323511123657, + "learning_rate": 7.312834560644327e-06, + "loss": 0.3936, + "step": 751 + }, + { + "epoch": 0.3671875, + "grad_norm": 1.6639896631240845, + "learning_rate": 7.30581936634082e-06, + "loss": 0.3673, + "step": 752 + }, + { + "epoch": 0.36767578125, + "grad_norm": 1.3790712356567383, + "learning_rate": 7.298798402161725e-06, + "loss": 0.3639, + "step": 753 + }, + { + "epoch": 0.3681640625, + "grad_norm": 1.9777040481567383, + "learning_rate": 7.291771685675673e-06, + "loss": 0.3299, + "step": 754 + }, + { + "epoch": 0.36865234375, + "grad_norm": 1.7995957136154175, + "learning_rate": 7.284739234465686e-06, + "loss": 0.3605, + "step": 755 + }, + { + "epoch": 0.369140625, + "grad_norm": 1.9671039581298828, + "learning_rate": 7.277701066129141e-06, + "loss": 0.3792, + "step": 756 + }, + { + "epoch": 0.36962890625, + "grad_norm": 2.719590187072754, + "learning_rate": 7.27065719827772e-06, + "loss": 0.3318, + "step": 757 + }, + { + "epoch": 0.3701171875, + "grad_norm": 1.9835278987884521, + "learning_rate": 7.2636076485373645e-06, + "loss": 0.3286, + "step": 758 + }, + { + "epoch": 0.37060546875, + "grad_norm": 1.2610225677490234, + "learning_rate": 7.256552434548236e-06, + "loss": 0.3274, + "step": 759 + }, + { + "epoch": 0.37109375, + "grad_norm": 1.2788983583450317, + "learning_rate": 7.249491573964671e-06, + "loss": 0.3622, + "step": 760 + }, + { + "epoch": 0.37158203125, + "grad_norm": 1.2974728345870972, + "learning_rate": 7.242425084455132e-06, + "loss": 0.3253, + "step": 761 + }, + { + "epoch": 0.3720703125, + "grad_norm": 1.8051031827926636, + "learning_rate": 7.23535298370217e-06, + "loss": 0.3486, + "step": 762 + }, + { + "epoch": 0.37255859375, + "grad_norm": 1.7785935401916504, + "learning_rate": 7.228275289402373e-06, + "loss": 0.3195, + "step": 763 + }, + { + "epoch": 0.373046875, + "grad_norm": 1.2360249757766724, + "learning_rate": 7.221192019266332e-06, + "loss": 0.3005, + "step": 764 + }, + { + "epoch": 0.37353515625, + "grad_norm": 1.5772784948349, + "learning_rate": 7.214103191018584e-06, + "loss": 0.3319, + "step": 765 + }, + { + "epoch": 0.3740234375, + "grad_norm": 1.5777393579483032, + "learning_rate": 7.2070088223975784e-06, + "loss": 0.3412, + "step": 766 + }, + { + "epoch": 0.37451171875, + "grad_norm": 1.2442673444747925, + "learning_rate": 7.199908931155628e-06, + "loss": 0.3236, + "step": 767 + }, + { + "epoch": 0.375, + "grad_norm": 1.1323033571243286, + "learning_rate": 7.192803535058861e-06, + "loss": 0.3236, + "step": 768 + }, + { + "epoch": 0.37548828125, + "grad_norm": 1.316483974456787, + "learning_rate": 7.185692651887186e-06, + "loss": 0.3295, + "step": 769 + }, + { + "epoch": 0.3759765625, + "grad_norm": 1.5371990203857422, + "learning_rate": 7.178576299434239e-06, + "loss": 0.3711, + "step": 770 + }, + { + "epoch": 0.37646484375, + "grad_norm": 1.7177865505218506, + "learning_rate": 7.171454495507341e-06, + "loss": 0.3294, + "step": 771 + }, + { + "epoch": 0.376953125, + "grad_norm": 1.4074996709823608, + "learning_rate": 7.164327257927456e-06, + "loss": 0.3472, + "step": 772 + }, + { + "epoch": 0.37744140625, + "grad_norm": 1.3459590673446655, + "learning_rate": 7.157194604529143e-06, + "loss": 0.3268, + "step": 773 + }, + { + "epoch": 0.3779296875, + "grad_norm": 1.3509142398834229, + "learning_rate": 7.150056553160517e-06, + "loss": 0.3258, + "step": 774 + }, + { + "epoch": 0.37841796875, + "grad_norm": 1.3562768697738647, + "learning_rate": 7.142913121683195e-06, + "loss": 0.3301, + "step": 775 + }, + { + "epoch": 0.37890625, + "grad_norm": 1.815333604812622, + "learning_rate": 7.135764327972261e-06, + "loss": 0.3653, + "step": 776 + }, + { + "epoch": 0.37939453125, + "grad_norm": 1.3162930011749268, + "learning_rate": 7.128610189916213e-06, + "loss": 0.376, + "step": 777 + }, + { + "epoch": 0.3798828125, + "grad_norm": 1.7800266742706299, + "learning_rate": 7.121450725416928e-06, + "loss": 0.3662, + "step": 778 + }, + { + "epoch": 0.38037109375, + "grad_norm": 1.5096458196640015, + "learning_rate": 7.114285952389604e-06, + "loss": 0.3588, + "step": 779 + }, + { + "epoch": 0.380859375, + "grad_norm": 2.538273334503174, + "learning_rate": 7.1071158887627304e-06, + "loss": 0.3312, + "step": 780 + }, + { + "epoch": 0.38134765625, + "grad_norm": 1.3077067136764526, + "learning_rate": 7.0999405524780266e-06, + "loss": 0.3344, + "step": 781 + }, + { + "epoch": 0.3818359375, + "grad_norm": 1.3059022426605225, + "learning_rate": 7.092759961490415e-06, + "loss": 0.3259, + "step": 782 + }, + { + "epoch": 0.38232421875, + "grad_norm": 2.276553153991699, + "learning_rate": 7.08557413376796e-06, + "loss": 0.3331, + "step": 783 + }, + { + "epoch": 0.3828125, + "grad_norm": 1.3777782917022705, + "learning_rate": 7.078383087291833e-06, + "loss": 0.3211, + "step": 784 + }, + { + "epoch": 0.38330078125, + "grad_norm": 1.3232738971710205, + "learning_rate": 7.071186840056264e-06, + "loss": 0.2928, + "step": 785 + }, + { + "epoch": 0.3837890625, + "grad_norm": 1.1360565423965454, + "learning_rate": 7.063985410068499e-06, + "loss": 0.3291, + "step": 786 + }, + { + "epoch": 0.38427734375, + "grad_norm": 1.5104074478149414, + "learning_rate": 7.056778815348746e-06, + "loss": 0.3388, + "step": 787 + }, + { + "epoch": 0.384765625, + "grad_norm": 1.3837941884994507, + "learning_rate": 7.0495670739301435e-06, + "loss": 0.3802, + "step": 788 + }, + { + "epoch": 0.38525390625, + "grad_norm": 2.0784964561462402, + "learning_rate": 7.042350203858706e-06, + "loss": 0.3153, + "step": 789 + }, + { + "epoch": 0.3857421875, + "grad_norm": 1.4472565650939941, + "learning_rate": 7.035128223193286e-06, + "loss": 0.3145, + "step": 790 + }, + { + "epoch": 0.38623046875, + "grad_norm": 1.729691505432129, + "learning_rate": 7.0279011500055136e-06, + "loss": 0.393, + "step": 791 + }, + { + "epoch": 0.38671875, + "grad_norm": 1.4967801570892334, + "learning_rate": 7.020669002379772e-06, + "loss": 0.3344, + "step": 792 + }, + { + "epoch": 0.38720703125, + "grad_norm": 1.322029948234558, + "learning_rate": 7.0134317984131395e-06, + "loss": 0.3319, + "step": 793 + }, + { + "epoch": 0.3876953125, + "grad_norm": 2.8917009830474854, + "learning_rate": 7.006189556215346e-06, + "loss": 0.3152, + "step": 794 + }, + { + "epoch": 0.38818359375, + "grad_norm": 1.581947922706604, + "learning_rate": 6.998942293908725e-06, + "loss": 0.3606, + "step": 795 + }, + { + "epoch": 0.388671875, + "grad_norm": 2.658916711807251, + "learning_rate": 6.991690029628181e-06, + "loss": 0.3451, + "step": 796 + }, + { + "epoch": 0.38916015625, + "grad_norm": 2.3201754093170166, + "learning_rate": 6.9844327815211275e-06, + "loss": 0.333, + "step": 797 + }, + { + "epoch": 0.3896484375, + "grad_norm": 1.4934650659561157, + "learning_rate": 6.977170567747452e-06, + "loss": 0.3336, + "step": 798 + }, + { + "epoch": 0.39013671875, + "grad_norm": 1.4863629341125488, + "learning_rate": 6.969903406479465e-06, + "loss": 0.3347, + "step": 799 + }, + { + "epoch": 0.390625, + "grad_norm": 1.3552590608596802, + "learning_rate": 6.962631315901861e-06, + "loss": 0.3623, + "step": 800 + }, + { + "epoch": 0.39111328125, + "grad_norm": 2.2949376106262207, + "learning_rate": 6.955354314211669e-06, + "loss": 0.2987, + "step": 801 + }, + { + "epoch": 0.3916015625, + "grad_norm": 1.3013123273849487, + "learning_rate": 6.948072419618201e-06, + "loss": 0.3307, + "step": 802 + }, + { + "epoch": 0.39208984375, + "grad_norm": 1.4084373712539673, + "learning_rate": 6.940785650343019e-06, + "loss": 0.3119, + "step": 803 + }, + { + "epoch": 0.392578125, + "grad_norm": 2.596653461456299, + "learning_rate": 6.93349402461988e-06, + "loss": 0.3228, + "step": 804 + }, + { + "epoch": 0.39306640625, + "grad_norm": 1.5036858320236206, + "learning_rate": 6.926197560694699e-06, + "loss": 0.3463, + "step": 805 + }, + { + "epoch": 0.3935546875, + "grad_norm": 1.8642725944519043, + "learning_rate": 6.918896276825485e-06, + "loss": 0.368, + "step": 806 + }, + { + "epoch": 0.39404296875, + "grad_norm": 1.289711356163025, + "learning_rate": 6.9115901912823226e-06, + "loss": 0.3582, + "step": 807 + }, + { + "epoch": 0.39453125, + "grad_norm": 1.507915735244751, + "learning_rate": 6.9042793223473024e-06, + "loss": 0.3829, + "step": 808 + }, + { + "epoch": 0.39501953125, + "grad_norm": 1.7021656036376953, + "learning_rate": 6.896963688314489e-06, + "loss": 0.3668, + "step": 809 + }, + { + "epoch": 0.3955078125, + "grad_norm": 1.2955149412155151, + "learning_rate": 6.889643307489865e-06, + "loss": 0.3344, + "step": 810 + }, + { + "epoch": 0.39599609375, + "grad_norm": 1.183563232421875, + "learning_rate": 6.882318198191298e-06, + "loss": 0.3191, + "step": 811 + }, + { + "epoch": 0.396484375, + "grad_norm": 1.458882451057434, + "learning_rate": 6.874988378748484e-06, + "loss": 0.3531, + "step": 812 + }, + { + "epoch": 0.39697265625, + "grad_norm": 1.6540387868881226, + "learning_rate": 6.8676538675029054e-06, + "loss": 0.3399, + "step": 813 + }, + { + "epoch": 0.3974609375, + "grad_norm": 1.2130305767059326, + "learning_rate": 6.860314682807786e-06, + "loss": 0.3387, + "step": 814 + }, + { + "epoch": 0.39794921875, + "grad_norm": 1.3185558319091797, + "learning_rate": 6.852970843028043e-06, + "loss": 0.3389, + "step": 815 + }, + { + "epoch": 0.3984375, + "grad_norm": 1.6620187759399414, + "learning_rate": 6.845622366540242e-06, + "loss": 0.3041, + "step": 816 + }, + { + "epoch": 0.39892578125, + "grad_norm": 1.1920667886734009, + "learning_rate": 6.8382692717325525e-06, + "loss": 0.3047, + "step": 817 + }, + { + "epoch": 0.3994140625, + "grad_norm": 1.4352617263793945, + "learning_rate": 6.8309115770046986e-06, + "loss": 0.3276, + "step": 818 + }, + { + "epoch": 0.39990234375, + "grad_norm": 1.6452810764312744, + "learning_rate": 6.8235493007679155e-06, + "loss": 0.3243, + "step": 819 + }, + { + "epoch": 0.400390625, + "grad_norm": 1.6612956523895264, + "learning_rate": 6.816182461444905e-06, + "loss": 0.342, + "step": 820 + }, + { + "epoch": 0.40087890625, + "grad_norm": 1.2954360246658325, + "learning_rate": 6.8088110774697825e-06, + "loss": 0.3117, + "step": 821 + }, + { + "epoch": 0.4013671875, + "grad_norm": 2.189624786376953, + "learning_rate": 6.8014351672880395e-06, + "loss": 0.3069, + "step": 822 + }, + { + "epoch": 0.40185546875, + "grad_norm": 1.4809291362762451, + "learning_rate": 6.794054749356492e-06, + "loss": 0.3355, + "step": 823 + }, + { + "epoch": 0.40234375, + "grad_norm": 1.6851189136505127, + "learning_rate": 6.786669842143236e-06, + "loss": 0.3435, + "step": 824 + }, + { + "epoch": 0.40283203125, + "grad_norm": 1.401813268661499, + "learning_rate": 6.779280464127601e-06, + "loss": 0.326, + "step": 825 + }, + { + "epoch": 0.4033203125, + "grad_norm": 1.7311843633651733, + "learning_rate": 6.771886633800104e-06, + "loss": 0.3281, + "step": 826 + }, + { + "epoch": 0.40380859375, + "grad_norm": 2.936901092529297, + "learning_rate": 6.764488369662403e-06, + "loss": 0.3727, + "step": 827 + }, + { + "epoch": 0.404296875, + "grad_norm": 1.319385051727295, + "learning_rate": 6.75708569022725e-06, + "loss": 0.344, + "step": 828 + }, + { + "epoch": 0.40478515625, + "grad_norm": 1.9358359575271606, + "learning_rate": 6.749678614018446e-06, + "loss": 0.3622, + "step": 829 + }, + { + "epoch": 0.4052734375, + "grad_norm": 1.1188249588012695, + "learning_rate": 6.742267159570796e-06, + "loss": 0.3299, + "step": 830 + }, + { + "epoch": 0.40576171875, + "grad_norm": 1.3562527894973755, + "learning_rate": 6.734851345430057e-06, + "loss": 0.319, + "step": 831 + }, + { + "epoch": 0.40625, + "grad_norm": 1.2941495180130005, + "learning_rate": 6.727431190152898e-06, + "loss": 0.3323, + "step": 832 + }, + { + "epoch": 0.40673828125, + "grad_norm": 2.1621103286743164, + "learning_rate": 6.720006712306849e-06, + "loss": 0.3409, + "step": 833 + }, + { + "epoch": 0.4072265625, + "grad_norm": 1.3561265468597412, + "learning_rate": 6.712577930470258e-06, + "loss": 0.3549, + "step": 834 + }, + { + "epoch": 0.40771484375, + "grad_norm": 1.2518807649612427, + "learning_rate": 6.705144863232246e-06, + "loss": 0.3279, + "step": 835 + }, + { + "epoch": 0.408203125, + "grad_norm": 1.1951934099197388, + "learning_rate": 6.697707529192648e-06, + "loss": 0.3146, + "step": 836 + }, + { + "epoch": 0.40869140625, + "grad_norm": 1.2976142168045044, + "learning_rate": 6.6902659469619855e-06, + "loss": 0.3151, + "step": 837 + }, + { + "epoch": 0.4091796875, + "grad_norm": 1.554851770401001, + "learning_rate": 6.682820135161405e-06, + "loss": 0.2972, + "step": 838 + }, + { + "epoch": 0.40966796875, + "grad_norm": 1.467674732208252, + "learning_rate": 6.675370112422639e-06, + "loss": 0.3538, + "step": 839 + }, + { + "epoch": 0.41015625, + "grad_norm": 2.0394184589385986, + "learning_rate": 6.667915897387957e-06, + "loss": 0.3124, + "step": 840 + }, + { + "epoch": 0.41064453125, + "grad_norm": 1.458815097808838, + "learning_rate": 6.6604575087101165e-06, + "loss": 0.3073, + "step": 841 + }, + { + "epoch": 0.4111328125, + "grad_norm": 1.2343790531158447, + "learning_rate": 6.6529949650523195e-06, + "loss": 0.3224, + "step": 842 + }, + { + "epoch": 0.41162109375, + "grad_norm": 1.307780385017395, + "learning_rate": 6.645528285088169e-06, + "loss": 0.3139, + "step": 843 + }, + { + "epoch": 0.412109375, + "grad_norm": 1.187071681022644, + "learning_rate": 6.638057487501613e-06, + "loss": 0.3316, + "step": 844 + }, + { + "epoch": 0.41259765625, + "grad_norm": 1.9509886503219604, + "learning_rate": 6.630582590986907e-06, + "loss": 0.3381, + "step": 845 + }, + { + "epoch": 0.4130859375, + "grad_norm": 1.5562846660614014, + "learning_rate": 6.623103614248561e-06, + "loss": 0.3648, + "step": 846 + }, + { + "epoch": 0.41357421875, + "grad_norm": 1.423948049545288, + "learning_rate": 6.615620576001293e-06, + "loss": 0.3163, + "step": 847 + }, + { + "epoch": 0.4140625, + "grad_norm": 1.5273832082748413, + "learning_rate": 6.608133494969993e-06, + "loss": 0.3002, + "step": 848 + }, + { + "epoch": 0.41455078125, + "grad_norm": 1.2620773315429688, + "learning_rate": 6.600642389889657e-06, + "loss": 0.3599, + "step": 849 + }, + { + "epoch": 0.4150390625, + "grad_norm": 1.283124566078186, + "learning_rate": 6.593147279505352e-06, + "loss": 0.3348, + "step": 850 + }, + { + "epoch": 0.41552734375, + "grad_norm": 1.2876836061477661, + "learning_rate": 6.585648182572176e-06, + "loss": 0.347, + "step": 851 + }, + { + "epoch": 0.416015625, + "grad_norm": 2.6049535274505615, + "learning_rate": 6.578145117855192e-06, + "loss": 0.3305, + "step": 852 + }, + { + "epoch": 0.41650390625, + "grad_norm": 1.7834153175354004, + "learning_rate": 6.570638104129399e-06, + "loss": 0.323, + "step": 853 + }, + { + "epoch": 0.4169921875, + "grad_norm": 1.3892278671264648, + "learning_rate": 6.563127160179672e-06, + "loss": 0.3475, + "step": 854 + }, + { + "epoch": 0.41748046875, + "grad_norm": 1.4540331363677979, + "learning_rate": 6.555612304800727e-06, + "loss": 0.3442, + "step": 855 + }, + { + "epoch": 0.41796875, + "grad_norm": 1.058359146118164, + "learning_rate": 6.548093556797063e-06, + "loss": 0.3398, + "step": 856 + }, + { + "epoch": 0.41845703125, + "grad_norm": 1.587546706199646, + "learning_rate": 6.540570934982917e-06, + "loss": 0.3261, + "step": 857 + }, + { + "epoch": 0.4189453125, + "grad_norm": 2.1293222904205322, + "learning_rate": 6.533044458182229e-06, + "loss": 0.3755, + "step": 858 + }, + { + "epoch": 0.41943359375, + "grad_norm": 1.2648324966430664, + "learning_rate": 6.5255141452285765e-06, + "loss": 0.3001, + "step": 859 + }, + { + "epoch": 0.419921875, + "grad_norm": 1.4118512868881226, + "learning_rate": 6.51798001496514e-06, + "loss": 0.3376, + "step": 860 + }, + { + "epoch": 0.42041015625, + "grad_norm": 1.4707554578781128, + "learning_rate": 6.510442086244649e-06, + "loss": 0.3247, + "step": 861 + }, + { + "epoch": 0.4208984375, + "grad_norm": 1.3729053735733032, + "learning_rate": 6.502900377929344e-06, + "loss": 0.3039, + "step": 862 + }, + { + "epoch": 0.42138671875, + "grad_norm": 3.840740442276001, + "learning_rate": 6.4953549088909194e-06, + "loss": 0.3567, + "step": 863 + }, + { + "epoch": 0.421875, + "grad_norm": 1.3986668586730957, + "learning_rate": 6.487805698010476e-06, + "loss": 0.3313, + "step": 864 + }, + { + "epoch": 0.42236328125, + "grad_norm": 3.7465996742248535, + "learning_rate": 6.4802527641784866e-06, + "loss": 0.3357, + "step": 865 + }, + { + "epoch": 0.4228515625, + "grad_norm": 1.7644517421722412, + "learning_rate": 6.472696126294733e-06, + "loss": 0.3662, + "step": 866 + }, + { + "epoch": 0.42333984375, + "grad_norm": 1.2544833421707153, + "learning_rate": 6.4651358032682694e-06, + "loss": 0.3371, + "step": 867 + }, + { + "epoch": 0.423828125, + "grad_norm": 1.500871181488037, + "learning_rate": 6.457571814017368e-06, + "loss": 0.3224, + "step": 868 + }, + { + "epoch": 0.42431640625, + "grad_norm": 1.3260788917541504, + "learning_rate": 6.45000417746948e-06, + "loss": 0.3161, + "step": 869 + }, + { + "epoch": 0.4248046875, + "grad_norm": 1.334038257598877, + "learning_rate": 6.442432912561178e-06, + "loss": 0.3423, + "step": 870 + }, + { + "epoch": 0.42529296875, + "grad_norm": 1.378933310508728, + "learning_rate": 6.434858038238118e-06, + "loss": 0.3492, + "step": 871 + }, + { + "epoch": 0.42578125, + "grad_norm": 1.5512367486953735, + "learning_rate": 6.427279573454985e-06, + "loss": 0.3731, + "step": 872 + }, + { + "epoch": 0.42626953125, + "grad_norm": 1.4665623903274536, + "learning_rate": 6.4196975371754514e-06, + "loss": 0.3481, + "step": 873 + }, + { + "epoch": 0.4267578125, + "grad_norm": 1.5259501934051514, + "learning_rate": 6.412111948372122e-06, + "loss": 0.3439, + "step": 874 + }, + { + "epoch": 0.42724609375, + "grad_norm": 1.465909719467163, + "learning_rate": 6.404522826026496e-06, + "loss": 0.33, + "step": 875 + }, + { + "epoch": 0.427734375, + "grad_norm": 1.357045292854309, + "learning_rate": 6.396930189128912e-06, + "loss": 0.344, + "step": 876 + }, + { + "epoch": 0.42822265625, + "grad_norm": 1.352899193763733, + "learning_rate": 6.3893340566785046e-06, + "loss": 0.3021, + "step": 877 + }, + { + "epoch": 0.4287109375, + "grad_norm": 1.3821226358413696, + "learning_rate": 6.381734447683152e-06, + "loss": 0.3326, + "step": 878 + }, + { + "epoch": 0.42919921875, + "grad_norm": 1.675229787826538, + "learning_rate": 6.374131381159436e-06, + "loss": 0.4357, + "step": 879 + }, + { + "epoch": 0.4296875, + "grad_norm": 1.7067149877548218, + "learning_rate": 6.366524876132589e-06, + "loss": 0.3018, + "step": 880 + }, + { + "epoch": 0.43017578125, + "grad_norm": 1.4271488189697266, + "learning_rate": 6.358914951636444e-06, + "loss": 0.3468, + "step": 881 + }, + { + "epoch": 0.4306640625, + "grad_norm": 1.3299568891525269, + "learning_rate": 6.351301626713398e-06, + "loss": 0.3466, + "step": 882 + }, + { + "epoch": 0.43115234375, + "grad_norm": 1.6695646047592163, + "learning_rate": 6.343684920414348e-06, + "loss": 0.3214, + "step": 883 + }, + { + "epoch": 0.431640625, + "grad_norm": 1.3570027351379395, + "learning_rate": 6.3360648517986605e-06, + "loss": 0.3382, + "step": 884 + }, + { + "epoch": 0.43212890625, + "grad_norm": 1.385907769203186, + "learning_rate": 6.32844143993411e-06, + "loss": 0.3092, + "step": 885 + }, + { + "epoch": 0.4326171875, + "grad_norm": 1.5601329803466797, + "learning_rate": 6.320814703896838e-06, + "loss": 0.3587, + "step": 886 + }, + { + "epoch": 0.43310546875, + "grad_norm": 1.39394211769104, + "learning_rate": 6.313184662771305e-06, + "loss": 0.3404, + "step": 887 + }, + { + "epoch": 0.43359375, + "grad_norm": 1.2028573751449585, + "learning_rate": 6.305551335650244e-06, + "loss": 0.3548, + "step": 888 + }, + { + "epoch": 0.43408203125, + "grad_norm": 4.250852108001709, + "learning_rate": 6.297914741634605e-06, + "loss": 0.3454, + "step": 889 + }, + { + "epoch": 0.4345703125, + "grad_norm": 1.5344691276550293, + "learning_rate": 6.290274899833517e-06, + "loss": 0.3176, + "step": 890 + }, + { + "epoch": 0.43505859375, + "grad_norm": 1.7602498531341553, + "learning_rate": 6.2826318293642385e-06, + "loss": 0.339, + "step": 891 + }, + { + "epoch": 0.435546875, + "grad_norm": 1.1949964761734009, + "learning_rate": 6.274985549352098e-06, + "loss": 0.304, + "step": 892 + }, + { + "epoch": 0.43603515625, + "grad_norm": 1.1564438343048096, + "learning_rate": 6.267336078930464e-06, + "loss": 0.3145, + "step": 893 + }, + { + "epoch": 0.4365234375, + "grad_norm": 1.3757606744766235, + "learning_rate": 6.259683437240683e-06, + "loss": 0.3385, + "step": 894 + }, + { + "epoch": 0.43701171875, + "grad_norm": 1.8371174335479736, + "learning_rate": 6.252027643432044e-06, + "loss": 0.3355, + "step": 895 + }, + { + "epoch": 0.4375, + "grad_norm": 1.334598422050476, + "learning_rate": 6.244368716661714e-06, + "loss": 0.3276, + "step": 896 + }, + { + "epoch": 0.43798828125, + "grad_norm": 1.5038282871246338, + "learning_rate": 6.236706676094705e-06, + "loss": 0.3522, + "step": 897 + }, + { + "epoch": 0.4384765625, + "grad_norm": 3.6733760833740234, + "learning_rate": 6.229041540903823e-06, + "loss": 0.3431, + "step": 898 + }, + { + "epoch": 0.43896484375, + "grad_norm": 1.5863288640975952, + "learning_rate": 6.221373330269613e-06, + "loss": 0.3324, + "step": 899 + }, + { + "epoch": 0.439453125, + "grad_norm": 1.4606237411499023, + "learning_rate": 6.213702063380317e-06, + "loss": 0.3226, + "step": 900 + }, + { + "epoch": 0.43994140625, + "grad_norm": 1.8370083570480347, + "learning_rate": 6.206027759431825e-06, + "loss": 0.3294, + "step": 901 + }, + { + "epoch": 0.4404296875, + "grad_norm": 1.6841802597045898, + "learning_rate": 6.198350437627631e-06, + "loss": 0.3238, + "step": 902 + }, + { + "epoch": 0.44091796875, + "grad_norm": 1.9791240692138672, + "learning_rate": 6.190670117178772e-06, + "loss": 0.3326, + "step": 903 + }, + { + "epoch": 0.44140625, + "grad_norm": 1.4503194093704224, + "learning_rate": 6.182986817303794e-06, + "loss": 0.3544, + "step": 904 + }, + { + "epoch": 0.44189453125, + "grad_norm": 1.9381232261657715, + "learning_rate": 6.175300557228698e-06, + "loss": 0.3278, + "step": 905 + }, + { + "epoch": 0.4423828125, + "grad_norm": 4.399080753326416, + "learning_rate": 6.167611356186895e-06, + "loss": 0.3367, + "step": 906 + }, + { + "epoch": 0.44287109375, + "grad_norm": 1.4784455299377441, + "learning_rate": 6.159919233419147e-06, + "loss": 0.3559, + "step": 907 + }, + { + "epoch": 0.443359375, + "grad_norm": 1.9754478931427002, + "learning_rate": 6.152224208173533e-06, + "loss": 0.3311, + "step": 908 + }, + { + "epoch": 0.44384765625, + "grad_norm": 1.5615670680999756, + "learning_rate": 6.144526299705396e-06, + "loss": 0.4023, + "step": 909 + }, + { + "epoch": 0.4443359375, + "grad_norm": 1.461332082748413, + "learning_rate": 6.136825527277295e-06, + "loss": 0.3026, + "step": 910 + }, + { + "epoch": 0.44482421875, + "grad_norm": 1.4366703033447266, + "learning_rate": 6.129121910158945e-06, + "loss": 0.336, + "step": 911 + }, + { + "epoch": 0.4453125, + "grad_norm": 2.06691575050354, + "learning_rate": 6.12141546762719e-06, + "loss": 0.342, + "step": 912 + }, + { + "epoch": 0.44580078125, + "grad_norm": 1.7794272899627686, + "learning_rate": 6.11370621896594e-06, + "loss": 0.3532, + "step": 913 + }, + { + "epoch": 0.4462890625, + "grad_norm": 1.4335381984710693, + "learning_rate": 6.105994183466131e-06, + "loss": 0.3471, + "step": 914 + }, + { + "epoch": 0.44677734375, + "grad_norm": 5.071071147918701, + "learning_rate": 6.0982793804256636e-06, + "loss": 0.336, + "step": 915 + }, + { + "epoch": 0.447265625, + "grad_norm": 1.2241181135177612, + "learning_rate": 6.090561829149373e-06, + "loss": 0.3232, + "step": 916 + }, + { + "epoch": 0.44775390625, + "grad_norm": 1.267858624458313, + "learning_rate": 6.082841548948966e-06, + "loss": 0.3556, + "step": 917 + }, + { + "epoch": 0.4482421875, + "grad_norm": 1.1905056238174438, + "learning_rate": 6.07511855914298e-06, + "loss": 0.2941, + "step": 918 + }, + { + "epoch": 0.44873046875, + "grad_norm": 1.2715431451797485, + "learning_rate": 6.067392879056729e-06, + "loss": 0.3159, + "step": 919 + }, + { + "epoch": 0.44921875, + "grad_norm": 1.2241966724395752, + "learning_rate": 6.059664528022267e-06, + "loss": 0.3141, + "step": 920 + }, + { + "epoch": 0.44970703125, + "grad_norm": 1.6341863870620728, + "learning_rate": 6.051933525378323e-06, + "loss": 0.3319, + "step": 921 + }, + { + "epoch": 0.4501953125, + "grad_norm": 3.6661813259124756, + "learning_rate": 6.044199890470267e-06, + "loss": 0.3482, + "step": 922 + }, + { + "epoch": 0.45068359375, + "grad_norm": 1.4551990032196045, + "learning_rate": 6.036463642650049e-06, + "loss": 0.3899, + "step": 923 + }, + { + "epoch": 0.451171875, + "grad_norm": 1.8738077878952026, + "learning_rate": 6.028724801276167e-06, + "loss": 0.3412, + "step": 924 + }, + { + "epoch": 0.45166015625, + "grad_norm": 1.3348729610443115, + "learning_rate": 6.020983385713601e-06, + "loss": 0.3194, + "step": 925 + }, + { + "epoch": 0.4521484375, + "grad_norm": 1.675868034362793, + "learning_rate": 6.013239415333776e-06, + "loss": 0.338, + "step": 926 + }, + { + "epoch": 0.45263671875, + "grad_norm": 1.5089606046676636, + "learning_rate": 6.005492909514507e-06, + "loss": 0.3502, + "step": 927 + }, + { + "epoch": 0.453125, + "grad_norm": 1.6367465257644653, + "learning_rate": 5.997743887639959e-06, + "loss": 0.3356, + "step": 928 + }, + { + "epoch": 0.45361328125, + "grad_norm": 1.5445111989974976, + "learning_rate": 5.989992369100586e-06, + "loss": 0.3192, + "step": 929 + }, + { + "epoch": 0.4541015625, + "grad_norm": 1.2671817541122437, + "learning_rate": 5.982238373293093e-06, + "loss": 0.3282, + "step": 930 + }, + { + "epoch": 0.45458984375, + "grad_norm": 1.2266660928726196, + "learning_rate": 5.974481919620386e-06, + "loss": 0.3202, + "step": 931 + }, + { + "epoch": 0.455078125, + "grad_norm": 1.5652544498443604, + "learning_rate": 5.966723027491518e-06, + "loss": 0.3502, + "step": 932 + }, + { + "epoch": 0.45556640625, + "grad_norm": 1.2947496175765991, + "learning_rate": 5.958961716321644e-06, + "loss": 0.317, + "step": 933 + }, + { + "epoch": 0.4560546875, + "grad_norm": 2.053834915161133, + "learning_rate": 5.951198005531974e-06, + "loss": 0.308, + "step": 934 + }, + { + "epoch": 0.45654296875, + "grad_norm": 2.342907428741455, + "learning_rate": 5.943431914549721e-06, + "loss": 0.3314, + "step": 935 + }, + { + "epoch": 0.45703125, + "grad_norm": 1.5535999536514282, + "learning_rate": 5.9356634628080555e-06, + "loss": 0.3362, + "step": 936 + }, + { + "epoch": 0.45751953125, + "grad_norm": 1.607968807220459, + "learning_rate": 5.927892669746054e-06, + "loss": 0.317, + "step": 937 + }, + { + "epoch": 0.4580078125, + "grad_norm": 1.268129825592041, + "learning_rate": 5.920119554808651e-06, + "loss": 0.3278, + "step": 938 + }, + { + "epoch": 0.45849609375, + "grad_norm": 4.848256587982178, + "learning_rate": 5.912344137446593e-06, + "loss": 0.3448, + "step": 939 + }, + { + "epoch": 0.458984375, + "grad_norm": 1.1670955419540405, + "learning_rate": 5.904566437116388e-06, + "loss": 0.2967, + "step": 940 + }, + { + "epoch": 0.45947265625, + "grad_norm": 2.250368595123291, + "learning_rate": 5.896786473280255e-06, + "loss": 0.32, + "step": 941 + }, + { + "epoch": 0.4599609375, + "grad_norm": 1.5156008005142212, + "learning_rate": 5.889004265406077e-06, + "loss": 0.2914, + "step": 942 + }, + { + "epoch": 0.46044921875, + "grad_norm": 1.0980958938598633, + "learning_rate": 5.8812198329673545e-06, + "loss": 0.304, + "step": 943 + }, + { + "epoch": 0.4609375, + "grad_norm": 1.7652188539505005, + "learning_rate": 5.873433195443152e-06, + "loss": 0.3497, + "step": 944 + }, + { + "epoch": 0.46142578125, + "grad_norm": 1.977793574333191, + "learning_rate": 5.865644372318053e-06, + "loss": 0.3598, + "step": 945 + }, + { + "epoch": 0.4619140625, + "grad_norm": 1.490369200706482, + "learning_rate": 5.857853383082112e-06, + "loss": 0.3433, + "step": 946 + }, + { + "epoch": 0.46240234375, + "grad_norm": 5.214506149291992, + "learning_rate": 5.8500602472307974e-06, + "loss": 0.3506, + "step": 947 + }, + { + "epoch": 0.462890625, + "grad_norm": 1.304093837738037, + "learning_rate": 5.842264984264958e-06, + "loss": 0.3035, + "step": 948 + }, + { + "epoch": 0.46337890625, + "grad_norm": 1.2441211938858032, + "learning_rate": 5.834467613690759e-06, + "loss": 0.3308, + "step": 949 + }, + { + "epoch": 0.4638671875, + "grad_norm": 1.0881738662719727, + "learning_rate": 5.82666815501964e-06, + "loss": 0.3163, + "step": 950 + }, + { + "epoch": 0.46435546875, + "grad_norm": 1.4398066997528076, + "learning_rate": 5.8188666277682695e-06, + "loss": 0.327, + "step": 951 + }, + { + "epoch": 0.46484375, + "grad_norm": 1.81572425365448, + "learning_rate": 5.8110630514584854e-06, + "loss": 0.3328, + "step": 952 + }, + { + "epoch": 0.46533203125, + "grad_norm": 1.5575212240219116, + "learning_rate": 5.803257445617263e-06, + "loss": 0.3495, + "step": 953 + }, + { + "epoch": 0.4658203125, + "grad_norm": 1.3975605964660645, + "learning_rate": 5.795449829776645e-06, + "loss": 0.3448, + "step": 954 + }, + { + "epoch": 0.46630859375, + "grad_norm": 1.2950125932693481, + "learning_rate": 5.787640223473713e-06, + "loss": 0.3617, + "step": 955 + }, + { + "epoch": 0.466796875, + "grad_norm": 1.3984689712524414, + "learning_rate": 5.779828646250522e-06, + "loss": 0.3608, + "step": 956 + }, + { + "epoch": 0.46728515625, + "grad_norm": 1.0765591859817505, + "learning_rate": 5.772015117654065e-06, + "loss": 0.3093, + "step": 957 + }, + { + "epoch": 0.4677734375, + "grad_norm": 1.5954604148864746, + "learning_rate": 5.764199657236214e-06, + "loss": 0.3504, + "step": 958 + }, + { + "epoch": 0.46826171875, + "grad_norm": 1.6604746580123901, + "learning_rate": 5.756382284553675e-06, + "loss": 0.3096, + "step": 959 + }, + { + "epoch": 0.46875, + "grad_norm": 1.3618206977844238, + "learning_rate": 5.7485630191679456e-06, + "loss": 0.3057, + "step": 960 + }, + { + "epoch": 0.46923828125, + "grad_norm": 1.217523217201233, + "learning_rate": 5.740741880645248e-06, + "loss": 0.3708, + "step": 961 + }, + { + "epoch": 0.4697265625, + "grad_norm": 1.2130963802337646, + "learning_rate": 5.7329188885565e-06, + "loss": 0.321, + "step": 962 + }, + { + "epoch": 0.47021484375, + "grad_norm": 1.3064903020858765, + "learning_rate": 5.725094062477256e-06, + "loss": 0.3211, + "step": 963 + }, + { + "epoch": 0.470703125, + "grad_norm": 1.5063132047653198, + "learning_rate": 5.717267421987659e-06, + "loss": 0.3307, + "step": 964 + }, + { + "epoch": 0.47119140625, + "grad_norm": 1.3585816621780396, + "learning_rate": 5.7094389866723905e-06, + "loss": 0.3631, + "step": 965 + }, + { + "epoch": 0.4716796875, + "grad_norm": 1.5815399885177612, + "learning_rate": 5.701608776120627e-06, + "loss": 0.352, + "step": 966 + }, + { + "epoch": 0.47216796875, + "grad_norm": 1.4560235738754272, + "learning_rate": 5.6937768099259845e-06, + "loss": 0.3109, + "step": 967 + }, + { + "epoch": 0.47265625, + "grad_norm": 1.8057149648666382, + "learning_rate": 5.685943107686476e-06, + "loss": 0.3218, + "step": 968 + }, + { + "epoch": 0.47314453125, + "grad_norm": 1.4362132549285889, + "learning_rate": 5.678107689004449e-06, + "loss": 0.3293, + "step": 969 + }, + { + "epoch": 0.4736328125, + "grad_norm": 2.0112991333007812, + "learning_rate": 5.670270573486555e-06, + "loss": 0.356, + "step": 970 + }, + { + "epoch": 0.47412109375, + "grad_norm": 1.2395293712615967, + "learning_rate": 5.662431780743691e-06, + "loss": 0.3439, + "step": 971 + }, + { + "epoch": 0.474609375, + "grad_norm": 1.4867768287658691, + "learning_rate": 5.6545913303909495e-06, + "loss": 0.3767, + "step": 972 + }, + { + "epoch": 0.47509765625, + "grad_norm": 1.210928201675415, + "learning_rate": 5.646749242047567e-06, + "loss": 0.3259, + "step": 973 + }, + { + "epoch": 0.4755859375, + "grad_norm": 1.157676100730896, + "learning_rate": 5.6389055353368826e-06, + "loss": 0.336, + "step": 974 + }, + { + "epoch": 0.47607421875, + "grad_norm": 1.485719919204712, + "learning_rate": 5.631060229886287e-06, + "loss": 0.3121, + "step": 975 + }, + { + "epoch": 0.4765625, + "grad_norm": 1.1137949228286743, + "learning_rate": 5.6232133453271676e-06, + "loss": 0.3362, + "step": 976 + }, + { + "epoch": 0.47705078125, + "grad_norm": 1.213346004486084, + "learning_rate": 5.615364901294863e-06, + "loss": 0.3194, + "step": 977 + }, + { + "epoch": 0.4775390625, + "grad_norm": 1.3590606451034546, + "learning_rate": 5.607514917428618e-06, + "loss": 0.3484, + "step": 978 + }, + { + "epoch": 0.47802734375, + "grad_norm": 2.0311455726623535, + "learning_rate": 5.599663413371527e-06, + "loss": 0.3419, + "step": 979 + }, + { + "epoch": 0.478515625, + "grad_norm": 1.195672869682312, + "learning_rate": 5.5918104087704925e-06, + "loss": 0.339, + "step": 980 + }, + { + "epoch": 0.47900390625, + "grad_norm": 1.8912562131881714, + "learning_rate": 5.583955923276163e-06, + "loss": 0.3427, + "step": 981 + }, + { + "epoch": 0.4794921875, + "grad_norm": 2.002305030822754, + "learning_rate": 5.576099976542904e-06, + "loss": 0.3595, + "step": 982 + }, + { + "epoch": 0.47998046875, + "grad_norm": 1.4438331127166748, + "learning_rate": 5.56824258822873e-06, + "loss": 0.3632, + "step": 983 + }, + { + "epoch": 0.48046875, + "grad_norm": 1.366222620010376, + "learning_rate": 5.560383777995264e-06, + "loss": 0.3188, + "step": 984 + }, + { + "epoch": 0.48095703125, + "grad_norm": 1.3330532312393188, + "learning_rate": 5.552523565507689e-06, + "loss": 0.3262, + "step": 985 + }, + { + "epoch": 0.4814453125, + "grad_norm": 1.5084117650985718, + "learning_rate": 5.544661970434696e-06, + "loss": 0.325, + "step": 986 + }, + { + "epoch": 0.48193359375, + "grad_norm": 1.0425949096679688, + "learning_rate": 5.536799012448435e-06, + "loss": 0.315, + "step": 987 + }, + { + "epoch": 0.482421875, + "grad_norm": 2.695110559463501, + "learning_rate": 5.528934711224467e-06, + "loss": 0.3166, + "step": 988 + }, + { + "epoch": 0.48291015625, + "grad_norm": 1.3446696996688843, + "learning_rate": 5.521069086441715e-06, + "loss": 0.3437, + "step": 989 + }, + { + "epoch": 0.4833984375, + "grad_norm": 1.360203742980957, + "learning_rate": 5.513202157782411e-06, + "loss": 0.3472, + "step": 990 + }, + { + "epoch": 0.48388671875, + "grad_norm": 1.3492072820663452, + "learning_rate": 5.505333944932053e-06, + "loss": 0.3363, + "step": 991 + }, + { + "epoch": 0.484375, + "grad_norm": 1.1588752269744873, + "learning_rate": 5.497464467579351e-06, + "loss": 0.338, + "step": 992 + }, + { + "epoch": 0.48486328125, + "grad_norm": 1.4233770370483398, + "learning_rate": 5.48959374541618e-06, + "loss": 0.336, + "step": 993 + }, + { + "epoch": 0.4853515625, + "grad_norm": 1.3421063423156738, + "learning_rate": 5.4817217981375286e-06, + "loss": 0.324, + "step": 994 + }, + { + "epoch": 0.48583984375, + "grad_norm": 1.6678565740585327, + "learning_rate": 5.473848645441452e-06, + "loss": 0.3189, + "step": 995 + }, + { + "epoch": 0.486328125, + "grad_norm": 1.912955641746521, + "learning_rate": 5.465974307029021e-06, + "loss": 0.3643, + "step": 996 + }, + { + "epoch": 0.48681640625, + "grad_norm": 2.0670387744903564, + "learning_rate": 5.458098802604273e-06, + "loss": 0.332, + "step": 997 + }, + { + "epoch": 0.4873046875, + "grad_norm": 2.6159446239471436, + "learning_rate": 5.450222151874166e-06, + "loss": 0.3674, + "step": 998 + }, + { + "epoch": 0.48779296875, + "grad_norm": 1.3627862930297852, + "learning_rate": 5.442344374548524e-06, + "loss": 0.3496, + "step": 999 + }, + { + "epoch": 0.48828125, + "grad_norm": 1.4907851219177246, + "learning_rate": 5.43446549033999e-06, + "loss": 0.3475, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 2048, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.000488202654974e+18, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}