diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,45823 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.998968008255934, + "eval_steps": 500, + "global_step": 6540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00045866299736268775, + "grad_norm": 11.983423233032227, + "learning_rate": 5.076142131979695e-06, + "loss": 3.3181, + "step": 1 + }, + { + "epoch": 0.0009173259947253755, + "grad_norm": 20.939693450927734, + "learning_rate": 1.015228426395939e-05, + "loss": 4.644, + "step": 2 + }, + { + "epoch": 0.0013759889920880633, + "grad_norm": 15.526450157165527, + "learning_rate": 1.5228426395939088e-05, + "loss": 3.3677, + "step": 3 + }, + { + "epoch": 0.001834651989450751, + "grad_norm": 15.404317855834961, + "learning_rate": 2.030456852791878e-05, + "loss": 4.1666, + "step": 4 + }, + { + "epoch": 0.002293314986813439, + "grad_norm": 7.165552616119385, + "learning_rate": 2.5380710659898476e-05, + "loss": 2.5469, + "step": 5 + }, + { + "epoch": 0.0027519779841761265, + "grad_norm": 2.2743093967437744, + "learning_rate": 3.0456852791878175e-05, + "loss": 1.4774, + "step": 6 + }, + { + "epoch": 0.0032106409815388145, + "grad_norm": 12.052650451660156, + "learning_rate": 3.553299492385787e-05, + "loss": 3.9692, + "step": 7 + }, + { + "epoch": 0.003669303978901502, + "grad_norm": 3.099673271179199, + "learning_rate": 4.060913705583756e-05, + "loss": 1.4465, + "step": 8 + }, + { + "epoch": 0.0041279669762641896, + "grad_norm": 5.82925271987915, + "learning_rate": 4.568527918781726e-05, + "loss": 3.3387, + "step": 9 + }, + { + "epoch": 0.004586629973626878, + "grad_norm": 4.784450531005859, + "learning_rate": 5.076142131979695e-05, + "loss": 2.6028, + "step": 10 + }, + { + "epoch": 0.0050452929709895655, + "grad_norm": 6.859081745147705, + "learning_rate": 5.583756345177665e-05, + "loss": 3.8062, + "step": 11 + }, + { + "epoch": 0.005503955968352253, + "grad_norm": 2.7776687145233154, + "learning_rate": 6.091370558375635e-05, + "loss": 2.3401, + "step": 12 + }, + { + "epoch": 0.0059626189657149406, + "grad_norm": 3.001207113265991, + "learning_rate": 6.598984771573605e-05, + "loss": 2.3286, + "step": 13 + }, + { + "epoch": 0.006421281963077629, + "grad_norm": 4.640460968017578, + "learning_rate": 7.106598984771574e-05, + "loss": 2.7322, + "step": 14 + }, + { + "epoch": 0.0068799449604403165, + "grad_norm": 3.0203094482421875, + "learning_rate": 7.614213197969543e-05, + "loss": 1.9691, + "step": 15 + }, + { + "epoch": 0.007338607957803004, + "grad_norm": 4.914385795593262, + "learning_rate": 8.121827411167512e-05, + "loss": 3.811, + "step": 16 + }, + { + "epoch": 0.007797270955165692, + "grad_norm": 1.8599748611450195, + "learning_rate": 8.629441624365482e-05, + "loss": 1.743, + "step": 17 + }, + { + "epoch": 0.008255933952528379, + "grad_norm": 4.672982215881348, + "learning_rate": 9.137055837563452e-05, + "loss": 2.9532, + "step": 18 + }, + { + "epoch": 0.008714596949891068, + "grad_norm": 2.165299654006958, + "learning_rate": 9.644670050761421e-05, + "loss": 2.6724, + "step": 19 + }, + { + "epoch": 0.009173259947253756, + "grad_norm": 2.6288881301879883, + "learning_rate": 0.0001015228426395939, + "loss": 3.1768, + "step": 20 + }, + { + "epoch": 0.009631922944616443, + "grad_norm": 3.7706544399261475, + "learning_rate": 0.00010659898477157361, + "loss": 2.6769, + "step": 21 + }, + { + "epoch": 0.010090585941979131, + "grad_norm": 1.690014362335205, + "learning_rate": 0.0001116751269035533, + "loss": 2.0408, + "step": 22 + }, + { + "epoch": 0.010549248939341819, + "grad_norm": 1.388680100440979, + "learning_rate": 0.000116751269035533, + "loss": 1.7454, + "step": 23 + }, + { + "epoch": 0.011007911936704506, + "grad_norm": 1.0925803184509277, + "learning_rate": 0.0001218274111675127, + "loss": 1.7225, + "step": 24 + }, + { + "epoch": 0.011466574934067194, + "grad_norm": 1.5495200157165527, + "learning_rate": 0.0001269035532994924, + "loss": 2.7841, + "step": 25 + }, + { + "epoch": 0.011925237931429881, + "grad_norm": 0.8375623822212219, + "learning_rate": 0.0001319796954314721, + "loss": 1.4273, + "step": 26 + }, + { + "epoch": 0.01238390092879257, + "grad_norm": 1.5786348581314087, + "learning_rate": 0.00013705583756345178, + "loss": 2.4528, + "step": 27 + }, + { + "epoch": 0.012842563926155258, + "grad_norm": 0.9938832521438599, + "learning_rate": 0.00014213197969543148, + "loss": 1.9099, + "step": 28 + }, + { + "epoch": 0.013301226923517945, + "grad_norm": 1.1450473070144653, + "learning_rate": 0.00014720812182741116, + "loss": 2.4707, + "step": 29 + }, + { + "epoch": 0.013759889920880633, + "grad_norm": 0.5717598795890808, + "learning_rate": 0.00015228426395939087, + "loss": 1.4649, + "step": 30 + }, + { + "epoch": 0.01421855291824332, + "grad_norm": 1.1892627477645874, + "learning_rate": 0.00015736040609137057, + "loss": 2.762, + "step": 31 + }, + { + "epoch": 0.014677215915606008, + "grad_norm": 0.8061103224754333, + "learning_rate": 0.00016243654822335025, + "loss": 2.5979, + "step": 32 + }, + { + "epoch": 0.015135878912968696, + "grad_norm": 1.37034010887146, + "learning_rate": 0.00016751269035532995, + "loss": 2.8462, + "step": 33 + }, + { + "epoch": 0.015594541910331383, + "grad_norm": 5.973114967346191, + "learning_rate": 0.00017258883248730963, + "loss": 1.8631, + "step": 34 + }, + { + "epoch": 0.01605320490769407, + "grad_norm": 1.1910300254821777, + "learning_rate": 0.00017766497461928934, + "loss": 2.1458, + "step": 35 + }, + { + "epoch": 0.016511867905056758, + "grad_norm": 0.8178476691246033, + "learning_rate": 0.00018274111675126904, + "loss": 2.6722, + "step": 36 + }, + { + "epoch": 0.016970530902419446, + "grad_norm": 0.6606747508049011, + "learning_rate": 0.00018781725888324875, + "loss": 2.0547, + "step": 37 + }, + { + "epoch": 0.017429193899782137, + "grad_norm": 0.5291973352432251, + "learning_rate": 0.00019289340101522843, + "loss": 1.2715, + "step": 38 + }, + { + "epoch": 0.017887856897144824, + "grad_norm": 1.221416711807251, + "learning_rate": 0.00019796954314720813, + "loss": 3.2524, + "step": 39 + }, + { + "epoch": 0.018346519894507512, + "grad_norm": 0.751660168170929, + "learning_rate": 0.0002030456852791878, + "loss": 2.2274, + "step": 40 + }, + { + "epoch": 0.0188051828918702, + "grad_norm": 0.6112833023071289, + "learning_rate": 0.00020812182741116754, + "loss": 1.6378, + "step": 41 + }, + { + "epoch": 0.019263845889232887, + "grad_norm": 0.40640926361083984, + "learning_rate": 0.00021319796954314722, + "loss": 1.395, + "step": 42 + }, + { + "epoch": 0.019722508886595574, + "grad_norm": 0.3838483989238739, + "learning_rate": 0.0002182741116751269, + "loss": 1.3842, + "step": 43 + }, + { + "epoch": 0.020181171883958262, + "grad_norm": 0.5843013525009155, + "learning_rate": 0.0002233502538071066, + "loss": 2.1636, + "step": 44 + }, + { + "epoch": 0.02063983488132095, + "grad_norm": 0.5792063474655151, + "learning_rate": 0.00022842639593908628, + "loss": 2.4495, + "step": 45 + }, + { + "epoch": 0.021098497878683637, + "grad_norm": 0.48895737528800964, + "learning_rate": 0.000233502538071066, + "loss": 2.0795, + "step": 46 + }, + { + "epoch": 0.021557160876046325, + "grad_norm": 0.6037505865097046, + "learning_rate": 0.0002385786802030457, + "loss": 2.6077, + "step": 47 + }, + { + "epoch": 0.022015823873409012, + "grad_norm": 0.11669456958770752, + "learning_rate": 0.0002436548223350254, + "loss": 0.535, + "step": 48 + }, + { + "epoch": 0.0224744868707717, + "grad_norm": 0.5221993327140808, + "learning_rate": 0.0002487309644670051, + "loss": 1.9393, + "step": 49 + }, + { + "epoch": 0.022933149868134387, + "grad_norm": 0.6841984391212463, + "learning_rate": 0.0002538071065989848, + "loss": 2.3887, + "step": 50 + }, + { + "epoch": 0.023391812865497075, + "grad_norm": 0.5550611019134521, + "learning_rate": 0.0002588832487309645, + "loss": 2.0587, + "step": 51 + }, + { + "epoch": 0.023850475862859762, + "grad_norm": 0.5314993858337402, + "learning_rate": 0.0002639593908629442, + "loss": 2.7202, + "step": 52 + }, + { + "epoch": 0.024309138860222453, + "grad_norm": 0.6494979858398438, + "learning_rate": 0.00026903553299492385, + "loss": 2.7383, + "step": 53 + }, + { + "epoch": 0.02476780185758514, + "grad_norm": 0.6311330795288086, + "learning_rate": 0.00027411167512690355, + "loss": 2.7266, + "step": 54 + }, + { + "epoch": 0.02522646485494783, + "grad_norm": 0.6023324728012085, + "learning_rate": 0.00027918781725888326, + "loss": 2.8926, + "step": 55 + }, + { + "epoch": 0.025685127852310516, + "grad_norm": 0.4306228458881378, + "learning_rate": 0.00028426395939086296, + "loss": 1.8117, + "step": 56 + }, + { + "epoch": 0.026143790849673203, + "grad_norm": 0.4600723385810852, + "learning_rate": 0.0002893401015228426, + "loss": 1.4924, + "step": 57 + }, + { + "epoch": 0.02660245384703589, + "grad_norm": 0.5145777463912964, + "learning_rate": 0.0002944162436548223, + "loss": 1.8133, + "step": 58 + }, + { + "epoch": 0.02706111684439858, + "grad_norm": 0.4451518654823303, + "learning_rate": 0.000299492385786802, + "loss": 1.453, + "step": 59 + }, + { + "epoch": 0.027519779841761266, + "grad_norm": 0.47992944717407227, + "learning_rate": 0.00030456852791878173, + "loss": 2.0383, + "step": 60 + }, + { + "epoch": 0.027978442839123954, + "grad_norm": 0.49620696902275085, + "learning_rate": 0.00030964467005076144, + "loss": 1.7496, + "step": 61 + }, + { + "epoch": 0.02843710583648664, + "grad_norm": 0.6371163725852966, + "learning_rate": 0.00031472081218274114, + "loss": 2.4441, + "step": 62 + }, + { + "epoch": 0.02889576883384933, + "grad_norm": 0.38818782567977905, + "learning_rate": 0.00031979695431472085, + "loss": 1.4037, + "step": 63 + }, + { + "epoch": 0.029354431831212016, + "grad_norm": 1.4600757360458374, + "learning_rate": 0.0003248730964467005, + "loss": 2.6814, + "step": 64 + }, + { + "epoch": 0.029813094828574704, + "grad_norm": 0.394088476896286, + "learning_rate": 0.0003299492385786802, + "loss": 1.3568, + "step": 65 + }, + { + "epoch": 0.03027175782593739, + "grad_norm": 1.3067290782928467, + "learning_rate": 0.0003350253807106599, + "loss": 2.6444, + "step": 66 + }, + { + "epoch": 0.03073042082330008, + "grad_norm": 1.0073161125183105, + "learning_rate": 0.0003401015228426396, + "loss": 2.074, + "step": 67 + }, + { + "epoch": 0.031189083820662766, + "grad_norm": 0.4958036243915558, + "learning_rate": 0.00034517766497461927, + "loss": 1.9275, + "step": 68 + }, + { + "epoch": 0.03164774681802546, + "grad_norm": 0.6386379599571228, + "learning_rate": 0.00035025380710659897, + "loss": 2.5089, + "step": 69 + }, + { + "epoch": 0.03210640981538814, + "grad_norm": 0.3506165146827698, + "learning_rate": 0.0003553299492385787, + "loss": 1.1556, + "step": 70 + }, + { + "epoch": 0.03256507281275083, + "grad_norm": 0.511293888092041, + "learning_rate": 0.0003604060913705584, + "loss": 1.7764, + "step": 71 + }, + { + "epoch": 0.033023735810113516, + "grad_norm": 0.5064987540245056, + "learning_rate": 0.0003654822335025381, + "loss": 2.0087, + "step": 72 + }, + { + "epoch": 0.03348239880747621, + "grad_norm": 0.6523762345314026, + "learning_rate": 0.0003705583756345178, + "loss": 2.1339, + "step": 73 + }, + { + "epoch": 0.03394106180483889, + "grad_norm": 0.7642560005187988, + "learning_rate": 0.0003756345177664975, + "loss": 2.7424, + "step": 74 + }, + { + "epoch": 0.03439972480220158, + "grad_norm": 0.8428514003753662, + "learning_rate": 0.00038071065989847715, + "loss": 3.062, + "step": 75 + }, + { + "epoch": 0.034858387799564274, + "grad_norm": 0.5961275696754456, + "learning_rate": 0.00038578680203045685, + "loss": 1.8469, + "step": 76 + }, + { + "epoch": 0.03531705079692696, + "grad_norm": 0.7156499624252319, + "learning_rate": 0.00039086294416243656, + "loss": 2.3454, + "step": 77 + }, + { + "epoch": 0.03577571379428965, + "grad_norm": 0.4211914837360382, + "learning_rate": 0.00039593908629441627, + "loss": 1.3773, + "step": 78 + }, + { + "epoch": 0.03623437679165233, + "grad_norm": 0.25154200196266174, + "learning_rate": 0.0004010152284263959, + "loss": 0.834, + "step": 79 + }, + { + "epoch": 0.036693039789015024, + "grad_norm": 0.7084411978721619, + "learning_rate": 0.0004060913705583756, + "loss": 2.5184, + "step": 80 + }, + { + "epoch": 0.03715170278637771, + "grad_norm": 0.18035106360912323, + "learning_rate": 0.00041116751269035533, + "loss": 0.7498, + "step": 81 + }, + { + "epoch": 0.0376103657837404, + "grad_norm": 0.6386528611183167, + "learning_rate": 0.0004162436548223351, + "loss": 2.762, + "step": 82 + }, + { + "epoch": 0.03806902878110308, + "grad_norm": 0.5148580074310303, + "learning_rate": 0.00042131979695431474, + "loss": 2.1558, + "step": 83 + }, + { + "epoch": 0.038527691778465774, + "grad_norm": 0.5644561052322388, + "learning_rate": 0.00042639593908629444, + "loss": 2.4407, + "step": 84 + }, + { + "epoch": 0.03898635477582846, + "grad_norm": 0.21473997831344604, + "learning_rate": 0.00043147208121827415, + "loss": 0.9734, + "step": 85 + }, + { + "epoch": 0.03944501777319115, + "grad_norm": 0.29117411375045776, + "learning_rate": 0.0004365482233502538, + "loss": 0.8773, + "step": 86 + }, + { + "epoch": 0.03990368077055383, + "grad_norm": 0.5848171710968018, + "learning_rate": 0.0004416243654822335, + "loss": 1.5979, + "step": 87 + }, + { + "epoch": 0.040362343767916524, + "grad_norm": 0.7745836973190308, + "learning_rate": 0.0004467005076142132, + "loss": 2.6626, + "step": 88 + }, + { + "epoch": 0.04082100676527921, + "grad_norm": 0.6489730477333069, + "learning_rate": 0.0004517766497461929, + "loss": 2.2454, + "step": 89 + }, + { + "epoch": 0.0412796697626419, + "grad_norm": 0.38306012749671936, + "learning_rate": 0.00045685279187817257, + "loss": 1.1562, + "step": 90 + }, + { + "epoch": 0.04173833276000459, + "grad_norm": 0.6314185261726379, + "learning_rate": 0.0004619289340101523, + "loss": 2.3352, + "step": 91 + }, + { + "epoch": 0.042196995757367274, + "grad_norm": 0.5400259494781494, + "learning_rate": 0.000467005076142132, + "loss": 2.261, + "step": 92 + }, + { + "epoch": 0.042655658754729965, + "grad_norm": 0.5282353758811951, + "learning_rate": 0.00047208121827411174, + "loss": 1.9561, + "step": 93 + }, + { + "epoch": 0.04311432175209265, + "grad_norm": 0.6686978936195374, + "learning_rate": 0.0004771573604060914, + "loss": 2.3158, + "step": 94 + }, + { + "epoch": 0.04357298474945534, + "grad_norm": 0.5369504690170288, + "learning_rate": 0.0004822335025380711, + "loss": 2.0291, + "step": 95 + }, + { + "epoch": 0.044031647746818024, + "grad_norm": 0.34599074721336365, + "learning_rate": 0.0004873096446700508, + "loss": 1.1915, + "step": 96 + }, + { + "epoch": 0.044490310744180715, + "grad_norm": 0.45278364419937134, + "learning_rate": 0.0004923857868020305, + "loss": 2.0, + "step": 97 + }, + { + "epoch": 0.0449489737415434, + "grad_norm": 0.7745117545127869, + "learning_rate": 0.0004974619289340102, + "loss": 2.5948, + "step": 98 + }, + { + "epoch": 0.04540763673890609, + "grad_norm": 0.3228628635406494, + "learning_rate": 0.0005025380710659899, + "loss": 1.236, + "step": 99 + }, + { + "epoch": 0.045866299736268774, + "grad_norm": 0.49457353353500366, + "learning_rate": 0.0005076142131979696, + "loss": 1.9651, + "step": 100 + }, + { + "epoch": 0.046324962733631465, + "grad_norm": 0.6033787727355957, + "learning_rate": 0.0005126903553299493, + "loss": 2.657, + "step": 101 + }, + { + "epoch": 0.04678362573099415, + "grad_norm": 0.529002845287323, + "learning_rate": 0.000517766497461929, + "loss": 2.249, + "step": 102 + }, + { + "epoch": 0.04724228872835684, + "grad_norm": 0.3869992792606354, + "learning_rate": 0.0005228426395939087, + "loss": 1.3741, + "step": 103 + }, + { + "epoch": 0.047700951725719525, + "grad_norm": 0.4783024787902832, + "learning_rate": 0.0005279187817258884, + "loss": 1.8258, + "step": 104 + }, + { + "epoch": 0.048159614723082216, + "grad_norm": 0.5782725214958191, + "learning_rate": 0.0005329949238578681, + "loss": 2.4186, + "step": 105 + }, + { + "epoch": 0.04861827772044491, + "grad_norm": 0.5999312400817871, + "learning_rate": 0.0005380710659898477, + "loss": 2.6179, + "step": 106 + }, + { + "epoch": 0.04907694071780759, + "grad_norm": 0.6067392826080322, + "learning_rate": 0.0005431472081218274, + "loss": 2.1979, + "step": 107 + }, + { + "epoch": 0.04953560371517028, + "grad_norm": 0.5408378839492798, + "learning_rate": 0.0005482233502538071, + "loss": 1.9679, + "step": 108 + }, + { + "epoch": 0.049994266712532966, + "grad_norm": 0.8069329857826233, + "learning_rate": 0.0005532994923857868, + "loss": 2.8796, + "step": 109 + }, + { + "epoch": 0.05045292970989566, + "grad_norm": 0.7999544143676758, + "learning_rate": 0.0005583756345177665, + "loss": 2.3101, + "step": 110 + }, + { + "epoch": 0.05091159270725834, + "grad_norm": 0.8346303701400757, + "learning_rate": 0.0005634517766497462, + "loss": 3.0054, + "step": 111 + }, + { + "epoch": 0.05137025570462103, + "grad_norm": 0.4418480396270752, + "learning_rate": 0.0005685279187817259, + "loss": 1.3883, + "step": 112 + }, + { + "epoch": 0.051828918701983716, + "grad_norm": 0.6058046221733093, + "learning_rate": 0.0005736040609137056, + "loss": 1.7998, + "step": 113 + }, + { + "epoch": 0.05228758169934641, + "grad_norm": 0.4952194392681122, + "learning_rate": 0.0005786802030456852, + "loss": 1.7445, + "step": 114 + }, + { + "epoch": 0.05274624469670909, + "grad_norm": 0.6057190895080566, + "learning_rate": 0.0005837563451776649, + "loss": 1.9283, + "step": 115 + }, + { + "epoch": 0.05320490769407178, + "grad_norm": 0.4411911964416504, + "learning_rate": 0.0005888324873096446, + "loss": 1.3921, + "step": 116 + }, + { + "epoch": 0.053663570691434466, + "grad_norm": 0.7016524076461792, + "learning_rate": 0.0005939086294416243, + "loss": 2.9248, + "step": 117 + }, + { + "epoch": 0.05412223368879716, + "grad_norm": 0.7117334008216858, + "learning_rate": 0.000598984771573604, + "loss": 2.7302, + "step": 118 + }, + { + "epoch": 0.05458089668615984, + "grad_norm": 0.49954333901405334, + "learning_rate": 0.0006040609137055838, + "loss": 1.8414, + "step": 119 + }, + { + "epoch": 0.05503955968352253, + "grad_norm": 0.6712334156036377, + "learning_rate": 0.0006091370558375635, + "loss": 2.4685, + "step": 120 + }, + { + "epoch": 0.05549822268088522, + "grad_norm": 0.37909215688705444, + "learning_rate": 0.0006142131979695432, + "loss": 1.1743, + "step": 121 + }, + { + "epoch": 0.05595688567824791, + "grad_norm": 0.42544159293174744, + "learning_rate": 0.0006192893401015229, + "loss": 1.4775, + "step": 122 + }, + { + "epoch": 0.0564155486756106, + "grad_norm": 0.5652533769607544, + "learning_rate": 0.0006243654822335026, + "loss": 2.1067, + "step": 123 + }, + { + "epoch": 0.05687421167297328, + "grad_norm": 0.4516564607620239, + "learning_rate": 0.0006294416243654823, + "loss": 1.7084, + "step": 124 + }, + { + "epoch": 0.05733287467033597, + "grad_norm": 0.6015462279319763, + "learning_rate": 0.000634517766497462, + "loss": 2.3374, + "step": 125 + }, + { + "epoch": 0.05779153766769866, + "grad_norm": 0.5467323660850525, + "learning_rate": 0.0006395939086294417, + "loss": 2.2559, + "step": 126 + }, + { + "epoch": 0.05825020066506135, + "grad_norm": 0.5923335552215576, + "learning_rate": 0.0006446700507614214, + "loss": 2.4232, + "step": 127 + }, + { + "epoch": 0.05870886366242403, + "grad_norm": 0.6827653050422668, + "learning_rate": 0.000649746192893401, + "loss": 2.7278, + "step": 128 + }, + { + "epoch": 0.05916752665978672, + "grad_norm": 0.30980947613716125, + "learning_rate": 0.0006548223350253807, + "loss": 1.019, + "step": 129 + }, + { + "epoch": 0.05962618965714941, + "grad_norm": 0.6105501651763916, + "learning_rate": 0.0006598984771573604, + "loss": 2.1824, + "step": 130 + }, + { + "epoch": 0.0600848526545121, + "grad_norm": 0.292199969291687, + "learning_rate": 0.0006649746192893401, + "loss": 1.0942, + "step": 131 + }, + { + "epoch": 0.06054351565187478, + "grad_norm": 0.6059619188308716, + "learning_rate": 0.0006700507614213198, + "loss": 2.2701, + "step": 132 + }, + { + "epoch": 0.06100217864923747, + "grad_norm": 0.27790480852127075, + "learning_rate": 0.0006751269035532995, + "loss": 1.1452, + "step": 133 + }, + { + "epoch": 0.06146084164660016, + "grad_norm": 0.47722020745277405, + "learning_rate": 0.0006802030456852792, + "loss": 2.2821, + "step": 134 + }, + { + "epoch": 0.06191950464396285, + "grad_norm": 0.6286266446113586, + "learning_rate": 0.0006852791878172588, + "loss": 2.3686, + "step": 135 + }, + { + "epoch": 0.06237816764132553, + "grad_norm": 0.6163983345031738, + "learning_rate": 0.0006903553299492385, + "loss": 2.3126, + "step": 136 + }, + { + "epoch": 0.06283683063868822, + "grad_norm": 0.5008450150489807, + "learning_rate": 0.0006954314720812182, + "loss": 1.8563, + "step": 137 + }, + { + "epoch": 0.06329549363605091, + "grad_norm": 0.6025516986846924, + "learning_rate": 0.0007005076142131979, + "loss": 2.11, + "step": 138 + }, + { + "epoch": 0.0637541566334136, + "grad_norm": 0.4481363892555237, + "learning_rate": 0.0007055837563451776, + "loss": 1.8222, + "step": 139 + }, + { + "epoch": 0.06421281963077628, + "grad_norm": 0.5932562351226807, + "learning_rate": 0.0007106598984771574, + "loss": 2.28, + "step": 140 + }, + { + "epoch": 0.06467148262813897, + "grad_norm": 0.3764328062534332, + "learning_rate": 0.0007157360406091371, + "loss": 1.6057, + "step": 141 + }, + { + "epoch": 0.06513014562550166, + "grad_norm": 0.5842316746711731, + "learning_rate": 0.0007208121827411168, + "loss": 2.1699, + "step": 142 + }, + { + "epoch": 0.06558880862286436, + "grad_norm": 0.3900845944881439, + "learning_rate": 0.0007258883248730965, + "loss": 1.5842, + "step": 143 + }, + { + "epoch": 0.06604747162022703, + "grad_norm": 0.19777247309684753, + "learning_rate": 0.0007309644670050762, + "loss": 0.7936, + "step": 144 + }, + { + "epoch": 0.06650613461758972, + "grad_norm": 0.1068076640367508, + "learning_rate": 0.0007360406091370559, + "loss": 0.4987, + "step": 145 + }, + { + "epoch": 0.06696479761495241, + "grad_norm": 0.5474305152893066, + "learning_rate": 0.0007411167512690356, + "loss": 2.032, + "step": 146 + }, + { + "epoch": 0.0674234606123151, + "grad_norm": 0.6003263592720032, + "learning_rate": 0.0007461928934010153, + "loss": 2.3611, + "step": 147 + }, + { + "epoch": 0.06788212360967778, + "grad_norm": 0.3899264633655548, + "learning_rate": 0.000751269035532995, + "loss": 1.4559, + "step": 148 + }, + { + "epoch": 0.06834078660704047, + "grad_norm": 0.4279090464115143, + "learning_rate": 0.0007563451776649747, + "loss": 1.479, + "step": 149 + }, + { + "epoch": 0.06879944960440317, + "grad_norm": 0.4912429749965668, + "learning_rate": 0.0007614213197969543, + "loss": 1.6797, + "step": 150 + }, + { + "epoch": 0.06925811260176586, + "grad_norm": 0.23800012469291687, + "learning_rate": 0.000766497461928934, + "loss": 0.8729, + "step": 151 + }, + { + "epoch": 0.06971677559912855, + "grad_norm": 0.6027427315711975, + "learning_rate": 0.0007715736040609137, + "loss": 1.8105, + "step": 152 + }, + { + "epoch": 0.07017543859649122, + "grad_norm": 0.6931683421134949, + "learning_rate": 0.0007766497461928934, + "loss": 2.2175, + "step": 153 + }, + { + "epoch": 0.07063410159385392, + "grad_norm": 0.4811568558216095, + "learning_rate": 0.0007817258883248731, + "loss": 1.689, + "step": 154 + }, + { + "epoch": 0.0710927645912166, + "grad_norm": 0.6901246905326843, + "learning_rate": 0.0007868020304568528, + "loss": 1.6627, + "step": 155 + }, + { + "epoch": 0.0715514275885793, + "grad_norm": 12.634415626525879, + "learning_rate": 0.0007918781725888325, + "loss": 2.9146, + "step": 156 + }, + { + "epoch": 0.07201009058594197, + "grad_norm": 0.3894835114479065, + "learning_rate": 0.0007969543147208121, + "loss": 1.461, + "step": 157 + }, + { + "epoch": 0.07246875358330467, + "grad_norm": 0.4583110511302948, + "learning_rate": 0.0008020304568527918, + "loss": 1.3597, + "step": 158 + }, + { + "epoch": 0.07292741658066736, + "grad_norm": 0.5084859728813171, + "learning_rate": 0.0008071065989847715, + "loss": 2.131, + "step": 159 + }, + { + "epoch": 0.07338607957803005, + "grad_norm": 0.6448908448219299, + "learning_rate": 0.0008121827411167512, + "loss": 2.3026, + "step": 160 + }, + { + "epoch": 0.07384474257539272, + "grad_norm": 0.4896964728832245, + "learning_rate": 0.000817258883248731, + "loss": 2.0397, + "step": 161 + }, + { + "epoch": 0.07430340557275542, + "grad_norm": 0.6146819591522217, + "learning_rate": 0.0008223350253807107, + "loss": 2.1047, + "step": 162 + }, + { + "epoch": 0.0747620685701181, + "grad_norm": 0.2950705587863922, + "learning_rate": 0.0008274111675126904, + "loss": 1.1177, + "step": 163 + }, + { + "epoch": 0.0752207315674808, + "grad_norm": 0.39602458477020264, + "learning_rate": 0.0008324873096446702, + "loss": 1.6326, + "step": 164 + }, + { + "epoch": 0.07567939456484347, + "grad_norm": 0.46519267559051514, + "learning_rate": 0.0008375634517766498, + "loss": 1.7606, + "step": 165 + }, + { + "epoch": 0.07613805756220617, + "grad_norm": 0.31276392936706543, + "learning_rate": 0.0008426395939086295, + "loss": 1.2241, + "step": 166 + }, + { + "epoch": 0.07659672055956886, + "grad_norm": 0.48499414324760437, + "learning_rate": 0.0008477157360406092, + "loss": 1.9176, + "step": 167 + }, + { + "epoch": 0.07705538355693155, + "grad_norm": 0.43687963485717773, + "learning_rate": 0.0008527918781725889, + "loss": 1.6424, + "step": 168 + }, + { + "epoch": 0.07751404655429424, + "grad_norm": 0.4287634491920471, + "learning_rate": 0.0008578680203045686, + "loss": 1.8438, + "step": 169 + }, + { + "epoch": 0.07797270955165692, + "grad_norm": 0.28460821509361267, + "learning_rate": 0.0008629441624365483, + "loss": 1.1942, + "step": 170 + }, + { + "epoch": 0.0784313725490196, + "grad_norm": 0.30606964230537415, + "learning_rate": 0.000868020304568528, + "loss": 1.1019, + "step": 171 + }, + { + "epoch": 0.0788900355463823, + "grad_norm": 0.38955485820770264, + "learning_rate": 0.0008730964467005076, + "loss": 1.7985, + "step": 172 + }, + { + "epoch": 0.07934869854374499, + "grad_norm": 0.41820240020751953, + "learning_rate": 0.0008781725888324873, + "loss": 1.6667, + "step": 173 + }, + { + "epoch": 0.07980736154110767, + "grad_norm": 0.47719693183898926, + "learning_rate": 0.000883248730964467, + "loss": 2.0426, + "step": 174 + }, + { + "epoch": 0.08026602453847036, + "grad_norm": 0.49073362350463867, + "learning_rate": 0.0008883248730964467, + "loss": 2.0519, + "step": 175 + }, + { + "epoch": 0.08072468753583305, + "grad_norm": 0.4831169843673706, + "learning_rate": 0.0008934010152284264, + "loss": 2.511, + "step": 176 + }, + { + "epoch": 0.08118335053319574, + "grad_norm": 0.549760103225708, + "learning_rate": 0.0008984771573604061, + "loss": 2.2359, + "step": 177 + }, + { + "epoch": 0.08164201353055842, + "grad_norm": 0.5303396582603455, + "learning_rate": 0.0009035532994923858, + "loss": 2.5146, + "step": 178 + }, + { + "epoch": 0.08210067652792111, + "grad_norm": 0.47503864765167236, + "learning_rate": 0.0009086294416243654, + "loss": 2.0219, + "step": 179 + }, + { + "epoch": 0.0825593395252838, + "grad_norm": 0.4175845682621002, + "learning_rate": 0.0009137055837563451, + "loss": 1.9651, + "step": 180 + }, + { + "epoch": 0.08301800252264649, + "grad_norm": 0.536845862865448, + "learning_rate": 0.0009187817258883248, + "loss": 2.4031, + "step": 181 + }, + { + "epoch": 0.08347666552000918, + "grad_norm": 0.6141538023948669, + "learning_rate": 0.0009238578680203045, + "loss": 2.6653, + "step": 182 + }, + { + "epoch": 0.08393532851737186, + "grad_norm": 0.31810981035232544, + "learning_rate": 0.0009289340101522843, + "loss": 1.1417, + "step": 183 + }, + { + "epoch": 0.08439399151473455, + "grad_norm": 0.4755139648914337, + "learning_rate": 0.000934010152284264, + "loss": 2.1119, + "step": 184 + }, + { + "epoch": 0.08485265451209724, + "grad_norm": 0.14865882694721222, + "learning_rate": 0.0009390862944162437, + "loss": 0.6605, + "step": 185 + }, + { + "epoch": 0.08531131750945993, + "grad_norm": 0.31906870007514954, + "learning_rate": 0.0009441624365482235, + "loss": 1.2437, + "step": 186 + }, + { + "epoch": 0.08576998050682261, + "grad_norm": 0.5705398917198181, + "learning_rate": 0.0009492385786802031, + "loss": 2.3323, + "step": 187 + }, + { + "epoch": 0.0862286435041853, + "grad_norm": 0.3601418137550354, + "learning_rate": 0.0009543147208121828, + "loss": 1.5806, + "step": 188 + }, + { + "epoch": 0.08668730650154799, + "grad_norm": 0.4881735146045685, + "learning_rate": 0.0009593908629441625, + "loss": 1.9485, + "step": 189 + }, + { + "epoch": 0.08714596949891068, + "grad_norm": 0.15371385216712952, + "learning_rate": 0.0009644670050761422, + "loss": 0.683, + "step": 190 + }, + { + "epoch": 0.08760463249627336, + "grad_norm": 0.41688668727874756, + "learning_rate": 0.0009695431472081219, + "loss": 1.6802, + "step": 191 + }, + { + "epoch": 0.08806329549363605, + "grad_norm": 0.429606556892395, + "learning_rate": 0.0009746192893401016, + "loss": 1.9552, + "step": 192 + }, + { + "epoch": 0.08852195849099874, + "grad_norm": 0.549892008304596, + "learning_rate": 0.0009796954314720812, + "loss": 2.1486, + "step": 193 + }, + { + "epoch": 0.08898062148836143, + "grad_norm": 0.3614405393600464, + "learning_rate": 0.000984771573604061, + "loss": 1.3867, + "step": 194 + }, + { + "epoch": 0.08943928448572411, + "grad_norm": 0.43819788098335266, + "learning_rate": 0.0009898477157360406, + "loss": 1.8317, + "step": 195 + }, + { + "epoch": 0.0898979474830868, + "grad_norm": 0.14601579308509827, + "learning_rate": 0.0009949238578680203, + "loss": 0.6363, + "step": 196 + }, + { + "epoch": 0.09035661048044949, + "grad_norm": 0.2820906937122345, + "learning_rate": 0.001, + "loss": 1.0806, + "step": 197 + }, + { + "epoch": 0.09081527347781218, + "grad_norm": 0.41677334904670715, + "learning_rate": 0.0009999999386731974, + "loss": 1.79, + "step": 198 + }, + { + "epoch": 0.09127393647517487, + "grad_norm": 0.3319721519947052, + "learning_rate": 0.0009999997546928047, + "loss": 1.4927, + "step": 199 + }, + { + "epoch": 0.09173259947253755, + "grad_norm": 0.37898340821266174, + "learning_rate": 0.000999999448058867, + "loss": 1.6826, + "step": 200 + }, + { + "epoch": 0.09219126246990024, + "grad_norm": 0.5117692351341248, + "learning_rate": 0.0009999990187714593, + "loss": 2.368, + "step": 201 + }, + { + "epoch": 0.09264992546726293, + "grad_norm": 0.44219571352005005, + "learning_rate": 0.0009999984668306874, + "loss": 2.3706, + "step": 202 + }, + { + "epoch": 0.09310858846462562, + "grad_norm": 0.38528987765312195, + "learning_rate": 0.0009999977922366863, + "loss": 1.6523, + "step": 203 + }, + { + "epoch": 0.0935672514619883, + "grad_norm": 0.5601181983947754, + "learning_rate": 0.0009999969949896215, + "loss": 2.0021, + "step": 204 + }, + { + "epoch": 0.09402591445935099, + "grad_norm": 0.34919580817222595, + "learning_rate": 0.0009999960750896888, + "loss": 1.5776, + "step": 205 + }, + { + "epoch": 0.09448457745671368, + "grad_norm": 0.5601129531860352, + "learning_rate": 0.0009999950325371137, + "loss": 2.25, + "step": 206 + }, + { + "epoch": 0.09494324045407637, + "grad_norm": 0.5060522556304932, + "learning_rate": 0.0009999938673321519, + "loss": 2.4441, + "step": 207 + }, + { + "epoch": 0.09540190345143905, + "grad_norm": 0.44873717427253723, + "learning_rate": 0.0009999925794750893, + "loss": 1.6166, + "step": 208 + }, + { + "epoch": 0.09586056644880174, + "grad_norm": 0.1810014545917511, + "learning_rate": 0.0009999911689662418, + "loss": 0.7684, + "step": 209 + }, + { + "epoch": 0.09631922944616443, + "grad_norm": 0.3902459144592285, + "learning_rate": 0.0009999896358059556, + "loss": 1.5992, + "step": 210 + }, + { + "epoch": 0.09677789244352712, + "grad_norm": 0.6398706436157227, + "learning_rate": 0.0009999879799946067, + "loss": 2.5659, + "step": 211 + }, + { + "epoch": 0.09723655544088981, + "grad_norm": 0.6179486513137817, + "learning_rate": 0.000999986201532601, + "loss": 2.6672, + "step": 212 + }, + { + "epoch": 0.09769521843825249, + "grad_norm": 0.46587255597114563, + "learning_rate": 0.0009999843004203748, + "loss": 1.9471, + "step": 213 + }, + { + "epoch": 0.09815388143561518, + "grad_norm": 0.5521425604820251, + "learning_rate": 0.0009999822766583947, + "loss": 2.51, + "step": 214 + }, + { + "epoch": 0.09861254443297787, + "grad_norm": 0.43424972891807556, + "learning_rate": 0.0009999801302471574, + "loss": 1.8136, + "step": 215 + }, + { + "epoch": 0.09907120743034056, + "grad_norm": 0.3763306140899658, + "learning_rate": 0.0009999778611871888, + "loss": 1.3743, + "step": 216 + }, + { + "epoch": 0.09952987042770324, + "grad_norm": 0.34321171045303345, + "learning_rate": 0.0009999754694790459, + "loss": 1.4966, + "step": 217 + }, + { + "epoch": 0.09998853342506593, + "grad_norm": 0.46348294615745544, + "learning_rate": 0.0009999729551233155, + "loss": 1.6602, + "step": 218 + }, + { + "epoch": 0.10044719642242862, + "grad_norm": 0.5280048847198486, + "learning_rate": 0.000999970318120614, + "loss": 2.2937, + "step": 219 + }, + { + "epoch": 0.10090585941979131, + "grad_norm": 0.4680062234401703, + "learning_rate": 0.0009999675584715887, + "loss": 1.999, + "step": 220 + }, + { + "epoch": 0.10136452241715399, + "grad_norm": 0.48422807455062866, + "learning_rate": 0.0009999646761769162, + "loss": 2.1548, + "step": 221 + }, + { + "epoch": 0.10182318541451668, + "grad_norm": 0.30261266231536865, + "learning_rate": 0.0009999616712373035, + "loss": 1.3619, + "step": 222 + }, + { + "epoch": 0.10228184841187937, + "grad_norm": 0.549519956111908, + "learning_rate": 0.0009999585436534883, + "loss": 2.4467, + "step": 223 + }, + { + "epoch": 0.10274051140924206, + "grad_norm": 0.4387172758579254, + "learning_rate": 0.0009999552934262374, + "loss": 2.0479, + "step": 224 + }, + { + "epoch": 0.10319917440660474, + "grad_norm": 0.49094218015670776, + "learning_rate": 0.0009999519205563482, + "loss": 2.4268, + "step": 225 + }, + { + "epoch": 0.10365783740396743, + "grad_norm": 0.5436589121818542, + "learning_rate": 0.0009999484250446478, + "loss": 2.5486, + "step": 226 + }, + { + "epoch": 0.10411650040133012, + "grad_norm": 0.5063910484313965, + "learning_rate": 0.000999944806891994, + "loss": 2.364, + "step": 227 + }, + { + "epoch": 0.10457516339869281, + "grad_norm": 0.5463952422142029, + "learning_rate": 0.0009999410660992743, + "loss": 2.6033, + "step": 228 + }, + { + "epoch": 0.1050338263960555, + "grad_norm": 0.507409393787384, + "learning_rate": 0.0009999372026674063, + "loss": 2.32, + "step": 229 + }, + { + "epoch": 0.10549248939341818, + "grad_norm": 0.5893042087554932, + "learning_rate": 0.0009999332165973379, + "loss": 2.7153, + "step": 230 + }, + { + "epoch": 0.10595115239078087, + "grad_norm": 0.284311980009079, + "learning_rate": 0.0009999291078900466, + "loss": 0.9774, + "step": 231 + }, + { + "epoch": 0.10640981538814356, + "grad_norm": 0.42511793971061707, + "learning_rate": 0.0009999248765465406, + "loss": 1.924, + "step": 232 + }, + { + "epoch": 0.10686847838550625, + "grad_norm": 0.6010558605194092, + "learning_rate": 0.0009999205225678575, + "loss": 2.7175, + "step": 233 + }, + { + "epoch": 0.10732714138286893, + "grad_norm": 0.3992350697517395, + "learning_rate": 0.000999916045955066, + "loss": 1.9003, + "step": 234 + }, + { + "epoch": 0.10778580438023162, + "grad_norm": 0.3539673388004303, + "learning_rate": 0.0009999114467092636, + "loss": 1.4302, + "step": 235 + }, + { + "epoch": 0.10824446737759431, + "grad_norm": 0.3749694228172302, + "learning_rate": 0.0009999067248315787, + "loss": 1.7714, + "step": 236 + }, + { + "epoch": 0.108703130374957, + "grad_norm": 0.47365519404411316, + "learning_rate": 0.0009999018803231697, + "loss": 1.9969, + "step": 237 + }, + { + "epoch": 0.10916179337231968, + "grad_norm": 0.3063698709011078, + "learning_rate": 0.000999896913185225, + "loss": 1.3087, + "step": 238 + }, + { + "epoch": 0.10962045636968237, + "grad_norm": 0.2695063352584839, + "learning_rate": 0.0009998918234189632, + "loss": 1.203, + "step": 239 + }, + { + "epoch": 0.11007911936704506, + "grad_norm": 0.3138861060142517, + "learning_rate": 0.0009998866110256326, + "loss": 1.1181, + "step": 240 + }, + { + "epoch": 0.11053778236440776, + "grad_norm": 0.4734935462474823, + "learning_rate": 0.000999881276006512, + "loss": 2.2582, + "step": 241 + }, + { + "epoch": 0.11099644536177045, + "grad_norm": 0.3687781095504761, + "learning_rate": 0.0009998758183629099, + "loss": 1.6593, + "step": 242 + }, + { + "epoch": 0.11145510835913312, + "grad_norm": 0.33596163988113403, + "learning_rate": 0.0009998702380961655, + "loss": 1.4227, + "step": 243 + }, + { + "epoch": 0.11191377135649581, + "grad_norm": 0.37901684641838074, + "learning_rate": 0.0009998645352076471, + "loss": 1.6273, + "step": 244 + }, + { + "epoch": 0.1123724343538585, + "grad_norm": 0.30980101227760315, + "learning_rate": 0.0009998587096987544, + "loss": 1.0549, + "step": 245 + }, + { + "epoch": 0.1128310973512212, + "grad_norm": 0.4566017687320709, + "learning_rate": 0.0009998527615709158, + "loss": 1.9674, + "step": 246 + }, + { + "epoch": 0.11328976034858387, + "grad_norm": 0.4420877695083618, + "learning_rate": 0.0009998466908255907, + "loss": 2.1819, + "step": 247 + }, + { + "epoch": 0.11374842334594656, + "grad_norm": 0.29177185893058777, + "learning_rate": 0.0009998404974642684, + "loss": 1.3174, + "step": 248 + }, + { + "epoch": 0.11420708634330926, + "grad_norm": 0.2630375623703003, + "learning_rate": 0.000999834181488468, + "loss": 1.0881, + "step": 249 + }, + { + "epoch": 0.11466574934067195, + "grad_norm": 0.36166948080062866, + "learning_rate": 0.0009998277428997387, + "loss": 1.725, + "step": 250 + }, + { + "epoch": 0.11512441233803462, + "grad_norm": 0.4986696243286133, + "learning_rate": 0.0009998211816996602, + "loss": 2.5789, + "step": 251 + }, + { + "epoch": 0.11558307533539731, + "grad_norm": 0.3712925612926483, + "learning_rate": 0.0009998144978898421, + "loss": 2.1168, + "step": 252 + }, + { + "epoch": 0.11604173833276, + "grad_norm": 0.29228702187538147, + "learning_rate": 0.0009998076914719237, + "loss": 1.5106, + "step": 253 + }, + { + "epoch": 0.1165004013301227, + "grad_norm": 0.41084229946136475, + "learning_rate": 0.0009998007624475746, + "loss": 1.99, + "step": 254 + }, + { + "epoch": 0.11695906432748537, + "grad_norm": 0.394399493932724, + "learning_rate": 0.0009997937108184951, + "loss": 2.3965, + "step": 255 + }, + { + "epoch": 0.11741772732484806, + "grad_norm": 0.27776357531547546, + "learning_rate": 0.0009997865365864143, + "loss": 1.3273, + "step": 256 + }, + { + "epoch": 0.11787639032221076, + "grad_norm": 0.23769012093544006, + "learning_rate": 0.0009997792397530925, + "loss": 1.1331, + "step": 257 + }, + { + "epoch": 0.11833505331957345, + "grad_norm": 0.3943609297275543, + "learning_rate": 0.0009997718203203197, + "loss": 2.2263, + "step": 258 + }, + { + "epoch": 0.11879371631693614, + "grad_norm": 0.3729076087474823, + "learning_rate": 0.0009997642782899158, + "loss": 1.9142, + "step": 259 + }, + { + "epoch": 0.11925237931429881, + "grad_norm": 0.4220196604728699, + "learning_rate": 0.000999756613663731, + "loss": 2.4495, + "step": 260 + }, + { + "epoch": 0.1197110423116615, + "grad_norm": 0.43063583970069885, + "learning_rate": 0.0009997488264436455, + "loss": 2.1042, + "step": 261 + }, + { + "epoch": 0.1201697053090242, + "grad_norm": 0.31665846705436707, + "learning_rate": 0.000999740916631569, + "loss": 1.3811, + "step": 262 + }, + { + "epoch": 0.12062836830638689, + "grad_norm": 0.5183250904083252, + "learning_rate": 0.0009997328842294428, + "loss": 2.5891, + "step": 263 + }, + { + "epoch": 0.12108703130374956, + "grad_norm": 0.5458334684371948, + "learning_rate": 0.000999724729239237, + "loss": 2.8789, + "step": 264 + }, + { + "epoch": 0.12154569430111226, + "grad_norm": 0.32633325457572937, + "learning_rate": 0.0009997164516629515, + "loss": 1.5408, + "step": 265 + }, + { + "epoch": 0.12200435729847495, + "grad_norm": 0.3069404661655426, + "learning_rate": 0.0009997080515026175, + "loss": 1.4044, + "step": 266 + }, + { + "epoch": 0.12246302029583764, + "grad_norm": 0.3677540719509125, + "learning_rate": 0.0009996995287602953, + "loss": 2.0308, + "step": 267 + }, + { + "epoch": 0.12292168329320032, + "grad_norm": 0.37254124879837036, + "learning_rate": 0.0009996908834380756, + "loss": 1.7371, + "step": 268 + }, + { + "epoch": 0.123380346290563, + "grad_norm": 0.351275771856308, + "learning_rate": 0.0009996821155380793, + "loss": 1.7554, + "step": 269 + }, + { + "epoch": 0.1238390092879257, + "grad_norm": 0.45772960782051086, + "learning_rate": 0.0009996732250624571, + "loss": 2.5547, + "step": 270 + }, + { + "epoch": 0.12429767228528839, + "grad_norm": 0.3280022144317627, + "learning_rate": 0.00099966421201339, + "loss": 1.5541, + "step": 271 + }, + { + "epoch": 0.12475633528265107, + "grad_norm": 0.5368868708610535, + "learning_rate": 0.0009996550763930892, + "loss": 2.1713, + "step": 272 + }, + { + "epoch": 0.12521499828001376, + "grad_norm": 0.2520690858364105, + "learning_rate": 0.000999645818203795, + "loss": 0.9838, + "step": 273 + }, + { + "epoch": 0.12567366127737645, + "grad_norm": 0.3174075186252594, + "learning_rate": 0.0009996364374477793, + "loss": 1.5059, + "step": 274 + }, + { + "epoch": 0.12613232427473914, + "grad_norm": 0.26668769121170044, + "learning_rate": 0.0009996269341273427, + "loss": 1.3316, + "step": 275 + }, + { + "epoch": 0.12659098727210183, + "grad_norm": 0.3275998532772064, + "learning_rate": 0.000999617308244817, + "loss": 1.717, + "step": 276 + }, + { + "epoch": 0.12704965026946452, + "grad_norm": 0.4276018738746643, + "learning_rate": 0.0009996075598025628, + "loss": 2.5447, + "step": 277 + }, + { + "epoch": 0.1275083132668272, + "grad_norm": 0.3532467782497406, + "learning_rate": 0.000999597688802972, + "loss": 1.939, + "step": 278 + }, + { + "epoch": 0.12796697626418987, + "grad_norm": 0.3706674575805664, + "learning_rate": 0.000999587695248466, + "loss": 1.9899, + "step": 279 + }, + { + "epoch": 0.12842563926155257, + "grad_norm": 0.38228681683540344, + "learning_rate": 0.0009995775791414958, + "loss": 2.1842, + "step": 280 + }, + { + "epoch": 0.12888430225891526, + "grad_norm": 0.4526541829109192, + "learning_rate": 0.0009995673404845434, + "loss": 2.1307, + "step": 281 + }, + { + "epoch": 0.12934296525627795, + "grad_norm": 0.44101178646087646, + "learning_rate": 0.0009995569792801205, + "loss": 2.179, + "step": 282 + }, + { + "epoch": 0.12980162825364064, + "grad_norm": 0.15456262230873108, + "learning_rate": 0.0009995464955307684, + "loss": 0.6752, + "step": 283 + }, + { + "epoch": 0.13026029125100333, + "grad_norm": 0.4431912302970886, + "learning_rate": 0.000999535889239059, + "loss": 2.4602, + "step": 284 + }, + { + "epoch": 0.13071895424836602, + "grad_norm": 0.4478715658187866, + "learning_rate": 0.0009995251604075943, + "loss": 2.3623, + "step": 285 + }, + { + "epoch": 0.1311776172457287, + "grad_norm": 0.332527756690979, + "learning_rate": 0.000999514309039006, + "loss": 1.4232, + "step": 286 + }, + { + "epoch": 0.13163628024309137, + "grad_norm": 0.3733260929584503, + "learning_rate": 0.0009995033351359558, + "loss": 1.7972, + "step": 287 + }, + { + "epoch": 0.13209494324045407, + "grad_norm": 0.4460364282131195, + "learning_rate": 0.000999492238701136, + "loss": 2.227, + "step": 288 + }, + { + "epoch": 0.13255360623781676, + "grad_norm": 0.35513222217559814, + "learning_rate": 0.0009994810197372684, + "loss": 2.0901, + "step": 289 + }, + { + "epoch": 0.13301226923517945, + "grad_norm": 0.3551701009273529, + "learning_rate": 0.0009994696782471054, + "loss": 1.7401, + "step": 290 + }, + { + "epoch": 0.13347093223254214, + "grad_norm": 0.37001582980155945, + "learning_rate": 0.000999458214233429, + "loss": 2.0237, + "step": 291 + }, + { + "epoch": 0.13392959522990483, + "grad_norm": 0.39629852771759033, + "learning_rate": 0.0009994466276990511, + "loss": 1.9617, + "step": 292 + }, + { + "epoch": 0.13438825822726752, + "grad_norm": 0.30655738711357117, + "learning_rate": 0.0009994349186468144, + "loss": 1.4343, + "step": 293 + }, + { + "epoch": 0.1348469212246302, + "grad_norm": 0.3813563585281372, + "learning_rate": 0.000999423087079591, + "loss": 2.0818, + "step": 294 + }, + { + "epoch": 0.1353055842219929, + "grad_norm": 0.4179360866546631, + "learning_rate": 0.0009994111330002835, + "loss": 2.1136, + "step": 295 + }, + { + "epoch": 0.13576424721935557, + "grad_norm": 0.341799259185791, + "learning_rate": 0.000999399056411824, + "loss": 1.9364, + "step": 296 + }, + { + "epoch": 0.13622291021671826, + "grad_norm": 0.3983851969242096, + "learning_rate": 0.0009993868573171748, + "loss": 1.4841, + "step": 297 + }, + { + "epoch": 0.13668157321408095, + "grad_norm": 0.32653865218162537, + "learning_rate": 0.0009993745357193293, + "loss": 1.5058, + "step": 298 + }, + { + "epoch": 0.13714023621144364, + "grad_norm": 0.5542972087860107, + "learning_rate": 0.0009993620916213092, + "loss": 2.5276, + "step": 299 + }, + { + "epoch": 0.13759889920880633, + "grad_norm": 0.4614558517932892, + "learning_rate": 0.0009993495250261676, + "loss": 2.0603, + "step": 300 + }, + { + "epoch": 0.13805756220616902, + "grad_norm": 0.1725156605243683, + "learning_rate": 0.0009993368359369867, + "loss": 0.7159, + "step": 301 + }, + { + "epoch": 0.1385162252035317, + "grad_norm": 0.466024786233902, + "learning_rate": 0.0009993240243568798, + "loss": 2.0669, + "step": 302 + }, + { + "epoch": 0.1389748882008944, + "grad_norm": 0.5493818521499634, + "learning_rate": 0.0009993110902889893, + "loss": 2.6599, + "step": 303 + }, + { + "epoch": 0.1394335511982571, + "grad_norm": 0.4970023036003113, + "learning_rate": 0.0009992980337364882, + "loss": 2.4265, + "step": 304 + }, + { + "epoch": 0.13989221419561976, + "grad_norm": 0.37026146054267883, + "learning_rate": 0.0009992848547025791, + "loss": 1.4705, + "step": 305 + }, + { + "epoch": 0.14035087719298245, + "grad_norm": 0.4260435402393341, + "learning_rate": 0.0009992715531904955, + "loss": 1.6935, + "step": 306 + }, + { + "epoch": 0.14080954019034514, + "grad_norm": 0.08674909919500351, + "learning_rate": 0.0009992581292034997, + "loss": 0.4348, + "step": 307 + }, + { + "epoch": 0.14126820318770783, + "grad_norm": 0.3797175884246826, + "learning_rate": 0.0009992445827448852, + "loss": 1.7039, + "step": 308 + }, + { + "epoch": 0.14172686618507052, + "grad_norm": 0.24065154790878296, + "learning_rate": 0.0009992309138179745, + "loss": 1.0594, + "step": 309 + }, + { + "epoch": 0.1421855291824332, + "grad_norm": 0.4619709849357605, + "learning_rate": 0.000999217122426121, + "loss": 2.0485, + "step": 310 + }, + { + "epoch": 0.1426441921797959, + "grad_norm": 0.4283906817436218, + "learning_rate": 0.000999203208572708, + "loss": 2.2198, + "step": 311 + }, + { + "epoch": 0.1431028551771586, + "grad_norm": 0.39942678809165955, + "learning_rate": 0.0009991891722611484, + "loss": 1.8766, + "step": 312 + }, + { + "epoch": 0.14356151817452126, + "grad_norm": 0.37499570846557617, + "learning_rate": 0.0009991750134948857, + "loss": 1.4081, + "step": 313 + }, + { + "epoch": 0.14402018117188395, + "grad_norm": 0.4207157790660858, + "learning_rate": 0.0009991607322773928, + "loss": 2.1097, + "step": 314 + }, + { + "epoch": 0.14447884416924664, + "grad_norm": 0.39866119623184204, + "learning_rate": 0.000999146328612173, + "loss": 1.8325, + "step": 315 + }, + { + "epoch": 0.14493750716660933, + "grad_norm": 0.507705807685852, + "learning_rate": 0.00099913180250276, + "loss": 2.0061, + "step": 316 + }, + { + "epoch": 0.14539617016397202, + "grad_norm": 0.4097268283367157, + "learning_rate": 0.0009991171539527168, + "loss": 1.9861, + "step": 317 + }, + { + "epoch": 0.1458548331613347, + "grad_norm": 0.36734098196029663, + "learning_rate": 0.000999102382965637, + "loss": 1.6204, + "step": 318 + }, + { + "epoch": 0.1463134961586974, + "grad_norm": 0.36929383873939514, + "learning_rate": 0.0009990874895451439, + "loss": 1.9098, + "step": 319 + }, + { + "epoch": 0.1467721591560601, + "grad_norm": 0.40100765228271484, + "learning_rate": 0.000999072473694891, + "loss": 1.9401, + "step": 320 + }, + { + "epoch": 0.14723082215342279, + "grad_norm": 0.38179731369018555, + "learning_rate": 0.0009990573354185617, + "loss": 2.0457, + "step": 321 + }, + { + "epoch": 0.14768948515078545, + "grad_norm": 0.4014952480792999, + "learning_rate": 0.0009990420747198697, + "loss": 1.5037, + "step": 322 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 0.2382311075925827, + "learning_rate": 0.0009990266916025585, + "loss": 1.1044, + "step": 323 + }, + { + "epoch": 0.14860681114551083, + "grad_norm": 0.40865546464920044, + "learning_rate": 0.0009990111860704019, + "loss": 1.8592, + "step": 324 + }, + { + "epoch": 0.14906547414287352, + "grad_norm": 0.41064581274986267, + "learning_rate": 0.000998995558127203, + "loss": 1.9371, + "step": 325 + }, + { + "epoch": 0.1495241371402362, + "grad_norm": 0.29274582862854004, + "learning_rate": 0.0009989798077767959, + "loss": 1.3743, + "step": 326 + }, + { + "epoch": 0.1499828001375989, + "grad_norm": 0.3344130218029022, + "learning_rate": 0.0009989639350230439, + "loss": 1.3143, + "step": 327 + }, + { + "epoch": 0.1504414631349616, + "grad_norm": 0.25171759724617004, + "learning_rate": 0.0009989479398698413, + "loss": 0.9927, + "step": 328 + }, + { + "epoch": 0.15090012613232429, + "grad_norm": 0.13870039582252502, + "learning_rate": 0.0009989318223211112, + "loss": 0.7593, + "step": 329 + }, + { + "epoch": 0.15135878912968695, + "grad_norm": 0.5177074074745178, + "learning_rate": 0.0009989155823808076, + "loss": 2.2173, + "step": 330 + }, + { + "epoch": 0.15181745212704964, + "grad_norm": 0.3356158435344696, + "learning_rate": 0.0009988992200529144, + "loss": 1.5492, + "step": 331 + }, + { + "epoch": 0.15227611512441233, + "grad_norm": 0.44678938388824463, + "learning_rate": 0.000998882735341445, + "loss": 1.7162, + "step": 332 + }, + { + "epoch": 0.15273477812177502, + "grad_norm": 0.31965455412864685, + "learning_rate": 0.0009988661282504438, + "loss": 1.2234, + "step": 333 + }, + { + "epoch": 0.1531934411191377, + "grad_norm": 0.4562229812145233, + "learning_rate": 0.0009988493987839841, + "loss": 2.0798, + "step": 334 + }, + { + "epoch": 0.1536521041165004, + "grad_norm": 0.36107146739959717, + "learning_rate": 0.00099883254694617, + "loss": 1.7864, + "step": 335 + }, + { + "epoch": 0.1541107671138631, + "grad_norm": 0.3504132330417633, + "learning_rate": 0.0009988155727411357, + "loss": 1.6267, + "step": 336 + }, + { + "epoch": 0.1545694301112258, + "grad_norm": 0.28946036100387573, + "learning_rate": 0.0009987984761730445, + "loss": 1.319, + "step": 337 + }, + { + "epoch": 0.15502809310858848, + "grad_norm": 0.49577978253364563, + "learning_rate": 0.0009987812572460905, + "loss": 1.9709, + "step": 338 + }, + { + "epoch": 0.15548675610595114, + "grad_norm": 0.49071890115737915, + "learning_rate": 0.000998763915964498, + "loss": 2.1024, + "step": 339 + }, + { + "epoch": 0.15594541910331383, + "grad_norm": 0.3110016882419586, + "learning_rate": 0.0009987464523325202, + "loss": 1.5035, + "step": 340 + }, + { + "epoch": 0.15640408210067652, + "grad_norm": 0.3991573452949524, + "learning_rate": 0.000998728866354442, + "loss": 1.731, + "step": 341 + }, + { + "epoch": 0.1568627450980392, + "grad_norm": 0.3533564805984497, + "learning_rate": 0.0009987111580345763, + "loss": 1.319, + "step": 342 + }, + { + "epoch": 0.1573214080954019, + "grad_norm": 0.4693927466869354, + "learning_rate": 0.000998693327377268, + "loss": 2.17, + "step": 343 + }, + { + "epoch": 0.1577800710927646, + "grad_norm": 0.4726630747318268, + "learning_rate": 0.0009986753743868905, + "loss": 2.1395, + "step": 344 + }, + { + "epoch": 0.1582387340901273, + "grad_norm": 0.27608227729797363, + "learning_rate": 0.0009986572990678482, + "loss": 1.3976, + "step": 345 + }, + { + "epoch": 0.15869739708748998, + "grad_norm": 0.34242314100265503, + "learning_rate": 0.000998639101424575, + "loss": 1.614, + "step": 346 + }, + { + "epoch": 0.15915606008485264, + "grad_norm": 0.41097334027290344, + "learning_rate": 0.0009986207814615348, + "loss": 2.111, + "step": 347 + }, + { + "epoch": 0.15961472308221533, + "grad_norm": 0.44249075651168823, + "learning_rate": 0.0009986023391832216, + "loss": 2.4585, + "step": 348 + }, + { + "epoch": 0.16007338607957802, + "grad_norm": 0.412911593914032, + "learning_rate": 0.0009985837745941596, + "loss": 1.9757, + "step": 349 + }, + { + "epoch": 0.1605320490769407, + "grad_norm": 0.22264568507671356, + "learning_rate": 0.0009985650876989025, + "loss": 0.9562, + "step": 350 + }, + { + "epoch": 0.1609907120743034, + "grad_norm": 0.449796199798584, + "learning_rate": 0.0009985462785020347, + "loss": 2.0573, + "step": 351 + }, + { + "epoch": 0.1614493750716661, + "grad_norm": 0.7880814671516418, + "learning_rate": 0.0009985273470081702, + "loss": 2.3027, + "step": 352 + }, + { + "epoch": 0.1619080380690288, + "grad_norm": 10.907687187194824, + "learning_rate": 0.0009985082932219527, + "loss": 2.3361, + "step": 353 + }, + { + "epoch": 0.16236670106639148, + "grad_norm": 2.09915828704834, + "learning_rate": 0.0009984891171480566, + "loss": 1.6033, + "step": 354 + }, + { + "epoch": 0.16282536406375417, + "grad_norm": 0.3674648106098175, + "learning_rate": 0.0009984698187911856, + "loss": 1.2239, + "step": 355 + }, + { + "epoch": 0.16328402706111683, + "grad_norm": 0.5814857482910156, + "learning_rate": 0.000998450398156074, + "loss": 1.4328, + "step": 356 + }, + { + "epoch": 0.16374269005847952, + "grad_norm": 0.46351417899131775, + "learning_rate": 0.0009984308552474857, + "loss": 2.0938, + "step": 357 + }, + { + "epoch": 0.16420135305584221, + "grad_norm": 0.5045230388641357, + "learning_rate": 0.0009984111900702146, + "loss": 2.2004, + "step": 358 + }, + { + "epoch": 0.1646600160532049, + "grad_norm": 0.311603844165802, + "learning_rate": 0.000998391402629085, + "loss": 1.5096, + "step": 359 + }, + { + "epoch": 0.1651186790505676, + "grad_norm": 0.40799856185913086, + "learning_rate": 0.0009983714929289508, + "loss": 2.1035, + "step": 360 + }, + { + "epoch": 0.1655773420479303, + "grad_norm": 0.1845995932817459, + "learning_rate": 0.0009983514609746959, + "loss": 0.8358, + "step": 361 + }, + { + "epoch": 0.16603600504529298, + "grad_norm": 0.36411669850349426, + "learning_rate": 0.0009983313067712344, + "loss": 1.5809, + "step": 362 + }, + { + "epoch": 0.16649466804265567, + "grad_norm": 0.3270757794380188, + "learning_rate": 0.00099831103032351, + "loss": 1.4844, + "step": 363 + }, + { + "epoch": 0.16695333104001836, + "grad_norm": 0.24170421063899994, + "learning_rate": 0.0009982906316364968, + "loss": 0.9695, + "step": 364 + }, + { + "epoch": 0.16741199403738102, + "grad_norm": 0.3997127413749695, + "learning_rate": 0.000998270110715199, + "loss": 2.1677, + "step": 365 + }, + { + "epoch": 0.16787065703474371, + "grad_norm": 0.251727432012558, + "learning_rate": 0.0009982494675646503, + "loss": 1.3295, + "step": 366 + }, + { + "epoch": 0.1683293200321064, + "grad_norm": 0.27435481548309326, + "learning_rate": 0.0009982287021899146, + "loss": 1.1655, + "step": 367 + }, + { + "epoch": 0.1687879830294691, + "grad_norm": 0.3682950437068939, + "learning_rate": 0.0009982078145960858, + "loss": 1.6586, + "step": 368 + }, + { + "epoch": 0.1692466460268318, + "grad_norm": 0.3245248794555664, + "learning_rate": 0.0009981868047882877, + "loss": 1.2151, + "step": 369 + }, + { + "epoch": 0.16970530902419448, + "grad_norm": 0.22152075171470642, + "learning_rate": 0.0009981656727716746, + "loss": 1.0169, + "step": 370 + }, + { + "epoch": 0.17016397202155717, + "grad_norm": 0.3736206293106079, + "learning_rate": 0.00099814441855143, + "loss": 1.5577, + "step": 371 + }, + { + "epoch": 0.17062263501891986, + "grad_norm": 0.3287566602230072, + "learning_rate": 0.0009981230421327674, + "loss": 1.4915, + "step": 372 + }, + { + "epoch": 0.17108129801628252, + "grad_norm": 0.26681220531463623, + "learning_rate": 0.000998101543520931, + "loss": 1.1304, + "step": 373 + }, + { + "epoch": 0.17153996101364521, + "grad_norm": 0.32426249980926514, + "learning_rate": 0.0009980799227211946, + "loss": 1.4148, + "step": 374 + }, + { + "epoch": 0.1719986240110079, + "grad_norm": 0.38737696409225464, + "learning_rate": 0.0009980581797388616, + "loss": 1.6913, + "step": 375 + }, + { + "epoch": 0.1724572870083706, + "grad_norm": 0.5705090761184692, + "learning_rate": 0.0009980363145792662, + "loss": 2.7029, + "step": 376 + }, + { + "epoch": 0.1729159500057333, + "grad_norm": 0.4684537649154663, + "learning_rate": 0.0009980143272477718, + "loss": 2.1586, + "step": 377 + }, + { + "epoch": 0.17337461300309598, + "grad_norm": 0.386222779750824, + "learning_rate": 0.0009979922177497718, + "loss": 2.2925, + "step": 378 + }, + { + "epoch": 0.17383327600045867, + "grad_norm": 0.2883818447589874, + "learning_rate": 0.0009979699860906902, + "loss": 1.1465, + "step": 379 + }, + { + "epoch": 0.17429193899782136, + "grad_norm": 0.30754733085632324, + "learning_rate": 0.0009979476322759806, + "loss": 0.9909, + "step": 380 + }, + { + "epoch": 0.17475060199518405, + "grad_norm": 0.42961201071739197, + "learning_rate": 0.0009979251563111261, + "loss": 2.0005, + "step": 381 + }, + { + "epoch": 0.17520926499254671, + "grad_norm": 0.41431599855422974, + "learning_rate": 0.0009979025582016409, + "loss": 1.6347, + "step": 382 + }, + { + "epoch": 0.1756679279899094, + "grad_norm": 0.39737799763679504, + "learning_rate": 0.000997879837953068, + "loss": 2.2396, + "step": 383 + }, + { + "epoch": 0.1761265909872721, + "grad_norm": 0.2799502909183502, + "learning_rate": 0.0009978569955709808, + "loss": 1.3289, + "step": 384 + }, + { + "epoch": 0.1765852539846348, + "grad_norm": 0.4221355617046356, + "learning_rate": 0.000997834031060983, + "loss": 2.1155, + "step": 385 + }, + { + "epoch": 0.17704391698199748, + "grad_norm": 0.48007211089134216, + "learning_rate": 0.0009978109444287078, + "loss": 2.677, + "step": 386 + }, + { + "epoch": 0.17750257997936017, + "grad_norm": 0.3006044626235962, + "learning_rate": 0.0009977877356798185, + "loss": 1.4993, + "step": 387 + }, + { + "epoch": 0.17796124297672286, + "grad_norm": 0.30362898111343384, + "learning_rate": 0.0009977644048200084, + "loss": 1.5726, + "step": 388 + }, + { + "epoch": 0.17841990597408555, + "grad_norm": 0.25762712955474854, + "learning_rate": 0.0009977409518550006, + "loss": 1.117, + "step": 389 + }, + { + "epoch": 0.17887856897144822, + "grad_norm": 0.2970077693462372, + "learning_rate": 0.0009977173767905486, + "loss": 1.5483, + "step": 390 + }, + { + "epoch": 0.1793372319688109, + "grad_norm": 0.40200671553611755, + "learning_rate": 0.0009976936796324352, + "loss": 2.1389, + "step": 391 + }, + { + "epoch": 0.1797958949661736, + "grad_norm": 0.4564474821090698, + "learning_rate": 0.0009976698603864737, + "loss": 2.4102, + "step": 392 + }, + { + "epoch": 0.1802545579635363, + "grad_norm": 0.5073770880699158, + "learning_rate": 0.0009976459190585072, + "loss": 2.1866, + "step": 393 + }, + { + "epoch": 0.18071322096089898, + "grad_norm": 0.24557775259017944, + "learning_rate": 0.0009976218556544084, + "loss": 1.0652, + "step": 394 + }, + { + "epoch": 0.18117188395826167, + "grad_norm": 0.3804593086242676, + "learning_rate": 0.0009975976701800803, + "loss": 1.7112, + "step": 395 + }, + { + "epoch": 0.18163054695562436, + "grad_norm": 0.414326012134552, + "learning_rate": 0.000997573362641456, + "loss": 1.8792, + "step": 396 + }, + { + "epoch": 0.18208920995298705, + "grad_norm": 0.34744778275489807, + "learning_rate": 0.000997548933044498, + "loss": 1.4276, + "step": 397 + }, + { + "epoch": 0.18254787295034974, + "grad_norm": 0.3834390640258789, + "learning_rate": 0.0009975243813951993, + "loss": 1.8641, + "step": 398 + }, + { + "epoch": 0.1830065359477124, + "grad_norm": 0.40305501222610474, + "learning_rate": 0.0009974997076995825, + "loss": 1.9985, + "step": 399 + }, + { + "epoch": 0.1834651989450751, + "grad_norm": 0.3667886555194855, + "learning_rate": 0.0009974749119637002, + "loss": 1.7085, + "step": 400 + }, + { + "epoch": 0.1839238619424378, + "grad_norm": 0.28065481781959534, + "learning_rate": 0.000997449994193635, + "loss": 1.42, + "step": 401 + }, + { + "epoch": 0.18438252493980048, + "grad_norm": 0.37087318301200867, + "learning_rate": 0.0009974249543954995, + "loss": 1.5901, + "step": 402 + }, + { + "epoch": 0.18484118793716317, + "grad_norm": 0.3824957013130188, + "learning_rate": 0.0009973997925754362, + "loss": 2.0566, + "step": 403 + }, + { + "epoch": 0.18529985093452586, + "grad_norm": 0.3663063049316406, + "learning_rate": 0.0009973745087396171, + "loss": 2.079, + "step": 404 + }, + { + "epoch": 0.18575851393188855, + "grad_norm": 0.30660781264305115, + "learning_rate": 0.0009973491028942448, + "loss": 1.5338, + "step": 405 + }, + { + "epoch": 0.18621717692925124, + "grad_norm": 0.25807487964630127, + "learning_rate": 0.0009973235750455516, + "loss": 1.2258, + "step": 406 + }, + { + "epoch": 0.1866758399266139, + "grad_norm": 0.288686066865921, + "learning_rate": 0.0009972979251997994, + "loss": 1.6471, + "step": 407 + }, + { + "epoch": 0.1871345029239766, + "grad_norm": 0.34755846858024597, + "learning_rate": 0.0009972721533632804, + "loss": 2.1007, + "step": 408 + }, + { + "epoch": 0.1875931659213393, + "grad_norm": 0.22822384536266327, + "learning_rate": 0.000997246259542317, + "loss": 1.1318, + "step": 409 + }, + { + "epoch": 0.18805182891870198, + "grad_norm": 0.12297138571739197, + "learning_rate": 0.0009972202437432604, + "loss": 0.6918, + "step": 410 + }, + { + "epoch": 0.18851049191606467, + "grad_norm": 0.3044014871120453, + "learning_rate": 0.000997194105972493, + "loss": 1.4567, + "step": 411 + }, + { + "epoch": 0.18896915491342736, + "grad_norm": 0.20039816200733185, + "learning_rate": 0.0009971678462364266, + "loss": 1.1349, + "step": 412 + }, + { + "epoch": 0.18942781791079005, + "grad_norm": 0.36312806606292725, + "learning_rate": 0.0009971414645415027, + "loss": 2.0817, + "step": 413 + }, + { + "epoch": 0.18988648090815274, + "grad_norm": 0.30179518461227417, + "learning_rate": 0.0009971149608941929, + "loss": 1.5486, + "step": 414 + }, + { + "epoch": 0.19034514390551543, + "grad_norm": 0.3225315809249878, + "learning_rate": 0.0009970883353009987, + "loss": 1.3529, + "step": 415 + }, + { + "epoch": 0.1908038069028781, + "grad_norm": 0.3565948009490967, + "learning_rate": 0.000997061587768452, + "loss": 1.9429, + "step": 416 + }, + { + "epoch": 0.1912624699002408, + "grad_norm": 0.6202376484870911, + "learning_rate": 0.0009970347183031133, + "loss": 1.3034, + "step": 417 + }, + { + "epoch": 0.19172113289760348, + "grad_norm": 0.31976190209388733, + "learning_rate": 0.0009970077269115748, + "loss": 1.833, + "step": 418 + }, + { + "epoch": 0.19217979589496617, + "grad_norm": 0.4025329649448395, + "learning_rate": 0.0009969806136004573, + "loss": 1.7521, + "step": 419 + }, + { + "epoch": 0.19263845889232886, + "grad_norm": 0.20742328464984894, + "learning_rate": 0.0009969533783764117, + "loss": 1.1387, + "step": 420 + }, + { + "epoch": 0.19309712188969155, + "grad_norm": 0.2832637429237366, + "learning_rate": 0.0009969260212461192, + "loss": 1.5901, + "step": 421 + }, + { + "epoch": 0.19355578488705424, + "grad_norm": 0.21540392935276031, + "learning_rate": 0.0009968985422162907, + "loss": 1.0676, + "step": 422 + }, + { + "epoch": 0.19401444788441694, + "grad_norm": 0.31164097785949707, + "learning_rate": 0.0009968709412936669, + "loss": 1.7977, + "step": 423 + }, + { + "epoch": 0.19447311088177963, + "grad_norm": 0.3007836937904358, + "learning_rate": 0.0009968432184850188, + "loss": 1.9207, + "step": 424 + }, + { + "epoch": 0.1949317738791423, + "grad_norm": 0.34054356813430786, + "learning_rate": 0.0009968153737971463, + "loss": 2.281, + "step": 425 + }, + { + "epoch": 0.19539043687650498, + "grad_norm": 0.3539496660232544, + "learning_rate": 0.0009967874072368808, + "loss": 2.0088, + "step": 426 + }, + { + "epoch": 0.19584909987386767, + "grad_norm": 0.4037904441356659, + "learning_rate": 0.0009967593188110822, + "loss": 2.0894, + "step": 427 + }, + { + "epoch": 0.19630776287123036, + "grad_norm": 0.36563950777053833, + "learning_rate": 0.0009967311085266408, + "loss": 1.7298, + "step": 428 + }, + { + "epoch": 0.19676642586859305, + "grad_norm": 0.4053013324737549, + "learning_rate": 0.000996702776390477, + "loss": 2.3652, + "step": 429 + }, + { + "epoch": 0.19722508886595574, + "grad_norm": 0.24620242416858673, + "learning_rate": 0.0009966743224095406, + "loss": 1.1821, + "step": 430 + }, + { + "epoch": 0.19768375186331844, + "grad_norm": 0.4127357006072998, + "learning_rate": 0.0009966457465908117, + "loss": 1.9592, + "step": 431 + }, + { + "epoch": 0.19814241486068113, + "grad_norm": 0.13345493376255035, + "learning_rate": 0.0009966170489413002, + "loss": 0.6845, + "step": 432 + }, + { + "epoch": 0.1986010778580438, + "grad_norm": 0.46981745958328247, + "learning_rate": 0.000996588229468046, + "loss": 2.3521, + "step": 433 + }, + { + "epoch": 0.19905974085540648, + "grad_norm": 0.414408802986145, + "learning_rate": 0.0009965592881781182, + "loss": 1.7859, + "step": 434 + }, + { + "epoch": 0.19951840385276917, + "grad_norm": 0.36975857615470886, + "learning_rate": 0.0009965302250786168, + "loss": 1.8119, + "step": 435 + }, + { + "epoch": 0.19997706685013186, + "grad_norm": 0.29966479539871216, + "learning_rate": 0.000996501040176671, + "loss": 1.6156, + "step": 436 + }, + { + "epoch": 0.20043572984749455, + "grad_norm": 0.3034079074859619, + "learning_rate": 0.00099647173347944, + "loss": 1.4443, + "step": 437 + }, + { + "epoch": 0.20089439284485724, + "grad_norm": 0.23707164824008942, + "learning_rate": 0.0009964423049941132, + "loss": 1.1766, + "step": 438 + }, + { + "epoch": 0.20135305584221994, + "grad_norm": 0.37824898958206177, + "learning_rate": 0.0009964127547279094, + "loss": 2.0259, + "step": 439 + }, + { + "epoch": 0.20181171883958263, + "grad_norm": 0.26142311096191406, + "learning_rate": 0.0009963830826880775, + "loss": 1.3159, + "step": 440 + }, + { + "epoch": 0.20227038183694532, + "grad_norm": 0.34730425477027893, + "learning_rate": 0.0009963532888818962, + "loss": 1.8903, + "step": 441 + }, + { + "epoch": 0.20272904483430798, + "grad_norm": 0.505499541759491, + "learning_rate": 0.0009963233733166744, + "loss": 2.4998, + "step": 442 + }, + { + "epoch": 0.20318770783167067, + "grad_norm": 0.38120755553245544, + "learning_rate": 0.0009962933359997505, + "loss": 1.8554, + "step": 443 + }, + { + "epoch": 0.20364637082903336, + "grad_norm": 0.2800237536430359, + "learning_rate": 0.0009962631769384928, + "loss": 1.4182, + "step": 444 + }, + { + "epoch": 0.20410503382639605, + "grad_norm": 0.35025089979171753, + "learning_rate": 0.0009962328961402992, + "loss": 1.7761, + "step": 445 + }, + { + "epoch": 0.20456369682375874, + "grad_norm": 0.446393221616745, + "learning_rate": 0.0009962024936125986, + "loss": 2.626, + "step": 446 + }, + { + "epoch": 0.20502235982112144, + "grad_norm": 0.26646509766578674, + "learning_rate": 0.0009961719693628484, + "loss": 1.2512, + "step": 447 + }, + { + "epoch": 0.20548102281848413, + "grad_norm": 0.4339228868484497, + "learning_rate": 0.0009961413233985363, + "loss": 2.1897, + "step": 448 + }, + { + "epoch": 0.20593968581584682, + "grad_norm": 0.31140047311782837, + "learning_rate": 0.0009961105557271803, + "loss": 1.3533, + "step": 449 + }, + { + "epoch": 0.20639834881320948, + "grad_norm": 0.34898841381073, + "learning_rate": 0.0009960796663563282, + "loss": 1.6303, + "step": 450 + }, + { + "epoch": 0.20685701181057217, + "grad_norm": 0.3299868106842041, + "learning_rate": 0.0009960486552935566, + "loss": 1.5021, + "step": 451 + }, + { + "epoch": 0.20731567480793486, + "grad_norm": 0.3481556177139282, + "learning_rate": 0.0009960175225464732, + "loss": 1.6241, + "step": 452 + }, + { + "epoch": 0.20777433780529755, + "grad_norm": 0.2556866407394409, + "learning_rate": 0.0009959862681227151, + "loss": 1.2629, + "step": 453 + }, + { + "epoch": 0.20823300080266025, + "grad_norm": 0.35603269934654236, + "learning_rate": 0.0009959548920299493, + "loss": 1.5148, + "step": 454 + }, + { + "epoch": 0.20869166380002294, + "grad_norm": 0.4437220096588135, + "learning_rate": 0.0009959233942758725, + "loss": 2.4865, + "step": 455 + }, + { + "epoch": 0.20915032679738563, + "grad_norm": 0.5016736388206482, + "learning_rate": 0.000995891774868211, + "loss": 2.6392, + "step": 456 + }, + { + "epoch": 0.20960898979474832, + "grad_norm": 0.32121479511260986, + "learning_rate": 0.000995860033814722, + "loss": 1.7887, + "step": 457 + }, + { + "epoch": 0.210067652792111, + "grad_norm": 0.316918283700943, + "learning_rate": 0.000995828171123191, + "loss": 1.5659, + "step": 458 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2690139710903168, + "learning_rate": 0.0009957961868014347, + "loss": 1.4294, + "step": 459 + }, + { + "epoch": 0.21098497878683636, + "grad_norm": 0.3223515450954437, + "learning_rate": 0.0009957640808572988, + "loss": 1.7502, + "step": 460 + }, + { + "epoch": 0.21144364178419905, + "grad_norm": 0.267553448677063, + "learning_rate": 0.0009957318532986593, + "loss": 1.1748, + "step": 461 + }, + { + "epoch": 0.21190230478156175, + "grad_norm": 0.30810901522636414, + "learning_rate": 0.0009956995041334215, + "loss": 1.6794, + "step": 462 + }, + { + "epoch": 0.21236096777892444, + "grad_norm": 0.33142194151878357, + "learning_rate": 0.0009956670333695215, + "loss": 1.7014, + "step": 463 + }, + { + "epoch": 0.21281963077628713, + "grad_norm": 0.270020455121994, + "learning_rate": 0.0009956344410149242, + "loss": 1.2939, + "step": 464 + }, + { + "epoch": 0.21327829377364982, + "grad_norm": 0.39550352096557617, + "learning_rate": 0.0009956017270776247, + "loss": 1.9753, + "step": 465 + }, + { + "epoch": 0.2137369567710125, + "grad_norm": 0.3066462278366089, + "learning_rate": 0.0009955688915656482, + "loss": 1.4247, + "step": 466 + }, + { + "epoch": 0.21419561976837517, + "grad_norm": 0.3493700325489044, + "learning_rate": 0.0009955359344870495, + "loss": 1.6978, + "step": 467 + }, + { + "epoch": 0.21465428276573786, + "grad_norm": 0.2585456073284149, + "learning_rate": 0.0009955028558499125, + "loss": 1.2059, + "step": 468 + }, + { + "epoch": 0.21511294576310055, + "grad_norm": 0.31552475690841675, + "learning_rate": 0.0009954696556623529, + "loss": 1.6361, + "step": 469 + }, + { + "epoch": 0.21557160876046325, + "grad_norm": 0.31127098202705383, + "learning_rate": 0.0009954363339325137, + "loss": 1.5196, + "step": 470 + }, + { + "epoch": 0.21603027175782594, + "grad_norm": 0.4300401210784912, + "learning_rate": 0.0009954028906685698, + "loss": 2.5085, + "step": 471 + }, + { + "epoch": 0.21648893475518863, + "grad_norm": 0.08116165548563004, + "learning_rate": 0.0009953693258787247, + "loss": 0.5775, + "step": 472 + }, + { + "epoch": 0.21694759775255132, + "grad_norm": 0.3769618272781372, + "learning_rate": 0.0009953356395712121, + "loss": 2.0813, + "step": 473 + }, + { + "epoch": 0.217406260749914, + "grad_norm": 0.3906650245189667, + "learning_rate": 0.0009953018317542957, + "loss": 2.0854, + "step": 474 + }, + { + "epoch": 0.2178649237472767, + "grad_norm": 0.24557356536388397, + "learning_rate": 0.0009952679024362685, + "loss": 1.2442, + "step": 475 + }, + { + "epoch": 0.21832358674463936, + "grad_norm": 0.3962481915950775, + "learning_rate": 0.000995233851625454, + "loss": 2.356, + "step": 476 + }, + { + "epoch": 0.21878224974200206, + "grad_norm": 0.2204056829214096, + "learning_rate": 0.0009951996793302047, + "loss": 0.9941, + "step": 477 + }, + { + "epoch": 0.21924091273936475, + "grad_norm": 0.07944503426551819, + "learning_rate": 0.0009951653855589035, + "loss": 0.4608, + "step": 478 + }, + { + "epoch": 0.21969957573672744, + "grad_norm": 0.2788356840610504, + "learning_rate": 0.000995130970319963, + "loss": 1.2136, + "step": 479 + }, + { + "epoch": 0.22015823873409013, + "grad_norm": 0.10177599638700485, + "learning_rate": 0.000995096433621825, + "loss": 0.5138, + "step": 480 + }, + { + "epoch": 0.22061690173145282, + "grad_norm": 0.19846868515014648, + "learning_rate": 0.0009950617754729625, + "loss": 1.0419, + "step": 481 + }, + { + "epoch": 0.2210755647288155, + "grad_norm": 0.3579308092594147, + "learning_rate": 0.0009950269958818767, + "loss": 1.4526, + "step": 482 + }, + { + "epoch": 0.2215342277261782, + "grad_norm": 0.36856117844581604, + "learning_rate": 0.0009949920948570995, + "loss": 1.7526, + "step": 483 + }, + { + "epoch": 0.2219928907235409, + "grad_norm": 0.32889747619628906, + "learning_rate": 0.0009949570724071923, + "loss": 1.4898, + "step": 484 + }, + { + "epoch": 0.22245155372090356, + "grad_norm": 0.4301432967185974, + "learning_rate": 0.0009949219285407464, + "loss": 1.9407, + "step": 485 + }, + { + "epoch": 0.22291021671826625, + "grad_norm": 0.3047052323818207, + "learning_rate": 0.000994886663266383, + "loss": 1.3376, + "step": 486 + }, + { + "epoch": 0.22336887971562894, + "grad_norm": 0.27812737226486206, + "learning_rate": 0.0009948512765927525, + "loss": 1.2351, + "step": 487 + }, + { + "epoch": 0.22382754271299163, + "grad_norm": 0.34248921275138855, + "learning_rate": 0.0009948157685285362, + "loss": 1.8019, + "step": 488 + }, + { + "epoch": 0.22428620571035432, + "grad_norm": 0.295585036277771, + "learning_rate": 0.0009947801390824437, + "loss": 1.5117, + "step": 489 + }, + { + "epoch": 0.224744868707717, + "grad_norm": 0.3885442912578583, + "learning_rate": 0.000994744388263216, + "loss": 1.9371, + "step": 490 + }, + { + "epoch": 0.2252035317050797, + "grad_norm": 0.5128207206726074, + "learning_rate": 0.0009947085160796221, + "loss": 1.957, + "step": 491 + }, + { + "epoch": 0.2256621947024424, + "grad_norm": 0.3117857873439789, + "learning_rate": 0.0009946725225404623, + "loss": 1.6161, + "step": 492 + }, + { + "epoch": 0.22612085769980506, + "grad_norm": 0.3748393952846527, + "learning_rate": 0.000994636407654566, + "loss": 2.0451, + "step": 493 + }, + { + "epoch": 0.22657952069716775, + "grad_norm": 0.4382317364215851, + "learning_rate": 0.0009946001714307923, + "loss": 2.3245, + "step": 494 + }, + { + "epoch": 0.22703818369453044, + "grad_norm": 0.4299793243408203, + "learning_rate": 0.0009945638138780305, + "loss": 2.5256, + "step": 495 + }, + { + "epoch": 0.22749684669189313, + "grad_norm": 0.36116522550582886, + "learning_rate": 0.000994527335005199, + "loss": 1.8933, + "step": 496 + }, + { + "epoch": 0.22795550968925582, + "grad_norm": 0.39556559920310974, + "learning_rate": 0.0009944907348212464, + "loss": 1.9866, + "step": 497 + }, + { + "epoch": 0.2284141726866185, + "grad_norm": 0.2519306242465973, + "learning_rate": 0.0009944540133351512, + "loss": 1.0477, + "step": 498 + }, + { + "epoch": 0.2288728356839812, + "grad_norm": 0.23983240127563477, + "learning_rate": 0.0009944171705559214, + "loss": 0.9459, + "step": 499 + }, + { + "epoch": 0.2293314986813439, + "grad_norm": 0.2413322627544403, + "learning_rate": 0.0009943802064925947, + "loss": 0.9827, + "step": 500 + }, + { + "epoch": 0.22979016167870658, + "grad_norm": 0.48764920234680176, + "learning_rate": 0.0009943431211542387, + "loss": 2.16, + "step": 501 + }, + { + "epoch": 0.23024882467606925, + "grad_norm": 0.13628147542476654, + "learning_rate": 0.0009943059145499508, + "loss": 0.7079, + "step": 502 + }, + { + "epoch": 0.23070748767343194, + "grad_norm": 0.3403797447681427, + "learning_rate": 0.0009942685866888576, + "loss": 1.6746, + "step": 503 + }, + { + "epoch": 0.23116615067079463, + "grad_norm": 0.4511934518814087, + "learning_rate": 0.0009942311375801163, + "loss": 1.9286, + "step": 504 + }, + { + "epoch": 0.23162481366815732, + "grad_norm": 0.3697836399078369, + "learning_rate": 0.0009941935672329136, + "loss": 1.4048, + "step": 505 + }, + { + "epoch": 0.23208347666552, + "grad_norm": 0.30644330382347107, + "learning_rate": 0.0009941558756564653, + "loss": 1.3854, + "step": 506 + }, + { + "epoch": 0.2325421396628827, + "grad_norm": 0.2954138517379761, + "learning_rate": 0.0009941180628600178, + "loss": 1.3652, + "step": 507 + }, + { + "epoch": 0.2330008026602454, + "grad_norm": 0.30790993571281433, + "learning_rate": 0.0009940801288528466, + "loss": 1.447, + "step": 508 + }, + { + "epoch": 0.23345946565760808, + "grad_norm": 0.37096887826919556, + "learning_rate": 0.0009940420736442575, + "loss": 1.4563, + "step": 509 + }, + { + "epoch": 0.23391812865497075, + "grad_norm": 0.32588744163513184, + "learning_rate": 0.0009940038972435852, + "loss": 1.4545, + "step": 510 + }, + { + "epoch": 0.23437679165233344, + "grad_norm": 0.30430012941360474, + "learning_rate": 0.000993965599660195, + "loss": 1.509, + "step": 511 + }, + { + "epoch": 0.23483545464969613, + "grad_norm": 0.3035714626312256, + "learning_rate": 0.0009939271809034818, + "loss": 1.5743, + "step": 512 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.21552062034606934, + "learning_rate": 0.0009938886409828695, + "loss": 0.9872, + "step": 513 + }, + { + "epoch": 0.2357527806444215, + "grad_norm": 0.30967265367507935, + "learning_rate": 0.0009938499799078124, + "loss": 1.5457, + "step": 514 + }, + { + "epoch": 0.2362114436417842, + "grad_norm": 0.47051116824150085, + "learning_rate": 0.0009938111976877946, + "loss": 2.3921, + "step": 515 + }, + { + "epoch": 0.2366701066391469, + "grad_norm": 0.24845226109027863, + "learning_rate": 0.0009937722943323293, + "loss": 1.0681, + "step": 516 + }, + { + "epoch": 0.23712876963650958, + "grad_norm": 0.28985902667045593, + "learning_rate": 0.0009937332698509598, + "loss": 1.3932, + "step": 517 + }, + { + "epoch": 0.23758743263387228, + "grad_norm": 0.37183916568756104, + "learning_rate": 0.0009936941242532594, + "loss": 1.9424, + "step": 518 + }, + { + "epoch": 0.23804609563123494, + "grad_norm": 0.32926613092422485, + "learning_rate": 0.0009936548575488306, + "loss": 1.8109, + "step": 519 + }, + { + "epoch": 0.23850475862859763, + "grad_norm": 0.3398732542991638, + "learning_rate": 0.0009936154697473057, + "loss": 1.7718, + "step": 520 + }, + { + "epoch": 0.23896342162596032, + "grad_norm": 0.36996906995773315, + "learning_rate": 0.0009935759608583471, + "loss": 1.7703, + "step": 521 + }, + { + "epoch": 0.239422084623323, + "grad_norm": 0.10577350109815598, + "learning_rate": 0.0009935363308916463, + "loss": 0.5741, + "step": 522 + }, + { + "epoch": 0.2398807476206857, + "grad_norm": 0.4257866144180298, + "learning_rate": 0.0009934965798569248, + "loss": 2.071, + "step": 523 + }, + { + "epoch": 0.2403394106180484, + "grad_norm": 0.23754611611366272, + "learning_rate": 0.0009934567077639342, + "loss": 1.2965, + "step": 524 + }, + { + "epoch": 0.24079807361541108, + "grad_norm": 0.2694760859012604, + "learning_rate": 0.0009934167146224552, + "loss": 1.4501, + "step": 525 + }, + { + "epoch": 0.24125673661277378, + "grad_norm": 0.35148951411247253, + "learning_rate": 0.0009933766004422983, + "loss": 1.674, + "step": 526 + }, + { + "epoch": 0.24171539961013644, + "grad_norm": 0.3345791697502136, + "learning_rate": 0.000993336365233304, + "loss": 1.4195, + "step": 527 + }, + { + "epoch": 0.24217406260749913, + "grad_norm": 0.15900827944278717, + "learning_rate": 0.0009932960090053417, + "loss": 0.7744, + "step": 528 + }, + { + "epoch": 0.24263272560486182, + "grad_norm": 0.358396977186203, + "learning_rate": 0.000993255531768312, + "loss": 1.766, + "step": 529 + }, + { + "epoch": 0.2430913886022245, + "grad_norm": 0.35677099227905273, + "learning_rate": 0.0009932149335321438, + "loss": 1.7595, + "step": 530 + }, + { + "epoch": 0.2435500515995872, + "grad_norm": 0.19825054705142975, + "learning_rate": 0.000993174214306796, + "loss": 0.913, + "step": 531 + }, + { + "epoch": 0.2440087145969499, + "grad_norm": 0.2745889723300934, + "learning_rate": 0.0009931333741022574, + "loss": 1.4666, + "step": 532 + }, + { + "epoch": 0.24446737759431258, + "grad_norm": 0.2890368700027466, + "learning_rate": 0.0009930924129285465, + "loss": 1.6223, + "step": 533 + }, + { + "epoch": 0.24492604059167528, + "grad_norm": 0.21782201528549194, + "learning_rate": 0.0009930513307957114, + "loss": 1.2415, + "step": 534 + }, + { + "epoch": 0.24538470358903797, + "grad_norm": 0.3002353310585022, + "learning_rate": 0.0009930101277138299, + "loss": 1.6741, + "step": 535 + }, + { + "epoch": 0.24584336658640063, + "grad_norm": 0.21679870784282684, + "learning_rate": 0.000992968803693009, + "loss": 1.2902, + "step": 536 + }, + { + "epoch": 0.24630202958376332, + "grad_norm": 0.4101315140724182, + "learning_rate": 0.0009929273587433863, + "loss": 1.9695, + "step": 537 + }, + { + "epoch": 0.246760692581126, + "grad_norm": 0.355189710855484, + "learning_rate": 0.0009928857928751282, + "loss": 1.7978, + "step": 538 + }, + { + "epoch": 0.2472193555784887, + "grad_norm": 0.2238903045654297, + "learning_rate": 0.0009928441060984311, + "loss": 1.2032, + "step": 539 + }, + { + "epoch": 0.2476780185758514, + "grad_norm": 0.31160059571266174, + "learning_rate": 0.0009928022984235214, + "loss": 1.6759, + "step": 540 + }, + { + "epoch": 0.24813668157321409, + "grad_norm": 0.3302862346172333, + "learning_rate": 0.0009927603698606543, + "loss": 1.7676, + "step": 541 + }, + { + "epoch": 0.24859534457057678, + "grad_norm": 0.27699384093284607, + "learning_rate": 0.0009927183204201159, + "loss": 1.348, + "step": 542 + }, + { + "epoch": 0.24905400756793947, + "grad_norm": 0.33498769998550415, + "learning_rate": 0.0009926761501122205, + "loss": 1.6743, + "step": 543 + }, + { + "epoch": 0.24951267056530213, + "grad_norm": 0.2707799971103668, + "learning_rate": 0.0009926338589473134, + "loss": 1.3558, + "step": 544 + }, + { + "epoch": 0.24997133356266482, + "grad_norm": 0.2782471776008606, + "learning_rate": 0.0009925914469357685, + "loss": 1.2665, + "step": 545 + }, + { + "epoch": 0.2504299965600275, + "grad_norm": 0.4235142171382904, + "learning_rate": 0.0009925489140879898, + "loss": 2.0011, + "step": 546 + }, + { + "epoch": 0.2508886595573902, + "grad_norm": 0.3951006233692169, + "learning_rate": 0.000992506260414411, + "loss": 1.7933, + "step": 547 + }, + { + "epoch": 0.2513473225547529, + "grad_norm": 0.26279470324516296, + "learning_rate": 0.0009924634859254952, + "loss": 1.0981, + "step": 548 + }, + { + "epoch": 0.2518059855521156, + "grad_norm": 0.3145405650138855, + "learning_rate": 0.000992420590631736, + "loss": 1.5393, + "step": 549 + }, + { + "epoch": 0.2522646485494783, + "grad_norm": 0.2764291763305664, + "learning_rate": 0.0009923775745436549, + "loss": 1.2642, + "step": 550 + }, + { + "epoch": 0.25272331154684097, + "grad_norm": 0.3677400052547455, + "learning_rate": 0.0009923344376718047, + "loss": 1.1807, + "step": 551 + }, + { + "epoch": 0.25318197454420366, + "grad_norm": 0.4418051242828369, + "learning_rate": 0.000992291180026767, + "loss": 2.4766, + "step": 552 + }, + { + "epoch": 0.25364063754156635, + "grad_norm": 0.3161998391151428, + "learning_rate": 0.0009922478016191531, + "loss": 1.3802, + "step": 553 + }, + { + "epoch": 0.25409930053892904, + "grad_norm": 0.39167219400405884, + "learning_rate": 0.0009922043024596045, + "loss": 2.0244, + "step": 554 + }, + { + "epoch": 0.25455796353629173, + "grad_norm": 0.3040946125984192, + "learning_rate": 0.0009921606825587913, + "loss": 1.6199, + "step": 555 + }, + { + "epoch": 0.2550166265336544, + "grad_norm": 0.22090859711170197, + "learning_rate": 0.000992116941927414, + "loss": 1.0001, + "step": 556 + }, + { + "epoch": 0.25547528953101706, + "grad_norm": 0.29208970069885254, + "learning_rate": 0.0009920730805762026, + "loss": 1.3642, + "step": 557 + }, + { + "epoch": 0.25593395252837975, + "grad_norm": 0.3597604036331177, + "learning_rate": 0.0009920290985159165, + "loss": 1.6378, + "step": 558 + }, + { + "epoch": 0.25639261552574244, + "grad_norm": 0.36907076835632324, + "learning_rate": 0.0009919849957573449, + "loss": 2.0097, + "step": 559 + }, + { + "epoch": 0.25685127852310513, + "grad_norm": 0.5593513250350952, + "learning_rate": 0.0009919407723113062, + "loss": 2.4558, + "step": 560 + }, + { + "epoch": 0.2573099415204678, + "grad_norm": 0.38623642921447754, + "learning_rate": 0.0009918964281886492, + "loss": 1.9032, + "step": 561 + }, + { + "epoch": 0.2577686045178305, + "grad_norm": 0.3310829997062683, + "learning_rate": 0.0009918519634002515, + "loss": 1.403, + "step": 562 + }, + { + "epoch": 0.2582272675151932, + "grad_norm": 0.3338778018951416, + "learning_rate": 0.0009918073779570206, + "loss": 1.7481, + "step": 563 + }, + { + "epoch": 0.2586859305125559, + "grad_norm": 0.32507607340812683, + "learning_rate": 0.000991762671869894, + "loss": 1.4885, + "step": 564 + }, + { + "epoch": 0.2591445935099186, + "grad_norm": 0.13469289243221283, + "learning_rate": 0.0009917178451498382, + "loss": 0.7271, + "step": 565 + }, + { + "epoch": 0.2596032565072813, + "grad_norm": 0.28605028986930847, + "learning_rate": 0.0009916728978078494, + "loss": 1.3297, + "step": 566 + }, + { + "epoch": 0.26006191950464397, + "grad_norm": 0.4288422465324402, + "learning_rate": 0.0009916278298549538, + "loss": 2.4978, + "step": 567 + }, + { + "epoch": 0.26052058250200666, + "grad_norm": 0.2774169445037842, + "learning_rate": 0.0009915826413022064, + "loss": 1.5432, + "step": 568 + }, + { + "epoch": 0.26097924549936935, + "grad_norm": 0.23148323595523834, + "learning_rate": 0.0009915373321606928, + "loss": 1.2222, + "step": 569 + }, + { + "epoch": 0.26143790849673204, + "grad_norm": 0.4910981059074402, + "learning_rate": 0.0009914919024415275, + "loss": 2.5752, + "step": 570 + }, + { + "epoch": 0.26189657149409473, + "grad_norm": 0.2878377139568329, + "learning_rate": 0.0009914463521558546, + "loss": 1.4696, + "step": 571 + }, + { + "epoch": 0.2623552344914574, + "grad_norm": 0.24708285927772522, + "learning_rate": 0.000991400681314848, + "loss": 1.3876, + "step": 572 + }, + { + "epoch": 0.2628138974888201, + "grad_norm": 0.38844871520996094, + "learning_rate": 0.0009913548899297111, + "loss": 2.4133, + "step": 573 + }, + { + "epoch": 0.26327256048618275, + "grad_norm": 0.3721879720687866, + "learning_rate": 0.0009913089780116768, + "loss": 1.9523, + "step": 574 + }, + { + "epoch": 0.26373122348354544, + "grad_norm": 0.304868221282959, + "learning_rate": 0.000991262945572008, + "loss": 1.7794, + "step": 575 + }, + { + "epoch": 0.26418988648090813, + "grad_norm": 0.24861812591552734, + "learning_rate": 0.000991216792621996, + "loss": 1.2682, + "step": 576 + }, + { + "epoch": 0.2646485494782708, + "grad_norm": 0.44407734274864197, + "learning_rate": 0.0009911705191729633, + "loss": 2.2016, + "step": 577 + }, + { + "epoch": 0.2651072124756335, + "grad_norm": 0.33282753825187683, + "learning_rate": 0.0009911241252362603, + "loss": 1.6449, + "step": 578 + }, + { + "epoch": 0.2655658754729962, + "grad_norm": 0.3493805527687073, + "learning_rate": 0.0009910776108232686, + "loss": 1.6497, + "step": 579 + }, + { + "epoch": 0.2660245384703589, + "grad_norm": 0.36523211002349854, + "learning_rate": 0.000991030975945398, + "loss": 2.0684, + "step": 580 + }, + { + "epoch": 0.2664832014677216, + "grad_norm": 0.5898191928863525, + "learning_rate": 0.0009909842206140883, + "loss": 1.5042, + "step": 581 + }, + { + "epoch": 0.2669418644650843, + "grad_norm": 0.14400826394557953, + "learning_rate": 0.0009909373448408092, + "loss": 0.7091, + "step": 582 + }, + { + "epoch": 0.26740052746244697, + "grad_norm": 0.36130577325820923, + "learning_rate": 0.0009908903486370597, + "loss": 1.7273, + "step": 583 + }, + { + "epoch": 0.26785919045980966, + "grad_norm": 0.2697531282901764, + "learning_rate": 0.000990843232014368, + "loss": 1.4963, + "step": 584 + }, + { + "epoch": 0.26831785345717235, + "grad_norm": 0.34319204092025757, + "learning_rate": 0.0009907959949842925, + "loss": 1.804, + "step": 585 + }, + { + "epoch": 0.26877651645453504, + "grad_norm": 0.19325220584869385, + "learning_rate": 0.0009907486375584204, + "loss": 0.9886, + "step": 586 + }, + { + "epoch": 0.26923517945189773, + "grad_norm": 0.23447225987911224, + "learning_rate": 0.0009907011597483691, + "loss": 1.2031, + "step": 587 + }, + { + "epoch": 0.2696938424492604, + "grad_norm": 0.32570499181747437, + "learning_rate": 0.0009906535615657852, + "loss": 1.8141, + "step": 588 + }, + { + "epoch": 0.2701525054466231, + "grad_norm": 0.40676188468933105, + "learning_rate": 0.0009906058430223447, + "loss": 2.1973, + "step": 589 + }, + { + "epoch": 0.2706111684439858, + "grad_norm": 0.3603213429450989, + "learning_rate": 0.0009905580041297537, + "loss": 1.9151, + "step": 590 + }, + { + "epoch": 0.2710698314413485, + "grad_norm": 0.3043905794620514, + "learning_rate": 0.0009905100448997471, + "loss": 1.5438, + "step": 591 + }, + { + "epoch": 0.27152849443871113, + "grad_norm": 0.1927344799041748, + "learning_rate": 0.0009904619653440898, + "loss": 0.9269, + "step": 592 + }, + { + "epoch": 0.2719871574360738, + "grad_norm": 0.29657939076423645, + "learning_rate": 0.000990413765474576, + "loss": 1.4634, + "step": 593 + }, + { + "epoch": 0.2724458204334365, + "grad_norm": 0.1932910680770874, + "learning_rate": 0.0009903654453030293, + "loss": 1.029, + "step": 594 + }, + { + "epoch": 0.2729044834307992, + "grad_norm": 0.21558885276317596, + "learning_rate": 0.0009903170048413032, + "loss": 1.0732, + "step": 595 + }, + { + "epoch": 0.2733631464281619, + "grad_norm": 0.32209712266921997, + "learning_rate": 0.0009902684441012804, + "loss": 1.5956, + "step": 596 + }, + { + "epoch": 0.2738218094255246, + "grad_norm": 0.36815398931503296, + "learning_rate": 0.0009902197630948734, + "loss": 1.8898, + "step": 597 + }, + { + "epoch": 0.2742804724228873, + "grad_norm": 0.28202107548713684, + "learning_rate": 0.0009901709618340237, + "loss": 1.4518, + "step": 598 + }, + { + "epoch": 0.27473913542024997, + "grad_norm": 0.36555325984954834, + "learning_rate": 0.0009901220403307027, + "loss": 1.9685, + "step": 599 + }, + { + "epoch": 0.27519779841761266, + "grad_norm": 0.35351619124412537, + "learning_rate": 0.0009900729985969114, + "loss": 2.0413, + "step": 600 + }, + { + "epoch": 0.27565646141497535, + "grad_norm": 0.2880600690841675, + "learning_rate": 0.0009900238366446798, + "loss": 1.527, + "step": 601 + }, + { + "epoch": 0.27611512441233804, + "grad_norm": 0.24242261052131653, + "learning_rate": 0.000989974554486068, + "loss": 1.2521, + "step": 602 + }, + { + "epoch": 0.27657378740970073, + "grad_norm": 0.3457260727882385, + "learning_rate": 0.000989925152133165, + "loss": 1.9675, + "step": 603 + }, + { + "epoch": 0.2770324504070634, + "grad_norm": 0.337833046913147, + "learning_rate": 0.0009898756295980897, + "loss": 1.7602, + "step": 604 + }, + { + "epoch": 0.2774911134044261, + "grad_norm": 0.6493378281593323, + "learning_rate": 0.00098982598689299, + "loss": 1.5513, + "step": 605 + }, + { + "epoch": 0.2779497764017888, + "grad_norm": 0.34193724393844604, + "learning_rate": 0.0009897762240300442, + "loss": 1.8746, + "step": 606 + }, + { + "epoch": 0.2784084393991515, + "grad_norm": 0.33631783723831177, + "learning_rate": 0.000989726341021459, + "loss": 1.6999, + "step": 607 + }, + { + "epoch": 0.2788671023965142, + "grad_norm": 0.13958394527435303, + "learning_rate": 0.0009896763378794712, + "loss": 0.6808, + "step": 608 + }, + { + "epoch": 0.2793257653938768, + "grad_norm": 0.2809588313102722, + "learning_rate": 0.0009896262146163472, + "loss": 1.3181, + "step": 609 + }, + { + "epoch": 0.2797844283912395, + "grad_norm": 0.3490467965602875, + "learning_rate": 0.0009895759712443821, + "loss": 2.0132, + "step": 610 + }, + { + "epoch": 0.2802430913886022, + "grad_norm": 0.3802107274532318, + "learning_rate": 0.0009895256077759014, + "loss": 2.2368, + "step": 611 + }, + { + "epoch": 0.2807017543859649, + "grad_norm": 0.2700074017047882, + "learning_rate": 0.0009894751242232596, + "loss": 1.6661, + "step": 612 + }, + { + "epoch": 0.2811604173833276, + "grad_norm": 0.3280056416988373, + "learning_rate": 0.0009894245205988401, + "loss": 1.6721, + "step": 613 + }, + { + "epoch": 0.2816190803806903, + "grad_norm": 0.31935131549835205, + "learning_rate": 0.0009893737969150572, + "loss": 1.3081, + "step": 614 + }, + { + "epoch": 0.28207774337805297, + "grad_norm": 0.4210816025733948, + "learning_rate": 0.000989322953184353, + "loss": 2.1636, + "step": 615 + }, + { + "epoch": 0.28253640637541566, + "grad_norm": 0.38598984479904175, + "learning_rate": 0.0009892719894192003, + "loss": 2.2451, + "step": 616 + }, + { + "epoch": 0.28299506937277835, + "grad_norm": 0.28689202666282654, + "learning_rate": 0.0009892209056321009, + "loss": 1.0069, + "step": 617 + }, + { + "epoch": 0.28345373237014104, + "grad_norm": 0.23823553323745728, + "learning_rate": 0.0009891697018355858, + "loss": 1.0115, + "step": 618 + }, + { + "epoch": 0.28391239536750373, + "grad_norm": 0.4259023666381836, + "learning_rate": 0.0009891183780422158, + "loss": 1.9298, + "step": 619 + }, + { + "epoch": 0.2843710583648664, + "grad_norm": 0.3770276606082916, + "learning_rate": 0.0009890669342645807, + "loss": 1.7634, + "step": 620 + }, + { + "epoch": 0.2848297213622291, + "grad_norm": 0.42291995882987976, + "learning_rate": 0.0009890153705153005, + "loss": 1.9407, + "step": 621 + }, + { + "epoch": 0.2852883843595918, + "grad_norm": 0.35341787338256836, + "learning_rate": 0.0009889636868070238, + "loss": 1.7191, + "step": 622 + }, + { + "epoch": 0.2857470473569545, + "grad_norm": 0.3987595736980438, + "learning_rate": 0.000988911883152429, + "loss": 2.0328, + "step": 623 + }, + { + "epoch": 0.2862057103543172, + "grad_norm": 0.30864641070365906, + "learning_rate": 0.0009888599595642244, + "loss": 1.351, + "step": 624 + }, + { + "epoch": 0.2866643733516799, + "grad_norm": 0.25297942757606506, + "learning_rate": 0.0009888079160551466, + "loss": 1.2824, + "step": 625 + }, + { + "epoch": 0.2871230363490425, + "grad_norm": 0.4757532477378845, + "learning_rate": 0.0009887557526379626, + "loss": 2.8047, + "step": 626 + }, + { + "epoch": 0.2875816993464052, + "grad_norm": 0.3067730665206909, + "learning_rate": 0.0009887034693254683, + "loss": 1.437, + "step": 627 + }, + { + "epoch": 0.2880403623437679, + "grad_norm": 0.3403368294239044, + "learning_rate": 0.0009886510661304892, + "loss": 1.9881, + "step": 628 + }, + { + "epoch": 0.2884990253411306, + "grad_norm": 0.21943096816539764, + "learning_rate": 0.0009885985430658803, + "loss": 1.1086, + "step": 629 + }, + { + "epoch": 0.2889576883384933, + "grad_norm": 0.19929127395153046, + "learning_rate": 0.000988545900144526, + "loss": 1.0739, + "step": 630 + }, + { + "epoch": 0.28941635133585597, + "grad_norm": 0.30291882157325745, + "learning_rate": 0.0009884931373793395, + "loss": 1.7326, + "step": 631 + }, + { + "epoch": 0.28987501433321866, + "grad_norm": 0.23580683767795563, + "learning_rate": 0.0009884402547832643, + "loss": 1.3195, + "step": 632 + }, + { + "epoch": 0.29033367733058135, + "grad_norm": 0.24452835321426392, + "learning_rate": 0.0009883872523692727, + "loss": 1.2024, + "step": 633 + }, + { + "epoch": 0.29079234032794404, + "grad_norm": 0.3366321325302124, + "learning_rate": 0.0009883341301503666, + "loss": 1.7041, + "step": 634 + }, + { + "epoch": 0.29125100332530673, + "grad_norm": 0.23907805979251862, + "learning_rate": 0.0009882808881395773, + "loss": 1.2118, + "step": 635 + }, + { + "epoch": 0.2917096663226694, + "grad_norm": 0.28174489736557007, + "learning_rate": 0.0009882275263499655, + "loss": 1.494, + "step": 636 + }, + { + "epoch": 0.2921683293200321, + "grad_norm": 0.2500317394733429, + "learning_rate": 0.0009881740447946212, + "loss": 1.379, + "step": 637 + }, + { + "epoch": 0.2926269923173948, + "grad_norm": 0.3939465880393982, + "learning_rate": 0.000988120443486664, + "loss": 1.8527, + "step": 638 + }, + { + "epoch": 0.2930856553147575, + "grad_norm": 0.2760303318500519, + "learning_rate": 0.0009880667224392422, + "loss": 1.5243, + "step": 639 + }, + { + "epoch": 0.2935443183121202, + "grad_norm": 0.252908855676651, + "learning_rate": 0.0009880128816655343, + "loss": 1.0428, + "step": 640 + }, + { + "epoch": 0.2940029813094829, + "grad_norm": 0.5029557943344116, + "learning_rate": 0.0009879589211787478, + "loss": 2.1211, + "step": 641 + }, + { + "epoch": 0.29446164430684557, + "grad_norm": 0.3588217794895172, + "learning_rate": 0.0009879048409921196, + "loss": 1.4391, + "step": 642 + }, + { + "epoch": 0.2949203073042082, + "grad_norm": 0.28557729721069336, + "learning_rate": 0.000987850641118916, + "loss": 1.3001, + "step": 643 + }, + { + "epoch": 0.2953789703015709, + "grad_norm": 0.42318931221961975, + "learning_rate": 0.0009877963215724323, + "loss": 1.9521, + "step": 644 + }, + { + "epoch": 0.2958376332989336, + "grad_norm": 0.22379803657531738, + "learning_rate": 0.000987741882365994, + "loss": 1.101, + "step": 645 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 0.31854167580604553, + "learning_rate": 0.000987687323512955, + "loss": 1.4523, + "step": 646 + }, + { + "epoch": 0.29675495929365897, + "grad_norm": 0.2017553597688675, + "learning_rate": 0.0009876326450266992, + "loss": 0.9974, + "step": 647 + }, + { + "epoch": 0.29721362229102166, + "grad_norm": 0.19907866418361664, + "learning_rate": 0.0009875778469206396, + "loss": 0.9805, + "step": 648 + }, + { + "epoch": 0.29767228528838435, + "grad_norm": 0.36655089259147644, + "learning_rate": 0.0009875229292082184, + "loss": 2.0823, + "step": 649 + }, + { + "epoch": 0.29813094828574704, + "grad_norm": 0.30942440032958984, + "learning_rate": 0.0009874678919029073, + "loss": 1.5203, + "step": 650 + }, + { + "epoch": 0.29858961128310973, + "grad_norm": 0.3907017707824707, + "learning_rate": 0.0009874127350182078, + "loss": 2.2189, + "step": 651 + }, + { + "epoch": 0.2990482742804724, + "grad_norm": 0.3971099555492401, + "learning_rate": 0.00098735745856765, + "loss": 2.0348, + "step": 652 + }, + { + "epoch": 0.2995069372778351, + "grad_norm": 0.24980095028877258, + "learning_rate": 0.0009873020625647934, + "loss": 1.2255, + "step": 653 + }, + { + "epoch": 0.2999656002751978, + "grad_norm": 0.21592852473258972, + "learning_rate": 0.0009872465470232273, + "loss": 1.0114, + "step": 654 + }, + { + "epoch": 0.3004242632725605, + "grad_norm": 0.3222113847732544, + "learning_rate": 0.00098719091195657, + "loss": 1.4415, + "step": 655 + }, + { + "epoch": 0.3008829262699232, + "grad_norm": 0.3443264067173004, + "learning_rate": 0.000987135157378469, + "loss": 1.6269, + "step": 656 + }, + { + "epoch": 0.3013415892672859, + "grad_norm": 0.32391127943992615, + "learning_rate": 0.0009870792833026017, + "loss": 1.5997, + "step": 657 + }, + { + "epoch": 0.30180025226464857, + "grad_norm": 0.20386099815368652, + "learning_rate": 0.000987023289742674, + "loss": 1.1451, + "step": 658 + }, + { + "epoch": 0.30225891526201126, + "grad_norm": 0.24229376018047333, + "learning_rate": 0.0009869671767124217, + "loss": 1.2804, + "step": 659 + }, + { + "epoch": 0.3027175782593739, + "grad_norm": 0.3021294176578522, + "learning_rate": 0.00098691094422561, + "loss": 1.8578, + "step": 660 + }, + { + "epoch": 0.3031762412567366, + "grad_norm": 0.33602842688560486, + "learning_rate": 0.0009868545922960326, + "loss": 1.9222, + "step": 661 + }, + { + "epoch": 0.3036349042540993, + "grad_norm": 0.2946925759315491, + "learning_rate": 0.0009867981209375134, + "loss": 1.6532, + "step": 662 + }, + { + "epoch": 0.30409356725146197, + "grad_norm": 0.2836867570877075, + "learning_rate": 0.0009867415301639051, + "loss": 1.5413, + "step": 663 + }, + { + "epoch": 0.30455223024882466, + "grad_norm": 0.3318963050842285, + "learning_rate": 0.00098668481998909, + "loss": 1.9834, + "step": 664 + }, + { + "epoch": 0.30501089324618735, + "grad_norm": 0.2361811101436615, + "learning_rate": 0.0009866279904269793, + "loss": 1.1826, + "step": 665 + }, + { + "epoch": 0.30546955624355004, + "grad_norm": 0.20993195474147797, + "learning_rate": 0.0009865710414915137, + "loss": 1.0908, + "step": 666 + }, + { + "epoch": 0.30592821924091274, + "grad_norm": 0.11743535846471786, + "learning_rate": 0.0009865139731966633, + "loss": 0.6957, + "step": 667 + }, + { + "epoch": 0.3063868822382754, + "grad_norm": 0.4895835816860199, + "learning_rate": 0.0009864567855564273, + "loss": 2.6428, + "step": 668 + }, + { + "epoch": 0.3068455452356381, + "grad_norm": 0.3493427038192749, + "learning_rate": 0.0009863994785848343, + "loss": 1.5343, + "step": 669 + }, + { + "epoch": 0.3073042082330008, + "grad_norm": 0.36788055300712585, + "learning_rate": 0.0009863420522959422, + "loss": 1.7888, + "step": 670 + }, + { + "epoch": 0.3077628712303635, + "grad_norm": 0.3058752715587616, + "learning_rate": 0.0009862845067038378, + "loss": 1.4943, + "step": 671 + }, + { + "epoch": 0.3082215342277262, + "grad_norm": 0.29403895139694214, + "learning_rate": 0.0009862268418226376, + "loss": 1.4614, + "step": 672 + }, + { + "epoch": 0.3086801972250889, + "grad_norm": 0.3854977488517761, + "learning_rate": 0.0009861690576664873, + "loss": 1.4861, + "step": 673 + }, + { + "epoch": 0.3091388602224516, + "grad_norm": 0.404697984457016, + "learning_rate": 0.0009861111542495617, + "loss": 2.0234, + "step": 674 + }, + { + "epoch": 0.30959752321981426, + "grad_norm": 0.3911391496658325, + "learning_rate": 0.000986053131586065, + "loss": 2.011, + "step": 675 + }, + { + "epoch": 0.31005618621717695, + "grad_norm": 0.2827008068561554, + "learning_rate": 0.0009859949896902304, + "loss": 1.0028, + "step": 676 + }, + { + "epoch": 0.3105148492145396, + "grad_norm": 0.3550345003604889, + "learning_rate": 0.0009859367285763206, + "loss": 1.6724, + "step": 677 + }, + { + "epoch": 0.3109735122119023, + "grad_norm": 0.33875733613967896, + "learning_rate": 0.0009858783482586276, + "loss": 1.8332, + "step": 678 + }, + { + "epoch": 0.31143217520926497, + "grad_norm": 0.3210250437259674, + "learning_rate": 0.0009858198487514723, + "loss": 1.4062, + "step": 679 + }, + { + "epoch": 0.31189083820662766, + "grad_norm": 0.24080604314804077, + "learning_rate": 0.0009857612300692052, + "loss": 1.1792, + "step": 680 + }, + { + "epoch": 0.31234950120399035, + "grad_norm": 0.2961960434913635, + "learning_rate": 0.0009857024922262057, + "loss": 1.3523, + "step": 681 + }, + { + "epoch": 0.31280816420135305, + "grad_norm": 0.32249388098716736, + "learning_rate": 0.0009856436352368829, + "loss": 1.6275, + "step": 682 + }, + { + "epoch": 0.31326682719871574, + "grad_norm": 0.23962727189064026, + "learning_rate": 0.0009855846591156747, + "loss": 1.3343, + "step": 683 + }, + { + "epoch": 0.3137254901960784, + "grad_norm": 0.24461761116981506, + "learning_rate": 0.0009855255638770485, + "loss": 1.4974, + "step": 684 + }, + { + "epoch": 0.3141841531934411, + "grad_norm": 0.36309170722961426, + "learning_rate": 0.0009854663495355005, + "loss": 1.7183, + "step": 685 + }, + { + "epoch": 0.3146428161908038, + "grad_norm": 0.35972607135772705, + "learning_rate": 0.0009854070161055566, + "loss": 1.9509, + "step": 686 + }, + { + "epoch": 0.3151014791881665, + "grad_norm": 0.3379707932472229, + "learning_rate": 0.0009853475636017717, + "loss": 1.9371, + "step": 687 + }, + { + "epoch": 0.3155601421855292, + "grad_norm": 0.3258635401725769, + "learning_rate": 0.0009852879920387299, + "loss": 1.567, + "step": 688 + }, + { + "epoch": 0.3160188051828919, + "grad_norm": 0.36305832862854004, + "learning_rate": 0.0009852283014310445, + "loss": 2.1206, + "step": 689 + }, + { + "epoch": 0.3164774681802546, + "grad_norm": 0.3250496983528137, + "learning_rate": 0.000985168491793358, + "loss": 1.7597, + "step": 690 + }, + { + "epoch": 0.31693613117761726, + "grad_norm": 0.39209747314453125, + "learning_rate": 0.0009851085631403425, + "loss": 2.2836, + "step": 691 + }, + { + "epoch": 0.31739479417497996, + "grad_norm": 0.3178749978542328, + "learning_rate": 0.0009850485154866983, + "loss": 1.8535, + "step": 692 + }, + { + "epoch": 0.31785345717234265, + "grad_norm": 0.39518505334854126, + "learning_rate": 0.0009849883488471562, + "loss": 1.8896, + "step": 693 + }, + { + "epoch": 0.3183121201697053, + "grad_norm": 0.49922698736190796, + "learning_rate": 0.000984928063236475, + "loss": 1.8992, + "step": 694 + }, + { + "epoch": 0.318770783167068, + "grad_norm": 0.3633042871952057, + "learning_rate": 0.0009848676586694434, + "loss": 1.8554, + "step": 695 + }, + { + "epoch": 0.31922944616443066, + "grad_norm": 0.4339780807495117, + "learning_rate": 0.000984807135160879, + "loss": 2.4392, + "step": 696 + }, + { + "epoch": 0.31968810916179335, + "grad_norm": 0.2694472074508667, + "learning_rate": 0.0009847464927256288, + "loss": 1.1097, + "step": 697 + }, + { + "epoch": 0.32014677215915605, + "grad_norm": 0.3319615125656128, + "learning_rate": 0.000984685731378569, + "loss": 1.5832, + "step": 698 + }, + { + "epoch": 0.32060543515651874, + "grad_norm": 0.39388328790664673, + "learning_rate": 0.0009846248511346041, + "loss": 2.2849, + "step": 699 + }, + { + "epoch": 0.3210640981538814, + "grad_norm": 0.42792367935180664, + "learning_rate": 0.0009845638520086688, + "loss": 2.3538, + "step": 700 + }, + { + "epoch": 0.3215227611512441, + "grad_norm": 0.4416462182998657, + "learning_rate": 0.000984502734015727, + "loss": 2.5254, + "step": 701 + }, + { + "epoch": 0.3219814241486068, + "grad_norm": 0.3952227532863617, + "learning_rate": 0.000984441497170771, + "loss": 2.0428, + "step": 702 + }, + { + "epoch": 0.3224400871459695, + "grad_norm": 0.2395290583372116, + "learning_rate": 0.000984380141488823, + "loss": 0.9864, + "step": 703 + }, + { + "epoch": 0.3228987501433322, + "grad_norm": 0.4042597711086273, + "learning_rate": 0.0009843186669849333, + "loss": 1.9032, + "step": 704 + }, + { + "epoch": 0.3233574131406949, + "grad_norm": 0.3238537013530731, + "learning_rate": 0.0009842570736741827, + "loss": 1.6617, + "step": 705 + }, + { + "epoch": 0.3238160761380576, + "grad_norm": 0.3476428985595703, + "learning_rate": 0.0009841953615716804, + "loss": 1.761, + "step": 706 + }, + { + "epoch": 0.32427473913542026, + "grad_norm": 0.47932443022727966, + "learning_rate": 0.0009841335306925644, + "loss": 2.4644, + "step": 707 + }, + { + "epoch": 0.32473340213278296, + "grad_norm": 0.39882582426071167, + "learning_rate": 0.0009840715810520027, + "loss": 2.0681, + "step": 708 + }, + { + "epoch": 0.32519206513014565, + "grad_norm": 0.3567412793636322, + "learning_rate": 0.0009840095126651919, + "loss": 1.4626, + "step": 709 + }, + { + "epoch": 0.32565072812750834, + "grad_norm": 0.29540038108825684, + "learning_rate": 0.0009839473255473575, + "loss": 1.2916, + "step": 710 + }, + { + "epoch": 0.32610939112487103, + "grad_norm": 0.35967186093330383, + "learning_rate": 0.0009838850197137548, + "loss": 1.5812, + "step": 711 + }, + { + "epoch": 0.32656805412223366, + "grad_norm": 0.4076613485813141, + "learning_rate": 0.0009838225951796678, + "loss": 1.8511, + "step": 712 + }, + { + "epoch": 0.32702671711959636, + "grad_norm": 0.37241417169570923, + "learning_rate": 0.0009837600519604097, + "loss": 1.3398, + "step": 713 + }, + { + "epoch": 0.32748538011695905, + "grad_norm": 0.3827233612537384, + "learning_rate": 0.0009836973900713226, + "loss": 1.9355, + "step": 714 + }, + { + "epoch": 0.32794404311432174, + "grad_norm": 0.3503935635089874, + "learning_rate": 0.0009836346095277782, + "loss": 1.5865, + "step": 715 + }, + { + "epoch": 0.32840270611168443, + "grad_norm": 0.346055269241333, + "learning_rate": 0.000983571710345177, + "loss": 1.879, + "step": 716 + }, + { + "epoch": 0.3288613691090471, + "grad_norm": 0.3340051770210266, + "learning_rate": 0.0009835086925389484, + "loss": 1.531, + "step": 717 + }, + { + "epoch": 0.3293200321064098, + "grad_norm": 0.3591856360435486, + "learning_rate": 0.0009834455561245512, + "loss": 1.5611, + "step": 718 + }, + { + "epoch": 0.3297786951037725, + "grad_norm": 0.08197461068630219, + "learning_rate": 0.0009833823011174735, + "loss": 0.45, + "step": 719 + }, + { + "epoch": 0.3302373581011352, + "grad_norm": 0.35696059465408325, + "learning_rate": 0.0009833189275332318, + "loss": 2.2035, + "step": 720 + }, + { + "epoch": 0.3306960210984979, + "grad_norm": 0.21588194370269775, + "learning_rate": 0.0009832554353873721, + "loss": 1.1911, + "step": 721 + }, + { + "epoch": 0.3311546840958606, + "grad_norm": 0.313678503036499, + "learning_rate": 0.00098319182469547, + "loss": 1.956, + "step": 722 + }, + { + "epoch": 0.33161334709322327, + "grad_norm": 0.3138997554779053, + "learning_rate": 0.000983128095473129, + "loss": 1.5287, + "step": 723 + }, + { + "epoch": 0.33207201009058596, + "grad_norm": 0.38969993591308594, + "learning_rate": 0.0009830642477359828, + "loss": 2.4346, + "step": 724 + }, + { + "epoch": 0.33253067308794865, + "grad_norm": 0.21810193359851837, + "learning_rate": 0.0009830002814996935, + "loss": 1.0224, + "step": 725 + }, + { + "epoch": 0.33298933608531134, + "grad_norm": 0.11237790435552597, + "learning_rate": 0.0009829361967799526, + "loss": 0.6925, + "step": 726 + }, + { + "epoch": 0.33344799908267403, + "grad_norm": 0.31989771127700806, + "learning_rate": 0.0009828719935924804, + "loss": 1.4384, + "step": 727 + }, + { + "epoch": 0.3339066620800367, + "grad_norm": 0.4660812318325043, + "learning_rate": 0.0009828076719530265, + "loss": 2.5269, + "step": 728 + }, + { + "epoch": 0.33436532507739936, + "grad_norm": 0.37333112955093384, + "learning_rate": 0.0009827432318773694, + "loss": 2.13, + "step": 729 + }, + { + "epoch": 0.33482398807476205, + "grad_norm": 0.32906556129455566, + "learning_rate": 0.000982678673381317, + "loss": 1.5657, + "step": 730 + }, + { + "epoch": 0.33528265107212474, + "grad_norm": 0.3006758689880371, + "learning_rate": 0.0009826139964807055, + "loss": 1.4011, + "step": 731 + }, + { + "epoch": 0.33574131406948743, + "grad_norm": 0.3466986119747162, + "learning_rate": 0.0009825492011914009, + "loss": 2.008, + "step": 732 + }, + { + "epoch": 0.3361999770668501, + "grad_norm": 0.29313912987709045, + "learning_rate": 0.0009824842875292977, + "loss": 1.337, + "step": 733 + }, + { + "epoch": 0.3366586400642128, + "grad_norm": 0.3280053436756134, + "learning_rate": 0.0009824192555103202, + "loss": 1.6918, + "step": 734 + }, + { + "epoch": 0.3371173030615755, + "grad_norm": 0.22816585004329681, + "learning_rate": 0.0009823541051504207, + "loss": 1.1973, + "step": 735 + }, + { + "epoch": 0.3375759660589382, + "grad_norm": 0.3135545551776886, + "learning_rate": 0.0009822888364655813, + "loss": 1.6632, + "step": 736 + }, + { + "epoch": 0.3380346290563009, + "grad_norm": 0.30442047119140625, + "learning_rate": 0.0009822234494718128, + "loss": 1.3535, + "step": 737 + }, + { + "epoch": 0.3384932920536636, + "grad_norm": 0.2706790268421173, + "learning_rate": 0.0009821579441851552, + "loss": 1.0, + "step": 738 + }, + { + "epoch": 0.33895195505102627, + "grad_norm": 0.39678725600242615, + "learning_rate": 0.0009820923206216774, + "loss": 1.9822, + "step": 739 + }, + { + "epoch": 0.33941061804838896, + "grad_norm": 0.38112956285476685, + "learning_rate": 0.0009820265787974772, + "loss": 2.0559, + "step": 740 + }, + { + "epoch": 0.33986928104575165, + "grad_norm": 0.32253459095954895, + "learning_rate": 0.0009819607187286816, + "loss": 1.389, + "step": 741 + }, + { + "epoch": 0.34032794404311434, + "grad_norm": 0.34095335006713867, + "learning_rate": 0.0009818947404314465, + "loss": 1.7615, + "step": 742 + }, + { + "epoch": 0.34078660704047703, + "grad_norm": 0.36802440881729126, + "learning_rate": 0.0009818286439219571, + "loss": 1.8613, + "step": 743 + }, + { + "epoch": 0.3412452700378397, + "grad_norm": 0.415279746055603, + "learning_rate": 0.0009817624292164271, + "loss": 1.9945, + "step": 744 + }, + { + "epoch": 0.3417039330352024, + "grad_norm": 0.3048365116119385, + "learning_rate": 0.0009816960963310998, + "loss": 1.6786, + "step": 745 + }, + { + "epoch": 0.34216259603256505, + "grad_norm": 0.23415561020374298, + "learning_rate": 0.0009816296452822464, + "loss": 1.0875, + "step": 746 + }, + { + "epoch": 0.34262125902992774, + "grad_norm": 0.3987458348274231, + "learning_rate": 0.0009815630760861684, + "loss": 1.94, + "step": 747 + }, + { + "epoch": 0.34307992202729043, + "grad_norm": 0.31393012404441833, + "learning_rate": 0.000981496388759196, + "loss": 1.4587, + "step": 748 + }, + { + "epoch": 0.3435385850246531, + "grad_norm": 0.29491788148880005, + "learning_rate": 0.0009814295833176871, + "loss": 1.3318, + "step": 749 + }, + { + "epoch": 0.3439972480220158, + "grad_norm": 0.23884756863117218, + "learning_rate": 0.0009813626597780305, + "loss": 0.908, + "step": 750 + }, + { + "epoch": 0.3444559110193785, + "grad_norm": 0.3067830502986908, + "learning_rate": 0.0009812956181566425, + "loss": 1.4077, + "step": 751 + }, + { + "epoch": 0.3449145740167412, + "grad_norm": 0.42813050746917725, + "learning_rate": 0.000981228458469969, + "loss": 2.1209, + "step": 752 + }, + { + "epoch": 0.3453732370141039, + "grad_norm": 0.20336052775382996, + "learning_rate": 0.0009811611807344848, + "loss": 0.9812, + "step": 753 + }, + { + "epoch": 0.3458319000114666, + "grad_norm": 0.35509541630744934, + "learning_rate": 0.0009810937849666938, + "loss": 2.0477, + "step": 754 + }, + { + "epoch": 0.34629056300882927, + "grad_norm": 0.3570297360420227, + "learning_rate": 0.0009810262711831283, + "loss": 2.0165, + "step": 755 + }, + { + "epoch": 0.34674922600619196, + "grad_norm": 0.3037537932395935, + "learning_rate": 0.0009809586394003502, + "loss": 1.6942, + "step": 756 + }, + { + "epoch": 0.34720788900355465, + "grad_norm": 0.30765625834465027, + "learning_rate": 0.00098089088963495, + "loss": 1.7054, + "step": 757 + }, + { + "epoch": 0.34766655200091734, + "grad_norm": 0.3930741250514984, + "learning_rate": 0.0009808230219035471, + "loss": 2.2827, + "step": 758 + }, + { + "epoch": 0.34812521499828003, + "grad_norm": 0.34178081154823303, + "learning_rate": 0.00098075503622279, + "loss": 1.7407, + "step": 759 + }, + { + "epoch": 0.3485838779956427, + "grad_norm": 0.2623182237148285, + "learning_rate": 0.0009806869326093562, + "loss": 1.2565, + "step": 760 + }, + { + "epoch": 0.3490425409930054, + "grad_norm": 0.4342869520187378, + "learning_rate": 0.0009806187110799518, + "loss": 2.2317, + "step": 761 + }, + { + "epoch": 0.3495012039903681, + "grad_norm": 0.11686166375875473, + "learning_rate": 0.0009805503716513121, + "loss": 0.5233, + "step": 762 + }, + { + "epoch": 0.34995986698773074, + "grad_norm": 0.3906189799308777, + "learning_rate": 0.0009804819143402013, + "loss": 1.9533, + "step": 763 + }, + { + "epoch": 0.35041852998509343, + "grad_norm": 0.4378143548965454, + "learning_rate": 0.0009804133391634125, + "loss": 2.1091, + "step": 764 + }, + { + "epoch": 0.3508771929824561, + "grad_norm": 0.2528671622276306, + "learning_rate": 0.0009803446461377676, + "loss": 1.0801, + "step": 765 + }, + { + "epoch": 0.3513358559798188, + "grad_norm": 0.2157050520181656, + "learning_rate": 0.0009802758352801177, + "loss": 1.1054, + "step": 766 + }, + { + "epoch": 0.3517945189771815, + "grad_norm": 0.4617224335670471, + "learning_rate": 0.0009802069066073423, + "loss": 2.3428, + "step": 767 + }, + { + "epoch": 0.3522531819745442, + "grad_norm": 0.3335023820400238, + "learning_rate": 0.0009801378601363502, + "loss": 1.601, + "step": 768 + }, + { + "epoch": 0.3527118449719069, + "grad_norm": 0.3270067870616913, + "learning_rate": 0.000980068695884079, + "loss": 1.4522, + "step": 769 + }, + { + "epoch": 0.3531705079692696, + "grad_norm": 0.2701742947101593, + "learning_rate": 0.0009799994138674955, + "loss": 1.1839, + "step": 770 + }, + { + "epoch": 0.35362917096663227, + "grad_norm": 0.3580704629421234, + "learning_rate": 0.0009799300141035945, + "loss": 1.8822, + "step": 771 + }, + { + "epoch": 0.35408783396399496, + "grad_norm": 0.37125471234321594, + "learning_rate": 0.0009798604966094007, + "loss": 2.0035, + "step": 772 + }, + { + "epoch": 0.35454649696135765, + "grad_norm": 0.3791847229003906, + "learning_rate": 0.000979790861401967, + "loss": 1.8893, + "step": 773 + }, + { + "epoch": 0.35500515995872034, + "grad_norm": 0.3457506000995636, + "learning_rate": 0.0009797211084983757, + "loss": 1.3708, + "step": 774 + }, + { + "epoch": 0.35546382295608303, + "grad_norm": 0.4049646854400635, + "learning_rate": 0.0009796512379157375, + "loss": 1.923, + "step": 775 + }, + { + "epoch": 0.3559224859534457, + "grad_norm": 0.41086849570274353, + "learning_rate": 0.000979581249671192, + "loss": 1.848, + "step": 776 + }, + { + "epoch": 0.3563811489508084, + "grad_norm": 0.32881003618240356, + "learning_rate": 0.000979511143781908, + "loss": 1.7467, + "step": 777 + }, + { + "epoch": 0.3568398119481711, + "grad_norm": 0.3037266135215759, + "learning_rate": 0.0009794409202650831, + "loss": 1.6274, + "step": 778 + }, + { + "epoch": 0.3572984749455338, + "grad_norm": 0.34774550795555115, + "learning_rate": 0.0009793705791379436, + "loss": 1.6964, + "step": 779 + }, + { + "epoch": 0.35775713794289643, + "grad_norm": 0.19517025351524353, + "learning_rate": 0.0009793001204177444, + "loss": 1.1024, + "step": 780 + }, + { + "epoch": 0.3582158009402591, + "grad_norm": 0.25842177867889404, + "learning_rate": 0.0009792295441217699, + "loss": 1.3525, + "step": 781 + }, + { + "epoch": 0.3586744639376218, + "grad_norm": 0.2821122407913208, + "learning_rate": 0.0009791588502673326, + "loss": 1.2322, + "step": 782 + }, + { + "epoch": 0.3591331269349845, + "grad_norm": 0.40477707982063293, + "learning_rate": 0.0009790880388717746, + "loss": 1.7263, + "step": 783 + }, + { + "epoch": 0.3595917899323472, + "grad_norm": 0.39657363295555115, + "learning_rate": 0.0009790171099524662, + "loss": 2.3909, + "step": 784 + }, + { + "epoch": 0.3600504529297099, + "grad_norm": 0.31579139828681946, + "learning_rate": 0.0009789460635268067, + "loss": 1.5805, + "step": 785 + }, + { + "epoch": 0.3605091159270726, + "grad_norm": 0.4094010889530182, + "learning_rate": 0.0009788748996122246, + "loss": 2.2917, + "step": 786 + }, + { + "epoch": 0.36096777892443527, + "grad_norm": 0.31713518500328064, + "learning_rate": 0.0009788036182261767, + "loss": 1.483, + "step": 787 + }, + { + "epoch": 0.36142644192179796, + "grad_norm": 0.38277119398117065, + "learning_rate": 0.000978732219386149, + "loss": 2.1552, + "step": 788 + }, + { + "epoch": 0.36188510491916065, + "grad_norm": 0.2237308770418167, + "learning_rate": 0.000978660703109656, + "loss": 1.2989, + "step": 789 + }, + { + "epoch": 0.36234376791652334, + "grad_norm": 0.32253265380859375, + "learning_rate": 0.0009785890694142413, + "loss": 1.4193, + "step": 790 + }, + { + "epoch": 0.36280243091388603, + "grad_norm": 0.28654542565345764, + "learning_rate": 0.0009785173183174767, + "loss": 0.8873, + "step": 791 + }, + { + "epoch": 0.3632610939112487, + "grad_norm": 0.313298761844635, + "learning_rate": 0.000978445449836964, + "loss": 1.4503, + "step": 792 + }, + { + "epoch": 0.3637197569086114, + "grad_norm": 0.3323945701122284, + "learning_rate": 0.0009783734639903325, + "loss": 1.4465, + "step": 793 + }, + { + "epoch": 0.3641784199059741, + "grad_norm": 0.34506741166114807, + "learning_rate": 0.0009783013607952411, + "loss": 1.4567, + "step": 794 + }, + { + "epoch": 0.3646370829033368, + "grad_norm": 0.36164945363998413, + "learning_rate": 0.000978229140269377, + "loss": 1.8277, + "step": 795 + }, + { + "epoch": 0.3650957459006995, + "grad_norm": 0.40301600098609924, + "learning_rate": 0.0009781568024304567, + "loss": 1.7805, + "step": 796 + }, + { + "epoch": 0.3655544088980621, + "grad_norm": 0.2755573093891144, + "learning_rate": 0.0009780843472962251, + "loss": 1.461, + "step": 797 + }, + { + "epoch": 0.3660130718954248, + "grad_norm": 0.3744322657585144, + "learning_rate": 0.000978011774884456, + "loss": 2.009, + "step": 798 + }, + { + "epoch": 0.3664717348927875, + "grad_norm": 0.4854291081428528, + "learning_rate": 0.0009779390852129519, + "loss": 1.8751, + "step": 799 + }, + { + "epoch": 0.3669303978901502, + "grad_norm": 0.21938319504261017, + "learning_rate": 0.000977866278299544, + "loss": 0.996, + "step": 800 + }, + { + "epoch": 0.3673890608875129, + "grad_norm": 0.3286993205547333, + "learning_rate": 0.0009777933541620925, + "loss": 1.4633, + "step": 801 + }, + { + "epoch": 0.3678477238848756, + "grad_norm": 0.3668621778488159, + "learning_rate": 0.000977720312818486, + "loss": 1.9308, + "step": 802 + }, + { + "epoch": 0.36830638688223827, + "grad_norm": 0.36969929933547974, + "learning_rate": 0.0009776471542866424, + "loss": 1.9707, + "step": 803 + }, + { + "epoch": 0.36876504987960096, + "grad_norm": 0.22716723382472992, + "learning_rate": 0.0009775738785845077, + "loss": 0.9617, + "step": 804 + }, + { + "epoch": 0.36922371287696365, + "grad_norm": 0.1840977519750595, + "learning_rate": 0.0009775004857300572, + "loss": 1.0723, + "step": 805 + }, + { + "epoch": 0.36968237587432634, + "grad_norm": 0.2663559019565582, + "learning_rate": 0.0009774269757412944, + "loss": 1.6182, + "step": 806 + }, + { + "epoch": 0.37014103887168903, + "grad_norm": 0.22786211967468262, + "learning_rate": 0.0009773533486362523, + "loss": 1.11, + "step": 807 + }, + { + "epoch": 0.3705997018690517, + "grad_norm": 0.10840432345867157, + "learning_rate": 0.0009772796044329917, + "loss": 0.6749, + "step": 808 + }, + { + "epoch": 0.3710583648664144, + "grad_norm": 0.3396431803703308, + "learning_rate": 0.000977205743149603, + "loss": 2.0026, + "step": 809 + }, + { + "epoch": 0.3715170278637771, + "grad_norm": 0.34386134147644043, + "learning_rate": 0.0009771317648042043, + "loss": 2.0295, + "step": 810 + }, + { + "epoch": 0.3719756908611398, + "grad_norm": 0.22869761288166046, + "learning_rate": 0.0009770576694149435, + "loss": 1.061, + "step": 811 + }, + { + "epoch": 0.3724343538585025, + "grad_norm": 0.20139172673225403, + "learning_rate": 0.000976983456999997, + "loss": 0.9954, + "step": 812 + }, + { + "epoch": 0.3728930168558652, + "grad_norm": 0.21274548768997192, + "learning_rate": 0.0009769091275775687, + "loss": 1.0131, + "step": 813 + }, + { + "epoch": 0.3733516798532278, + "grad_norm": 0.4451286196708679, + "learning_rate": 0.0009768346811658932, + "loss": 2.5039, + "step": 814 + }, + { + "epoch": 0.3738103428505905, + "grad_norm": 0.2519133388996124, + "learning_rate": 0.0009767601177832319, + "loss": 1.1851, + "step": 815 + }, + { + "epoch": 0.3742690058479532, + "grad_norm": 0.36184462904930115, + "learning_rate": 0.0009766854374478765, + "loss": 1.9926, + "step": 816 + }, + { + "epoch": 0.3747276688453159, + "grad_norm": 0.2603318393230438, + "learning_rate": 0.0009766106401781459, + "loss": 1.3791, + "step": 817 + }, + { + "epoch": 0.3751863318426786, + "grad_norm": 0.26677289605140686, + "learning_rate": 0.0009765357259923885, + "loss": 1.2404, + "step": 818 + }, + { + "epoch": 0.37564499484004127, + "grad_norm": 0.1966245174407959, + "learning_rate": 0.000976460694908982, + "loss": 1.0012, + "step": 819 + }, + { + "epoch": 0.37610365783740396, + "grad_norm": 0.24759866297245026, + "learning_rate": 0.0009763855469463314, + "loss": 1.4005, + "step": 820 + }, + { + "epoch": 0.37656232083476665, + "grad_norm": 0.387797474861145, + "learning_rate": 0.000976310282122871, + "loss": 2.082, + "step": 821 + }, + { + "epoch": 0.37702098383212934, + "grad_norm": 0.3424549698829651, + "learning_rate": 0.0009762349004570644, + "loss": 1.84, + "step": 822 + }, + { + "epoch": 0.37747964682949203, + "grad_norm": 0.35073143243789673, + "learning_rate": 0.0009761594019674027, + "loss": 1.981, + "step": 823 + }, + { + "epoch": 0.3779383098268547, + "grad_norm": 0.11143548041582108, + "learning_rate": 0.0009760837866724063, + "loss": 0.6522, + "step": 824 + }, + { + "epoch": 0.3783969728242174, + "grad_norm": 0.2800116539001465, + "learning_rate": 0.0009760080545906244, + "loss": 1.4713, + "step": 825 + }, + { + "epoch": 0.3788556358215801, + "grad_norm": 0.365592896938324, + "learning_rate": 0.0009759322057406346, + "loss": 2.3906, + "step": 826 + }, + { + "epoch": 0.3793142988189428, + "grad_norm": 0.3204135000705719, + "learning_rate": 0.0009758562401410429, + "loss": 1.6213, + "step": 827 + }, + { + "epoch": 0.3797729618163055, + "grad_norm": 0.32533779740333557, + "learning_rate": 0.0009757801578104846, + "loss": 1.8047, + "step": 828 + }, + { + "epoch": 0.3802316248136682, + "grad_norm": 0.30973535776138306, + "learning_rate": 0.0009757039587676229, + "loss": 1.8927, + "step": 829 + }, + { + "epoch": 0.38069028781103087, + "grad_norm": 0.36675432324409485, + "learning_rate": 0.0009756276430311501, + "loss": 1.9817, + "step": 830 + }, + { + "epoch": 0.3811489508083935, + "grad_norm": 0.5217725038528442, + "learning_rate": 0.000975551210619787, + "loss": 1.6754, + "step": 831 + }, + { + "epoch": 0.3816076138057562, + "grad_norm": 0.22404442727565765, + "learning_rate": 0.0009754746615522832, + "loss": 1.0305, + "step": 832 + }, + { + "epoch": 0.3820662768031189, + "grad_norm": 1.0014019012451172, + "learning_rate": 0.0009753979958474164, + "loss": 1.6633, + "step": 833 + }, + { + "epoch": 0.3825249398004816, + "grad_norm": 6.931515216827393, + "learning_rate": 0.0009753212135239935, + "loss": 2.2435, + "step": 834 + }, + { + "epoch": 0.38298360279784427, + "grad_norm": 4.728778839111328, + "learning_rate": 0.0009752443146008496, + "loss": 2.3665, + "step": 835 + }, + { + "epoch": 0.38344226579520696, + "grad_norm": 0.5018776655197144, + "learning_rate": 0.0009751672990968486, + "loss": 1.4681, + "step": 836 + }, + { + "epoch": 0.38390092879256965, + "grad_norm": 1.0198359489440918, + "learning_rate": 0.0009750901670308831, + "loss": 2.5583, + "step": 837 + }, + { + "epoch": 0.38435959178993234, + "grad_norm": 0.3947281241416931, + "learning_rate": 0.0009750129184218741, + "loss": 1.788, + "step": 838 + }, + { + "epoch": 0.38481825478729503, + "grad_norm": 0.4941509962081909, + "learning_rate": 0.0009749355532887712, + "loss": 1.4786, + "step": 839 + }, + { + "epoch": 0.3852769177846577, + "grad_norm": 0.20619411766529083, + "learning_rate": 0.0009748580716505523, + "loss": 0.8842, + "step": 840 + }, + { + "epoch": 0.3857355807820204, + "grad_norm": 0.4519241750240326, + "learning_rate": 0.0009747804735262249, + "loss": 1.1354, + "step": 841 + }, + { + "epoch": 0.3861942437793831, + "grad_norm": 0.5304630398750305, + "learning_rate": 0.0009747027589348239, + "loss": 2.087, + "step": 842 + }, + { + "epoch": 0.3866529067767458, + "grad_norm": 0.4589567184448242, + "learning_rate": 0.0009746249278954134, + "loss": 1.955, + "step": 843 + }, + { + "epoch": 0.3871115697741085, + "grad_norm": 0.4906143248081207, + "learning_rate": 0.0009745469804270857, + "loss": 2.0344, + "step": 844 + }, + { + "epoch": 0.3875702327714712, + "grad_norm": 0.330793172121048, + "learning_rate": 0.0009744689165489621, + "loss": 1.4565, + "step": 845 + }, + { + "epoch": 0.38802889576883387, + "grad_norm": 0.3725048899650574, + "learning_rate": 0.0009743907362801923, + "loss": 1.9084, + "step": 846 + }, + { + "epoch": 0.38848755876619656, + "grad_norm": 0.5816589593887329, + "learning_rate": 0.0009743124396399541, + "loss": 2.4114, + "step": 847 + }, + { + "epoch": 0.38894622176355925, + "grad_norm": 0.2848202586174011, + "learning_rate": 0.0009742340266474547, + "loss": 1.4275, + "step": 848 + }, + { + "epoch": 0.3894048847609219, + "grad_norm": 0.28104984760284424, + "learning_rate": 0.0009741554973219291, + "loss": 1.0759, + "step": 849 + }, + { + "epoch": 0.3898635477582846, + "grad_norm": 0.39782270789146423, + "learning_rate": 0.000974076851682641, + "loss": 1.6635, + "step": 850 + }, + { + "epoch": 0.39032221075564727, + "grad_norm": 0.25713643431663513, + "learning_rate": 0.0009739980897488831, + "loss": 1.3452, + "step": 851 + }, + { + "epoch": 0.39078087375300996, + "grad_norm": 0.12562298774719238, + "learning_rate": 0.0009739192115399762, + "loss": 0.65, + "step": 852 + }, + { + "epoch": 0.39123953675037265, + "grad_norm": 0.22468183934688568, + "learning_rate": 0.0009738402170752693, + "loss": 1.1257, + "step": 853 + }, + { + "epoch": 0.39169819974773534, + "grad_norm": 0.34269779920578003, + "learning_rate": 0.0009737611063741407, + "loss": 1.5716, + "step": 854 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 0.3019089102745056, + "learning_rate": 0.0009736818794559967, + "loss": 1.5173, + "step": 855 + }, + { + "epoch": 0.3926155257424607, + "grad_norm": 0.19614103436470032, + "learning_rate": 0.0009736025363402723, + "loss": 0.6453, + "step": 856 + }, + { + "epoch": 0.3930741887398234, + "grad_norm": 0.21685221791267395, + "learning_rate": 0.000973523077046431, + "loss": 1.0789, + "step": 857 + }, + { + "epoch": 0.3935328517371861, + "grad_norm": 0.30863749980926514, + "learning_rate": 0.0009734435015939644, + "loss": 1.4852, + "step": 858 + }, + { + "epoch": 0.3939915147345488, + "grad_norm": 0.1502714902162552, + "learning_rate": 0.0009733638100023932, + "loss": 0.7736, + "step": 859 + }, + { + "epoch": 0.3944501777319115, + "grad_norm": 0.3218334913253784, + "learning_rate": 0.0009732840022912664, + "loss": 1.662, + "step": 860 + }, + { + "epoch": 0.3949088407292742, + "grad_norm": 0.23470452427864075, + "learning_rate": 0.0009732040784801611, + "loss": 1.1917, + "step": 861 + }, + { + "epoch": 0.39536750372663687, + "grad_norm": 0.3882734775543213, + "learning_rate": 0.0009731240385886835, + "loss": 1.7597, + "step": 862 + }, + { + "epoch": 0.39582616672399956, + "grad_norm": 0.41674375534057617, + "learning_rate": 0.0009730438826364679, + "loss": 1.887, + "step": 863 + }, + { + "epoch": 0.39628482972136225, + "grad_norm": 0.3275260329246521, + "learning_rate": 0.0009729636106431768, + "loss": 1.5059, + "step": 864 + }, + { + "epoch": 0.39674349271872494, + "grad_norm": 0.262455016374588, + "learning_rate": 0.000972883222628502, + "loss": 1.4503, + "step": 865 + }, + { + "epoch": 0.3972021557160876, + "grad_norm": 0.3371032774448395, + "learning_rate": 0.0009728027186121629, + "loss": 1.7706, + "step": 866 + }, + { + "epoch": 0.39766081871345027, + "grad_norm": 0.4705641269683838, + "learning_rate": 0.0009727220986139079, + "loss": 2.3818, + "step": 867 + }, + { + "epoch": 0.39811948171081296, + "grad_norm": 0.37057843804359436, + "learning_rate": 0.0009726413626535136, + "loss": 1.912, + "step": 868 + }, + { + "epoch": 0.39857814470817565, + "grad_norm": 0.19416803121566772, + "learning_rate": 0.0009725605107507851, + "loss": 0.9414, + "step": 869 + }, + { + "epoch": 0.39903680770553834, + "grad_norm": 0.26515352725982666, + "learning_rate": 0.0009724795429255559, + "loss": 1.5286, + "step": 870 + }, + { + "epoch": 0.39949547070290103, + "grad_norm": 0.33706986904144287, + "learning_rate": 0.0009723984591976882, + "loss": 1.4502, + "step": 871 + }, + { + "epoch": 0.3999541337002637, + "grad_norm": 0.0702093318104744, + "learning_rate": 0.0009723172595870724, + "loss": 0.4485, + "step": 872 + }, + { + "epoch": 0.4004127966976264, + "grad_norm": 0.376376211643219, + "learning_rate": 0.000972235944113627, + "loss": 2.1362, + "step": 873 + }, + { + "epoch": 0.4008714596949891, + "grad_norm": 0.3718342185020447, + "learning_rate": 0.0009721545127972998, + "loss": 2.3489, + "step": 874 + }, + { + "epoch": 0.4013301226923518, + "grad_norm": 0.41915109753608704, + "learning_rate": 0.0009720729656580658, + "loss": 1.9443, + "step": 875 + }, + { + "epoch": 0.4017887856897145, + "grad_norm": 0.2805352210998535, + "learning_rate": 0.0009719913027159298, + "loss": 1.397, + "step": 876 + }, + { + "epoch": 0.4022474486870772, + "grad_norm": 0.23363643884658813, + "learning_rate": 0.000971909523990924, + "loss": 0.9596, + "step": 877 + }, + { + "epoch": 0.40270611168443987, + "grad_norm": 0.41541317105293274, + "learning_rate": 0.0009718276295031091, + "loss": 1.8851, + "step": 878 + }, + { + "epoch": 0.40316477468180256, + "grad_norm": 0.37006086111068726, + "learning_rate": 0.0009717456192725747, + "loss": 1.9034, + "step": 879 + }, + { + "epoch": 0.40362343767916525, + "grad_norm": 0.334375262260437, + "learning_rate": 0.0009716634933194386, + "loss": 1.5997, + "step": 880 + }, + { + "epoch": 0.40408210067652794, + "grad_norm": 0.3790551424026489, + "learning_rate": 0.0009715812516638466, + "loss": 1.7657, + "step": 881 + }, + { + "epoch": 0.40454076367389064, + "grad_norm": 0.3914640247821808, + "learning_rate": 0.0009714988943259733, + "loss": 1.7908, + "step": 882 + }, + { + "epoch": 0.40499942667125327, + "grad_norm": 0.2521947920322418, + "learning_rate": 0.0009714164213260215, + "loss": 1.1801, + "step": 883 + }, + { + "epoch": 0.40545808966861596, + "grad_norm": 0.3940063714981079, + "learning_rate": 0.0009713338326842225, + "loss": 1.5258, + "step": 884 + }, + { + "epoch": 0.40591675266597865, + "grad_norm": 0.41144511103630066, + "learning_rate": 0.0009712511284208358, + "loss": 2.3174, + "step": 885 + }, + { + "epoch": 0.40637541566334134, + "grad_norm": 0.31135112047195435, + "learning_rate": 0.0009711683085561496, + "loss": 1.8175, + "step": 886 + }, + { + "epoch": 0.40683407866070403, + "grad_norm": 0.27680450677871704, + "learning_rate": 0.0009710853731104798, + "loss": 1.4197, + "step": 887 + }, + { + "epoch": 0.4072927416580667, + "grad_norm": 0.3753867745399475, + "learning_rate": 0.0009710023221041712, + "loss": 1.8083, + "step": 888 + }, + { + "epoch": 0.4077514046554294, + "grad_norm": 0.39393678307533264, + "learning_rate": 0.0009709191555575972, + "loss": 1.7822, + "step": 889 + }, + { + "epoch": 0.4082100676527921, + "grad_norm": 0.24905888736248016, + "learning_rate": 0.0009708358734911586, + "loss": 1.3235, + "step": 890 + }, + { + "epoch": 0.4086687306501548, + "grad_norm": 0.2505285143852234, + "learning_rate": 0.0009707524759252855, + "loss": 1.2241, + "step": 891 + }, + { + "epoch": 0.4091273936475175, + "grad_norm": 0.39112672209739685, + "learning_rate": 0.0009706689628804355, + "loss": 1.9813, + "step": 892 + }, + { + "epoch": 0.4095860566448802, + "grad_norm": 0.3590749502182007, + "learning_rate": 0.0009705853343770954, + "loss": 1.4428, + "step": 893 + }, + { + "epoch": 0.41004471964224287, + "grad_norm": 0.4543074369430542, + "learning_rate": 0.0009705015904357797, + "loss": 2.1425, + "step": 894 + }, + { + "epoch": 0.41050338263960556, + "grad_norm": 0.5381583571434021, + "learning_rate": 0.0009704177310770313, + "loss": 1.4238, + "step": 895 + }, + { + "epoch": 0.41096204563696825, + "grad_norm": 0.5682449340820312, + "learning_rate": 0.0009703337563214216, + "loss": 0.8868, + "step": 896 + }, + { + "epoch": 0.41142070863433094, + "grad_norm": 0.8263031244277954, + "learning_rate": 0.0009702496661895501, + "loss": 2.1425, + "step": 897 + }, + { + "epoch": 0.41187937163169364, + "grad_norm": 0.07518908381462097, + "learning_rate": 0.0009701654607020449, + "loss": 0.4335, + "step": 898 + }, + { + "epoch": 0.4123380346290563, + "grad_norm": 0.43035972118377686, + "learning_rate": 0.0009700811398795622, + "loss": 1.9609, + "step": 899 + }, + { + "epoch": 0.41279669762641896, + "grad_norm": 0.33483147621154785, + "learning_rate": 0.0009699967037427864, + "loss": 1.614, + "step": 900 + }, + { + "epoch": 0.41325536062378165, + "grad_norm": 0.20277926325798035, + "learning_rate": 0.0009699121523124301, + "loss": 0.9445, + "step": 901 + }, + { + "epoch": 0.41371402362114434, + "grad_norm": 0.20781390368938446, + "learning_rate": 0.0009698274856092348, + "loss": 1.0036, + "step": 902 + }, + { + "epoch": 0.41417268661850704, + "grad_norm": 0.3994620740413666, + "learning_rate": 0.0009697427036539696, + "loss": 2.8057, + "step": 903 + }, + { + "epoch": 0.4146313496158697, + "grad_norm": 0.20834676921367645, + "learning_rate": 0.0009696578064674322, + "loss": 1.2151, + "step": 904 + }, + { + "epoch": 0.4150900126132324, + "grad_norm": 0.21917149424552917, + "learning_rate": 0.0009695727940704484, + "loss": 1.2262, + "step": 905 + }, + { + "epoch": 0.4155486756105951, + "grad_norm": 0.21042834222316742, + "learning_rate": 0.0009694876664838725, + "loss": 1.0464, + "step": 906 + }, + { + "epoch": 0.4160073386079578, + "grad_norm": 0.2943893074989319, + "learning_rate": 0.0009694024237285868, + "loss": 1.5404, + "step": 907 + }, + { + "epoch": 0.4164660016053205, + "grad_norm": 0.24305228888988495, + "learning_rate": 0.000969317065825502, + "loss": 1.4753, + "step": 908 + }, + { + "epoch": 0.4169246646026832, + "grad_norm": 0.29043078422546387, + "learning_rate": 0.000969231592795557, + "loss": 1.8857, + "step": 909 + }, + { + "epoch": 0.4173833276000459, + "grad_norm": 0.2063802182674408, + "learning_rate": 0.000969146004659719, + "loss": 1.133, + "step": 910 + }, + { + "epoch": 0.41784199059740856, + "grad_norm": 0.38645628094673157, + "learning_rate": 0.0009690603014389831, + "loss": 2.2798, + "step": 911 + }, + { + "epoch": 0.41830065359477125, + "grad_norm": 0.2220807522535324, + "learning_rate": 0.0009689744831543734, + "loss": 0.8575, + "step": 912 + }, + { + "epoch": 0.41875931659213395, + "grad_norm": 0.4347355365753174, + "learning_rate": 0.0009688885498269416, + "loss": 2.3926, + "step": 913 + }, + { + "epoch": 0.41921797958949664, + "grad_norm": 0.45031097531318665, + "learning_rate": 0.0009688025014777672, + "loss": 2.4253, + "step": 914 + }, + { + "epoch": 0.4196766425868593, + "grad_norm": 0.19847296178340912, + "learning_rate": 0.0009687163381279593, + "loss": 0.8247, + "step": 915 + }, + { + "epoch": 0.420135305584222, + "grad_norm": 0.3677137792110443, + "learning_rate": 0.0009686300597986541, + "loss": 1.9451, + "step": 916 + }, + { + "epoch": 0.42059396858158465, + "grad_norm": 0.2694656550884247, + "learning_rate": 0.0009685436665110161, + "loss": 1.5455, + "step": 917 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.21316292881965637, + "learning_rate": 0.0009684571582862382, + "loss": 1.0842, + "step": 918 + }, + { + "epoch": 0.42151129457631004, + "grad_norm": 0.32941651344299316, + "learning_rate": 0.0009683705351455419, + "loss": 1.709, + "step": 919 + }, + { + "epoch": 0.4219699575736727, + "grad_norm": 0.3653143644332886, + "learning_rate": 0.0009682837971101762, + "loss": 1.8724, + "step": 920 + }, + { + "epoch": 0.4224286205710354, + "grad_norm": 0.37952494621276855, + "learning_rate": 0.0009681969442014183, + "loss": 1.7211, + "step": 921 + }, + { + "epoch": 0.4228872835683981, + "grad_norm": 0.3981945216655731, + "learning_rate": 0.0009681099764405743, + "loss": 2.2534, + "step": 922 + }, + { + "epoch": 0.4233459465657608, + "grad_norm": 0.0702991932630539, + "learning_rate": 0.0009680228938489777, + "loss": 0.4506, + "step": 923 + }, + { + "epoch": 0.4238046095631235, + "grad_norm": 0.2196217030286789, + "learning_rate": 0.0009679356964479908, + "loss": 1.088, + "step": 924 + }, + { + "epoch": 0.4242632725604862, + "grad_norm": 0.2972463071346283, + "learning_rate": 0.0009678483842590034, + "loss": 1.4828, + "step": 925 + }, + { + "epoch": 0.4247219355578489, + "grad_norm": 0.10213679820299149, + "learning_rate": 0.000967760957303434, + "loss": 0.628, + "step": 926 + }, + { + "epoch": 0.42518059855521156, + "grad_norm": 0.294698029756546, + "learning_rate": 0.0009676734156027292, + "loss": 1.504, + "step": 927 + }, + { + "epoch": 0.42563926155257426, + "grad_norm": 0.4071488380432129, + "learning_rate": 0.0009675857591783634, + "loss": 2.2822, + "step": 928 + }, + { + "epoch": 0.42609792454993695, + "grad_norm": 0.16480673849582672, + "learning_rate": 0.0009674979880518393, + "loss": 0.8937, + "step": 929 + }, + { + "epoch": 0.42655658754729964, + "grad_norm": 0.2734246253967285, + "learning_rate": 0.0009674101022446879, + "loss": 1.451, + "step": 930 + }, + { + "epoch": 0.42701525054466233, + "grad_norm": 0.3893333971500397, + "learning_rate": 0.0009673221017784683, + "loss": 1.9644, + "step": 931 + }, + { + "epoch": 0.427473913542025, + "grad_norm": 0.2807949483394623, + "learning_rate": 0.0009672339866747675, + "loss": 1.5402, + "step": 932 + }, + { + "epoch": 0.4279325765393877, + "grad_norm": 0.37752094864845276, + "learning_rate": 0.0009671457569552009, + "loss": 2.1098, + "step": 933 + }, + { + "epoch": 0.42839123953675035, + "grad_norm": 0.29235443472862244, + "learning_rate": 0.0009670574126414118, + "loss": 1.3341, + "step": 934 + }, + { + "epoch": 0.42884990253411304, + "grad_norm": 0.3632213771343231, + "learning_rate": 0.0009669689537550717, + "loss": 1.5638, + "step": 935 + }, + { + "epoch": 0.4293085655314757, + "grad_norm": 0.30011284351348877, + "learning_rate": 0.0009668803803178803, + "loss": 1.6048, + "step": 936 + }, + { + "epoch": 0.4297672285288384, + "grad_norm": 0.21687191724777222, + "learning_rate": 0.0009667916923515651, + "loss": 0.9312, + "step": 937 + }, + { + "epoch": 0.4302258915262011, + "grad_norm": 0.2912392020225525, + "learning_rate": 0.0009667028898778822, + "loss": 1.5125, + "step": 938 + }, + { + "epoch": 0.4306845545235638, + "grad_norm": 0.3935389816761017, + "learning_rate": 0.0009666139729186152, + "loss": 2.1857, + "step": 939 + }, + { + "epoch": 0.4311432175209265, + "grad_norm": 0.4185419976711273, + "learning_rate": 0.0009665249414955761, + "loss": 2.4136, + "step": 940 + }, + { + "epoch": 0.4316018805182892, + "grad_norm": 0.10364656150341034, + "learning_rate": 0.0009664357956306051, + "loss": 0.6298, + "step": 941 + }, + { + "epoch": 0.4320605435156519, + "grad_norm": 0.2796267867088318, + "learning_rate": 0.0009663465353455703, + "loss": 1.5578, + "step": 942 + }, + { + "epoch": 0.43251920651301456, + "grad_norm": 0.2670913338661194, + "learning_rate": 0.0009662571606623678, + "loss": 1.1976, + "step": 943 + }, + { + "epoch": 0.43297786951037726, + "grad_norm": 0.41438791155815125, + "learning_rate": 0.0009661676716029219, + "loss": 2.4133, + "step": 944 + }, + { + "epoch": 0.43343653250773995, + "grad_norm": 0.31299319863319397, + "learning_rate": 0.000966078068189185, + "loss": 1.5249, + "step": 945 + }, + { + "epoch": 0.43389519550510264, + "grad_norm": 0.3784157931804657, + "learning_rate": 0.0009659883504431373, + "loss": 1.8156, + "step": 946 + }, + { + "epoch": 0.43435385850246533, + "grad_norm": 0.3633187413215637, + "learning_rate": 0.0009658985183867872, + "loss": 2.0233, + "step": 947 + }, + { + "epoch": 0.434812521499828, + "grad_norm": 0.29936981201171875, + "learning_rate": 0.0009658085720421714, + "loss": 1.4108, + "step": 948 + }, + { + "epoch": 0.4352711844971907, + "grad_norm": 0.24031949043273926, + "learning_rate": 0.0009657185114313541, + "loss": 0.9012, + "step": 949 + }, + { + "epoch": 0.4357298474945534, + "grad_norm": 0.30812880396842957, + "learning_rate": 0.000965628336576428, + "loss": 1.739, + "step": 950 + }, + { + "epoch": 0.43618851049191604, + "grad_norm": 0.27589166164398193, + "learning_rate": 0.0009655380474995137, + "loss": 1.387, + "step": 951 + }, + { + "epoch": 0.43664717348927873, + "grad_norm": 0.3418666422367096, + "learning_rate": 0.0009654476442227595, + "loss": 1.8832, + "step": 952 + }, + { + "epoch": 0.4371058364866414, + "grad_norm": 0.18190939724445343, + "learning_rate": 0.000965357126768342, + "loss": 0.8432, + "step": 953 + }, + { + "epoch": 0.4375644994840041, + "grad_norm": 0.336883544921875, + "learning_rate": 0.0009652664951584662, + "loss": 1.9048, + "step": 954 + }, + { + "epoch": 0.4380231624813668, + "grad_norm": 0.2961789667606354, + "learning_rate": 0.0009651757494153642, + "loss": 1.473, + "step": 955 + }, + { + "epoch": 0.4384818254787295, + "grad_norm": 0.2900097370147705, + "learning_rate": 0.0009650848895612969, + "loss": 1.2979, + "step": 956 + }, + { + "epoch": 0.4389404884760922, + "grad_norm": 0.13419854640960693, + "learning_rate": 0.0009649939156185526, + "loss": 0.8149, + "step": 957 + }, + { + "epoch": 0.4393991514734549, + "grad_norm": 0.2546023428440094, + "learning_rate": 0.000964902827609448, + "loss": 1.2925, + "step": 958 + }, + { + "epoch": 0.43985781447081757, + "grad_norm": 0.28633105754852295, + "learning_rate": 0.0009648116255563279, + "loss": 1.4249, + "step": 959 + }, + { + "epoch": 0.44031647746818026, + "grad_norm": 0.34882158041000366, + "learning_rate": 0.0009647203094815644, + "loss": 1.8254, + "step": 960 + }, + { + "epoch": 0.44077514046554295, + "grad_norm": 0.2623085677623749, + "learning_rate": 0.0009646288794075582, + "loss": 1.2006, + "step": 961 + }, + { + "epoch": 0.44123380346290564, + "grad_norm": 0.3005825877189636, + "learning_rate": 0.0009645373353567377, + "loss": 1.4745, + "step": 962 + }, + { + "epoch": 0.44169246646026833, + "grad_norm": 0.4481685757637024, + "learning_rate": 0.0009644456773515595, + "loss": 1.6587, + "step": 963 + }, + { + "epoch": 0.442151129457631, + "grad_norm": 0.34620991349220276, + "learning_rate": 0.0009643539054145077, + "loss": 2.0671, + "step": 964 + }, + { + "epoch": 0.4426097924549937, + "grad_norm": 0.10259232670068741, + "learning_rate": 0.0009642620195680948, + "loss": 0.6523, + "step": 965 + }, + { + "epoch": 0.4430684554523564, + "grad_norm": 0.2266424596309662, + "learning_rate": 0.0009641700198348608, + "loss": 1.033, + "step": 966 + }, + { + "epoch": 0.4435271184497191, + "grad_norm": 0.2258816808462143, + "learning_rate": 0.0009640779062373743, + "loss": 1.3325, + "step": 967 + }, + { + "epoch": 0.4439857814470818, + "grad_norm": 0.3065774142742157, + "learning_rate": 0.0009639856787982313, + "loss": 1.7267, + "step": 968 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.2785046696662903, + "learning_rate": 0.0009638933375400555, + "loss": 1.4358, + "step": 969 + }, + { + "epoch": 0.4449031074418071, + "grad_norm": 0.40370169281959534, + "learning_rate": 0.0009638008824854995, + "loss": 2.2593, + "step": 970 + }, + { + "epoch": 0.4453617704391698, + "grad_norm": 0.2614261507987976, + "learning_rate": 0.0009637083136572426, + "loss": 1.4521, + "step": 971 + }, + { + "epoch": 0.4458204334365325, + "grad_norm": 0.3734627068042755, + "learning_rate": 0.0009636156310779928, + "loss": 2.2771, + "step": 972 + }, + { + "epoch": 0.4462790964338952, + "grad_norm": 0.3273562490940094, + "learning_rate": 0.0009635228347704861, + "loss": 1.7556, + "step": 973 + }, + { + "epoch": 0.4467377594312579, + "grad_norm": 0.28261798620224, + "learning_rate": 0.0009634299247574858, + "loss": 1.409, + "step": 974 + }, + { + "epoch": 0.44719642242862057, + "grad_norm": 0.409447580575943, + "learning_rate": 0.0009633369010617834, + "loss": 2.3015, + "step": 975 + }, + { + "epoch": 0.44765508542598326, + "grad_norm": 0.44366922974586487, + "learning_rate": 0.0009632437637061982, + "loss": 2.4729, + "step": 976 + }, + { + "epoch": 0.44811374842334595, + "grad_norm": 0.35576725006103516, + "learning_rate": 0.0009631505127135778, + "loss": 2.0571, + "step": 977 + }, + { + "epoch": 0.44857241142070864, + "grad_norm": 0.2606424391269684, + "learning_rate": 0.000963057148106797, + "loss": 1.4125, + "step": 978 + }, + { + "epoch": 0.44903107441807133, + "grad_norm": 0.3204779028892517, + "learning_rate": 0.0009629636699087591, + "loss": 1.6771, + "step": 979 + }, + { + "epoch": 0.449489737415434, + "grad_norm": 0.3472746014595032, + "learning_rate": 0.0009628700781423947, + "loss": 1.7863, + "step": 980 + }, + { + "epoch": 0.4499484004127967, + "grad_norm": 0.2507071793079376, + "learning_rate": 0.0009627763728306626, + "loss": 1.2164, + "step": 981 + }, + { + "epoch": 0.4504070634101594, + "grad_norm": 0.284404993057251, + "learning_rate": 0.0009626825539965497, + "loss": 1.3602, + "step": 982 + }, + { + "epoch": 0.4508657264075221, + "grad_norm": 0.23801104724407196, + "learning_rate": 0.00096258862166307, + "loss": 1.1913, + "step": 983 + }, + { + "epoch": 0.4513243894048848, + "grad_norm": 0.08279567956924438, + "learning_rate": 0.0009624945758532662, + "loss": 0.5228, + "step": 984 + }, + { + "epoch": 0.4517830524022475, + "grad_norm": 0.31264549493789673, + "learning_rate": 0.0009624004165902078, + "loss": 1.3771, + "step": 985 + }, + { + "epoch": 0.4522417153996101, + "grad_norm": 0.290018767118454, + "learning_rate": 0.0009623061438969934, + "loss": 1.4374, + "step": 986 + }, + { + "epoch": 0.4527003783969728, + "grad_norm": 0.30359646677970886, + "learning_rate": 0.0009622117577967486, + "loss": 1.7618, + "step": 987 + }, + { + "epoch": 0.4531590413943355, + "grad_norm": 0.3439119756221771, + "learning_rate": 0.0009621172583126267, + "loss": 1.7233, + "step": 988 + }, + { + "epoch": 0.4536177043916982, + "grad_norm": 0.3586854636669159, + "learning_rate": 0.0009620226454678093, + "loss": 2.0002, + "step": 989 + }, + { + "epoch": 0.4540763673890609, + "grad_norm": 0.3362513780593872, + "learning_rate": 0.0009619279192855056, + "loss": 2.1243, + "step": 990 + }, + { + "epoch": 0.45453503038642357, + "grad_norm": 0.2629103362560272, + "learning_rate": 0.0009618330797889527, + "loss": 1.3026, + "step": 991 + }, + { + "epoch": 0.45499369338378626, + "grad_norm": 0.3186989426612854, + "learning_rate": 0.0009617381270014154, + "loss": 1.8407, + "step": 992 + }, + { + "epoch": 0.45545235638114895, + "grad_norm": 0.40748247504234314, + "learning_rate": 0.0009616430609461861, + "loss": 1.9366, + "step": 993 + }, + { + "epoch": 0.45591101937851164, + "grad_norm": 0.37845298647880554, + "learning_rate": 0.0009615478816465854, + "loss": 1.8237, + "step": 994 + }, + { + "epoch": 0.45636968237587433, + "grad_norm": 0.3755371570587158, + "learning_rate": 0.0009614525891259613, + "loss": 1.8331, + "step": 995 + }, + { + "epoch": 0.456828345373237, + "grad_norm": 0.28780463337898254, + "learning_rate": 0.00096135718340769, + "loss": 1.2989, + "step": 996 + }, + { + "epoch": 0.4572870083705997, + "grad_norm": 0.3778688609600067, + "learning_rate": 0.000961261664515175, + "loss": 1.7416, + "step": 997 + }, + { + "epoch": 0.4577456713679624, + "grad_norm": 0.2470480501651764, + "learning_rate": 0.0009611660324718479, + "loss": 0.9676, + "step": 998 + }, + { + "epoch": 0.4582043343653251, + "grad_norm": 0.5491411685943604, + "learning_rate": 0.0009610702873011676, + "loss": 1.8815, + "step": 999 + }, + { + "epoch": 0.4586629973626878, + "grad_norm": 0.38404297828674316, + "learning_rate": 0.0009609744290266216, + "loss": 1.8572, + "step": 1000 + }, + { + "epoch": 0.4591216603600505, + "grad_norm": 0.4080640971660614, + "learning_rate": 0.0009608784576717242, + "loss": 2.2996, + "step": 1001 + }, + { + "epoch": 0.45958032335741317, + "grad_norm": 0.39164450764656067, + "learning_rate": 0.000960782373260018, + "loss": 1.6304, + "step": 1002 + }, + { + "epoch": 0.4600389863547758, + "grad_norm": 0.2890069782733917, + "learning_rate": 0.0009606861758150733, + "loss": 1.2357, + "step": 1003 + }, + { + "epoch": 0.4604976493521385, + "grad_norm": 0.08920864015817642, + "learning_rate": 0.0009605898653604881, + "loss": 0.5615, + "step": 1004 + }, + { + "epoch": 0.4609563123495012, + "grad_norm": 0.13560301065444946, + "learning_rate": 0.0009604934419198877, + "loss": 0.7203, + "step": 1005 + }, + { + "epoch": 0.4614149753468639, + "grad_norm": 0.27159520983695984, + "learning_rate": 0.0009603969055169258, + "loss": 1.6746, + "step": 1006 + }, + { + "epoch": 0.46187363834422657, + "grad_norm": 0.19911514222621918, + "learning_rate": 0.0009603002561752832, + "loss": 1.0421, + "step": 1007 + }, + { + "epoch": 0.46233230134158926, + "grad_norm": 0.3992009162902832, + "learning_rate": 0.0009602034939186691, + "loss": 2.3098, + "step": 1008 + }, + { + "epoch": 0.46279096433895195, + "grad_norm": 0.28715813159942627, + "learning_rate": 0.0009601066187708194, + "loss": 1.3643, + "step": 1009 + }, + { + "epoch": 0.46324962733631464, + "grad_norm": 0.3220798671245575, + "learning_rate": 0.0009600096307554987, + "loss": 1.6344, + "step": 1010 + }, + { + "epoch": 0.46370829033367733, + "grad_norm": 0.3547840416431427, + "learning_rate": 0.0009599125298964987, + "loss": 1.7168, + "step": 1011 + }, + { + "epoch": 0.46416695333104, + "grad_norm": 0.29928240180015564, + "learning_rate": 0.000959815316217639, + "loss": 1.4865, + "step": 1012 + }, + { + "epoch": 0.4646256163284027, + "grad_norm": 0.32575827836990356, + "learning_rate": 0.0009597179897427668, + "loss": 1.4995, + "step": 1013 + }, + { + "epoch": 0.4650842793257654, + "grad_norm": 0.36625438928604126, + "learning_rate": 0.000959620550495757, + "loss": 1.8375, + "step": 1014 + }, + { + "epoch": 0.4655429423231281, + "grad_norm": 0.3549942970275879, + "learning_rate": 0.0009595229985005121, + "loss": 1.7886, + "step": 1015 + }, + { + "epoch": 0.4660016053204908, + "grad_norm": 0.33019477128982544, + "learning_rate": 0.0009594253337809621, + "loss": 1.618, + "step": 1016 + }, + { + "epoch": 0.4664602683178535, + "grad_norm": 0.31643784046173096, + "learning_rate": 0.0009593275563610655, + "loss": 1.5107, + "step": 1017 + }, + { + "epoch": 0.46691893131521617, + "grad_norm": 0.33537599444389343, + "learning_rate": 0.0009592296662648072, + "loss": 1.6339, + "step": 1018 + }, + { + "epoch": 0.46737759431257886, + "grad_norm": 0.3272285759449005, + "learning_rate": 0.0009591316635162006, + "loss": 1.4943, + "step": 1019 + }, + { + "epoch": 0.4678362573099415, + "grad_norm": 0.2963853180408478, + "learning_rate": 0.0009590335481392863, + "loss": 1.5295, + "step": 1020 + }, + { + "epoch": 0.4682949203073042, + "grad_norm": 0.38083258271217346, + "learning_rate": 0.000958935320158133, + "loss": 2.0518, + "step": 1021 + }, + { + "epoch": 0.4687535833046669, + "grad_norm": 0.2800465524196625, + "learning_rate": 0.0009588369795968366, + "loss": 1.386, + "step": 1022 + }, + { + "epoch": 0.46921224630202957, + "grad_norm": 0.31138157844543457, + "learning_rate": 0.0009587385264795206, + "loss": 1.2996, + "step": 1023 + }, + { + "epoch": 0.46967090929939226, + "grad_norm": 0.27252325415611267, + "learning_rate": 0.0009586399608303364, + "loss": 1.4318, + "step": 1024 + }, + { + "epoch": 0.47012957229675495, + "grad_norm": 0.2721007764339447, + "learning_rate": 0.0009585412826734627, + "loss": 1.3777, + "step": 1025 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.28045183420181274, + "learning_rate": 0.0009584424920331063, + "loss": 1.5048, + "step": 1026 + }, + { + "epoch": 0.47104689829148033, + "grad_norm": 0.3029325604438782, + "learning_rate": 0.000958343588933501, + "loss": 1.6938, + "step": 1027 + }, + { + "epoch": 0.471505561288843, + "grad_norm": 0.2408834993839264, + "learning_rate": 0.0009582445733989086, + "loss": 1.4271, + "step": 1028 + }, + { + "epoch": 0.4719642242862057, + "grad_norm": 0.10467322915792465, + "learning_rate": 0.0009581454454536182, + "loss": 0.6104, + "step": 1029 + }, + { + "epoch": 0.4724228872835684, + "grad_norm": 0.3287009000778198, + "learning_rate": 0.0009580462051219465, + "loss": 1.9726, + "step": 1030 + }, + { + "epoch": 0.4728815502809311, + "grad_norm": 0.37531885504722595, + "learning_rate": 0.0009579468524282381, + "loss": 2.3677, + "step": 1031 + }, + { + "epoch": 0.4733402132782938, + "grad_norm": 0.4152393937110901, + "learning_rate": 0.0009578473873968649, + "loss": 2.207, + "step": 1032 + }, + { + "epoch": 0.4737988762756565, + "grad_norm": 0.26491448283195496, + "learning_rate": 0.0009577478100522261, + "loss": 1.406, + "step": 1033 + }, + { + "epoch": 0.47425753927301917, + "grad_norm": 0.3497820198535919, + "learning_rate": 0.0009576481204187492, + "loss": 1.9156, + "step": 1034 + }, + { + "epoch": 0.47471620227038186, + "grad_norm": 0.3127298951148987, + "learning_rate": 0.0009575483185208884, + "loss": 1.3849, + "step": 1035 + }, + { + "epoch": 0.47517486526774455, + "grad_norm": 0.386515736579895, + "learning_rate": 0.000957448404383126, + "loss": 1.9648, + "step": 1036 + }, + { + "epoch": 0.4756335282651072, + "grad_norm": 0.30607542395591736, + "learning_rate": 0.0009573483780299717, + "loss": 1.5166, + "step": 1037 + }, + { + "epoch": 0.4760921912624699, + "grad_norm": 0.3027437627315521, + "learning_rate": 0.0009572482394859625, + "loss": 1.6056, + "step": 1038 + }, + { + "epoch": 0.47655085425983257, + "grad_norm": 0.3490167558193207, + "learning_rate": 0.0009571479887756633, + "loss": 1.7711, + "step": 1039 + }, + { + "epoch": 0.47700951725719526, + "grad_norm": 0.3557809293270111, + "learning_rate": 0.0009570476259236662, + "loss": 1.7281, + "step": 1040 + }, + { + "epoch": 0.47746818025455795, + "grad_norm": 0.3299933075904846, + "learning_rate": 0.000956947150954591, + "loss": 1.3677, + "step": 1041 + }, + { + "epoch": 0.47792684325192064, + "grad_norm": 0.4228198528289795, + "learning_rate": 0.000956846563893085, + "loss": 1.8817, + "step": 1042 + }, + { + "epoch": 0.47838550624928333, + "grad_norm": 0.40233686566352844, + "learning_rate": 0.0009567458647638228, + "loss": 2.2048, + "step": 1043 + }, + { + "epoch": 0.478844169246646, + "grad_norm": 0.3554650843143463, + "learning_rate": 0.0009566450535915066, + "loss": 1.8723, + "step": 1044 + }, + { + "epoch": 0.4793028322440087, + "grad_norm": 0.33305272459983826, + "learning_rate": 0.0009565441304008661, + "loss": 1.67, + "step": 1045 + }, + { + "epoch": 0.4797614952413714, + "grad_norm": 0.35247936844825745, + "learning_rate": 0.0009564430952166587, + "loss": 1.9382, + "step": 1046 + }, + { + "epoch": 0.4802201582387341, + "grad_norm": 0.23185671865940094, + "learning_rate": 0.0009563419480636689, + "loss": 0.9469, + "step": 1047 + }, + { + "epoch": 0.4806788212360968, + "grad_norm": 0.29733264446258545, + "learning_rate": 0.0009562406889667088, + "loss": 1.4415, + "step": 1048 + }, + { + "epoch": 0.4811374842334595, + "grad_norm": 0.2600105106830597, + "learning_rate": 0.0009561393179506181, + "loss": 1.2259, + "step": 1049 + }, + { + "epoch": 0.48159614723082217, + "grad_norm": 0.36176133155822754, + "learning_rate": 0.0009560378350402637, + "loss": 1.6957, + "step": 1050 + }, + { + "epoch": 0.48205481022818486, + "grad_norm": 0.29688432812690735, + "learning_rate": 0.0009559362402605403, + "loss": 1.3275, + "step": 1051 + }, + { + "epoch": 0.48251347322554755, + "grad_norm": 0.2176387459039688, + "learning_rate": 0.0009558345336363695, + "loss": 0.9416, + "step": 1052 + }, + { + "epoch": 0.48297213622291024, + "grad_norm": 0.35077396035194397, + "learning_rate": 0.0009557327151927009, + "loss": 1.6619, + "step": 1053 + }, + { + "epoch": 0.4834307992202729, + "grad_norm": 0.35128337144851685, + "learning_rate": 0.0009556307849545114, + "loss": 1.5695, + "step": 1054 + }, + { + "epoch": 0.48388946221763557, + "grad_norm": 0.37428727746009827, + "learning_rate": 0.000955528742946805, + "loss": 1.9629, + "step": 1055 + }, + { + "epoch": 0.48434812521499826, + "grad_norm": 0.28633585572242737, + "learning_rate": 0.0009554265891946133, + "loss": 1.4113, + "step": 1056 + }, + { + "epoch": 0.48480678821236095, + "grad_norm": 0.25998106598854065, + "learning_rate": 0.0009553243237229956, + "loss": 1.1547, + "step": 1057 + }, + { + "epoch": 0.48526545120972364, + "grad_norm": 0.4127240478992462, + "learning_rate": 0.0009552219465570382, + "loss": 2.1592, + "step": 1058 + }, + { + "epoch": 0.48572411420708633, + "grad_norm": 0.23034657537937164, + "learning_rate": 0.000955119457721855, + "loss": 1.0124, + "step": 1059 + }, + { + "epoch": 0.486182777204449, + "grad_norm": 0.4003170430660248, + "learning_rate": 0.0009550168572425873, + "loss": 1.8563, + "step": 1060 + }, + { + "epoch": 0.4866414402018117, + "grad_norm": 0.34863001108169556, + "learning_rate": 0.0009549141451444036, + "loss": 1.8716, + "step": 1061 + }, + { + "epoch": 0.4871001031991744, + "grad_norm": 0.34055593609809875, + "learning_rate": 0.0009548113214525, + "loss": 1.8115, + "step": 1062 + }, + { + "epoch": 0.4875587661965371, + "grad_norm": 0.2745518684387207, + "learning_rate": 0.0009547083861921, + "loss": 1.3137, + "step": 1063 + }, + { + "epoch": 0.4880174291938998, + "grad_norm": 0.19711460173130035, + "learning_rate": 0.0009546053393884542, + "loss": 0.9982, + "step": 1064 + }, + { + "epoch": 0.4884760921912625, + "grad_norm": 0.2679455280303955, + "learning_rate": 0.0009545021810668406, + "loss": 1.0912, + "step": 1065 + }, + { + "epoch": 0.48893475518862517, + "grad_norm": 0.2457709014415741, + "learning_rate": 0.000954398911252565, + "loss": 0.9668, + "step": 1066 + }, + { + "epoch": 0.48939341818598786, + "grad_norm": 0.22733516991138458, + "learning_rate": 0.0009542955299709601, + "loss": 0.9916, + "step": 1067 + }, + { + "epoch": 0.48985208118335055, + "grad_norm": 0.3566371500492096, + "learning_rate": 0.000954192037247386, + "loss": 1.8339, + "step": 1068 + }, + { + "epoch": 0.49031074418071324, + "grad_norm": 0.40455371141433716, + "learning_rate": 0.0009540884331072304, + "loss": 2.1667, + "step": 1069 + }, + { + "epoch": 0.49076940717807593, + "grad_norm": 0.43050408363342285, + "learning_rate": 0.0009539847175759077, + "loss": 2.3535, + "step": 1070 + }, + { + "epoch": 0.49122807017543857, + "grad_norm": 0.3837912380695343, + "learning_rate": 0.0009538808906788608, + "loss": 1.995, + "step": 1071 + }, + { + "epoch": 0.49168673317280126, + "grad_norm": 0.28267940878868103, + "learning_rate": 0.0009537769524415585, + "loss": 1.4414, + "step": 1072 + }, + { + "epoch": 0.49214539617016395, + "grad_norm": 0.43233123421669006, + "learning_rate": 0.0009536729028894979, + "loss": 2.0576, + "step": 1073 + }, + { + "epoch": 0.49260405916752664, + "grad_norm": 0.27310910820961, + "learning_rate": 0.0009535687420482031, + "loss": 1.0809, + "step": 1074 + }, + { + "epoch": 0.49306272216488933, + "grad_norm": 0.2695041596889496, + "learning_rate": 0.0009534644699432254, + "loss": 1.2148, + "step": 1075 + }, + { + "epoch": 0.493521385162252, + "grad_norm": 0.4794595241546631, + "learning_rate": 0.0009533600866001437, + "loss": 2.3083, + "step": 1076 + }, + { + "epoch": 0.4939800481596147, + "grad_norm": 0.2349144071340561, + "learning_rate": 0.0009532555920445638, + "loss": 0.9781, + "step": 1077 + }, + { + "epoch": 0.4944387111569774, + "grad_norm": 0.4289840757846832, + "learning_rate": 0.000953150986302119, + "loss": 1.9989, + "step": 1078 + }, + { + "epoch": 0.4948973741543401, + "grad_norm": 0.4208228588104248, + "learning_rate": 0.0009530462693984698, + "loss": 2.0288, + "step": 1079 + }, + { + "epoch": 0.4953560371517028, + "grad_norm": 0.34878844022750854, + "learning_rate": 0.0009529414413593043, + "loss": 1.4258, + "step": 1080 + }, + { + "epoch": 0.4958147001490655, + "grad_norm": 0.23603618144989014, + "learning_rate": 0.0009528365022103371, + "loss": 1.1298, + "step": 1081 + }, + { + "epoch": 0.49627336314642817, + "grad_norm": 0.27409201860427856, + "learning_rate": 0.000952731451977311, + "loss": 1.2806, + "step": 1082 + }, + { + "epoch": 0.49673202614379086, + "grad_norm": 0.37235966324806213, + "learning_rate": 0.0009526262906859953, + "loss": 1.9988, + "step": 1083 + }, + { + "epoch": 0.49719068914115355, + "grad_norm": 0.3669985830783844, + "learning_rate": 0.0009525210183621869, + "loss": 1.7487, + "step": 1084 + }, + { + "epoch": 0.49764935213851624, + "grad_norm": 0.3774386942386627, + "learning_rate": 0.0009524156350317099, + "loss": 1.0475, + "step": 1085 + }, + { + "epoch": 0.49810801513587893, + "grad_norm": 0.30981799960136414, + "learning_rate": 0.0009523101407204154, + "loss": 1.3907, + "step": 1086 + }, + { + "epoch": 0.4985666781332416, + "grad_norm": 0.3435545861721039, + "learning_rate": 0.0009522045354541822, + "loss": 1.6863, + "step": 1087 + }, + { + "epoch": 0.49902534113060426, + "grad_norm": 0.23086117208003998, + "learning_rate": 0.0009520988192589158, + "loss": 1.0635, + "step": 1088 + }, + { + "epoch": 0.49948400412796695, + "grad_norm": 0.21928969025611877, + "learning_rate": 0.0009519929921605493, + "loss": 0.9675, + "step": 1089 + }, + { + "epoch": 0.49994266712532964, + "grad_norm": 0.2767171263694763, + "learning_rate": 0.0009518870541850426, + "loss": 1.4333, + "step": 1090 + }, + { + "epoch": 0.5004013301226924, + "grad_norm": 0.288322776556015, + "learning_rate": 0.0009517810053583835, + "loss": 1.6043, + "step": 1091 + }, + { + "epoch": 0.500859993120055, + "grad_norm": 0.32782241702079773, + "learning_rate": 0.0009516748457065862, + "loss": 1.8718, + "step": 1092 + }, + { + "epoch": 0.5013186561174178, + "grad_norm": 0.2817911207675934, + "learning_rate": 0.0009515685752556924, + "loss": 1.5955, + "step": 1093 + }, + { + "epoch": 0.5017773191147804, + "grad_norm": 0.3410284221172333, + "learning_rate": 0.0009514621940317712, + "loss": 1.8585, + "step": 1094 + }, + { + "epoch": 0.5022359821121432, + "grad_norm": 0.24062463641166687, + "learning_rate": 0.0009513557020609185, + "loss": 1.2591, + "step": 1095 + }, + { + "epoch": 0.5026946451095058, + "grad_norm": 0.26802435517311096, + "learning_rate": 0.0009512490993692578, + "loss": 1.3988, + "step": 1096 + }, + { + "epoch": 0.5031533081068684, + "grad_norm": 0.3186005651950836, + "learning_rate": 0.0009511423859829392, + "loss": 1.3655, + "step": 1097 + }, + { + "epoch": 0.5036119711042312, + "grad_norm": 0.378814160823822, + "learning_rate": 0.0009510355619281403, + "loss": 1.9003, + "step": 1098 + }, + { + "epoch": 0.5040706341015938, + "grad_norm": 0.12915131449699402, + "learning_rate": 0.0009509286272310662, + "loss": 0.7532, + "step": 1099 + }, + { + "epoch": 0.5045292970989566, + "grad_norm": 0.3437194228172302, + "learning_rate": 0.0009508215819179484, + "loss": 1.8945, + "step": 1100 + }, + { + "epoch": 0.5049879600963192, + "grad_norm": 0.3381500542163849, + "learning_rate": 0.0009507144260150459, + "loss": 1.8967, + "step": 1101 + }, + { + "epoch": 0.5054466230936819, + "grad_norm": 0.31556692719459534, + "learning_rate": 0.0009506071595486449, + "loss": 1.6499, + "step": 1102 + }, + { + "epoch": 0.5059052860910446, + "grad_norm": 0.19522079825401306, + "learning_rate": 0.0009504997825450586, + "loss": 1.1095, + "step": 1103 + }, + { + "epoch": 0.5063639490884073, + "grad_norm": 0.2746993601322174, + "learning_rate": 0.0009503922950306274, + "loss": 1.3311, + "step": 1104 + }, + { + "epoch": 0.50682261208577, + "grad_norm": 0.191048264503479, + "learning_rate": 0.0009502846970317187, + "loss": 0.9343, + "step": 1105 + }, + { + "epoch": 0.5072812750831327, + "grad_norm": 0.24246980249881744, + "learning_rate": 0.0009501769885747271, + "loss": 0.9377, + "step": 1106 + }, + { + "epoch": 0.5077399380804953, + "grad_norm": 0.2772439122200012, + "learning_rate": 0.0009500691696860743, + "loss": 1.4328, + "step": 1107 + }, + { + "epoch": 0.5081986010778581, + "grad_norm": 0.11574332416057587, + "learning_rate": 0.0009499612403922091, + "loss": 0.7223, + "step": 1108 + }, + { + "epoch": 0.5086572640752207, + "grad_norm": 0.2554071247577667, + "learning_rate": 0.0009498532007196071, + "loss": 1.3056, + "step": 1109 + }, + { + "epoch": 0.5091159270725835, + "grad_norm": 0.2135448306798935, + "learning_rate": 0.0009497450506947714, + "loss": 1.161, + "step": 1110 + }, + { + "epoch": 0.5095745900699461, + "grad_norm": 0.1852402538061142, + "learning_rate": 0.000949636790344232, + "loss": 1.002, + "step": 1111 + }, + { + "epoch": 0.5100332530673088, + "grad_norm": 0.3562169373035431, + "learning_rate": 0.0009495284196945458, + "loss": 1.8675, + "step": 1112 + }, + { + "epoch": 0.5104919160646715, + "grad_norm": 0.336135596036911, + "learning_rate": 0.0009494199387722969, + "loss": 1.9324, + "step": 1113 + }, + { + "epoch": 0.5109505790620341, + "grad_norm": 0.38499322533607483, + "learning_rate": 0.0009493113476040966, + "loss": 2.3125, + "step": 1114 + }, + { + "epoch": 0.5114092420593969, + "grad_norm": 0.20560574531555176, + "learning_rate": 0.0009492026462165831, + "loss": 0.9785, + "step": 1115 + }, + { + "epoch": 0.5118679050567595, + "grad_norm": 0.36569714546203613, + "learning_rate": 0.0009490938346364215, + "loss": 1.9463, + "step": 1116 + }, + { + "epoch": 0.5123265680541222, + "grad_norm": 0.2573243975639343, + "learning_rate": 0.0009489849128903041, + "loss": 0.9407, + "step": 1117 + }, + { + "epoch": 0.5127852310514849, + "grad_norm": 0.3620086908340454, + "learning_rate": 0.0009488758810049503, + "loss": 2.1328, + "step": 1118 + }, + { + "epoch": 0.5132438940488476, + "grad_norm": 0.14623965322971344, + "learning_rate": 0.0009487667390071064, + "loss": 0.777, + "step": 1119 + }, + { + "epoch": 0.5137025570462103, + "grad_norm": 0.2728992998600006, + "learning_rate": 0.0009486574869235453, + "loss": 1.5247, + "step": 1120 + }, + { + "epoch": 0.514161220043573, + "grad_norm": 0.379833459854126, + "learning_rate": 0.0009485481247810681, + "loss": 1.8743, + "step": 1121 + }, + { + "epoch": 0.5146198830409356, + "grad_norm": 0.1932978332042694, + "learning_rate": 0.0009484386526065014, + "loss": 0.795, + "step": 1122 + }, + { + "epoch": 0.5150785460382984, + "grad_norm": 0.3890305459499359, + "learning_rate": 0.0009483290704266999, + "loss": 2.0994, + "step": 1123 + }, + { + "epoch": 0.515537209035661, + "grad_norm": 0.2669411897659302, + "learning_rate": 0.0009482193782685449, + "loss": 1.5108, + "step": 1124 + }, + { + "epoch": 0.5159958720330238, + "grad_norm": 0.2894323766231537, + "learning_rate": 0.0009481095761589445, + "loss": 1.3339, + "step": 1125 + }, + { + "epoch": 0.5164545350303864, + "grad_norm": 0.32780617475509644, + "learning_rate": 0.0009479996641248339, + "loss": 1.2261, + "step": 1126 + }, + { + "epoch": 0.5169131980277492, + "grad_norm": 0.2773579955101013, + "learning_rate": 0.0009478896421931755, + "loss": 1.3108, + "step": 1127 + }, + { + "epoch": 0.5173718610251118, + "grad_norm": 0.25270065665245056, + "learning_rate": 0.0009477795103909586, + "loss": 1.2581, + "step": 1128 + }, + { + "epoch": 0.5178305240224745, + "grad_norm": 0.34188172221183777, + "learning_rate": 0.000947669268745199, + "loss": 1.7618, + "step": 1129 + }, + { + "epoch": 0.5182891870198372, + "grad_norm": 0.30379557609558105, + "learning_rate": 0.00094755891728294, + "loss": 1.5002, + "step": 1130 + }, + { + "epoch": 0.5187478500171998, + "grad_norm": 0.29648905992507935, + "learning_rate": 0.0009474484560312514, + "loss": 1.1797, + "step": 1131 + }, + { + "epoch": 0.5192065130145626, + "grad_norm": 0.3193150758743286, + "learning_rate": 0.0009473378850172303, + "loss": 1.899, + "step": 1132 + }, + { + "epoch": 0.5196651760119252, + "grad_norm": 0.3248406946659088, + "learning_rate": 0.0009472272042680005, + "loss": 1.7289, + "step": 1133 + }, + { + "epoch": 0.5201238390092879, + "grad_norm": 0.26000529527664185, + "learning_rate": 0.000947116413810713, + "loss": 1.4107, + "step": 1134 + }, + { + "epoch": 0.5205825020066506, + "grad_norm": 0.27710065245628357, + "learning_rate": 0.0009470055136725451, + "loss": 1.3737, + "step": 1135 + }, + { + "epoch": 0.5210411650040133, + "grad_norm": 0.3785195052623749, + "learning_rate": 0.0009468945038807018, + "loss": 1.7585, + "step": 1136 + }, + { + "epoch": 0.521499828001376, + "grad_norm": 0.3857591450214386, + "learning_rate": 0.0009467833844624142, + "loss": 2.188, + "step": 1137 + }, + { + "epoch": 0.5219584909987387, + "grad_norm": 0.30060064792633057, + "learning_rate": 0.0009466721554449412, + "loss": 1.424, + "step": 1138 + }, + { + "epoch": 0.5224171539961013, + "grad_norm": 0.29208019375801086, + "learning_rate": 0.0009465608168555677, + "loss": 1.2755, + "step": 1139 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 5.316793441772461, + "learning_rate": 0.0009464493687216058, + "loss": 2.1242, + "step": 1140 + }, + { + "epoch": 0.5233344799908267, + "grad_norm": 0.2195115089416504, + "learning_rate": 0.0009463378110703949, + "loss": 1.0337, + "step": 1141 + }, + { + "epoch": 0.5237931429881895, + "grad_norm": 0.24998384714126587, + "learning_rate": 0.0009462261439293005, + "loss": 1.4111, + "step": 1142 + }, + { + "epoch": 0.5242518059855521, + "grad_norm": 0.3403361737728119, + "learning_rate": 0.0009461143673257156, + "loss": 1.9103, + "step": 1143 + }, + { + "epoch": 0.5247104689829148, + "grad_norm": 0.17883948981761932, + "learning_rate": 0.0009460024812870598, + "loss": 0.9974, + "step": 1144 + }, + { + "epoch": 0.5251691319802775, + "grad_norm": 0.24725860357284546, + "learning_rate": 0.0009458904858407794, + "loss": 1.4016, + "step": 1145 + }, + { + "epoch": 0.5256277949776402, + "grad_norm": 0.29898151755332947, + "learning_rate": 0.0009457783810143479, + "loss": 1.701, + "step": 1146 + }, + { + "epoch": 0.5260864579750029, + "grad_norm": 0.27674156427383423, + "learning_rate": 0.0009456661668352652, + "loss": 1.3168, + "step": 1147 + }, + { + "epoch": 0.5265451209723655, + "grad_norm": 0.2375446856021881, + "learning_rate": 0.0009455538433310584, + "loss": 1.0746, + "step": 1148 + }, + { + "epoch": 0.5270037839697282, + "grad_norm": 0.3496798574924469, + "learning_rate": 0.0009454414105292812, + "loss": 1.881, + "step": 1149 + }, + { + "epoch": 0.5274624469670909, + "grad_norm": 0.3575010895729065, + "learning_rate": 0.0009453288684575143, + "loss": 1.9427, + "step": 1150 + }, + { + "epoch": 0.5279211099644536, + "grad_norm": 0.3948631286621094, + "learning_rate": 0.0009452162171433648, + "loss": 2.2327, + "step": 1151 + }, + { + "epoch": 0.5283797729618163, + "grad_norm": 0.1527937948703766, + "learning_rate": 0.0009451034566144671, + "loss": 0.8336, + "step": 1152 + }, + { + "epoch": 0.528838435959179, + "grad_norm": 0.3415633738040924, + "learning_rate": 0.0009449905868984822, + "loss": 1.9399, + "step": 1153 + }, + { + "epoch": 0.5292970989565416, + "grad_norm": 0.2828967571258545, + "learning_rate": 0.0009448776080230979, + "loss": 1.3918, + "step": 1154 + }, + { + "epoch": 0.5297557619539044, + "grad_norm": 0.20446327328681946, + "learning_rate": 0.0009447645200160285, + "loss": 1.025, + "step": 1155 + }, + { + "epoch": 0.530214424951267, + "grad_norm": 0.3204951286315918, + "learning_rate": 0.0009446513229050154, + "loss": 1.8108, + "step": 1156 + }, + { + "epoch": 0.5306730879486298, + "grad_norm": 0.38850653171539307, + "learning_rate": 0.0009445380167178266, + "loss": 2.2024, + "step": 1157 + }, + { + "epoch": 0.5311317509459924, + "grad_norm": 0.22903253138065338, + "learning_rate": 0.0009444246014822571, + "loss": 1.0826, + "step": 1158 + }, + { + "epoch": 0.5315904139433552, + "grad_norm": 0.2147892266511917, + "learning_rate": 0.0009443110772261286, + "loss": 1.0546, + "step": 1159 + }, + { + "epoch": 0.5320490769407178, + "grad_norm": 0.34580257534980774, + "learning_rate": 0.0009441974439772889, + "loss": 1.7944, + "step": 1160 + }, + { + "epoch": 0.5325077399380805, + "grad_norm": 0.2733863592147827, + "learning_rate": 0.0009440837017636134, + "loss": 1.3638, + "step": 1161 + }, + { + "epoch": 0.5329664029354432, + "grad_norm": 0.3368569016456604, + "learning_rate": 0.000943969850613004, + "loss": 1.8119, + "step": 1162 + }, + { + "epoch": 0.5334250659328059, + "grad_norm": 0.25083136558532715, + "learning_rate": 0.0009438558905533889, + "loss": 1.414, + "step": 1163 + }, + { + "epoch": 0.5338837289301686, + "grad_norm": 0.3574562966823578, + "learning_rate": 0.0009437418216127236, + "loss": 2.1129, + "step": 1164 + }, + { + "epoch": 0.5343423919275313, + "grad_norm": 0.2785975933074951, + "learning_rate": 0.0009436276438189899, + "loss": 1.5375, + "step": 1165 + }, + { + "epoch": 0.5348010549248939, + "grad_norm": 0.34642040729522705, + "learning_rate": 0.0009435133572001965, + "loss": 1.8992, + "step": 1166 + }, + { + "epoch": 0.5352597179222566, + "grad_norm": 0.4057437479496002, + "learning_rate": 0.0009433989617843786, + "loss": 2.4216, + "step": 1167 + }, + { + "epoch": 0.5357183809196193, + "grad_norm": 0.2808201313018799, + "learning_rate": 0.0009432844575995983, + "loss": 1.3363, + "step": 1168 + }, + { + "epoch": 0.536177043916982, + "grad_norm": 0.29180237650871277, + "learning_rate": 0.0009431698446739443, + "loss": 1.7092, + "step": 1169 + }, + { + "epoch": 0.5366357069143447, + "grad_norm": 0.28107771277427673, + "learning_rate": 0.000943055123035532, + "loss": 1.2791, + "step": 1170 + }, + { + "epoch": 0.5370943699117073, + "grad_norm": 0.17355158925056458, + "learning_rate": 0.0009429402927125035, + "loss": 0.8209, + "step": 1171 + }, + { + "epoch": 0.5375530329090701, + "grad_norm": 0.31418073177337646, + "learning_rate": 0.0009428253537330272, + "loss": 1.6716, + "step": 1172 + }, + { + "epoch": 0.5380116959064327, + "grad_norm": 0.3649182617664337, + "learning_rate": 0.0009427103061252989, + "loss": 1.8427, + "step": 1173 + }, + { + "epoch": 0.5384703589037955, + "grad_norm": 0.41031497716903687, + "learning_rate": 0.0009425951499175404, + "loss": 2.2205, + "step": 1174 + }, + { + "epoch": 0.5389290219011581, + "grad_norm": 0.2333560585975647, + "learning_rate": 0.0009424798851380003, + "loss": 0.9493, + "step": 1175 + }, + { + "epoch": 0.5393876848985208, + "grad_norm": 0.31085172295570374, + "learning_rate": 0.0009423645118149539, + "loss": 1.4224, + "step": 1176 + }, + { + "epoch": 0.5398463478958835, + "grad_norm": 0.41655832529067993, + "learning_rate": 0.0009422490299767032, + "loss": 2.1355, + "step": 1177 + }, + { + "epoch": 0.5403050108932462, + "grad_norm": 0.390559583902359, + "learning_rate": 0.0009421334396515766, + "loss": 1.7881, + "step": 1178 + }, + { + "epoch": 0.5407636738906089, + "grad_norm": 0.3651013970375061, + "learning_rate": 0.0009420177408679294, + "loss": 1.9978, + "step": 1179 + }, + { + "epoch": 0.5412223368879716, + "grad_norm": 0.2856077551841736, + "learning_rate": 0.0009419019336541431, + "loss": 1.3818, + "step": 1180 + }, + { + "epoch": 0.5416809998853342, + "grad_norm": 0.2839828431606293, + "learning_rate": 0.0009417860180386264, + "loss": 1.3959, + "step": 1181 + }, + { + "epoch": 0.542139662882697, + "grad_norm": 0.338052898645401, + "learning_rate": 0.0009416699940498139, + "loss": 1.8724, + "step": 1182 + }, + { + "epoch": 0.5425983258800596, + "grad_norm": 0.26623108983039856, + "learning_rate": 0.0009415538617161672, + "loss": 1.5506, + "step": 1183 + }, + { + "epoch": 0.5430569888774223, + "grad_norm": 0.3088790774345398, + "learning_rate": 0.0009414376210661746, + "loss": 1.7001, + "step": 1184 + }, + { + "epoch": 0.543515651874785, + "grad_norm": 0.2527974545955658, + "learning_rate": 0.0009413212721283505, + "loss": 1.5046, + "step": 1185 + }, + { + "epoch": 0.5439743148721476, + "grad_norm": 0.29992103576660156, + "learning_rate": 0.0009412048149312364, + "loss": 1.405, + "step": 1186 + }, + { + "epoch": 0.5444329778695104, + "grad_norm": 0.26170700788497925, + "learning_rate": 0.0009410882495033998, + "loss": 1.1761, + "step": 1187 + }, + { + "epoch": 0.544891640866873, + "grad_norm": 0.1813340187072754, + "learning_rate": 0.0009409715758734352, + "loss": 0.9035, + "step": 1188 + }, + { + "epoch": 0.5453503038642358, + "grad_norm": 0.3251456916332245, + "learning_rate": 0.0009408547940699634, + "loss": 1.5636, + "step": 1189 + }, + { + "epoch": 0.5458089668615984, + "grad_norm": 0.3176114857196808, + "learning_rate": 0.0009407379041216321, + "loss": 1.402, + "step": 1190 + }, + { + "epoch": 0.5462676298589612, + "grad_norm": 0.21962450444698334, + "learning_rate": 0.0009406209060571149, + "loss": 1.009, + "step": 1191 + }, + { + "epoch": 0.5467262928563238, + "grad_norm": 0.25622203946113586, + "learning_rate": 0.0009405037999051125, + "loss": 0.9907, + "step": 1192 + }, + { + "epoch": 0.5471849558536865, + "grad_norm": 0.4034391939640045, + "learning_rate": 0.0009403865856943516, + "loss": 2.1548, + "step": 1193 + }, + { + "epoch": 0.5476436188510492, + "grad_norm": 0.10982491821050644, + "learning_rate": 0.0009402692634535861, + "loss": 0.5598, + "step": 1194 + }, + { + "epoch": 0.5481022818484119, + "grad_norm": 0.21411536633968353, + "learning_rate": 0.0009401518332115957, + "loss": 0.9482, + "step": 1195 + }, + { + "epoch": 0.5485609448457746, + "grad_norm": 0.35385677218437195, + "learning_rate": 0.0009400342949971868, + "loss": 1.8314, + "step": 1196 + }, + { + "epoch": 0.5490196078431373, + "grad_norm": 0.3468371033668518, + "learning_rate": 0.0009399166488391927, + "loss": 2.0469, + "step": 1197 + }, + { + "epoch": 0.5494782708404999, + "grad_norm": 0.315140038728714, + "learning_rate": 0.0009397988947664727, + "loss": 1.8289, + "step": 1198 + }, + { + "epoch": 0.5499369338378627, + "grad_norm": 0.2434529811143875, + "learning_rate": 0.0009396810328079126, + "loss": 1.2443, + "step": 1199 + }, + { + "epoch": 0.5503955968352253, + "grad_norm": 0.24115651845932007, + "learning_rate": 0.0009395630629924248, + "loss": 1.2274, + "step": 1200 + }, + { + "epoch": 0.550854259832588, + "grad_norm": 0.12398785352706909, + "learning_rate": 0.0009394449853489484, + "loss": 0.6473, + "step": 1201 + }, + { + "epoch": 0.5513129228299507, + "grad_norm": 0.3112662732601166, + "learning_rate": 0.0009393267999064486, + "loss": 1.5162, + "step": 1202 + }, + { + "epoch": 0.5517715858273133, + "grad_norm": 0.36855632066726685, + "learning_rate": 0.0009392085066939169, + "loss": 1.9268, + "step": 1203 + }, + { + "epoch": 0.5522302488246761, + "grad_norm": 0.24338075518608093, + "learning_rate": 0.0009390901057403716, + "loss": 1.1956, + "step": 1204 + }, + { + "epoch": 0.5526889118220387, + "grad_norm": 0.3083001375198364, + "learning_rate": 0.0009389715970748575, + "loss": 1.5538, + "step": 1205 + }, + { + "epoch": 0.5531475748194015, + "grad_norm": 0.2836032807826996, + "learning_rate": 0.0009388529807264455, + "loss": 1.4889, + "step": 1206 + }, + { + "epoch": 0.5536062378167641, + "grad_norm": 0.2570768892765045, + "learning_rate": 0.000938734256724233, + "loss": 1.3693, + "step": 1207 + }, + { + "epoch": 0.5540649008141268, + "grad_norm": 0.3241545855998993, + "learning_rate": 0.0009386154250973438, + "loss": 1.6445, + "step": 1208 + }, + { + "epoch": 0.5545235638114895, + "grad_norm": 0.29821258783340454, + "learning_rate": 0.0009384964858749283, + "loss": 1.4775, + "step": 1209 + }, + { + "epoch": 0.5549822268088522, + "grad_norm": 0.22773027420043945, + "learning_rate": 0.000938377439086163, + "loss": 1.2509, + "step": 1210 + }, + { + "epoch": 0.5554408898062149, + "grad_norm": 0.31152674555778503, + "learning_rate": 0.0009382582847602512, + "loss": 1.6638, + "step": 1211 + }, + { + "epoch": 0.5558995528035776, + "grad_norm": 0.11729778349399567, + "learning_rate": 0.0009381390229264221, + "loss": 0.5536, + "step": 1212 + }, + { + "epoch": 0.5563582158009402, + "grad_norm": 0.1624806672334671, + "learning_rate": 0.0009380196536139315, + "loss": 0.7977, + "step": 1213 + }, + { + "epoch": 0.556816878798303, + "grad_norm": 0.32244980335235596, + "learning_rate": 0.0009379001768520615, + "loss": 1.5859, + "step": 1214 + }, + { + "epoch": 0.5572755417956656, + "grad_norm": 0.2572208344936371, + "learning_rate": 0.0009377805926701208, + "loss": 1.2651, + "step": 1215 + }, + { + "epoch": 0.5577342047930284, + "grad_norm": 0.34207504987716675, + "learning_rate": 0.0009376609010974442, + "loss": 1.817, + "step": 1216 + }, + { + "epoch": 0.558192867790391, + "grad_norm": 0.33612552285194397, + "learning_rate": 0.0009375411021633927, + "loss": 1.7496, + "step": 1217 + }, + { + "epoch": 0.5586515307877536, + "grad_norm": 0.291958212852478, + "learning_rate": 0.0009374211958973542, + "loss": 1.6475, + "step": 1218 + }, + { + "epoch": 0.5591101937851164, + "grad_norm": 0.28733986616134644, + "learning_rate": 0.0009373011823287422, + "loss": 1.434, + "step": 1219 + }, + { + "epoch": 0.559568856782479, + "grad_norm": 0.2283989042043686, + "learning_rate": 0.0009371810614869971, + "loss": 0.9546, + "step": 1220 + }, + { + "epoch": 0.5600275197798418, + "grad_norm": 0.330026239156723, + "learning_rate": 0.0009370608334015856, + "loss": 1.5756, + "step": 1221 + }, + { + "epoch": 0.5604861827772044, + "grad_norm": 0.37356528639793396, + "learning_rate": 0.000936940498102, + "loss": 2.1412, + "step": 1222 + }, + { + "epoch": 0.5609448457745672, + "grad_norm": 0.35292860865592957, + "learning_rate": 0.0009368200556177598, + "loss": 1.697, + "step": 1223 + }, + { + "epoch": 0.5614035087719298, + "grad_norm": 0.30706706643104553, + "learning_rate": 0.0009366995059784104, + "loss": 1.2438, + "step": 1224 + }, + { + "epoch": 0.5618621717692925, + "grad_norm": 0.3499191999435425, + "learning_rate": 0.0009365788492135235, + "loss": 1.5356, + "step": 1225 + }, + { + "epoch": 0.5623208347666552, + "grad_norm": 0.16763897240161896, + "learning_rate": 0.0009364580853526967, + "loss": 0.8041, + "step": 1226 + }, + { + "epoch": 0.5627794977640179, + "grad_norm": 0.2519925534725189, + "learning_rate": 0.0009363372144255548, + "loss": 1.1698, + "step": 1227 + }, + { + "epoch": 0.5632381607613806, + "grad_norm": 0.17550460994243622, + "learning_rate": 0.0009362162364617479, + "loss": 0.9668, + "step": 1228 + }, + { + "epoch": 0.5636968237587433, + "grad_norm": 0.31531476974487305, + "learning_rate": 0.0009360951514909529, + "loss": 1.7645, + "step": 1229 + }, + { + "epoch": 0.5641554867561059, + "grad_norm": 0.26998183131217957, + "learning_rate": 0.0009359739595428729, + "loss": 1.194, + "step": 1230 + }, + { + "epoch": 0.5646141497534687, + "grad_norm": 0.3305562734603882, + "learning_rate": 0.000935852660647237, + "loss": 1.6608, + "step": 1231 + }, + { + "epoch": 0.5650728127508313, + "grad_norm": 0.3273736536502838, + "learning_rate": 0.000935731254833801, + "loss": 1.4538, + "step": 1232 + }, + { + "epoch": 0.5655314757481941, + "grad_norm": 0.3271407186985016, + "learning_rate": 0.0009356097421323461, + "loss": 1.8257, + "step": 1233 + }, + { + "epoch": 0.5659901387455567, + "grad_norm": 0.1826079934835434, + "learning_rate": 0.0009354881225726808, + "loss": 0.8564, + "step": 1234 + }, + { + "epoch": 0.5664488017429193, + "grad_norm": 0.255459725856781, + "learning_rate": 0.0009353663961846389, + "loss": 1.4302, + "step": 1235 + }, + { + "epoch": 0.5669074647402821, + "grad_norm": 0.2576141357421875, + "learning_rate": 0.0009352445629980809, + "loss": 1.3641, + "step": 1236 + }, + { + "epoch": 0.5673661277376447, + "grad_norm": 0.31090545654296875, + "learning_rate": 0.0009351226230428934, + "loss": 1.6024, + "step": 1237 + }, + { + "epoch": 0.5678247907350075, + "grad_norm": 0.2738056778907776, + "learning_rate": 0.0009350005763489888, + "loss": 1.4413, + "step": 1238 + }, + { + "epoch": 0.5682834537323701, + "grad_norm": 0.34732604026794434, + "learning_rate": 0.0009348784229463065, + "loss": 2.0205, + "step": 1239 + }, + { + "epoch": 0.5687421167297328, + "grad_norm": 0.2433692067861557, + "learning_rate": 0.0009347561628648115, + "loss": 1.4734, + "step": 1240 + }, + { + "epoch": 0.5692007797270955, + "grad_norm": 0.2669577896595001, + "learning_rate": 0.0009346337961344948, + "loss": 1.3418, + "step": 1241 + }, + { + "epoch": 0.5696594427244582, + "grad_norm": 0.2639061510562897, + "learning_rate": 0.0009345113227853741, + "loss": 1.4639, + "step": 1242 + }, + { + "epoch": 0.5701181057218209, + "grad_norm": 0.29163965582847595, + "learning_rate": 0.000934388742847493, + "loss": 1.4735, + "step": 1243 + }, + { + "epoch": 0.5705767687191836, + "grad_norm": 0.1659363955259323, + "learning_rate": 0.0009342660563509211, + "loss": 0.8228, + "step": 1244 + }, + { + "epoch": 0.5710354317165462, + "grad_norm": 0.2903018295764923, + "learning_rate": 0.0009341432633257543, + "loss": 1.3949, + "step": 1245 + }, + { + "epoch": 0.571494094713909, + "grad_norm": 0.39374253153800964, + "learning_rate": 0.0009340203638021149, + "loss": 2.1997, + "step": 1246 + }, + { + "epoch": 0.5719527577112716, + "grad_norm": 0.31729447841644287, + "learning_rate": 0.0009338973578101506, + "loss": 1.4435, + "step": 1247 + }, + { + "epoch": 0.5724114207086344, + "grad_norm": 0.12577004730701447, + "learning_rate": 0.000933774245380036, + "loss": 0.6955, + "step": 1248 + }, + { + "epoch": 0.572870083705997, + "grad_norm": 0.3122677206993103, + "learning_rate": 0.0009336510265419712, + "loss": 1.5015, + "step": 1249 + }, + { + "epoch": 0.5733287467033598, + "grad_norm": 0.2251952886581421, + "learning_rate": 0.000933527701326183, + "loss": 0.9793, + "step": 1250 + }, + { + "epoch": 0.5737874097007224, + "grad_norm": 0.2624088227748871, + "learning_rate": 0.0009334042697629235, + "loss": 1.2029, + "step": 1251 + }, + { + "epoch": 0.574246072698085, + "grad_norm": 0.2972775399684906, + "learning_rate": 0.0009332807318824717, + "loss": 1.3044, + "step": 1252 + }, + { + "epoch": 0.5747047356954478, + "grad_norm": 0.6643463969230652, + "learning_rate": 0.0009331570877151324, + "loss": 1.9626, + "step": 1253 + }, + { + "epoch": 0.5751633986928104, + "grad_norm": 0.25192347168922424, + "learning_rate": 0.0009330333372912361, + "loss": 1.2851, + "step": 1254 + }, + { + "epoch": 0.5756220616901732, + "grad_norm": 0.28251874446868896, + "learning_rate": 0.0009329094806411401, + "loss": 1.4486, + "step": 1255 + }, + { + "epoch": 0.5760807246875358, + "grad_norm": 0.3544906675815582, + "learning_rate": 0.0009327855177952267, + "loss": 1.7761, + "step": 1256 + }, + { + "epoch": 0.5765393876848985, + "grad_norm": 0.18498443067073822, + "learning_rate": 0.0009326614487839053, + "loss": 0.8643, + "step": 1257 + }, + { + "epoch": 0.5769980506822612, + "grad_norm": 0.2767449617385864, + "learning_rate": 0.0009325372736376109, + "loss": 1.3204, + "step": 1258 + }, + { + "epoch": 0.5774567136796239, + "grad_norm": 0.32360634207725525, + "learning_rate": 0.0009324129923868048, + "loss": 1.555, + "step": 1259 + }, + { + "epoch": 0.5779153766769866, + "grad_norm": 0.09593556076288223, + "learning_rate": 0.0009322886050619735, + "loss": 0.5922, + "step": 1260 + }, + { + "epoch": 0.5783740396743493, + "grad_norm": 0.2054739147424698, + "learning_rate": 0.0009321641116936306, + "loss": 0.8165, + "step": 1261 + }, + { + "epoch": 0.5788327026717119, + "grad_norm": 0.6027858257293701, + "learning_rate": 0.0009320395123123149, + "loss": 1.8843, + "step": 1262 + }, + { + "epoch": 0.5792913656690747, + "grad_norm": 0.31743377447128296, + "learning_rate": 0.0009319148069485917, + "loss": 1.2451, + "step": 1263 + }, + { + "epoch": 0.5797500286664373, + "grad_norm": 0.2888292670249939, + "learning_rate": 0.0009317899956330522, + "loss": 1.4385, + "step": 1264 + }, + { + "epoch": 0.5802086916638001, + "grad_norm": 0.17408441007137299, + "learning_rate": 0.0009316650783963132, + "loss": 0.7488, + "step": 1265 + }, + { + "epoch": 0.5806673546611627, + "grad_norm": 0.3600836396217346, + "learning_rate": 0.0009315400552690181, + "loss": 1.5487, + "step": 1266 + }, + { + "epoch": 0.5811260176585255, + "grad_norm": 0.2011416256427765, + "learning_rate": 0.0009314149262818358, + "loss": 0.825, + "step": 1267 + }, + { + "epoch": 0.5815846806558881, + "grad_norm": 0.37469393014907837, + "learning_rate": 0.0009312896914654616, + "loss": 1.4067, + "step": 1268 + }, + { + "epoch": 0.5820433436532507, + "grad_norm": 0.2877853512763977, + "learning_rate": 0.0009311643508506162, + "loss": 1.4527, + "step": 1269 + }, + { + "epoch": 0.5825020066506135, + "grad_norm": 0.3054576814174652, + "learning_rate": 0.0009310389044680467, + "loss": 1.39, + "step": 1270 + }, + { + "epoch": 0.5829606696479761, + "grad_norm": 0.09884677827358246, + "learning_rate": 0.000930913352348526, + "loss": 0.6243, + "step": 1271 + }, + { + "epoch": 0.5834193326453389, + "grad_norm": 0.25772392749786377, + "learning_rate": 0.0009307876945228528, + "loss": 1.4088, + "step": 1272 + }, + { + "epoch": 0.5838779956427015, + "grad_norm": 0.3259493112564087, + "learning_rate": 0.0009306619310218521, + "loss": 1.6538, + "step": 1273 + }, + { + "epoch": 0.5843366586400642, + "grad_norm": 0.09127166122198105, + "learning_rate": 0.0009305360618763745, + "loss": 0.5361, + "step": 1274 + }, + { + "epoch": 0.5847953216374269, + "grad_norm": 0.18113906681537628, + "learning_rate": 0.0009304100871172967, + "loss": 0.9267, + "step": 1275 + }, + { + "epoch": 0.5852539846347896, + "grad_norm": 0.32655301690101624, + "learning_rate": 0.000930284006775521, + "loss": 1.6382, + "step": 1276 + }, + { + "epoch": 0.5857126476321523, + "grad_norm": 0.19575290381908417, + "learning_rate": 0.0009301578208819758, + "loss": 0.9375, + "step": 1277 + }, + { + "epoch": 0.586171310629515, + "grad_norm": 0.3941885828971863, + "learning_rate": 0.0009300315294676158, + "loss": 2.1287, + "step": 1278 + }, + { + "epoch": 0.5866299736268776, + "grad_norm": 0.28158482909202576, + "learning_rate": 0.0009299051325634208, + "loss": 1.3167, + "step": 1279 + }, + { + "epoch": 0.5870886366242404, + "grad_norm": 0.1906464397907257, + "learning_rate": 0.000929778630200397, + "loss": 0.8625, + "step": 1280 + }, + { + "epoch": 0.587547299621603, + "grad_norm": 0.1061529740691185, + "learning_rate": 0.0009296520224095764, + "loss": 0.6863, + "step": 1281 + }, + { + "epoch": 0.5880059626189658, + "grad_norm": 0.26948490738868713, + "learning_rate": 0.0009295253092220166, + "loss": 0.886, + "step": 1282 + }, + { + "epoch": 0.5884646256163284, + "grad_norm": 0.36432507634162903, + "learning_rate": 0.0009293984906688016, + "loss": 1.8413, + "step": 1283 + }, + { + "epoch": 0.5889232886136911, + "grad_norm": 0.5231337547302246, + "learning_rate": 0.0009292715667810406, + "loss": 1.7709, + "step": 1284 + }, + { + "epoch": 0.5893819516110538, + "grad_norm": 0.3480357825756073, + "learning_rate": 0.000929144537589869, + "loss": 1.9674, + "step": 1285 + }, + { + "epoch": 0.5898406146084164, + "grad_norm": 0.24957574903964996, + "learning_rate": 0.0009290174031264482, + "loss": 1.2784, + "step": 1286 + }, + { + "epoch": 0.5902992776057792, + "grad_norm": 0.33761247992515564, + "learning_rate": 0.000928890163421965, + "loss": 1.9197, + "step": 1287 + }, + { + "epoch": 0.5907579406031418, + "grad_norm": 0.35372909903526306, + "learning_rate": 0.0009287628185076322, + "loss": 2.0242, + "step": 1288 + }, + { + "epoch": 0.5912166036005045, + "grad_norm": 0.21915297210216522, + "learning_rate": 0.0009286353684146884, + "loss": 0.9164, + "step": 1289 + }, + { + "epoch": 0.5916752665978672, + "grad_norm": 0.3058471083641052, + "learning_rate": 0.0009285078131743982, + "loss": 1.5614, + "step": 1290 + }, + { + "epoch": 0.5921339295952299, + "grad_norm": 0.1828153282403946, + "learning_rate": 0.0009283801528180517, + "loss": 0.8684, + "step": 1291 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 0.1839916706085205, + "learning_rate": 0.000928252387376965, + "loss": 0.8804, + "step": 1292 + }, + { + "epoch": 0.5930512555899553, + "grad_norm": 0.21669816970825195, + "learning_rate": 0.0009281245168824799, + "loss": 1.1169, + "step": 1293 + }, + { + "epoch": 0.5935099185873179, + "grad_norm": 0.2863485813140869, + "learning_rate": 0.0009279965413659637, + "loss": 1.5664, + "step": 1294 + }, + { + "epoch": 0.5939685815846807, + "grad_norm": 0.22065885365009308, + "learning_rate": 0.00092786846085881, + "loss": 1.0018, + "step": 1295 + }, + { + "epoch": 0.5944272445820433, + "grad_norm": 0.3145039975643158, + "learning_rate": 0.0009277402753924376, + "loss": 1.4487, + "step": 1296 + }, + { + "epoch": 0.5948859075794061, + "grad_norm": 0.2897392809391022, + "learning_rate": 0.0009276119849982917, + "loss": 1.384, + "step": 1297 + }, + { + "epoch": 0.5953445705767687, + "grad_norm": 0.3938884139060974, + "learning_rate": 0.0009274835897078425, + "loss": 2.1997, + "step": 1298 + }, + { + "epoch": 0.5958032335741315, + "grad_norm": 0.17570100724697113, + "learning_rate": 0.0009273550895525864, + "loss": 0.8196, + "step": 1299 + }, + { + "epoch": 0.5962618965714941, + "grad_norm": 0.32507872581481934, + "learning_rate": 0.0009272264845640455, + "loss": 1.4766, + "step": 1300 + }, + { + "epoch": 0.5967205595688568, + "grad_norm": 0.19803722202777863, + "learning_rate": 0.0009270977747737675, + "loss": 0.9604, + "step": 1301 + }, + { + "epoch": 0.5971792225662195, + "grad_norm": 0.32639971375465393, + "learning_rate": 0.0009269689602133258, + "loss": 1.7936, + "step": 1302 + }, + { + "epoch": 0.5976378855635821, + "grad_norm": 0.35615068674087524, + "learning_rate": 0.0009268400409143195, + "loss": 1.7267, + "step": 1303 + }, + { + "epoch": 0.5980965485609449, + "grad_norm": 0.33645865321159363, + "learning_rate": 0.0009267110169083734, + "loss": 1.6238, + "step": 1304 + }, + { + "epoch": 0.5985552115583075, + "grad_norm": 0.29043132066726685, + "learning_rate": 0.0009265818882271384, + "loss": 1.2636, + "step": 1305 + }, + { + "epoch": 0.5990138745556702, + "grad_norm": 0.41903120279312134, + "learning_rate": 0.0009264526549022903, + "loss": 2.3948, + "step": 1306 + }, + { + "epoch": 0.5994725375530329, + "grad_norm": 0.2502308189868927, + "learning_rate": 0.0009263233169655309, + "loss": 1.1672, + "step": 1307 + }, + { + "epoch": 0.5999312005503956, + "grad_norm": 0.1772756576538086, + "learning_rate": 0.000926193874448588, + "loss": 0.8739, + "step": 1308 + }, + { + "epoch": 0.6003898635477583, + "grad_norm": 0.24992649257183075, + "learning_rate": 0.0009260643273832147, + "loss": 1.4184, + "step": 1309 + }, + { + "epoch": 0.600848526545121, + "grad_norm": 0.1002533808350563, + "learning_rate": 0.0009259346758011898, + "loss": 0.6456, + "step": 1310 + }, + { + "epoch": 0.6013071895424836, + "grad_norm": 0.3838384449481964, + "learning_rate": 0.0009258049197343177, + "loss": 2.1184, + "step": 1311 + }, + { + "epoch": 0.6017658525398464, + "grad_norm": 0.1766338050365448, + "learning_rate": 0.0009256750592144287, + "loss": 0.8759, + "step": 1312 + }, + { + "epoch": 0.602224515537209, + "grad_norm": 0.37328869104385376, + "learning_rate": 0.0009255450942733783, + "loss": 1.9109, + "step": 1313 + }, + { + "epoch": 0.6026831785345718, + "grad_norm": 0.19689656794071198, + "learning_rate": 0.0009254150249430479, + "loss": 0.9383, + "step": 1314 + }, + { + "epoch": 0.6031418415319344, + "grad_norm": 0.0794055312871933, + "learning_rate": 0.0009252848512553447, + "loss": 0.4766, + "step": 1315 + }, + { + "epoch": 0.6036005045292971, + "grad_norm": 0.277407705783844, + "learning_rate": 0.0009251545732422009, + "loss": 1.7058, + "step": 1316 + }, + { + "epoch": 0.6040591675266598, + "grad_norm": 0.28111398220062256, + "learning_rate": 0.0009250241909355746, + "loss": 1.8057, + "step": 1317 + }, + { + "epoch": 0.6045178305240225, + "grad_norm": 0.2518330514431, + "learning_rate": 0.0009248937043674499, + "loss": 1.3856, + "step": 1318 + }, + { + "epoch": 0.6049764935213852, + "grad_norm": 0.3877377510070801, + "learning_rate": 0.0009247631135698358, + "loss": 2.1638, + "step": 1319 + }, + { + "epoch": 0.6054351565187478, + "grad_norm": 0.2325863391160965, + "learning_rate": 0.0009246324185747672, + "loss": 1.3607, + "step": 1320 + }, + { + "epoch": 0.6058938195161105, + "grad_norm": 0.23192419111728668, + "learning_rate": 0.0009245016194143047, + "loss": 1.1667, + "step": 1321 + }, + { + "epoch": 0.6063524825134732, + "grad_norm": 0.0964299887418747, + "learning_rate": 0.000924370716120534, + "loss": 0.623, + "step": 1322 + }, + { + "epoch": 0.6068111455108359, + "grad_norm": 0.23401454091072083, + "learning_rate": 0.0009242397087255667, + "loss": 1.2682, + "step": 1323 + }, + { + "epoch": 0.6072698085081986, + "grad_norm": 0.30539819598197937, + "learning_rate": 0.0009241085972615401, + "loss": 1.9392, + "step": 1324 + }, + { + "epoch": 0.6077284715055613, + "grad_norm": 0.1904824823141098, + "learning_rate": 0.0009239773817606165, + "loss": 0.8026, + "step": 1325 + }, + { + "epoch": 0.6081871345029239, + "grad_norm": 0.31935641169548035, + "learning_rate": 0.0009238460622549842, + "loss": 1.7756, + "step": 1326 + }, + { + "epoch": 0.6086457975002867, + "grad_norm": 0.232622429728508, + "learning_rate": 0.0009237146387768567, + "loss": 1.2202, + "step": 1327 + }, + { + "epoch": 0.6091044604976493, + "grad_norm": 0.22386907041072845, + "learning_rate": 0.0009235831113584732, + "loss": 1.0802, + "step": 1328 + }, + { + "epoch": 0.6095631234950121, + "grad_norm": 0.2252710908651352, + "learning_rate": 0.0009234514800320983, + "loss": 0.9511, + "step": 1329 + }, + { + "epoch": 0.6100217864923747, + "grad_norm": 0.3520384430885315, + "learning_rate": 0.0009233197448300221, + "loss": 1.898, + "step": 1330 + }, + { + "epoch": 0.6104804494897375, + "grad_norm": 0.3444898724555969, + "learning_rate": 0.0009231879057845601, + "loss": 1.77, + "step": 1331 + }, + { + "epoch": 0.6109391124871001, + "grad_norm": 0.24436596035957336, + "learning_rate": 0.0009230559629280535, + "loss": 1.054, + "step": 1332 + }, + { + "epoch": 0.6113977754844628, + "grad_norm": 0.11772722005844116, + "learning_rate": 0.0009229239162928689, + "loss": 0.6621, + "step": 1333 + }, + { + "epoch": 0.6118564384818255, + "grad_norm": 0.2954722046852112, + "learning_rate": 0.0009227917659113982, + "loss": 1.5235, + "step": 1334 + }, + { + "epoch": 0.6123151014791882, + "grad_norm": 0.2937714755535126, + "learning_rate": 0.0009226595118160588, + "loss": 1.3464, + "step": 1335 + }, + { + "epoch": 0.6127737644765509, + "grad_norm": 0.27231964468955994, + "learning_rate": 0.0009225271540392934, + "loss": 1.2777, + "step": 1336 + }, + { + "epoch": 0.6132324274739135, + "grad_norm": 0.19506557285785675, + "learning_rate": 0.0009223946926135709, + "loss": 0.8657, + "step": 1337 + }, + { + "epoch": 0.6136910904712762, + "grad_norm": 0.39066511392593384, + "learning_rate": 0.0009222621275713844, + "loss": 1.8091, + "step": 1338 + }, + { + "epoch": 0.6141497534686389, + "grad_norm": 0.4800299108028412, + "learning_rate": 0.0009221294589452535, + "loss": 2.0739, + "step": 1339 + }, + { + "epoch": 0.6146084164660016, + "grad_norm": 0.3111729621887207, + "learning_rate": 0.0009219966867677226, + "loss": 1.5457, + "step": 1340 + }, + { + "epoch": 0.6150670794633643, + "grad_norm": 0.30846676230430603, + "learning_rate": 0.0009218638110713615, + "loss": 1.4479, + "step": 1341 + }, + { + "epoch": 0.615525742460727, + "grad_norm": 0.3237318992614746, + "learning_rate": 0.0009217308318887659, + "loss": 1.5574, + "step": 1342 + }, + { + "epoch": 0.6159844054580896, + "grad_norm": 0.2283388376235962, + "learning_rate": 0.0009215977492525565, + "loss": 1.1732, + "step": 1343 + }, + { + "epoch": 0.6164430684554524, + "grad_norm": 0.3451511263847351, + "learning_rate": 0.0009214645631953791, + "loss": 1.7261, + "step": 1344 + }, + { + "epoch": 0.616901731452815, + "grad_norm": 0.3029400110244751, + "learning_rate": 0.0009213312737499055, + "loss": 1.3227, + "step": 1345 + }, + { + "epoch": 0.6173603944501778, + "grad_norm": 0.17842943966388702, + "learning_rate": 0.0009211978809488327, + "loss": 0.5785, + "step": 1346 + }, + { + "epoch": 0.6178190574475404, + "grad_norm": 0.3857697546482086, + "learning_rate": 0.0009210643848248824, + "loss": 1.8008, + "step": 1347 + }, + { + "epoch": 0.6182777204449031, + "grad_norm": 0.23998984694480896, + "learning_rate": 0.0009209307854108026, + "loss": 1.1037, + "step": 1348 + }, + { + "epoch": 0.6187363834422658, + "grad_norm": 0.2030552476644516, + "learning_rate": 0.0009207970827393661, + "loss": 0.8847, + "step": 1349 + }, + { + "epoch": 0.6191950464396285, + "grad_norm": 0.17879889905452728, + "learning_rate": 0.0009206632768433711, + "loss": 0.938, + "step": 1350 + }, + { + "epoch": 0.6196537094369912, + "grad_norm": 0.30431312322616577, + "learning_rate": 0.0009205293677556413, + "loss": 1.2913, + "step": 1351 + }, + { + "epoch": 0.6201123724343539, + "grad_norm": 0.09835697710514069, + "learning_rate": 0.0009203953555090252, + "loss": 0.4719, + "step": 1352 + }, + { + "epoch": 0.6205710354317165, + "grad_norm": 0.3493775427341461, + "learning_rate": 0.0009202612401363972, + "loss": 1.7051, + "step": 1353 + }, + { + "epoch": 0.6210296984290792, + "grad_norm": 0.35240983963012695, + "learning_rate": 0.0009201270216706568, + "loss": 1.2707, + "step": 1354 + }, + { + "epoch": 0.6214883614264419, + "grad_norm": 0.08556913584470749, + "learning_rate": 0.0009199927001447287, + "loss": 0.4551, + "step": 1355 + }, + { + "epoch": 0.6219470244238046, + "grad_norm": 0.18923942744731903, + "learning_rate": 0.000919858275591563, + "loss": 0.9558, + "step": 1356 + }, + { + "epoch": 0.6224056874211673, + "grad_norm": 0.22347323596477509, + "learning_rate": 0.000919723748044135, + "loss": 1.0806, + "step": 1357 + }, + { + "epoch": 0.6228643504185299, + "grad_norm": 1.7624882459640503, + "learning_rate": 0.0009195891175354451, + "loss": 1.8876, + "step": 1358 + }, + { + "epoch": 0.6233230134158927, + "grad_norm": 0.20518071949481964, + "learning_rate": 0.0009194543840985193, + "loss": 0.9335, + "step": 1359 + }, + { + "epoch": 0.6237816764132553, + "grad_norm": 0.24788199365139008, + "learning_rate": 0.0009193195477664087, + "loss": 1.2626, + "step": 1360 + }, + { + "epoch": 0.6242403394106181, + "grad_norm": 0.1483655720949173, + "learning_rate": 0.0009191846085721896, + "loss": 0.8627, + "step": 1361 + }, + { + "epoch": 0.6246990024079807, + "grad_norm": 0.42687055468559265, + "learning_rate": 0.0009190495665489635, + "loss": 1.7659, + "step": 1362 + }, + { + "epoch": 0.6251576654053435, + "grad_norm": 0.23796336352825165, + "learning_rate": 0.0009189144217298571, + "loss": 0.9816, + "step": 1363 + }, + { + "epoch": 0.6256163284027061, + "grad_norm": 0.30603915452957153, + "learning_rate": 0.0009187791741480227, + "loss": 1.3668, + "step": 1364 + }, + { + "epoch": 0.6260749914000688, + "grad_norm": 0.2691842019557953, + "learning_rate": 0.0009186438238366373, + "loss": 1.4046, + "step": 1365 + }, + { + "epoch": 0.6265336543974315, + "grad_norm": 0.3278261125087738, + "learning_rate": 0.0009185083708289032, + "loss": 1.6694, + "step": 1366 + }, + { + "epoch": 0.6269923173947942, + "grad_norm": 0.319321870803833, + "learning_rate": 0.0009183728151580484, + "loss": 1.8997, + "step": 1367 + }, + { + "epoch": 0.6274509803921569, + "grad_norm": 0.2504161596298218, + "learning_rate": 0.0009182371568573252, + "loss": 1.1439, + "step": 1368 + }, + { + "epoch": 0.6279096433895196, + "grad_norm": 0.35240206122398376, + "learning_rate": 0.0009181013959600119, + "loss": 1.7751, + "step": 1369 + }, + { + "epoch": 0.6283683063868822, + "grad_norm": 0.30558404326438904, + "learning_rate": 0.0009179655324994114, + "loss": 1.4472, + "step": 1370 + }, + { + "epoch": 0.6288269693842449, + "grad_norm": 0.3027516305446625, + "learning_rate": 0.0009178295665088522, + "loss": 1.431, + "step": 1371 + }, + { + "epoch": 0.6292856323816076, + "grad_norm": 0.1590060144662857, + "learning_rate": 0.0009176934980216876, + "loss": 0.7231, + "step": 1372 + }, + { + "epoch": 0.6297442953789703, + "grad_norm": 0.280381441116333, + "learning_rate": 0.0009175573270712961, + "loss": 1.2524, + "step": 1373 + }, + { + "epoch": 0.630202958376333, + "grad_norm": 0.2787134051322937, + "learning_rate": 0.0009174210536910816, + "loss": 1.3318, + "step": 1374 + }, + { + "epoch": 0.6306616213736956, + "grad_norm": 0.24251703917980194, + "learning_rate": 0.0009172846779144729, + "loss": 1.301, + "step": 1375 + }, + { + "epoch": 0.6311202843710584, + "grad_norm": 0.20021305978298187, + "learning_rate": 0.0009171481997749239, + "loss": 1.0351, + "step": 1376 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.19602487981319427, + "learning_rate": 0.0009170116193059138, + "loss": 0.8918, + "step": 1377 + }, + { + "epoch": 0.6320376103657838, + "grad_norm": 0.2832145690917969, + "learning_rate": 0.0009168749365409466, + "loss": 1.3265, + "step": 1378 + }, + { + "epoch": 0.6324962733631464, + "grad_norm": 0.2144933044910431, + "learning_rate": 0.0009167381515135515, + "loss": 0.8716, + "step": 1379 + }, + { + "epoch": 0.6329549363605091, + "grad_norm": 0.17951545119285583, + "learning_rate": 0.0009166012642572832, + "loss": 0.7958, + "step": 1380 + }, + { + "epoch": 0.6334135993578718, + "grad_norm": 0.06861251592636108, + "learning_rate": 0.0009164642748057208, + "loss": 0.403, + "step": 1381 + }, + { + "epoch": 0.6338722623552345, + "grad_norm": 0.08268727362155914, + "learning_rate": 0.0009163271831924689, + "loss": 0.4848, + "step": 1382 + }, + { + "epoch": 0.6343309253525972, + "grad_norm": 0.26671648025512695, + "learning_rate": 0.0009161899894511572, + "loss": 1.2793, + "step": 1383 + }, + { + "epoch": 0.6347895883499599, + "grad_norm": 0.31374937295913696, + "learning_rate": 0.00091605269361544, + "loss": 1.6044, + "step": 1384 + }, + { + "epoch": 0.6352482513473225, + "grad_norm": 0.3432426452636719, + "learning_rate": 0.0009159152957189975, + "loss": 1.7935, + "step": 1385 + }, + { + "epoch": 0.6357069143446853, + "grad_norm": 0.3372509479522705, + "learning_rate": 0.0009157777957955337, + "loss": 1.8975, + "step": 1386 + }, + { + "epoch": 0.6361655773420479, + "grad_norm": 0.3484379053115845, + "learning_rate": 0.000915640193878779, + "loss": 1.6467, + "step": 1387 + }, + { + "epoch": 0.6366242403394106, + "grad_norm": 0.268187016248703, + "learning_rate": 0.0009155024900024877, + "loss": 1.3937, + "step": 1388 + }, + { + "epoch": 0.6370829033367733, + "grad_norm": 0.27969205379486084, + "learning_rate": 0.0009153646842004396, + "loss": 1.2155, + "step": 1389 + }, + { + "epoch": 0.637541566334136, + "grad_norm": 0.22392551600933075, + "learning_rate": 0.0009152267765064395, + "loss": 0.8904, + "step": 1390 + }, + { + "epoch": 0.6380002293314987, + "grad_norm": 0.3370068371295929, + "learning_rate": 0.0009150887669543173, + "loss": 1.496, + "step": 1391 + }, + { + "epoch": 0.6384588923288613, + "grad_norm": 0.4173247814178467, + "learning_rate": 0.0009149506555779277, + "loss": 2.1025, + "step": 1392 + }, + { + "epoch": 0.6389175553262241, + "grad_norm": 0.3257886469364166, + "learning_rate": 0.0009148124424111501, + "loss": 1.2627, + "step": 1393 + }, + { + "epoch": 0.6393762183235867, + "grad_norm": 0.34521573781967163, + "learning_rate": 0.0009146741274878896, + "loss": 1.6971, + "step": 1394 + }, + { + "epoch": 0.6398348813209495, + "grad_norm": 0.43050798773765564, + "learning_rate": 0.0009145357108420756, + "loss": 1.8064, + "step": 1395 + }, + { + "epoch": 0.6402935443183121, + "grad_norm": 0.31571662425994873, + "learning_rate": 0.0009143971925076629, + "loss": 1.3123, + "step": 1396 + }, + { + "epoch": 0.6407522073156748, + "grad_norm": 0.3214929401874542, + "learning_rate": 0.0009142585725186307, + "loss": 1.5576, + "step": 1397 + }, + { + "epoch": 0.6412108703130375, + "grad_norm": 0.3679821491241455, + "learning_rate": 0.0009141198509089838, + "loss": 1.677, + "step": 1398 + }, + { + "epoch": 0.6416695333104002, + "grad_norm": 0.3775405287742615, + "learning_rate": 0.0009139810277127516, + "loss": 1.8577, + "step": 1399 + }, + { + "epoch": 0.6421281963077629, + "grad_norm": 0.12690681219100952, + "learning_rate": 0.0009138421029639882, + "loss": 0.6524, + "step": 1400 + }, + { + "epoch": 0.6425868593051256, + "grad_norm": 0.22937628626823425, + "learning_rate": 0.0009137030766967731, + "loss": 0.9138, + "step": 1401 + }, + { + "epoch": 0.6430455223024882, + "grad_norm": 0.29358240962028503, + "learning_rate": 0.0009135639489452103, + "loss": 1.4675, + "step": 1402 + }, + { + "epoch": 0.643504185299851, + "grad_norm": 0.30726784467697144, + "learning_rate": 0.0009134247197434288, + "loss": 1.5634, + "step": 1403 + }, + { + "epoch": 0.6439628482972136, + "grad_norm": 0.2912197411060333, + "learning_rate": 0.0009132853891255827, + "loss": 1.3088, + "step": 1404 + }, + { + "epoch": 0.6444215112945763, + "grad_norm": 0.2756345868110657, + "learning_rate": 0.0009131459571258507, + "loss": 1.5412, + "step": 1405 + }, + { + "epoch": 0.644880174291939, + "grad_norm": 0.41657257080078125, + "learning_rate": 0.0009130064237784364, + "loss": 1.8962, + "step": 1406 + }, + { + "epoch": 0.6453388372893016, + "grad_norm": 0.270457923412323, + "learning_rate": 0.0009128667891175685, + "loss": 1.441, + "step": 1407 + }, + { + "epoch": 0.6457975002866644, + "grad_norm": 0.3133867383003235, + "learning_rate": 0.0009127270531775003, + "loss": 1.7017, + "step": 1408 + }, + { + "epoch": 0.646256163284027, + "grad_norm": 0.44148820638656616, + "learning_rate": 0.00091258721599251, + "loss": 1.6197, + "step": 1409 + }, + { + "epoch": 0.6467148262813898, + "grad_norm": 0.33237019181251526, + "learning_rate": 0.0009124472775969006, + "loss": 1.6628, + "step": 1410 + }, + { + "epoch": 0.6471734892787524, + "grad_norm": 0.3485216796398163, + "learning_rate": 0.0009123072380250003, + "loss": 0.9962, + "step": 1411 + }, + { + "epoch": 0.6476321522761151, + "grad_norm": 0.25593242049217224, + "learning_rate": 0.0009121670973111616, + "loss": 1.3634, + "step": 1412 + }, + { + "epoch": 0.6480908152734778, + "grad_norm": 0.20914927124977112, + "learning_rate": 0.000912026855489762, + "loss": 0.9994, + "step": 1413 + }, + { + "epoch": 0.6485494782708405, + "grad_norm": 0.2085810899734497, + "learning_rate": 0.0009118865125952038, + "loss": 1.0074, + "step": 1414 + }, + { + "epoch": 0.6490081412682032, + "grad_norm": 0.1792818307876587, + "learning_rate": 0.0009117460686619143, + "loss": 0.774, + "step": 1415 + }, + { + "epoch": 0.6494668042655659, + "grad_norm": 0.24848847091197968, + "learning_rate": 0.0009116055237243454, + "loss": 1.1238, + "step": 1416 + }, + { + "epoch": 0.6499254672629285, + "grad_norm": 0.2472166121006012, + "learning_rate": 0.0009114648778169735, + "loss": 1.1925, + "step": 1417 + }, + { + "epoch": 0.6503841302602913, + "grad_norm": 0.30023932456970215, + "learning_rate": 0.0009113241309743003, + "loss": 1.6653, + "step": 1418 + }, + { + "epoch": 0.6508427932576539, + "grad_norm": 0.2693098783493042, + "learning_rate": 0.0009111832832308522, + "loss": 1.4279, + "step": 1419 + }, + { + "epoch": 0.6513014562550167, + "grad_norm": 0.2827019691467285, + "learning_rate": 0.0009110423346211797, + "loss": 1.5595, + "step": 1420 + }, + { + "epoch": 0.6517601192523793, + "grad_norm": 0.31124547123908997, + "learning_rate": 0.0009109012851798588, + "loss": 1.7781, + "step": 1421 + }, + { + "epoch": 0.6522187822497421, + "grad_norm": 0.23482073843479156, + "learning_rate": 0.0009107601349414899, + "loss": 1.1257, + "step": 1422 + }, + { + "epoch": 0.6526774452471047, + "grad_norm": 0.34504595398902893, + "learning_rate": 0.0009106188839406982, + "loss": 1.9086, + "step": 1423 + }, + { + "epoch": 0.6531361082444673, + "grad_norm": 0.1269492506980896, + "learning_rate": 0.0009104775322121334, + "loss": 0.7008, + "step": 1424 + }, + { + "epoch": 0.6535947712418301, + "grad_norm": 0.2349303960800171, + "learning_rate": 0.0009103360797904705, + "loss": 1.3405, + "step": 1425 + }, + { + "epoch": 0.6540534342391927, + "grad_norm": 0.27141550183296204, + "learning_rate": 0.0009101945267104084, + "loss": 1.4506, + "step": 1426 + }, + { + "epoch": 0.6545120972365555, + "grad_norm": 0.28015798330307007, + "learning_rate": 0.0009100528730066712, + "loss": 1.3356, + "step": 1427 + }, + { + "epoch": 0.6549707602339181, + "grad_norm": 0.07972334325313568, + "learning_rate": 0.0009099111187140078, + "loss": 0.5118, + "step": 1428 + }, + { + "epoch": 0.6554294232312808, + "grad_norm": 0.2610866129398346, + "learning_rate": 0.0009097692638671913, + "loss": 1.1424, + "step": 1429 + }, + { + "epoch": 0.6558880862286435, + "grad_norm": 0.46445217728614807, + "learning_rate": 0.0009096273085010197, + "loss": 2.2559, + "step": 1430 + }, + { + "epoch": 0.6563467492260062, + "grad_norm": 0.2951407730579376, + "learning_rate": 0.0009094852526503158, + "loss": 1.4451, + "step": 1431 + }, + { + "epoch": 0.6568054122233689, + "grad_norm": 0.11164001375436783, + "learning_rate": 0.0009093430963499269, + "loss": 0.6334, + "step": 1432 + }, + { + "epoch": 0.6572640752207316, + "grad_norm": 0.3511189818382263, + "learning_rate": 0.0009092008396347249, + "loss": 1.946, + "step": 1433 + }, + { + "epoch": 0.6577227382180942, + "grad_norm": 0.29423317313194275, + "learning_rate": 0.0009090584825396064, + "loss": 1.4645, + "step": 1434 + }, + { + "epoch": 0.658181401215457, + "grad_norm": 0.2262077033519745, + "learning_rate": 0.0009089160250994928, + "loss": 1.0862, + "step": 1435 + }, + { + "epoch": 0.6586400642128196, + "grad_norm": 0.2860635221004486, + "learning_rate": 0.0009087734673493298, + "loss": 1.3347, + "step": 1436 + }, + { + "epoch": 0.6590987272101824, + "grad_norm": 0.3139696717262268, + "learning_rate": 0.0009086308093240878, + "loss": 1.4883, + "step": 1437 + }, + { + "epoch": 0.659557390207545, + "grad_norm": 0.3125250041484833, + "learning_rate": 0.0009084880510587619, + "loss": 1.6454, + "step": 1438 + }, + { + "epoch": 0.6600160532049077, + "grad_norm": 0.2761354446411133, + "learning_rate": 0.0009083451925883716, + "loss": 1.4274, + "step": 1439 + }, + { + "epoch": 0.6604747162022704, + "grad_norm": 0.20550890266895294, + "learning_rate": 0.0009082022339479615, + "loss": 0.9324, + "step": 1440 + }, + { + "epoch": 0.660933379199633, + "grad_norm": 0.18471978604793549, + "learning_rate": 0.0009080591751726, + "loss": 0.9747, + "step": 1441 + }, + { + "epoch": 0.6613920421969958, + "grad_norm": 0.23342064023017883, + "learning_rate": 0.0009079160162973805, + "loss": 1.2667, + "step": 1442 + }, + { + "epoch": 0.6618507051943584, + "grad_norm": 0.4921042025089264, + "learning_rate": 0.0009077727573574211, + "loss": 2.2029, + "step": 1443 + }, + { + "epoch": 0.6623093681917211, + "grad_norm": 0.2813926935195923, + "learning_rate": 0.000907629398387864, + "loss": 1.328, + "step": 1444 + }, + { + "epoch": 0.6627680311890838, + "grad_norm": 0.18378609418869019, + "learning_rate": 0.0009074859394238763, + "loss": 0.8692, + "step": 1445 + }, + { + "epoch": 0.6632266941864465, + "grad_norm": 0.11556479334831238, + "learning_rate": 0.0009073423805006495, + "loss": 0.556, + "step": 1446 + }, + { + "epoch": 0.6636853571838092, + "grad_norm": 0.28978320956230164, + "learning_rate": 0.0009071987216533999, + "loss": 0.9043, + "step": 1447 + }, + { + "epoch": 0.6641440201811719, + "grad_norm": 0.41565844416618347, + "learning_rate": 0.0009070549629173677, + "loss": 1.9819, + "step": 1448 + }, + { + "epoch": 0.6646026831785345, + "grad_norm": 0.3662017285823822, + "learning_rate": 0.0009069111043278181, + "loss": 1.5942, + "step": 1449 + }, + { + "epoch": 0.6650613461758973, + "grad_norm": 0.31095772981643677, + "learning_rate": 0.0009067671459200406, + "loss": 1.3741, + "step": 1450 + }, + { + "epoch": 0.6655200091732599, + "grad_norm": 0.33298200368881226, + "learning_rate": 0.0009066230877293493, + "loss": 1.777, + "step": 1451 + }, + { + "epoch": 0.6659786721706227, + "grad_norm": 0.28431054949760437, + "learning_rate": 0.0009064789297910826, + "loss": 1.4286, + "step": 1452 + }, + { + "epoch": 0.6664373351679853, + "grad_norm": 0.32433438301086426, + "learning_rate": 0.0009063346721406037, + "loss": 1.8176, + "step": 1453 + }, + { + "epoch": 0.6668959981653481, + "grad_norm": 0.24182525277137756, + "learning_rate": 0.0009061903148132997, + "loss": 1.2091, + "step": 1454 + }, + { + "epoch": 0.6673546611627107, + "grad_norm": 0.21921652555465698, + "learning_rate": 0.0009060458578445829, + "loss": 1.0043, + "step": 1455 + }, + { + "epoch": 0.6678133241600734, + "grad_norm": 0.3465001881122589, + "learning_rate": 0.0009059013012698892, + "loss": 1.97, + "step": 1456 + }, + { + "epoch": 0.6682719871574361, + "grad_norm": 0.3193022310733795, + "learning_rate": 0.0009057566451246797, + "loss": 1.6008, + "step": 1457 + }, + { + "epoch": 0.6687306501547987, + "grad_norm": 0.261091947555542, + "learning_rate": 0.0009056118894444396, + "loss": 1.1853, + "step": 1458 + }, + { + "epoch": 0.6691893131521615, + "grad_norm": 0.21906504034996033, + "learning_rate": 0.0009054670342646782, + "loss": 0.846, + "step": 1459 + }, + { + "epoch": 0.6696479761495241, + "grad_norm": 0.37668684124946594, + "learning_rate": 0.0009053220796209298, + "loss": 1.3909, + "step": 1460 + }, + { + "epoch": 0.6701066391468868, + "grad_norm": 0.38679805397987366, + "learning_rate": 0.000905177025548753, + "loss": 1.8932, + "step": 1461 + }, + { + "epoch": 0.6705653021442495, + "grad_norm": 0.09460335969924927, + "learning_rate": 0.00090503187208373, + "loss": 0.5168, + "step": 1462 + }, + { + "epoch": 0.6710239651416122, + "grad_norm": 0.1873873472213745, + "learning_rate": 0.0009048866192614685, + "loss": 1.0623, + "step": 1463 + }, + { + "epoch": 0.6714826281389749, + "grad_norm": 0.34352460503578186, + "learning_rate": 0.0009047412671175999, + "loss": 1.9176, + "step": 1464 + }, + { + "epoch": 0.6719412911363376, + "grad_norm": 0.36623653769493103, + "learning_rate": 0.0009045958156877801, + "loss": 1.3452, + "step": 1465 + }, + { + "epoch": 0.6723999541337002, + "grad_norm": 0.2123580127954483, + "learning_rate": 0.0009044502650076895, + "loss": 0.9063, + "step": 1466 + }, + { + "epoch": 0.672858617131063, + "grad_norm": 0.3140578866004944, + "learning_rate": 0.0009043046151130326, + "loss": 1.7998, + "step": 1467 + }, + { + "epoch": 0.6733172801284256, + "grad_norm": 0.32382750511169434, + "learning_rate": 0.0009041588660395385, + "loss": 1.5197, + "step": 1468 + }, + { + "epoch": 0.6737759431257884, + "grad_norm": 0.3363771140575409, + "learning_rate": 0.0009040130178229604, + "loss": 2.1333, + "step": 1469 + }, + { + "epoch": 0.674234606123151, + "grad_norm": 0.3622763156890869, + "learning_rate": 0.0009038670704990759, + "loss": 1.6564, + "step": 1470 + }, + { + "epoch": 0.6746932691205138, + "grad_norm": 0.3316439092159271, + "learning_rate": 0.000903721024103687, + "loss": 1.965, + "step": 1471 + }, + { + "epoch": 0.6751519321178764, + "grad_norm": 0.28195494413375854, + "learning_rate": 0.0009035748786726199, + "loss": 1.2933, + "step": 1472 + }, + { + "epoch": 0.6756105951152391, + "grad_norm": 0.26023930311203003, + "learning_rate": 0.0009034286342417251, + "loss": 1.2734, + "step": 1473 + }, + { + "epoch": 0.6760692581126018, + "grad_norm": 0.39502447843551636, + "learning_rate": 0.0009032822908468775, + "loss": 2.2822, + "step": 1474 + }, + { + "epoch": 0.6765279211099644, + "grad_norm": 0.20976006984710693, + "learning_rate": 0.0009031358485239761, + "loss": 1.0251, + "step": 1475 + }, + { + "epoch": 0.6769865841073271, + "grad_norm": 0.3310341238975525, + "learning_rate": 0.0009029893073089443, + "loss": 1.9667, + "step": 1476 + }, + { + "epoch": 0.6774452471046898, + "grad_norm": 0.2633422911167145, + "learning_rate": 0.0009028426672377297, + "loss": 1.5052, + "step": 1477 + }, + { + "epoch": 0.6779039101020525, + "grad_norm": 0.20135562121868134, + "learning_rate": 0.0009026959283463044, + "loss": 1.1064, + "step": 1478 + }, + { + "epoch": 0.6783625730994152, + "grad_norm": 0.43766430020332336, + "learning_rate": 0.000902549090670664, + "loss": 2.6384, + "step": 1479 + }, + { + "epoch": 0.6788212360967779, + "grad_norm": 0.35967737436294556, + "learning_rate": 0.0009024021542468292, + "loss": 2.2778, + "step": 1480 + }, + { + "epoch": 0.6792798990941405, + "grad_norm": 0.2625736892223358, + "learning_rate": 0.0009022551191108446, + "loss": 1.4241, + "step": 1481 + }, + { + "epoch": 0.6797385620915033, + "grad_norm": 0.41545170545578003, + "learning_rate": 0.0009021079852987788, + "loss": 2.2012, + "step": 1482 + }, + { + "epoch": 0.6801972250888659, + "grad_norm": 0.2915220856666565, + "learning_rate": 0.0009019607528467249, + "loss": 1.3696, + "step": 1483 + }, + { + "epoch": 0.6806558880862287, + "grad_norm": 0.274554967880249, + "learning_rate": 0.0009018134217907999, + "loss": 1.3186, + "step": 1484 + }, + { + "epoch": 0.6811145510835913, + "grad_norm": 0.2678084671497345, + "learning_rate": 0.0009016659921671454, + "loss": 1.5616, + "step": 1485 + }, + { + "epoch": 0.6815732140809541, + "grad_norm": 0.22896720468997955, + "learning_rate": 0.000901518464011927, + "loss": 1.2181, + "step": 1486 + }, + { + "epoch": 0.6820318770783167, + "grad_norm": 0.3303893506526947, + "learning_rate": 0.0009013708373613341, + "loss": 1.9144, + "step": 1487 + }, + { + "epoch": 0.6824905400756794, + "grad_norm": 0.34536314010620117, + "learning_rate": 0.0009012231122515807, + "loss": 1.5248, + "step": 1488 + }, + { + "epoch": 0.6829492030730421, + "grad_norm": 0.2860594093799591, + "learning_rate": 0.0009010752887189051, + "loss": 1.5133, + "step": 1489 + }, + { + "epoch": 0.6834078660704048, + "grad_norm": 0.15731200575828552, + "learning_rate": 0.0009009273667995691, + "loss": 0.7388, + "step": 1490 + }, + { + "epoch": 0.6838665290677675, + "grad_norm": 0.2773778438568115, + "learning_rate": 0.0009007793465298593, + "loss": 1.5673, + "step": 1491 + }, + { + "epoch": 0.6843251920651301, + "grad_norm": 0.3588542342185974, + "learning_rate": 0.000900631227946086, + "loss": 1.9658, + "step": 1492 + }, + { + "epoch": 0.6847838550624928, + "grad_norm": 0.3078388273715973, + "learning_rate": 0.0009004830110845838, + "loss": 1.25, + "step": 1493 + }, + { + "epoch": 0.6852425180598555, + "grad_norm": 0.34084051847457886, + "learning_rate": 0.0009003346959817113, + "loss": 1.7067, + "step": 1494 + }, + { + "epoch": 0.6857011810572182, + "grad_norm": 0.34470683336257935, + "learning_rate": 0.0009001862826738514, + "loss": 1.8342, + "step": 1495 + }, + { + "epoch": 0.6861598440545809, + "grad_norm": 0.281382292509079, + "learning_rate": 0.0009000377711974109, + "loss": 1.2902, + "step": 1496 + }, + { + "epoch": 0.6866185070519436, + "grad_norm": 0.2875047028064728, + "learning_rate": 0.0008998891615888205, + "loss": 1.4417, + "step": 1497 + }, + { + "epoch": 0.6870771700493062, + "grad_norm": 0.37324872612953186, + "learning_rate": 0.0008997404538845355, + "loss": 1.9172, + "step": 1498 + }, + { + "epoch": 0.687535833046669, + "grad_norm": 0.35222485661506653, + "learning_rate": 0.0008995916481210349, + "loss": 1.7999, + "step": 1499 + }, + { + "epoch": 0.6879944960440316, + "grad_norm": 0.28432413935661316, + "learning_rate": 0.0008994427443348217, + "loss": 1.3091, + "step": 1500 + }, + { + "epoch": 0.6884531590413944, + "grad_norm": 0.3234626054763794, + "learning_rate": 0.0008992937425624235, + "loss": 1.6951, + "step": 1501 + }, + { + "epoch": 0.688911822038757, + "grad_norm": 0.3846684396266937, + "learning_rate": 0.0008991446428403909, + "loss": 2.1714, + "step": 1502 + }, + { + "epoch": 0.6893704850361198, + "grad_norm": 0.3131919205188751, + "learning_rate": 0.0008989954452052995, + "loss": 1.3457, + "step": 1503 + }, + { + "epoch": 0.6898291480334824, + "grad_norm": 0.19373203814029694, + "learning_rate": 0.0008988461496937485, + "loss": 0.9649, + "step": 1504 + }, + { + "epoch": 0.6902878110308451, + "grad_norm": 0.41193410754203796, + "learning_rate": 0.0008986967563423612, + "loss": 1.9963, + "step": 1505 + }, + { + "epoch": 0.6907464740282078, + "grad_norm": 0.3604571223258972, + "learning_rate": 0.0008985472651877847, + "loss": 1.7517, + "step": 1506 + }, + { + "epoch": 0.6912051370255705, + "grad_norm": 0.3667939007282257, + "learning_rate": 0.0008983976762666905, + "loss": 1.6728, + "step": 1507 + }, + { + "epoch": 0.6916638000229332, + "grad_norm": 0.3759375214576721, + "learning_rate": 0.0008982479896157737, + "loss": 2.1001, + "step": 1508 + }, + { + "epoch": 0.6921224630202958, + "grad_norm": 0.37497055530548096, + "learning_rate": 0.0008980982052717534, + "loss": 1.7802, + "step": 1509 + }, + { + "epoch": 0.6925811260176585, + "grad_norm": 0.23320765793323517, + "learning_rate": 0.0008979483232713731, + "loss": 1.096, + "step": 1510 + }, + { + "epoch": 0.6930397890150212, + "grad_norm": 0.27966445684432983, + "learning_rate": 0.0008977983436513997, + "loss": 1.3906, + "step": 1511 + }, + { + "epoch": 0.6934984520123839, + "grad_norm": 0.2946244180202484, + "learning_rate": 0.0008976482664486241, + "loss": 1.5397, + "step": 1512 + }, + { + "epoch": 0.6939571150097466, + "grad_norm": 0.3360169231891632, + "learning_rate": 0.0008974980916998618, + "loss": 1.6011, + "step": 1513 + }, + { + "epoch": 0.6944157780071093, + "grad_norm": 0.28576546907424927, + "learning_rate": 0.0008973478194419515, + "loss": 1.4437, + "step": 1514 + }, + { + "epoch": 0.6948744410044719, + "grad_norm": 0.25533977150917053, + "learning_rate": 0.000897197449711756, + "loss": 1.0789, + "step": 1515 + }, + { + "epoch": 0.6953331040018347, + "grad_norm": 0.08754102140665054, + "learning_rate": 0.000897046982546162, + "loss": 0.4806, + "step": 1516 + }, + { + "epoch": 0.6957917669991973, + "grad_norm": 0.2931134104728699, + "learning_rate": 0.0008968964179820806, + "loss": 1.8093, + "step": 1517 + }, + { + "epoch": 0.6962504299965601, + "grad_norm": 0.10447783023118973, + "learning_rate": 0.0008967457560564459, + "loss": 0.5872, + "step": 1518 + }, + { + "epoch": 0.6967090929939227, + "grad_norm": 0.1723310351371765, + "learning_rate": 0.0008965949968062166, + "loss": 0.758, + "step": 1519 + }, + { + "epoch": 0.6971677559912854, + "grad_norm": 0.27590087056159973, + "learning_rate": 0.000896444140268375, + "loss": 1.2286, + "step": 1520 + }, + { + "epoch": 0.6976264189886481, + "grad_norm": 0.3477742671966553, + "learning_rate": 0.0008962931864799272, + "loss": 1.5517, + "step": 1521 + }, + { + "epoch": 0.6980850819860108, + "grad_norm": 0.34647393226623535, + "learning_rate": 0.0008961421354779036, + "loss": 1.5421, + "step": 1522 + }, + { + "epoch": 0.6985437449833735, + "grad_norm": 0.36403578519821167, + "learning_rate": 0.0008959909872993574, + "loss": 1.9447, + "step": 1523 + }, + { + "epoch": 0.6990024079807362, + "grad_norm": 0.33395373821258545, + "learning_rate": 0.0008958397419813671, + "loss": 1.6851, + "step": 1524 + }, + { + "epoch": 0.6994610709780988, + "grad_norm": 0.23508980870246887, + "learning_rate": 0.0008956883995610338, + "loss": 1.063, + "step": 1525 + }, + { + "epoch": 0.6999197339754615, + "grad_norm": 0.29683560132980347, + "learning_rate": 0.0008955369600754831, + "loss": 1.3033, + "step": 1526 + }, + { + "epoch": 0.7003783969728242, + "grad_norm": 0.238669291138649, + "learning_rate": 0.0008953854235618641, + "loss": 1.0421, + "step": 1527 + }, + { + "epoch": 0.7008370599701869, + "grad_norm": 0.31371814012527466, + "learning_rate": 0.00089523379005735, + "loss": 1.0029, + "step": 1528 + }, + { + "epoch": 0.7012957229675496, + "grad_norm": 0.2695467472076416, + "learning_rate": 0.0008950820595991371, + "loss": 1.3376, + "step": 1529 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.17910736799240112, + "learning_rate": 0.0008949302322244465, + "loss": 0.9264, + "step": 1530 + }, + { + "epoch": 0.702213048962275, + "grad_norm": 0.4682910144329071, + "learning_rate": 0.0008947783079705223, + "loss": 1.4216, + "step": 1531 + }, + { + "epoch": 0.7026717119596376, + "grad_norm": 0.37215444445610046, + "learning_rate": 0.0008946262868746327, + "loss": 2.3613, + "step": 1532 + }, + { + "epoch": 0.7031303749570004, + "grad_norm": 0.25320783257484436, + "learning_rate": 0.0008944741689740695, + "loss": 1.2519, + "step": 1533 + }, + { + "epoch": 0.703589037954363, + "grad_norm": 0.24361571669578552, + "learning_rate": 0.0008943219543061481, + "loss": 1.191, + "step": 1534 + }, + { + "epoch": 0.7040477009517258, + "grad_norm": 0.3469868004322052, + "learning_rate": 0.0008941696429082084, + "loss": 1.9496, + "step": 1535 + }, + { + "epoch": 0.7045063639490884, + "grad_norm": 0.3765385150909424, + "learning_rate": 0.0008940172348176132, + "loss": 2.0288, + "step": 1536 + }, + { + "epoch": 0.7049650269464511, + "grad_norm": 0.23416545987129211, + "learning_rate": 0.0008938647300717491, + "loss": 0.9575, + "step": 1537 + }, + { + "epoch": 0.7054236899438138, + "grad_norm": 0.3623865246772766, + "learning_rate": 0.0008937121287080268, + "loss": 1.8331, + "step": 1538 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.20962877571582794, + "learning_rate": 0.0008935594307638806, + "loss": 1.1107, + "step": 1539 + }, + { + "epoch": 0.7063410159385392, + "grad_norm": 0.24036376178264618, + "learning_rate": 0.0008934066362767684, + "loss": 1.1372, + "step": 1540 + }, + { + "epoch": 0.7067996789359019, + "grad_norm": 0.0797274112701416, + "learning_rate": 0.0008932537452841716, + "loss": 0.4638, + "step": 1541 + }, + { + "epoch": 0.7072583419332645, + "grad_norm": 0.32924991846084595, + "learning_rate": 0.0008931007578235957, + "loss": 1.4589, + "step": 1542 + }, + { + "epoch": 0.7077170049306272, + "grad_norm": 0.3090713322162628, + "learning_rate": 0.0008929476739325694, + "loss": 1.2155, + "step": 1543 + }, + { + "epoch": 0.7081756679279899, + "grad_norm": 0.32841405272483826, + "learning_rate": 0.0008927944936486454, + "loss": 1.3251, + "step": 1544 + }, + { + "epoch": 0.7086343309253526, + "grad_norm": 0.2854306697845459, + "learning_rate": 0.0008926412170093998, + "loss": 1.2567, + "step": 1545 + }, + { + "epoch": 0.7090929939227153, + "grad_norm": 0.3006875514984131, + "learning_rate": 0.0008924878440524326, + "loss": 1.7976, + "step": 1546 + }, + { + "epoch": 0.7095516569200779, + "grad_norm": 0.42627203464508057, + "learning_rate": 0.0008923343748153674, + "loss": 1.429, + "step": 1547 + }, + { + "epoch": 0.7100103199174407, + "grad_norm": 0.36591845750808716, + "learning_rate": 0.0008921808093358512, + "loss": 2.0798, + "step": 1548 + }, + { + "epoch": 0.7104689829148033, + "grad_norm": 0.2793489694595337, + "learning_rate": 0.0008920271476515547, + "loss": 1.7866, + "step": 1549 + }, + { + "epoch": 0.7109276459121661, + "grad_norm": 0.24239283800125122, + "learning_rate": 0.0008918733898001721, + "loss": 1.1033, + "step": 1550 + }, + { + "epoch": 0.7113863089095287, + "grad_norm": 0.42722949385643005, + "learning_rate": 0.0008917195358194214, + "loss": 1.3158, + "step": 1551 + }, + { + "epoch": 0.7118449719068914, + "grad_norm": 0.10515722632408142, + "learning_rate": 0.0008915655857470443, + "loss": 0.5974, + "step": 1552 + }, + { + "epoch": 0.7123036349042541, + "grad_norm": 0.3216477334499359, + "learning_rate": 0.0008914115396208056, + "loss": 1.8257, + "step": 1553 + }, + { + "epoch": 0.7127622979016168, + "grad_norm": 0.27405673265457153, + "learning_rate": 0.000891257397478494, + "loss": 1.3561, + "step": 1554 + }, + { + "epoch": 0.7132209608989795, + "grad_norm": 0.2392241358757019, + "learning_rate": 0.0008911031593579217, + "loss": 1.3283, + "step": 1555 + }, + { + "epoch": 0.7136796238963422, + "grad_norm": 0.3045576810836792, + "learning_rate": 0.0008909488252969244, + "loss": 1.3752, + "step": 1556 + }, + { + "epoch": 0.7141382868937048, + "grad_norm": 0.34698760509490967, + "learning_rate": 0.0008907943953333613, + "loss": 1.8883, + "step": 1557 + }, + { + "epoch": 0.7145969498910676, + "grad_norm": 0.3657462000846863, + "learning_rate": 0.0008906398695051153, + "loss": 2.2324, + "step": 1558 + }, + { + "epoch": 0.7150556128884302, + "grad_norm": 0.2741200029850006, + "learning_rate": 0.0008904852478500927, + "loss": 1.3587, + "step": 1559 + }, + { + "epoch": 0.7155142758857929, + "grad_norm": 0.3094598352909088, + "learning_rate": 0.0008903305304062232, + "loss": 1.7721, + "step": 1560 + }, + { + "epoch": 0.7159729388831556, + "grad_norm": 0.45374658703804016, + "learning_rate": 0.0008901757172114601, + "loss": 2.1138, + "step": 1561 + }, + { + "epoch": 0.7164316018805182, + "grad_norm": 0.6525793671607971, + "learning_rate": 0.0008900208083037804, + "loss": 1.7445, + "step": 1562 + }, + { + "epoch": 0.716890264877881, + "grad_norm": 0.39578354358673096, + "learning_rate": 0.0008898658037211842, + "loss": 1.2509, + "step": 1563 + }, + { + "epoch": 0.7173489278752436, + "grad_norm": 0.348197340965271, + "learning_rate": 0.0008897107035016952, + "loss": 1.8569, + "step": 1564 + }, + { + "epoch": 0.7178075908726064, + "grad_norm": 0.3598276972770691, + "learning_rate": 0.0008895555076833607, + "loss": 1.7867, + "step": 1565 + }, + { + "epoch": 0.718266253869969, + "grad_norm": 0.2803894579410553, + "learning_rate": 0.0008894002163042514, + "loss": 1.3177, + "step": 1566 + }, + { + "epoch": 0.7187249168673318, + "grad_norm": 0.3473854064941406, + "learning_rate": 0.0008892448294024612, + "loss": 1.8008, + "step": 1567 + }, + { + "epoch": 0.7191835798646944, + "grad_norm": 0.2351778894662857, + "learning_rate": 0.0008890893470161078, + "loss": 0.7619, + "step": 1568 + }, + { + "epoch": 0.7196422428620571, + "grad_norm": 0.3453526794910431, + "learning_rate": 0.0008889337691833321, + "loss": 1.9944, + "step": 1569 + }, + { + "epoch": 0.7201009058594198, + "grad_norm": 0.40439310669898987, + "learning_rate": 0.0008887780959422984, + "loss": 1.496, + "step": 1570 + }, + { + "epoch": 0.7205595688567825, + "grad_norm": 0.3145516514778137, + "learning_rate": 0.0008886223273311946, + "loss": 1.8382, + "step": 1571 + }, + { + "epoch": 0.7210182318541452, + "grad_norm": 0.33060380816459656, + "learning_rate": 0.0008884664633882317, + "loss": 1.4937, + "step": 1572 + }, + { + "epoch": 0.7214768948515079, + "grad_norm": 0.11799333989620209, + "learning_rate": 0.0008883105041516445, + "loss": 0.6943, + "step": 1573 + }, + { + "epoch": 0.7219355578488705, + "grad_norm": 0.3208516538143158, + "learning_rate": 0.0008881544496596907, + "loss": 1.3055, + "step": 1574 + }, + { + "epoch": 0.7223942208462333, + "grad_norm": 0.29803258180618286, + "learning_rate": 0.0008879982999506518, + "loss": 1.3234, + "step": 1575 + }, + { + "epoch": 0.7228528838435959, + "grad_norm": 0.2925770580768585, + "learning_rate": 0.000887842055062832, + "loss": 1.3806, + "step": 1576 + }, + { + "epoch": 0.7233115468409586, + "grad_norm": 0.3469628095626831, + "learning_rate": 0.0008876857150345598, + "loss": 1.9561, + "step": 1577 + }, + { + "epoch": 0.7237702098383213, + "grad_norm": 0.2605293393135071, + "learning_rate": 0.0008875292799041863, + "loss": 1.2745, + "step": 1578 + }, + { + "epoch": 0.7242288728356839, + "grad_norm": 0.16196803748607635, + "learning_rate": 0.0008873727497100862, + "loss": 0.758, + "step": 1579 + }, + { + "epoch": 0.7246875358330467, + "grad_norm": 0.35878610610961914, + "learning_rate": 0.0008872161244906576, + "loss": 1.5375, + "step": 1580 + }, + { + "epoch": 0.7251461988304093, + "grad_norm": 0.43217015266418457, + "learning_rate": 0.0008870594042843216, + "loss": 2.2036, + "step": 1581 + }, + { + "epoch": 0.7256048618277721, + "grad_norm": 0.42131683230400085, + "learning_rate": 0.0008869025891295228, + "loss": 0.9508, + "step": 1582 + }, + { + "epoch": 0.7260635248251347, + "grad_norm": 0.36816248297691345, + "learning_rate": 0.0008867456790647292, + "loss": 2.1968, + "step": 1583 + }, + { + "epoch": 0.7265221878224974, + "grad_norm": 0.30012619495391846, + "learning_rate": 0.0008865886741284321, + "loss": 1.7261, + "step": 1584 + }, + { + "epoch": 0.7269808508198601, + "grad_norm": 0.29164719581604004, + "learning_rate": 0.0008864315743591457, + "loss": 1.5405, + "step": 1585 + }, + { + "epoch": 0.7274395138172228, + "grad_norm": 0.3047489821910858, + "learning_rate": 0.0008862743797954078, + "loss": 1.7762, + "step": 1586 + }, + { + "epoch": 0.7278981768145855, + "grad_norm": 0.25507205724716187, + "learning_rate": 0.0008861170904757794, + "loss": 1.3858, + "step": 1587 + }, + { + "epoch": 0.7283568398119482, + "grad_norm": 0.21503114700317383, + "learning_rate": 0.0008859597064388445, + "loss": 1.0641, + "step": 1588 + }, + { + "epoch": 0.7288155028093108, + "grad_norm": 0.2100914567708969, + "learning_rate": 0.0008858022277232107, + "loss": 1.0745, + "step": 1589 + }, + { + "epoch": 0.7292741658066736, + "grad_norm": 0.20805799961090088, + "learning_rate": 0.0008856446543675088, + "loss": 1.007, + "step": 1590 + }, + { + "epoch": 0.7297328288040362, + "grad_norm": 0.34034040570259094, + "learning_rate": 0.0008854869864103925, + "loss": 1.625, + "step": 1591 + }, + { + "epoch": 0.730191491801399, + "grad_norm": 0.3298024535179138, + "learning_rate": 0.000885329223890539, + "loss": 1.6735, + "step": 1592 + }, + { + "epoch": 0.7306501547987616, + "grad_norm": 0.33766958117485046, + "learning_rate": 0.0008851713668466484, + "loss": 1.7477, + "step": 1593 + }, + { + "epoch": 0.7311088177961242, + "grad_norm": 0.27652397751808167, + "learning_rate": 0.0008850134153174443, + "loss": 0.8909, + "step": 1594 + }, + { + "epoch": 0.731567480793487, + "grad_norm": 0.40618276596069336, + "learning_rate": 0.0008848553693416734, + "loss": 1.8905, + "step": 1595 + }, + { + "epoch": 0.7320261437908496, + "grad_norm": 0.3916473090648651, + "learning_rate": 0.0008846972289581053, + "loss": 1.6833, + "step": 1596 + }, + { + "epoch": 0.7324848067882124, + "grad_norm": 0.3562237024307251, + "learning_rate": 0.0008845389942055333, + "loss": 1.2002, + "step": 1597 + }, + { + "epoch": 0.732943469785575, + "grad_norm": 0.2905832827091217, + "learning_rate": 0.0008843806651227733, + "loss": 1.2439, + "step": 1598 + }, + { + "epoch": 0.7334021327829378, + "grad_norm": 0.2873367369174957, + "learning_rate": 0.0008842222417486646, + "loss": 1.3354, + "step": 1599 + }, + { + "epoch": 0.7338607957803004, + "grad_norm": 0.26076123118400574, + "learning_rate": 0.0008840637241220696, + "loss": 1.2548, + "step": 1600 + }, + { + "epoch": 0.7343194587776631, + "grad_norm": 0.2180601954460144, + "learning_rate": 0.0008839051122818737, + "loss": 1.0288, + "step": 1601 + }, + { + "epoch": 0.7347781217750258, + "grad_norm": 0.3309728801250458, + "learning_rate": 0.000883746406266986, + "loss": 1.4196, + "step": 1602 + }, + { + "epoch": 0.7352367847723885, + "grad_norm": 0.403773695230484, + "learning_rate": 0.0008835876061163377, + "loss": 2.1072, + "step": 1603 + }, + { + "epoch": 0.7356954477697512, + "grad_norm": 0.2361578643321991, + "learning_rate": 0.0008834287118688837, + "loss": 1.1724, + "step": 1604 + }, + { + "epoch": 0.7361541107671139, + "grad_norm": 0.3139882981777191, + "learning_rate": 0.0008832697235636023, + "loss": 1.5528, + "step": 1605 + }, + { + "epoch": 0.7366127737644765, + "grad_norm": 0.32773545384407043, + "learning_rate": 0.0008831106412394938, + "loss": 1.7954, + "step": 1606 + }, + { + "epoch": 0.7370714367618393, + "grad_norm": 0.36932218074798584, + "learning_rate": 0.0008829514649355829, + "loss": 2.0931, + "step": 1607 + }, + { + "epoch": 0.7375300997592019, + "grad_norm": 0.3764024078845978, + "learning_rate": 0.0008827921946909164, + "loss": 1.7544, + "step": 1608 + }, + { + "epoch": 0.7379887627565647, + "grad_norm": 0.3229760527610779, + "learning_rate": 0.0008826328305445644, + "loss": 1.8318, + "step": 1609 + }, + { + "epoch": 0.7384474257539273, + "grad_norm": 0.26198244094848633, + "learning_rate": 0.0008824733725356202, + "loss": 1.2097, + "step": 1610 + }, + { + "epoch": 0.7389060887512899, + "grad_norm": 0.24832715094089508, + "learning_rate": 0.0008823138207031999, + "loss": 1.4565, + "step": 1611 + }, + { + "epoch": 0.7393647517486527, + "grad_norm": 0.29865747690200806, + "learning_rate": 0.0008821541750864428, + "loss": 1.2469, + "step": 1612 + }, + { + "epoch": 0.7398234147460153, + "grad_norm": 0.3033508062362671, + "learning_rate": 0.0008819944357245111, + "loss": 1.3326, + "step": 1613 + }, + { + "epoch": 0.7402820777433781, + "grad_norm": 0.2659357190132141, + "learning_rate": 0.0008818346026565897, + "loss": 1.2563, + "step": 1614 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 0.3529365360736847, + "learning_rate": 0.0008816746759218874, + "loss": 1.9199, + "step": 1615 + }, + { + "epoch": 0.7411994037381034, + "grad_norm": 0.20898482203483582, + "learning_rate": 0.0008815146555596351, + "loss": 0.8429, + "step": 1616 + }, + { + "epoch": 0.7416580667354661, + "grad_norm": 0.39400166273117065, + "learning_rate": 0.0008813545416090869, + "loss": 1.9886, + "step": 1617 + }, + { + "epoch": 0.7421167297328288, + "grad_norm": 0.2804252803325653, + "learning_rate": 0.0008811943341095199, + "loss": 1.4959, + "step": 1618 + }, + { + "epoch": 0.7425753927301915, + "grad_norm": 0.24878603219985962, + "learning_rate": 0.0008810340331002341, + "loss": 1.1033, + "step": 1619 + }, + { + "epoch": 0.7430340557275542, + "grad_norm": 0.3392964005470276, + "learning_rate": 0.0008808736386205527, + "loss": 1.5595, + "step": 1620 + }, + { + "epoch": 0.7434927187249168, + "grad_norm": 0.25439828634262085, + "learning_rate": 0.0008807131507098213, + "loss": 1.127, + "step": 1621 + }, + { + "epoch": 0.7439513817222796, + "grad_norm": 0.2508736252784729, + "learning_rate": 0.0008805525694074093, + "loss": 1.2366, + "step": 1622 + }, + { + "epoch": 0.7444100447196422, + "grad_norm": 0.21780788898468018, + "learning_rate": 0.0008803918947527079, + "loss": 1.1689, + "step": 1623 + }, + { + "epoch": 0.744868707717005, + "grad_norm": 0.22065167129039764, + "learning_rate": 0.000880231126785132, + "loss": 0.8861, + "step": 1624 + }, + { + "epoch": 0.7453273707143676, + "grad_norm": 0.3314412236213684, + "learning_rate": 0.000880070265544119, + "loss": 1.9618, + "step": 1625 + }, + { + "epoch": 0.7457860337117304, + "grad_norm": 0.2850133180618286, + "learning_rate": 0.0008799093110691294, + "loss": 1.5959, + "step": 1626 + }, + { + "epoch": 0.746244696709093, + "grad_norm": 0.3236753046512604, + "learning_rate": 0.0008797482633996466, + "loss": 1.6779, + "step": 1627 + }, + { + "epoch": 0.7467033597064556, + "grad_norm": 0.29706433415412903, + "learning_rate": 0.0008795871225751766, + "loss": 1.6093, + "step": 1628 + }, + { + "epoch": 0.7471620227038184, + "grad_norm": 0.3429460823535919, + "learning_rate": 0.0008794258886352485, + "loss": 1.5433, + "step": 1629 + }, + { + "epoch": 0.747620685701181, + "grad_norm": 0.32680225372314453, + "learning_rate": 0.0008792645616194141, + "loss": 1.8085, + "step": 1630 + }, + { + "epoch": 0.7480793486985438, + "grad_norm": 0.2640790641307831, + "learning_rate": 0.0008791031415672482, + "loss": 1.277, + "step": 1631 + }, + { + "epoch": 0.7485380116959064, + "grad_norm": 0.3999061584472656, + "learning_rate": 0.000878941628518348, + "loss": 1.9884, + "step": 1632 + }, + { + "epoch": 0.7489966746932691, + "grad_norm": 0.28930339217185974, + "learning_rate": 0.0008787800225123341, + "loss": 1.2634, + "step": 1633 + }, + { + "epoch": 0.7494553376906318, + "grad_norm": 0.25914284586906433, + "learning_rate": 0.0008786183235888497, + "loss": 1.2084, + "step": 1634 + }, + { + "epoch": 0.7499140006879945, + "grad_norm": 0.3261665403842926, + "learning_rate": 0.0008784565317875604, + "loss": 1.5663, + "step": 1635 + }, + { + "epoch": 0.7503726636853572, + "grad_norm": 0.23450294137001038, + "learning_rate": 0.000878294647148155, + "loss": 1.1509, + "step": 1636 + }, + { + "epoch": 0.7508313266827199, + "grad_norm": 0.2073303908109665, + "learning_rate": 0.000878132669710345, + "loss": 0.9553, + "step": 1637 + }, + { + "epoch": 0.7512899896800825, + "grad_norm": 0.18613280355930328, + "learning_rate": 0.0008779705995138647, + "loss": 0.8871, + "step": 1638 + }, + { + "epoch": 0.7517486526774453, + "grad_norm": 0.3788055181503296, + "learning_rate": 0.000877808436598471, + "loss": 1.5571, + "step": 1639 + }, + { + "epoch": 0.7522073156748079, + "grad_norm": 0.2849988639354706, + "learning_rate": 0.0008776461810039437, + "loss": 1.4867, + "step": 1640 + }, + { + "epoch": 0.7526659786721707, + "grad_norm": 0.3063529133796692, + "learning_rate": 0.0008774838327700852, + "loss": 1.3499, + "step": 1641 + }, + { + "epoch": 0.7531246416695333, + "grad_norm": 0.35424184799194336, + "learning_rate": 0.0008773213919367206, + "loss": 1.7198, + "step": 1642 + }, + { + "epoch": 0.753583304666896, + "grad_norm": 0.32375872135162354, + "learning_rate": 0.0008771588585436982, + "loss": 1.8474, + "step": 1643 + }, + { + "epoch": 0.7540419676642587, + "grad_norm": 0.3555268943309784, + "learning_rate": 0.0008769962326308882, + "loss": 1.7086, + "step": 1644 + }, + { + "epoch": 0.7545006306616213, + "grad_norm": 0.32062697410583496, + "learning_rate": 0.000876833514238184, + "loss": 1.4244, + "step": 1645 + }, + { + "epoch": 0.7549592936589841, + "grad_norm": 0.5851942896842957, + "learning_rate": 0.0008766707034055017, + "loss": 1.8285, + "step": 1646 + }, + { + "epoch": 0.7554179566563467, + "grad_norm": 0.39790624380111694, + "learning_rate": 0.0008765078001727799, + "loss": 1.5838, + "step": 1647 + }, + { + "epoch": 0.7558766196537094, + "grad_norm": 0.22544077038764954, + "learning_rate": 0.00087634480457998, + "loss": 1.0953, + "step": 1648 + }, + { + "epoch": 0.7563352826510721, + "grad_norm": 0.4409390985965729, + "learning_rate": 0.000876181716667086, + "loss": 2.1729, + "step": 1649 + }, + { + "epoch": 0.7567939456484348, + "grad_norm": 0.09505387395620346, + "learning_rate": 0.0008760185364741045, + "loss": 0.6027, + "step": 1650 + }, + { + "epoch": 0.7572526086457975, + "grad_norm": 0.18144452571868896, + "learning_rate": 0.0008758552640410647, + "loss": 0.7814, + "step": 1651 + }, + { + "epoch": 0.7577112716431602, + "grad_norm": 0.25609463453292847, + "learning_rate": 0.0008756918994080184, + "loss": 1.2465, + "step": 1652 + }, + { + "epoch": 0.7581699346405228, + "grad_norm": 0.3209281265735626, + "learning_rate": 0.0008755284426150405, + "loss": 1.327, + "step": 1653 + }, + { + "epoch": 0.7586285976378856, + "grad_norm": 0.28706008195877075, + "learning_rate": 0.0008753648937022278, + "loss": 1.5922, + "step": 1654 + }, + { + "epoch": 0.7590872606352482, + "grad_norm": 0.32143130898475647, + "learning_rate": 0.0008752012527097003, + "loss": 1.2776, + "step": 1655 + }, + { + "epoch": 0.759545923632611, + "grad_norm": 0.29344642162323, + "learning_rate": 0.0008750375196776002, + "loss": 1.5062, + "step": 1656 + }, + { + "epoch": 0.7600045866299736, + "grad_norm": 0.38162222504615784, + "learning_rate": 0.0008748736946460922, + "loss": 1.892, + "step": 1657 + }, + { + "epoch": 0.7604632496273364, + "grad_norm": 0.28159940242767334, + "learning_rate": 0.0008747097776553639, + "loss": 1.3286, + "step": 1658 + }, + { + "epoch": 0.760921912624699, + "grad_norm": 0.09235129505395889, + "learning_rate": 0.0008745457687456255, + "loss": 0.5844, + "step": 1659 + }, + { + "epoch": 0.7613805756220617, + "grad_norm": 0.3361958861351013, + "learning_rate": 0.0008743816679571094, + "loss": 1.7726, + "step": 1660 + }, + { + "epoch": 0.7618392386194244, + "grad_norm": 0.31122922897338867, + "learning_rate": 0.0008742174753300707, + "loss": 1.3808, + "step": 1661 + }, + { + "epoch": 0.762297901616787, + "grad_norm": 0.3450722396373749, + "learning_rate": 0.000874053190904787, + "loss": 1.8993, + "step": 1662 + }, + { + "epoch": 0.7627565646141498, + "grad_norm": 0.3466764986515045, + "learning_rate": 0.0008738888147215584, + "loss": 1.9905, + "step": 1663 + }, + { + "epoch": 0.7632152276115124, + "grad_norm": 0.13895724713802338, + "learning_rate": 0.0008737243468207079, + "loss": 0.6572, + "step": 1664 + }, + { + "epoch": 0.7636738906088751, + "grad_norm": 0.32592159509658813, + "learning_rate": 0.0008735597872425804, + "loss": 1.7026, + "step": 1665 + }, + { + "epoch": 0.7641325536062378, + "grad_norm": 0.2743968367576599, + "learning_rate": 0.0008733951360275434, + "loss": 1.1798, + "step": 1666 + }, + { + "epoch": 0.7645912166036005, + "grad_norm": 0.1800556182861328, + "learning_rate": 0.0008732303932159873, + "loss": 0.8318, + "step": 1667 + }, + { + "epoch": 0.7650498796009632, + "grad_norm": 0.35121503472328186, + "learning_rate": 0.0008730655588483247, + "loss": 1.9238, + "step": 1668 + }, + { + "epoch": 0.7655085425983259, + "grad_norm": 0.31581225991249084, + "learning_rate": 0.0008729006329649906, + "loss": 1.5878, + "step": 1669 + }, + { + "epoch": 0.7659672055956885, + "grad_norm": 0.34449562430381775, + "learning_rate": 0.0008727356156064424, + "loss": 1.348, + "step": 1670 + }, + { + "epoch": 0.7664258685930513, + "grad_norm": 0.2677781581878662, + "learning_rate": 0.0008725705068131599, + "loss": 1.2385, + "step": 1671 + }, + { + "epoch": 0.7668845315904139, + "grad_norm": 0.3324863314628601, + "learning_rate": 0.0008724053066256461, + "loss": 1.827, + "step": 1672 + }, + { + "epoch": 0.7673431945877767, + "grad_norm": 0.32999104261398315, + "learning_rate": 0.0008722400150844252, + "loss": 1.4756, + "step": 1673 + }, + { + "epoch": 0.7678018575851393, + "grad_norm": 0.15800195932388306, + "learning_rate": 0.0008720746322300447, + "loss": 0.7594, + "step": 1674 + }, + { + "epoch": 0.768260520582502, + "grad_norm": 0.3041391968727112, + "learning_rate": 0.0008719091581030741, + "loss": 1.4358, + "step": 1675 + }, + { + "epoch": 0.7687191835798647, + "grad_norm": 0.3658222258090973, + "learning_rate": 0.0008717435927441053, + "loss": 1.684, + "step": 1676 + }, + { + "epoch": 0.7691778465772274, + "grad_norm": 0.3466409146785736, + "learning_rate": 0.0008715779361937528, + "loss": 1.8845, + "step": 1677 + }, + { + "epoch": 0.7696365095745901, + "grad_norm": 0.32669052481651306, + "learning_rate": 0.0008714121884926536, + "loss": 1.6675, + "step": 1678 + }, + { + "epoch": 0.7700951725719528, + "grad_norm": 0.35387712717056274, + "learning_rate": 0.0008712463496814662, + "loss": 1.9104, + "step": 1679 + }, + { + "epoch": 0.7705538355693154, + "grad_norm": 0.3723506033420563, + "learning_rate": 0.0008710804198008727, + "loss": 1.8469, + "step": 1680 + }, + { + "epoch": 0.7710124985666781, + "grad_norm": 0.19466747343540192, + "learning_rate": 0.0008709143988915763, + "loss": 0.8387, + "step": 1681 + }, + { + "epoch": 0.7714711615640408, + "grad_norm": 0.33413955569267273, + "learning_rate": 0.0008707482869943035, + "loss": 1.6544, + "step": 1682 + }, + { + "epoch": 0.7719298245614035, + "grad_norm": 0.29037922620773315, + "learning_rate": 0.0008705820841498029, + "loss": 1.5114, + "step": 1683 + }, + { + "epoch": 0.7723884875587662, + "grad_norm": 0.3319530785083771, + "learning_rate": 0.0008704157903988448, + "loss": 1.61, + "step": 1684 + }, + { + "epoch": 0.7728471505561288, + "grad_norm": 0.37831956148147583, + "learning_rate": 0.0008702494057822224, + "loss": 1.8689, + "step": 1685 + }, + { + "epoch": 0.7733058135534916, + "grad_norm": 0.24735870957374573, + "learning_rate": 0.0008700829303407514, + "loss": 1.0308, + "step": 1686 + }, + { + "epoch": 0.7737644765508542, + "grad_norm": 0.09444022178649902, + "learning_rate": 0.000869916364115269, + "loss": 0.5718, + "step": 1687 + }, + { + "epoch": 0.774223139548217, + "grad_norm": 0.4078546166419983, + "learning_rate": 0.0008697497071466351, + "loss": 1.4986, + "step": 1688 + }, + { + "epoch": 0.7746818025455796, + "grad_norm": 0.3958517611026764, + "learning_rate": 0.0008695829594757323, + "loss": 1.8461, + "step": 1689 + }, + { + "epoch": 0.7751404655429424, + "grad_norm": 0.2555053234100342, + "learning_rate": 0.0008694161211434645, + "loss": 1.2473, + "step": 1690 + }, + { + "epoch": 0.775599128540305, + "grad_norm": 0.30251985788345337, + "learning_rate": 0.0008692491921907586, + "loss": 1.225, + "step": 1691 + }, + { + "epoch": 0.7760577915376677, + "grad_norm": 0.2819230258464813, + "learning_rate": 0.0008690821726585634, + "loss": 1.0078, + "step": 1692 + }, + { + "epoch": 0.7765164545350304, + "grad_norm": 0.32672372460365295, + "learning_rate": 0.0008689150625878501, + "loss": 1.4543, + "step": 1693 + }, + { + "epoch": 0.7769751175323931, + "grad_norm": 0.282443642616272, + "learning_rate": 0.000868747862019612, + "loss": 1.412, + "step": 1694 + }, + { + "epoch": 0.7774337805297558, + "grad_norm": 0.33812659978866577, + "learning_rate": 0.0008685805709948644, + "loss": 1.8073, + "step": 1695 + }, + { + "epoch": 0.7778924435271185, + "grad_norm": 0.29638344049453735, + "learning_rate": 0.0008684131895546453, + "loss": 1.5007, + "step": 1696 + }, + { + "epoch": 0.7783511065244811, + "grad_norm": 0.36412540078163147, + "learning_rate": 0.0008682457177400141, + "loss": 1.8148, + "step": 1697 + }, + { + "epoch": 0.7788097695218438, + "grad_norm": 0.1184324324131012, + "learning_rate": 0.0008680781555920533, + "loss": 0.6136, + "step": 1698 + }, + { + "epoch": 0.7792684325192065, + "grad_norm": 0.32119160890579224, + "learning_rate": 0.000867910503151867, + "loss": 1.8235, + "step": 1699 + }, + { + "epoch": 0.7797270955165692, + "grad_norm": 0.36955249309539795, + "learning_rate": 0.0008677427604605816, + "loss": 2.0277, + "step": 1700 + }, + { + "epoch": 0.7801857585139319, + "grad_norm": 0.3423328697681427, + "learning_rate": 0.0008675749275593454, + "loss": 1.6159, + "step": 1701 + }, + { + "epoch": 0.7806444215112945, + "grad_norm": 0.2848832905292511, + "learning_rate": 0.000867407004489329, + "loss": 1.5393, + "step": 1702 + }, + { + "epoch": 0.7811030845086573, + "grad_norm": 0.31208136677742004, + "learning_rate": 0.0008672389912917253, + "loss": 1.3461, + "step": 1703 + }, + { + "epoch": 0.7815617475060199, + "grad_norm": 0.35254040360450745, + "learning_rate": 0.0008670708880077492, + "loss": 1.6527, + "step": 1704 + }, + { + "epoch": 0.7820204105033827, + "grad_norm": 0.2813601493835449, + "learning_rate": 0.0008669026946786375, + "loss": 1.372, + "step": 1705 + }, + { + "epoch": 0.7824790735007453, + "grad_norm": 0.29800623655319214, + "learning_rate": 0.0008667344113456495, + "loss": 1.4514, + "step": 1706 + }, + { + "epoch": 0.782937736498108, + "grad_norm": 0.3098766505718231, + "learning_rate": 0.0008665660380500657, + "loss": 1.6478, + "step": 1707 + }, + { + "epoch": 0.7833963994954707, + "grad_norm": 0.4111390709877014, + "learning_rate": 0.00086639757483319, + "loss": 2.3545, + "step": 1708 + }, + { + "epoch": 0.7838550624928334, + "grad_norm": 0.2358768880367279, + "learning_rate": 0.0008662290217363474, + "loss": 0.8783, + "step": 1709 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.3664570152759552, + "learning_rate": 0.0008660603788008847, + "loss": 2.2107, + "step": 1710 + }, + { + "epoch": 0.7847723884875588, + "grad_norm": 0.22186599671840668, + "learning_rate": 0.0008658916460681721, + "loss": 1.1024, + "step": 1711 + }, + { + "epoch": 0.7852310514849214, + "grad_norm": 0.36577606201171875, + "learning_rate": 0.0008657228235796002, + "loss": 1.7358, + "step": 1712 + }, + { + "epoch": 0.7856897144822842, + "grad_norm": 0.2555186450481415, + "learning_rate": 0.0008655539113765828, + "loss": 0.8061, + "step": 1713 + }, + { + "epoch": 0.7861483774796468, + "grad_norm": 0.24842973053455353, + "learning_rate": 0.0008653849095005551, + "loss": 0.882, + "step": 1714 + }, + { + "epoch": 0.7866070404770095, + "grad_norm": 0.37208500504493713, + "learning_rate": 0.0008652158179929746, + "loss": 1.7593, + "step": 1715 + }, + { + "epoch": 0.7870657034743722, + "grad_norm": 0.3234862685203552, + "learning_rate": 0.0008650466368953206, + "loss": 1.5933, + "step": 1716 + }, + { + "epoch": 0.7875243664717348, + "grad_norm": 0.36344775557518005, + "learning_rate": 0.0008648773662490944, + "loss": 1.6957, + "step": 1717 + }, + { + "epoch": 0.7879830294690976, + "grad_norm": 0.29362058639526367, + "learning_rate": 0.0008647080060958194, + "loss": 1.246, + "step": 1718 + }, + { + "epoch": 0.7884416924664602, + "grad_norm": 0.48402926325798035, + "learning_rate": 0.0008645385564770409, + "loss": 1.8261, + "step": 1719 + }, + { + "epoch": 0.788900355463823, + "grad_norm": 0.36820700764656067, + "learning_rate": 0.0008643690174343258, + "loss": 1.7743, + "step": 1720 + }, + { + "epoch": 0.7893590184611856, + "grad_norm": 0.15446464717388153, + "learning_rate": 0.0008641993890092637, + "loss": 0.8386, + "step": 1721 + }, + { + "epoch": 0.7898176814585484, + "grad_norm": 0.3198365271091461, + "learning_rate": 0.0008640296712434654, + "loss": 1.6002, + "step": 1722 + }, + { + "epoch": 0.790276344455911, + "grad_norm": 0.15601693093776703, + "learning_rate": 0.000863859864178564, + "loss": 0.7734, + "step": 1723 + }, + { + "epoch": 0.7907350074532737, + "grad_norm": 0.38354170322418213, + "learning_rate": 0.0008636899678562143, + "loss": 1.7987, + "step": 1724 + }, + { + "epoch": 0.7911936704506364, + "grad_norm": 0.07455827295780182, + "learning_rate": 0.0008635199823180931, + "loss": 0.4332, + "step": 1725 + }, + { + "epoch": 0.7916523334479991, + "grad_norm": 0.3557904064655304, + "learning_rate": 0.0008633499076058992, + "loss": 1.8337, + "step": 1726 + }, + { + "epoch": 0.7921109964453618, + "grad_norm": 0.20361699163913727, + "learning_rate": 0.000863179743761353, + "loss": 0.9175, + "step": 1727 + }, + { + "epoch": 0.7925696594427245, + "grad_norm": 0.2946905493736267, + "learning_rate": 0.000863009490826197, + "loss": 1.3851, + "step": 1728 + }, + { + "epoch": 0.7930283224400871, + "grad_norm": 0.26087355613708496, + "learning_rate": 0.0008628391488421956, + "loss": 1.1733, + "step": 1729 + }, + { + "epoch": 0.7934869854374499, + "grad_norm": 0.33554574847221375, + "learning_rate": 0.0008626687178511346, + "loss": 1.6946, + "step": 1730 + }, + { + "epoch": 0.7939456484348125, + "grad_norm": 0.08757393062114716, + "learning_rate": 0.0008624981978948223, + "loss": 0.4806, + "step": 1731 + }, + { + "epoch": 0.7944043114321752, + "grad_norm": 0.3665269911289215, + "learning_rate": 0.0008623275890150882, + "loss": 1.4377, + "step": 1732 + }, + { + "epoch": 0.7948629744295379, + "grad_norm": 0.2906312346458435, + "learning_rate": 0.0008621568912537841, + "loss": 1.0789, + "step": 1733 + }, + { + "epoch": 0.7953216374269005, + "grad_norm": 0.28736838698387146, + "learning_rate": 0.0008619861046527832, + "loss": 0.922, + "step": 1734 + }, + { + "epoch": 0.7957803004242633, + "grad_norm": 0.37983497977256775, + "learning_rate": 0.0008618152292539807, + "loss": 1.8016, + "step": 1735 + }, + { + "epoch": 0.7962389634216259, + "grad_norm": 0.347023606300354, + "learning_rate": 0.0008616442650992937, + "loss": 1.9502, + "step": 1736 + }, + { + "epoch": 0.7966976264189887, + "grad_norm": 0.3550783097743988, + "learning_rate": 0.000861473212230661, + "loss": 1.7224, + "step": 1737 + }, + { + "epoch": 0.7971562894163513, + "grad_norm": 0.3077528774738312, + "learning_rate": 0.0008613020706900429, + "loss": 0.9033, + "step": 1738 + }, + { + "epoch": 0.797614952413714, + "grad_norm": 0.4047601819038391, + "learning_rate": 0.0008611308405194218, + "loss": 2.2925, + "step": 1739 + }, + { + "epoch": 0.7980736154110767, + "grad_norm": 0.4590601623058319, + "learning_rate": 0.0008609595217608015, + "loss": 1.0867, + "step": 1740 + }, + { + "epoch": 0.7985322784084394, + "grad_norm": 0.23011907935142517, + "learning_rate": 0.000860788114456208, + "loss": 0.8041, + "step": 1741 + }, + { + "epoch": 0.7989909414058021, + "grad_norm": 0.3282072842121124, + "learning_rate": 0.0008606166186476884, + "loss": 1.417, + "step": 1742 + }, + { + "epoch": 0.7994496044031648, + "grad_norm": 0.3150783181190491, + "learning_rate": 0.0008604450343773122, + "loss": 1.3964, + "step": 1743 + }, + { + "epoch": 0.7999082674005275, + "grad_norm": 0.268691748380661, + "learning_rate": 0.0008602733616871701, + "loss": 1.0648, + "step": 1744 + }, + { + "epoch": 0.8003669303978902, + "grad_norm": 0.31134241819381714, + "learning_rate": 0.0008601016006193747, + "loss": 0.8968, + "step": 1745 + }, + { + "epoch": 0.8008255933952528, + "grad_norm": 0.2897193729877472, + "learning_rate": 0.0008599297512160602, + "loss": 1.3438, + "step": 1746 + }, + { + "epoch": 0.8012842563926156, + "grad_norm": 0.3896466791629791, + "learning_rate": 0.0008597578135193826, + "loss": 1.8129, + "step": 1747 + }, + { + "epoch": 0.8017429193899782, + "grad_norm": 0.34135007858276367, + "learning_rate": 0.0008595857875715191, + "loss": 1.311, + "step": 1748 + }, + { + "epoch": 0.8022015823873409, + "grad_norm": 0.49784937500953674, + "learning_rate": 0.0008594136734146694, + "loss": 1.9587, + "step": 1749 + }, + { + "epoch": 0.8026602453847036, + "grad_norm": 0.3799792230129242, + "learning_rate": 0.0008592414710910541, + "loss": 1.6164, + "step": 1750 + }, + { + "epoch": 0.8031189083820662, + "grad_norm": 0.4176270067691803, + "learning_rate": 0.0008590691806429155, + "loss": 1.7939, + "step": 1751 + }, + { + "epoch": 0.803577571379429, + "grad_norm": 0.31819960474967957, + "learning_rate": 0.000858896802112518, + "loss": 1.2059, + "step": 1752 + }, + { + "epoch": 0.8040362343767916, + "grad_norm": 0.34135720133781433, + "learning_rate": 0.000858724335542147, + "loss": 1.4622, + "step": 1753 + }, + { + "epoch": 0.8044948973741544, + "grad_norm": 0.33655521273612976, + "learning_rate": 0.0008585517809741102, + "loss": 1.8534, + "step": 1754 + }, + { + "epoch": 0.804953560371517, + "grad_norm": 0.24837389588356018, + "learning_rate": 0.000858379138450736, + "loss": 1.2936, + "step": 1755 + }, + { + "epoch": 0.8054122233688797, + "grad_norm": 0.302501380443573, + "learning_rate": 0.0008582064080143752, + "loss": 1.5486, + "step": 1756 + }, + { + "epoch": 0.8058708863662424, + "grad_norm": 0.38520947098731995, + "learning_rate": 0.0008580335897073999, + "loss": 2.1995, + "step": 1757 + }, + { + "epoch": 0.8063295493636051, + "grad_norm": 0.05348968505859375, + "learning_rate": 0.0008578606835722032, + "loss": 0.3607, + "step": 1758 + }, + { + "epoch": 0.8067882123609678, + "grad_norm": 0.2687481939792633, + "learning_rate": 0.0008576876896512006, + "loss": 1.4349, + "step": 1759 + }, + { + "epoch": 0.8072468753583305, + "grad_norm": 0.2733592689037323, + "learning_rate": 0.0008575146079868287, + "loss": 1.4257, + "step": 1760 + }, + { + "epoch": 0.8077055383556931, + "grad_norm": 0.27336040139198303, + "learning_rate": 0.0008573414386215457, + "loss": 1.2039, + "step": 1761 + }, + { + "epoch": 0.8081642013530559, + "grad_norm": 0.27744030952453613, + "learning_rate": 0.000857168181597831, + "loss": 1.4286, + "step": 1762 + }, + { + "epoch": 0.8086228643504185, + "grad_norm": 0.2781740725040436, + "learning_rate": 0.0008569948369581864, + "loss": 1.3666, + "step": 1763 + }, + { + "epoch": 0.8090815273477813, + "grad_norm": 0.2565990686416626, + "learning_rate": 0.0008568214047451339, + "loss": 1.2767, + "step": 1764 + }, + { + "epoch": 0.8095401903451439, + "grad_norm": 0.34465324878692627, + "learning_rate": 0.000856647885001218, + "loss": 1.6884, + "step": 1765 + }, + { + "epoch": 0.8099988533425065, + "grad_norm": 0.3289906978607178, + "learning_rate": 0.0008564742777690047, + "loss": 1.5647, + "step": 1766 + }, + { + "epoch": 0.8104575163398693, + "grad_norm": 0.46371495723724365, + "learning_rate": 0.0008563005830910804, + "loss": 2.2827, + "step": 1767 + }, + { + "epoch": 0.8109161793372319, + "grad_norm": 0.3772294521331787, + "learning_rate": 0.0008561268010100541, + "loss": 1.7618, + "step": 1768 + }, + { + "epoch": 0.8113748423345947, + "grad_norm": 0.3338968753814697, + "learning_rate": 0.0008559529315685558, + "loss": 1.4556, + "step": 1769 + }, + { + "epoch": 0.8118335053319573, + "grad_norm": 0.39005744457244873, + "learning_rate": 0.0008557789748092369, + "loss": 1.8439, + "step": 1770 + }, + { + "epoch": 0.81229216832932, + "grad_norm": 0.2635294497013092, + "learning_rate": 0.00085560493077477, + "loss": 0.9517, + "step": 1771 + }, + { + "epoch": 0.8127508313266827, + "grad_norm": 0.3014441430568695, + "learning_rate": 0.0008554307995078495, + "loss": 1.2068, + "step": 1772 + }, + { + "epoch": 0.8132094943240454, + "grad_norm": 0.297055721282959, + "learning_rate": 0.0008552565810511912, + "loss": 1.3474, + "step": 1773 + }, + { + "epoch": 0.8136681573214081, + "grad_norm": 0.39429694414138794, + "learning_rate": 0.000855082275447532, + "loss": 1.3822, + "step": 1774 + }, + { + "epoch": 0.8141268203187708, + "grad_norm": 0.4196150302886963, + "learning_rate": 0.0008549078827396304, + "loss": 1.9164, + "step": 1775 + }, + { + "epoch": 0.8145854833161335, + "grad_norm": 0.33066174387931824, + "learning_rate": 0.0008547334029702661, + "loss": 1.3359, + "step": 1776 + }, + { + "epoch": 0.8150441463134962, + "grad_norm": 0.5004546642303467, + "learning_rate": 0.0008545588361822403, + "loss": 1.6743, + "step": 1777 + }, + { + "epoch": 0.8155028093108588, + "grad_norm": 0.3157084882259369, + "learning_rate": 0.0008543841824183756, + "loss": 1.2809, + "step": 1778 + }, + { + "epoch": 0.8159614723082216, + "grad_norm": 0.4495697021484375, + "learning_rate": 0.0008542094417215156, + "loss": 2.0178, + "step": 1779 + }, + { + "epoch": 0.8164201353055842, + "grad_norm": 0.40004342794418335, + "learning_rate": 0.0008540346141345254, + "loss": 1.8539, + "step": 1780 + }, + { + "epoch": 0.816878798302947, + "grad_norm": 0.3421812355518341, + "learning_rate": 0.0008538596997002918, + "loss": 1.2512, + "step": 1781 + }, + { + "epoch": 0.8173374613003096, + "grad_norm": 0.28738144040107727, + "learning_rate": 0.0008536846984617224, + "loss": 1.2953, + "step": 1782 + }, + { + "epoch": 0.8177961242976722, + "grad_norm": 0.40851840376853943, + "learning_rate": 0.0008535096104617464, + "loss": 2.1592, + "step": 1783 + }, + { + "epoch": 0.818254787295035, + "grad_norm": 0.2928701937198639, + "learning_rate": 0.0008533344357433138, + "loss": 1.3396, + "step": 1784 + }, + { + "epoch": 0.8187134502923976, + "grad_norm": 0.08765708655118942, + "learning_rate": 0.0008531591743493966, + "loss": 0.4814, + "step": 1785 + }, + { + "epoch": 0.8191721132897604, + "grad_norm": 0.3306613564491272, + "learning_rate": 0.0008529838263229874, + "loss": 1.3589, + "step": 1786 + }, + { + "epoch": 0.819630776287123, + "grad_norm": 0.29500478506088257, + "learning_rate": 0.0008528083917071006, + "loss": 1.3019, + "step": 1787 + }, + { + "epoch": 0.8200894392844857, + "grad_norm": 0.1884388029575348, + "learning_rate": 0.0008526328705447712, + "loss": 0.7717, + "step": 1788 + }, + { + "epoch": 0.8205481022818484, + "grad_norm": 0.31622594594955444, + "learning_rate": 0.0008524572628790562, + "loss": 1.669, + "step": 1789 + }, + { + "epoch": 0.8210067652792111, + "grad_norm": 0.4221099615097046, + "learning_rate": 0.0008522815687530333, + "loss": 1.6276, + "step": 1790 + }, + { + "epoch": 0.8214654282765738, + "grad_norm": 0.3094578981399536, + "learning_rate": 0.0008521057882098015, + "loss": 1.3534, + "step": 1791 + }, + { + "epoch": 0.8219240912739365, + "grad_norm": 0.2990223169326782, + "learning_rate": 0.000851929921292481, + "loss": 1.2487, + "step": 1792 + }, + { + "epoch": 0.8223827542712991, + "grad_norm": 1.4571868181228638, + "learning_rate": 0.0008517539680442133, + "loss": 1.2966, + "step": 1793 + }, + { + "epoch": 0.8228414172686619, + "grad_norm": 0.22885964810848236, + "learning_rate": 0.0008515779285081608, + "loss": 0.8865, + "step": 1794 + }, + { + "epoch": 0.8233000802660245, + "grad_norm": 0.3707837164402008, + "learning_rate": 0.0008514018027275074, + "loss": 1.4297, + "step": 1795 + }, + { + "epoch": 0.8237587432633873, + "grad_norm": 0.34915557503700256, + "learning_rate": 0.0008512255907454584, + "loss": 1.7339, + "step": 1796 + }, + { + "epoch": 0.8242174062607499, + "grad_norm": 0.2898045480251312, + "learning_rate": 0.0008510492926052393, + "loss": 1.4886, + "step": 1797 + }, + { + "epoch": 0.8246760692581127, + "grad_norm": 0.3200799822807312, + "learning_rate": 0.0008508729083500974, + "loss": 1.5057, + "step": 1798 + }, + { + "epoch": 0.8251347322554753, + "grad_norm": 0.23627309501171112, + "learning_rate": 0.0008506964380233014, + "loss": 1.0652, + "step": 1799 + }, + { + "epoch": 0.8255933952528379, + "grad_norm": 0.19365330040454865, + "learning_rate": 0.0008505198816681403, + "loss": 0.8976, + "step": 1800 + }, + { + "epoch": 0.8260520582502007, + "grad_norm": 0.30963438749313354, + "learning_rate": 0.0008503432393279251, + "loss": 1.4713, + "step": 1801 + }, + { + "epoch": 0.8265107212475633, + "grad_norm": 0.30070507526397705, + "learning_rate": 0.000850166511045987, + "loss": 1.3842, + "step": 1802 + }, + { + "epoch": 0.826969384244926, + "grad_norm": 0.23169678449630737, + "learning_rate": 0.0008499896968656789, + "loss": 1.0347, + "step": 1803 + }, + { + "epoch": 0.8274280472422887, + "grad_norm": 0.27118271589279175, + "learning_rate": 0.0008498127968303747, + "loss": 1.2278, + "step": 1804 + }, + { + "epoch": 0.8278867102396514, + "grad_norm": 0.28942152857780457, + "learning_rate": 0.0008496358109834691, + "loss": 1.2407, + "step": 1805 + }, + { + "epoch": 0.8283453732370141, + "grad_norm": 0.34731897711753845, + "learning_rate": 0.000849458739368378, + "loss": 1.7978, + "step": 1806 + }, + { + "epoch": 0.8288040362343768, + "grad_norm": 0.33974307775497437, + "learning_rate": 0.0008492815820285384, + "loss": 1.8225, + "step": 1807 + }, + { + "epoch": 0.8292626992317395, + "grad_norm": 0.36200281977653503, + "learning_rate": 0.0008491043390074085, + "loss": 1.8848, + "step": 1808 + }, + { + "epoch": 0.8297213622291022, + "grad_norm": 0.3786148130893707, + "learning_rate": 0.0008489270103484668, + "loss": 1.8229, + "step": 1809 + }, + { + "epoch": 0.8301800252264648, + "grad_norm": 0.3229668140411377, + "learning_rate": 0.0008487495960952139, + "loss": 1.4528, + "step": 1810 + }, + { + "epoch": 0.8306386882238276, + "grad_norm": 0.34114035964012146, + "learning_rate": 0.00084857209629117, + "loss": 1.3704, + "step": 1811 + }, + { + "epoch": 0.8310973512211902, + "grad_norm": 0.22234633564949036, + "learning_rate": 0.0008483945109798778, + "loss": 0.8441, + "step": 1812 + }, + { + "epoch": 0.831556014218553, + "grad_norm": 0.2830922305583954, + "learning_rate": 0.0008482168402049001, + "loss": 1.3244, + "step": 1813 + }, + { + "epoch": 0.8320146772159156, + "grad_norm": 0.30978924036026, + "learning_rate": 0.0008480390840098207, + "loss": 1.2952, + "step": 1814 + }, + { + "epoch": 0.8324733402132783, + "grad_norm": 0.3014751076698303, + "learning_rate": 0.0008478612424382444, + "loss": 1.5973, + "step": 1815 + }, + { + "epoch": 0.832932003210641, + "grad_norm": 0.1848728209733963, + "learning_rate": 0.0008476833155337971, + "loss": 0.7438, + "step": 1816 + }, + { + "epoch": 0.8333906662080036, + "grad_norm": 0.18665684759616852, + "learning_rate": 0.0008475053033401256, + "loss": 0.854, + "step": 1817 + }, + { + "epoch": 0.8338493292053664, + "grad_norm": 0.36908119916915894, + "learning_rate": 0.0008473272059008976, + "loss": 1.7524, + "step": 1818 + }, + { + "epoch": 0.834307992202729, + "grad_norm": 0.2789961099624634, + "learning_rate": 0.0008471490232598016, + "loss": 1.1132, + "step": 1819 + }, + { + "epoch": 0.8347666552000917, + "grad_norm": 0.3717726171016693, + "learning_rate": 0.000846970755460547, + "loss": 1.8761, + "step": 1820 + }, + { + "epoch": 0.8352253181974544, + "grad_norm": 0.38276535272598267, + "learning_rate": 0.0008467924025468645, + "loss": 2.0549, + "step": 1821 + }, + { + "epoch": 0.8356839811948171, + "grad_norm": 0.3366381824016571, + "learning_rate": 0.000846613964562505, + "loss": 1.5439, + "step": 1822 + }, + { + "epoch": 0.8361426441921798, + "grad_norm": 0.21815823018550873, + "learning_rate": 0.0008464354415512409, + "loss": 1.1877, + "step": 1823 + }, + { + "epoch": 0.8366013071895425, + "grad_norm": 0.3503132462501526, + "learning_rate": 0.000846256833556865, + "loss": 1.9175, + "step": 1824 + }, + { + "epoch": 0.8370599701869051, + "grad_norm": 0.2623533010482788, + "learning_rate": 0.0008460781406231913, + "loss": 1.3655, + "step": 1825 + }, + { + "epoch": 0.8375186331842679, + "grad_norm": 0.3180966377258301, + "learning_rate": 0.0008458993627940541, + "loss": 1.6582, + "step": 1826 + }, + { + "epoch": 0.8379772961816305, + "grad_norm": 0.19976475834846497, + "learning_rate": 0.0008457205001133093, + "loss": 0.8719, + "step": 1827 + }, + { + "epoch": 0.8384359591789933, + "grad_norm": 0.2790711224079132, + "learning_rate": 0.000845541552624833, + "loss": 1.736, + "step": 1828 + }, + { + "epoch": 0.8388946221763559, + "grad_norm": 0.25180596113204956, + "learning_rate": 0.0008453625203725224, + "loss": 1.3915, + "step": 1829 + }, + { + "epoch": 0.8393532851737187, + "grad_norm": 0.35062751173973083, + "learning_rate": 0.0008451834034002954, + "loss": 1.631, + "step": 1830 + }, + { + "epoch": 0.8398119481710813, + "grad_norm": 0.20606881380081177, + "learning_rate": 0.0008450042017520905, + "loss": 0.9485, + "step": 1831 + }, + { + "epoch": 0.840270611168444, + "grad_norm": 0.34081435203552246, + "learning_rate": 0.0008448249154718675, + "loss": 2.0344, + "step": 1832 + }, + { + "epoch": 0.8407292741658067, + "grad_norm": 0.2597459852695465, + "learning_rate": 0.0008446455446036063, + "loss": 1.5004, + "step": 1833 + }, + { + "epoch": 0.8411879371631693, + "grad_norm": 0.19320417940616608, + "learning_rate": 0.0008444660891913079, + "loss": 0.9119, + "step": 1834 + }, + { + "epoch": 0.841646600160532, + "grad_norm": 0.3378828763961792, + "learning_rate": 0.0008442865492789943, + "loss": 1.5369, + "step": 1835 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.2489069253206253, + "learning_rate": 0.0008441069249107076, + "loss": 1.3823, + "step": 1836 + }, + { + "epoch": 0.8425639261552574, + "grad_norm": 0.299431174993515, + "learning_rate": 0.000843927216130511, + "loss": 1.6409, + "step": 1837 + }, + { + "epoch": 0.8430225891526201, + "grad_norm": 0.17401325702667236, + "learning_rate": 0.0008437474229824886, + "loss": 0.735, + "step": 1838 + }, + { + "epoch": 0.8434812521499828, + "grad_norm": 0.267111212015152, + "learning_rate": 0.0008435675455107448, + "loss": 1.342, + "step": 1839 + }, + { + "epoch": 0.8439399151473455, + "grad_norm": 0.17649328708648682, + "learning_rate": 0.0008433875837594048, + "loss": 0.9951, + "step": 1840 + }, + { + "epoch": 0.8443985781447082, + "grad_norm": 0.24475933611392975, + "learning_rate": 0.0008432075377726144, + "loss": 1.2884, + "step": 1841 + }, + { + "epoch": 0.8448572411420708, + "grad_norm": 0.2003568857908249, + "learning_rate": 0.0008430274075945405, + "loss": 1.0649, + "step": 1842 + }, + { + "epoch": 0.8453159041394336, + "grad_norm": 0.26941803097724915, + "learning_rate": 0.0008428471932693701, + "loss": 1.4079, + "step": 1843 + }, + { + "epoch": 0.8457745671367962, + "grad_norm": 0.2373347133398056, + "learning_rate": 0.0008426668948413111, + "loss": 1.1529, + "step": 1844 + }, + { + "epoch": 0.846233230134159, + "grad_norm": 0.2616652846336365, + "learning_rate": 0.0008424865123545921, + "loss": 1.233, + "step": 1845 + }, + { + "epoch": 0.8466918931315216, + "grad_norm": 0.28413131833076477, + "learning_rate": 0.0008423060458534621, + "loss": 1.3795, + "step": 1846 + }, + { + "epoch": 0.8471505561288843, + "grad_norm": 0.2537064552307129, + "learning_rate": 0.000842125495382191, + "loss": 1.3186, + "step": 1847 + }, + { + "epoch": 0.847609219126247, + "grad_norm": 0.21368277072906494, + "learning_rate": 0.0008419448609850689, + "loss": 0.9202, + "step": 1848 + }, + { + "epoch": 0.8480678821236097, + "grad_norm": 0.2566043734550476, + "learning_rate": 0.000841764142706407, + "loss": 1.2372, + "step": 1849 + }, + { + "epoch": 0.8485265451209724, + "grad_norm": 0.34998470544815063, + "learning_rate": 0.0008415833405905366, + "loss": 1.5085, + "step": 1850 + }, + { + "epoch": 0.848985208118335, + "grad_norm": 0.13233597576618195, + "learning_rate": 0.0008414024546818098, + "loss": 0.6812, + "step": 1851 + }, + { + "epoch": 0.8494438711156977, + "grad_norm": 0.21131940186023712, + "learning_rate": 0.0008412214850245991, + "loss": 1.2487, + "step": 1852 + }, + { + "epoch": 0.8499025341130604, + "grad_norm": 0.35627928376197815, + "learning_rate": 0.000841040431663298, + "loss": 1.9961, + "step": 1853 + }, + { + "epoch": 0.8503611971104231, + "grad_norm": 0.37637192010879517, + "learning_rate": 0.0008408592946423198, + "loss": 2.4197, + "step": 1854 + }, + { + "epoch": 0.8508198601077858, + "grad_norm": 0.12372195720672607, + "learning_rate": 0.0008406780740060991, + "loss": 0.7192, + "step": 1855 + }, + { + "epoch": 0.8512785231051485, + "grad_norm": 0.24412862956523895, + "learning_rate": 0.0008404967697990902, + "loss": 1.1282, + "step": 1856 + }, + { + "epoch": 0.8517371861025111, + "grad_norm": 0.2743844985961914, + "learning_rate": 0.0008403153820657688, + "loss": 1.4214, + "step": 1857 + }, + { + "epoch": 0.8521958490998739, + "grad_norm": 0.3629717230796814, + "learning_rate": 0.0008401339108506302, + "loss": 1.7277, + "step": 1858 + }, + { + "epoch": 0.8526545120972365, + "grad_norm": 0.3425946533679962, + "learning_rate": 0.0008399523561981908, + "loss": 1.9844, + "step": 1859 + }, + { + "epoch": 0.8531131750945993, + "grad_norm": 0.284390926361084, + "learning_rate": 0.0008397707181529873, + "loss": 1.2471, + "step": 1860 + }, + { + "epoch": 0.8535718380919619, + "grad_norm": 0.2141527682542801, + "learning_rate": 0.0008395889967595766, + "loss": 0.8869, + "step": 1861 + }, + { + "epoch": 0.8540305010893247, + "grad_norm": 0.29586008191108704, + "learning_rate": 0.0008394071920625366, + "loss": 1.6176, + "step": 1862 + }, + { + "epoch": 0.8544891640866873, + "grad_norm": 0.25857093930244446, + "learning_rate": 0.0008392253041064652, + "loss": 1.2783, + "step": 1863 + }, + { + "epoch": 0.85494782708405, + "grad_norm": 0.33708396553993225, + "learning_rate": 0.0008390433329359806, + "loss": 1.4294, + "step": 1864 + }, + { + "epoch": 0.8554064900814127, + "grad_norm": 0.18086546659469604, + "learning_rate": 0.0008388612785957219, + "loss": 0.9876, + "step": 1865 + }, + { + "epoch": 0.8558651530787754, + "grad_norm": 0.2990560531616211, + "learning_rate": 0.0008386791411303482, + "loss": 1.2799, + "step": 1866 + }, + { + "epoch": 0.8563238160761381, + "grad_norm": 0.3266335129737854, + "learning_rate": 0.0008384969205845391, + "loss": 1.6877, + "step": 1867 + }, + { + "epoch": 0.8567824790735007, + "grad_norm": 0.21711353957653046, + "learning_rate": 0.0008383146170029949, + "loss": 0.8325, + "step": 1868 + }, + { + "epoch": 0.8572411420708634, + "grad_norm": 0.33946093916893005, + "learning_rate": 0.0008381322304304356, + "loss": 1.637, + "step": 1869 + }, + { + "epoch": 0.8576998050682261, + "grad_norm": 0.35655567049980164, + "learning_rate": 0.0008379497609116021, + "loss": 1.988, + "step": 1870 + }, + { + "epoch": 0.8581584680655888, + "grad_norm": 0.29446879029273987, + "learning_rate": 0.0008377672084912556, + "loss": 1.4559, + "step": 1871 + }, + { + "epoch": 0.8586171310629515, + "grad_norm": 0.30917033553123474, + "learning_rate": 0.0008375845732141773, + "loss": 1.2666, + "step": 1872 + }, + { + "epoch": 0.8590757940603142, + "grad_norm": 0.2967301905155182, + "learning_rate": 0.0008374018551251693, + "loss": 1.3149, + "step": 1873 + }, + { + "epoch": 0.8595344570576768, + "grad_norm": 0.37791574001312256, + "learning_rate": 0.0008372190542690534, + "loss": 1.9565, + "step": 1874 + }, + { + "epoch": 0.8599931200550396, + "grad_norm": 0.4147866368293762, + "learning_rate": 0.0008370361706906719, + "loss": 2.095, + "step": 1875 + }, + { + "epoch": 0.8604517830524022, + "grad_norm": 0.31991320848464966, + "learning_rate": 0.0008368532044348876, + "loss": 1.0417, + "step": 1876 + }, + { + "epoch": 0.860910446049765, + "grad_norm": 0.3214443624019623, + "learning_rate": 0.0008366701555465834, + "loss": 1.3086, + "step": 1877 + }, + { + "epoch": 0.8613691090471276, + "grad_norm": 0.32853764295578003, + "learning_rate": 0.0008364870240706626, + "loss": 1.4961, + "step": 1878 + }, + { + "epoch": 0.8618277720444903, + "grad_norm": 0.3698137700557709, + "learning_rate": 0.0008363038100520485, + "loss": 1.6169, + "step": 1879 + }, + { + "epoch": 0.862286435041853, + "grad_norm": 0.3349319100379944, + "learning_rate": 0.000836120513535685, + "loss": 1.6381, + "step": 1880 + }, + { + "epoch": 0.8627450980392157, + "grad_norm": 0.354120135307312, + "learning_rate": 0.0008359371345665359, + "loss": 1.7372, + "step": 1881 + }, + { + "epoch": 0.8632037610365784, + "grad_norm": 0.31368064880371094, + "learning_rate": 0.0008357536731895855, + "loss": 1.7236, + "step": 1882 + }, + { + "epoch": 0.8636624240339411, + "grad_norm": 0.33393827080726624, + "learning_rate": 0.0008355701294498381, + "loss": 1.7104, + "step": 1883 + }, + { + "epoch": 0.8641210870313037, + "grad_norm": 0.26583972573280334, + "learning_rate": 0.0008353865033923185, + "loss": 1.2689, + "step": 1884 + }, + { + "epoch": 0.8645797500286664, + "grad_norm": 0.33901429176330566, + "learning_rate": 0.0008352027950620714, + "loss": 1.652, + "step": 1885 + }, + { + "epoch": 0.8650384130260291, + "grad_norm": 0.371894508600235, + "learning_rate": 0.0008350190045041615, + "loss": 1.7666, + "step": 1886 + }, + { + "epoch": 0.8654970760233918, + "grad_norm": 0.19485050439834595, + "learning_rate": 0.0008348351317636742, + "loss": 0.8739, + "step": 1887 + }, + { + "epoch": 0.8659557390207545, + "grad_norm": 0.308272123336792, + "learning_rate": 0.000834651176885715, + "loss": 1.4182, + "step": 1888 + }, + { + "epoch": 0.8664144020181171, + "grad_norm": 0.3224412500858307, + "learning_rate": 0.000834467139915409, + "loss": 1.9642, + "step": 1889 + }, + { + "epoch": 0.8668730650154799, + "grad_norm": 0.24551476538181305, + "learning_rate": 0.000834283020897902, + "loss": 1.1545, + "step": 1890 + }, + { + "epoch": 0.8673317280128425, + "grad_norm": 0.3001308739185333, + "learning_rate": 0.0008340988198783597, + "loss": 1.6565, + "step": 1891 + }, + { + "epoch": 0.8677903910102053, + "grad_norm": 0.36925092339515686, + "learning_rate": 0.0008339145369019678, + "loss": 2.0125, + "step": 1892 + }, + { + "epoch": 0.8682490540075679, + "grad_norm": 0.3273385465145111, + "learning_rate": 0.0008337301720139323, + "loss": 1.3712, + "step": 1893 + }, + { + "epoch": 0.8687077170049307, + "grad_norm": 0.3309119939804077, + "learning_rate": 0.0008335457252594795, + "loss": 1.6698, + "step": 1894 + }, + { + "epoch": 0.8691663800022933, + "grad_norm": 0.23802553117275238, + "learning_rate": 0.0008333611966838552, + "loss": 0.9141, + "step": 1895 + }, + { + "epoch": 0.869625042999656, + "grad_norm": 0.26225998997688293, + "learning_rate": 0.0008331765863323255, + "loss": 1.04, + "step": 1896 + }, + { + "epoch": 0.8700837059970187, + "grad_norm": 0.33123326301574707, + "learning_rate": 0.0008329918942501772, + "loss": 1.7666, + "step": 1897 + }, + { + "epoch": 0.8705423689943814, + "grad_norm": 0.238576740026474, + "learning_rate": 0.000832807120482716, + "loss": 1.0255, + "step": 1898 + }, + { + "epoch": 0.8710010319917441, + "grad_norm": 0.3323553204536438, + "learning_rate": 0.0008326222650752686, + "loss": 1.4065, + "step": 1899 + }, + { + "epoch": 0.8714596949891068, + "grad_norm": 0.18547557294368744, + "learning_rate": 0.0008324373280731811, + "loss": 0.8987, + "step": 1900 + }, + { + "epoch": 0.8719183579864694, + "grad_norm": 0.3150063455104828, + "learning_rate": 0.0008322523095218202, + "loss": 1.2314, + "step": 1901 + }, + { + "epoch": 0.8723770209838321, + "grad_norm": 0.3147270977497101, + "learning_rate": 0.0008320672094665722, + "loss": 1.2318, + "step": 1902 + }, + { + "epoch": 0.8728356839811948, + "grad_norm": 0.4290883541107178, + "learning_rate": 0.0008318820279528432, + "loss": 2.0358, + "step": 1903 + }, + { + "epoch": 0.8732943469785575, + "grad_norm": 0.4178559482097626, + "learning_rate": 0.0008316967650260596, + "loss": 2.2466, + "step": 1904 + }, + { + "epoch": 0.8737530099759202, + "grad_norm": 0.18568667769432068, + "learning_rate": 0.0008315114207316682, + "loss": 0.9221, + "step": 1905 + }, + { + "epoch": 0.8742116729732828, + "grad_norm": 0.14113818109035492, + "learning_rate": 0.0008313259951151349, + "loss": 0.7635, + "step": 1906 + }, + { + "epoch": 0.8746703359706456, + "grad_norm": 0.3099023103713989, + "learning_rate": 0.0008311404882219458, + "loss": 1.3216, + "step": 1907 + }, + { + "epoch": 0.8751289989680082, + "grad_norm": 0.2004273533821106, + "learning_rate": 0.0008309549000976075, + "loss": 1.1125, + "step": 1908 + }, + { + "epoch": 0.875587661965371, + "grad_norm": 0.2782037556171417, + "learning_rate": 0.0008307692307876458, + "loss": 1.4631, + "step": 1909 + }, + { + "epoch": 0.8760463249627336, + "grad_norm": 0.29525527358055115, + "learning_rate": 0.0008305834803376069, + "loss": 1.3377, + "step": 1910 + }, + { + "epoch": 0.8765049879600963, + "grad_norm": 0.28939589858055115, + "learning_rate": 0.0008303976487930567, + "loss": 1.2637, + "step": 1911 + }, + { + "epoch": 0.876963650957459, + "grad_norm": 0.24675489962100983, + "learning_rate": 0.0008302117361995808, + "loss": 1.3734, + "step": 1912 + }, + { + "epoch": 0.8774223139548217, + "grad_norm": 0.2444259226322174, + "learning_rate": 0.0008300257426027851, + "loss": 1.0965, + "step": 1913 + }, + { + "epoch": 0.8778809769521844, + "grad_norm": 0.17775723338127136, + "learning_rate": 0.0008298396680482951, + "loss": 0.9796, + "step": 1914 + }, + { + "epoch": 0.8783396399495471, + "grad_norm": 0.24205970764160156, + "learning_rate": 0.0008296535125817564, + "loss": 1.2806, + "step": 1915 + }, + { + "epoch": 0.8787983029469097, + "grad_norm": 0.47772538661956787, + "learning_rate": 0.000829467276248834, + "loss": 1.7538, + "step": 1916 + }, + { + "epoch": 0.8792569659442725, + "grad_norm": 0.30592289566993713, + "learning_rate": 0.0008292809590952132, + "loss": 1.6431, + "step": 1917 + }, + { + "epoch": 0.8797156289416351, + "grad_norm": 0.2858329713344574, + "learning_rate": 0.000829094561166599, + "loss": 1.5801, + "step": 1918 + }, + { + "epoch": 0.8801742919389978, + "grad_norm": 0.28482332825660706, + "learning_rate": 0.0008289080825087158, + "loss": 1.093, + "step": 1919 + }, + { + "epoch": 0.8806329549363605, + "grad_norm": 0.35098031163215637, + "learning_rate": 0.0008287215231673087, + "loss": 1.6113, + "step": 1920 + }, + { + "epoch": 0.8810916179337231, + "grad_norm": 0.15006844699382782, + "learning_rate": 0.0008285348831881417, + "loss": 0.6972, + "step": 1921 + }, + { + "epoch": 0.8815502809310859, + "grad_norm": 0.393998384475708, + "learning_rate": 0.0008283481626169989, + "loss": 1.7005, + "step": 1922 + }, + { + "epoch": 0.8820089439284485, + "grad_norm": 0.2868679165840149, + "learning_rate": 0.0008281613614996842, + "loss": 0.8066, + "step": 1923 + }, + { + "epoch": 0.8824676069258113, + "grad_norm": 0.20780214667320251, + "learning_rate": 0.0008279744798820216, + "loss": 0.973, + "step": 1924 + }, + { + "epoch": 0.8829262699231739, + "grad_norm": 0.34180140495300293, + "learning_rate": 0.0008277875178098541, + "loss": 1.4466, + "step": 1925 + }, + { + "epoch": 0.8833849329205367, + "grad_norm": 0.229378804564476, + "learning_rate": 0.0008276004753290451, + "loss": 1.1042, + "step": 1926 + }, + { + "epoch": 0.8838435959178993, + "grad_norm": 0.27841222286224365, + "learning_rate": 0.0008274133524854773, + "loss": 1.3994, + "step": 1927 + }, + { + "epoch": 0.884302258915262, + "grad_norm": 0.21060162782669067, + "learning_rate": 0.0008272261493250533, + "loss": 1.0967, + "step": 1928 + }, + { + "epoch": 0.8847609219126247, + "grad_norm": 0.37685856223106384, + "learning_rate": 0.0008270388658936956, + "loss": 1.8954, + "step": 1929 + }, + { + "epoch": 0.8852195849099874, + "grad_norm": 0.44438236951828003, + "learning_rate": 0.000826851502237346, + "loss": 2.2446, + "step": 1930 + }, + { + "epoch": 0.8856782479073501, + "grad_norm": 0.4858162999153137, + "learning_rate": 0.0008266640584019662, + "loss": 2.2056, + "step": 1931 + }, + { + "epoch": 0.8861369109047128, + "grad_norm": 0.2732187509536743, + "learning_rate": 0.0008264765344335373, + "loss": 1.4948, + "step": 1932 + }, + { + "epoch": 0.8865955739020754, + "grad_norm": 0.2720052897930145, + "learning_rate": 0.0008262889303780607, + "loss": 1.3649, + "step": 1933 + }, + { + "epoch": 0.8870542368994382, + "grad_norm": 0.250740110874176, + "learning_rate": 0.0008261012462815567, + "loss": 1.2161, + "step": 1934 + }, + { + "epoch": 0.8875128998968008, + "grad_norm": 0.20551849901676178, + "learning_rate": 0.0008259134821900658, + "loss": 1.0547, + "step": 1935 + }, + { + "epoch": 0.8879715628941636, + "grad_norm": 0.23970794677734375, + "learning_rate": 0.0008257256381496476, + "loss": 1.1497, + "step": 1936 + }, + { + "epoch": 0.8884302258915262, + "grad_norm": 0.357015997171402, + "learning_rate": 0.0008255377142063819, + "loss": 1.4954, + "step": 1937 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.3159123659133911, + "learning_rate": 0.0008253497104063676, + "loss": 1.3619, + "step": 1938 + }, + { + "epoch": 0.8893475518862516, + "grad_norm": 0.2619781196117401, + "learning_rate": 0.0008251616267957234, + "loss": 1.0236, + "step": 1939 + }, + { + "epoch": 0.8898062148836142, + "grad_norm": 0.2195945680141449, + "learning_rate": 0.0008249734634205876, + "loss": 0.9583, + "step": 1940 + }, + { + "epoch": 0.890264877880977, + "grad_norm": 0.11660360544919968, + "learning_rate": 0.0008247852203271182, + "loss": 0.6041, + "step": 1941 + }, + { + "epoch": 0.8907235408783396, + "grad_norm": 0.33388739824295044, + "learning_rate": 0.0008245968975614924, + "loss": 1.6398, + "step": 1942 + }, + { + "epoch": 0.8911822038757024, + "grad_norm": 0.24743525683879852, + "learning_rate": 0.0008244084951699071, + "loss": 1.4167, + "step": 1943 + }, + { + "epoch": 0.891640866873065, + "grad_norm": 0.2996421456336975, + "learning_rate": 0.0008242200131985789, + "loss": 1.6158, + "step": 1944 + }, + { + "epoch": 0.8920995298704277, + "grad_norm": 0.24171142280101776, + "learning_rate": 0.0008240314516937435, + "loss": 1.1656, + "step": 1945 + }, + { + "epoch": 0.8925581928677904, + "grad_norm": 0.5590919256210327, + "learning_rate": 0.0008238428107016568, + "loss": 1.809, + "step": 1946 + }, + { + "epoch": 0.8930168558651531, + "grad_norm": 0.16760781407356262, + "learning_rate": 0.0008236540902685937, + "loss": 0.8538, + "step": 1947 + }, + { + "epoch": 0.8934755188625157, + "grad_norm": 0.2322719246149063, + "learning_rate": 0.0008234652904408482, + "loss": 1.0128, + "step": 1948 + }, + { + "epoch": 0.8939341818598785, + "grad_norm": 0.28307685256004333, + "learning_rate": 0.000823276411264735, + "loss": 1.377, + "step": 1949 + }, + { + "epoch": 0.8943928448572411, + "grad_norm": 0.27569177746772766, + "learning_rate": 0.000823087452786587, + "loss": 1.4318, + "step": 1950 + }, + { + "epoch": 0.8948515078546039, + "grad_norm": 0.2631165087223053, + "learning_rate": 0.0008228984150527574, + "loss": 1.4855, + "step": 1951 + }, + { + "epoch": 0.8953101708519665, + "grad_norm": 0.18486323952674866, + "learning_rate": 0.0008227092981096183, + "loss": 0.7875, + "step": 1952 + }, + { + "epoch": 0.8957688338493293, + "grad_norm": 0.20646964013576508, + "learning_rate": 0.0008225201020035615, + "loss": 0.9301, + "step": 1953 + }, + { + "epoch": 0.8962274968466919, + "grad_norm": 0.2068818062543869, + "learning_rate": 0.0008223308267809982, + "loss": 1.1328, + "step": 1954 + }, + { + "epoch": 0.8966861598440545, + "grad_norm": 0.1731792837381363, + "learning_rate": 0.000822141472488359, + "loss": 0.8079, + "step": 1955 + }, + { + "epoch": 0.8971448228414173, + "grad_norm": 0.36508551239967346, + "learning_rate": 0.0008219520391720938, + "loss": 2.1365, + "step": 1956 + }, + { + "epoch": 0.8976034858387799, + "grad_norm": 0.29660528898239136, + "learning_rate": 0.000821762526878672, + "loss": 1.6558, + "step": 1957 + }, + { + "epoch": 0.8980621488361427, + "grad_norm": 0.20313714444637299, + "learning_rate": 0.0008215729356545823, + "loss": 0.9536, + "step": 1958 + }, + { + "epoch": 0.8985208118335053, + "grad_norm": 0.24836888909339905, + "learning_rate": 0.0008213832655463329, + "loss": 1.2862, + "step": 1959 + }, + { + "epoch": 0.898979474830868, + "grad_norm": 0.26236483454704285, + "learning_rate": 0.000821193516600451, + "loss": 1.5496, + "step": 1960 + }, + { + "epoch": 0.8994381378282307, + "grad_norm": 0.2642938196659088, + "learning_rate": 0.0008210036888634838, + "loss": 1.2446, + "step": 1961 + }, + { + "epoch": 0.8998968008255934, + "grad_norm": 0.2847262918949127, + "learning_rate": 0.000820813782381997, + "loss": 1.3784, + "step": 1962 + }, + { + "epoch": 0.9003554638229561, + "grad_norm": 0.3096359372138977, + "learning_rate": 0.0008206237972025761, + "loss": 1.4408, + "step": 1963 + }, + { + "epoch": 0.9008141268203188, + "grad_norm": 0.18418483436107635, + "learning_rate": 0.0008204337333718261, + "loss": 0.8153, + "step": 1964 + }, + { + "epoch": 0.9012727898176814, + "grad_norm": 0.24402552843093872, + "learning_rate": 0.0008202435909363708, + "loss": 1.1557, + "step": 1965 + }, + { + "epoch": 0.9017314528150442, + "grad_norm": 0.2893608808517456, + "learning_rate": 0.0008200533699428536, + "loss": 1.2294, + "step": 1966 + }, + { + "epoch": 0.9021901158124068, + "grad_norm": 0.1799740344285965, + "learning_rate": 0.000819863070437937, + "loss": 0.9913, + "step": 1967 + }, + { + "epoch": 0.9026487788097696, + "grad_norm": 0.26435887813568115, + "learning_rate": 0.0008196726924683029, + "loss": 1.302, + "step": 1968 + }, + { + "epoch": 0.9031074418071322, + "grad_norm": 0.2572958767414093, + "learning_rate": 0.0008194822360806525, + "loss": 1.5272, + "step": 1969 + }, + { + "epoch": 0.903566104804495, + "grad_norm": 0.3217867612838745, + "learning_rate": 0.0008192917013217059, + "loss": 1.6104, + "step": 1970 + }, + { + "epoch": 0.9040247678018576, + "grad_norm": 0.34428954124450684, + "learning_rate": 0.0008191010882382027, + "loss": 1.8489, + "step": 1971 + }, + { + "epoch": 0.9044834307992202, + "grad_norm": 0.23394142091274261, + "learning_rate": 0.0008189103968769018, + "loss": 1.1929, + "step": 1972 + }, + { + "epoch": 0.904942093796583, + "grad_norm": 0.21039988100528717, + "learning_rate": 0.0008187196272845811, + "loss": 1.2102, + "step": 1973 + }, + { + "epoch": 0.9054007567939456, + "grad_norm": 0.36821311712265015, + "learning_rate": 0.0008185287795080377, + "loss": 2.1794, + "step": 1974 + }, + { + "epoch": 0.9058594197913084, + "grad_norm": 0.47129011154174805, + "learning_rate": 0.0008183378535940879, + "loss": 1.7034, + "step": 1975 + }, + { + "epoch": 0.906318082788671, + "grad_norm": 0.28976601362228394, + "learning_rate": 0.0008181468495895674, + "loss": 1.404, + "step": 1976 + }, + { + "epoch": 0.9067767457860337, + "grad_norm": 0.416605681180954, + "learning_rate": 0.0008179557675413307, + "loss": 1.8235, + "step": 1977 + }, + { + "epoch": 0.9072354087833964, + "grad_norm": 0.3279779255390167, + "learning_rate": 0.0008177646074962517, + "loss": 1.7274, + "step": 1978 + }, + { + "epoch": 0.9076940717807591, + "grad_norm": 0.2587118148803711, + "learning_rate": 0.0008175733695012231, + "loss": 1.1779, + "step": 1979 + }, + { + "epoch": 0.9081527347781218, + "grad_norm": 0.33901700377464294, + "learning_rate": 0.0008173820536031574, + "loss": 2.0381, + "step": 1980 + }, + { + "epoch": 0.9086113977754845, + "grad_norm": 0.22714871168136597, + "learning_rate": 0.0008171906598489853, + "loss": 1.3698, + "step": 1981 + }, + { + "epoch": 0.9090700607728471, + "grad_norm": 0.32320547103881836, + "learning_rate": 0.0008169991882856574, + "loss": 1.7041, + "step": 1982 + }, + { + "epoch": 0.9095287237702099, + "grad_norm": 0.3090137541294098, + "learning_rate": 0.0008168076389601427, + "loss": 1.3677, + "step": 1983 + }, + { + "epoch": 0.9099873867675725, + "grad_norm": 0.25512242317199707, + "learning_rate": 0.0008166160119194301, + "loss": 1.3021, + "step": 1984 + }, + { + "epoch": 0.9104460497649353, + "grad_norm": 0.328730970621109, + "learning_rate": 0.0008164243072105267, + "loss": 1.4302, + "step": 1985 + }, + { + "epoch": 0.9109047127622979, + "grad_norm": 0.30282872915267944, + "learning_rate": 0.0008162325248804594, + "loss": 1.5295, + "step": 1986 + }, + { + "epoch": 0.9113633757596606, + "grad_norm": 0.3073786795139313, + "learning_rate": 0.0008160406649762735, + "loss": 1.605, + "step": 1987 + }, + { + "epoch": 0.9118220387570233, + "grad_norm": 0.5435524582862854, + "learning_rate": 0.0008158487275450335, + "loss": 1.8843, + "step": 1988 + }, + { + "epoch": 0.9122807017543859, + "grad_norm": 0.3203667104244232, + "learning_rate": 0.0008156567126338236, + "loss": 0.826, + "step": 1989 + }, + { + "epoch": 0.9127393647517487, + "grad_norm": 0.2148309201002121, + "learning_rate": 0.0008154646202897459, + "loss": 0.9943, + "step": 1990 + }, + { + "epoch": 0.9131980277491113, + "grad_norm": 0.2307606041431427, + "learning_rate": 0.0008152724505599223, + "loss": 1.1053, + "step": 1991 + }, + { + "epoch": 0.913656690746474, + "grad_norm": 0.30222174525260925, + "learning_rate": 0.0008150802034914932, + "loss": 1.1675, + "step": 1992 + }, + { + "epoch": 0.9141153537438367, + "grad_norm": 0.3027079105377197, + "learning_rate": 0.0008148878791316184, + "loss": 1.2104, + "step": 1993 + }, + { + "epoch": 0.9145740167411994, + "grad_norm": 0.2728803753852844, + "learning_rate": 0.0008146954775274764, + "loss": 1.3286, + "step": 1994 + }, + { + "epoch": 0.9150326797385621, + "grad_norm": 0.413828045129776, + "learning_rate": 0.0008145029987262649, + "loss": 2.343, + "step": 1995 + }, + { + "epoch": 0.9154913427359248, + "grad_norm": 0.125292107462883, + "learning_rate": 0.0008143104427751998, + "loss": 0.7368, + "step": 1996 + }, + { + "epoch": 0.9159500057332874, + "grad_norm": 0.22482484579086304, + "learning_rate": 0.000814117809721517, + "loss": 1.1551, + "step": 1997 + }, + { + "epoch": 0.9164086687306502, + "grad_norm": 0.2546938955783844, + "learning_rate": 0.0008139250996124706, + "loss": 1.3978, + "step": 1998 + }, + { + "epoch": 0.9168673317280128, + "grad_norm": 0.31431275606155396, + "learning_rate": 0.0008137323124953335, + "loss": 1.5389, + "step": 1999 + }, + { + "epoch": 0.9173259947253756, + "grad_norm": 0.3111874759197235, + "learning_rate": 0.0008135394484173981, + "loss": 1.564, + "step": 2000 + }, + { + "epoch": 0.9177846577227382, + "grad_norm": 0.36270517110824585, + "learning_rate": 0.0008133465074259754, + "loss": 2.1987, + "step": 2001 + }, + { + "epoch": 0.918243320720101, + "grad_norm": 0.34382519125938416, + "learning_rate": 0.000813153489568395, + "loss": 2.0159, + "step": 2002 + }, + { + "epoch": 0.9187019837174636, + "grad_norm": 0.39224573969841003, + "learning_rate": 0.0008129603948920056, + "loss": 1.6236, + "step": 2003 + }, + { + "epoch": 0.9191606467148263, + "grad_norm": 0.2750489115715027, + "learning_rate": 0.000812767223444175, + "loss": 1.2472, + "step": 2004 + }, + { + "epoch": 0.919619309712189, + "grad_norm": 0.26714855432510376, + "learning_rate": 0.0008125739752722891, + "loss": 1.5997, + "step": 2005 + }, + { + "epoch": 0.9200779727095516, + "grad_norm": 0.24979934096336365, + "learning_rate": 0.0008123806504237533, + "loss": 1.3241, + "step": 2006 + }, + { + "epoch": 0.9205366357069144, + "grad_norm": 0.08660929650068283, + "learning_rate": 0.0008121872489459916, + "loss": 0.5286, + "step": 2007 + }, + { + "epoch": 0.920995298704277, + "grad_norm": 0.13258817791938782, + "learning_rate": 0.0008119937708864469, + "loss": 0.6686, + "step": 2008 + }, + { + "epoch": 0.9214539617016397, + "grad_norm": 0.18847809731960297, + "learning_rate": 0.0008118002162925804, + "loss": 0.839, + "step": 2009 + }, + { + "epoch": 0.9219126246990024, + "grad_norm": 0.059659842401742935, + "learning_rate": 0.0008116065852118728, + "loss": 0.3741, + "step": 2010 + }, + { + "epoch": 0.9223712876963651, + "grad_norm": 0.33892372250556946, + "learning_rate": 0.0008114128776918229, + "loss": 1.7498, + "step": 2011 + }, + { + "epoch": 0.9228299506937278, + "grad_norm": 0.2842087149620056, + "learning_rate": 0.0008112190937799488, + "loss": 1.4104, + "step": 2012 + }, + { + "epoch": 0.9232886136910905, + "grad_norm": 0.18660642206668854, + "learning_rate": 0.0008110252335237868, + "loss": 0.9644, + "step": 2013 + }, + { + "epoch": 0.9237472766884531, + "grad_norm": 0.1834762841463089, + "learning_rate": 0.0008108312969708928, + "loss": 0.8232, + "step": 2014 + }, + { + "epoch": 0.9242059396858159, + "grad_norm": 0.3446117639541626, + "learning_rate": 0.0008106372841688401, + "loss": 2.0298, + "step": 2015 + }, + { + "epoch": 0.9246646026831785, + "grad_norm": 0.3695407807826996, + "learning_rate": 0.0008104431951652219, + "loss": 1.6854, + "step": 2016 + }, + { + "epoch": 0.9251232656805413, + "grad_norm": 0.26748037338256836, + "learning_rate": 0.0008102490300076496, + "loss": 1.3953, + "step": 2017 + }, + { + "epoch": 0.9255819286779039, + "grad_norm": 0.08492042124271393, + "learning_rate": 0.0008100547887437531, + "loss": 0.5242, + "step": 2018 + }, + { + "epoch": 0.9260405916752666, + "grad_norm": 0.2597675919532776, + "learning_rate": 0.0008098604714211813, + "loss": 1.1575, + "step": 2019 + }, + { + "epoch": 0.9264992546726293, + "grad_norm": 0.29596418142318726, + "learning_rate": 0.0008096660780876017, + "loss": 1.4683, + "step": 2020 + }, + { + "epoch": 0.926957917669992, + "grad_norm": 0.2640438973903656, + "learning_rate": 0.0008094716087907003, + "loss": 1.4299, + "step": 2021 + }, + { + "epoch": 0.9274165806673547, + "grad_norm": 0.1778401881456375, + "learning_rate": 0.0008092770635781821, + "loss": 0.8381, + "step": 2022 + }, + { + "epoch": 0.9278752436647173, + "grad_norm": 0.2718510329723358, + "learning_rate": 0.0008090824424977699, + "loss": 1.5079, + "step": 2023 + }, + { + "epoch": 0.92833390666208, + "grad_norm": 0.2521864175796509, + "learning_rate": 0.0008088877455972062, + "loss": 1.0537, + "step": 2024 + }, + { + "epoch": 0.9287925696594427, + "grad_norm": 0.2869158685207367, + "learning_rate": 0.000808692972924251, + "loss": 1.6425, + "step": 2025 + }, + { + "epoch": 0.9292512326568054, + "grad_norm": 0.33233222365379333, + "learning_rate": 0.000808498124526684, + "loss": 1.9667, + "step": 2026 + }, + { + "epoch": 0.9297098956541681, + "grad_norm": 0.34734752774238586, + "learning_rate": 0.0008083032004523026, + "loss": 1.9368, + "step": 2027 + }, + { + "epoch": 0.9301685586515308, + "grad_norm": 0.37285783886909485, + "learning_rate": 0.0008081082007489231, + "loss": 1.7972, + "step": 2028 + }, + { + "epoch": 0.9306272216488934, + "grad_norm": 0.19466300308704376, + "learning_rate": 0.0008079131254643804, + "loss": 0.8481, + "step": 2029 + }, + { + "epoch": 0.9310858846462562, + "grad_norm": 0.28476589918136597, + "learning_rate": 0.0008077179746465278, + "loss": 1.4938, + "step": 2030 + }, + { + "epoch": 0.9315445476436188, + "grad_norm": 0.22528983652591705, + "learning_rate": 0.0008075227483432374, + "loss": 1.0093, + "step": 2031 + }, + { + "epoch": 0.9320032106409816, + "grad_norm": 0.33551692962646484, + "learning_rate": 0.0008073274466023994, + "loss": 1.4329, + "step": 2032 + }, + { + "epoch": 0.9324618736383442, + "grad_norm": 0.40014681220054626, + "learning_rate": 0.0008071320694719226, + "loss": 1.513, + "step": 2033 + }, + { + "epoch": 0.932920536635707, + "grad_norm": 0.35248538851737976, + "learning_rate": 0.000806936616999735, + "loss": 1.8031, + "step": 2034 + }, + { + "epoch": 0.9333791996330696, + "grad_norm": 0.20108796656131744, + "learning_rate": 0.0008067410892337819, + "loss": 0.9306, + "step": 2035 + }, + { + "epoch": 0.9338378626304323, + "grad_norm": 0.2716940939426422, + "learning_rate": 0.000806545486222028, + "loss": 1.2828, + "step": 2036 + }, + { + "epoch": 0.934296525627795, + "grad_norm": 0.3034631311893463, + "learning_rate": 0.0008063498080124559, + "loss": 1.4306, + "step": 2037 + }, + { + "epoch": 0.9347551886251577, + "grad_norm": 0.3705253005027771, + "learning_rate": 0.000806154054653067, + "loss": 1.8486, + "step": 2038 + }, + { + "epoch": 0.9352138516225204, + "grad_norm": 0.25508952140808105, + "learning_rate": 0.0008059582261918812, + "loss": 1.2751, + "step": 2039 + }, + { + "epoch": 0.935672514619883, + "grad_norm": 0.39250513911247253, + "learning_rate": 0.0008057623226769362, + "loss": 2.0559, + "step": 2040 + }, + { + "epoch": 0.9361311776172457, + "grad_norm": 0.2672523558139801, + "learning_rate": 0.0008055663441562889, + "loss": 1.037, + "step": 2041 + }, + { + "epoch": 0.9365898406146084, + "grad_norm": 0.19914977252483368, + "learning_rate": 0.0008053702906780142, + "loss": 0.812, + "step": 2042 + }, + { + "epoch": 0.9370485036119711, + "grad_norm": 0.20971417427062988, + "learning_rate": 0.0008051741622902052, + "loss": 0.8383, + "step": 2043 + }, + { + "epoch": 0.9375071666093338, + "grad_norm": 0.3085592985153198, + "learning_rate": 0.0008049779590409739, + "loss": 1.4219, + "step": 2044 + }, + { + "epoch": 0.9379658296066965, + "grad_norm": 0.361795574426651, + "learning_rate": 0.0008047816809784502, + "loss": 1.9539, + "step": 2045 + }, + { + "epoch": 0.9384244926040591, + "grad_norm": 0.37029165029525757, + "learning_rate": 0.0008045853281507827, + "loss": 1.7736, + "step": 2046 + }, + { + "epoch": 0.9388831556014219, + "grad_norm": 0.3089427053928375, + "learning_rate": 0.0008043889006061378, + "loss": 1.5316, + "step": 2047 + }, + { + "epoch": 0.9393418185987845, + "grad_norm": 0.22889067232608795, + "learning_rate": 0.0008041923983927009, + "loss": 0.8428, + "step": 2048 + }, + { + "epoch": 0.9398004815961473, + "grad_norm": 0.3076949417591095, + "learning_rate": 0.0008039958215586753, + "loss": 1.6298, + "step": 2049 + }, + { + "epoch": 0.9402591445935099, + "grad_norm": 0.3263643682003021, + "learning_rate": 0.0008037991701522829, + "loss": 1.8137, + "step": 2050 + }, + { + "epoch": 0.9407178075908726, + "grad_norm": 0.3142676055431366, + "learning_rate": 0.0008036024442217636, + "loss": 1.4615, + "step": 2051 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.27744585275650024, + "learning_rate": 0.0008034056438153753, + "loss": 1.2395, + "step": 2052 + }, + { + "epoch": 0.941635133585598, + "grad_norm": 0.21574388444423676, + "learning_rate": 0.0008032087689813952, + "loss": 0.914, + "step": 2053 + }, + { + "epoch": 0.9420937965829607, + "grad_norm": 0.1983022391796112, + "learning_rate": 0.0008030118197681178, + "loss": 1.0479, + "step": 2054 + }, + { + "epoch": 0.9425524595803234, + "grad_norm": 0.2801907956600189, + "learning_rate": 0.0008028147962238561, + "loss": 1.3931, + "step": 2055 + }, + { + "epoch": 0.943011122577686, + "grad_norm": 0.10424070060253143, + "learning_rate": 0.0008026176983969415, + "loss": 0.5858, + "step": 2056 + }, + { + "epoch": 0.9434697855750487, + "grad_norm": 0.260968953371048, + "learning_rate": 0.0008024205263357234, + "loss": 1.2817, + "step": 2057 + }, + { + "epoch": 0.9439284485724114, + "grad_norm": 0.24895592033863068, + "learning_rate": 0.0008022232800885697, + "loss": 1.2282, + "step": 2058 + }, + { + "epoch": 0.9443871115697741, + "grad_norm": 0.35649770498275757, + "learning_rate": 0.0008020259597038663, + "loss": 1.7548, + "step": 2059 + }, + { + "epoch": 0.9448457745671368, + "grad_norm": 0.3545326292514801, + "learning_rate": 0.0008018285652300173, + "loss": 1.819, + "step": 2060 + }, + { + "epoch": 0.9453044375644994, + "grad_norm": 0.35756614804267883, + "learning_rate": 0.0008016310967154448, + "loss": 1.8008, + "step": 2061 + }, + { + "epoch": 0.9457631005618622, + "grad_norm": 0.12270601838827133, + "learning_rate": 0.0008014335542085896, + "loss": 0.6562, + "step": 2062 + }, + { + "epoch": 0.9462217635592248, + "grad_norm": 0.3161708414554596, + "learning_rate": 0.0008012359377579099, + "loss": 1.5311, + "step": 2063 + }, + { + "epoch": 0.9466804265565876, + "grad_norm": 0.20928043127059937, + "learning_rate": 0.0008010382474118827, + "loss": 0.9492, + "step": 2064 + }, + { + "epoch": 0.9471390895539502, + "grad_norm": 0.3087616562843323, + "learning_rate": 0.0008008404832190028, + "loss": 1.3438, + "step": 2065 + }, + { + "epoch": 0.947597752551313, + "grad_norm": 0.24473027884960175, + "learning_rate": 0.0008006426452277833, + "loss": 1.151, + "step": 2066 + }, + { + "epoch": 0.9480564155486756, + "grad_norm": 0.08595964312553406, + "learning_rate": 0.0008004447334867551, + "loss": 0.43, + "step": 2067 + }, + { + "epoch": 0.9485150785460383, + "grad_norm": 0.5434077382087708, + "learning_rate": 0.0008002467480444675, + "loss": 2.1346, + "step": 2068 + }, + { + "epoch": 0.948973741543401, + "grad_norm": 0.32616284489631653, + "learning_rate": 0.0008000486889494877, + "loss": 1.3558, + "step": 2069 + }, + { + "epoch": 0.9494324045407637, + "grad_norm": 0.24117501080036163, + "learning_rate": 0.000799850556250401, + "loss": 1.0023, + "step": 2070 + }, + { + "epoch": 0.9498910675381264, + "grad_norm": 0.26819151639938354, + "learning_rate": 0.0007996523499958109, + "loss": 1.0614, + "step": 2071 + }, + { + "epoch": 0.9503497305354891, + "grad_norm": 0.28361374139785767, + "learning_rate": 0.0007994540702343386, + "loss": 1.374, + "step": 2072 + }, + { + "epoch": 0.9508083935328517, + "grad_norm": 0.4279637038707733, + "learning_rate": 0.000799255717014624, + "loss": 1.9542, + "step": 2073 + }, + { + "epoch": 0.9512670565302144, + "grad_norm": 0.4314320981502533, + "learning_rate": 0.0007990572903853239, + "loss": 1.9646, + "step": 2074 + }, + { + "epoch": 0.9517257195275771, + "grad_norm": 0.3635926842689514, + "learning_rate": 0.0007988587903951143, + "loss": 1.699, + "step": 2075 + }, + { + "epoch": 0.9521843825249398, + "grad_norm": 0.3035918176174164, + "learning_rate": 0.0007986602170926885, + "loss": 1.4184, + "step": 2076 + }, + { + "epoch": 0.9526430455223025, + "grad_norm": 0.38614422082901, + "learning_rate": 0.0007984615705267581, + "loss": 2.1599, + "step": 2077 + }, + { + "epoch": 0.9531017085196651, + "grad_norm": 0.3324802815914154, + "learning_rate": 0.0007982628507460523, + "loss": 1.2087, + "step": 2078 + }, + { + "epoch": 0.9535603715170279, + "grad_norm": 0.243191197514534, + "learning_rate": 0.0007980640577993187, + "loss": 0.9754, + "step": 2079 + }, + { + "epoch": 0.9540190345143905, + "grad_norm": 0.2958884537220001, + "learning_rate": 0.0007978651917353225, + "loss": 1.3487, + "step": 2080 + }, + { + "epoch": 0.9544776975117533, + "grad_norm": 0.17807622253894806, + "learning_rate": 0.0007976662526028473, + "loss": 0.9265, + "step": 2081 + }, + { + "epoch": 0.9549363605091159, + "grad_norm": 0.20209459960460663, + "learning_rate": 0.0007974672404506937, + "loss": 0.9037, + "step": 2082 + }, + { + "epoch": 0.9553950235064786, + "grad_norm": 0.31564533710479736, + "learning_rate": 0.0007972681553276813, + "loss": 1.7772, + "step": 2083 + }, + { + "epoch": 0.9558536865038413, + "grad_norm": 0.4125016927719116, + "learning_rate": 0.0007970689972826471, + "loss": 2.0496, + "step": 2084 + }, + { + "epoch": 0.956312349501204, + "grad_norm": 0.49603113532066345, + "learning_rate": 0.0007968697663644457, + "loss": 1.1933, + "step": 2085 + }, + { + "epoch": 0.9567710124985667, + "grad_norm": 0.2608060836791992, + "learning_rate": 0.0007966704626219503, + "loss": 1.3788, + "step": 2086 + }, + { + "epoch": 0.9572296754959294, + "grad_norm": 0.3525409400463104, + "learning_rate": 0.0007964710861040512, + "loss": 1.7322, + "step": 2087 + }, + { + "epoch": 0.957688338493292, + "grad_norm": 0.3033929169178009, + "learning_rate": 0.000796271636859657, + "loss": 1.6537, + "step": 2088 + }, + { + "epoch": 0.9581470014906548, + "grad_norm": 0.24828091263771057, + "learning_rate": 0.000796072114937694, + "loss": 1.1296, + "step": 2089 + }, + { + "epoch": 0.9586056644880174, + "grad_norm": 0.2196667641401291, + "learning_rate": 0.0007958725203871064, + "loss": 1.0247, + "step": 2090 + }, + { + "epoch": 0.9590643274853801, + "grad_norm": 0.2858245074748993, + "learning_rate": 0.0007956728532568563, + "loss": 1.3901, + "step": 2091 + }, + { + "epoch": 0.9595229904827428, + "grad_norm": 0.2958674430847168, + "learning_rate": 0.0007954731135959235, + "loss": 1.3847, + "step": 2092 + }, + { + "epoch": 0.9599816534801054, + "grad_norm": 0.25934916734695435, + "learning_rate": 0.0007952733014533051, + "loss": 1.3095, + "step": 2093 + }, + { + "epoch": 0.9604403164774682, + "grad_norm": 0.26100558042526245, + "learning_rate": 0.0007950734168780171, + "loss": 1.519, + "step": 2094 + }, + { + "epoch": 0.9608989794748308, + "grad_norm": 0.18241111934185028, + "learning_rate": 0.0007948734599190924, + "loss": 0.9885, + "step": 2095 + }, + { + "epoch": 0.9613576424721936, + "grad_norm": 1.0444318056106567, + "learning_rate": 0.0007946734306255816, + "loss": 1.6812, + "step": 2096 + }, + { + "epoch": 0.9618163054695562, + "grad_norm": 0.31861168146133423, + "learning_rate": 0.0007944733290465535, + "loss": 2.0449, + "step": 2097 + }, + { + "epoch": 0.962274968466919, + "grad_norm": 0.28558486700057983, + "learning_rate": 0.0007942731552310949, + "loss": 1.6525, + "step": 2098 + }, + { + "epoch": 0.9627336314642816, + "grad_norm": 0.23924612998962402, + "learning_rate": 0.0007940729092283092, + "loss": 1.2884, + "step": 2099 + }, + { + "epoch": 0.9631922944616443, + "grad_norm": 0.2011316865682602, + "learning_rate": 0.0007938725910873186, + "loss": 1.3885, + "step": 2100 + }, + { + "epoch": 0.963650957459007, + "grad_norm": 0.31433144211769104, + "learning_rate": 0.0007936722008572625, + "loss": 1.9054, + "step": 2101 + }, + { + "epoch": 0.9641096204563697, + "grad_norm": 0.26311978697776794, + "learning_rate": 0.000793471738587298, + "loss": 1.584, + "step": 2102 + }, + { + "epoch": 0.9645682834537324, + "grad_norm": 0.27694377303123474, + "learning_rate": 0.0007932712043266, + "loss": 1.2675, + "step": 2103 + }, + { + "epoch": 0.9650269464510951, + "grad_norm": 0.15894590318202972, + "learning_rate": 0.000793070598124361, + "loss": 0.8647, + "step": 2104 + }, + { + "epoch": 0.9654856094484577, + "grad_norm": 0.3218410909175873, + "learning_rate": 0.000792869920029791, + "loss": 1.8047, + "step": 2105 + }, + { + "epoch": 0.9659442724458205, + "grad_norm": 0.18074116110801697, + "learning_rate": 0.0007926691700921181, + "loss": 1.0374, + "step": 2106 + }, + { + "epoch": 0.9664029354431831, + "grad_norm": 0.3340591490268707, + "learning_rate": 0.0007924683483605875, + "loss": 1.8395, + "step": 2107 + }, + { + "epoch": 0.9668615984405458, + "grad_norm": 0.30720779299736023, + "learning_rate": 0.0007922674548844622, + "loss": 1.6614, + "step": 2108 + }, + { + "epoch": 0.9673202614379085, + "grad_norm": 0.275790810585022, + "learning_rate": 0.0007920664897130228, + "loss": 1.6523, + "step": 2109 + }, + { + "epoch": 0.9677789244352711, + "grad_norm": 0.0825454369187355, + "learning_rate": 0.0007918654528955675, + "loss": 0.4651, + "step": 2110 + }, + { + "epoch": 0.9682375874326339, + "grad_norm": 0.31684210896492004, + "learning_rate": 0.0007916643444814123, + "loss": 1.9248, + "step": 2111 + }, + { + "epoch": 0.9686962504299965, + "grad_norm": 0.18997028470039368, + "learning_rate": 0.0007914631645198904, + "loss": 1.0344, + "step": 2112 + }, + { + "epoch": 0.9691549134273593, + "grad_norm": 0.24045085906982422, + "learning_rate": 0.0007912619130603527, + "loss": 1.2285, + "step": 2113 + }, + { + "epoch": 0.9696135764247219, + "grad_norm": 0.3353755474090576, + "learning_rate": 0.0007910605901521675, + "loss": 1.7612, + "step": 2114 + }, + { + "epoch": 0.9700722394220846, + "grad_norm": 0.32669898867607117, + "learning_rate": 0.0007908591958447209, + "loss": 1.4136, + "step": 2115 + }, + { + "epoch": 0.9705309024194473, + "grad_norm": 0.37192410230636597, + "learning_rate": 0.0007906577301874163, + "loss": 1.8505, + "step": 2116 + }, + { + "epoch": 0.97098956541681, + "grad_norm": 0.2398930788040161, + "learning_rate": 0.000790456193229675, + "loss": 1.0386, + "step": 2117 + }, + { + "epoch": 0.9714482284141727, + "grad_norm": 0.3819679319858551, + "learning_rate": 0.0007902545850209349, + "loss": 1.7427, + "step": 2118 + }, + { + "epoch": 0.9719068914115354, + "grad_norm": 0.2345220297574997, + "learning_rate": 0.0007900529056106525, + "loss": 1.2595, + "step": 2119 + }, + { + "epoch": 0.972365554408898, + "grad_norm": 0.3607707917690277, + "learning_rate": 0.0007898511550483007, + "loss": 2.0488, + "step": 2120 + }, + { + "epoch": 0.9728242174062608, + "grad_norm": 0.33080729842185974, + "learning_rate": 0.0007896493333833707, + "loss": 1.4388, + "step": 2121 + }, + { + "epoch": 0.9732828804036234, + "grad_norm": 0.2641333341598511, + "learning_rate": 0.0007894474406653709, + "loss": 1.5211, + "step": 2122 + }, + { + "epoch": 0.9737415434009862, + "grad_norm": 0.18903343379497528, + "learning_rate": 0.0007892454769438268, + "loss": 0.7437, + "step": 2123 + }, + { + "epoch": 0.9742002063983488, + "grad_norm": 0.2841821014881134, + "learning_rate": 0.0007890434422682817, + "loss": 1.4099, + "step": 2124 + }, + { + "epoch": 0.9746588693957114, + "grad_norm": 0.18609897792339325, + "learning_rate": 0.0007888413366882959, + "loss": 0.9775, + "step": 2125 + }, + { + "epoch": 0.9751175323930742, + "grad_norm": 0.3263588845729828, + "learning_rate": 0.0007886391602534477, + "loss": 1.9374, + "step": 2126 + }, + { + "epoch": 0.9755761953904368, + "grad_norm": 0.29043829441070557, + "learning_rate": 0.0007884369130133325, + "loss": 1.6285, + "step": 2127 + }, + { + "epoch": 0.9760348583877996, + "grad_norm": 0.23733757436275482, + "learning_rate": 0.0007882345950175624, + "loss": 1.009, + "step": 2128 + }, + { + "epoch": 0.9764935213851622, + "grad_norm": 0.28636664152145386, + "learning_rate": 0.0007880322063157681, + "loss": 1.4238, + "step": 2129 + }, + { + "epoch": 0.976952184382525, + "grad_norm": 0.24077756702899933, + "learning_rate": 0.0007878297469575967, + "loss": 1.3356, + "step": 2130 + }, + { + "epoch": 0.9774108473798876, + "grad_norm": 0.21748611330986023, + "learning_rate": 0.000787627216992713, + "loss": 1.0264, + "step": 2131 + }, + { + "epoch": 0.9778695103772503, + "grad_norm": 0.17146669328212738, + "learning_rate": 0.0007874246164707991, + "loss": 0.9994, + "step": 2132 + }, + { + "epoch": 0.978328173374613, + "grad_norm": 0.28764429688453674, + "learning_rate": 0.0007872219454415543, + "loss": 1.8247, + "step": 2133 + }, + { + "epoch": 0.9787868363719757, + "grad_norm": 0.3070955276489258, + "learning_rate": 0.0007870192039546954, + "loss": 1.7511, + "step": 2134 + }, + { + "epoch": 0.9792454993693384, + "grad_norm": 0.24031320214271545, + "learning_rate": 0.0007868163920599563, + "loss": 1.4128, + "step": 2135 + }, + { + "epoch": 0.9797041623667011, + "grad_norm": 0.260254442691803, + "learning_rate": 0.000786613509807088, + "loss": 1.5773, + "step": 2136 + }, + { + "epoch": 0.9801628253640637, + "grad_norm": 0.25486305356025696, + "learning_rate": 0.0007864105572458592, + "loss": 1.146, + "step": 2137 + }, + { + "epoch": 0.9806214883614265, + "grad_norm": 0.2558663785457611, + "learning_rate": 0.0007862075344260555, + "loss": 1.3716, + "step": 2138 + }, + { + "epoch": 0.9810801513587891, + "grad_norm": 0.2597788870334625, + "learning_rate": 0.0007860044413974801, + "loss": 1.4079, + "step": 2139 + }, + { + "epoch": 0.9815388143561519, + "grad_norm": 0.2434176504611969, + "learning_rate": 0.000785801278209953, + "loss": 1.3001, + "step": 2140 + }, + { + "epoch": 0.9819974773535145, + "grad_norm": 0.24413499236106873, + "learning_rate": 0.0007855980449133116, + "loss": 1.4407, + "step": 2141 + }, + { + "epoch": 0.9824561403508771, + "grad_norm": 0.24809010326862335, + "learning_rate": 0.0007853947415574106, + "loss": 1.393, + "step": 2142 + }, + { + "epoch": 0.9829148033482399, + "grad_norm": 0.2992349863052368, + "learning_rate": 0.0007851913681921214, + "loss": 1.4247, + "step": 2143 + }, + { + "epoch": 0.9833734663456025, + "grad_norm": 0.28984925150871277, + "learning_rate": 0.0007849879248673336, + "loss": 1.7083, + "step": 2144 + }, + { + "epoch": 0.9838321293429653, + "grad_norm": 0.2734730839729309, + "learning_rate": 0.0007847844116329527, + "loss": 1.2769, + "step": 2145 + }, + { + "epoch": 0.9842907923403279, + "grad_norm": 0.23517167568206787, + "learning_rate": 0.0007845808285389024, + "loss": 1.291, + "step": 2146 + }, + { + "epoch": 0.9847494553376906, + "grad_norm": 0.09210627526044846, + "learning_rate": 0.0007843771756351228, + "loss": 0.5888, + "step": 2147 + }, + { + "epoch": 0.9852081183350533, + "grad_norm": 0.25133419036865234, + "learning_rate": 0.0007841734529715717, + "loss": 1.1768, + "step": 2148 + }, + { + "epoch": 0.985666781332416, + "grad_norm": 0.2581869065761566, + "learning_rate": 0.0007839696605982235, + "loss": 1.5143, + "step": 2149 + }, + { + "epoch": 0.9861254443297787, + "grad_norm": 0.2132362276315689, + "learning_rate": 0.00078376579856507, + "loss": 0.9654, + "step": 2150 + }, + { + "epoch": 0.9865841073271414, + "grad_norm": 0.3260076344013214, + "learning_rate": 0.00078356186692212, + "loss": 1.6854, + "step": 2151 + }, + { + "epoch": 0.987042770324504, + "grad_norm": 0.1947498768568039, + "learning_rate": 0.0007833578657193996, + "loss": 0.923, + "step": 2152 + }, + { + "epoch": 0.9875014333218668, + "grad_norm": 0.25844818353652954, + "learning_rate": 0.0007831537950069516, + "loss": 1.3283, + "step": 2153 + }, + { + "epoch": 0.9879600963192294, + "grad_norm": 0.3847392499446869, + "learning_rate": 0.000782949654834836, + "loss": 1.8961, + "step": 2154 + }, + { + "epoch": 0.9884187593165922, + "grad_norm": 0.376164048910141, + "learning_rate": 0.00078274544525313, + "loss": 2.1665, + "step": 2155 + }, + { + "epoch": 0.9888774223139548, + "grad_norm": 0.14740854501724243, + "learning_rate": 0.0007825411663119274, + "loss": 0.7855, + "step": 2156 + }, + { + "epoch": 0.9893360853113176, + "grad_norm": 0.3205507695674896, + "learning_rate": 0.0007823368180613395, + "loss": 1.5332, + "step": 2157 + }, + { + "epoch": 0.9897947483086802, + "grad_norm": 0.2271261066198349, + "learning_rate": 0.0007821324005514945, + "loss": 0.9458, + "step": 2158 + }, + { + "epoch": 0.9902534113060428, + "grad_norm": 0.2652105987071991, + "learning_rate": 0.0007819279138325373, + "loss": 1.389, + "step": 2159 + }, + { + "epoch": 0.9907120743034056, + "grad_norm": 0.2753114104270935, + "learning_rate": 0.00078172335795463, + "loss": 1.2935, + "step": 2160 + }, + { + "epoch": 0.9911707373007682, + "grad_norm": 0.10524280369281769, + "learning_rate": 0.0007815187329679517, + "loss": 0.6075, + "step": 2161 + }, + { + "epoch": 0.991629400298131, + "grad_norm": 0.22543789446353912, + "learning_rate": 0.0007813140389226984, + "loss": 1.2902, + "step": 2162 + }, + { + "epoch": 0.9920880632954936, + "grad_norm": 0.31716790795326233, + "learning_rate": 0.0007811092758690829, + "loss": 1.8091, + "step": 2163 + }, + { + "epoch": 0.9925467262928563, + "grad_norm": 0.28479596972465515, + "learning_rate": 0.000780904443857335, + "loss": 1.4618, + "step": 2164 + }, + { + "epoch": 0.993005389290219, + "grad_norm": 0.18860182166099548, + "learning_rate": 0.0007806995429377019, + "loss": 0.9116, + "step": 2165 + }, + { + "epoch": 0.9934640522875817, + "grad_norm": 0.24793879687786102, + "learning_rate": 0.0007804945731604467, + "loss": 1.2674, + "step": 2166 + }, + { + "epoch": 0.9939227152849444, + "grad_norm": 0.2623526453971863, + "learning_rate": 0.0007802895345758503, + "loss": 1.4075, + "step": 2167 + }, + { + "epoch": 0.9943813782823071, + "grad_norm": 0.3471762239933014, + "learning_rate": 0.0007800844272342103, + "loss": 2.0403, + "step": 2168 + }, + { + "epoch": 0.9948400412796697, + "grad_norm": 0.29183194041252136, + "learning_rate": 0.0007798792511858404, + "loss": 1.6896, + "step": 2169 + }, + { + "epoch": 0.9952987042770325, + "grad_norm": 0.30205872654914856, + "learning_rate": 0.0007796740064810724, + "loss": 1.6495, + "step": 2170 + }, + { + "epoch": 0.9957573672743951, + "grad_norm": 0.2651442289352417, + "learning_rate": 0.000779468693170254, + "loss": 1.3665, + "step": 2171 + }, + { + "epoch": 0.9962160302717579, + "grad_norm": 0.29746365547180176, + "learning_rate": 0.0007792633113037501, + "loss": 1.4551, + "step": 2172 + }, + { + "epoch": 0.9966746932691205, + "grad_norm": 0.30460765957832336, + "learning_rate": 0.0007790578609319424, + "loss": 1.6595, + "step": 2173 + }, + { + "epoch": 0.9971333562664833, + "grad_norm": 0.2733561396598816, + "learning_rate": 0.0007788523421052291, + "loss": 0.9736, + "step": 2174 + }, + { + "epoch": 0.9975920192638459, + "grad_norm": 0.3780529499053955, + "learning_rate": 0.0007786467548740259, + "loss": 1.7864, + "step": 2175 + }, + { + "epoch": 0.9980506822612085, + "grad_norm": 0.16774733364582062, + "learning_rate": 0.0007784410992887645, + "loss": 0.9591, + "step": 2176 + }, + { + "epoch": 0.9985093452585713, + "grad_norm": 0.31271597743034363, + "learning_rate": 0.0007782353753998936, + "loss": 1.7136, + "step": 2177 + }, + { + "epoch": 0.9989680082559339, + "grad_norm": 0.298076331615448, + "learning_rate": 0.0007780295832578792, + "loss": 1.2218, + "step": 2178 + }, + { + "epoch": 0.9994266712532967, + "grad_norm": 0.37182536721229553, + "learning_rate": 0.0007778237229132032, + "loss": 1.7005, + "step": 2179 + }, + { + "epoch": 0.9998853342506593, + "grad_norm": 0.2658548355102539, + "learning_rate": 0.0007776177944163648, + "loss": 1.2988, + "step": 2180 + }, + { + "epoch": 1.0, + "grad_norm": 0.2658548355102539, + "learning_rate": 0.0007776177944163648, + "loss": 0.5049, + "step": 2181 + }, + { + "epoch": 1.0004586629973626, + "grad_norm": 0.3670017421245575, + "learning_rate": 0.0007774117978178797, + "loss": 1.684, + "step": 2182 + }, + { + "epoch": 1.0009173259947253, + "grad_norm": 0.281902939081192, + "learning_rate": 0.0007772057331682802, + "loss": 1.1651, + "step": 2183 + }, + { + "epoch": 1.0013759889920881, + "grad_norm": 0.21292293071746826, + "learning_rate": 0.0007769996005181159, + "loss": 1.2873, + "step": 2184 + }, + { + "epoch": 1.0018346519894508, + "grad_norm": 0.3060021996498108, + "learning_rate": 0.0007767933999179521, + "loss": 1.6379, + "step": 2185 + }, + { + "epoch": 1.0022933149868134, + "grad_norm": 0.6159801483154297, + "learning_rate": 0.0007765871314183715, + "loss": 1.8337, + "step": 2186 + }, + { + "epoch": 1.002751977984176, + "grad_norm": 0.27383625507354736, + "learning_rate": 0.0007763807950699734, + "loss": 1.1533, + "step": 2187 + }, + { + "epoch": 1.003210640981539, + "grad_norm": 0.26541003584861755, + "learning_rate": 0.0007761743909233733, + "loss": 1.7667, + "step": 2188 + }, + { + "epoch": 1.0036693039789015, + "grad_norm": 0.2529599070549011, + "learning_rate": 0.0007759679190292039, + "loss": 1.2758, + "step": 2189 + }, + { + "epoch": 1.0041279669762642, + "grad_norm": 0.3762117326259613, + "learning_rate": 0.0007757613794381142, + "loss": 1.4408, + "step": 2190 + }, + { + "epoch": 1.0045866299736268, + "grad_norm": 0.17387861013412476, + "learning_rate": 0.0007755547722007696, + "loss": 1.2704, + "step": 2191 + }, + { + "epoch": 1.0050452929709897, + "grad_norm": 0.3292040228843689, + "learning_rate": 0.0007753480973678527, + "loss": 1.5397, + "step": 2192 + }, + { + "epoch": 1.0055039559683523, + "grad_norm": 0.2807086408138275, + "learning_rate": 0.0007751413549900621, + "loss": 1.2153, + "step": 2193 + }, + { + "epoch": 1.005962618965715, + "grad_norm": 0.3431303799152374, + "learning_rate": 0.0007749345451181132, + "loss": 1.6964, + "step": 2194 + }, + { + "epoch": 1.0064212819630776, + "grad_norm": 0.32860153913497925, + "learning_rate": 0.0007747276678027379, + "loss": 2.2512, + "step": 2195 + }, + { + "epoch": 1.0068799449604404, + "grad_norm": 0.4267536401748657, + "learning_rate": 0.000774520723094685, + "loss": 2.2029, + "step": 2196 + }, + { + "epoch": 1.007338607957803, + "grad_norm": 0.3573954403400421, + "learning_rate": 0.0007743137110447194, + "loss": 1.3235, + "step": 2197 + }, + { + "epoch": 1.0077972709551657, + "grad_norm": 0.31035539507865906, + "learning_rate": 0.0007741066317036222, + "loss": 1.5737, + "step": 2198 + }, + { + "epoch": 1.0082559339525283, + "grad_norm": 0.43037089705467224, + "learning_rate": 0.0007738994851221921, + "loss": 1.7621, + "step": 2199 + }, + { + "epoch": 1.008714596949891, + "grad_norm": 0.3466168940067291, + "learning_rate": 0.0007736922713512434, + "loss": 2.3148, + "step": 2200 + }, + { + "epoch": 1.0091732599472538, + "grad_norm": 0.3968213200569153, + "learning_rate": 0.000773484990441607, + "loss": 2.1924, + "step": 2201 + }, + { + "epoch": 1.0096319229446165, + "grad_norm": 0.3038836419582367, + "learning_rate": 0.0007732776424441307, + "loss": 0.9113, + "step": 2202 + }, + { + "epoch": 1.010090585941979, + "grad_norm": 0.29169583320617676, + "learning_rate": 0.0007730702274096782, + "loss": 1.7387, + "step": 2203 + }, + { + "epoch": 1.0105492489393417, + "grad_norm": 0.37365761399269104, + "learning_rate": 0.0007728627453891297, + "loss": 2.0283, + "step": 2204 + }, + { + "epoch": 1.0110079119367046, + "grad_norm": 0.35608237981796265, + "learning_rate": 0.0007726551964333827, + "loss": 0.9617, + "step": 2205 + }, + { + "epoch": 1.0114665749340672, + "grad_norm": 0.3315924406051636, + "learning_rate": 0.0007724475805933498, + "loss": 1.8494, + "step": 2206 + }, + { + "epoch": 1.0119252379314299, + "grad_norm": 0.31007975339889526, + "learning_rate": 0.000772239897919961, + "loss": 1.2838, + "step": 2207 + }, + { + "epoch": 1.0123839009287925, + "grad_norm": 0.4036838710308075, + "learning_rate": 0.0007720321484641625, + "loss": 1.7657, + "step": 2208 + }, + { + "epoch": 1.0128425639261553, + "grad_norm": 0.32576027512550354, + "learning_rate": 0.0007718243322769163, + "loss": 1.1251, + "step": 2209 + }, + { + "epoch": 1.013301226923518, + "grad_norm": 0.3255734145641327, + "learning_rate": 0.0007716164494092014, + "loss": 1.7505, + "step": 2210 + }, + { + "epoch": 1.0137598899208806, + "grad_norm": 0.2708061635494232, + "learning_rate": 0.0007714084999120132, + "loss": 1.2809, + "step": 2211 + }, + { + "epoch": 1.0142185529182433, + "grad_norm": 0.3722837269306183, + "learning_rate": 0.0007712004838363629, + "loss": 0.8855, + "step": 2212 + }, + { + "epoch": 1.0146772159156061, + "grad_norm": 0.19533422589302063, + "learning_rate": 0.0007709924012332784, + "loss": 0.9815, + "step": 2213 + }, + { + "epoch": 1.0151358789129687, + "grad_norm": 0.14171691238880157, + "learning_rate": 0.000770784252153804, + "loss": 1.119, + "step": 2214 + }, + { + "epoch": 1.0155945419103314, + "grad_norm": 0.4291399121284485, + "learning_rate": 0.000770576036649, + "loss": 1.8247, + "step": 2215 + }, + { + "epoch": 1.016053204907694, + "grad_norm": 0.35333549976348877, + "learning_rate": 0.0007703677547699435, + "loss": 1.5409, + "step": 2216 + }, + { + "epoch": 1.0165118679050567, + "grad_norm": 0.6529225707054138, + "learning_rate": 0.000770159406567727, + "loss": 1.7983, + "step": 2217 + }, + { + "epoch": 1.0169705309024195, + "grad_norm": 0.1469832956790924, + "learning_rate": 0.0007699509920934603, + "loss": 0.5275, + "step": 2218 + }, + { + "epoch": 1.0174291938997821, + "grad_norm": 0.35177433490753174, + "learning_rate": 0.0007697425113982688, + "loss": 2.0906, + "step": 2219 + }, + { + "epoch": 1.0178878568971448, + "grad_norm": 0.3718166649341583, + "learning_rate": 0.0007695339645332943, + "loss": 1.5574, + "step": 2220 + }, + { + "epoch": 1.0183465198945074, + "grad_norm": 0.4496723413467407, + "learning_rate": 0.0007693253515496947, + "loss": 1.6213, + "step": 2221 + }, + { + "epoch": 1.0188051828918703, + "grad_norm": 0.3293614089488983, + "learning_rate": 0.0007691166724986447, + "loss": 1.3462, + "step": 2222 + }, + { + "epoch": 1.019263845889233, + "grad_norm": 0.2451786994934082, + "learning_rate": 0.0007689079274313342, + "loss": 1.3479, + "step": 2223 + }, + { + "epoch": 1.0197225088865955, + "grad_norm": 0.33322182297706604, + "learning_rate": 0.0007686991163989704, + "loss": 2.0833, + "step": 2224 + }, + { + "epoch": 1.0201811718839582, + "grad_norm": 0.18467935919761658, + "learning_rate": 0.000768490239452776, + "loss": 0.8448, + "step": 2225 + }, + { + "epoch": 1.020639834881321, + "grad_norm": 0.21402765810489655, + "learning_rate": 0.0007682812966439896, + "loss": 0.7428, + "step": 2226 + }, + { + "epoch": 1.0210984978786837, + "grad_norm": 0.2612096667289734, + "learning_rate": 0.0007680722880238669, + "loss": 1.5559, + "step": 2227 + }, + { + "epoch": 1.0215571608760463, + "grad_norm": 0.2521737217903137, + "learning_rate": 0.0007678632136436792, + "loss": 1.6891, + "step": 2228 + }, + { + "epoch": 1.022015823873409, + "grad_norm": 0.2869427502155304, + "learning_rate": 0.0007676540735547136, + "loss": 0.9844, + "step": 2229 + }, + { + "epoch": 1.0224744868707718, + "grad_norm": 0.17099249362945557, + "learning_rate": 0.0007674448678082741, + "loss": 1.3577, + "step": 2230 + }, + { + "epoch": 1.0229331498681344, + "grad_norm": 0.3532930612564087, + "learning_rate": 0.0007672355964556799, + "loss": 1.4924, + "step": 2231 + }, + { + "epoch": 1.023391812865497, + "grad_norm": 0.22827592492103577, + "learning_rate": 0.000767026259548267, + "loss": 0.8428, + "step": 2232 + }, + { + "epoch": 1.0238504758628597, + "grad_norm": 0.19980807602405548, + "learning_rate": 0.0007668168571373875, + "loss": 1.0052, + "step": 2233 + }, + { + "epoch": 1.0243091388602223, + "grad_norm": 0.1704528033733368, + "learning_rate": 0.000766607389274409, + "loss": 0.8841, + "step": 2234 + }, + { + "epoch": 1.0247678018575852, + "grad_norm": 0.28992947936058044, + "learning_rate": 0.0007663978560107155, + "loss": 1.5616, + "step": 2235 + }, + { + "epoch": 1.0252264648549478, + "grad_norm": 0.29282909631729126, + "learning_rate": 0.000766188257397707, + "loss": 1.7693, + "step": 2236 + }, + { + "epoch": 1.0256851278523105, + "grad_norm": 0.4391837418079376, + "learning_rate": 0.0007659785934867999, + "loss": 2.3022, + "step": 2237 + }, + { + "epoch": 1.026143790849673, + "grad_norm": 0.3151383399963379, + "learning_rate": 0.0007657688643294259, + "loss": 1.6134, + "step": 2238 + }, + { + "epoch": 1.026602453847036, + "grad_norm": 0.264639288187027, + "learning_rate": 0.0007655590699770331, + "loss": 1.0672, + "step": 2239 + }, + { + "epoch": 1.0270611168443986, + "grad_norm": 0.24356485903263092, + "learning_rate": 0.0007653492104810858, + "loss": 1.1269, + "step": 2240 + }, + { + "epoch": 1.0275197798417612, + "grad_norm": 0.191898375749588, + "learning_rate": 0.000765139285893064, + "loss": 0.8673, + "step": 2241 + }, + { + "epoch": 1.0279784428391239, + "grad_norm": 0.2663523256778717, + "learning_rate": 0.0007649292962644634, + "loss": 1.2664, + "step": 2242 + }, + { + "epoch": 1.0284371058364867, + "grad_norm": 0.24687746167182922, + "learning_rate": 0.0007647192416467962, + "loss": 1.3866, + "step": 2243 + }, + { + "epoch": 1.0288957688338494, + "grad_norm": 0.26128000020980835, + "learning_rate": 0.0007645091220915904, + "loss": 1.1718, + "step": 2244 + }, + { + "epoch": 1.029354431831212, + "grad_norm": 0.3191836476325989, + "learning_rate": 0.0007642989376503897, + "loss": 1.4741, + "step": 2245 + }, + { + "epoch": 1.0298130948285746, + "grad_norm": 0.11389821767807007, + "learning_rate": 0.0007640886883747539, + "loss": 1.0261, + "step": 2246 + }, + { + "epoch": 1.0302717578259375, + "grad_norm": 0.4371493458747864, + "learning_rate": 0.0007638783743162586, + "loss": 1.3102, + "step": 2247 + }, + { + "epoch": 1.0307304208233001, + "grad_norm": 0.2924779951572418, + "learning_rate": 0.0007636679955264954, + "loss": 1.5544, + "step": 2248 + }, + { + "epoch": 1.0311890838206628, + "grad_norm": 0.29356637597084045, + "learning_rate": 0.0007634575520570719, + "loss": 1.4492, + "step": 2249 + }, + { + "epoch": 1.0316477468180254, + "grad_norm": 0.41005346179008484, + "learning_rate": 0.0007632470439596113, + "loss": 1.8448, + "step": 2250 + }, + { + "epoch": 1.032106409815388, + "grad_norm": 0.3051200807094574, + "learning_rate": 0.0007630364712857525, + "loss": 1.9664, + "step": 2251 + }, + { + "epoch": 1.032565072812751, + "grad_norm": 0.29568442702293396, + "learning_rate": 0.0007628258340871507, + "loss": 1.519, + "step": 2252 + }, + { + "epoch": 1.0330237358101135, + "grad_norm": 0.34316301345825195, + "learning_rate": 0.0007626151324154768, + "loss": 1.6171, + "step": 2253 + }, + { + "epoch": 1.0334823988074762, + "grad_norm": 0.34773850440979004, + "learning_rate": 0.0007624043663224173, + "loss": 1.3993, + "step": 2254 + }, + { + "epoch": 1.0339410618048388, + "grad_norm": 0.4576634466648102, + "learning_rate": 0.000762193535859675, + "loss": 1.3873, + "step": 2255 + }, + { + "epoch": 1.0343997248022017, + "grad_norm": 0.3191813826560974, + "learning_rate": 0.0007619826410789676, + "loss": 1.6381, + "step": 2256 + }, + { + "epoch": 1.0348583877995643, + "grad_norm": 0.29539400339126587, + "learning_rate": 0.0007617716820320293, + "loss": 1.3324, + "step": 2257 + }, + { + "epoch": 1.035317050796927, + "grad_norm": 0.18559886515140533, + "learning_rate": 0.0007615606587706101, + "loss": 1.2413, + "step": 2258 + }, + { + "epoch": 1.0357757137942896, + "grad_norm": 0.33654916286468506, + "learning_rate": 0.0007613495713464752, + "loss": 1.215, + "step": 2259 + }, + { + "epoch": 1.0362343767916524, + "grad_norm": 0.2632749378681183, + "learning_rate": 0.0007611384198114061, + "loss": 1.8086, + "step": 2260 + }, + { + "epoch": 1.036693039789015, + "grad_norm": 0.36246350407600403, + "learning_rate": 0.0007609272042171998, + "loss": 1.32, + "step": 2261 + }, + { + "epoch": 1.0371517027863777, + "grad_norm": 0.21696679294109344, + "learning_rate": 0.0007607159246156688, + "loss": 1.3752, + "step": 2262 + }, + { + "epoch": 1.0376103657837403, + "grad_norm": 0.20672181248664856, + "learning_rate": 0.0007605045810586415, + "loss": 0.3842, + "step": 2263 + }, + { + "epoch": 1.0380690287811032, + "grad_norm": 0.22013358771800995, + "learning_rate": 0.0007602931735979624, + "loss": 0.9232, + "step": 2264 + }, + { + "epoch": 1.0385276917784658, + "grad_norm": 0.2625974118709564, + "learning_rate": 0.0007600817022854908, + "loss": 1.6846, + "step": 2265 + }, + { + "epoch": 1.0389863547758285, + "grad_norm": 0.2270309031009674, + "learning_rate": 0.0007598701671731025, + "loss": 0.6877, + "step": 2266 + }, + { + "epoch": 1.039445017773191, + "grad_norm": 0.35904768109321594, + "learning_rate": 0.0007596585683126883, + "loss": 1.9496, + "step": 2267 + }, + { + "epoch": 1.0399036807705537, + "grad_norm": 0.31437742710113525, + "learning_rate": 0.0007594469057561551, + "loss": 1.4388, + "step": 2268 + }, + { + "epoch": 1.0403623437679166, + "grad_norm": 0.31415170431137085, + "learning_rate": 0.0007592351795554254, + "loss": 2.0281, + "step": 2269 + }, + { + "epoch": 1.0408210067652792, + "grad_norm": 0.19663023948669434, + "learning_rate": 0.0007590233897624367, + "loss": 0.5938, + "step": 2270 + }, + { + "epoch": 1.0412796697626419, + "grad_norm": 0.16666308045387268, + "learning_rate": 0.0007588115364291429, + "loss": 1.0012, + "step": 2271 + }, + { + "epoch": 1.0417383327600045, + "grad_norm": 0.10785413533449173, + "learning_rate": 0.0007585996196075131, + "loss": 0.913, + "step": 2272 + }, + { + "epoch": 1.0421969957573674, + "grad_norm": 0.34523460268974304, + "learning_rate": 0.000758387639349532, + "loss": 1.8508, + "step": 2273 + }, + { + "epoch": 1.04265565875473, + "grad_norm": 0.25581368803977966, + "learning_rate": 0.0007581755957072, + "loss": 1.0048, + "step": 2274 + }, + { + "epoch": 1.0431143217520926, + "grad_norm": 0.2878674864768982, + "learning_rate": 0.0007579634887325328, + "loss": 1.2516, + "step": 2275 + }, + { + "epoch": 1.0435729847494553, + "grad_norm": 0.43240469694137573, + "learning_rate": 0.0007577513184775617, + "loss": 1.5632, + "step": 2276 + }, + { + "epoch": 1.0440316477468181, + "grad_norm": 0.25301140546798706, + "learning_rate": 0.0007575390849943337, + "loss": 1.3549, + "step": 2277 + }, + { + "epoch": 1.0444903107441807, + "grad_norm": 0.31979095935821533, + "learning_rate": 0.0007573267883349114, + "loss": 1.1457, + "step": 2278 + }, + { + "epoch": 1.0449489737415434, + "grad_norm": 1.364774465560913, + "learning_rate": 0.0007571144285513723, + "loss": 2.123, + "step": 2279 + }, + { + "epoch": 1.045407636738906, + "grad_norm": 0.30994194746017456, + "learning_rate": 0.00075690200569581, + "loss": 1.4648, + "step": 2280 + }, + { + "epoch": 1.0458662997362689, + "grad_norm": 0.3474901616573334, + "learning_rate": 0.0007566895198203334, + "loss": 1.8733, + "step": 2281 + }, + { + "epoch": 1.0463249627336315, + "grad_norm": 0.23491498827934265, + "learning_rate": 0.0007564769709770667, + "loss": 1.5338, + "step": 2282 + }, + { + "epoch": 1.0467836257309941, + "grad_norm": 0.25132763385772705, + "learning_rate": 0.0007562643592181498, + "loss": 1.3498, + "step": 2283 + }, + { + "epoch": 1.0472422887283568, + "grad_norm": 0.33392879366874695, + "learning_rate": 0.0007560516845957377, + "loss": 1.4968, + "step": 2284 + }, + { + "epoch": 1.0477009517257194, + "grad_norm": 0.3323450982570648, + "learning_rate": 0.0007558389471620013, + "loss": 1.1117, + "step": 2285 + }, + { + "epoch": 1.0481596147230823, + "grad_norm": 0.2603180408477783, + "learning_rate": 0.0007556261469691264, + "loss": 1.6954, + "step": 2286 + }, + { + "epoch": 1.048618277720445, + "grad_norm": 0.35003596544265747, + "learning_rate": 0.0007554132840693145, + "loss": 1.53, + "step": 2287 + }, + { + "epoch": 1.0490769407178075, + "grad_norm": 0.3515700101852417, + "learning_rate": 0.0007552003585147823, + "loss": 1.696, + "step": 2288 + }, + { + "epoch": 1.0495356037151702, + "grad_norm": 0.2830311059951782, + "learning_rate": 0.0007549873703577622, + "loss": 1.8374, + "step": 2289 + }, + { + "epoch": 1.049994266712533, + "grad_norm": 0.2728405296802521, + "learning_rate": 0.0007547743196505014, + "loss": 0.9555, + "step": 2290 + }, + { + "epoch": 1.0504529297098957, + "grad_norm": 0.3142251968383789, + "learning_rate": 0.0007545612064452632, + "loss": 1.657, + "step": 2291 + }, + { + "epoch": 1.0509115927072583, + "grad_norm": 0.2731315791606903, + "learning_rate": 0.0007543480307943256, + "loss": 0.8105, + "step": 2292 + }, + { + "epoch": 1.051370255704621, + "grad_norm": 0.2212861180305481, + "learning_rate": 0.0007541347927499818, + "loss": 1.3339, + "step": 2293 + }, + { + "epoch": 1.0518289187019838, + "grad_norm": 0.29469791054725647, + "learning_rate": 0.0007539214923645412, + "loss": 1.0019, + "step": 2294 + }, + { + "epoch": 1.0522875816993464, + "grad_norm": 0.20634829998016357, + "learning_rate": 0.0007537081296903277, + "loss": 1.3472, + "step": 2295 + }, + { + "epoch": 1.052746244696709, + "grad_norm": 0.3181491792201996, + "learning_rate": 0.0007534947047796805, + "loss": 2.062, + "step": 2296 + }, + { + "epoch": 1.0532049076940717, + "grad_norm": 0.31653597950935364, + "learning_rate": 0.0007532812176849545, + "loss": 1.8488, + "step": 2297 + }, + { + "epoch": 1.0536635706914346, + "grad_norm": 0.3137333393096924, + "learning_rate": 0.0007530676684585194, + "loss": 1.6195, + "step": 2298 + }, + { + "epoch": 1.0541222336887972, + "grad_norm": 0.3669377565383911, + "learning_rate": 0.0007528540571527607, + "loss": 1.928, + "step": 2299 + }, + { + "epoch": 1.0545808966861598, + "grad_norm": 0.35595008730888367, + "learning_rate": 0.0007526403838200786, + "loss": 2.1411, + "step": 2300 + }, + { + "epoch": 1.0550395596835225, + "grad_norm": 0.4003475606441498, + "learning_rate": 0.0007524266485128885, + "loss": 1.6777, + "step": 2301 + }, + { + "epoch": 1.0554982226808853, + "grad_norm": 0.2703254818916321, + "learning_rate": 0.0007522128512836217, + "loss": 1.7441, + "step": 2302 + }, + { + "epoch": 1.055956885678248, + "grad_norm": 0.3241822123527527, + "learning_rate": 0.0007519989921847236, + "loss": 1.4656, + "step": 2303 + }, + { + "epoch": 1.0564155486756106, + "grad_norm": 0.38216519355773926, + "learning_rate": 0.0007517850712686561, + "loss": 1.7497, + "step": 2304 + }, + { + "epoch": 1.0568742116729732, + "grad_norm": 0.24903887510299683, + "learning_rate": 0.0007515710885878948, + "loss": 1.7455, + "step": 2305 + }, + { + "epoch": 1.0573328746703359, + "grad_norm": 0.36446714401245117, + "learning_rate": 0.0007513570441949319, + "loss": 2.2563, + "step": 2306 + }, + { + "epoch": 1.0577915376676987, + "grad_norm": 0.23342153429985046, + "learning_rate": 0.0007511429381422734, + "loss": 0.7739, + "step": 2307 + }, + { + "epoch": 1.0582502006650614, + "grad_norm": 0.3443793058395386, + "learning_rate": 0.0007509287704824415, + "loss": 0.9252, + "step": 2308 + }, + { + "epoch": 1.058708863662424, + "grad_norm": 0.4298572540283203, + "learning_rate": 0.0007507145412679728, + "loss": 2.2942, + "step": 2309 + }, + { + "epoch": 1.0591675266597866, + "grad_norm": 0.2859693765640259, + "learning_rate": 0.0007505002505514194, + "loss": 1.3583, + "step": 2310 + }, + { + "epoch": 1.0596261896571495, + "grad_norm": 0.35848772525787354, + "learning_rate": 0.0007502858983853485, + "loss": 1.5612, + "step": 2311 + }, + { + "epoch": 1.0600848526545121, + "grad_norm": 0.1740964949131012, + "learning_rate": 0.000750071484822342, + "loss": 1.1107, + "step": 2312 + }, + { + "epoch": 1.0605435156518748, + "grad_norm": 0.24920308589935303, + "learning_rate": 0.000749857009914997, + "loss": 0.8362, + "step": 2313 + }, + { + "epoch": 1.0610021786492374, + "grad_norm": 0.07761970907449722, + "learning_rate": 0.000749642473715926, + "loss": 0.4695, + "step": 2314 + }, + { + "epoch": 1.0614608416466003, + "grad_norm": 0.17175906896591187, + "learning_rate": 0.0007494278762777562, + "loss": 1.2153, + "step": 2315 + }, + { + "epoch": 1.061919504643963, + "grad_norm": 0.20324799418449402, + "learning_rate": 0.0007492132176531299, + "loss": 0.6542, + "step": 2316 + }, + { + "epoch": 1.0623781676413255, + "grad_norm": 0.3163895308971405, + "learning_rate": 0.0007489984978947044, + "loss": 2.1238, + "step": 2317 + }, + { + "epoch": 1.0628368306386882, + "grad_norm": 0.36171120405197144, + "learning_rate": 0.000748783717055152, + "loss": 1.8789, + "step": 2318 + }, + { + "epoch": 1.0632954936360508, + "grad_norm": 0.6624128222465515, + "learning_rate": 0.0007485688751871597, + "loss": 2.0256, + "step": 2319 + }, + { + "epoch": 1.0637541566334137, + "grad_norm": 0.3431372046470642, + "learning_rate": 0.0007483539723434305, + "loss": 1.7518, + "step": 2320 + }, + { + "epoch": 1.0642128196307763, + "grad_norm": 0.3786202371120453, + "learning_rate": 0.0007481390085766808, + "loss": 1.8253, + "step": 2321 + }, + { + "epoch": 1.064671482628139, + "grad_norm": 0.30203521251678467, + "learning_rate": 0.0007479239839396434, + "loss": 1.3458, + "step": 2322 + }, + { + "epoch": 1.0651301456255016, + "grad_norm": 0.2725338637828827, + "learning_rate": 0.000747708898485065, + "loss": 1.3139, + "step": 2323 + }, + { + "epoch": 1.0655888086228644, + "grad_norm": 0.2869611084461212, + "learning_rate": 0.0007474937522657076, + "loss": 0.9489, + "step": 2324 + }, + { + "epoch": 1.066047471620227, + "grad_norm": 0.4075610637664795, + "learning_rate": 0.0007472785453343485, + "loss": 1.8008, + "step": 2325 + }, + { + "epoch": 1.0665061346175897, + "grad_norm": 0.16893206536769867, + "learning_rate": 0.000747063277743779, + "loss": 1.1216, + "step": 2326 + }, + { + "epoch": 1.0669647976149523, + "grad_norm": 0.3267589211463928, + "learning_rate": 0.0007468479495468061, + "loss": 1.4798, + "step": 2327 + }, + { + "epoch": 1.0674234606123152, + "grad_norm": 0.3620133101940155, + "learning_rate": 0.0007466325607962516, + "loss": 1.8303, + "step": 2328 + }, + { + "epoch": 1.0678821236096778, + "grad_norm": 0.3057517409324646, + "learning_rate": 0.0007464171115449512, + "loss": 1.6888, + "step": 2329 + }, + { + "epoch": 1.0683407866070405, + "grad_norm": 0.2740839719772339, + "learning_rate": 0.0007462016018457568, + "loss": 1.4534, + "step": 2330 + }, + { + "epoch": 1.068799449604403, + "grad_norm": 0.20154106616973877, + "learning_rate": 0.0007459860317515344, + "loss": 1.3666, + "step": 2331 + }, + { + "epoch": 1.069258112601766, + "grad_norm": 0.24330544471740723, + "learning_rate": 0.0007457704013151645, + "loss": 0.856, + "step": 2332 + }, + { + "epoch": 1.0697167755991286, + "grad_norm": 0.23237231373786926, + "learning_rate": 0.0007455547105895432, + "loss": 1.5328, + "step": 2333 + }, + { + "epoch": 1.0701754385964912, + "grad_norm": 0.24399161338806152, + "learning_rate": 0.0007453389596275808, + "loss": 1.0707, + "step": 2334 + }, + { + "epoch": 1.0706341015938539, + "grad_norm": 0.2570146322250366, + "learning_rate": 0.0007451231484822025, + "loss": 0.9601, + "step": 2335 + }, + { + "epoch": 1.0710927645912167, + "grad_norm": 0.33579733967781067, + "learning_rate": 0.0007449072772063486, + "loss": 1.7599, + "step": 2336 + }, + { + "epoch": 1.0715514275885794, + "grad_norm": 0.3293912410736084, + "learning_rate": 0.0007446913458529738, + "loss": 1.2646, + "step": 2337 + }, + { + "epoch": 1.072010090585942, + "grad_norm": 0.19118434190750122, + "learning_rate": 0.0007444753544750475, + "loss": 0.8713, + "step": 2338 + }, + { + "epoch": 1.0724687535833046, + "grad_norm": 0.1878962367773056, + "learning_rate": 0.0007442593031255539, + "loss": 1.1241, + "step": 2339 + }, + { + "epoch": 1.0729274165806673, + "grad_norm": 0.3359505534172058, + "learning_rate": 0.000744043191857492, + "loss": 1.3387, + "step": 2340 + }, + { + "epoch": 1.0733860795780301, + "grad_norm": 0.2292228788137436, + "learning_rate": 0.0007438270207238756, + "loss": 1.3343, + "step": 2341 + }, + { + "epoch": 1.0738447425753928, + "grad_norm": 0.41298791766166687, + "learning_rate": 0.000743610789777733, + "loss": 2.1614, + "step": 2342 + }, + { + "epoch": 1.0743034055727554, + "grad_norm": 0.28897812962532043, + "learning_rate": 0.0007433944990721071, + "loss": 1.3058, + "step": 2343 + }, + { + "epoch": 1.074762068570118, + "grad_norm": 0.13999620079994202, + "learning_rate": 0.0007431781486600556, + "loss": 0.7708, + "step": 2344 + }, + { + "epoch": 1.0752207315674809, + "grad_norm": 0.259048193693161, + "learning_rate": 0.0007429617385946507, + "loss": 1.1908, + "step": 2345 + }, + { + "epoch": 1.0756793945648435, + "grad_norm": 0.18018385767936707, + "learning_rate": 0.0007427452689289795, + "loss": 0.8871, + "step": 2346 + }, + { + "epoch": 1.0761380575622062, + "grad_norm": 0.2923835217952728, + "learning_rate": 0.0007425287397161437, + "loss": 1.6891, + "step": 2347 + }, + { + "epoch": 1.0765967205595688, + "grad_norm": 0.3825288414955139, + "learning_rate": 0.0007423121510092593, + "loss": 1.6979, + "step": 2348 + }, + { + "epoch": 1.0770553835569316, + "grad_norm": 0.3247275948524475, + "learning_rate": 0.000742095502861457, + "loss": 1.6702, + "step": 2349 + }, + { + "epoch": 1.0775140465542943, + "grad_norm": 0.30282968282699585, + "learning_rate": 0.0007418787953258822, + "loss": 1.756, + "step": 2350 + }, + { + "epoch": 1.077972709551657, + "grad_norm": 0.3590962588787079, + "learning_rate": 0.000741662028455695, + "loss": 2.1335, + "step": 2351 + }, + { + "epoch": 1.0784313725490196, + "grad_norm": 0.3158145546913147, + "learning_rate": 0.0007414452023040697, + "loss": 1.6022, + "step": 2352 + }, + { + "epoch": 1.0788900355463822, + "grad_norm": 0.26424726843833923, + "learning_rate": 0.0007412283169241955, + "loss": 1.3064, + "step": 2353 + }, + { + "epoch": 1.079348698543745, + "grad_norm": 0.3655887544155121, + "learning_rate": 0.0007410113723692756, + "loss": 1.7661, + "step": 2354 + }, + { + "epoch": 1.0798073615411077, + "grad_norm": 0.22848142683506012, + "learning_rate": 0.0007407943686925282, + "loss": 0.8983, + "step": 2355 + }, + { + "epoch": 1.0802660245384703, + "grad_norm": 0.2924310863018036, + "learning_rate": 0.0007405773059471863, + "loss": 1.9707, + "step": 2356 + }, + { + "epoch": 1.080724687535833, + "grad_norm": 0.27466198801994324, + "learning_rate": 0.0007403601841864964, + "loss": 1.3336, + "step": 2357 + }, + { + "epoch": 1.0811833505331958, + "grad_norm": 0.2992285490036011, + "learning_rate": 0.0007401430034637202, + "loss": 0.8565, + "step": 2358 + }, + { + "epoch": 1.0816420135305584, + "grad_norm": 0.27274230122566223, + "learning_rate": 0.0007399257638321338, + "loss": 1.6908, + "step": 2359 + }, + { + "epoch": 1.082100676527921, + "grad_norm": 0.2696456015110016, + "learning_rate": 0.0007397084653450274, + "loss": 0.8956, + "step": 2360 + }, + { + "epoch": 1.0825593395252837, + "grad_norm": 0.18956485390663147, + "learning_rate": 0.0007394911080557063, + "loss": 1.3373, + "step": 2361 + }, + { + "epoch": 1.0830180025226466, + "grad_norm": 0.36281585693359375, + "learning_rate": 0.0007392736920174895, + "loss": 1.6471, + "step": 2362 + }, + { + "epoch": 1.0834766655200092, + "grad_norm": 0.30485787987709045, + "learning_rate": 0.0007390562172837108, + "loss": 1.2273, + "step": 2363 + }, + { + "epoch": 1.0839353285173718, + "grad_norm": 0.4604003131389618, + "learning_rate": 0.0007388386839077182, + "loss": 1.9656, + "step": 2364 + }, + { + "epoch": 1.0843939915147345, + "grad_norm": 0.21141566336154938, + "learning_rate": 0.0007386210919428744, + "loss": 1.2972, + "step": 2365 + }, + { + "epoch": 1.0848526545120973, + "grad_norm": 0.26035094261169434, + "learning_rate": 0.0007384034414425562, + "loss": 1.1507, + "step": 2366 + }, + { + "epoch": 1.08531131750946, + "grad_norm": 0.2898489832878113, + "learning_rate": 0.000738185732460155, + "loss": 1.4745, + "step": 2367 + }, + { + "epoch": 1.0857699805068226, + "grad_norm": 0.3080480992794037, + "learning_rate": 0.000737967965049076, + "loss": 1.7976, + "step": 2368 + }, + { + "epoch": 1.0862286435041852, + "grad_norm": 0.2572938799858093, + "learning_rate": 0.0007377501392627394, + "loss": 1.108, + "step": 2369 + }, + { + "epoch": 1.086687306501548, + "grad_norm": 0.2855343520641327, + "learning_rate": 0.0007375322551545794, + "loss": 1.2778, + "step": 2370 + }, + { + "epoch": 1.0871459694989107, + "grad_norm": 0.22971484065055847, + "learning_rate": 0.0007373143127780444, + "loss": 1.0277, + "step": 2371 + }, + { + "epoch": 1.0876046324962734, + "grad_norm": 0.41709139943122864, + "learning_rate": 0.0007370963121865974, + "loss": 2.3818, + "step": 2372 + }, + { + "epoch": 1.088063295493636, + "grad_norm": 0.21090003848075867, + "learning_rate": 0.0007368782534337156, + "loss": 1.0302, + "step": 2373 + }, + { + "epoch": 1.0885219584909986, + "grad_norm": 0.2724030315876007, + "learning_rate": 0.0007366601365728902, + "loss": 1.4246, + "step": 2374 + }, + { + "epoch": 1.0889806214883615, + "grad_norm": 0.45919355750083923, + "learning_rate": 0.0007364419616576268, + "loss": 2.0112, + "step": 2375 + }, + { + "epoch": 1.0894392844857241, + "grad_norm": 0.4057621657848358, + "learning_rate": 0.0007362237287414455, + "loss": 2.0664, + "step": 2376 + }, + { + "epoch": 1.0898979474830868, + "grad_norm": 0.2805062532424927, + "learning_rate": 0.0007360054378778801, + "loss": 1.3488, + "step": 2377 + }, + { + "epoch": 1.0903566104804494, + "grad_norm": 0.38183027505874634, + "learning_rate": 0.0007357870891204792, + "loss": 1.8075, + "step": 2378 + }, + { + "epoch": 1.0908152734778123, + "grad_norm": 0.22754809260368347, + "learning_rate": 0.0007355686825228053, + "loss": 0.8548, + "step": 2379 + }, + { + "epoch": 1.091273936475175, + "grad_norm": 0.3018571138381958, + "learning_rate": 0.0007353502181384349, + "loss": 1.9176, + "step": 2380 + }, + { + "epoch": 1.0917325994725375, + "grad_norm": 0.3934653103351593, + "learning_rate": 0.0007351316960209591, + "loss": 2.1248, + "step": 2381 + }, + { + "epoch": 1.0921912624699002, + "grad_norm": 0.25040295720100403, + "learning_rate": 0.0007349131162239828, + "loss": 1.211, + "step": 2382 + }, + { + "epoch": 1.092649925467263, + "grad_norm": 0.3955865204334259, + "learning_rate": 0.0007346944788011254, + "loss": 1.103, + "step": 2383 + }, + { + "epoch": 1.0931085884646257, + "grad_norm": 0.406610906124115, + "learning_rate": 0.0007344757838060203, + "loss": 2.2019, + "step": 2384 + }, + { + "epoch": 1.0935672514619883, + "grad_norm": 0.45408493280410767, + "learning_rate": 0.0007342570312923143, + "loss": 1.9214, + "step": 2385 + }, + { + "epoch": 1.094025914459351, + "grad_norm": 0.3225853741168976, + "learning_rate": 0.0007340382213136695, + "loss": 1.437, + "step": 2386 + }, + { + "epoch": 1.0944845774567136, + "grad_norm": 0.2830888330936432, + "learning_rate": 0.0007338193539237619, + "loss": 1.6927, + "step": 2387 + }, + { + "epoch": 1.0949432404540764, + "grad_norm": 0.3365802466869354, + "learning_rate": 0.0007336004291762807, + "loss": 1.3085, + "step": 2388 + }, + { + "epoch": 1.095401903451439, + "grad_norm": 0.2103971689939499, + "learning_rate": 0.0007333814471249298, + "loss": 1.3113, + "step": 2389 + }, + { + "epoch": 1.0958605664488017, + "grad_norm": 0.3487602472305298, + "learning_rate": 0.0007331624078234272, + "loss": 1.2643, + "step": 2390 + }, + { + "epoch": 1.0963192294461643, + "grad_norm": 0.29860758781433105, + "learning_rate": 0.0007329433113255047, + "loss": 1.6295, + "step": 2391 + }, + { + "epoch": 1.0967778924435272, + "grad_norm": 0.21186383068561554, + "learning_rate": 0.0007327241576849083, + "loss": 1.4821, + "step": 2392 + }, + { + "epoch": 1.0972365554408898, + "grad_norm": 0.2788501977920532, + "learning_rate": 0.0007325049469553981, + "loss": 1.2378, + "step": 2393 + }, + { + "epoch": 1.0976952184382525, + "grad_norm": 0.19782112538814545, + "learning_rate": 0.000732285679190748, + "loss": 0.8517, + "step": 2394 + }, + { + "epoch": 1.098153881435615, + "grad_norm": 0.3813456594944, + "learning_rate": 0.0007320663544447459, + "loss": 1.818, + "step": 2395 + }, + { + "epoch": 1.098612544432978, + "grad_norm": 0.29733943939208984, + "learning_rate": 0.0007318469727711936, + "loss": 0.8673, + "step": 2396 + }, + { + "epoch": 1.0990712074303406, + "grad_norm": 0.18940015137195587, + "learning_rate": 0.0007316275342239074, + "loss": 0.8093, + "step": 2397 + }, + { + "epoch": 1.0995298704277032, + "grad_norm": 0.3412417769432068, + "learning_rate": 0.0007314080388567168, + "loss": 1.7512, + "step": 2398 + }, + { + "epoch": 1.0999885334250659, + "grad_norm": 0.09079885482788086, + "learning_rate": 0.0007311884867234658, + "loss": 0.9669, + "step": 2399 + }, + { + "epoch": 1.1004471964224287, + "grad_norm": 0.34120801091194153, + "learning_rate": 0.0007309688778780121, + "loss": 1.6829, + "step": 2400 + }, + { + "epoch": 1.1009058594197914, + "grad_norm": 0.31036293506622314, + "learning_rate": 0.0007307492123742271, + "loss": 1.2819, + "step": 2401 + }, + { + "epoch": 1.101364522417154, + "grad_norm": 0.32201525568962097, + "learning_rate": 0.0007305294902659967, + "loss": 1.3809, + "step": 2402 + }, + { + "epoch": 1.1018231854145166, + "grad_norm": 0.09801112860441208, + "learning_rate": 0.0007303097116072199, + "loss": 1.0084, + "step": 2403 + }, + { + "epoch": 1.1022818484118795, + "grad_norm": 0.28749385476112366, + "learning_rate": 0.0007300898764518105, + "loss": 1.0966, + "step": 2404 + }, + { + "epoch": 1.1027405114092421, + "grad_norm": 0.22139480710029602, + "learning_rate": 0.0007298699848536953, + "loss": 1.0551, + "step": 2405 + }, + { + "epoch": 1.1031991744066048, + "grad_norm": 0.24073892831802368, + "learning_rate": 0.0007296500368668154, + "loss": 1.7311, + "step": 2406 + }, + { + "epoch": 1.1036578374039674, + "grad_norm": 0.3581311106681824, + "learning_rate": 0.0007294300325451253, + "loss": 1.576, + "step": 2407 + }, + { + "epoch": 1.10411650040133, + "grad_norm": 0.2959865927696228, + "learning_rate": 0.0007292099719425942, + "loss": 1.2758, + "step": 2408 + }, + { + "epoch": 1.1045751633986929, + "grad_norm": 0.24298930168151855, + "learning_rate": 0.0007289898551132044, + "loss": 1.2407, + "step": 2409 + }, + { + "epoch": 1.1050338263960555, + "grad_norm": 0.24619142711162567, + "learning_rate": 0.0007287696821109517, + "loss": 1.2528, + "step": 2410 + }, + { + "epoch": 1.1054924893934182, + "grad_norm": 0.28459224104881287, + "learning_rate": 0.0007285494529898468, + "loss": 2.0142, + "step": 2411 + }, + { + "epoch": 1.1059511523907808, + "grad_norm": 0.295964777469635, + "learning_rate": 0.0007283291678039129, + "loss": 1.2126, + "step": 2412 + }, + { + "epoch": 1.1064098153881436, + "grad_norm": 0.23746977746486664, + "learning_rate": 0.000728108826607188, + "loss": 1.3847, + "step": 2413 + }, + { + "epoch": 1.1068684783855063, + "grad_norm": 0.38938137888908386, + "learning_rate": 0.0007278884294537229, + "loss": 2.1191, + "step": 2414 + }, + { + "epoch": 1.107327141382869, + "grad_norm": 0.1922319382429123, + "learning_rate": 0.0007276679763975832, + "loss": 0.5424, + "step": 2415 + }, + { + "epoch": 1.1077858043802316, + "grad_norm": 0.2816941738128662, + "learning_rate": 0.0007274474674928472, + "loss": 1.7775, + "step": 2416 + }, + { + "epoch": 1.1082444673775944, + "grad_norm": 0.25834113359451294, + "learning_rate": 0.0007272269027936073, + "loss": 0.9534, + "step": 2417 + }, + { + "epoch": 1.108703130374957, + "grad_norm": 0.49492350220680237, + "learning_rate": 0.00072700628235397, + "loss": 1.386, + "step": 2418 + }, + { + "epoch": 1.1091617933723197, + "grad_norm": 0.18091417849063873, + "learning_rate": 0.0007267856062280547, + "loss": 0.86, + "step": 2419 + }, + { + "epoch": 1.1096204563696823, + "grad_norm": 0.19237549602985382, + "learning_rate": 0.0007265648744699951, + "loss": 1.0081, + "step": 2420 + }, + { + "epoch": 1.110079119367045, + "grad_norm": 0.17153310775756836, + "learning_rate": 0.0007263440871339382, + "loss": 0.7964, + "step": 2421 + }, + { + "epoch": 1.1105377823644078, + "grad_norm": 0.22889000177383423, + "learning_rate": 0.0007261232442740444, + "loss": 1.6779, + "step": 2422 + }, + { + "epoch": 1.1109964453617704, + "grad_norm": 0.3227975070476532, + "learning_rate": 0.0007259023459444887, + "loss": 1.6245, + "step": 2423 + }, + { + "epoch": 1.111455108359133, + "grad_norm": 0.38517293334007263, + "learning_rate": 0.0007256813921994585, + "loss": 1.1709, + "step": 2424 + }, + { + "epoch": 1.1119137713564957, + "grad_norm": 0.19292643666267395, + "learning_rate": 0.0007254603830931555, + "loss": 0.8391, + "step": 2425 + }, + { + "epoch": 1.1123724343538586, + "grad_norm": 0.18455813825130463, + "learning_rate": 0.000725239318679795, + "loss": 0.7338, + "step": 2426 + }, + { + "epoch": 1.1128310973512212, + "grad_norm": 0.27097609639167786, + "learning_rate": 0.0007250181990136054, + "loss": 1.8622, + "step": 2427 + }, + { + "epoch": 1.1132897603485838, + "grad_norm": 0.30490532517433167, + "learning_rate": 0.0007247970241488293, + "loss": 1.6636, + "step": 2428 + }, + { + "epoch": 1.1137484233459465, + "grad_norm": 0.27548128366470337, + "learning_rate": 0.0007245757941397223, + "loss": 1.4517, + "step": 2429 + }, + { + "epoch": 1.1142070863433093, + "grad_norm": 0.28075456619262695, + "learning_rate": 0.0007243545090405537, + "loss": 1.1777, + "step": 2430 + }, + { + "epoch": 1.114665749340672, + "grad_norm": 0.2504705488681793, + "learning_rate": 0.0007241331689056064, + "loss": 1.1601, + "step": 2431 + }, + { + "epoch": 1.1151244123380346, + "grad_norm": 0.25834277272224426, + "learning_rate": 0.0007239117737891765, + "loss": 1.1205, + "step": 2432 + }, + { + "epoch": 1.1155830753353972, + "grad_norm": 0.35737091302871704, + "learning_rate": 0.0007236903237455741, + "loss": 1.629, + "step": 2433 + }, + { + "epoch": 1.11604173833276, + "grad_norm": 0.3298056423664093, + "learning_rate": 0.0007234688188291226, + "loss": 1.9195, + "step": 2434 + }, + { + "epoch": 1.1165004013301227, + "grad_norm": 0.4346219003200531, + "learning_rate": 0.0007232472590941582, + "loss": 2.2007, + "step": 2435 + }, + { + "epoch": 1.1169590643274854, + "grad_norm": 0.3693140745162964, + "learning_rate": 0.0007230256445950316, + "loss": 1.0917, + "step": 2436 + }, + { + "epoch": 1.117417727324848, + "grad_norm": 0.3174065947532654, + "learning_rate": 0.0007228039753861062, + "loss": 2.0214, + "step": 2437 + }, + { + "epoch": 1.1178763903222109, + "grad_norm": 0.37119260430336, + "learning_rate": 0.0007225822515217593, + "loss": 1.564, + "step": 2438 + }, + { + "epoch": 1.1183350533195735, + "grad_norm": 0.22383145987987518, + "learning_rate": 0.0007223604730563811, + "loss": 1.2421, + "step": 2439 + }, + { + "epoch": 1.1187937163169361, + "grad_norm": 0.34915295243263245, + "learning_rate": 0.0007221386400443757, + "loss": 1.7604, + "step": 2440 + }, + { + "epoch": 1.1192523793142988, + "grad_norm": 0.31573063135147095, + "learning_rate": 0.00072191675254016, + "loss": 1.6093, + "step": 2441 + }, + { + "epoch": 1.1197110423116614, + "grad_norm": 0.3130987882614136, + "learning_rate": 0.0007216948105981649, + "loss": 1.3731, + "step": 2442 + }, + { + "epoch": 1.1201697053090243, + "grad_norm": 0.25755324959754944, + "learning_rate": 0.0007214728142728342, + "loss": 1.4297, + "step": 2443 + }, + { + "epoch": 1.120628368306387, + "grad_norm": 0.12649127840995789, + "learning_rate": 0.0007212507636186251, + "loss": 0.6196, + "step": 2444 + }, + { + "epoch": 1.1210870313037495, + "grad_norm": 0.20406757295131683, + "learning_rate": 0.0007210286586900086, + "loss": 1.4123, + "step": 2445 + }, + { + "epoch": 1.1215456943011122, + "grad_norm": 0.3635023236274719, + "learning_rate": 0.0007208064995414686, + "loss": 1.8994, + "step": 2446 + }, + { + "epoch": 1.122004357298475, + "grad_norm": 0.27390024065971375, + "learning_rate": 0.0007205842862275019, + "loss": 0.8153, + "step": 2447 + }, + { + "epoch": 1.1224630202958377, + "grad_norm": 0.18709996342658997, + "learning_rate": 0.0007203620188026193, + "loss": 1.1956, + "step": 2448 + }, + { + "epoch": 1.1229216832932003, + "grad_norm": 0.2258118987083435, + "learning_rate": 0.0007201396973213446, + "loss": 1.261, + "step": 2449 + }, + { + "epoch": 1.123380346290563, + "grad_norm": 0.3747474253177643, + "learning_rate": 0.000719917321838215, + "loss": 1.8525, + "step": 2450 + }, + { + "epoch": 1.1238390092879258, + "grad_norm": 0.4060818552970886, + "learning_rate": 0.0007196948924077806, + "loss": 1.8212, + "step": 2451 + }, + { + "epoch": 1.1242976722852884, + "grad_norm": 0.35225608944892883, + "learning_rate": 0.000719472409084605, + "loss": 1.3362, + "step": 2452 + }, + { + "epoch": 1.124756335282651, + "grad_norm": 0.29107534885406494, + "learning_rate": 0.0007192498719232649, + "loss": 1.9098, + "step": 2453 + }, + { + "epoch": 1.1252149982800137, + "grad_norm": 0.35325366258621216, + "learning_rate": 0.0007190272809783504, + "loss": 1.4185, + "step": 2454 + }, + { + "epoch": 1.1256736612773763, + "grad_norm": 0.3643795847892761, + "learning_rate": 0.0007188046363044646, + "loss": 1.0865, + "step": 2455 + }, + { + "epoch": 1.1261323242747392, + "grad_norm": 0.10707731544971466, + "learning_rate": 0.0007185819379562238, + "loss": 0.8521, + "step": 2456 + }, + { + "epoch": 1.1265909872721018, + "grad_norm": 0.9587938189506531, + "learning_rate": 0.0007183591859882578, + "loss": 2.2844, + "step": 2457 + }, + { + "epoch": 1.1270496502694645, + "grad_norm": 0.2738874554634094, + "learning_rate": 0.0007181363804552086, + "loss": 0.9316, + "step": 2458 + }, + { + "epoch": 1.1275083132668273, + "grad_norm": 0.1969241052865982, + "learning_rate": 0.0007179135214117327, + "loss": 1.2078, + "step": 2459 + }, + { + "epoch": 1.12796697626419, + "grad_norm": 0.2931908965110779, + "learning_rate": 0.0007176906089124989, + "loss": 0.901, + "step": 2460 + }, + { + "epoch": 1.1284256392615526, + "grad_norm": 0.26238539814949036, + "learning_rate": 0.0007174676430121889, + "loss": 1.839, + "step": 2461 + }, + { + "epoch": 1.1288843022589152, + "grad_norm": 0.2989310026168823, + "learning_rate": 0.0007172446237654981, + "loss": 1.1135, + "step": 2462 + }, + { + "epoch": 1.1293429652562779, + "grad_norm": 0.42721855640411377, + "learning_rate": 0.0007170215512271347, + "loss": 1.2623, + "step": 2463 + }, + { + "epoch": 1.1298016282536407, + "grad_norm": 0.29083552956581116, + "learning_rate": 0.0007167984254518199, + "loss": 1.5911, + "step": 2464 + }, + { + "epoch": 1.1302602912510034, + "grad_norm": 0.32871025800704956, + "learning_rate": 0.0007165752464942882, + "loss": 1.3376, + "step": 2465 + }, + { + "epoch": 1.130718954248366, + "grad_norm": 0.28994280099868774, + "learning_rate": 0.000716352014409287, + "loss": 1.7526, + "step": 2466 + }, + { + "epoch": 1.1311776172457286, + "grad_norm": 0.30776262283325195, + "learning_rate": 0.0007161287292515766, + "loss": 1.5325, + "step": 2467 + }, + { + "epoch": 1.1316362802430913, + "grad_norm": 0.22005335986614227, + "learning_rate": 0.0007159053910759304, + "loss": 1.5527, + "step": 2468 + }, + { + "epoch": 1.1320949432404541, + "grad_norm": 0.38737305998802185, + "learning_rate": 0.0007156819999371353, + "loss": 2.2419, + "step": 2469 + }, + { + "epoch": 1.1325536062378168, + "grad_norm": 0.243751659989357, + "learning_rate": 0.0007154585558899902, + "loss": 0.8154, + "step": 2470 + }, + { + "epoch": 1.1330122692351794, + "grad_norm": 0.18171143531799316, + "learning_rate": 0.0007152350589893081, + "loss": 1.5097, + "step": 2471 + }, + { + "epoch": 1.1334709322325422, + "grad_norm": 0.2935083508491516, + "learning_rate": 0.0007150115092899138, + "loss": 1.6631, + "step": 2472 + }, + { + "epoch": 1.1339295952299049, + "grad_norm": 0.2859863340854645, + "learning_rate": 0.0007147879068466462, + "loss": 1.7206, + "step": 2473 + }, + { + "epoch": 1.1343882582272675, + "grad_norm": 0.3790501058101654, + "learning_rate": 0.0007145642517143563, + "loss": 1.9156, + "step": 2474 + }, + { + "epoch": 1.1348469212246302, + "grad_norm": 0.23689134418964386, + "learning_rate": 0.0007143405439479082, + "loss": 0.707, + "step": 2475 + }, + { + "epoch": 1.1353055842219928, + "grad_norm": 0.3226507008075714, + "learning_rate": 0.0007141167836021793, + "loss": 1.9152, + "step": 2476 + }, + { + "epoch": 1.1357642472193556, + "grad_norm": 0.3560091257095337, + "learning_rate": 0.0007138929707320596, + "loss": 1.97, + "step": 2477 + }, + { + "epoch": 1.1362229102167183, + "grad_norm": 0.3632325828075409, + "learning_rate": 0.0007136691053924519, + "loss": 0.7859, + "step": 2478 + }, + { + "epoch": 1.136681573214081, + "grad_norm": 0.11780796200037003, + "learning_rate": 0.0007134451876382719, + "loss": 1.0535, + "step": 2479 + }, + { + "epoch": 1.1371402362114436, + "grad_norm": 0.3758339583873749, + "learning_rate": 0.0007132212175244484, + "loss": 1.8105, + "step": 2480 + }, + { + "epoch": 1.1375988992088064, + "grad_norm": 0.3397655785083771, + "learning_rate": 0.0007129971951059229, + "loss": 1.3989, + "step": 2481 + }, + { + "epoch": 1.138057562206169, + "grad_norm": 0.3613055944442749, + "learning_rate": 0.0007127731204376497, + "loss": 1.5942, + "step": 2482 + }, + { + "epoch": 1.1385162252035317, + "grad_norm": 0.3114750385284424, + "learning_rate": 0.0007125489935745958, + "loss": 1.447, + "step": 2483 + }, + { + "epoch": 1.1389748882008943, + "grad_norm": 0.4733560085296631, + "learning_rate": 0.0007123248145717412, + "loss": 2.2578, + "step": 2484 + }, + { + "epoch": 1.1394335511982572, + "grad_norm": 0.28705328702926636, + "learning_rate": 0.0007121005834840786, + "loss": 1.2581, + "step": 2485 + }, + { + "epoch": 1.1398922141956198, + "grad_norm": 0.22501793503761292, + "learning_rate": 0.0007118763003666137, + "loss": 1.1338, + "step": 2486 + }, + { + "epoch": 1.1403508771929824, + "grad_norm": 0.24394629895687103, + "learning_rate": 0.0007116519652743644, + "loss": 1.2005, + "step": 2487 + }, + { + "epoch": 1.140809540190345, + "grad_norm": 0.32340019941329956, + "learning_rate": 0.0007114275782623622, + "loss": 1.2241, + "step": 2488 + }, + { + "epoch": 1.1412682031877077, + "grad_norm": 0.29200488328933716, + "learning_rate": 0.0007112031393856504, + "loss": 0.9522, + "step": 2489 + }, + { + "epoch": 1.1417268661850706, + "grad_norm": 0.11450646817684174, + "learning_rate": 0.0007109786486992856, + "loss": 0.9667, + "step": 2490 + }, + { + "epoch": 1.1421855291824332, + "grad_norm": 0.32306355237960815, + "learning_rate": 0.0007107541062583372, + "loss": 0.917, + "step": 2491 + }, + { + "epoch": 1.1426441921797958, + "grad_norm": 0.2934865355491638, + "learning_rate": 0.0007105295121178867, + "loss": 1.5807, + "step": 2492 + }, + { + "epoch": 1.1431028551771587, + "grad_norm": 0.3221622705459595, + "learning_rate": 0.0007103048663330291, + "loss": 1.895, + "step": 2493 + }, + { + "epoch": 1.1435615181745213, + "grad_norm": 0.3509625792503357, + "learning_rate": 0.0007100801689588714, + "loss": 1.4817, + "step": 2494 + }, + { + "epoch": 1.144020181171884, + "grad_norm": 0.255881130695343, + "learning_rate": 0.0007098554200505334, + "loss": 1.564, + "step": 2495 + }, + { + "epoch": 1.1444788441692466, + "grad_norm": 0.33859267830848694, + "learning_rate": 0.0007096306196631478, + "loss": 1.1945, + "step": 2496 + }, + { + "epoch": 1.1449375071666092, + "grad_norm": 0.2730942964553833, + "learning_rate": 0.0007094057678518597, + "loss": 1.1724, + "step": 2497 + }, + { + "epoch": 1.145396170163972, + "grad_norm": 0.12389812618494034, + "learning_rate": 0.0007091808646718268, + "loss": 1.1204, + "step": 2498 + }, + { + "epoch": 1.1458548331613347, + "grad_norm": 0.3796531558036804, + "learning_rate": 0.0007089559101782195, + "loss": 1.622, + "step": 2499 + }, + { + "epoch": 1.1463134961586974, + "grad_norm": 0.2889677584171295, + "learning_rate": 0.0007087309044262206, + "loss": 0.8866, + "step": 2500 + }, + { + "epoch": 1.14677215915606, + "grad_norm": 0.3515912890434265, + "learning_rate": 0.0007085058474710261, + "loss": 1.2305, + "step": 2501 + }, + { + "epoch": 1.1472308221534229, + "grad_norm": 0.2346973717212677, + "learning_rate": 0.0007082807393678439, + "loss": 1.3991, + "step": 2502 + }, + { + "epoch": 1.1476894851507855, + "grad_norm": 0.26319047808647156, + "learning_rate": 0.0007080555801718943, + "loss": 1.3355, + "step": 2503 + }, + { + "epoch": 1.1481481481481481, + "grad_norm": 0.38340768218040466, + "learning_rate": 0.0007078303699384107, + "loss": 1.7297, + "step": 2504 + }, + { + "epoch": 1.1486068111455108, + "grad_norm": 0.27420321106910706, + "learning_rate": 0.0007076051087226389, + "loss": 1.7687, + "step": 2505 + }, + { + "epoch": 1.1490654741428736, + "grad_norm": 0.36036863923072815, + "learning_rate": 0.0007073797965798371, + "loss": 1.2501, + "step": 2506 + }, + { + "epoch": 1.1495241371402363, + "grad_norm": 0.2933008074760437, + "learning_rate": 0.000707154433565276, + "loss": 1.762, + "step": 2507 + }, + { + "epoch": 1.149982800137599, + "grad_norm": 0.6776466369628906, + "learning_rate": 0.0007069290197342385, + "loss": 1.1041, + "step": 2508 + }, + { + "epoch": 1.1504414631349615, + "grad_norm": 0.16171732544898987, + "learning_rate": 0.0007067035551420205, + "loss": 0.8364, + "step": 2509 + }, + { + "epoch": 1.1509001261323242, + "grad_norm": 0.28582391142845154, + "learning_rate": 0.0007064780398439299, + "loss": 1.4814, + "step": 2510 + }, + { + "epoch": 1.151358789129687, + "grad_norm": 0.31193310022354126, + "learning_rate": 0.0007062524738952875, + "loss": 1.2991, + "step": 2511 + }, + { + "epoch": 1.1518174521270497, + "grad_norm": 0.28195393085479736, + "learning_rate": 0.0007060268573514259, + "loss": 1.8887, + "step": 2512 + }, + { + "epoch": 1.1522761151244123, + "grad_norm": 0.30946487188339233, + "learning_rate": 0.0007058011902676909, + "loss": 1.2166, + "step": 2513 + }, + { + "epoch": 1.152734778121775, + "grad_norm": 0.3007708787918091, + "learning_rate": 0.0007055754726994399, + "loss": 1.4958, + "step": 2514 + }, + { + "epoch": 1.1531934411191378, + "grad_norm": 0.21293707191944122, + "learning_rate": 0.0007053497047020432, + "loss": 1.0453, + "step": 2515 + }, + { + "epoch": 1.1536521041165004, + "grad_norm": 0.32221704721450806, + "learning_rate": 0.0007051238863308832, + "loss": 1.4675, + "step": 2516 + }, + { + "epoch": 1.154110767113863, + "grad_norm": 0.2842558026313782, + "learning_rate": 0.0007048980176413549, + "loss": 1.9645, + "step": 2517 + }, + { + "epoch": 1.1545694301112257, + "grad_norm": 0.3128332793712616, + "learning_rate": 0.0007046720986888656, + "loss": 1.3461, + "step": 2518 + }, + { + "epoch": 1.1550280931085886, + "grad_norm": 0.3141840994358063, + "learning_rate": 0.0007044461295288347, + "loss": 2.0339, + "step": 2519 + }, + { + "epoch": 1.1554867561059512, + "grad_norm": 0.39887315034866333, + "learning_rate": 0.0007042201102166939, + "loss": 2.0115, + "step": 2520 + }, + { + "epoch": 1.1559454191033138, + "grad_norm": 0.3459334075450897, + "learning_rate": 0.0007039940408078878, + "loss": 1.922, + "step": 2521 + }, + { + "epoch": 1.1564040821006765, + "grad_norm": 0.31680259108543396, + "learning_rate": 0.0007037679213578725, + "loss": 1.6561, + "step": 2522 + }, + { + "epoch": 1.156862745098039, + "grad_norm": 0.27092963457107544, + "learning_rate": 0.0007035417519221168, + "loss": 1.2754, + "step": 2523 + }, + { + "epoch": 1.157321408095402, + "grad_norm": 0.23691010475158691, + "learning_rate": 0.0007033155325561018, + "loss": 1.4512, + "step": 2524 + }, + { + "epoch": 1.1577800710927646, + "grad_norm": 0.3520990014076233, + "learning_rate": 0.0007030892633153208, + "loss": 1.3797, + "step": 2525 + }, + { + "epoch": 1.1582387340901272, + "grad_norm": 0.19260728359222412, + "learning_rate": 0.0007028629442552788, + "loss": 1.3934, + "step": 2526 + }, + { + "epoch": 1.15869739708749, + "grad_norm": 0.31958216428756714, + "learning_rate": 0.0007026365754314943, + "loss": 1.1669, + "step": 2527 + }, + { + "epoch": 1.1591560600848527, + "grad_norm": 0.1669234186410904, + "learning_rate": 0.0007024101568994965, + "loss": 1.2692, + "step": 2528 + }, + { + "epoch": 1.1596147230822154, + "grad_norm": 0.3692995011806488, + "learning_rate": 0.0007021836887148278, + "loss": 1.7561, + "step": 2529 + }, + { + "epoch": 1.160073386079578, + "grad_norm": 0.21594902873039246, + "learning_rate": 0.0007019571709330425, + "loss": 1.4732, + "step": 2530 + }, + { + "epoch": 1.1605320490769406, + "grad_norm": 0.3213531970977783, + "learning_rate": 0.0007017306036097068, + "loss": 1.6332, + "step": 2531 + }, + { + "epoch": 1.1609907120743035, + "grad_norm": 0.34258440136909485, + "learning_rate": 0.0007015039868003998, + "loss": 1.9807, + "step": 2532 + }, + { + "epoch": 1.1614493750716661, + "grad_norm": 0.4435928165912628, + "learning_rate": 0.0007012773205607117, + "loss": 1.7593, + "step": 2533 + }, + { + "epoch": 1.1619080380690288, + "grad_norm": 0.2438933551311493, + "learning_rate": 0.0007010506049462456, + "loss": 1.1445, + "step": 2534 + }, + { + "epoch": 1.1623667010663914, + "grad_norm": 0.32633253931999207, + "learning_rate": 0.0007008238400126165, + "loss": 2.1003, + "step": 2535 + }, + { + "epoch": 1.1628253640637543, + "grad_norm": 0.3241181969642639, + "learning_rate": 0.0007005970258154514, + "loss": 1.5915, + "step": 2536 + }, + { + "epoch": 1.1632840270611169, + "grad_norm": 0.2975384593009949, + "learning_rate": 0.0007003701624103895, + "loss": 1.5494, + "step": 2537 + }, + { + "epoch": 1.1637426900584795, + "grad_norm": 0.26780200004577637, + "learning_rate": 0.000700143249853082, + "loss": 1.3053, + "step": 2538 + }, + { + "epoch": 1.1642013530558422, + "grad_norm": 0.26362505555152893, + "learning_rate": 0.0006999162881991922, + "loss": 1.3801, + "step": 2539 + }, + { + "epoch": 1.164660016053205, + "grad_norm": 0.21280516684055328, + "learning_rate": 0.0006996892775043955, + "loss": 1.0339, + "step": 2540 + }, + { + "epoch": 1.1651186790505677, + "grad_norm": 0.2693462073802948, + "learning_rate": 0.0006994622178243792, + "loss": 1.1786, + "step": 2541 + }, + { + "epoch": 1.1655773420479303, + "grad_norm": 0.18628975749015808, + "learning_rate": 0.0006992351092148426, + "loss": 1.4404, + "step": 2542 + }, + { + "epoch": 1.166036005045293, + "grad_norm": 0.41851601004600525, + "learning_rate": 0.0006990079517314971, + "loss": 1.853, + "step": 2543 + }, + { + "epoch": 1.1664946680426556, + "grad_norm": 0.3848247826099396, + "learning_rate": 0.0006987807454300662, + "loss": 1.8879, + "step": 2544 + }, + { + "epoch": 1.1669533310400184, + "grad_norm": 0.258331835269928, + "learning_rate": 0.0006985534903662851, + "loss": 1.3988, + "step": 2545 + }, + { + "epoch": 1.167411994037381, + "grad_norm": 0.30951839685440063, + "learning_rate": 0.0006983261865959011, + "loss": 1.3336, + "step": 2546 + }, + { + "epoch": 1.1678706570347437, + "grad_norm": 0.2942619025707245, + "learning_rate": 0.0006980988341746737, + "loss": 1.7388, + "step": 2547 + }, + { + "epoch": 1.1683293200321063, + "grad_norm": 0.2846639156341553, + "learning_rate": 0.0006978714331583739, + "loss": 1.2854, + "step": 2548 + }, + { + "epoch": 1.1687879830294692, + "grad_norm": 0.3299280107021332, + "learning_rate": 0.0006976439836027848, + "loss": 2.272, + "step": 2549 + }, + { + "epoch": 1.1692466460268318, + "grad_norm": 0.2958674132823944, + "learning_rate": 0.0006974164855637015, + "loss": 1.6334, + "step": 2550 + }, + { + "epoch": 1.1697053090241945, + "grad_norm": 0.43015730381011963, + "learning_rate": 0.0006971889390969307, + "loss": 1.6784, + "step": 2551 + }, + { + "epoch": 1.170163972021557, + "grad_norm": 0.1926140934228897, + "learning_rate": 0.0006969613442582914, + "loss": 1.0963, + "step": 2552 + }, + { + "epoch": 1.17062263501892, + "grad_norm": 0.26699936389923096, + "learning_rate": 0.0006967337011036141, + "loss": 1.2809, + "step": 2553 + }, + { + "epoch": 1.1710812980162826, + "grad_norm": 0.34958434104919434, + "learning_rate": 0.0006965060096887414, + "loss": 2.0635, + "step": 2554 + }, + { + "epoch": 1.1715399610136452, + "grad_norm": 0.3384473919868469, + "learning_rate": 0.0006962782700695278, + "loss": 1.3294, + "step": 2555 + }, + { + "epoch": 1.1719986240110079, + "grad_norm": 0.3625926375389099, + "learning_rate": 0.0006960504823018392, + "loss": 1.9546, + "step": 2556 + }, + { + "epoch": 1.1724572870083705, + "grad_norm": 0.23835191130638123, + "learning_rate": 0.0006958226464415537, + "loss": 1.3571, + "step": 2557 + }, + { + "epoch": 1.1729159500057333, + "grad_norm": 0.38838183879852295, + "learning_rate": 0.0006955947625445611, + "loss": 1.9731, + "step": 2558 + }, + { + "epoch": 1.173374613003096, + "grad_norm": 0.40480363368988037, + "learning_rate": 0.000695366830666763, + "loss": 2.1235, + "step": 2559 + }, + { + "epoch": 1.1738332760004586, + "grad_norm": 0.32704851031303406, + "learning_rate": 0.0006951388508640725, + "loss": 1.7461, + "step": 2560 + }, + { + "epoch": 1.1742919389978215, + "grad_norm": 0.2514033615589142, + "learning_rate": 0.000694910823192415, + "loss": 0.8705, + "step": 2561 + }, + { + "epoch": 1.174750601995184, + "grad_norm": 0.20851144194602966, + "learning_rate": 0.0006946827477077271, + "loss": 1.0728, + "step": 2562 + }, + { + "epoch": 1.1752092649925467, + "grad_norm": 0.1491597592830658, + "learning_rate": 0.0006944546244659575, + "loss": 0.7351, + "step": 2563 + }, + { + "epoch": 1.1756679279899094, + "grad_norm": 0.2879053056240082, + "learning_rate": 0.0006942264535230665, + "loss": 1.2752, + "step": 2564 + }, + { + "epoch": 1.176126590987272, + "grad_norm": 0.4318799078464508, + "learning_rate": 0.0006939982349350259, + "loss": 1.8137, + "step": 2565 + }, + { + "epoch": 1.1765852539846349, + "grad_norm": 0.23562239110469818, + "learning_rate": 0.0006937699687578195, + "loss": 1.4907, + "step": 2566 + }, + { + "epoch": 1.1770439169819975, + "grad_norm": 0.22170042991638184, + "learning_rate": 0.0006935416550474426, + "loss": 1.2719, + "step": 2567 + }, + { + "epoch": 1.1775025799793601, + "grad_norm": 0.3226730227470398, + "learning_rate": 0.0006933132938599022, + "loss": 1.6284, + "step": 2568 + }, + { + "epoch": 1.1779612429767228, + "grad_norm": 0.2588571310043335, + "learning_rate": 0.000693084885251217, + "loss": 0.9474, + "step": 2569 + }, + { + "epoch": 1.1784199059740856, + "grad_norm": 0.331263929605484, + "learning_rate": 0.0006928564292774173, + "loss": 2.2437, + "step": 2570 + }, + { + "epoch": 1.1788785689714483, + "grad_norm": 0.37038788199424744, + "learning_rate": 0.0006926279259945447, + "loss": 1.5394, + "step": 2571 + }, + { + "epoch": 1.179337231968811, + "grad_norm": 0.21661770343780518, + "learning_rate": 0.0006923993754586532, + "loss": 1.4075, + "step": 2572 + }, + { + "epoch": 1.1797958949661735, + "grad_norm": 0.24760504066944122, + "learning_rate": 0.0006921707777258073, + "loss": 0.8289, + "step": 2573 + }, + { + "epoch": 1.1802545579635364, + "grad_norm": 0.29299482703208923, + "learning_rate": 0.0006919421328520844, + "loss": 1.6813, + "step": 2574 + }, + { + "epoch": 1.180713220960899, + "grad_norm": 0.21871504187583923, + "learning_rate": 0.0006917134408935721, + "loss": 1.2732, + "step": 2575 + }, + { + "epoch": 1.1811718839582617, + "grad_norm": 0.31473806500434875, + "learning_rate": 0.0006914847019063705, + "loss": 0.5663, + "step": 2576 + }, + { + "epoch": 1.1816305469556243, + "grad_norm": 0.2423599660396576, + "learning_rate": 0.0006912559159465908, + "loss": 0.8704, + "step": 2577 + }, + { + "epoch": 1.182089209952987, + "grad_norm": 0.3309333026409149, + "learning_rate": 0.0006910270830703559, + "loss": 1.4814, + "step": 2578 + }, + { + "epoch": 1.1825478729503498, + "grad_norm": 0.281486451625824, + "learning_rate": 0.0006907982033338001, + "loss": 1.5132, + "step": 2579 + }, + { + "epoch": 1.1830065359477124, + "grad_norm": 0.3493801951408386, + "learning_rate": 0.0006905692767930695, + "loss": 1.7191, + "step": 2580 + }, + { + "epoch": 1.183465198945075, + "grad_norm": 0.3665405213832855, + "learning_rate": 0.000690340303504321, + "loss": 2.2778, + "step": 2581 + }, + { + "epoch": 1.1839238619424377, + "grad_norm": 0.36640089750289917, + "learning_rate": 0.0006901112835237237, + "loss": 1.8275, + "step": 2582 + }, + { + "epoch": 1.1843825249398006, + "grad_norm": 0.33539673686027527, + "learning_rate": 0.0006898822169074577, + "loss": 1.6515, + "step": 2583 + }, + { + "epoch": 1.1848411879371632, + "grad_norm": 0.3045608401298523, + "learning_rate": 0.0006896531037117148, + "loss": 1.5253, + "step": 2584 + }, + { + "epoch": 1.1852998509345258, + "grad_norm": 0.18760670721530914, + "learning_rate": 0.0006894239439926981, + "loss": 0.6896, + "step": 2585 + }, + { + "epoch": 1.1857585139318885, + "grad_norm": 0.11621358245611191, + "learning_rate": 0.0006891947378066223, + "loss": 0.916, + "step": 2586 + }, + { + "epoch": 1.1862171769292513, + "grad_norm": 0.24042093753814697, + "learning_rate": 0.000688965485209713, + "loss": 0.7641, + "step": 2587 + }, + { + "epoch": 1.186675839926614, + "grad_norm": 0.1797315925359726, + "learning_rate": 0.0006887361862582077, + "loss": 1.1713, + "step": 2588 + }, + { + "epoch": 1.1871345029239766, + "grad_norm": 0.305295467376709, + "learning_rate": 0.000688506841008355, + "loss": 1.2055, + "step": 2589 + }, + { + "epoch": 1.1875931659213392, + "grad_norm": 0.18166819214820862, + "learning_rate": 0.000688277449516415, + "loss": 1.0945, + "step": 2590 + }, + { + "epoch": 1.1880518289187019, + "grad_norm": 0.23136307299137115, + "learning_rate": 0.0006880480118386592, + "loss": 0.8324, + "step": 2591 + }, + { + "epoch": 1.1885104919160647, + "grad_norm": 0.22618937492370605, + "learning_rate": 0.00068781852803137, + "loss": 1.1397, + "step": 2592 + }, + { + "epoch": 1.1889691549134274, + "grad_norm": 0.30645951628685, + "learning_rate": 0.0006875889981508416, + "loss": 1.4277, + "step": 2593 + }, + { + "epoch": 1.18942781791079, + "grad_norm": 0.3529110848903656, + "learning_rate": 0.0006873594222533796, + "loss": 1.6997, + "step": 2594 + }, + { + "epoch": 1.1898864809081529, + "grad_norm": 0.32532528042793274, + "learning_rate": 0.0006871298003953004, + "loss": 1.8083, + "step": 2595 + }, + { + "epoch": 1.1903451439055155, + "grad_norm": 0.3486868143081665, + "learning_rate": 0.0006869001326329317, + "loss": 0.871, + "step": 2596 + }, + { + "epoch": 1.1908038069028781, + "grad_norm": 0.2799234092235565, + "learning_rate": 0.0006866704190226131, + "loss": 1.4758, + "step": 2597 + }, + { + "epoch": 1.1912624699002408, + "grad_norm": 0.17606878280639648, + "learning_rate": 0.0006864406596206945, + "loss": 1.1527, + "step": 2598 + }, + { + "epoch": 1.1917211328976034, + "grad_norm": 0.32030197978019714, + "learning_rate": 0.0006862108544835379, + "loss": 1.5411, + "step": 2599 + }, + { + "epoch": 1.1921797958949663, + "grad_norm": 0.2235921025276184, + "learning_rate": 0.000685981003667516, + "loss": 0.9424, + "step": 2600 + }, + { + "epoch": 1.192638458892329, + "grad_norm": 0.39481407403945923, + "learning_rate": 0.0006857511072290128, + "loss": 2.1982, + "step": 2601 + }, + { + "epoch": 1.1930971218896915, + "grad_norm": 0.2962379455566406, + "learning_rate": 0.0006855211652244238, + "loss": 1.0606, + "step": 2602 + }, + { + "epoch": 1.1935557848870542, + "grad_norm": 0.2588828504085541, + "learning_rate": 0.0006852911777101553, + "loss": 1.2921, + "step": 2603 + }, + { + "epoch": 1.194014447884417, + "grad_norm": 0.09400691092014313, + "learning_rate": 0.0006850611447426248, + "loss": 0.9105, + "step": 2604 + }, + { + "epoch": 1.1944731108817797, + "grad_norm": 0.299527645111084, + "learning_rate": 0.0006848310663782613, + "loss": 1.4135, + "step": 2605 + }, + { + "epoch": 1.1949317738791423, + "grad_norm": 0.33210527896881104, + "learning_rate": 0.0006846009426735045, + "loss": 1.6572, + "step": 2606 + }, + { + "epoch": 1.195390436876505, + "grad_norm": 0.3115682303905487, + "learning_rate": 0.0006843707736848052, + "loss": 1.3729, + "step": 2607 + }, + { + "epoch": 1.1958490998738678, + "grad_norm": 0.356736958026886, + "learning_rate": 0.0006841405594686259, + "loss": 1.7662, + "step": 2608 + }, + { + "epoch": 1.1963077628712304, + "grad_norm": 0.2792001962661743, + "learning_rate": 0.0006839103000814397, + "loss": 1.5021, + "step": 2609 + }, + { + "epoch": 1.196766425868593, + "grad_norm": 0.35029587149620056, + "learning_rate": 0.0006836799955797306, + "loss": 2.1519, + "step": 2610 + }, + { + "epoch": 1.1972250888659557, + "grad_norm": 0.24187533557415009, + "learning_rate": 0.0006834496460199944, + "loss": 0.7456, + "step": 2611 + }, + { + "epoch": 1.1976837518633183, + "grad_norm": 0.12775690853595734, + "learning_rate": 0.0006832192514587372, + "loss": 0.6849, + "step": 2612 + }, + { + "epoch": 1.1981424148606812, + "grad_norm": 0.23146137595176697, + "learning_rate": 0.0006829888119524765, + "loss": 1.2602, + "step": 2613 + }, + { + "epoch": 1.1986010778580438, + "grad_norm": 0.3723481595516205, + "learning_rate": 0.0006827583275577409, + "loss": 1.2093, + "step": 2614 + }, + { + "epoch": 1.1990597408554065, + "grad_norm": 0.2874572277069092, + "learning_rate": 0.0006825277983310697, + "loss": 1.7144, + "step": 2615 + }, + { + "epoch": 1.199518403852769, + "grad_norm": 0.32732078433036804, + "learning_rate": 0.0006822972243290136, + "loss": 1.6793, + "step": 2616 + }, + { + "epoch": 1.199977066850132, + "grad_norm": 0.28288617730140686, + "learning_rate": 0.0006820666056081339, + "loss": 1.5604, + "step": 2617 + }, + { + "epoch": 1.2004357298474946, + "grad_norm": 0.18358604609966278, + "learning_rate": 0.000681835942225003, + "loss": 0.4566, + "step": 2618 + }, + { + "epoch": 1.2008943928448572, + "grad_norm": 0.260851114988327, + "learning_rate": 0.0006816052342362045, + "loss": 0.9449, + "step": 2619 + }, + { + "epoch": 1.2013530558422199, + "grad_norm": 0.22726760804653168, + "learning_rate": 0.0006813744816983324, + "loss": 0.879, + "step": 2620 + }, + { + "epoch": 1.2018117188395827, + "grad_norm": 0.229889914393425, + "learning_rate": 0.0006811436846679923, + "loss": 0.899, + "step": 2621 + }, + { + "epoch": 1.2022703818369453, + "grad_norm": 0.30879998207092285, + "learning_rate": 0.0006809128432018003, + "loss": 1.7426, + "step": 2622 + }, + { + "epoch": 1.202729044834308, + "grad_norm": 0.1792958378791809, + "learning_rate": 0.0006806819573563832, + "loss": 0.9025, + "step": 2623 + }, + { + "epoch": 1.2031877078316706, + "grad_norm": 0.4172597825527191, + "learning_rate": 0.0006804510271883793, + "loss": 2.02, + "step": 2624 + }, + { + "epoch": 1.2036463708290333, + "grad_norm": 0.26834097504615784, + "learning_rate": 0.0006802200527544374, + "loss": 1.1736, + "step": 2625 + }, + { + "epoch": 1.204105033826396, + "grad_norm": 0.2656348645687103, + "learning_rate": 0.000679989034111217, + "loss": 1.2722, + "step": 2626 + }, + { + "epoch": 1.2045636968237587, + "grad_norm": 0.22897177934646606, + "learning_rate": 0.0006797579713153888, + "loss": 1.1406, + "step": 2627 + }, + { + "epoch": 1.2050223598211214, + "grad_norm": 0.2911318838596344, + "learning_rate": 0.0006795268644236341, + "loss": 0.839, + "step": 2628 + }, + { + "epoch": 1.2054810228184842, + "grad_norm": 1.6645565032958984, + "learning_rate": 0.000679295713492645, + "loss": 1.858, + "step": 2629 + }, + { + "epoch": 1.2059396858158469, + "grad_norm": 0.32228031754493713, + "learning_rate": 0.0006790645185791247, + "loss": 2.0117, + "step": 2630 + }, + { + "epoch": 1.2063983488132095, + "grad_norm": 0.1701250672340393, + "learning_rate": 0.0006788332797397868, + "loss": 0.4305, + "step": 2631 + }, + { + "epoch": 1.2068570118105721, + "grad_norm": 0.3705667555332184, + "learning_rate": 0.0006786019970313559, + "loss": 1.7415, + "step": 2632 + }, + { + "epoch": 1.2073156748079348, + "grad_norm": 0.12469058483839035, + "learning_rate": 0.0006783706705105675, + "loss": 1.3766, + "step": 2633 + }, + { + "epoch": 1.2077743378052976, + "grad_norm": 0.5770121216773987, + "learning_rate": 0.0006781393002341674, + "loss": 1.9946, + "step": 2634 + }, + { + "epoch": 1.2082330008026603, + "grad_norm": 0.2683452069759369, + "learning_rate": 0.0006779078862589126, + "loss": 1.5091, + "step": 2635 + }, + { + "epoch": 1.208691663800023, + "grad_norm": 0.3430030643939972, + "learning_rate": 0.0006776764286415704, + "loss": 1.6704, + "step": 2636 + }, + { + "epoch": 1.2091503267973855, + "grad_norm": 0.3244686424732208, + "learning_rate": 0.0006774449274389192, + "loss": 1.5453, + "step": 2637 + }, + { + "epoch": 1.2096089897947484, + "grad_norm": 0.31932759284973145, + "learning_rate": 0.0006772133827077478, + "loss": 1.8912, + "step": 2638 + }, + { + "epoch": 1.210067652792111, + "grad_norm": 0.3642272353172302, + "learning_rate": 0.0006769817945048558, + "loss": 1.7648, + "step": 2639 + }, + { + "epoch": 1.2105263157894737, + "grad_norm": 0.40499117970466614, + "learning_rate": 0.0006767501628870536, + "loss": 1.9568, + "step": 2640 + }, + { + "epoch": 1.2109849787868363, + "grad_norm": 0.2826281189918518, + "learning_rate": 0.0006765184879111621, + "loss": 1.1599, + "step": 2641 + }, + { + "epoch": 1.2114436417841992, + "grad_norm": 0.2369105964899063, + "learning_rate": 0.0006762867696340126, + "loss": 1.1869, + "step": 2642 + }, + { + "epoch": 1.2119023047815618, + "grad_norm": 0.34914588928222656, + "learning_rate": 0.0006760550081124475, + "loss": 1.6072, + "step": 2643 + }, + { + "epoch": 1.2123609677789244, + "grad_norm": 0.8818534016609192, + "learning_rate": 0.0006758232034033193, + "loss": 2.145, + "step": 2644 + }, + { + "epoch": 1.212819630776287, + "grad_norm": 0.3968670070171356, + "learning_rate": 0.0006755913555634918, + "loss": 1.8677, + "step": 2645 + }, + { + "epoch": 1.2132782937736497, + "grad_norm": 0.20291972160339355, + "learning_rate": 0.0006753594646498384, + "loss": 0.8382, + "step": 2646 + }, + { + "epoch": 1.2137369567710126, + "grad_norm": 0.3209551274776459, + "learning_rate": 0.0006751275307192442, + "loss": 1.3622, + "step": 2647 + }, + { + "epoch": 1.2141956197683752, + "grad_norm": 0.12200617045164108, + "learning_rate": 0.000674895553828604, + "loss": 0.7502, + "step": 2648 + }, + { + "epoch": 1.2146542827657378, + "grad_norm": 0.3413892090320587, + "learning_rate": 0.0006746635340348232, + "loss": 2.4104, + "step": 2649 + }, + { + "epoch": 1.2151129457631005, + "grad_norm": 0.3582800626754761, + "learning_rate": 0.0006744314713948182, + "loss": 1.6539, + "step": 2650 + }, + { + "epoch": 1.2155716087604633, + "grad_norm": 0.386416494846344, + "learning_rate": 0.0006741993659655155, + "loss": 1.6932, + "step": 2651 + }, + { + "epoch": 1.216030271757826, + "grad_norm": 0.3171131908893585, + "learning_rate": 0.0006739672178038522, + "loss": 1.5858, + "step": 2652 + }, + { + "epoch": 1.2164889347551886, + "grad_norm": 0.35482868552207947, + "learning_rate": 0.0006737350269667763, + "loss": 1.8237, + "step": 2653 + }, + { + "epoch": 1.2169475977525512, + "grad_norm": 0.3112960457801819, + "learning_rate": 0.0006735027935112454, + "loss": 1.3225, + "step": 2654 + }, + { + "epoch": 1.217406260749914, + "grad_norm": 0.3752141296863556, + "learning_rate": 0.0006732705174942283, + "loss": 1.3334, + "step": 2655 + }, + { + "epoch": 1.2178649237472767, + "grad_norm": 0.29117441177368164, + "learning_rate": 0.0006730381989727038, + "loss": 1.148, + "step": 2656 + }, + { + "epoch": 1.2183235867446394, + "grad_norm": 0.19909793138504028, + "learning_rate": 0.0006728058380036614, + "loss": 1.4814, + "step": 2657 + }, + { + "epoch": 1.218782249742002, + "grad_norm": 0.2802788317203522, + "learning_rate": 0.0006725734346441013, + "loss": 1.1915, + "step": 2658 + }, + { + "epoch": 1.2192409127393646, + "grad_norm": 0.34183627367019653, + "learning_rate": 0.000672340988951033, + "loss": 1.6281, + "step": 2659 + }, + { + "epoch": 1.2196995757367275, + "grad_norm": 0.09204282611608505, + "learning_rate": 0.0006721085009814775, + "loss": 0.5768, + "step": 2660 + }, + { + "epoch": 1.2201582387340901, + "grad_norm": 0.20829735696315765, + "learning_rate": 0.0006718759707924658, + "loss": 1.2005, + "step": 2661 + }, + { + "epoch": 1.2206169017314528, + "grad_norm": 0.23320595920085907, + "learning_rate": 0.0006716433984410391, + "loss": 1.4799, + "step": 2662 + }, + { + "epoch": 1.2210755647288156, + "grad_norm": 0.446114718914032, + "learning_rate": 0.0006714107839842493, + "loss": 1.2414, + "step": 2663 + }, + { + "epoch": 1.2215342277261783, + "grad_norm": 0.26882967352867126, + "learning_rate": 0.0006711781274791582, + "loss": 1.344, + "step": 2664 + }, + { + "epoch": 1.221992890723541, + "grad_norm": 0.20352056622505188, + "learning_rate": 0.000670945428982838, + "loss": 1.0079, + "step": 2665 + }, + { + "epoch": 1.2224515537209035, + "grad_norm": 0.23894716799259186, + "learning_rate": 0.0006707126885523717, + "loss": 1.8065, + "step": 2666 + }, + { + "epoch": 1.2229102167182662, + "grad_norm": 0.3363592028617859, + "learning_rate": 0.000670479906244852, + "loss": 1.2569, + "step": 2667 + }, + { + "epoch": 1.223368879715629, + "grad_norm": 0.30896368622779846, + "learning_rate": 0.000670247082117382, + "loss": 1.9127, + "step": 2668 + }, + { + "epoch": 1.2238275427129917, + "grad_norm": 0.21522556245326996, + "learning_rate": 0.0006700142162270753, + "loss": 1.5338, + "step": 2669 + }, + { + "epoch": 1.2242862057103543, + "grad_norm": 0.30787453055381775, + "learning_rate": 0.0006697813086310553, + "loss": 1.3445, + "step": 2670 + }, + { + "epoch": 1.224744868707717, + "grad_norm": 0.2880922853946686, + "learning_rate": 0.0006695483593864562, + "loss": 1.2805, + "step": 2671 + }, + { + "epoch": 1.2252035317050798, + "grad_norm": 0.2538045346736908, + "learning_rate": 0.0006693153685504221, + "loss": 1.4706, + "step": 2672 + }, + { + "epoch": 1.2256621947024424, + "grad_norm": 0.2669422924518585, + "learning_rate": 0.0006690823361801073, + "loss": 0.8488, + "step": 2673 + }, + { + "epoch": 1.226120857699805, + "grad_norm": 0.09968861192464828, + "learning_rate": 0.0006688492623326762, + "loss": 0.6257, + "step": 2674 + }, + { + "epoch": 1.2265795206971677, + "grad_norm": 0.06321458518505096, + "learning_rate": 0.0006686161470653036, + "loss": 0.8595, + "step": 2675 + }, + { + "epoch": 1.2270381836945305, + "grad_norm": 0.3460922837257385, + "learning_rate": 0.0006683829904351742, + "loss": 2.0936, + "step": 2676 + }, + { + "epoch": 1.2274968466918932, + "grad_norm": 0.28384023904800415, + "learning_rate": 0.0006681497924994834, + "loss": 0.7923, + "step": 2677 + }, + { + "epoch": 1.2279555096892558, + "grad_norm": 0.22630473971366882, + "learning_rate": 0.000667916553315436, + "loss": 1.5095, + "step": 2678 + }, + { + "epoch": 1.2284141726866185, + "grad_norm": 0.3462603688240051, + "learning_rate": 0.0006676832729402472, + "loss": 1.7954, + "step": 2679 + }, + { + "epoch": 1.228872835683981, + "grad_norm": 0.18489515781402588, + "learning_rate": 0.0006674499514311426, + "loss": 0.5131, + "step": 2680 + }, + { + "epoch": 1.229331498681344, + "grad_norm": 0.2563524544239044, + "learning_rate": 0.0006672165888453576, + "loss": 1.455, + "step": 2681 + }, + { + "epoch": 1.2297901616787066, + "grad_norm": 0.28500133752822876, + "learning_rate": 0.0006669831852401375, + "loss": 1.4278, + "step": 2682 + }, + { + "epoch": 1.2302488246760692, + "grad_norm": 0.21729940176010132, + "learning_rate": 0.0006667497406727382, + "loss": 1.6448, + "step": 2683 + }, + { + "epoch": 1.2307074876734319, + "grad_norm": 0.32196274399757385, + "learning_rate": 0.0006665162552004251, + "loss": 1.5613, + "step": 2684 + }, + { + "epoch": 1.2311661506707947, + "grad_norm": 0.2639254927635193, + "learning_rate": 0.0006662827288804739, + "loss": 1.1826, + "step": 2685 + }, + { + "epoch": 1.2316248136681573, + "grad_norm": 0.3229156732559204, + "learning_rate": 0.0006660491617701704, + "loss": 2.0408, + "step": 2686 + }, + { + "epoch": 1.23208347666552, + "grad_norm": 0.3566708266735077, + "learning_rate": 0.0006658155539268103, + "loss": 1.7527, + "step": 2687 + }, + { + "epoch": 1.2325421396628826, + "grad_norm": 0.09745471924543381, + "learning_rate": 0.0006655819054076991, + "loss": 0.9554, + "step": 2688 + }, + { + "epoch": 1.2330008026602455, + "grad_norm": 0.37350574135780334, + "learning_rate": 0.0006653482162701528, + "loss": 1.5167, + "step": 2689 + }, + { + "epoch": 1.2334594656576081, + "grad_norm": 0.3267020285129547, + "learning_rate": 0.0006651144865714966, + "loss": 2.0956, + "step": 2690 + }, + { + "epoch": 1.2339181286549707, + "grad_norm": 0.32065388560295105, + "learning_rate": 0.0006648807163690664, + "loss": 1.6762, + "step": 2691 + }, + { + "epoch": 1.2343767916523334, + "grad_norm": 0.5290999412536621, + "learning_rate": 0.0006646469057202076, + "loss": 1.6888, + "step": 2692 + }, + { + "epoch": 1.234835454649696, + "grad_norm": 0.3192092478275299, + "learning_rate": 0.0006644130546822757, + "loss": 1.5339, + "step": 2693 + }, + { + "epoch": 1.2352941176470589, + "grad_norm": 0.3551962077617645, + "learning_rate": 0.000664179163312636, + "loss": 1.5905, + "step": 2694 + }, + { + "epoch": 1.2357527806444215, + "grad_norm": 0.2710552513599396, + "learning_rate": 0.0006639452316686638, + "loss": 1.394, + "step": 2695 + }, + { + "epoch": 1.2362114436417841, + "grad_norm": 0.3438358008861542, + "learning_rate": 0.0006637112598077441, + "loss": 1.373, + "step": 2696 + }, + { + "epoch": 1.236670106639147, + "grad_norm": 0.33121544122695923, + "learning_rate": 0.0006634772477872719, + "loss": 1.8278, + "step": 2697 + }, + { + "epoch": 1.2371287696365096, + "grad_norm": 0.2892778217792511, + "learning_rate": 0.0006632431956646522, + "loss": 1.3161, + "step": 2698 + }, + { + "epoch": 1.2375874326338723, + "grad_norm": 0.3857962191104889, + "learning_rate": 0.0006630091034972995, + "loss": 1.9475, + "step": 2699 + }, + { + "epoch": 1.238046095631235, + "grad_norm": 0.2920449376106262, + "learning_rate": 0.0006627749713426385, + "loss": 1.6476, + "step": 2700 + }, + { + "epoch": 1.2385047586285975, + "grad_norm": 0.29545149207115173, + "learning_rate": 0.000662540799258103, + "loss": 1.2706, + "step": 2701 + }, + { + "epoch": 1.2389634216259604, + "grad_norm": 0.32311832904815674, + "learning_rate": 0.0006623065873011378, + "loss": 0.8628, + "step": 2702 + }, + { + "epoch": 1.239422084623323, + "grad_norm": 0.20221565663814545, + "learning_rate": 0.0006620723355291963, + "loss": 0.8695, + "step": 2703 + }, + { + "epoch": 1.2398807476206857, + "grad_norm": 0.3797789216041565, + "learning_rate": 0.0006618380439997423, + "loss": 1.8011, + "step": 2704 + }, + { + "epoch": 1.2403394106180483, + "grad_norm": 0.27936404943466187, + "learning_rate": 0.0006616037127702493, + "loss": 1.7474, + "step": 2705 + }, + { + "epoch": 1.2407980736154112, + "grad_norm": 0.305891752243042, + "learning_rate": 0.0006613693418982002, + "loss": 1.4064, + "step": 2706 + }, + { + "epoch": 1.2412567366127738, + "grad_norm": 0.22525465488433838, + "learning_rate": 0.0006611349314410881, + "loss": 1.634, + "step": 2707 + }, + { + "epoch": 1.2417153996101364, + "grad_norm": 0.2716008126735687, + "learning_rate": 0.0006609004814564154, + "loss": 1.3457, + "step": 2708 + }, + { + "epoch": 1.242174062607499, + "grad_norm": 0.28558602929115295, + "learning_rate": 0.0006606659920016945, + "loss": 1.4251, + "step": 2709 + }, + { + "epoch": 1.242632725604862, + "grad_norm": 0.36208319664001465, + "learning_rate": 0.0006604314631344472, + "loss": 1.4049, + "step": 2710 + }, + { + "epoch": 1.2430913886022246, + "grad_norm": 0.3267166316509247, + "learning_rate": 0.0006601968949122053, + "loss": 1.9824, + "step": 2711 + }, + { + "epoch": 1.2435500515995872, + "grad_norm": 0.22049808502197266, + "learning_rate": 0.00065996228739251, + "loss": 0.8636, + "step": 2712 + }, + { + "epoch": 1.2440087145969498, + "grad_norm": 0.17521077394485474, + "learning_rate": 0.0006597276406329122, + "loss": 1.3998, + "step": 2713 + }, + { + "epoch": 1.2444673775943125, + "grad_norm": 0.4500933289527893, + "learning_rate": 0.0006594929546909725, + "loss": 1.4056, + "step": 2714 + }, + { + "epoch": 1.2449260405916753, + "grad_norm": 0.37240904569625854, + "learning_rate": 0.0006592582296242609, + "loss": 1.4709, + "step": 2715 + }, + { + "epoch": 1.245384703589038, + "grad_norm": 0.3110402226448059, + "learning_rate": 0.0006590234654903574, + "loss": 1.8608, + "step": 2716 + }, + { + "epoch": 1.2458433665864006, + "grad_norm": 0.2775774896144867, + "learning_rate": 0.0006587886623468511, + "loss": 1.3253, + "step": 2717 + }, + { + "epoch": 1.2463020295837632, + "grad_norm": 0.2369251698255539, + "learning_rate": 0.000658553820251341, + "loss": 0.7903, + "step": 2718 + }, + { + "epoch": 1.246760692581126, + "grad_norm": 0.25416409969329834, + "learning_rate": 0.0006583189392614356, + "loss": 1.2426, + "step": 2719 + }, + { + "epoch": 1.2472193555784887, + "grad_norm": 0.36366981267929077, + "learning_rate": 0.0006580840194347529, + "loss": 2.1185, + "step": 2720 + }, + { + "epoch": 1.2476780185758514, + "grad_norm": 0.21823568642139435, + "learning_rate": 0.0006578490608289204, + "loss": 0.8336, + "step": 2721 + }, + { + "epoch": 1.248136681573214, + "grad_norm": 0.2580203115940094, + "learning_rate": 0.0006576140635015749, + "loss": 1.243, + "step": 2722 + }, + { + "epoch": 1.2485953445705769, + "grad_norm": 0.19676536321640015, + "learning_rate": 0.0006573790275103635, + "loss": 1.3928, + "step": 2723 + }, + { + "epoch": 1.2490540075679395, + "grad_norm": 0.3913279175758362, + "learning_rate": 0.0006571439529129417, + "loss": 1.6071, + "step": 2724 + }, + { + "epoch": 1.2495126705653021, + "grad_norm": 0.33274513483047485, + "learning_rate": 0.0006569088397669752, + "loss": 1.9368, + "step": 2725 + }, + { + "epoch": 1.2499713335626648, + "grad_norm": 0.3166465759277344, + "learning_rate": 0.0006566736881301389, + "loss": 1.193, + "step": 2726 + }, + { + "epoch": 1.2504299965600274, + "grad_norm": 0.12209037691354752, + "learning_rate": 0.0006564384980601172, + "loss": 0.7718, + "step": 2727 + }, + { + "epoch": 1.2508886595573903, + "grad_norm": 0.19902729988098145, + "learning_rate": 0.000656203269614604, + "loss": 0.83, + "step": 2728 + }, + { + "epoch": 1.251347322554753, + "grad_norm": 0.23850427567958832, + "learning_rate": 0.0006559680028513027, + "loss": 1.1411, + "step": 2729 + }, + { + "epoch": 1.2518059855521155, + "grad_norm": 0.22274386882781982, + "learning_rate": 0.0006557326978279255, + "loss": 1.2014, + "step": 2730 + }, + { + "epoch": 1.2522646485494784, + "grad_norm": 0.2648008465766907, + "learning_rate": 0.0006554973546021946, + "loss": 1.1293, + "step": 2731 + }, + { + "epoch": 1.252723311546841, + "grad_norm": 0.20041297376155853, + "learning_rate": 0.0006552619732318414, + "loss": 0.9521, + "step": 2732 + }, + { + "epoch": 1.2531819745442037, + "grad_norm": 0.3079836666584015, + "learning_rate": 0.0006550265537746068, + "loss": 1.9308, + "step": 2733 + }, + { + "epoch": 1.2536406375415663, + "grad_norm": 0.3721216917037964, + "learning_rate": 0.0006547910962882407, + "loss": 1.7333, + "step": 2734 + }, + { + "epoch": 1.254099300538929, + "grad_norm": 0.37531670928001404, + "learning_rate": 0.0006545556008305025, + "loss": 1.4659, + "step": 2735 + }, + { + "epoch": 1.2545579635362918, + "grad_norm": 0.24703913927078247, + "learning_rate": 0.0006543200674591611, + "loss": 1.6251, + "step": 2736 + }, + { + "epoch": 1.2550166265336544, + "grad_norm": 0.3122749924659729, + "learning_rate": 0.0006540844962319944, + "loss": 0.7767, + "step": 2737 + }, + { + "epoch": 1.255475289531017, + "grad_norm": 0.41236305236816406, + "learning_rate": 0.0006538488872067899, + "loss": 2.0791, + "step": 2738 + }, + { + "epoch": 1.2559339525283797, + "grad_norm": 0.3451601266860962, + "learning_rate": 0.000653613240441344, + "loss": 1.5688, + "step": 2739 + }, + { + "epoch": 1.2563926155257423, + "grad_norm": 0.27274802327156067, + "learning_rate": 0.0006533775559934624, + "loss": 1.2577, + "step": 2740 + }, + { + "epoch": 1.2568512785231052, + "grad_norm": 0.42158329486846924, + "learning_rate": 0.0006531418339209607, + "loss": 1.9504, + "step": 2741 + }, + { + "epoch": 1.2573099415204678, + "grad_norm": 0.336901992559433, + "learning_rate": 0.0006529060742816627, + "loss": 1.6164, + "step": 2742 + }, + { + "epoch": 1.2577686045178305, + "grad_norm": 0.2214047908782959, + "learning_rate": 0.0006526702771334023, + "loss": 1.2476, + "step": 2743 + }, + { + "epoch": 1.2582272675151933, + "grad_norm": 0.2765026092529297, + "learning_rate": 0.000652434442534022, + "loss": 1.2798, + "step": 2744 + }, + { + "epoch": 1.258685930512556, + "grad_norm": 0.4691556692123413, + "learning_rate": 0.0006521985705413741, + "loss": 2.1313, + "step": 2745 + }, + { + "epoch": 1.2591445935099186, + "grad_norm": 0.2610776126384735, + "learning_rate": 0.0006519626612133192, + "loss": 0.9233, + "step": 2746 + }, + { + "epoch": 1.2596032565072812, + "grad_norm": 0.26943257451057434, + "learning_rate": 0.0006517267146077279, + "loss": 1.618, + "step": 2747 + }, + { + "epoch": 1.2600619195046439, + "grad_norm": 0.3554578125476837, + "learning_rate": 0.0006514907307824794, + "loss": 1.9045, + "step": 2748 + }, + { + "epoch": 1.2605205825020067, + "grad_norm": 0.28835994005203247, + "learning_rate": 0.0006512547097954624, + "loss": 0.8852, + "step": 2749 + }, + { + "epoch": 1.2609792454993694, + "grad_norm": 0.12801244854927063, + "learning_rate": 0.0006510186517045744, + "loss": 0.7682, + "step": 2750 + }, + { + "epoch": 1.261437908496732, + "grad_norm": 0.3506546914577484, + "learning_rate": 0.0006507825565677225, + "loss": 2.2786, + "step": 2751 + }, + { + "epoch": 1.2618965714940948, + "grad_norm": 0.36096346378326416, + "learning_rate": 0.0006505464244428219, + "loss": 1.8541, + "step": 2752 + }, + { + "epoch": 1.2623552344914575, + "grad_norm": 0.18552398681640625, + "learning_rate": 0.0006503102553877982, + "loss": 0.4271, + "step": 2753 + }, + { + "epoch": 1.2628138974888201, + "grad_norm": 0.11175768822431564, + "learning_rate": 0.0006500740494605848, + "loss": 0.7008, + "step": 2754 + }, + { + "epoch": 1.2632725604861827, + "grad_norm": 0.34600672125816345, + "learning_rate": 0.0006498378067191252, + "loss": 2.1968, + "step": 2755 + }, + { + "epoch": 1.2637312234835454, + "grad_norm": 0.32166171073913574, + "learning_rate": 0.0006496015272213711, + "loss": 1.4522, + "step": 2756 + }, + { + "epoch": 1.2641898864809082, + "grad_norm": 0.3662795126438141, + "learning_rate": 0.0006493652110252838, + "loss": 2.3257, + "step": 2757 + }, + { + "epoch": 1.2646485494782709, + "grad_norm": 0.2909261882305145, + "learning_rate": 0.000649128858188833, + "loss": 1.1611, + "step": 2758 + }, + { + "epoch": 1.2651072124756335, + "grad_norm": 0.4382424056529999, + "learning_rate": 0.0006488924687699983, + "loss": 1.4101, + "step": 2759 + }, + { + "epoch": 1.2655658754729961, + "grad_norm": 0.17091111838817596, + "learning_rate": 0.0006486560428267674, + "loss": 0.8147, + "step": 2760 + }, + { + "epoch": 1.2660245384703588, + "grad_norm": 0.3439600169658661, + "learning_rate": 0.0006484195804171371, + "loss": 1.7435, + "step": 2761 + }, + { + "epoch": 1.2664832014677216, + "grad_norm": 0.3564503490924835, + "learning_rate": 0.0006481830815991138, + "loss": 1.1747, + "step": 2762 + }, + { + "epoch": 1.2669418644650843, + "grad_norm": 0.2912631034851074, + "learning_rate": 0.000647946546430712, + "loss": 1.6711, + "step": 2763 + }, + { + "epoch": 1.267400527462447, + "grad_norm": 0.4116171598434448, + "learning_rate": 0.0006477099749699557, + "loss": 2.1626, + "step": 2764 + }, + { + "epoch": 1.2678591904598098, + "grad_norm": 0.2863385081291199, + "learning_rate": 0.0006474733672748775, + "loss": 0.9852, + "step": 2765 + }, + { + "epoch": 1.2683178534571724, + "grad_norm": 0.30499163269996643, + "learning_rate": 0.0006472367234035189, + "loss": 1.5042, + "step": 2766 + }, + { + "epoch": 1.268776516454535, + "grad_norm": 0.20567883551120758, + "learning_rate": 0.0006470000434139306, + "loss": 0.7004, + "step": 2767 + }, + { + "epoch": 1.2692351794518977, + "grad_norm": 0.1941119283437729, + "learning_rate": 0.0006467633273641714, + "loss": 1.2625, + "step": 2768 + }, + { + "epoch": 1.2696938424492603, + "grad_norm": 0.33352068066596985, + "learning_rate": 0.00064652657531231, + "loss": 1.3027, + "step": 2769 + }, + { + "epoch": 1.2701525054466232, + "grad_norm": 0.23452205955982208, + "learning_rate": 0.0006462897873164232, + "loss": 1.4857, + "step": 2770 + }, + { + "epoch": 1.2706111684439858, + "grad_norm": 0.3402770757675171, + "learning_rate": 0.0006460529634345967, + "loss": 1.4496, + "step": 2771 + }, + { + "epoch": 1.2710698314413484, + "grad_norm": 0.25416603684425354, + "learning_rate": 0.000645816103724925, + "loss": 1.1967, + "step": 2772 + }, + { + "epoch": 1.271528494438711, + "grad_norm": 0.29919010400772095, + "learning_rate": 0.0006455792082455118, + "loss": 1.3418, + "step": 2773 + }, + { + "epoch": 1.2719871574360737, + "grad_norm": 0.2903623580932617, + "learning_rate": 0.0006453422770544692, + "loss": 1.3553, + "step": 2774 + }, + { + "epoch": 1.2724458204334366, + "grad_norm": 0.478496253490448, + "learning_rate": 0.0006451053102099181, + "loss": 2.0164, + "step": 2775 + }, + { + "epoch": 1.2729044834307992, + "grad_norm": 0.2910837233066559, + "learning_rate": 0.000644868307769988, + "loss": 1.1557, + "step": 2776 + }, + { + "epoch": 1.2733631464281618, + "grad_norm": 0.19006802141666412, + "learning_rate": 0.0006446312697928176, + "loss": 1.3286, + "step": 2777 + }, + { + "epoch": 1.2738218094255247, + "grad_norm": 0.23939000070095062, + "learning_rate": 0.0006443941963365539, + "loss": 0.9206, + "step": 2778 + }, + { + "epoch": 1.2742804724228873, + "grad_norm": 0.34608039259910583, + "learning_rate": 0.0006441570874593524, + "loss": 2.2271, + "step": 2779 + }, + { + "epoch": 1.27473913542025, + "grad_norm": 0.3570116460323334, + "learning_rate": 0.0006439199432193782, + "loss": 1.7172, + "step": 2780 + }, + { + "epoch": 1.2751977984176126, + "grad_norm": 0.21899329125881195, + "learning_rate": 0.0006436827636748042, + "loss": 1.1881, + "step": 2781 + }, + { + "epoch": 1.2756564614149752, + "grad_norm": 0.2552841305732727, + "learning_rate": 0.0006434455488838121, + "loss": 1.5012, + "step": 2782 + }, + { + "epoch": 1.276115124412338, + "grad_norm": 0.3439258635044098, + "learning_rate": 0.0006432082989045926, + "loss": 1.7224, + "step": 2783 + }, + { + "epoch": 1.2765737874097007, + "grad_norm": 0.12296123802661896, + "learning_rate": 0.000642971013795345, + "loss": 1.0084, + "step": 2784 + }, + { + "epoch": 1.2770324504070634, + "grad_norm": 0.37332117557525635, + "learning_rate": 0.0006427336936142766, + "loss": 1.8006, + "step": 2785 + }, + { + "epoch": 1.2774911134044262, + "grad_norm": 0.1939849704504013, + "learning_rate": 0.0006424963384196041, + "loss": 0.8541, + "step": 2786 + }, + { + "epoch": 1.2779497764017889, + "grad_norm": 0.3263346552848816, + "learning_rate": 0.0006422589482695523, + "loss": 1.8838, + "step": 2787 + }, + { + "epoch": 1.2784084393991515, + "grad_norm": 0.07417495548725128, + "learning_rate": 0.0006420215232223548, + "loss": 0.3335, + "step": 2788 + }, + { + "epoch": 1.2788671023965141, + "grad_norm": 0.27612733840942383, + "learning_rate": 0.0006417840633362535, + "loss": 1.3939, + "step": 2789 + }, + { + "epoch": 1.2793257653938768, + "grad_norm": 0.31349727511405945, + "learning_rate": 0.0006415465686694993, + "loss": 1.7898, + "step": 2790 + }, + { + "epoch": 1.2797844283912396, + "grad_norm": 0.3115302622318268, + "learning_rate": 0.0006413090392803511, + "loss": 1.4323, + "step": 2791 + }, + { + "epoch": 1.2802430913886023, + "grad_norm": 0.29694634675979614, + "learning_rate": 0.0006410714752270769, + "loss": 1.7373, + "step": 2792 + }, + { + "epoch": 1.280701754385965, + "grad_norm": 0.2605624496936798, + "learning_rate": 0.0006408338765679525, + "loss": 1.2168, + "step": 2793 + }, + { + "epoch": 1.2811604173833275, + "grad_norm": 0.4056708514690399, + "learning_rate": 0.0006405962433612625, + "loss": 1.701, + "step": 2794 + }, + { + "epoch": 1.2816190803806902, + "grad_norm": 0.30486351251602173, + "learning_rate": 0.0006403585756653004, + "loss": 0.8638, + "step": 2795 + }, + { + "epoch": 1.282077743378053, + "grad_norm": 0.27140724658966064, + "learning_rate": 0.0006401208735383677, + "loss": 1.5908, + "step": 2796 + }, + { + "epoch": 1.2825364063754157, + "grad_norm": 0.30335965752601624, + "learning_rate": 0.0006398831370387744, + "loss": 1.1826, + "step": 2797 + }, + { + "epoch": 1.2829950693727783, + "grad_norm": 0.22256600856781006, + "learning_rate": 0.0006396453662248391, + "loss": 1.0292, + "step": 2798 + }, + { + "epoch": 1.2834537323701412, + "grad_norm": 0.34704825282096863, + "learning_rate": 0.0006394075611548882, + "loss": 1.5649, + "step": 2799 + }, + { + "epoch": 1.2839123953675038, + "grad_norm": 0.4331229031085968, + "learning_rate": 0.0006391697218872576, + "loss": 1.5117, + "step": 2800 + }, + { + "epoch": 1.2843710583648664, + "grad_norm": 0.25863364338874817, + "learning_rate": 0.0006389318484802908, + "loss": 1.5621, + "step": 2801 + }, + { + "epoch": 1.284829721362229, + "grad_norm": 0.31081312894821167, + "learning_rate": 0.0006386939409923398, + "loss": 1.7057, + "step": 2802 + }, + { + "epoch": 1.2852883843595917, + "grad_norm": 0.3220570683479309, + "learning_rate": 0.0006384559994817649, + "loss": 0.9301, + "step": 2803 + }, + { + "epoch": 1.2857470473569546, + "grad_norm": 0.32526329159736633, + "learning_rate": 0.0006382180240069352, + "loss": 2.1208, + "step": 2804 + }, + { + "epoch": 1.2862057103543172, + "grad_norm": 0.33022022247314453, + "learning_rate": 0.0006379800146262274, + "loss": 1.1948, + "step": 2805 + }, + { + "epoch": 1.2866643733516798, + "grad_norm": 0.28297072649002075, + "learning_rate": 0.0006377419713980274, + "loss": 1.351, + "step": 2806 + }, + { + "epoch": 1.2871230363490425, + "grad_norm": 0.23854339122772217, + "learning_rate": 0.0006375038943807284, + "loss": 1.46, + "step": 2807 + }, + { + "epoch": 1.287581699346405, + "grad_norm": 0.2918485999107361, + "learning_rate": 0.0006372657836327328, + "loss": 0.8088, + "step": 2808 + }, + { + "epoch": 1.288040362343768, + "grad_norm": 0.19841410219669342, + "learning_rate": 0.0006370276392124506, + "loss": 1.2504, + "step": 2809 + }, + { + "epoch": 1.2884990253411306, + "grad_norm": 0.28576546907424927, + "learning_rate": 0.0006367894611783006, + "loss": 1.4295, + "step": 2810 + }, + { + "epoch": 1.2889576883384932, + "grad_norm": 0.32226133346557617, + "learning_rate": 0.0006365512495887094, + "loss": 1.6712, + "step": 2811 + }, + { + "epoch": 1.289416351335856, + "grad_norm": 0.29555824398994446, + "learning_rate": 0.0006363130045021121, + "loss": 1.2164, + "step": 2812 + }, + { + "epoch": 1.2898750143332187, + "grad_norm": 0.17073696851730347, + "learning_rate": 0.0006360747259769521, + "loss": 0.9707, + "step": 2813 + }, + { + "epoch": 1.2903336773305814, + "grad_norm": 0.19965320825576782, + "learning_rate": 0.0006358364140716805, + "loss": 1.3948, + "step": 2814 + }, + { + "epoch": 1.290792340327944, + "grad_norm": 0.2776814103126526, + "learning_rate": 0.0006355980688447571, + "loss": 1.6216, + "step": 2815 + }, + { + "epoch": 1.2912510033253066, + "grad_norm": 0.29604098200798035, + "learning_rate": 0.0006353596903546497, + "loss": 1.0638, + "step": 2816 + }, + { + "epoch": 1.2917096663226695, + "grad_norm": 0.2199414223432541, + "learning_rate": 0.0006351212786598341, + "loss": 1.8243, + "step": 2817 + }, + { + "epoch": 1.2921683293200321, + "grad_norm": 0.4345381259918213, + "learning_rate": 0.0006348828338187949, + "loss": 2.1404, + "step": 2818 + }, + { + "epoch": 1.2926269923173948, + "grad_norm": 0.303279310464859, + "learning_rate": 0.0006346443558900239, + "loss": 1.6801, + "step": 2819 + }, + { + "epoch": 1.2930856553147576, + "grad_norm": 0.32503482699394226, + "learning_rate": 0.0006344058449320215, + "loss": 1.7594, + "step": 2820 + }, + { + "epoch": 1.2935443183121202, + "grad_norm": 0.2687481641769409, + "learning_rate": 0.0006341673010032962, + "loss": 0.9268, + "step": 2821 + }, + { + "epoch": 1.2940029813094829, + "grad_norm": 0.21548904478549957, + "learning_rate": 0.0006339287241623646, + "loss": 1.2141, + "step": 2822 + }, + { + "epoch": 1.2944616443068455, + "grad_norm": 0.2961852550506592, + "learning_rate": 0.0006336901144677514, + "loss": 1.4321, + "step": 2823 + }, + { + "epoch": 1.2949203073042082, + "grad_norm": 0.3688824474811554, + "learning_rate": 0.0006334514719779891, + "loss": 1.7928, + "step": 2824 + }, + { + "epoch": 1.295378970301571, + "grad_norm": 0.3493098318576813, + "learning_rate": 0.0006332127967516182, + "loss": 1.9301, + "step": 2825 + }, + { + "epoch": 1.2958376332989336, + "grad_norm": 0.344375878572464, + "learning_rate": 0.0006329740888471881, + "loss": 1.4666, + "step": 2826 + }, + { + "epoch": 1.2962962962962963, + "grad_norm": 0.36221787333488464, + "learning_rate": 0.000632735348323255, + "loss": 1.4453, + "step": 2827 + }, + { + "epoch": 1.296754959293659, + "grad_norm": 0.3239736557006836, + "learning_rate": 0.000632496575238384, + "loss": 1.926, + "step": 2828 + }, + { + "epoch": 1.2972136222910216, + "grad_norm": 0.20216989517211914, + "learning_rate": 0.0006322577696511479, + "loss": 0.4995, + "step": 2829 + }, + { + "epoch": 1.2976722852883844, + "grad_norm": 0.713234007358551, + "learning_rate": 0.000632018931620127, + "loss": 1.2388, + "step": 2830 + }, + { + "epoch": 1.298130948285747, + "grad_norm": 0.39462095499038696, + "learning_rate": 0.0006317800612039103, + "loss": 1.3626, + "step": 2831 + }, + { + "epoch": 1.2985896112831097, + "grad_norm": 0.3811722993850708, + "learning_rate": 0.0006315411584610946, + "loss": 1.5548, + "step": 2832 + }, + { + "epoch": 1.2990482742804725, + "grad_norm": 0.1306716352701187, + "learning_rate": 0.0006313022234502841, + "loss": 1.1012, + "step": 2833 + }, + { + "epoch": 1.2995069372778352, + "grad_norm": 0.37927016615867615, + "learning_rate": 0.0006310632562300917, + "loss": 1.6273, + "step": 2834 + }, + { + "epoch": 1.2999656002751978, + "grad_norm": 0.19377583265304565, + "learning_rate": 0.0006308242568591371, + "loss": 1.2111, + "step": 2835 + }, + { + "epoch": 1.3004242632725604, + "grad_norm": 0.3456363379955292, + "learning_rate": 0.0006305852253960493, + "loss": 2.2551, + "step": 2836 + }, + { + "epoch": 1.300882926269923, + "grad_norm": 0.3247735798358917, + "learning_rate": 0.0006303461618994642, + "loss": 1.5282, + "step": 2837 + }, + { + "epoch": 1.301341589267286, + "grad_norm": 0.22547945380210876, + "learning_rate": 0.0006301070664280255, + "loss": 1.5934, + "step": 2838 + }, + { + "epoch": 1.3018002522646486, + "grad_norm": 0.26010963320732117, + "learning_rate": 0.0006298679390403854, + "loss": 1.6635, + "step": 2839 + }, + { + "epoch": 1.3022589152620112, + "grad_norm": 0.2988232672214508, + "learning_rate": 0.0006296287797952034, + "loss": 1.1585, + "step": 2840 + }, + { + "epoch": 1.3027175782593738, + "grad_norm": 0.3130618631839752, + "learning_rate": 0.0006293895887511472, + "loss": 2.3552, + "step": 2841 + }, + { + "epoch": 1.3031762412567365, + "grad_norm": 0.40085339546203613, + "learning_rate": 0.0006291503659668916, + "loss": 1.7498, + "step": 2842 + }, + { + "epoch": 1.3036349042540993, + "grad_norm": 0.3021315336227417, + "learning_rate": 0.0006289111115011204, + "loss": 1.6539, + "step": 2843 + }, + { + "epoch": 1.304093567251462, + "grad_norm": 0.32519641518592834, + "learning_rate": 0.0006286718254125238, + "loss": 1.626, + "step": 2844 + }, + { + "epoch": 1.3045522302488246, + "grad_norm": 0.34999242424964905, + "learning_rate": 0.0006284325077598008, + "loss": 1.4192, + "step": 2845 + }, + { + "epoch": 1.3050108932461875, + "grad_norm": 0.48127058148384094, + "learning_rate": 0.0006281931586016576, + "loss": 1.2933, + "step": 2846 + }, + { + "epoch": 1.30546955624355, + "grad_norm": 0.3110017776489258, + "learning_rate": 0.0006279537779968082, + "loss": 1.4093, + "step": 2847 + }, + { + "epoch": 1.3059282192409127, + "grad_norm": 0.28830286860466003, + "learning_rate": 0.0006277143660039746, + "loss": 1.553, + "step": 2848 + }, + { + "epoch": 1.3063868822382754, + "grad_norm": 0.29233160614967346, + "learning_rate": 0.000627474922681886, + "loss": 1.988, + "step": 2849 + }, + { + "epoch": 1.306845545235638, + "grad_norm": 0.3408198058605194, + "learning_rate": 0.0006272354480892797, + "loss": 1.4908, + "step": 2850 + }, + { + "epoch": 1.3073042082330009, + "grad_norm": 0.3165978491306305, + "learning_rate": 0.0006269959422849007, + "loss": 1.6967, + "step": 2851 + }, + { + "epoch": 1.3077628712303635, + "grad_norm": 0.2932069003582001, + "learning_rate": 0.0006267564053275012, + "loss": 1.9072, + "step": 2852 + }, + { + "epoch": 1.3082215342277261, + "grad_norm": 0.35463714599609375, + "learning_rate": 0.0006265168372758417, + "loss": 1.5883, + "step": 2853 + }, + { + "epoch": 1.308680197225089, + "grad_norm": 0.32562148571014404, + "learning_rate": 0.0006262772381886896, + "loss": 1.4199, + "step": 2854 + }, + { + "epoch": 1.3091388602224516, + "grad_norm": 0.3598746955394745, + "learning_rate": 0.0006260376081248205, + "loss": 1.2422, + "step": 2855 + }, + { + "epoch": 1.3095975232198143, + "grad_norm": 0.23722459375858307, + "learning_rate": 0.0006257979471430174, + "loss": 1.5364, + "step": 2856 + }, + { + "epoch": 1.310056186217177, + "grad_norm": 0.25034475326538086, + "learning_rate": 0.0006255582553020708, + "loss": 1.2079, + "step": 2857 + }, + { + "epoch": 1.3105148492145395, + "grad_norm": 0.3492996096611023, + "learning_rate": 0.000625318532660779, + "loss": 1.614, + "step": 2858 + }, + { + "epoch": 1.3109735122119024, + "grad_norm": 0.3420841097831726, + "learning_rate": 0.0006250787792779473, + "loss": 1.7344, + "step": 2859 + }, + { + "epoch": 1.311432175209265, + "grad_norm": 0.3418578505516052, + "learning_rate": 0.0006248389952123895, + "loss": 2.0039, + "step": 2860 + }, + { + "epoch": 1.3118908382066277, + "grad_norm": 0.4057673513889313, + "learning_rate": 0.0006245991805229256, + "loss": 1.4351, + "step": 2861 + }, + { + "epoch": 1.3123495012039903, + "grad_norm": 0.1766156405210495, + "learning_rate": 0.0006243593352683848, + "loss": 0.7945, + "step": 2862 + }, + { + "epoch": 1.312808164201353, + "grad_norm": 0.32402530312538147, + "learning_rate": 0.0006241194595076019, + "loss": 1.3118, + "step": 2863 + }, + { + "epoch": 1.3132668271987158, + "grad_norm": 0.24747517704963684, + "learning_rate": 0.000623879553299421, + "loss": 1.4999, + "step": 2864 + }, + { + "epoch": 1.3137254901960784, + "grad_norm": 0.4034758508205414, + "learning_rate": 0.0006236396167026923, + "loss": 1.2577, + "step": 2865 + }, + { + "epoch": 1.314184153193441, + "grad_norm": 0.2698371410369873, + "learning_rate": 0.0006233996497762741, + "loss": 1.4708, + "step": 2866 + }, + { + "epoch": 1.314642816190804, + "grad_norm": 0.26158544421195984, + "learning_rate": 0.000623159652579032, + "loss": 1.4747, + "step": 2867 + }, + { + "epoch": 1.3151014791881666, + "grad_norm": 0.3202878534793854, + "learning_rate": 0.0006229196251698394, + "loss": 1.4017, + "step": 2868 + }, + { + "epoch": 1.3155601421855292, + "grad_norm": 0.33199355006217957, + "learning_rate": 0.0006226795676075762, + "loss": 1.305, + "step": 2869 + }, + { + "epoch": 1.3160188051828918, + "grad_norm": 0.29111596941947937, + "learning_rate": 0.0006224394799511306, + "loss": 1.4327, + "step": 2870 + }, + { + "epoch": 1.3164774681802545, + "grad_norm": 0.2958594858646393, + "learning_rate": 0.0006221993622593975, + "loss": 1.2742, + "step": 2871 + }, + { + "epoch": 1.3169361311776173, + "grad_norm": 0.08179383724927902, + "learning_rate": 0.00062195921459128, + "loss": 0.8383, + "step": 2872 + }, + { + "epoch": 1.31739479417498, + "grad_norm": 0.3127470314502716, + "learning_rate": 0.0006217190370056876, + "loss": 1.7184, + "step": 2873 + }, + { + "epoch": 1.3178534571723426, + "grad_norm": 0.2972666621208191, + "learning_rate": 0.0006214788295615381, + "loss": 1.6077, + "step": 2874 + }, + { + "epoch": 1.3183121201697052, + "grad_norm": 0.3870674669742584, + "learning_rate": 0.0006212385923177556, + "loss": 1.2744, + "step": 2875 + }, + { + "epoch": 1.3187707831670679, + "grad_norm": 0.2808718979358673, + "learning_rate": 0.0006209983253332721, + "loss": 1.2932, + "step": 2876 + }, + { + "epoch": 1.3192294461644307, + "grad_norm": 0.2264634668827057, + "learning_rate": 0.0006207580286670271, + "loss": 1.3213, + "step": 2877 + }, + { + "epoch": 1.3196881091617934, + "grad_norm": 0.3030881881713867, + "learning_rate": 0.0006205177023779669, + "loss": 1.1255, + "step": 2878 + }, + { + "epoch": 1.320146772159156, + "grad_norm": 0.3098306953907013, + "learning_rate": 0.0006202773465250456, + "loss": 1.3304, + "step": 2879 + }, + { + "epoch": 1.3206054351565188, + "grad_norm": 0.26796406507492065, + "learning_rate": 0.0006200369611672236, + "loss": 1.2411, + "step": 2880 + }, + { + "epoch": 1.3210640981538815, + "grad_norm": 0.20453116297721863, + "learning_rate": 0.0006197965463634696, + "loss": 0.9436, + "step": 2881 + }, + { + "epoch": 1.3215227611512441, + "grad_norm": 0.22560381889343262, + "learning_rate": 0.000619556102172759, + "loss": 1.673, + "step": 2882 + }, + { + "epoch": 1.3219814241486068, + "grad_norm": 0.24689380824565887, + "learning_rate": 0.0006193156286540745, + "loss": 0.8455, + "step": 2883 + }, + { + "epoch": 1.3224400871459694, + "grad_norm": 0.24661903083324432, + "learning_rate": 0.000619075125866406, + "loss": 0.9642, + "step": 2884 + }, + { + "epoch": 1.3228987501433322, + "grad_norm": 0.2800322473049164, + "learning_rate": 0.0006188345938687506, + "loss": 1.3403, + "step": 2885 + }, + { + "epoch": 1.3233574131406949, + "grad_norm": 0.21044687926769257, + "learning_rate": 0.0006185940327201125, + "loss": 0.7411, + "step": 2886 + }, + { + "epoch": 1.3238160761380575, + "grad_norm": 0.319131463766098, + "learning_rate": 0.000618353442479503, + "loss": 1.756, + "step": 2887 + }, + { + "epoch": 1.3242747391354204, + "grad_norm": 0.37177905440330505, + "learning_rate": 0.0006181128232059407, + "loss": 1.2766, + "step": 2888 + }, + { + "epoch": 1.324733402132783, + "grad_norm": 0.30548402667045593, + "learning_rate": 0.0006178721749584512, + "loss": 1.2389, + "step": 2889 + }, + { + "epoch": 1.3251920651301456, + "grad_norm": 0.30189475417137146, + "learning_rate": 0.0006176314977960673, + "loss": 1.2825, + "step": 2890 + }, + { + "epoch": 1.3256507281275083, + "grad_norm": 0.11458157002925873, + "learning_rate": 0.0006173907917778288, + "loss": 0.5948, + "step": 2891 + }, + { + "epoch": 1.326109391124871, + "grad_norm": 0.2670215666294098, + "learning_rate": 0.0006171500569627827, + "loss": 0.8963, + "step": 2892 + }, + { + "epoch": 1.3265680541222338, + "grad_norm": 0.23610422015190125, + "learning_rate": 0.0006169092934099828, + "loss": 1.4451, + "step": 2893 + }, + { + "epoch": 1.3270267171195964, + "grad_norm": 0.3228674530982971, + "learning_rate": 0.0006166685011784904, + "loss": 1.6901, + "step": 2894 + }, + { + "epoch": 1.327485380116959, + "grad_norm": 0.323810875415802, + "learning_rate": 0.0006164276803273734, + "loss": 1.6255, + "step": 2895 + }, + { + "epoch": 1.3279440431143217, + "grad_norm": 0.2585465908050537, + "learning_rate": 0.000616186830915707, + "loss": 0.8431, + "step": 2896 + }, + { + "epoch": 1.3284027061116843, + "grad_norm": 0.2836966812610626, + "learning_rate": 0.000615945953002573, + "loss": 1.4153, + "step": 2897 + }, + { + "epoch": 1.3288613691090472, + "grad_norm": 0.3975668251514435, + "learning_rate": 0.0006157050466470608, + "loss": 1.78, + "step": 2898 + }, + { + "epoch": 1.3293200321064098, + "grad_norm": 0.3772067129611969, + "learning_rate": 0.0006154641119082666, + "loss": 2.2615, + "step": 2899 + }, + { + "epoch": 1.3297786951037724, + "grad_norm": 0.39629077911376953, + "learning_rate": 0.0006152231488452931, + "loss": 1.9524, + "step": 2900 + }, + { + "epoch": 1.3302373581011353, + "grad_norm": 0.26282697916030884, + "learning_rate": 0.0006149821575172502, + "loss": 0.9766, + "step": 2901 + }, + { + "epoch": 1.330696021098498, + "grad_norm": 0.22965925931930542, + "learning_rate": 0.0006147411379832553, + "loss": 1.179, + "step": 2902 + }, + { + "epoch": 1.3311546840958606, + "grad_norm": 0.2938666045665741, + "learning_rate": 0.0006145000903024317, + "loss": 1.5908, + "step": 2903 + }, + { + "epoch": 1.3316133470932232, + "grad_norm": 0.35874783992767334, + "learning_rate": 0.0006142590145339106, + "loss": 1.3643, + "step": 2904 + }, + { + "epoch": 1.3320720100905858, + "grad_norm": 0.3151531219482422, + "learning_rate": 0.0006140179107368291, + "loss": 1.5058, + "step": 2905 + }, + { + "epoch": 1.3325306730879487, + "grad_norm": 0.3742069602012634, + "learning_rate": 0.0006137767789703322, + "loss": 1.6921, + "step": 2906 + }, + { + "epoch": 1.3329893360853113, + "grad_norm": 0.19927988946437836, + "learning_rate": 0.0006135356192935709, + "loss": 1.2602, + "step": 2907 + }, + { + "epoch": 1.333447999082674, + "grad_norm": 0.31676074862480164, + "learning_rate": 0.0006132944317657035, + "loss": 1.7969, + "step": 2908 + }, + { + "epoch": 1.3339066620800368, + "grad_norm": 0.2802976965904236, + "learning_rate": 0.0006130532164458952, + "loss": 1.3497, + "step": 2909 + }, + { + "epoch": 1.3343653250773992, + "grad_norm": 0.3053966760635376, + "learning_rate": 0.0006128119733933179, + "loss": 1.7183, + "step": 2910 + }, + { + "epoch": 1.334823988074762, + "grad_norm": 0.3425951302051544, + "learning_rate": 0.0006125707026671498, + "loss": 1.333, + "step": 2911 + }, + { + "epoch": 1.3352826510721247, + "grad_norm": 0.33375629782676697, + "learning_rate": 0.0006123294043265768, + "loss": 2.2371, + "step": 2912 + }, + { + "epoch": 1.3357413140694874, + "grad_norm": 0.358460396528244, + "learning_rate": 0.000612088078430791, + "loss": 1.4041, + "step": 2913 + }, + { + "epoch": 1.3361999770668502, + "grad_norm": 0.30156955122947693, + "learning_rate": 0.0006118467250389914, + "loss": 2.1921, + "step": 2914 + }, + { + "epoch": 1.3366586400642129, + "grad_norm": 0.32735612988471985, + "learning_rate": 0.0006116053442103836, + "loss": 1.3602, + "step": 2915 + }, + { + "epoch": 1.3371173030615755, + "grad_norm": 0.2227545529603958, + "learning_rate": 0.0006113639360041803, + "loss": 0.9218, + "step": 2916 + }, + { + "epoch": 1.3375759660589381, + "grad_norm": 0.28981199860572815, + "learning_rate": 0.0006111225004796004, + "loss": 1.1524, + "step": 2917 + }, + { + "epoch": 1.3380346290563008, + "grad_norm": 0.2632843554019928, + "learning_rate": 0.0006108810376958699, + "loss": 1.2719, + "step": 2918 + }, + { + "epoch": 1.3384932920536636, + "grad_norm": 0.3473341166973114, + "learning_rate": 0.0006106395477122213, + "loss": 1.3489, + "step": 2919 + }, + { + "epoch": 1.3389519550510263, + "grad_norm": 0.10266976803541183, + "learning_rate": 0.000610398030587894, + "loss": 0.5534, + "step": 2920 + }, + { + "epoch": 1.339410618048389, + "grad_norm": 0.29631996154785156, + "learning_rate": 0.0006101564863821338, + "loss": 1.2847, + "step": 2921 + }, + { + "epoch": 1.3398692810457518, + "grad_norm": 0.2845175266265869, + "learning_rate": 0.0006099149151541931, + "loss": 1.7554, + "step": 2922 + }, + { + "epoch": 1.3403279440431144, + "grad_norm": 0.2774314284324646, + "learning_rate": 0.0006096733169633314, + "loss": 1.5494, + "step": 2923 + }, + { + "epoch": 1.340786607040477, + "grad_norm": 0.31548067927360535, + "learning_rate": 0.0006094316918688141, + "loss": 1.1424, + "step": 2924 + }, + { + "epoch": 1.3412452700378397, + "grad_norm": 0.357732892036438, + "learning_rate": 0.0006091900399299138, + "loss": 1.3782, + "step": 2925 + }, + { + "epoch": 1.3417039330352023, + "grad_norm": 0.2636417746543884, + "learning_rate": 0.0006089483612059093, + "loss": 1.6639, + "step": 2926 + }, + { + "epoch": 1.3421625960325652, + "grad_norm": 0.32517683506011963, + "learning_rate": 0.0006087066557560865, + "loss": 1.743, + "step": 2927 + }, + { + "epoch": 1.3426212590299278, + "grad_norm": 0.3475479185581207, + "learning_rate": 0.0006084649236397369, + "loss": 1.6309, + "step": 2928 + }, + { + "epoch": 1.3430799220272904, + "grad_norm": 0.3126490116119385, + "learning_rate": 0.0006082231649161595, + "loss": 1.4764, + "step": 2929 + }, + { + "epoch": 1.343538585024653, + "grad_norm": 0.24782423675060272, + "learning_rate": 0.0006079813796446596, + "loss": 0.7667, + "step": 2930 + }, + { + "epoch": 1.3439972480220157, + "grad_norm": 0.28448745608329773, + "learning_rate": 0.0006077395678845486, + "loss": 1.4008, + "step": 2931 + }, + { + "epoch": 1.3444559110193786, + "grad_norm": 0.1474987417459488, + "learning_rate": 0.0006074977296951448, + "loss": 0.5947, + "step": 2932 + }, + { + "epoch": 1.3449145740167412, + "grad_norm": 0.20980101823806763, + "learning_rate": 0.0006072558651357727, + "loss": 1.2209, + "step": 2933 + }, + { + "epoch": 1.3453732370141038, + "grad_norm": 0.377865731716156, + "learning_rate": 0.0006070139742657635, + "loss": 2.0645, + "step": 2934 + }, + { + "epoch": 1.3458319000114667, + "grad_norm": 0.47778239846229553, + "learning_rate": 0.0006067720571444551, + "loss": 1.5841, + "step": 2935 + }, + { + "epoch": 1.3462905630088293, + "grad_norm": 0.3798399865627289, + "learning_rate": 0.000606530113831191, + "loss": 1.7783, + "step": 2936 + }, + { + "epoch": 1.346749226006192, + "grad_norm": 0.21647045016288757, + "learning_rate": 0.0006062881443853218, + "loss": 0.4051, + "step": 2937 + }, + { + "epoch": 1.3472078890035546, + "grad_norm": 0.3337489664554596, + "learning_rate": 0.0006060461488662044, + "loss": 2.0281, + "step": 2938 + }, + { + "epoch": 1.3476665520009172, + "grad_norm": 0.3525756895542145, + "learning_rate": 0.0006058041273332021, + "loss": 1.5044, + "step": 2939 + }, + { + "epoch": 1.34812521499828, + "grad_norm": 0.24019671976566315, + "learning_rate": 0.0006055620798456845, + "loss": 1.3129, + "step": 2940 + }, + { + "epoch": 1.3485838779956427, + "grad_norm": 0.37379053235054016, + "learning_rate": 0.0006053200064630276, + "loss": 2.0564, + "step": 2941 + }, + { + "epoch": 1.3490425409930054, + "grad_norm": 0.2772229015827179, + "learning_rate": 0.0006050779072446137, + "loss": 1.0508, + "step": 2942 + }, + { + "epoch": 1.3495012039903682, + "grad_norm": 0.15392114222049713, + "learning_rate": 0.0006048357822498315, + "loss": 1.0146, + "step": 2943 + }, + { + "epoch": 1.3499598669877306, + "grad_norm": 0.3035467267036438, + "learning_rate": 0.000604593631538076, + "loss": 1.5189, + "step": 2944 + }, + { + "epoch": 1.3504185299850935, + "grad_norm": 0.3849334120750427, + "learning_rate": 0.0006043514551687484, + "loss": 1.7883, + "step": 2945 + }, + { + "epoch": 1.3508771929824561, + "grad_norm": 0.36944082379341125, + "learning_rate": 0.0006041092532012566, + "loss": 1.8156, + "step": 2946 + }, + { + "epoch": 1.3513358559798188, + "grad_norm": 0.5630089044570923, + "learning_rate": 0.0006038670256950144, + "loss": 1.1769, + "step": 2947 + }, + { + "epoch": 1.3517945189771816, + "grad_norm": 0.22693516314029694, + "learning_rate": 0.0006036247727094418, + "loss": 1.3827, + "step": 2948 + }, + { + "epoch": 1.3522531819745442, + "grad_norm": 0.2991299033164978, + "learning_rate": 0.0006033824943039651, + "loss": 0.9545, + "step": 2949 + }, + { + "epoch": 1.3527118449719069, + "grad_norm": 0.21848253905773163, + "learning_rate": 0.0006031401905380173, + "loss": 0.9471, + "step": 2950 + }, + { + "epoch": 1.3531705079692695, + "grad_norm": 0.2379998415708542, + "learning_rate": 0.000602897861471037, + "loss": 1.1672, + "step": 2951 + }, + { + "epoch": 1.3536291709666322, + "grad_norm": 0.28074660897254944, + "learning_rate": 0.0006026555071624694, + "loss": 1.7654, + "step": 2952 + }, + { + "epoch": 1.354087833963995, + "grad_norm": 0.4257287383079529, + "learning_rate": 0.0006024131276717657, + "loss": 2.002, + "step": 2953 + }, + { + "epoch": 1.3545464969613576, + "grad_norm": 0.9441617131233215, + "learning_rate": 0.0006021707230583834, + "loss": 1.7286, + "step": 2954 + }, + { + "epoch": 1.3550051599587203, + "grad_norm": 0.24226488173007965, + "learning_rate": 0.0006019282933817859, + "loss": 1.2914, + "step": 2955 + }, + { + "epoch": 1.3554638229560831, + "grad_norm": 0.3882712125778198, + "learning_rate": 0.0006016858387014432, + "loss": 2.168, + "step": 2956 + }, + { + "epoch": 1.3559224859534458, + "grad_norm": 0.2512166202068329, + "learning_rate": 0.0006014433590768311, + "loss": 1.1456, + "step": 2957 + }, + { + "epoch": 1.3563811489508084, + "grad_norm": 0.17546239495277405, + "learning_rate": 0.0006012008545674316, + "loss": 1.2354, + "step": 2958 + }, + { + "epoch": 1.356839811948171, + "grad_norm": 0.35493841767311096, + "learning_rate": 0.0006009583252327326, + "loss": 1.7339, + "step": 2959 + }, + { + "epoch": 1.3572984749455337, + "grad_norm": 0.381168931722641, + "learning_rate": 0.0006007157711322286, + "loss": 2.2961, + "step": 2960 + }, + { + "epoch": 1.3577571379428965, + "grad_norm": 0.229428231716156, + "learning_rate": 0.0006004731923254197, + "loss": 0.6636, + "step": 2961 + }, + { + "epoch": 1.3582158009402592, + "grad_norm": 0.3353422284126282, + "learning_rate": 0.0006002305888718123, + "loss": 1.9835, + "step": 2962 + }, + { + "epoch": 1.3586744639376218, + "grad_norm": 0.32611238956451416, + "learning_rate": 0.0005999879608309189, + "loss": 1.601, + "step": 2963 + }, + { + "epoch": 1.3591331269349844, + "grad_norm": 0.29757001996040344, + "learning_rate": 0.0005997453082622575, + "loss": 1.6636, + "step": 2964 + }, + { + "epoch": 1.359591789932347, + "grad_norm": 0.2990017235279083, + "learning_rate": 0.0005995026312253527, + "loss": 1.4233, + "step": 2965 + }, + { + "epoch": 1.36005045292971, + "grad_norm": 0.3065161108970642, + "learning_rate": 0.0005992599297797354, + "loss": 1.7319, + "step": 2966 + }, + { + "epoch": 1.3605091159270726, + "grad_norm": 0.2563233971595764, + "learning_rate": 0.0005990172039849413, + "loss": 0.6647, + "step": 2967 + }, + { + "epoch": 1.3609677789244352, + "grad_norm": 0.12097487598657608, + "learning_rate": 0.0005987744539005134, + "loss": 0.9222, + "step": 2968 + }, + { + "epoch": 1.361426441921798, + "grad_norm": 0.2720169723033905, + "learning_rate": 0.0005985316795859994, + "loss": 0.8245, + "step": 2969 + }, + { + "epoch": 1.3618851049191607, + "grad_norm": 0.2813059985637665, + "learning_rate": 0.0005982888811009541, + "loss": 1.371, + "step": 2970 + }, + { + "epoch": 1.3623437679165233, + "grad_norm": 0.32221513986587524, + "learning_rate": 0.0005980460585049377, + "loss": 1.9143, + "step": 2971 + }, + { + "epoch": 1.362802430913886, + "grad_norm": 0.3392675817012787, + "learning_rate": 0.0005978032118575162, + "loss": 1.5422, + "step": 2972 + }, + { + "epoch": 1.3632610939112486, + "grad_norm": 0.26565980911254883, + "learning_rate": 0.0005975603412182615, + "loss": 1.222, + "step": 2973 + }, + { + "epoch": 1.3637197569086115, + "grad_norm": 0.32214289903640747, + "learning_rate": 0.0005973174466467516, + "loss": 2.3055, + "step": 2974 + }, + { + "epoch": 1.364178419905974, + "grad_norm": 0.3520325720310211, + "learning_rate": 0.0005970745282025705, + "loss": 1.8467, + "step": 2975 + }, + { + "epoch": 1.3646370829033367, + "grad_norm": 0.42569711804389954, + "learning_rate": 0.0005968315859453075, + "loss": 2.2639, + "step": 2976 + }, + { + "epoch": 1.3650957459006996, + "grad_norm": 0.32188594341278076, + "learning_rate": 0.0005965886199345584, + "loss": 1.6467, + "step": 2977 + }, + { + "epoch": 1.365554408898062, + "grad_norm": 0.3434820771217346, + "learning_rate": 0.0005963456302299245, + "loss": 1.8241, + "step": 2978 + }, + { + "epoch": 1.3660130718954249, + "grad_norm": 0.4579332172870636, + "learning_rate": 0.0005961026168910126, + "loss": 1.7689, + "step": 2979 + }, + { + "epoch": 1.3664717348927875, + "grad_norm": 0.32917508482933044, + "learning_rate": 0.000595859579977436, + "loss": 1.7894, + "step": 2980 + }, + { + "epoch": 1.3669303978901501, + "grad_norm": 0.2983636260032654, + "learning_rate": 0.0005956165195488131, + "loss": 1.6581, + "step": 2981 + }, + { + "epoch": 1.367389060887513, + "grad_norm": 0.23157159984111786, + "learning_rate": 0.0005953734356647686, + "loss": 0.804, + "step": 2982 + }, + { + "epoch": 1.3678477238848756, + "grad_norm": 0.2740468680858612, + "learning_rate": 0.0005951303283849329, + "loss": 1.2906, + "step": 2983 + }, + { + "epoch": 1.3683063868822383, + "grad_norm": 0.15333612263202667, + "learning_rate": 0.0005948871977689414, + "loss": 0.7368, + "step": 2984 + }, + { + "epoch": 1.368765049879601, + "grad_norm": 0.3173515796661377, + "learning_rate": 0.0005946440438764362, + "loss": 2.1804, + "step": 2985 + }, + { + "epoch": 1.3692237128769635, + "grad_norm": 0.3675207495689392, + "learning_rate": 0.0005944008667670646, + "loss": 1.7012, + "step": 2986 + }, + { + "epoch": 1.3696823758743264, + "grad_norm": 0.42129290103912354, + "learning_rate": 0.0005941576665004798, + "loss": 1.2498, + "step": 2987 + }, + { + "epoch": 1.370141038871689, + "grad_norm": 0.24997709691524506, + "learning_rate": 0.0005939144431363404, + "loss": 1.4775, + "step": 2988 + }, + { + "epoch": 1.3705997018690517, + "grad_norm": 0.2527235448360443, + "learning_rate": 0.0005936711967343111, + "loss": 1.2377, + "step": 2989 + }, + { + "epoch": 1.3710583648664145, + "grad_norm": 0.2272566556930542, + "learning_rate": 0.0005934279273540616, + "loss": 1.5359, + "step": 2990 + }, + { + "epoch": 1.3715170278637772, + "grad_norm": 0.30363374948501587, + "learning_rate": 0.000593184635055268, + "loss": 1.6325, + "step": 2991 + }, + { + "epoch": 1.3719756908611398, + "grad_norm": 0.352542519569397, + "learning_rate": 0.0005929413198976115, + "loss": 1.656, + "step": 2992 + }, + { + "epoch": 1.3724343538585024, + "grad_norm": 0.3105866611003876, + "learning_rate": 0.0005926979819407791, + "loss": 1.9192, + "step": 2993 + }, + { + "epoch": 1.372893016855865, + "grad_norm": 0.2809374928474426, + "learning_rate": 0.0005924546212444634, + "loss": 1.3723, + "step": 2994 + }, + { + "epoch": 1.373351679853228, + "grad_norm": 0.32282719016075134, + "learning_rate": 0.0005922112378683624, + "loss": 1.6761, + "step": 2995 + }, + { + "epoch": 1.3738103428505906, + "grad_norm": 0.2368834912776947, + "learning_rate": 0.0005919678318721797, + "loss": 1.1937, + "step": 2996 + }, + { + "epoch": 1.3742690058479532, + "grad_norm": 0.2217353880405426, + "learning_rate": 0.000591724403315625, + "loss": 1.2874, + "step": 2997 + }, + { + "epoch": 1.3747276688453158, + "grad_norm": 0.2690431773662567, + "learning_rate": 0.0005914809522584127, + "loss": 1.4406, + "step": 2998 + }, + { + "epoch": 1.3751863318426785, + "grad_norm": 0.20234516263008118, + "learning_rate": 0.0005912374787602632, + "loss": 1.3164, + "step": 2999 + }, + { + "epoch": 1.3756449948400413, + "grad_norm": 0.265828937292099, + "learning_rate": 0.0005909939828809024, + "loss": 1.2651, + "step": 3000 + }, + { + "epoch": 1.376103657837404, + "grad_norm": 0.28443530201911926, + "learning_rate": 0.0005907504646800613, + "loss": 1.1567, + "step": 3001 + }, + { + "epoch": 1.3765623208347666, + "grad_norm": 0.2254990190267563, + "learning_rate": 0.0005905069242174769, + "loss": 1.4349, + "step": 3002 + }, + { + "epoch": 1.3770209838321295, + "grad_norm": 0.2833521366119385, + "learning_rate": 0.0005902633615528916, + "loss": 1.8909, + "step": 3003 + }, + { + "epoch": 1.377479646829492, + "grad_norm": 0.3044750392436981, + "learning_rate": 0.0005900197767460527, + "loss": 1.5297, + "step": 3004 + }, + { + "epoch": 1.3779383098268547, + "grad_norm": 0.29396411776542664, + "learning_rate": 0.0005897761698567135, + "loss": 1.2523, + "step": 3005 + }, + { + "epoch": 1.3783969728242174, + "grad_norm": 0.24863453209400177, + "learning_rate": 0.0005895325409446327, + "loss": 1.5853, + "step": 3006 + }, + { + "epoch": 1.37885563582158, + "grad_norm": 0.31140193343162537, + "learning_rate": 0.0005892888900695738, + "loss": 1.259, + "step": 3007 + }, + { + "epoch": 1.3793142988189429, + "grad_norm": 0.21979323029518127, + "learning_rate": 0.0005890452172913065, + "loss": 1.415, + "step": 3008 + }, + { + "epoch": 1.3797729618163055, + "grad_norm": 0.3025084435939789, + "learning_rate": 0.0005888015226696053, + "loss": 0.7619, + "step": 3009 + }, + { + "epoch": 1.3802316248136681, + "grad_norm": 0.11269453167915344, + "learning_rate": 0.0005885578062642502, + "loss": 1.0517, + "step": 3010 + }, + { + "epoch": 1.380690287811031, + "grad_norm": 0.2220744490623474, + "learning_rate": 0.0005883140681350269, + "loss": 0.8101, + "step": 3011 + }, + { + "epoch": 1.3811489508083934, + "grad_norm": 0.25374385714530945, + "learning_rate": 0.0005880703083417258, + "loss": 1.2319, + "step": 3012 + }, + { + "epoch": 1.3816076138057563, + "grad_norm": 0.26675471663475037, + "learning_rate": 0.0005878265269441432, + "loss": 1.4619, + "step": 3013 + }, + { + "epoch": 1.3820662768031189, + "grad_norm": 0.09885770082473755, + "learning_rate": 0.00058758272400208, + "loss": 0.6359, + "step": 3014 + }, + { + "epoch": 1.3825249398004815, + "grad_norm": 0.16511155664920807, + "learning_rate": 0.0005873388995753432, + "loss": 1.0938, + "step": 3015 + }, + { + "epoch": 1.3829836027978444, + "grad_norm": 0.3333290219306946, + "learning_rate": 0.0005870950537237446, + "loss": 1.7571, + "step": 3016 + }, + { + "epoch": 1.383442265795207, + "grad_norm": 0.26791778206825256, + "learning_rate": 0.0005868511865071013, + "loss": 0.9278, + "step": 3017 + }, + { + "epoch": 1.3839009287925697, + "grad_norm": 0.2120082974433899, + "learning_rate": 0.0005866072979852358, + "loss": 1.2115, + "step": 3018 + }, + { + "epoch": 1.3843595917899323, + "grad_norm": 0.41435691714286804, + "learning_rate": 0.0005863633882179758, + "loss": 2.1711, + "step": 3019 + }, + { + "epoch": 1.384818254787295, + "grad_norm": 0.2728036046028137, + "learning_rate": 0.0005861194572651537, + "loss": 1.5656, + "step": 3020 + }, + { + "epoch": 1.3852769177846578, + "grad_norm": 0.3878629803657532, + "learning_rate": 0.0005858755051866078, + "loss": 1.8513, + "step": 3021 + }, + { + "epoch": 1.3857355807820204, + "grad_norm": 0.3821435570716858, + "learning_rate": 0.0005856315320421812, + "loss": 1.7039, + "step": 3022 + }, + { + "epoch": 1.386194243779383, + "grad_norm": 0.28816846013069153, + "learning_rate": 0.0005853875378917225, + "loss": 1.8958, + "step": 3023 + }, + { + "epoch": 1.386652906776746, + "grad_norm": 0.24760308861732483, + "learning_rate": 0.0005851435227950851, + "loss": 0.9471, + "step": 3024 + }, + { + "epoch": 1.3871115697741085, + "grad_norm": 0.20163798332214355, + "learning_rate": 0.0005848994868121277, + "loss": 1.524, + "step": 3025 + }, + { + "epoch": 1.3875702327714712, + "grad_norm": 0.6160727143287659, + "learning_rate": 0.0005846554300027138, + "loss": 1.6717, + "step": 3026 + }, + { + "epoch": 1.3880288957688338, + "grad_norm": 0.2782137095928192, + "learning_rate": 0.0005844113524267128, + "loss": 0.8874, + "step": 3027 + }, + { + "epoch": 1.3884875587661965, + "grad_norm": 0.2697048783302307, + "learning_rate": 0.0005841672541439982, + "loss": 1.3064, + "step": 3028 + }, + { + "epoch": 1.3889462217635593, + "grad_norm": 0.2576363980770111, + "learning_rate": 0.0005839231352144493, + "loss": 1.8171, + "step": 3029 + }, + { + "epoch": 1.389404884760922, + "grad_norm": 0.34383904933929443, + "learning_rate": 0.0005836789956979503, + "loss": 1.5734, + "step": 3030 + }, + { + "epoch": 1.3898635477582846, + "grad_norm": 0.2631419003009796, + "learning_rate": 0.0005834348356543903, + "loss": 1.2598, + "step": 3031 + }, + { + "epoch": 1.3903222107556472, + "grad_norm": 0.3006002604961395, + "learning_rate": 0.0005831906551436633, + "loss": 1.4291, + "step": 3032 + }, + { + "epoch": 1.3907808737530098, + "grad_norm": 0.13184750080108643, + "learning_rate": 0.0005829464542256691, + "loss": 0.9445, + "step": 3033 + }, + { + "epoch": 1.3912395367503727, + "grad_norm": 0.38121703267097473, + "learning_rate": 0.0005827022329603114, + "loss": 1.3728, + "step": 3034 + }, + { + "epoch": 1.3916981997477353, + "grad_norm": 0.3366886079311371, + "learning_rate": 0.0005824579914074996, + "loss": 2.0142, + "step": 3035 + }, + { + "epoch": 1.392156862745098, + "grad_norm": 0.2520803213119507, + "learning_rate": 0.0005822137296271481, + "loss": 1.1912, + "step": 3036 + }, + { + "epoch": 1.3926155257424608, + "grad_norm": 0.2898450493812561, + "learning_rate": 0.0005819694476791757, + "loss": 1.5584, + "step": 3037 + }, + { + "epoch": 1.3930741887398235, + "grad_norm": 0.33184388279914856, + "learning_rate": 0.000581725145623507, + "loss": 1.6064, + "step": 3038 + }, + { + "epoch": 1.393532851737186, + "grad_norm": 0.40687495470046997, + "learning_rate": 0.0005814808235200708, + "loss": 2.0935, + "step": 3039 + }, + { + "epoch": 1.3939915147345487, + "grad_norm": 0.30613189935684204, + "learning_rate": 0.000581236481428801, + "loss": 1.0827, + "step": 3040 + }, + { + "epoch": 1.3944501777319114, + "grad_norm": 0.350396066904068, + "learning_rate": 0.0005809921194096365, + "loss": 1.9321, + "step": 3041 + }, + { + "epoch": 1.3949088407292742, + "grad_norm": 0.2401767075061798, + "learning_rate": 0.0005807477375225212, + "loss": 1.1565, + "step": 3042 + }, + { + "epoch": 1.3953675037266369, + "grad_norm": 0.29170459508895874, + "learning_rate": 0.0005805033358274037, + "loss": 1.3792, + "step": 3043 + }, + { + "epoch": 1.3958261667239995, + "grad_norm": 0.3244495689868927, + "learning_rate": 0.0005802589143842374, + "loss": 1.5734, + "step": 3044 + }, + { + "epoch": 1.3962848297213624, + "grad_norm": 0.35495296120643616, + "learning_rate": 0.0005800144732529808, + "loss": 1.4654, + "step": 3045 + }, + { + "epoch": 1.396743492718725, + "grad_norm": 0.31566861271858215, + "learning_rate": 0.0005797700124935968, + "loss": 2.1155, + "step": 3046 + }, + { + "epoch": 1.3972021557160876, + "grad_norm": 0.35331422090530396, + "learning_rate": 0.0005795255321660536, + "loss": 1.4407, + "step": 3047 + }, + { + "epoch": 1.3976608187134503, + "grad_norm": 0.46943673491477966, + "learning_rate": 0.000579281032330324, + "loss": 0.9726, + "step": 3048 + }, + { + "epoch": 1.398119481710813, + "grad_norm": 0.3449120819568634, + "learning_rate": 0.0005790365130463857, + "loss": 1.9149, + "step": 3049 + }, + { + "epoch": 1.3985781447081758, + "grad_norm": 0.45382195711135864, + "learning_rate": 0.0005787919743742207, + "loss": 1.5399, + "step": 3050 + }, + { + "epoch": 1.3990368077055384, + "grad_norm": 0.2743489742279053, + "learning_rate": 0.0005785474163738163, + "loss": 1.3011, + "step": 3051 + }, + { + "epoch": 1.399495470702901, + "grad_norm": 0.24921023845672607, + "learning_rate": 0.0005783028391051642, + "loss": 1.4218, + "step": 3052 + }, + { + "epoch": 1.3999541337002637, + "grad_norm": 0.25878721475601196, + "learning_rate": 0.0005780582426282611, + "loss": 1.3025, + "step": 3053 + }, + { + "epoch": 1.4004127966976263, + "grad_norm": 0.24198046326637268, + "learning_rate": 0.0005778136270031083, + "loss": 0.8168, + "step": 3054 + }, + { + "epoch": 1.4008714596949892, + "grad_norm": 0.259734183549881, + "learning_rate": 0.0005775689922897119, + "loss": 0.9978, + "step": 3055 + }, + { + "epoch": 1.4013301226923518, + "grad_norm": 0.2518146336078644, + "learning_rate": 0.0005773243385480822, + "loss": 1.4738, + "step": 3056 + }, + { + "epoch": 1.4017887856897144, + "grad_norm": 0.16398538649082184, + "learning_rate": 0.0005770796658382347, + "loss": 1.2174, + "step": 3057 + }, + { + "epoch": 1.4022474486870773, + "grad_norm": 0.2645060122013092, + "learning_rate": 0.0005768349742201895, + "loss": 1.6903, + "step": 3058 + }, + { + "epoch": 1.40270611168444, + "grad_norm": 0.30692818760871887, + "learning_rate": 0.0005765902637539709, + "loss": 1.3245, + "step": 3059 + }, + { + "epoch": 1.4031647746818026, + "grad_norm": 0.19634264707565308, + "learning_rate": 0.0005763455344996086, + "loss": 1.1637, + "step": 3060 + }, + { + "epoch": 1.4036234376791652, + "grad_norm": 0.2435206174850464, + "learning_rate": 0.0005761007865171361, + "loss": 1.6165, + "step": 3061 + }, + { + "epoch": 1.4040821006765278, + "grad_norm": 0.17027845978736877, + "learning_rate": 0.0005758560198665921, + "loss": 0.8738, + "step": 3062 + }, + { + "epoch": 1.4045407636738907, + "grad_norm": 0.23753295838832855, + "learning_rate": 0.0005756112346080192, + "loss": 1.6266, + "step": 3063 + }, + { + "epoch": 1.4049994266712533, + "grad_norm": 0.26798611879348755, + "learning_rate": 0.0005753664308014655, + "loss": 1.7124, + "step": 3064 + }, + { + "epoch": 1.405458089668616, + "grad_norm": 0.4665045738220215, + "learning_rate": 0.0005751216085069828, + "loss": 2.0098, + "step": 3065 + }, + { + "epoch": 1.4059167526659786, + "grad_norm": 0.279823362827301, + "learning_rate": 0.0005748767677846279, + "loss": 1.3131, + "step": 3066 + }, + { + "epoch": 1.4063754156633412, + "grad_norm": 0.27207767963409424, + "learning_rate": 0.000574631908694462, + "loss": 1.1439, + "step": 3067 + }, + { + "epoch": 1.406834078660704, + "grad_norm": 0.30169880390167236, + "learning_rate": 0.0005743870312965506, + "loss": 1.2415, + "step": 3068 + }, + { + "epoch": 1.4072927416580667, + "grad_norm": 0.20790642499923706, + "learning_rate": 0.0005741421356509645, + "loss": 1.0963, + "step": 3069 + }, + { + "epoch": 1.4077514046554294, + "grad_norm": 0.3159151077270508, + "learning_rate": 0.0005738972218177774, + "loss": 1.3106, + "step": 3070 + }, + { + "epoch": 1.4082100676527922, + "grad_norm": 0.2372143566608429, + "learning_rate": 0.0005736522898570694, + "loss": 1.2921, + "step": 3071 + }, + { + "epoch": 1.4086687306501549, + "grad_norm": 0.37024182081222534, + "learning_rate": 0.0005734073398289234, + "loss": 2.0061, + "step": 3072 + }, + { + "epoch": 1.4091273936475175, + "grad_norm": 0.38484346866607666, + "learning_rate": 0.0005731623717934275, + "loss": 1.5103, + "step": 3073 + }, + { + "epoch": 1.4095860566448801, + "grad_norm": 0.33676987886428833, + "learning_rate": 0.0005729173858106746, + "loss": 1.8154, + "step": 3074 + }, + { + "epoch": 1.4100447196422428, + "grad_norm": 0.35367336869239807, + "learning_rate": 0.0005726723819407611, + "loss": 1.9507, + "step": 3075 + }, + { + "epoch": 1.4105033826396056, + "grad_norm": 0.3594896197319031, + "learning_rate": 0.0005724273602437882, + "loss": 1.793, + "step": 3076 + }, + { + "epoch": 1.4109620456369683, + "grad_norm": 0.3166644275188446, + "learning_rate": 0.0005721823207798616, + "loss": 1.2424, + "step": 3077 + }, + { + "epoch": 1.411420708634331, + "grad_norm": 0.30905553698539734, + "learning_rate": 0.0005719372636090914, + "loss": 1.4936, + "step": 3078 + }, + { + "epoch": 1.4118793716316937, + "grad_norm": 0.3719440698623657, + "learning_rate": 0.0005716921887915916, + "loss": 1.8947, + "step": 3079 + }, + { + "epoch": 1.4123380346290564, + "grad_norm": 0.35400110483169556, + "learning_rate": 0.000571447096387481, + "loss": 1.325, + "step": 3080 + }, + { + "epoch": 1.412796697626419, + "grad_norm": 0.2209610939025879, + "learning_rate": 0.0005712019864568827, + "loss": 1.1089, + "step": 3081 + }, + { + "epoch": 1.4132553606237817, + "grad_norm": 0.3398480713367462, + "learning_rate": 0.0005709568590599234, + "loss": 1.7584, + "step": 3082 + }, + { + "epoch": 1.4137140236211443, + "grad_norm": 0.3765453100204468, + "learning_rate": 0.0005707117142567351, + "loss": 2.1033, + "step": 3083 + }, + { + "epoch": 1.4141726866185071, + "grad_norm": 0.2562631070613861, + "learning_rate": 0.0005704665521074534, + "loss": 1.5005, + "step": 3084 + }, + { + "epoch": 1.4146313496158698, + "grad_norm": 0.32559189200401306, + "learning_rate": 0.0005702213726722185, + "loss": 1.1359, + "step": 3085 + }, + { + "epoch": 1.4150900126132324, + "grad_norm": 0.2984718084335327, + "learning_rate": 0.0005699761760111745, + "loss": 1.4721, + "step": 3086 + }, + { + "epoch": 1.415548675610595, + "grad_norm": 0.3191785216331482, + "learning_rate": 0.00056973096218447, + "loss": 1.773, + "step": 3087 + }, + { + "epoch": 1.4160073386079577, + "grad_norm": 0.3590591251850128, + "learning_rate": 0.0005694857312522576, + "loss": 1.7094, + "step": 3088 + }, + { + "epoch": 1.4164660016053205, + "grad_norm": 0.23591914772987366, + "learning_rate": 0.0005692404832746944, + "loss": 1.0517, + "step": 3089 + }, + { + "epoch": 1.4169246646026832, + "grad_norm": 0.28452879190444946, + "learning_rate": 0.0005689952183119413, + "loss": 1.6881, + "step": 3090 + }, + { + "epoch": 1.4173833276000458, + "grad_norm": 0.32875385880470276, + "learning_rate": 0.0005687499364241637, + "loss": 1.7948, + "step": 3091 + }, + { + "epoch": 1.4178419905974087, + "grad_norm": 0.3080821931362152, + "learning_rate": 0.0005685046376715311, + "loss": 1.4707, + "step": 3092 + }, + { + "epoch": 1.4183006535947713, + "grad_norm": 0.3038639724254608, + "learning_rate": 0.0005682593221142168, + "loss": 1.2142, + "step": 3093 + }, + { + "epoch": 1.418759316592134, + "grad_norm": 0.3059542775154114, + "learning_rate": 0.0005680139898123985, + "loss": 1.2471, + "step": 3094 + }, + { + "epoch": 1.4192179795894966, + "grad_norm": 0.2539036273956299, + "learning_rate": 0.0005677686408262583, + "loss": 1.058, + "step": 3095 + }, + { + "epoch": 1.4196766425868592, + "grad_norm": 0.17899896204471588, + "learning_rate": 0.0005675232752159818, + "loss": 1.3064, + "step": 3096 + }, + { + "epoch": 1.420135305584222, + "grad_norm": 0.3230034112930298, + "learning_rate": 0.0005672778930417592, + "loss": 1.6761, + "step": 3097 + }, + { + "epoch": 1.4205939685815847, + "grad_norm": 0.2745686173439026, + "learning_rate": 0.0005670324943637842, + "loss": 0.7442, + "step": 3098 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 0.2400350570678711, + "learning_rate": 0.0005667870792422547, + "loss": 0.937, + "step": 3099 + }, + { + "epoch": 1.42151129457631, + "grad_norm": 0.3091734051704407, + "learning_rate": 0.0005665416477373734, + "loss": 1.6765, + "step": 3100 + }, + { + "epoch": 1.4219699575736726, + "grad_norm": 0.217657670378685, + "learning_rate": 0.0005662961999093461, + "loss": 0.8566, + "step": 3101 + }, + { + "epoch": 1.4224286205710355, + "grad_norm": 0.18862880766391754, + "learning_rate": 0.0005660507358183829, + "loss": 1.1898, + "step": 3102 + }, + { + "epoch": 1.422887283568398, + "grad_norm": 0.3786904513835907, + "learning_rate": 0.000565805255524698, + "loss": 1.9863, + "step": 3103 + }, + { + "epoch": 1.4233459465657607, + "grad_norm": 0.3076726496219635, + "learning_rate": 0.0005655597590885095, + "loss": 1.2256, + "step": 3104 + }, + { + "epoch": 1.4238046095631236, + "grad_norm": 0.24448339641094208, + "learning_rate": 0.0005653142465700392, + "loss": 1.7419, + "step": 3105 + }, + { + "epoch": 1.4242632725604862, + "grad_norm": 0.28757891058921814, + "learning_rate": 0.0005650687180295134, + "loss": 1.0779, + "step": 3106 + }, + { + "epoch": 1.4247219355578489, + "grad_norm": 0.0921797975897789, + "learning_rate": 0.0005648231735271619, + "loss": 0.5927, + "step": 3107 + }, + { + "epoch": 1.4251805985552115, + "grad_norm": 0.2944778800010681, + "learning_rate": 0.0005645776131232186, + "loss": 1.6512, + "step": 3108 + }, + { + "epoch": 1.4256392615525741, + "grad_norm": 0.24034199118614197, + "learning_rate": 0.000564332036877921, + "loss": 1.505, + "step": 3109 + }, + { + "epoch": 1.426097924549937, + "grad_norm": 0.245818093419075, + "learning_rate": 0.000564086444851511, + "loss": 1.2884, + "step": 3110 + }, + { + "epoch": 1.4265565875472996, + "grad_norm": 0.29974740743637085, + "learning_rate": 0.000563840837104234, + "loss": 0.8771, + "step": 3111 + }, + { + "epoch": 1.4270152505446623, + "grad_norm": 0.2733640968799591, + "learning_rate": 0.0005635952136963393, + "loss": 1.5035, + "step": 3112 + }, + { + "epoch": 1.4274739135420251, + "grad_norm": 0.2323792427778244, + "learning_rate": 0.0005633495746880801, + "loss": 1.1803, + "step": 3113 + }, + { + "epoch": 1.4279325765393878, + "grad_norm": 0.438142329454422, + "learning_rate": 0.0005631039201397136, + "loss": 2.2593, + "step": 3114 + }, + { + "epoch": 1.4283912395367504, + "grad_norm": 0.2616313099861145, + "learning_rate": 0.0005628582501115004, + "loss": 1.3368, + "step": 3115 + }, + { + "epoch": 1.428849902534113, + "grad_norm": 0.2123333215713501, + "learning_rate": 0.0005626125646637051, + "loss": 1.0031, + "step": 3116 + }, + { + "epoch": 1.4293085655314757, + "grad_norm": 0.32749924063682556, + "learning_rate": 0.0005623668638565964, + "loss": 1.4506, + "step": 3117 + }, + { + "epoch": 1.4297672285288385, + "grad_norm": 0.2560507655143738, + "learning_rate": 0.0005621211477504463, + "loss": 1.7554, + "step": 3118 + }, + { + "epoch": 1.4302258915262012, + "grad_norm": 0.3207387924194336, + "learning_rate": 0.0005618754164055306, + "loss": 1.4721, + "step": 3119 + }, + { + "epoch": 1.4306845545235638, + "grad_norm": 0.3025607764720917, + "learning_rate": 0.0005616296698821293, + "loss": 0.993, + "step": 3120 + }, + { + "epoch": 1.4311432175209264, + "grad_norm": 0.22865654528141022, + "learning_rate": 0.0005613839082405255, + "loss": 1.5714, + "step": 3121 + }, + { + "epoch": 1.431601880518289, + "grad_norm": 0.31752631068229675, + "learning_rate": 0.0005611381315410065, + "loss": 1.4345, + "step": 3122 + }, + { + "epoch": 1.432060543515652, + "grad_norm": 0.29477816820144653, + "learning_rate": 0.0005608923398438628, + "loss": 1.9834, + "step": 3123 + }, + { + "epoch": 1.4325192065130146, + "grad_norm": 0.30475521087646484, + "learning_rate": 0.0005606465332093892, + "loss": 1.5315, + "step": 3124 + }, + { + "epoch": 1.4329778695103772, + "grad_norm": 0.3452279567718506, + "learning_rate": 0.0005604007116978836, + "loss": 1.2822, + "step": 3125 + }, + { + "epoch": 1.43343653250774, + "grad_norm": 0.3388407230377197, + "learning_rate": 0.0005601548753696481, + "loss": 1.6831, + "step": 3126 + }, + { + "epoch": 1.4338951955051027, + "grad_norm": 0.3091072142124176, + "learning_rate": 0.0005599090242849878, + "loss": 0.8168, + "step": 3127 + }, + { + "epoch": 1.4343538585024653, + "grad_norm": 0.27518755197525024, + "learning_rate": 0.000559663158504212, + "loss": 1.9319, + "step": 3128 + }, + { + "epoch": 1.434812521499828, + "grad_norm": 0.2800983488559723, + "learning_rate": 0.0005594172780876332, + "loss": 1.3327, + "step": 3129 + }, + { + "epoch": 1.4352711844971906, + "grad_norm": 0.34306713938713074, + "learning_rate": 0.0005591713830955674, + "loss": 1.2191, + "step": 3130 + }, + { + "epoch": 1.4357298474945535, + "grad_norm": 0.19693058729171753, + "learning_rate": 0.000558925473588335, + "loss": 1.3058, + "step": 3131 + }, + { + "epoch": 1.436188510491916, + "grad_norm": 0.42898425459861755, + "learning_rate": 0.000558679549626259, + "loss": 1.7591, + "step": 3132 + }, + { + "epoch": 1.4366471734892787, + "grad_norm": 0.3185971975326538, + "learning_rate": 0.0005584336112696663, + "loss": 2.0237, + "step": 3133 + }, + { + "epoch": 1.4371058364866414, + "grad_norm": 0.18667162954807281, + "learning_rate": 0.0005581876585788875, + "loss": 0.9062, + "step": 3134 + }, + { + "epoch": 1.437564499484004, + "grad_norm": 0.2841646671295166, + "learning_rate": 0.0005579416916142564, + "loss": 1.2529, + "step": 3135 + }, + { + "epoch": 1.4380231624813669, + "grad_norm": 0.27004119753837585, + "learning_rate": 0.0005576957104361106, + "loss": 1.3765, + "step": 3136 + }, + { + "epoch": 1.4384818254787295, + "grad_norm": 0.2476406842470169, + "learning_rate": 0.0005574497151047911, + "loss": 1.1895, + "step": 3137 + }, + { + "epoch": 1.4389404884760921, + "grad_norm": 0.25219327211380005, + "learning_rate": 0.0005572037056806421, + "loss": 1.1415, + "step": 3138 + }, + { + "epoch": 1.439399151473455, + "grad_norm": 0.31320807337760925, + "learning_rate": 0.0005569576822240118, + "loss": 2.229, + "step": 3139 + }, + { + "epoch": 1.4398578144708176, + "grad_norm": 0.3259999752044678, + "learning_rate": 0.0005567116447952513, + "loss": 1.7386, + "step": 3140 + }, + { + "epoch": 1.4403164774681803, + "grad_norm": 0.1600116491317749, + "learning_rate": 0.0005564655934547154, + "loss": 0.7964, + "step": 3141 + }, + { + "epoch": 1.440775140465543, + "grad_norm": 0.2874891757965088, + "learning_rate": 0.0005562195282627624, + "loss": 1.3448, + "step": 3142 + }, + { + "epoch": 1.4412338034629055, + "grad_norm": 0.29961147904396057, + "learning_rate": 0.0005559734492797536, + "loss": 1.353, + "step": 3143 + }, + { + "epoch": 1.4416924664602684, + "grad_norm": 0.4548627734184265, + "learning_rate": 0.0005557273565660541, + "loss": 1.9115, + "step": 3144 + }, + { + "epoch": 1.442151129457631, + "grad_norm": 0.25489258766174316, + "learning_rate": 0.0005554812501820322, + "loss": 1.1127, + "step": 3145 + }, + { + "epoch": 1.4426097924549937, + "grad_norm": 0.3416697680950165, + "learning_rate": 0.0005552351301880597, + "loss": 2.1523, + "step": 3146 + }, + { + "epoch": 1.4430684554523565, + "grad_norm": 0.29285651445388794, + "learning_rate": 0.0005549889966445115, + "loss": 1.4188, + "step": 3147 + }, + { + "epoch": 1.4435271184497191, + "grad_norm": 0.3054216206073761, + "learning_rate": 0.0005547428496117659, + "loss": 1.0746, + "step": 3148 + }, + { + "epoch": 1.4439857814470818, + "grad_norm": 0.2599756717681885, + "learning_rate": 0.0005544966891502046, + "loss": 1.3993, + "step": 3149 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.3153897821903229, + "learning_rate": 0.0005542505153202124, + "loss": 1.8159, + "step": 3150 + }, + { + "epoch": 1.444903107441807, + "grad_norm": 0.5788086652755737, + "learning_rate": 0.0005540043281821777, + "loss": 2.1663, + "step": 3151 + }, + { + "epoch": 1.44536177043917, + "grad_norm": 0.2490013986825943, + "learning_rate": 0.0005537581277964919, + "loss": 1.3651, + "step": 3152 + }, + { + "epoch": 1.4458204334365325, + "grad_norm": 0.28390905261039734, + "learning_rate": 0.0005535119142235499, + "loss": 0.8854, + "step": 3153 + }, + { + "epoch": 1.4462790964338952, + "grad_norm": 0.31385406851768494, + "learning_rate": 0.0005532656875237492, + "loss": 1.4232, + "step": 3154 + }, + { + "epoch": 1.4467377594312578, + "grad_norm": 0.3778940439224243, + "learning_rate": 0.0005530194477574914, + "loss": 1.9998, + "step": 3155 + }, + { + "epoch": 1.4471964224286205, + "grad_norm": 0.18103332817554474, + "learning_rate": 0.0005527731949851809, + "loss": 0.7658, + "step": 3156 + }, + { + "epoch": 1.4476550854259833, + "grad_norm": 0.26208803057670593, + "learning_rate": 0.000552526929267225, + "loss": 1.3012, + "step": 3157 + }, + { + "epoch": 1.448113748423346, + "grad_norm": 0.23250733315944672, + "learning_rate": 0.0005522806506640346, + "loss": 1.4842, + "step": 3158 + }, + { + "epoch": 1.4485724114207086, + "grad_norm": 0.38623639941215515, + "learning_rate": 0.0005520343592360238, + "loss": 2.0955, + "step": 3159 + }, + { + "epoch": 1.4490310744180714, + "grad_norm": 0.30866172909736633, + "learning_rate": 0.0005517880550436094, + "loss": 1.6685, + "step": 3160 + }, + { + "epoch": 1.449489737415434, + "grad_norm": 0.1992468386888504, + "learning_rate": 0.0005515417381472117, + "loss": 0.969, + "step": 3161 + }, + { + "epoch": 1.4499484004127967, + "grad_norm": 0.29209160804748535, + "learning_rate": 0.000551295408607254, + "loss": 2.0913, + "step": 3162 + }, + { + "epoch": 1.4504070634101593, + "grad_norm": 0.19212748110294342, + "learning_rate": 0.0005510490664841626, + "loss": 0.8058, + "step": 3163 + }, + { + "epoch": 1.450865726407522, + "grad_norm": 0.31732991337776184, + "learning_rate": 0.0005508027118383673, + "loss": 1.6986, + "step": 3164 + }, + { + "epoch": 1.4513243894048848, + "grad_norm": 0.23571406304836273, + "learning_rate": 0.0005505563447303004, + "loss": 1.1407, + "step": 3165 + }, + { + "epoch": 1.4517830524022475, + "grad_norm": 0.24879354238510132, + "learning_rate": 0.0005503099652203974, + "loss": 1.7388, + "step": 3166 + }, + { + "epoch": 1.4522417153996101, + "grad_norm": 0.28732791543006897, + "learning_rate": 0.0005500635733690975, + "loss": 1.2711, + "step": 3167 + }, + { + "epoch": 1.4527003783969727, + "grad_norm": 0.2248416393995285, + "learning_rate": 0.0005498171692368419, + "loss": 1.1397, + "step": 3168 + }, + { + "epoch": 1.4531590413943354, + "grad_norm": 0.34044063091278076, + "learning_rate": 0.0005495707528840755, + "loss": 1.819, + "step": 3169 + }, + { + "epoch": 1.4536177043916982, + "grad_norm": 0.19185033440589905, + "learning_rate": 0.0005493243243712461, + "loss": 1.2428, + "step": 3170 + }, + { + "epoch": 1.4540763673890609, + "grad_norm": 0.2840229570865631, + "learning_rate": 0.0005490778837588041, + "loss": 1.1735, + "step": 3171 + }, + { + "epoch": 1.4545350303864235, + "grad_norm": 0.3059384822845459, + "learning_rate": 0.0005488314311072035, + "loss": 1.9674, + "step": 3172 + }, + { + "epoch": 1.4549936933837864, + "grad_norm": 0.27836617827415466, + "learning_rate": 0.0005485849664769008, + "loss": 0.7415, + "step": 3173 + }, + { + "epoch": 1.455452356381149, + "grad_norm": 0.21108005940914154, + "learning_rate": 0.0005483384899283554, + "loss": 1.1039, + "step": 3174 + }, + { + "epoch": 1.4559110193785116, + "grad_norm": 0.24077321588993073, + "learning_rate": 0.0005480920015220298, + "loss": 1.6875, + "step": 3175 + }, + { + "epoch": 1.4563696823758743, + "grad_norm": 0.3374916911125183, + "learning_rate": 0.0005478455013183895, + "loss": 1.938, + "step": 3176 + }, + { + "epoch": 1.456828345373237, + "grad_norm": 0.3088974356651306, + "learning_rate": 0.0005475989893779028, + "loss": 1.232, + "step": 3177 + }, + { + "epoch": 1.4572870083705998, + "grad_norm": 0.3156466782093048, + "learning_rate": 0.0005473524657610409, + "loss": 2.2339, + "step": 3178 + }, + { + "epoch": 1.4577456713679624, + "grad_norm": 0.373761922121048, + "learning_rate": 0.0005471059305282776, + "loss": 1.6581, + "step": 3179 + }, + { + "epoch": 1.458204334365325, + "grad_norm": 0.18877850472927094, + "learning_rate": 0.00054685938374009, + "loss": 0.885, + "step": 3180 + }, + { + "epoch": 1.458662997362688, + "grad_norm": 0.2712680995464325, + "learning_rate": 0.0005466128254569575, + "loss": 1.3269, + "step": 3181 + }, + { + "epoch": 1.4591216603600505, + "grad_norm": 0.17114239931106567, + "learning_rate": 0.0005463662557393628, + "loss": 0.8921, + "step": 3182 + }, + { + "epoch": 1.4595803233574132, + "grad_norm": 0.22322210669517517, + "learning_rate": 0.0005461196746477915, + "loss": 1.4297, + "step": 3183 + }, + { + "epoch": 1.4600389863547758, + "grad_norm": 0.24018515646457672, + "learning_rate": 0.0005458730822427314, + "loss": 0.8356, + "step": 3184 + }, + { + "epoch": 1.4604976493521384, + "grad_norm": 0.18391193449497223, + "learning_rate": 0.0005456264785846733, + "loss": 1.4681, + "step": 3185 + }, + { + "epoch": 1.4609563123495013, + "grad_norm": 0.33580252528190613, + "learning_rate": 0.0005453798637341112, + "loss": 1.371, + "step": 3186 + }, + { + "epoch": 1.461414975346864, + "grad_norm": 0.2702253460884094, + "learning_rate": 0.0005451332377515412, + "loss": 1.6213, + "step": 3187 + }, + { + "epoch": 1.4618736383442266, + "grad_norm": 0.32494962215423584, + "learning_rate": 0.0005448866006974625, + "loss": 1.9729, + "step": 3188 + }, + { + "epoch": 1.4623323013415892, + "grad_norm": 0.27085816860198975, + "learning_rate": 0.000544639952632377, + "loss": 1.0135, + "step": 3189 + }, + { + "epoch": 1.4627909643389518, + "grad_norm": 0.20625822246074677, + "learning_rate": 0.0005443932936167895, + "loss": 1.3878, + "step": 3190 + }, + { + "epoch": 1.4632496273363147, + "grad_norm": 0.38351795077323914, + "learning_rate": 0.0005441466237112068, + "loss": 1.9377, + "step": 3191 + }, + { + "epoch": 1.4637082903336773, + "grad_norm": 0.20434589684009552, + "learning_rate": 0.000543899942976139, + "loss": 1.1232, + "step": 3192 + }, + { + "epoch": 1.46416695333104, + "grad_norm": 0.28791090846061707, + "learning_rate": 0.0005436532514720986, + "loss": 0.877, + "step": 3193 + }, + { + "epoch": 1.4646256163284028, + "grad_norm": 0.27952685952186584, + "learning_rate": 0.000543406549259601, + "loss": 1.4366, + "step": 3194 + }, + { + "epoch": 1.4650842793257655, + "grad_norm": 0.34718945622444153, + "learning_rate": 0.0005431598363991639, + "loss": 1.9646, + "step": 3195 + }, + { + "epoch": 1.465542942323128, + "grad_norm": 0.3882228434085846, + "learning_rate": 0.0005429131129513076, + "loss": 1.6616, + "step": 3196 + }, + { + "epoch": 1.4660016053204907, + "grad_norm": 0.2647295296192169, + "learning_rate": 0.0005426663789765552, + "loss": 1.4854, + "step": 3197 + }, + { + "epoch": 1.4664602683178534, + "grad_norm": 0.28624817728996277, + "learning_rate": 0.0005424196345354326, + "loss": 1.9561, + "step": 3198 + }, + { + "epoch": 1.4669189313152162, + "grad_norm": 0.3558148741722107, + "learning_rate": 0.0005421728796884677, + "loss": 1.7657, + "step": 3199 + }, + { + "epoch": 1.4673775943125789, + "grad_norm": 0.29009684920310974, + "learning_rate": 0.0005419261144961914, + "loss": 1.6457, + "step": 3200 + }, + { + "epoch": 1.4678362573099415, + "grad_norm": 0.3886021375656128, + "learning_rate": 0.0005416793390191369, + "loss": 1.6, + "step": 3201 + }, + { + "epoch": 1.4682949203073041, + "grad_norm": 0.14990092813968658, + "learning_rate": 0.0005414325533178398, + "loss": 0.8758, + "step": 3202 + }, + { + "epoch": 1.4687535833046668, + "grad_norm": 0.21671035885810852, + "learning_rate": 0.0005411857574528389, + "loss": 0.9272, + "step": 3203 + }, + { + "epoch": 1.4692122463020296, + "grad_norm": 0.2458488643169403, + "learning_rate": 0.0005409389514846746, + "loss": 1.6566, + "step": 3204 + }, + { + "epoch": 1.4696709092993923, + "grad_norm": 0.38461047410964966, + "learning_rate": 0.0005406921354738904, + "loss": 1.5311, + "step": 3205 + }, + { + "epoch": 1.470129572296755, + "grad_norm": 0.31382817029953003, + "learning_rate": 0.000540445309481032, + "loss": 1.6455, + "step": 3206 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 0.23861967027187347, + "learning_rate": 0.0005401984735666474, + "loss": 1.2702, + "step": 3207 + }, + { + "epoch": 1.4710468982914804, + "grad_norm": 0.4511847496032715, + "learning_rate": 0.0005399516277912873, + "loss": 1.2289, + "step": 3208 + }, + { + "epoch": 1.471505561288843, + "grad_norm": 0.3328205645084381, + "learning_rate": 0.0005397047722155051, + "loss": 1.4151, + "step": 3209 + }, + { + "epoch": 1.4719642242862057, + "grad_norm": 0.10846666991710663, + "learning_rate": 0.0005394579068998559, + "loss": 0.8251, + "step": 3210 + }, + { + "epoch": 1.4724228872835683, + "grad_norm": 0.3210654854774475, + "learning_rate": 0.0005392110319048975, + "loss": 1.7081, + "step": 3211 + }, + { + "epoch": 1.4728815502809312, + "grad_norm": 0.1869962215423584, + "learning_rate": 0.0005389641472911904, + "loss": 1.0442, + "step": 3212 + }, + { + "epoch": 1.4733402132782938, + "grad_norm": 0.3858005404472351, + "learning_rate": 0.000538717253119297, + "loss": 1.7048, + "step": 3213 + }, + { + "epoch": 1.4737988762756564, + "grad_norm": 0.22492477297782898, + "learning_rate": 0.0005384703494497821, + "loss": 1.5041, + "step": 3214 + }, + { + "epoch": 1.4742575392730193, + "grad_norm": 0.3225257694721222, + "learning_rate": 0.0005382234363432134, + "loss": 1.8363, + "step": 3215 + }, + { + "epoch": 1.474716202270382, + "grad_norm": 0.37609943747520447, + "learning_rate": 0.0005379765138601598, + "loss": 2.0479, + "step": 3216 + }, + { + "epoch": 1.4751748652677446, + "grad_norm": 0.38782861828804016, + "learning_rate": 0.0005377295820611939, + "loss": 1.3388, + "step": 3217 + }, + { + "epoch": 1.4756335282651072, + "grad_norm": 0.40626588463783264, + "learning_rate": 0.0005374826410068891, + "loss": 1.9222, + "step": 3218 + }, + { + "epoch": 1.4760921912624698, + "grad_norm": 0.4013312757015228, + "learning_rate": 0.0005372356907578224, + "loss": 2.2166, + "step": 3219 + }, + { + "epoch": 1.4765508542598327, + "grad_norm": 0.2755921483039856, + "learning_rate": 0.0005369887313745723, + "loss": 1.2296, + "step": 3220 + }, + { + "epoch": 1.4770095172571953, + "grad_norm": 0.17257989943027496, + "learning_rate": 0.0005367417629177196, + "loss": 0.9679, + "step": 3221 + }, + { + "epoch": 1.477468180254558, + "grad_norm": 0.28084853291511536, + "learning_rate": 0.0005364947854478476, + "loss": 1.3068, + "step": 3222 + }, + { + "epoch": 1.4779268432519206, + "grad_norm": 0.24307353794574738, + "learning_rate": 0.0005362477990255416, + "loss": 0.8342, + "step": 3223 + }, + { + "epoch": 1.4783855062492832, + "grad_norm": 0.22293055057525635, + "learning_rate": 0.000536000803711389, + "loss": 1.3204, + "step": 3224 + }, + { + "epoch": 1.478844169246646, + "grad_norm": 0.2704020142555237, + "learning_rate": 0.0005357537995659798, + "loss": 1.0284, + "step": 3225 + }, + { + "epoch": 1.4793028322440087, + "grad_norm": 0.22363778948783875, + "learning_rate": 0.0005355067866499059, + "loss": 1.4955, + "step": 3226 + }, + { + "epoch": 1.4797614952413713, + "grad_norm": 0.3845861256122589, + "learning_rate": 0.000535259765023761, + "loss": 1.5793, + "step": 3227 + }, + { + "epoch": 1.4802201582387342, + "grad_norm": 0.2982635498046875, + "learning_rate": 0.0005350127347481414, + "loss": 1.2065, + "step": 3228 + }, + { + "epoch": 1.4806788212360968, + "grad_norm": 0.18739233911037445, + "learning_rate": 0.0005347656958836457, + "loss": 1.3103, + "step": 3229 + }, + { + "epoch": 1.4811374842334595, + "grad_norm": 0.34681299328804016, + "learning_rate": 0.0005345186484908741, + "loss": 1.4734, + "step": 3230 + }, + { + "epoch": 1.4815961472308221, + "grad_norm": 0.3014410734176636, + "learning_rate": 0.0005342715926304291, + "loss": 1.6099, + "step": 3231 + }, + { + "epoch": 1.4820548102281847, + "grad_norm": 0.4709073603153229, + "learning_rate": 0.0005340245283629155, + "loss": 1.7924, + "step": 3232 + }, + { + "epoch": 1.4825134732255476, + "grad_norm": 0.37753811478614807, + "learning_rate": 0.0005337774557489394, + "loss": 1.3316, + "step": 3233 + }, + { + "epoch": 1.4829721362229102, + "grad_norm": 0.31126853823661804, + "learning_rate": 0.0005335303748491101, + "loss": 0.7645, + "step": 3234 + }, + { + "epoch": 1.4834307992202729, + "grad_norm": 0.10572401434183121, + "learning_rate": 0.0005332832857240381, + "loss": 0.9926, + "step": 3235 + }, + { + "epoch": 1.4838894622176355, + "grad_norm": 0.38704633712768555, + "learning_rate": 0.0005330361884343361, + "loss": 2.0361, + "step": 3236 + }, + { + "epoch": 1.4843481252149981, + "grad_norm": 0.2931767702102661, + "learning_rate": 0.0005327890830406189, + "loss": 0.9363, + "step": 3237 + }, + { + "epoch": 1.484806788212361, + "grad_norm": 0.11565959453582764, + "learning_rate": 0.000532541969603503, + "loss": 0.9478, + "step": 3238 + }, + { + "epoch": 1.4852654512097236, + "grad_norm": 0.3610280156135559, + "learning_rate": 0.0005322948481836075, + "loss": 1.6384, + "step": 3239 + }, + { + "epoch": 1.4857241142070863, + "grad_norm": 0.32327285408973694, + "learning_rate": 0.0005320477188415529, + "loss": 0.8816, + "step": 3240 + }, + { + "epoch": 1.4861827772044491, + "grad_norm": 0.20238777995109558, + "learning_rate": 0.0005318005816379618, + "loss": 1.2931, + "step": 3241 + }, + { + "epoch": 1.4866414402018118, + "grad_norm": 0.29169613122940063, + "learning_rate": 0.0005315534366334587, + "loss": 1.1745, + "step": 3242 + }, + { + "epoch": 1.4871001031991744, + "grad_norm": 0.34999263286590576, + "learning_rate": 0.00053130628388867, + "loss": 1.7833, + "step": 3243 + }, + { + "epoch": 1.487558766196537, + "grad_norm": 0.25474709272384644, + "learning_rate": 0.0005310591234642242, + "loss": 1.2222, + "step": 3244 + }, + { + "epoch": 1.4880174291938997, + "grad_norm": 0.333258718252182, + "learning_rate": 0.0005308119554207515, + "loss": 1.3405, + "step": 3245 + }, + { + "epoch": 1.4884760921912625, + "grad_norm": 0.19669318199157715, + "learning_rate": 0.0005305647798188839, + "loss": 1.2637, + "step": 3246 + }, + { + "epoch": 1.4889347551886252, + "grad_norm": 0.25908851623535156, + "learning_rate": 0.0005303175967192555, + "loss": 1.5399, + "step": 3247 + }, + { + "epoch": 1.4893934181859878, + "grad_norm": 0.32331693172454834, + "learning_rate": 0.0005300704061825019, + "loss": 1.6418, + "step": 3248 + }, + { + "epoch": 1.4898520811833507, + "grad_norm": 0.2600635588169098, + "learning_rate": 0.0005298232082692609, + "loss": 1.0823, + "step": 3249 + }, + { + "epoch": 1.4903107441807133, + "grad_norm": 0.2755780518054962, + "learning_rate": 0.000529576003040172, + "loss": 1.6127, + "step": 3250 + }, + { + "epoch": 1.490769407178076, + "grad_norm": 0.33199846744537354, + "learning_rate": 0.0005293287905558762, + "loss": 1.6889, + "step": 3251 + }, + { + "epoch": 1.4912280701754386, + "grad_norm": 0.24848337471485138, + "learning_rate": 0.0005290815708770166, + "loss": 0.8654, + "step": 3252 + }, + { + "epoch": 1.4916867331728012, + "grad_norm": 0.2537640333175659, + "learning_rate": 0.0005288343440642379, + "loss": 1.316, + "step": 3253 + }, + { + "epoch": 1.492145396170164, + "grad_norm": 0.22786585986614227, + "learning_rate": 0.0005285871101781868, + "loss": 1.1967, + "step": 3254 + }, + { + "epoch": 1.4926040591675267, + "grad_norm": 0.30544570088386536, + "learning_rate": 0.0005283398692795114, + "loss": 1.6742, + "step": 3255 + }, + { + "epoch": 1.4930627221648893, + "grad_norm": 1.4452085494995117, + "learning_rate": 0.0005280926214288617, + "loss": 0.8225, + "step": 3256 + }, + { + "epoch": 1.493521385162252, + "grad_norm": 0.2699876129627228, + "learning_rate": 0.0005278453666868896, + "loss": 1.3572, + "step": 3257 + }, + { + "epoch": 1.4939800481596146, + "grad_norm": 0.35777711868286133, + "learning_rate": 0.0005275981051142481, + "loss": 2.1099, + "step": 3258 + }, + { + "epoch": 1.4944387111569775, + "grad_norm": 0.19279718399047852, + "learning_rate": 0.0005273508367715923, + "loss": 0.8453, + "step": 3259 + }, + { + "epoch": 1.49489737415434, + "grad_norm": 0.29106536507606506, + "learning_rate": 0.0005271035617195793, + "loss": 1.2269, + "step": 3260 + }, + { + "epoch": 1.4953560371517027, + "grad_norm": 0.286878377199173, + "learning_rate": 0.0005268562800188671, + "loss": 1.6506, + "step": 3261 + }, + { + "epoch": 1.4958147001490656, + "grad_norm": 0.3785189688205719, + "learning_rate": 0.0005266089917301158, + "loss": 1.5022, + "step": 3262 + }, + { + "epoch": 1.4962733631464282, + "grad_norm": 0.25897216796875, + "learning_rate": 0.0005263616969139868, + "loss": 1.6508, + "step": 3263 + }, + { + "epoch": 1.4967320261437909, + "grad_norm": 0.3450983166694641, + "learning_rate": 0.0005261143956311435, + "loss": 1.7249, + "step": 3264 + }, + { + "epoch": 1.4971906891411535, + "grad_norm": 0.28310757875442505, + "learning_rate": 0.0005258670879422508, + "loss": 1.2171, + "step": 3265 + }, + { + "epoch": 1.4976493521385161, + "grad_norm": 0.3658524751663208, + "learning_rate": 0.0005256197739079749, + "loss": 1.3394, + "step": 3266 + }, + { + "epoch": 1.498108015135879, + "grad_norm": 0.3043535053730011, + "learning_rate": 0.0005253724535889836, + "loss": 1.3353, + "step": 3267 + }, + { + "epoch": 1.4985666781332416, + "grad_norm": 0.2485763430595398, + "learning_rate": 0.0005251251270459468, + "loss": 1.0306, + "step": 3268 + }, + { + "epoch": 1.4990253411306043, + "grad_norm": 0.26514461636543274, + "learning_rate": 0.0005248777943395347, + "loss": 1.0359, + "step": 3269 + }, + { + "epoch": 1.499484004127967, + "grad_norm": 0.26036179065704346, + "learning_rate": 0.0005246304555304205, + "loss": 1.3007, + "step": 3270 + }, + { + "epoch": 1.4999426671253295, + "grad_norm": 0.20205332338809967, + "learning_rate": 0.000524383110679278, + "loss": 0.61, + "step": 3271 + }, + { + "epoch": 1.5004013301226924, + "grad_norm": 0.36065131425857544, + "learning_rate": 0.0005241357598467826, + "loss": 2.1804, + "step": 3272 + }, + { + "epoch": 1.500859993120055, + "grad_norm": 0.3411425054073334, + "learning_rate": 0.0005238884030936112, + "loss": 1.4267, + "step": 3273 + }, + { + "epoch": 1.5013186561174177, + "grad_norm": 0.45014554262161255, + "learning_rate": 0.0005236410404804425, + "loss": 2.0813, + "step": 3274 + }, + { + "epoch": 1.5017773191147805, + "grad_norm": 0.4235081076622009, + "learning_rate": 0.000523393672067956, + "loss": 1.2112, + "step": 3275 + }, + { + "epoch": 1.5022359821121432, + "grad_norm": 0.26777932047843933, + "learning_rate": 0.0005231462979168331, + "loss": 0.8293, + "step": 3276 + }, + { + "epoch": 1.5026946451095058, + "grad_norm": 0.20920783281326294, + "learning_rate": 0.0005228989180877564, + "loss": 1.123, + "step": 3277 + }, + { + "epoch": 1.5031533081068684, + "grad_norm": 0.27009251713752747, + "learning_rate": 0.0005226515326414099, + "loss": 1.7735, + "step": 3278 + }, + { + "epoch": 1.503611971104231, + "grad_norm": 0.3320116698741913, + "learning_rate": 0.0005224041416384791, + "loss": 1.6422, + "step": 3279 + }, + { + "epoch": 1.504070634101594, + "grad_norm": 0.2971821129322052, + "learning_rate": 0.0005221567451396509, + "loss": 0.9693, + "step": 3280 + }, + { + "epoch": 1.5045292970989566, + "grad_norm": 0.13652589917182922, + "learning_rate": 0.0005219093432056133, + "loss": 1.0251, + "step": 3281 + }, + { + "epoch": 1.5049879600963192, + "grad_norm": 0.31048429012298584, + "learning_rate": 0.000521661935897056, + "loss": 1.3032, + "step": 3282 + }, + { + "epoch": 1.505446623093682, + "grad_norm": 0.2567446827888489, + "learning_rate": 0.0005214145232746696, + "loss": 0.8873, + "step": 3283 + }, + { + "epoch": 1.5059052860910445, + "grad_norm": 0.2890402674674988, + "learning_rate": 0.000521167105399146, + "loss": 1.3135, + "step": 3284 + }, + { + "epoch": 1.5063639490884073, + "grad_norm": 0.3106888234615326, + "learning_rate": 0.0005209196823311791, + "loss": 1.3594, + "step": 3285 + }, + { + "epoch": 1.50682261208577, + "grad_norm": 0.28741034865379333, + "learning_rate": 0.0005206722541314631, + "loss": 1.7812, + "step": 3286 + }, + { + "epoch": 1.5072812750831326, + "grad_norm": 0.32389602065086365, + "learning_rate": 0.0005204248208606942, + "loss": 1.4911, + "step": 3287 + }, + { + "epoch": 1.5077399380804954, + "grad_norm": 0.26342904567718506, + "learning_rate": 0.0005201773825795694, + "loss": 1.6143, + "step": 3288 + }, + { + "epoch": 1.508198601077858, + "grad_norm": 0.21466577053070068, + "learning_rate": 0.0005199299393487872, + "loss": 0.3381, + "step": 3289 + }, + { + "epoch": 1.5086572640752207, + "grad_norm": 0.3118995428085327, + "learning_rate": 0.0005196824912290472, + "loss": 1.2741, + "step": 3290 + }, + { + "epoch": 1.5091159270725836, + "grad_norm": 0.3177085220813751, + "learning_rate": 0.00051943503828105, + "loss": 1.3895, + "step": 3291 + }, + { + "epoch": 1.509574590069946, + "grad_norm": 0.14354930818080902, + "learning_rate": 0.0005191875805654981, + "loss": 1.14, + "step": 3292 + }, + { + "epoch": 1.5100332530673088, + "grad_norm": 0.256320983171463, + "learning_rate": 0.0005189401181430941, + "loss": 0.592, + "step": 3293 + }, + { + "epoch": 1.5104919160646715, + "grad_norm": 0.33321627974510193, + "learning_rate": 0.0005186926510745427, + "loss": 2.0759, + "step": 3294 + }, + { + "epoch": 1.5109505790620341, + "grad_norm": 0.26053670048713684, + "learning_rate": 0.0005184451794205491, + "loss": 1.1047, + "step": 3295 + }, + { + "epoch": 1.511409242059397, + "grad_norm": 0.408119261264801, + "learning_rate": 0.00051819770324182, + "loss": 1.6132, + "step": 3296 + }, + { + "epoch": 1.5118679050567594, + "grad_norm": 0.2578348219394684, + "learning_rate": 0.0005179502225990632, + "loss": 0.9136, + "step": 3297 + }, + { + "epoch": 1.5123265680541222, + "grad_norm": 0.2745778560638428, + "learning_rate": 0.0005177027375529872, + "loss": 1.7166, + "step": 3298 + }, + { + "epoch": 1.5127852310514849, + "grad_norm": 0.3158656656742096, + "learning_rate": 0.0005174552481643023, + "loss": 1.1242, + "step": 3299 + }, + { + "epoch": 1.5132438940488475, + "grad_norm": 0.270859032869339, + "learning_rate": 0.0005172077544937188, + "loss": 1.9039, + "step": 3300 + }, + { + "epoch": 1.5137025570462104, + "grad_norm": 0.4259583652019501, + "learning_rate": 0.0005169602566019493, + "loss": 2.1599, + "step": 3301 + }, + { + "epoch": 1.514161220043573, + "grad_norm": 0.33669549226760864, + "learning_rate": 0.0005167127545497066, + "loss": 2.24, + "step": 3302 + }, + { + "epoch": 1.5146198830409356, + "grad_norm": 0.3532778024673462, + "learning_rate": 0.0005164652483977044, + "loss": 1.3746, + "step": 3303 + }, + { + "epoch": 1.5150785460382985, + "grad_norm": 0.22524480521678925, + "learning_rate": 0.0005162177382066584, + "loss": 0.8802, + "step": 3304 + }, + { + "epoch": 1.515537209035661, + "grad_norm": 0.21874026954174042, + "learning_rate": 0.000515970224037284, + "loss": 0.9228, + "step": 3305 + }, + { + "epoch": 1.5159958720330238, + "grad_norm": 0.2486424446105957, + "learning_rate": 0.0005157227059502987, + "loss": 1.7778, + "step": 3306 + }, + { + "epoch": 1.5164545350303864, + "grad_norm": 0.28031882643699646, + "learning_rate": 0.0005154751840064203, + "loss": 1.2578, + "step": 3307 + }, + { + "epoch": 1.516913198027749, + "grad_norm": 0.4096969664096832, + "learning_rate": 0.0005152276582663676, + "loss": 2.0068, + "step": 3308 + }, + { + "epoch": 1.517371861025112, + "grad_norm": 0.3127899467945099, + "learning_rate": 0.0005149801287908604, + "loss": 1.2456, + "step": 3309 + }, + { + "epoch": 1.5178305240224745, + "grad_norm": 0.325255811214447, + "learning_rate": 0.0005147325956406197, + "loss": 1.6535, + "step": 3310 + }, + { + "epoch": 1.5182891870198372, + "grad_norm": 0.06586357206106186, + "learning_rate": 0.000514485058876367, + "loss": 0.9711, + "step": 3311 + }, + { + "epoch": 1.5187478500171998, + "grad_norm": 0.31240516901016235, + "learning_rate": 0.000514237518558825, + "loss": 1.6185, + "step": 3312 + }, + { + "epoch": 1.5192065130145624, + "grad_norm": 0.29829588532447815, + "learning_rate": 0.0005139899747487171, + "loss": 1.2696, + "step": 3313 + }, + { + "epoch": 1.5196651760119253, + "grad_norm": 0.25990763306617737, + "learning_rate": 0.0005137424275067674, + "loss": 1.3779, + "step": 3314 + }, + { + "epoch": 1.520123839009288, + "grad_norm": 0.32825708389282227, + "learning_rate": 0.0005134948768937011, + "loss": 1.3303, + "step": 3315 + }, + { + "epoch": 1.5205825020066506, + "grad_norm": 0.2681593894958496, + "learning_rate": 0.0005132473229702444, + "loss": 1.9851, + "step": 3316 + }, + { + "epoch": 1.5210411650040134, + "grad_norm": 0.2627713680267334, + "learning_rate": 0.0005129997657971236, + "loss": 1.3954, + "step": 3317 + }, + { + "epoch": 1.5214998280013758, + "grad_norm": 0.22389861941337585, + "learning_rate": 0.0005127522054350667, + "loss": 0.7996, + "step": 3318 + }, + { + "epoch": 1.5219584909987387, + "grad_norm": 0.06943022459745407, + "learning_rate": 0.0005125046419448019, + "loss": 0.4316, + "step": 3319 + }, + { + "epoch": 1.5224171539961013, + "grad_norm": 0.16455991566181183, + "learning_rate": 0.0005122570753870582, + "loss": 0.9303, + "step": 3320 + }, + { + "epoch": 1.522875816993464, + "grad_norm": 0.22131837904453278, + "learning_rate": 0.0005120095058225654, + "loss": 1.1393, + "step": 3321 + }, + { + "epoch": 1.5233344799908268, + "grad_norm": 0.2995634973049164, + "learning_rate": 0.0005117619333120544, + "loss": 1.0248, + "step": 3322 + }, + { + "epoch": 1.5237931429881895, + "grad_norm": 0.2665209472179413, + "learning_rate": 0.0005115143579162561, + "loss": 1.1959, + "step": 3323 + }, + { + "epoch": 1.524251805985552, + "grad_norm": 0.41921502351760864, + "learning_rate": 0.000511266779695903, + "loss": 1.6565, + "step": 3324 + }, + { + "epoch": 1.524710468982915, + "grad_norm": 0.3324905037879944, + "learning_rate": 0.0005110191987117277, + "loss": 2.2944, + "step": 3325 + }, + { + "epoch": 1.5251691319802774, + "grad_norm": 0.22564005851745605, + "learning_rate": 0.0005107716150244634, + "loss": 0.761, + "step": 3326 + }, + { + "epoch": 1.5256277949776402, + "grad_norm": 0.25976645946502686, + "learning_rate": 0.0005105240286948442, + "loss": 1.4489, + "step": 3327 + }, + { + "epoch": 1.5260864579750029, + "grad_norm": 0.3188987076282501, + "learning_rate": 0.0005102764397836049, + "loss": 1.4666, + "step": 3328 + }, + { + "epoch": 1.5265451209723655, + "grad_norm": 0.1813998967409134, + "learning_rate": 0.0005100288483514809, + "loss": 0.9902, + "step": 3329 + }, + { + "epoch": 1.5270037839697284, + "grad_norm": 0.24019388854503632, + "learning_rate": 0.000509781254459208, + "loss": 1.1389, + "step": 3330 + }, + { + "epoch": 1.5274624469670908, + "grad_norm": 0.27883660793304443, + "learning_rate": 0.0005095336581675228, + "loss": 1.225, + "step": 3331 + }, + { + "epoch": 1.5279211099644536, + "grad_norm": 0.3314226269721985, + "learning_rate": 0.0005092860595371627, + "loss": 2.3223, + "step": 3332 + }, + { + "epoch": 1.5283797729618163, + "grad_norm": 0.3248099982738495, + "learning_rate": 0.000509038458628865, + "loss": 1.6426, + "step": 3333 + }, + { + "epoch": 1.528838435959179, + "grad_norm": 0.3302631378173828, + "learning_rate": 0.0005087908555033683, + "loss": 1.1074, + "step": 3334 + }, + { + "epoch": 1.5292970989565418, + "grad_norm": 0.2534301280975342, + "learning_rate": 0.0005085432502214116, + "loss": 1.6361, + "step": 3335 + }, + { + "epoch": 1.5297557619539044, + "grad_norm": 0.4103301167488098, + "learning_rate": 0.0005082956428437337, + "loss": 1.5873, + "step": 3336 + }, + { + "epoch": 1.530214424951267, + "grad_norm": 0.17628952860832214, + "learning_rate": 0.000508048033431075, + "loss": 0.789, + "step": 3337 + }, + { + "epoch": 1.5306730879486299, + "grad_norm": 0.28603991866111755, + "learning_rate": 0.0005078004220441756, + "loss": 1.2981, + "step": 3338 + }, + { + "epoch": 1.5311317509459923, + "grad_norm": 0.30459219217300415, + "learning_rate": 0.0005075528087437764, + "loss": 1.3026, + "step": 3339 + }, + { + "epoch": 1.5315904139433552, + "grad_norm": 0.09923279285430908, + "learning_rate": 0.0005073051935906188, + "loss": 0.6987, + "step": 3340 + }, + { + "epoch": 1.5320490769407178, + "grad_norm": 0.28860870003700256, + "learning_rate": 0.0005070575766454445, + "loss": 0.8572, + "step": 3341 + }, + { + "epoch": 1.5325077399380804, + "grad_norm": 0.2449285089969635, + "learning_rate": 0.0005068099579689958, + "loss": 1.2982, + "step": 3342 + }, + { + "epoch": 1.5329664029354433, + "grad_norm": 0.26312926411628723, + "learning_rate": 0.0005065623376220154, + "loss": 1.7064, + "step": 3343 + }, + { + "epoch": 1.533425065932806, + "grad_norm": 0.36778807640075684, + "learning_rate": 0.0005063147156652461, + "loss": 1.5251, + "step": 3344 + }, + { + "epoch": 1.5338837289301686, + "grad_norm": 0.2748435437679291, + "learning_rate": 0.0005060670921594316, + "loss": 1.427, + "step": 3345 + }, + { + "epoch": 1.5343423919275314, + "grad_norm": 0.18528130650520325, + "learning_rate": 0.0005058194671653156, + "loss": 0.4789, + "step": 3346 + }, + { + "epoch": 1.5348010549248938, + "grad_norm": 0.3140660524368286, + "learning_rate": 0.0005055718407436424, + "loss": 1.6775, + "step": 3347 + }, + { + "epoch": 1.5352597179222567, + "grad_norm": 0.37981608510017395, + "learning_rate": 0.0005053242129551564, + "loss": 1.5612, + "step": 3348 + }, + { + "epoch": 1.5357183809196193, + "grad_norm": 0.2672194242477417, + "learning_rate": 0.0005050765838606027, + "loss": 1.0255, + "step": 3349 + }, + { + "epoch": 1.536177043916982, + "grad_norm": 0.08587975054979324, + "learning_rate": 0.0005048289535207264, + "loss": 0.9292, + "step": 3350 + }, + { + "epoch": 1.5366357069143448, + "grad_norm": 0.3729512691497803, + "learning_rate": 0.0005045813219962728, + "loss": 1.7538, + "step": 3351 + }, + { + "epoch": 1.5370943699117072, + "grad_norm": 0.29773613810539246, + "learning_rate": 0.0005043336893479879, + "loss": 1.9409, + "step": 3352 + }, + { + "epoch": 1.53755303290907, + "grad_norm": 0.27870243787765503, + "learning_rate": 0.0005040860556366179, + "loss": 0.8977, + "step": 3353 + }, + { + "epoch": 1.5380116959064327, + "grad_norm": 0.13409045338630676, + "learning_rate": 0.0005038384209229089, + "loss": 0.6826, + "step": 3354 + }, + { + "epoch": 1.5384703589037954, + "grad_norm": 0.22798749804496765, + "learning_rate": 0.0005035907852676076, + "loss": 1.1876, + "step": 3355 + }, + { + "epoch": 1.5389290219011582, + "grad_norm": 0.3014310300350189, + "learning_rate": 0.0005033431487314608, + "loss": 1.7822, + "step": 3356 + }, + { + "epoch": 1.5393876848985208, + "grad_norm": 0.19987566769123077, + "learning_rate": 0.0005030955113752155, + "loss": 0.8876, + "step": 3357 + }, + { + "epoch": 1.5398463478958835, + "grad_norm": 0.2765864431858063, + "learning_rate": 0.0005028478732596189, + "loss": 1.7389, + "step": 3358 + }, + { + "epoch": 1.5403050108932463, + "grad_norm": 0.26134082674980164, + "learning_rate": 0.0005026002344454184, + "loss": 0.8819, + "step": 3359 + }, + { + "epoch": 1.5407636738906088, + "grad_norm": 0.1979461908340454, + "learning_rate": 0.0005023525949933618, + "loss": 1.2951, + "step": 3360 + }, + { + "epoch": 1.5412223368879716, + "grad_norm": 0.5640215873718262, + "learning_rate": 0.0005021049549641967, + "loss": 1.6044, + "step": 3361 + }, + { + "epoch": 1.5416809998853342, + "grad_norm": 0.4211662709712982, + "learning_rate": 0.0005018573144186708, + "loss": 1.8833, + "step": 3362 + }, + { + "epoch": 1.5421396628826969, + "grad_norm": 0.28429386019706726, + "learning_rate": 0.0005016096734175324, + "loss": 1.2702, + "step": 3363 + }, + { + "epoch": 1.5425983258800597, + "grad_norm": 0.20547764003276825, + "learning_rate": 0.0005013620320215294, + "loss": 1.0284, + "step": 3364 + }, + { + "epoch": 1.5430569888774222, + "grad_norm": 0.3609144985675812, + "learning_rate": 0.0005011143902914102, + "loss": 1.1247, + "step": 3365 + }, + { + "epoch": 1.543515651874785, + "grad_norm": 0.3178929388523102, + "learning_rate": 0.000500866748287923, + "loss": 1.6366, + "step": 3366 + }, + { + "epoch": 1.5439743148721476, + "grad_norm": 0.22963233292102814, + "learning_rate": 0.0005006191060718163, + "loss": 1.7095, + "step": 3367 + }, + { + "epoch": 1.5444329778695103, + "grad_norm": 0.29375985264778137, + "learning_rate": 0.0005003714637038381, + "loss": 1.2357, + "step": 3368 + }, + { + "epoch": 1.5448916408668731, + "grad_norm": 0.26059386134147644, + "learning_rate": 0.0005001238212447376, + "loss": 1.4654, + "step": 3369 + }, + { + "epoch": 1.5453503038642358, + "grad_norm": 0.2658666670322418, + "learning_rate": 0.0004998761787552626, + "loss": 1.481, + "step": 3370 + }, + { + "epoch": 1.5458089668615984, + "grad_norm": 0.26609566807746887, + "learning_rate": 0.0004996285362961619, + "loss": 1.1959, + "step": 3371 + }, + { + "epoch": 1.5462676298589613, + "grad_norm": 0.2814446985721588, + "learning_rate": 0.0004993808939281839, + "loss": 1.764, + "step": 3372 + }, + { + "epoch": 1.5467262928563237, + "grad_norm": 0.35369497537612915, + "learning_rate": 0.0004991332517120771, + "loss": 1.0037, + "step": 3373 + }, + { + "epoch": 1.5471849558536865, + "grad_norm": 0.2843606173992157, + "learning_rate": 0.00049888560970859, + "loss": 1.7404, + "step": 3374 + }, + { + "epoch": 1.5476436188510492, + "grad_norm": 0.20866908133029938, + "learning_rate": 0.0004986379679784707, + "loss": 0.5552, + "step": 3375 + }, + { + "epoch": 1.5481022818484118, + "grad_norm": 0.25553098320961, + "learning_rate": 0.0004983903265824677, + "loss": 1.1967, + "step": 3376 + }, + { + "epoch": 1.5485609448457747, + "grad_norm": 0.26248809695243835, + "learning_rate": 0.0004981426855813293, + "loss": 1.6772, + "step": 3377 + }, + { + "epoch": 1.5490196078431373, + "grad_norm": 0.35352829098701477, + "learning_rate": 0.0004978950450358036, + "loss": 1.5785, + "step": 3378 + }, + { + "epoch": 1.5494782708405, + "grad_norm": 0.26842719316482544, + "learning_rate": 0.0004976474050066384, + "loss": 1.6119, + "step": 3379 + }, + { + "epoch": 1.5499369338378628, + "grad_norm": 0.17810232937335968, + "learning_rate": 0.0004973997655545817, + "loss": 1.106, + "step": 3380 + }, + { + "epoch": 1.5503955968352252, + "grad_norm": 0.2810850143432617, + "learning_rate": 0.0004971521267403812, + "loss": 1.1019, + "step": 3381 + }, + { + "epoch": 1.550854259832588, + "grad_norm": 0.34839344024658203, + "learning_rate": 0.0004969044886247846, + "loss": 1.548, + "step": 3382 + }, + { + "epoch": 1.5513129228299507, + "grad_norm": 0.2651059925556183, + "learning_rate": 0.0004966568512685392, + "loss": 1.3282, + "step": 3383 + }, + { + "epoch": 1.5517715858273133, + "grad_norm": 0.2800137996673584, + "learning_rate": 0.0004964092147323925, + "loss": 2.1348, + "step": 3384 + }, + { + "epoch": 1.5522302488246762, + "grad_norm": 0.223674938082695, + "learning_rate": 0.0004961615790770912, + "loss": 0.8656, + "step": 3385 + }, + { + "epoch": 1.5526889118220386, + "grad_norm": 0.3796307146549225, + "learning_rate": 0.0004959139443633823, + "loss": 2.2166, + "step": 3386 + }, + { + "epoch": 1.5531475748194015, + "grad_norm": 0.3234909176826477, + "learning_rate": 0.0004956663106520121, + "loss": 1.7649, + "step": 3387 + }, + { + "epoch": 1.553606237816764, + "grad_norm": 0.3189575672149658, + "learning_rate": 0.0004954186780037273, + "loss": 1.5074, + "step": 3388 + }, + { + "epoch": 1.5540649008141267, + "grad_norm": 0.2766447365283966, + "learning_rate": 0.0004951710464792736, + "loss": 1.6479, + "step": 3389 + }, + { + "epoch": 1.5545235638114896, + "grad_norm": 0.35292649269104004, + "learning_rate": 0.0004949234161393974, + "loss": 1.7307, + "step": 3390 + }, + { + "epoch": 1.5549822268088522, + "grad_norm": 0.24963252246379852, + "learning_rate": 0.0004946757870448437, + "loss": 0.7278, + "step": 3391 + }, + { + "epoch": 1.5554408898062149, + "grad_norm": 0.17716290056705475, + "learning_rate": 0.0004944281592563577, + "loss": 0.7671, + "step": 3392 + }, + { + "epoch": 1.5558995528035777, + "grad_norm": 0.234075665473938, + "learning_rate": 0.0004941805328346845, + "loss": 1.2981, + "step": 3393 + }, + { + "epoch": 1.5563582158009401, + "grad_norm": 0.4235064685344696, + "learning_rate": 0.0004939329078405683, + "loss": 2.1018, + "step": 3394 + }, + { + "epoch": 1.556816878798303, + "grad_norm": 0.31396913528442383, + "learning_rate": 0.0004936852843347541, + "loss": 1.6549, + "step": 3395 + }, + { + "epoch": 1.5572755417956656, + "grad_norm": 0.275888592004776, + "learning_rate": 0.0004934376623779848, + "loss": 1.3459, + "step": 3396 + }, + { + "epoch": 1.5577342047930283, + "grad_norm": 0.36937257647514343, + "learning_rate": 0.0004931900420310042, + "loss": 2.2034, + "step": 3397 + }, + { + "epoch": 1.5581928677903911, + "grad_norm": 0.45881137251853943, + "learning_rate": 0.0004929424233545556, + "loss": 1.7052, + "step": 3398 + }, + { + "epoch": 1.5586515307877535, + "grad_norm": 0.26412880420684814, + "learning_rate": 0.0004926948064093811, + "loss": 1.5784, + "step": 3399 + }, + { + "epoch": 1.5591101937851164, + "grad_norm": 0.35005372762680054, + "learning_rate": 0.0004924471912562236, + "loss": 1.5, + "step": 3400 + }, + { + "epoch": 1.559568856782479, + "grad_norm": 0.27285388112068176, + "learning_rate": 0.0004921995779558246, + "loss": 0.7449, + "step": 3401 + }, + { + "epoch": 1.5600275197798417, + "grad_norm": 0.35796064138412476, + "learning_rate": 0.0004919519665689252, + "loss": 1.644, + "step": 3402 + }, + { + "epoch": 1.5604861827772045, + "grad_norm": 0.25865140557289124, + "learning_rate": 0.0004917043571562664, + "loss": 1.5549, + "step": 3403 + }, + { + "epoch": 1.5609448457745672, + "grad_norm": 0.3593021631240845, + "learning_rate": 0.0004914567497785885, + "loss": 1.1608, + "step": 3404 + }, + { + "epoch": 1.5614035087719298, + "grad_norm": 0.2613557279109955, + "learning_rate": 0.0004912091444966316, + "loss": 1.0942, + "step": 3405 + }, + { + "epoch": 1.5618621717692927, + "grad_norm": 0.20529672503471375, + "learning_rate": 0.0004909615413711351, + "loss": 1.1398, + "step": 3406 + }, + { + "epoch": 1.562320834766655, + "grad_norm": 0.3400242030620575, + "learning_rate": 0.0004907139404628375, + "loss": 1.6556, + "step": 3407 + }, + { + "epoch": 1.562779497764018, + "grad_norm": 0.19008702039718628, + "learning_rate": 0.0004904663418324772, + "loss": 0.3512, + "step": 3408 + }, + { + "epoch": 1.5632381607613806, + "grad_norm": 0.28319889307022095, + "learning_rate": 0.0004902187455407921, + "loss": 1.2299, + "step": 3409 + }, + { + "epoch": 1.5636968237587432, + "grad_norm": 0.19672320783138275, + "learning_rate": 0.0004899711516485192, + "loss": 1.1458, + "step": 3410 + }, + { + "epoch": 1.564155486756106, + "grad_norm": 0.38868579268455505, + "learning_rate": 0.0004897235602163952, + "loss": 1.691, + "step": 3411 + }, + { + "epoch": 1.5646141497534687, + "grad_norm": 0.33131128549575806, + "learning_rate": 0.0004894759713051561, + "loss": 1.6458, + "step": 3412 + }, + { + "epoch": 1.5650728127508313, + "grad_norm": 0.31250569224357605, + "learning_rate": 0.0004892283849755368, + "loss": 1.0369, + "step": 3413 + }, + { + "epoch": 1.5655314757481942, + "grad_norm": 0.3426419198513031, + "learning_rate": 0.0004889808012882725, + "loss": 1.7825, + "step": 3414 + }, + { + "epoch": 1.5659901387455566, + "grad_norm": 0.10699386149644852, + "learning_rate": 0.000488733220304097, + "loss": 0.5646, + "step": 3415 + }, + { + "epoch": 1.5664488017429194, + "grad_norm": 0.22663241624832153, + "learning_rate": 0.0004884856420837438, + "loss": 1.1749, + "step": 3416 + }, + { + "epoch": 1.566907464740282, + "grad_norm": 0.21530061960220337, + "learning_rate": 0.0004882380666879457, + "loss": 1.2376, + "step": 3417 + }, + { + "epoch": 1.5673661277376447, + "grad_norm": 0.4145222306251526, + "learning_rate": 0.0004879904941774347, + "loss": 1.5345, + "step": 3418 + }, + { + "epoch": 1.5678247907350076, + "grad_norm": 0.28199657797813416, + "learning_rate": 0.00048774292461294203, + "loss": 1.5359, + "step": 3419 + }, + { + "epoch": 1.56828345373237, + "grad_norm": 0.35698872804641724, + "learning_rate": 0.00048749535805519824, + "loss": 1.5125, + "step": 3420 + }, + { + "epoch": 1.5687421167297328, + "grad_norm": 0.08986271172761917, + "learning_rate": 0.00048724779456493333, + "loss": 0.5488, + "step": 3421 + }, + { + "epoch": 1.5692007797270955, + "grad_norm": 0.28814515471458435, + "learning_rate": 0.00048700023420287635, + "loss": 1.4119, + "step": 3422 + }, + { + "epoch": 1.5696594427244581, + "grad_norm": 0.33969786763191223, + "learning_rate": 0.0004867526770297558, + "loss": 2.0642, + "step": 3423 + }, + { + "epoch": 1.570118105721821, + "grad_norm": 0.18406681716442108, + "learning_rate": 0.00048650512310629895, + "loss": 0.7826, + "step": 3424 + }, + { + "epoch": 1.5705767687191836, + "grad_norm": 0.2164044976234436, + "learning_rate": 0.0004862575724932327, + "loss": 1.1195, + "step": 3425 + }, + { + "epoch": 1.5710354317165462, + "grad_norm": 0.3818514943122864, + "learning_rate": 0.00048601002525128304, + "loss": 1.6297, + "step": 3426 + }, + { + "epoch": 1.571494094713909, + "grad_norm": 0.27068087458610535, + "learning_rate": 0.000485762481441175, + "loss": 1.4949, + "step": 3427 + }, + { + "epoch": 1.5719527577112715, + "grad_norm": 0.2720354497432709, + "learning_rate": 0.000485514941123633, + "loss": 1.3561, + "step": 3428 + }, + { + "epoch": 1.5724114207086344, + "grad_norm": 0.273271769285202, + "learning_rate": 0.00048526740435938045, + "loss": 1.4606, + "step": 3429 + }, + { + "epoch": 1.572870083705997, + "grad_norm": 0.06212342157959938, + "learning_rate": 0.0004850198712091397, + "loss": 0.6363, + "step": 3430 + }, + { + "epoch": 1.5733287467033596, + "grad_norm": 0.17056238651275635, + "learning_rate": 0.0004847723417336326, + "loss": 0.4197, + "step": 3431 + }, + { + "epoch": 1.5737874097007225, + "grad_norm": 0.3277488648891449, + "learning_rate": 0.00048452481599357985, + "loss": 1.6985, + "step": 3432 + }, + { + "epoch": 1.574246072698085, + "grad_norm": 0.29063600301742554, + "learning_rate": 0.00048427729404970133, + "loss": 1.446, + "step": 3433 + }, + { + "epoch": 1.5747047356954478, + "grad_norm": 0.2664231061935425, + "learning_rate": 0.00048402977596271604, + "loss": 1.5747, + "step": 3434 + }, + { + "epoch": 1.5751633986928104, + "grad_norm": 0.28931164741516113, + "learning_rate": 0.0004837822617933417, + "loss": 1.275, + "step": 3435 + }, + { + "epoch": 1.575622061690173, + "grad_norm": 0.3391111195087433, + "learning_rate": 0.0004835347516022956, + "loss": 2.0002, + "step": 3436 + }, + { + "epoch": 1.576080724687536, + "grad_norm": 0.3661150336265564, + "learning_rate": 0.00048328724545029355, + "loss": 1.0988, + "step": 3437 + }, + { + "epoch": 1.5765393876848985, + "grad_norm": 0.1353694349527359, + "learning_rate": 0.00048303974339805074, + "loss": 0.6863, + "step": 3438 + }, + { + "epoch": 1.5769980506822612, + "grad_norm": 0.14725974202156067, + "learning_rate": 0.00048279224550628117, + "loss": 1.3112, + "step": 3439 + }, + { + "epoch": 1.577456713679624, + "grad_norm": 0.2503822147846222, + "learning_rate": 0.000482544751835698, + "loss": 1.1244, + "step": 3440 + }, + { + "epoch": 1.5779153766769864, + "grad_norm": 0.55783611536026, + "learning_rate": 0.0004822972624470128, + "loss": 1.9937, + "step": 3441 + }, + { + "epoch": 1.5783740396743493, + "grad_norm": 0.2923833429813385, + "learning_rate": 0.00048204977740093694, + "loss": 1.0448, + "step": 3442 + }, + { + "epoch": 1.578832702671712, + "grad_norm": 0.09032203257083893, + "learning_rate": 0.00048180229675817997, + "loss": 0.529, + "step": 3443 + }, + { + "epoch": 1.5792913656690746, + "grad_norm": 0.12156753242015839, + "learning_rate": 0.00048155482057945094, + "loss": 0.9733, + "step": 3444 + }, + { + "epoch": 1.5797500286664374, + "grad_norm": 0.3344416916370392, + "learning_rate": 0.00048130734892545737, + "loss": 1.7934, + "step": 3445 + }, + { + "epoch": 1.5802086916638, + "grad_norm": 0.35299161076545715, + "learning_rate": 0.00048105988185690596, + "loss": 1.5703, + "step": 3446 + }, + { + "epoch": 1.5806673546611627, + "grad_norm": 0.22280535101890564, + "learning_rate": 0.00048081241943450205, + "loss": 1.6415, + "step": 3447 + }, + { + "epoch": 1.5811260176585256, + "grad_norm": 0.34450089931488037, + "learning_rate": 0.00048056496171895, + "loss": 1.4304, + "step": 3448 + }, + { + "epoch": 1.581584680655888, + "grad_norm": 0.2558358311653137, + "learning_rate": 0.0004803175087709529, + "loss": 1.5538, + "step": 3449 + }, + { + "epoch": 1.5820433436532508, + "grad_norm": 0.2851111888885498, + "learning_rate": 0.00048007006065121287, + "loss": 1.8889, + "step": 3450 + }, + { + "epoch": 1.5825020066506135, + "grad_norm": 0.21149995923042297, + "learning_rate": 0.0004798226174204308, + "loss": 0.9199, + "step": 3451 + }, + { + "epoch": 1.582960669647976, + "grad_norm": 0.31764793395996094, + "learning_rate": 0.00047957517913930597, + "loss": 1.7222, + "step": 3452 + }, + { + "epoch": 1.583419332645339, + "grad_norm": 0.27936065196990967, + "learning_rate": 0.00047932774586853706, + "loss": 1.4536, + "step": 3453 + }, + { + "epoch": 1.5838779956427014, + "grad_norm": 0.35680124163627625, + "learning_rate": 0.0004790803176688211, + "loss": 1.7658, + "step": 3454 + }, + { + "epoch": 1.5843366586400642, + "grad_norm": 0.2759742736816406, + "learning_rate": 0.00047883289460085406, + "loss": 1.122, + "step": 3455 + }, + { + "epoch": 1.5847953216374269, + "grad_norm": 0.36273276805877686, + "learning_rate": 0.0004785854767253305, + "loss": 1.7786, + "step": 3456 + }, + { + "epoch": 1.5852539846347895, + "grad_norm": 0.2069203108549118, + "learning_rate": 0.00047833806410294417, + "loss": 0.7401, + "step": 3457 + }, + { + "epoch": 1.5857126476321524, + "grad_norm": 0.2959374487400055, + "learning_rate": 0.00047809065679438675, + "loss": 1.3228, + "step": 3458 + }, + { + "epoch": 1.586171310629515, + "grad_norm": 0.25699517130851746, + "learning_rate": 0.00047784325486034917, + "loss": 1.2399, + "step": 3459 + }, + { + "epoch": 1.5866299736268776, + "grad_norm": 0.22081950306892395, + "learning_rate": 0.00047759585836152095, + "loss": 1.1349, + "step": 3460 + }, + { + "epoch": 1.5870886366242405, + "grad_norm": 0.304353266954422, + "learning_rate": 0.0004773484673585901, + "loss": 1.9224, + "step": 3461 + }, + { + "epoch": 1.587547299621603, + "grad_norm": 0.3038237392902374, + "learning_rate": 0.00047710108191224387, + "loss": 1.6517, + "step": 3462 + }, + { + "epoch": 1.5880059626189658, + "grad_norm": 0.24518848955631256, + "learning_rate": 0.00047685370208316717, + "loss": 0.8, + "step": 3463 + }, + { + "epoch": 1.5884646256163284, + "grad_norm": 0.2411755472421646, + "learning_rate": 0.0004766063279320442, + "loss": 1.7107, + "step": 3464 + }, + { + "epoch": 1.588923288613691, + "grad_norm": 0.31453174352645874, + "learning_rate": 0.0004763589595195576, + "loss": 1.9206, + "step": 3465 + }, + { + "epoch": 1.589381951611054, + "grad_norm": 0.36513078212738037, + "learning_rate": 0.00047611159690638867, + "loss": 1.6099, + "step": 3466 + }, + { + "epoch": 1.5898406146084163, + "grad_norm": 0.2547532916069031, + "learning_rate": 0.00047586424015321735, + "loss": 1.4818, + "step": 3467 + }, + { + "epoch": 1.5902992776057792, + "grad_norm": 0.35492321848869324, + "learning_rate": 0.00047561688932072215, + "loss": 1.6364, + "step": 3468 + }, + { + "epoch": 1.5907579406031418, + "grad_norm": 0.2952052652835846, + "learning_rate": 0.00047536954446957957, + "loss": 1.6503, + "step": 3469 + }, + { + "epoch": 1.5912166036005044, + "grad_norm": 0.3726816773414612, + "learning_rate": 0.0004751222056604654, + "loss": 2.0603, + "step": 3470 + }, + { + "epoch": 1.5916752665978673, + "grad_norm": 0.33943477272987366, + "learning_rate": 0.0004748748729540534, + "loss": 1.558, + "step": 3471 + }, + { + "epoch": 1.59213392959523, + "grad_norm": 0.35351845622062683, + "learning_rate": 0.00047462754641101635, + "loss": 1.6615, + "step": 3472 + }, + { + "epoch": 1.5925925925925926, + "grad_norm": 0.40344950556755066, + "learning_rate": 0.00047438022609202536, + "loss": 1.6789, + "step": 3473 + }, + { + "epoch": 1.5930512555899554, + "grad_norm": 0.16976001858711243, + "learning_rate": 0.0004741329120577494, + "loss": 0.945, + "step": 3474 + }, + { + "epoch": 1.5935099185873178, + "grad_norm": 0.25279539823532104, + "learning_rate": 0.00047388560436885656, + "loss": 1.0529, + "step": 3475 + }, + { + "epoch": 1.5939685815846807, + "grad_norm": 0.22543933987617493, + "learning_rate": 0.0004736383030860132, + "loss": 1.2178, + "step": 3476 + }, + { + "epoch": 1.5944272445820433, + "grad_norm": 0.33394721150398254, + "learning_rate": 0.00047339100826988427, + "loss": 1.3074, + "step": 3477 + }, + { + "epoch": 1.594885907579406, + "grad_norm": 0.30631691217422485, + "learning_rate": 0.0004731437199811329, + "loss": 1.2521, + "step": 3478 + }, + { + "epoch": 1.5953445705767688, + "grad_norm": 0.20698705315589905, + "learning_rate": 0.0004728964382804209, + "loss": 1.071, + "step": 3479 + }, + { + "epoch": 1.5958032335741315, + "grad_norm": 0.5738452076911926, + "learning_rate": 0.00047264916322840774, + "loss": 1.965, + "step": 3480 + }, + { + "epoch": 1.596261896571494, + "grad_norm": 0.42928802967071533, + "learning_rate": 0.000472401894885752, + "loss": 1.6219, + "step": 3481 + }, + { + "epoch": 1.596720559568857, + "grad_norm": 0.30077821016311646, + "learning_rate": 0.00047215463331311047, + "loss": 1.8263, + "step": 3482 + }, + { + "epoch": 1.5971792225662194, + "grad_norm": 0.27442824840545654, + "learning_rate": 0.00047190737857113823, + "loss": 1.48, + "step": 3483 + }, + { + "epoch": 1.5976378855635822, + "grad_norm": 0.3932221531867981, + "learning_rate": 0.00047166013072048857, + "loss": 1.9976, + "step": 3484 + }, + { + "epoch": 1.5980965485609449, + "grad_norm": 0.30163654685020447, + "learning_rate": 0.0004714128898218133, + "loss": 1.4284, + "step": 3485 + }, + { + "epoch": 1.5985552115583075, + "grad_norm": 0.29906541109085083, + "learning_rate": 0.0004711656559357621, + "loss": 1.1145, + "step": 3486 + }, + { + "epoch": 1.5990138745556703, + "grad_norm": 0.2641544044017792, + "learning_rate": 0.0004709184291229835, + "loss": 1.3258, + "step": 3487 + }, + { + "epoch": 1.5994725375530328, + "grad_norm": 0.2394290268421173, + "learning_rate": 0.0004706712094441239, + "loss": 1.6715, + "step": 3488 + }, + { + "epoch": 1.5999312005503956, + "grad_norm": 0.33737921714782715, + "learning_rate": 0.0004704239969598281, + "loss": 1.3931, + "step": 3489 + }, + { + "epoch": 1.6003898635477583, + "grad_norm": 0.2610185444355011, + "learning_rate": 0.0004701767917307391, + "loss": 1.7404, + "step": 3490 + }, + { + "epoch": 1.6008485265451209, + "grad_norm": 0.22963584959506989, + "learning_rate": 0.00046992959381749816, + "loss": 1.2682, + "step": 3491 + }, + { + "epoch": 1.6013071895424837, + "grad_norm": 0.28074997663497925, + "learning_rate": 0.00046968240328074465, + "loss": 1.2201, + "step": 3492 + }, + { + "epoch": 1.6017658525398464, + "grad_norm": 0.29938602447509766, + "learning_rate": 0.00046943522018111616, + "loss": 1.4655, + "step": 3493 + }, + { + "epoch": 1.602224515537209, + "grad_norm": 0.2991687059402466, + "learning_rate": 0.0004691880445792486, + "loss": 2.1379, + "step": 3494 + }, + { + "epoch": 1.6026831785345719, + "grad_norm": 0.2304016351699829, + "learning_rate": 0.0004689408765357758, + "loss": 0.7851, + "step": 3495 + }, + { + "epoch": 1.6031418415319343, + "grad_norm": 0.3397028148174286, + "learning_rate": 0.0004686937161113301, + "loss": 1.3103, + "step": 3496 + }, + { + "epoch": 1.6036005045292971, + "grad_norm": 0.23544049263000488, + "learning_rate": 0.0004684465633665415, + "loss": 1.593, + "step": 3497 + }, + { + "epoch": 1.6040591675266598, + "grad_norm": 0.2129555195569992, + "learning_rate": 0.0004681994183620383, + "loss": 0.743, + "step": 3498 + }, + { + "epoch": 1.6045178305240224, + "grad_norm": 0.4064857065677643, + "learning_rate": 0.0004679522811584471, + "loss": 1.6642, + "step": 3499 + }, + { + "epoch": 1.6049764935213853, + "grad_norm": 0.1782543659210205, + "learning_rate": 0.0004677051518163925, + "loss": 0.8695, + "step": 3500 + }, + { + "epoch": 1.6054351565187477, + "grad_norm": 0.21518990397453308, + "learning_rate": 0.00046745803039649703, + "loss": 0.8586, + "step": 3501 + }, + { + "epoch": 1.6058938195161105, + "grad_norm": 0.3452260494232178, + "learning_rate": 0.0004672109169593813, + "loss": 1.7291, + "step": 3502 + }, + { + "epoch": 1.6063524825134732, + "grad_norm": 0.20058414340019226, + "learning_rate": 0.00046696381156566405, + "loss": 1.2682, + "step": 3503 + }, + { + "epoch": 1.6068111455108358, + "grad_norm": 0.2975291609764099, + "learning_rate": 0.00046671671427596194, + "loss": 1.2484, + "step": 3504 + }, + { + "epoch": 1.6072698085081987, + "grad_norm": 0.6816866397857666, + "learning_rate": 0.0004664696251508899, + "loss": 1.4316, + "step": 3505 + }, + { + "epoch": 1.6077284715055613, + "grad_norm": 0.27953004837036133, + "learning_rate": 0.00046622254425106053, + "loss": 1.7111, + "step": 3506 + }, + { + "epoch": 1.608187134502924, + "grad_norm": 0.34763795137405396, + "learning_rate": 0.0004659754716370848, + "loss": 1.1754, + "step": 3507 + }, + { + "epoch": 1.6086457975002868, + "grad_norm": 0.2617149353027344, + "learning_rate": 0.00046572840736957096, + "loss": 1.636, + "step": 3508 + }, + { + "epoch": 1.6091044604976492, + "grad_norm": 0.3082357943058014, + "learning_rate": 0.00046548135150912596, + "loss": 1.7413, + "step": 3509 + }, + { + "epoch": 1.609563123495012, + "grad_norm": 0.2684236764907837, + "learning_rate": 0.00046523430411635436, + "loss": 0.8466, + "step": 3510 + }, + { + "epoch": 1.6100217864923747, + "grad_norm": 0.2572011649608612, + "learning_rate": 0.00046498726525185866, + "loss": 1.6624, + "step": 3511 + }, + { + "epoch": 1.6104804494897373, + "grad_norm": 0.27827000617980957, + "learning_rate": 0.0004647402349762392, + "loss": 1.3497, + "step": 3512 + }, + { + "epoch": 1.6109391124871002, + "grad_norm": 0.2422563135623932, + "learning_rate": 0.00046449321335009444, + "loss": 0.9238, + "step": 3513 + }, + { + "epoch": 1.6113977754844628, + "grad_norm": 0.2140582948923111, + "learning_rate": 0.0004642462004340203, + "loss": 1.4498, + "step": 3514 + }, + { + "epoch": 1.6118564384818255, + "grad_norm": 0.3195685148239136, + "learning_rate": 0.0004639991962886111, + "loss": 1.6652, + "step": 3515 + }, + { + "epoch": 1.6123151014791883, + "grad_norm": 0.40083667635917664, + "learning_rate": 0.0004637522009744586, + "loss": 1.7421, + "step": 3516 + }, + { + "epoch": 1.6127737644765507, + "grad_norm": 0.2781684398651123, + "learning_rate": 0.0004635052145521525, + "loss": 1.3621, + "step": 3517 + }, + { + "epoch": 1.6132324274739136, + "grad_norm": 0.3488773703575134, + "learning_rate": 0.00046325823708228054, + "loss": 1.6398, + "step": 3518 + }, + { + "epoch": 1.6136910904712762, + "grad_norm": 0.3113149106502533, + "learning_rate": 0.0004630112686254279, + "loss": 1.9727, + "step": 3519 + }, + { + "epoch": 1.6141497534686389, + "grad_norm": 0.3670024871826172, + "learning_rate": 0.0004627643092421777, + "loss": 1.7104, + "step": 3520 + }, + { + "epoch": 1.6146084164660017, + "grad_norm": 0.2058316022157669, + "learning_rate": 0.00046251735899311097, + "loss": 1.31, + "step": 3521 + }, + { + "epoch": 1.6150670794633641, + "grad_norm": 0.2428571581840515, + "learning_rate": 0.0004622704179388063, + "loss": 1.3736, + "step": 3522 + }, + { + "epoch": 1.615525742460727, + "grad_norm": 0.4996251165866852, + "learning_rate": 0.00046202348613984007, + "loss": 1.9026, + "step": 3523 + }, + { + "epoch": 1.6159844054580896, + "grad_norm": 0.33074381947517395, + "learning_rate": 0.0004617765636567869, + "loss": 1.9189, + "step": 3524 + }, + { + "epoch": 1.6164430684554523, + "grad_norm": 0.2520156800746918, + "learning_rate": 0.00046152965055021794, + "loss": 0.8857, + "step": 3525 + }, + { + "epoch": 1.6169017314528151, + "grad_norm": 0.28665515780448914, + "learning_rate": 0.00046128274688070315, + "loss": 1.8541, + "step": 3526 + }, + { + "epoch": 1.6173603944501778, + "grad_norm": 0.18662500381469727, + "learning_rate": 0.0004610358527088097, + "loss": 0.7944, + "step": 3527 + }, + { + "epoch": 1.6178190574475404, + "grad_norm": 0.3536602258682251, + "learning_rate": 0.00046078896809510246, + "loss": 1.3347, + "step": 3528 + }, + { + "epoch": 1.6182777204449033, + "grad_norm": 0.3583596348762512, + "learning_rate": 0.00046054209310014433, + "loss": 1.6628, + "step": 3529 + }, + { + "epoch": 1.6187363834422657, + "grad_norm": 0.6140498518943787, + "learning_rate": 0.0004602952277844951, + "loss": 2.1606, + "step": 3530 + }, + { + "epoch": 1.6191950464396285, + "grad_norm": 1.5802074670791626, + "learning_rate": 0.00046004837220871276, + "loss": 1.8225, + "step": 3531 + }, + { + "epoch": 1.6196537094369912, + "grad_norm": 0.33315399289131165, + "learning_rate": 0.0004598015264333528, + "loss": 1.4759, + "step": 3532 + }, + { + "epoch": 1.6201123724343538, + "grad_norm": 0.24571913480758667, + "learning_rate": 0.00045955469051896816, + "loss": 1.5706, + "step": 3533 + }, + { + "epoch": 1.6205710354317167, + "grad_norm": 0.33706846833229065, + "learning_rate": 0.0004593078645261096, + "loss": 1.7424, + "step": 3534 + }, + { + "epoch": 1.621029698429079, + "grad_norm": 0.29877063632011414, + "learning_rate": 0.0004590610485153255, + "loss": 1.397, + "step": 3535 + }, + { + "epoch": 1.621488361426442, + "grad_norm": 0.33204445242881775, + "learning_rate": 0.00045881424254716127, + "loss": 1.1624, + "step": 3536 + }, + { + "epoch": 1.6219470244238046, + "grad_norm": 0.2581148147583008, + "learning_rate": 0.00045856744668216023, + "loss": 1.7374, + "step": 3537 + }, + { + "epoch": 1.6224056874211672, + "grad_norm": 0.33581775426864624, + "learning_rate": 0.00045832066098086316, + "loss": 1.9, + "step": 3538 + }, + { + "epoch": 1.62286435041853, + "grad_norm": 0.1745557337999344, + "learning_rate": 0.00045807388550380855, + "loss": 0.8279, + "step": 3539 + }, + { + "epoch": 1.6233230134158927, + "grad_norm": 0.31032976508140564, + "learning_rate": 0.00045782712031153223, + "loss": 1.6848, + "step": 3540 + }, + { + "epoch": 1.6237816764132553, + "grad_norm": 0.1794162392616272, + "learning_rate": 0.0004575803654645675, + "loss": 0.5406, + "step": 3541 + }, + { + "epoch": 1.6242403394106182, + "grad_norm": 0.19714608788490295, + "learning_rate": 0.00045733362102344483, + "loss": 1.3489, + "step": 3542 + }, + { + "epoch": 1.6246990024079806, + "grad_norm": 0.3311532139778137, + "learning_rate": 0.0004570868870486924, + "loss": 1.7387, + "step": 3543 + }, + { + "epoch": 1.6251576654053435, + "grad_norm": 0.3105451762676239, + "learning_rate": 0.00045684016360083614, + "loss": 1.4056, + "step": 3544 + }, + { + "epoch": 1.625616328402706, + "grad_norm": 0.14965184032917023, + "learning_rate": 0.00045659345074039893, + "loss": 0.7161, + "step": 3545 + }, + { + "epoch": 1.6260749914000687, + "grad_norm": 0.2359105944633484, + "learning_rate": 0.0004563467485279015, + "loss": 1.7198, + "step": 3546 + }, + { + "epoch": 1.6265336543974316, + "grad_norm": 0.3810403645038605, + "learning_rate": 0.00045610005702386114, + "loss": 1.8206, + "step": 3547 + }, + { + "epoch": 1.6269923173947942, + "grad_norm": 0.2406771034002304, + "learning_rate": 0.0004558533762887932, + "loss": 1.6469, + "step": 3548 + }, + { + "epoch": 1.6274509803921569, + "grad_norm": 0.29384976625442505, + "learning_rate": 0.00045560670638321055, + "loss": 1.3338, + "step": 3549 + }, + { + "epoch": 1.6279096433895197, + "grad_norm": 0.3127608597278595, + "learning_rate": 0.0004553600473676229, + "loss": 1.8749, + "step": 3550 + }, + { + "epoch": 1.6283683063868821, + "grad_norm": 0.24576979875564575, + "learning_rate": 0.0004551133993025374, + "loss": 0.7694, + "step": 3551 + }, + { + "epoch": 1.628826969384245, + "grad_norm": 0.315415620803833, + "learning_rate": 0.000454866762248459, + "loss": 1.6774, + "step": 3552 + }, + { + "epoch": 1.6292856323816076, + "grad_norm": 0.2339719533920288, + "learning_rate": 0.00045462013626588896, + "loss": 0.7991, + "step": 3553 + }, + { + "epoch": 1.6297442953789703, + "grad_norm": 0.2772282361984253, + "learning_rate": 0.0004543735214153267, + "loss": 1.3975, + "step": 3554 + }, + { + "epoch": 1.6302029583763331, + "grad_norm": 0.2241169661283493, + "learning_rate": 0.00045412691775726874, + "loss": 0.8122, + "step": 3555 + }, + { + "epoch": 1.6306616213736955, + "grad_norm": 0.10823415219783783, + "learning_rate": 0.0004538803253522086, + "loss": 0.9379, + "step": 3556 + }, + { + "epoch": 1.6311202843710584, + "grad_norm": 0.3783659040927887, + "learning_rate": 0.00045363374426063717, + "loss": 2.1521, + "step": 3557 + }, + { + "epoch": 1.631578947368421, + "grad_norm": 0.3071146309375763, + "learning_rate": 0.00045338717454304265, + "loss": 0.8087, + "step": 3558 + }, + { + "epoch": 1.6320376103657837, + "grad_norm": 0.2720809876918793, + "learning_rate": 0.0004531406162599102, + "loss": 1.369, + "step": 3559 + }, + { + "epoch": 1.6324962733631465, + "grad_norm": 0.21303652226924896, + "learning_rate": 0.0004528940694717225, + "loss": 1.4075, + "step": 3560 + }, + { + "epoch": 1.6329549363605091, + "grad_norm": 0.2808886766433716, + "learning_rate": 0.0004526475342389592, + "loss": 1.2719, + "step": 3561 + }, + { + "epoch": 1.6334135993578718, + "grad_norm": 0.21272645890712738, + "learning_rate": 0.0004524010106220972, + "loss": 1.0988, + "step": 3562 + }, + { + "epoch": 1.6338722623552346, + "grad_norm": 0.49326127767562866, + "learning_rate": 0.00045215449868161057, + "loss": 1.6325, + "step": 3563 + }, + { + "epoch": 1.634330925352597, + "grad_norm": 0.19443345069885254, + "learning_rate": 0.0004519079984779703, + "loss": 0.8355, + "step": 3564 + }, + { + "epoch": 1.63478958834996, + "grad_norm": 0.2636580169200897, + "learning_rate": 0.0004516615100716448, + "loss": 1.3528, + "step": 3565 + }, + { + "epoch": 1.6352482513473225, + "grad_norm": 0.2543386220932007, + "learning_rate": 0.0004514150335230994, + "loss": 1.15, + "step": 3566 + }, + { + "epoch": 1.6357069143446852, + "grad_norm": 0.18656910955905914, + "learning_rate": 0.0004511685688927966, + "loss": 1.1378, + "step": 3567 + }, + { + "epoch": 1.636165577342048, + "grad_norm": 0.2844998836517334, + "learning_rate": 0.0004509221162411959, + "loss": 1.1173, + "step": 3568 + }, + { + "epoch": 1.6366242403394105, + "grad_norm": 0.1812886893749237, + "learning_rate": 0.00045067567562875405, + "loss": 1.0365, + "step": 3569 + }, + { + "epoch": 1.6370829033367733, + "grad_norm": 0.20662900805473328, + "learning_rate": 0.0004504292471159246, + "loss": 1.2693, + "step": 3570 + }, + { + "epoch": 1.637541566334136, + "grad_norm": 0.2970770299434662, + "learning_rate": 0.00045018283076315817, + "loss": 1.7665, + "step": 3571 + }, + { + "epoch": 1.6380002293314986, + "grad_norm": 0.22958998382091522, + "learning_rate": 0.0004499364266309026, + "loss": 0.7569, + "step": 3572 + }, + { + "epoch": 1.6384588923288614, + "grad_norm": 0.22331257164478302, + "learning_rate": 0.0004496900347796025, + "loss": 1.5889, + "step": 3573 + }, + { + "epoch": 1.638917555326224, + "grad_norm": 0.2571808695793152, + "learning_rate": 0.00044944365526969983, + "loss": 0.8337, + "step": 3574 + }, + { + "epoch": 1.6393762183235867, + "grad_norm": 0.17887678742408752, + "learning_rate": 0.0004491972881616329, + "loss": 0.6978, + "step": 3575 + }, + { + "epoch": 1.6398348813209496, + "grad_norm": 0.24759553372859955, + "learning_rate": 0.00044895093351583743, + "loss": 1.2387, + "step": 3576 + }, + { + "epoch": 1.640293544318312, + "grad_norm": 0.3115570545196533, + "learning_rate": 0.0004487045913927461, + "loss": 1.6693, + "step": 3577 + }, + { + "epoch": 1.6407522073156748, + "grad_norm": 0.32540813088417053, + "learning_rate": 0.00044845826185278836, + "loss": 1.0903, + "step": 3578 + }, + { + "epoch": 1.6412108703130375, + "grad_norm": 0.08525694906711578, + "learning_rate": 0.0004482119449563906, + "loss": 0.5742, + "step": 3579 + }, + { + "epoch": 1.6416695333104, + "grad_norm": 0.12443814426660538, + "learning_rate": 0.0004479656407639763, + "loss": 1.0961, + "step": 3580 + }, + { + "epoch": 1.642128196307763, + "grad_norm": 0.2620854079723358, + "learning_rate": 0.00044771934933596544, + "loss": 1.0662, + "step": 3581 + }, + { + "epoch": 1.6425868593051256, + "grad_norm": 0.20561723411083221, + "learning_rate": 0.00044747307073277504, + "loss": 1.4257, + "step": 3582 + }, + { + "epoch": 1.6430455223024882, + "grad_norm": 0.42892375588417053, + "learning_rate": 0.00044722680501481924, + "loss": 1.651, + "step": 3583 + }, + { + "epoch": 1.643504185299851, + "grad_norm": 0.18103362619876862, + "learning_rate": 0.00044698055224250854, + "loss": 1.4236, + "step": 3584 + }, + { + "epoch": 1.6439628482972135, + "grad_norm": 0.3172457218170166, + "learning_rate": 0.0004467343124762509, + "loss": 1.7086, + "step": 3585 + }, + { + "epoch": 1.6444215112945764, + "grad_norm": 0.3391329348087311, + "learning_rate": 0.0004464880857764503, + "loss": 1.2327, + "step": 3586 + }, + { + "epoch": 1.644880174291939, + "grad_norm": 0.2603316307067871, + "learning_rate": 0.00044624187220350815, + "loss": 1.8528, + "step": 3587 + }, + { + "epoch": 1.6453388372893016, + "grad_norm": 0.2531551122665405, + "learning_rate": 0.0004459956718178224, + "loss": 1.1763, + "step": 3588 + }, + { + "epoch": 1.6457975002866645, + "grad_norm": 0.3372352719306946, + "learning_rate": 0.00044574948467978763, + "loss": 1.3344, + "step": 3589 + }, + { + "epoch": 1.646256163284027, + "grad_norm": 0.2342667281627655, + "learning_rate": 0.0004455033108497954, + "loss": 1.1017, + "step": 3590 + }, + { + "epoch": 1.6467148262813898, + "grad_norm": 0.21960115432739258, + "learning_rate": 0.0004452571503882342, + "loss": 0.8768, + "step": 3591 + }, + { + "epoch": 1.6471734892787524, + "grad_norm": 0.24046221375465393, + "learning_rate": 0.0004450110033554886, + "loss": 1.0574, + "step": 3592 + }, + { + "epoch": 1.647632152276115, + "grad_norm": 0.22882205247879028, + "learning_rate": 0.00044476486981194035, + "loss": 1.2656, + "step": 3593 + }, + { + "epoch": 1.648090815273478, + "grad_norm": 0.21356913447380066, + "learning_rate": 0.0004445187498179678, + "loss": 0.9254, + "step": 3594 + }, + { + "epoch": 1.6485494782708405, + "grad_norm": 0.2324090301990509, + "learning_rate": 0.00044427264343394583, + "loss": 1.6883, + "step": 3595 + }, + { + "epoch": 1.6490081412682032, + "grad_norm": 0.19994021952152252, + "learning_rate": 0.0004440265507202464, + "loss": 0.803, + "step": 3596 + }, + { + "epoch": 1.649466804265566, + "grad_norm": 0.22320851683616638, + "learning_rate": 0.0004437804717372378, + "loss": 1.3083, + "step": 3597 + }, + { + "epoch": 1.6499254672629284, + "grad_norm": 0.3010443150997162, + "learning_rate": 0.0004435344065452847, + "loss": 1.4539, + "step": 3598 + }, + { + "epoch": 1.6503841302602913, + "grad_norm": 0.27338162064552307, + "learning_rate": 0.0004432883552047488, + "loss": 1.2612, + "step": 3599 + }, + { + "epoch": 1.650842793257654, + "grad_norm": 0.2307090312242508, + "learning_rate": 0.0004430423177759882, + "loss": 0.721, + "step": 3600 + }, + { + "epoch": 1.6513014562550166, + "grad_norm": 0.26010721921920776, + "learning_rate": 0.0004427962943193578, + "loss": 1.3625, + "step": 3601 + }, + { + "epoch": 1.6517601192523794, + "grad_norm": 0.302643746137619, + "learning_rate": 0.0004425502848952091, + "loss": 1.3484, + "step": 3602 + }, + { + "epoch": 1.652218782249742, + "grad_norm": 0.32459530234336853, + "learning_rate": 0.0004423042895638895, + "loss": 1.6387, + "step": 3603 + }, + { + "epoch": 1.6526774452471047, + "grad_norm": 0.1907692402601242, + "learning_rate": 0.0004420583083857437, + "loss": 0.7533, + "step": 3604 + }, + { + "epoch": 1.6531361082444673, + "grad_norm": 0.26839619874954224, + "learning_rate": 0.00044181234142111255, + "loss": 1.3954, + "step": 3605 + }, + { + "epoch": 1.65359477124183, + "grad_norm": 0.09783615916967392, + "learning_rate": 0.0004415663887303337, + "loss": 0.8584, + "step": 3606 + }, + { + "epoch": 1.6540534342391928, + "grad_norm": 0.42067283391952515, + "learning_rate": 0.00044132045037374094, + "loss": 1.8727, + "step": 3607 + }, + { + "epoch": 1.6545120972365555, + "grad_norm": 0.32053786516189575, + "learning_rate": 0.00044107452641166514, + "loss": 1.4919, + "step": 3608 + }, + { + "epoch": 1.654970760233918, + "grad_norm": 0.2961629331111908, + "learning_rate": 0.0004408286169044326, + "loss": 1.5599, + "step": 3609 + }, + { + "epoch": 1.655429423231281, + "grad_norm": 0.16102895140647888, + "learning_rate": 0.0004405827219123669, + "loss": 0.568, + "step": 3610 + }, + { + "epoch": 1.6558880862286434, + "grad_norm": 0.2084360271692276, + "learning_rate": 0.0004403368414957881, + "loss": 1.0012, + "step": 3611 + }, + { + "epoch": 1.6563467492260062, + "grad_norm": 0.222111776471138, + "learning_rate": 0.00044009097571501217, + "loss": 1.2141, + "step": 3612 + }, + { + "epoch": 1.6568054122233689, + "grad_norm": 0.3039361536502838, + "learning_rate": 0.0004398451246303521, + "loss": 1.1281, + "step": 3613 + }, + { + "epoch": 1.6572640752207315, + "grad_norm": 0.19220460951328278, + "learning_rate": 0.00043959928830211655, + "loss": 1.2178, + "step": 3614 + }, + { + "epoch": 1.6577227382180943, + "grad_norm": 0.23995845019817352, + "learning_rate": 0.0004393534667906109, + "loss": 0.944, + "step": 3615 + }, + { + "epoch": 1.658181401215457, + "grad_norm": 0.2802288830280304, + "learning_rate": 0.00043910766015613727, + "loss": 0.8565, + "step": 3616 + }, + { + "epoch": 1.6586400642128196, + "grad_norm": 0.17965088784694672, + "learning_rate": 0.00043886186845899366, + "loss": 0.9316, + "step": 3617 + }, + { + "epoch": 1.6590987272101825, + "grad_norm": 0.33390891551971436, + "learning_rate": 0.0004386160917594746, + "loss": 1.7971, + "step": 3618 + }, + { + "epoch": 1.659557390207545, + "grad_norm": 0.19103044271469116, + "learning_rate": 0.00043837033011787097, + "loss": 0.9313, + "step": 3619 + }, + { + "epoch": 1.6600160532049077, + "grad_norm": 0.23847994208335876, + "learning_rate": 0.00043812458359446943, + "loss": 1.623, + "step": 3620 + }, + { + "epoch": 1.6604747162022704, + "grad_norm": 0.26979711651802063, + "learning_rate": 0.0004378788522495538, + "loss": 1.0571, + "step": 3621 + }, + { + "epoch": 1.660933379199633, + "grad_norm": 0.2273758053779602, + "learning_rate": 0.0004376331361434036, + "loss": 1.2567, + "step": 3622 + }, + { + "epoch": 1.6613920421969959, + "grad_norm": 0.3926750123500824, + "learning_rate": 0.00043738743533629486, + "loss": 2.0264, + "step": 3623 + }, + { + "epoch": 1.6618507051943583, + "grad_norm": 0.3111841380596161, + "learning_rate": 0.00043714174988849965, + "loss": 1.1513, + "step": 3624 + }, + { + "epoch": 1.6623093681917211, + "grad_norm": 0.12631827592849731, + "learning_rate": 0.0004368960798602865, + "loss": 0.8732, + "step": 3625 + }, + { + "epoch": 1.6627680311890838, + "grad_norm": 0.18259353935718536, + "learning_rate": 0.0004366504253119199, + "loss": 0.5492, + "step": 3626 + }, + { + "epoch": 1.6632266941864464, + "grad_norm": 0.27222898602485657, + "learning_rate": 0.00043640478630366074, + "loss": 1.4242, + "step": 3627 + }, + { + "epoch": 1.6636853571838093, + "grad_norm": 0.31462812423706055, + "learning_rate": 0.0004361591628957661, + "loss": 1.7146, + "step": 3628 + }, + { + "epoch": 1.664144020181172, + "grad_norm": 0.28236204385757446, + "learning_rate": 0.00043591355514848904, + "loss": 1.4069, + "step": 3629 + }, + { + "epoch": 1.6646026831785345, + "grad_norm": 0.23949186503887177, + "learning_rate": 0.0004356679631220791, + "loss": 0.7886, + "step": 3630 + }, + { + "epoch": 1.6650613461758974, + "grad_norm": 0.06866566091775894, + "learning_rate": 0.00043542238687678157, + "loss": 0.4383, + "step": 3631 + }, + { + "epoch": 1.6655200091732598, + "grad_norm": 0.2557963728904724, + "learning_rate": 0.00043517682647283815, + "loss": 1.4226, + "step": 3632 + }, + { + "epoch": 1.6659786721706227, + "grad_norm": 0.19061607122421265, + "learning_rate": 0.00043493128197048657, + "loss": 1.2208, + "step": 3633 + }, + { + "epoch": 1.6664373351679853, + "grad_norm": 0.3818952143192291, + "learning_rate": 0.00043468575342996076, + "loss": 2.0298, + "step": 3634 + }, + { + "epoch": 1.666895998165348, + "grad_norm": 0.3634778559207916, + "learning_rate": 0.0004344402409114906, + "loss": 1.5644, + "step": 3635 + }, + { + "epoch": 1.6673546611627108, + "grad_norm": 0.17684818804264069, + "learning_rate": 0.00043419474447530204, + "loss": 1.0081, + "step": 3636 + }, + { + "epoch": 1.6678133241600734, + "grad_norm": 0.2824413478374481, + "learning_rate": 0.0004339492641816171, + "loss": 1.5297, + "step": 3637 + }, + { + "epoch": 1.668271987157436, + "grad_norm": 0.34963420033454895, + "learning_rate": 0.00043370380009065396, + "loss": 1.3151, + "step": 3638 + }, + { + "epoch": 1.6687306501547987, + "grad_norm": 0.1694229692220688, + "learning_rate": 0.00043345835226262663, + "loss": 1.2376, + "step": 3639 + }, + { + "epoch": 1.6691893131521613, + "grad_norm": 0.26671671867370605, + "learning_rate": 0.00043321292075774526, + "loss": 1.6826, + "step": 3640 + }, + { + "epoch": 1.6696479761495242, + "grad_norm": 0.3384977877140045, + "learning_rate": 0.00043296750563621614, + "loss": 1.525, + "step": 3641 + }, + { + "epoch": 1.6701066391468868, + "grad_norm": 0.2929702699184418, + "learning_rate": 0.0004327221069582411, + "loss": 2.0645, + "step": 3642 + }, + { + "epoch": 1.6705653021442495, + "grad_norm": 0.3001018762588501, + "learning_rate": 0.0004324767247840183, + "loss": 1.6188, + "step": 3643 + }, + { + "epoch": 1.6710239651416123, + "grad_norm": 0.2780381441116333, + "learning_rate": 0.0004322313591737418, + "loss": 1.1631, + "step": 3644 + }, + { + "epoch": 1.6714826281389747, + "grad_norm": 0.19090348482131958, + "learning_rate": 0.00043198601018760145, + "loss": 0.9705, + "step": 3645 + }, + { + "epoch": 1.6719412911363376, + "grad_norm": 0.17542727291584015, + "learning_rate": 0.0004317406778857833, + "loss": 1.0067, + "step": 3646 + }, + { + "epoch": 1.6723999541337002, + "grad_norm": 0.3193509876728058, + "learning_rate": 0.00043149536232846915, + "loss": 2.0522, + "step": 3647 + }, + { + "epoch": 1.6728586171310629, + "grad_norm": 0.262650728225708, + "learning_rate": 0.00043125006357583643, + "loss": 1.2916, + "step": 3648 + }, + { + "epoch": 1.6733172801284257, + "grad_norm": 0.2200448364019394, + "learning_rate": 0.0004310047816880588, + "loss": 1.0105, + "step": 3649 + }, + { + "epoch": 1.6737759431257884, + "grad_norm": 0.31807059049606323, + "learning_rate": 0.00043075951672530573, + "loss": 1.375, + "step": 3650 + }, + { + "epoch": 1.674234606123151, + "grad_norm": 0.23543456196784973, + "learning_rate": 0.0004305142687477425, + "loss": 1.4635, + "step": 3651 + }, + { + "epoch": 1.6746932691205139, + "grad_norm": 0.7807444930076599, + "learning_rate": 0.00043026903781553016, + "loss": 0.9573, + "step": 3652 + }, + { + "epoch": 1.6751519321178763, + "grad_norm": 0.18116839230060577, + "learning_rate": 0.0004300238239888256, + "loss": 1.1865, + "step": 3653 + }, + { + "epoch": 1.6756105951152391, + "grad_norm": 0.32931381464004517, + "learning_rate": 0.00042977862732778154, + "loss": 1.6169, + "step": 3654 + }, + { + "epoch": 1.6760692581126018, + "grad_norm": 0.32929980754852295, + "learning_rate": 0.0004295334478925466, + "loss": 1.5129, + "step": 3655 + }, + { + "epoch": 1.6765279211099644, + "grad_norm": 0.26823946833610535, + "learning_rate": 0.0004292882857432649, + "loss": 1.6238, + "step": 3656 + }, + { + "epoch": 1.6769865841073273, + "grad_norm": 0.3574983775615692, + "learning_rate": 0.00042904314094007655, + "loss": 2.0161, + "step": 3657 + }, + { + "epoch": 1.6774452471046897, + "grad_norm": 0.45830219984054565, + "learning_rate": 0.0004287980135431175, + "loss": 1.4208, + "step": 3658 + }, + { + "epoch": 1.6779039101020525, + "grad_norm": 0.3208571672439575, + "learning_rate": 0.000428552903612519, + "loss": 1.8867, + "step": 3659 + }, + { + "epoch": 1.6783625730994152, + "grad_norm": 0.11624974012374878, + "learning_rate": 0.00042830781120840845, + "loss": 1.0681, + "step": 3660 + }, + { + "epoch": 1.6788212360967778, + "grad_norm": 0.2735399901866913, + "learning_rate": 0.0004280627363909087, + "loss": 0.9048, + "step": 3661 + }, + { + "epoch": 1.6792798990941407, + "grad_norm": 0.31739717721939087, + "learning_rate": 0.0004278176792201383, + "loss": 1.7313, + "step": 3662 + }, + { + "epoch": 1.6797385620915033, + "grad_norm": 0.3343389332294464, + "learning_rate": 0.00042757263975621174, + "loss": 1.8636, + "step": 3663 + }, + { + "epoch": 1.680197225088866, + "grad_norm": 0.35586023330688477, + "learning_rate": 0.0004273276180592391, + "loss": 1.7568, + "step": 3664 + }, + { + "epoch": 1.6806558880862288, + "grad_norm": 0.31270912289619446, + "learning_rate": 0.0004270826141893256, + "loss": 1.6099, + "step": 3665 + }, + { + "epoch": 1.6811145510835912, + "grad_norm": 0.4064270555973053, + "learning_rate": 0.0004268376282065725, + "loss": 1.5682, + "step": 3666 + }, + { + "epoch": 1.681573214080954, + "grad_norm": 0.24555212259292603, + "learning_rate": 0.00042659266017107666, + "loss": 0.9243, + "step": 3667 + }, + { + "epoch": 1.6820318770783167, + "grad_norm": 0.37791720032691956, + "learning_rate": 0.0004263477101429307, + "loss": 1.3367, + "step": 3668 + }, + { + "epoch": 1.6824905400756793, + "grad_norm": 0.25306499004364014, + "learning_rate": 0.0004261027781822227, + "loss": 1.5412, + "step": 3669 + }, + { + "epoch": 1.6829492030730422, + "grad_norm": 0.23409046232700348, + "learning_rate": 0.00042585786434903584, + "loss": 0.8784, + "step": 3670 + }, + { + "epoch": 1.6834078660704048, + "grad_norm": 0.23634447157382965, + "learning_rate": 0.00042561296870344945, + "loss": 1.4553, + "step": 3671 + }, + { + "epoch": 1.6838665290677675, + "grad_norm": 0.29898911714553833, + "learning_rate": 0.0004253680913055381, + "loss": 1.2293, + "step": 3672 + }, + { + "epoch": 1.68432519206513, + "grad_norm": 0.20857423543930054, + "learning_rate": 0.00042512323221537206, + "loss": 1.5924, + "step": 3673 + }, + { + "epoch": 1.6847838550624927, + "grad_norm": 0.4438040852546692, + "learning_rate": 0.0004248783914930172, + "loss": 1.8789, + "step": 3674 + }, + { + "epoch": 1.6852425180598556, + "grad_norm": 0.33179306983947754, + "learning_rate": 0.0004246335691985347, + "loss": 1.343, + "step": 3675 + }, + { + "epoch": 1.6857011810572182, + "grad_norm": 0.3002280294895172, + "learning_rate": 0.0004243887653919809, + "loss": 1.6851, + "step": 3676 + }, + { + "epoch": 1.6861598440545809, + "grad_norm": 0.2567557692527771, + "learning_rate": 0.00042414398013340806, + "loss": 1.5004, + "step": 3677 + }, + { + "epoch": 1.6866185070519437, + "grad_norm": 0.28894615173339844, + "learning_rate": 0.00042389921348286386, + "loss": 1.1944, + "step": 3678 + }, + { + "epoch": 1.6870771700493061, + "grad_norm": 0.30856838822364807, + "learning_rate": 0.00042365446550039136, + "loss": 1.746, + "step": 3679 + }, + { + "epoch": 1.687535833046669, + "grad_norm": 0.24894581735134125, + "learning_rate": 0.0004234097362460292, + "loss": 0.8261, + "step": 3680 + }, + { + "epoch": 1.6879944960440316, + "grad_norm": 0.16843147575855255, + "learning_rate": 0.00042316502577981077, + "loss": 1.1577, + "step": 3681 + }, + { + "epoch": 1.6884531590413943, + "grad_norm": 0.37144291400909424, + "learning_rate": 0.00042292033416176534, + "loss": 1.601, + "step": 3682 + }, + { + "epoch": 1.6889118220387571, + "grad_norm": 0.28090983629226685, + "learning_rate": 0.0004226756614519179, + "loss": 1.4488, + "step": 3683 + }, + { + "epoch": 1.6893704850361198, + "grad_norm": 0.2437400072813034, + "learning_rate": 0.0004224310077102882, + "loss": 1.2493, + "step": 3684 + }, + { + "epoch": 1.6898291480334824, + "grad_norm": 0.25606828927993774, + "learning_rate": 0.0004221863729968917, + "loss": 1.2882, + "step": 3685 + }, + { + "epoch": 1.6902878110308452, + "grad_norm": 0.1753174066543579, + "learning_rate": 0.00042194175737173906, + "loss": 0.7293, + "step": 3686 + }, + { + "epoch": 1.6907464740282077, + "grad_norm": 0.2955300211906433, + "learning_rate": 0.0004216971608948359, + "loss": 1.5758, + "step": 3687 + }, + { + "epoch": 1.6912051370255705, + "grad_norm": 0.3101601004600525, + "learning_rate": 0.00042145258362618384, + "loss": 1.5141, + "step": 3688 + }, + { + "epoch": 1.6916638000229332, + "grad_norm": 0.3169883191585541, + "learning_rate": 0.00042120802562577945, + "loss": 1.5436, + "step": 3689 + }, + { + "epoch": 1.6921224630202958, + "grad_norm": 0.3298269510269165, + "learning_rate": 0.00042096348695361437, + "loss": 1.8237, + "step": 3690 + }, + { + "epoch": 1.6925811260176586, + "grad_norm": 0.3161281645298004, + "learning_rate": 0.0004207189676696759, + "loss": 1.5731, + "step": 3691 + }, + { + "epoch": 1.693039789015021, + "grad_norm": 0.3234587609767914, + "learning_rate": 0.0004204744678339464, + "loss": 1.3514, + "step": 3692 + }, + { + "epoch": 1.693498452012384, + "grad_norm": 0.19994625449180603, + "learning_rate": 0.0004202299875064033, + "loss": 0.9323, + "step": 3693 + }, + { + "epoch": 1.6939571150097466, + "grad_norm": 0.2861116826534271, + "learning_rate": 0.0004199855267470193, + "loss": 1.4347, + "step": 3694 + }, + { + "epoch": 1.6944157780071092, + "grad_norm": 0.32218021154403687, + "learning_rate": 0.00041974108561576264, + "loss": 1.743, + "step": 3695 + }, + { + "epoch": 1.694874441004472, + "grad_norm": 0.33372506499290466, + "learning_rate": 0.00041949666417259637, + "loss": 1.1621, + "step": 3696 + }, + { + "epoch": 1.6953331040018347, + "grad_norm": 0.37846240401268005, + "learning_rate": 0.0004192522624774789, + "loss": 1.1866, + "step": 3697 + }, + { + "epoch": 1.6957917669991973, + "grad_norm": 0.2000287026166916, + "learning_rate": 0.00041900788059036354, + "loss": 1.3976, + "step": 3698 + }, + { + "epoch": 1.6962504299965602, + "grad_norm": 0.37890589237213135, + "learning_rate": 0.0004187635185711991, + "loss": 1.7291, + "step": 3699 + }, + { + "epoch": 1.6967090929939226, + "grad_norm": 0.30530160665512085, + "learning_rate": 0.0004185191764799293, + "loss": 1.0033, + "step": 3700 + }, + { + "epoch": 1.6971677559912854, + "grad_norm": 0.24736268818378448, + "learning_rate": 0.00041827485437649306, + "loss": 1.6701, + "step": 3701 + }, + { + "epoch": 1.697626418988648, + "grad_norm": 0.2429756373167038, + "learning_rate": 0.00041803055232082423, + "loss": 0.8325, + "step": 3702 + }, + { + "epoch": 1.6980850819860107, + "grad_norm": 0.297656774520874, + "learning_rate": 0.00041778627037285205, + "loss": 1.4572, + "step": 3703 + }, + { + "epoch": 1.6985437449833736, + "grad_norm": 0.26800161600112915, + "learning_rate": 0.0004175420085925005, + "loss": 1.7028, + "step": 3704 + }, + { + "epoch": 1.6990024079807362, + "grad_norm": 0.27825766801834106, + "learning_rate": 0.0004172977670396887, + "loss": 1.6484, + "step": 3705 + }, + { + "epoch": 1.6994610709780988, + "grad_norm": 0.2302626520395279, + "learning_rate": 0.0004170535457743311, + "loss": 0.9787, + "step": 3706 + }, + { + "epoch": 1.6999197339754615, + "grad_norm": 0.22059178352355957, + "learning_rate": 0.0004168093448563367, + "loss": 0.9163, + "step": 3707 + }, + { + "epoch": 1.7003783969728241, + "grad_norm": 0.28476452827453613, + "learning_rate": 0.0004165651643456099, + "loss": 1.9269, + "step": 3708 + }, + { + "epoch": 1.700837059970187, + "grad_norm": 0.3191598057746887, + "learning_rate": 0.0004163210043020499, + "loss": 1.7429, + "step": 3709 + }, + { + "epoch": 1.7012957229675496, + "grad_norm": 0.5909731984138489, + "learning_rate": 0.00041607686478555076, + "loss": 1.2906, + "step": 3710 + }, + { + "epoch": 1.7017543859649122, + "grad_norm": 0.2239387482404709, + "learning_rate": 0.00041583274585600195, + "loss": 0.8284, + "step": 3711 + }, + { + "epoch": 1.702213048962275, + "grad_norm": 0.3220686912536621, + "learning_rate": 0.0004155886475732874, + "loss": 1.9756, + "step": 3712 + }, + { + "epoch": 1.7026717119596375, + "grad_norm": 0.23089122772216797, + "learning_rate": 0.0004153445699972862, + "loss": 1.3217, + "step": 3713 + }, + { + "epoch": 1.7031303749570004, + "grad_norm": 0.2528243064880371, + "learning_rate": 0.0004151005131878725, + "loss": 0.8455, + "step": 3714 + }, + { + "epoch": 1.703589037954363, + "grad_norm": 0.10433992743492126, + "learning_rate": 0.00041485647720491503, + "loss": 1.0346, + "step": 3715 + }, + { + "epoch": 1.7040477009517256, + "grad_norm": 0.26862263679504395, + "learning_rate": 0.0004146124621082775, + "loss": 1.2124, + "step": 3716 + }, + { + "epoch": 1.7045063639490885, + "grad_norm": 0.3269628882408142, + "learning_rate": 0.0004143684679578188, + "loss": 1.3434, + "step": 3717 + }, + { + "epoch": 1.7049650269464511, + "grad_norm": 0.2566255033016205, + "learning_rate": 0.00041412449481339233, + "loss": 1.7516, + "step": 3718 + }, + { + "epoch": 1.7054236899438138, + "grad_norm": 0.2328820526599884, + "learning_rate": 0.0004138805427348464, + "loss": 1.1428, + "step": 3719 + }, + { + "epoch": 1.7058823529411766, + "grad_norm": 0.41982701420783997, + "learning_rate": 0.0004136366117820245, + "loss": 1.6322, + "step": 3720 + }, + { + "epoch": 1.706341015938539, + "grad_norm": 0.2226077914237976, + "learning_rate": 0.00041339270201476425, + "loss": 1.2604, + "step": 3721 + }, + { + "epoch": 1.706799678935902, + "grad_norm": 0.2776189148426056, + "learning_rate": 0.0004131488134928987, + "loss": 1.2419, + "step": 3722 + }, + { + "epoch": 1.7072583419332645, + "grad_norm": 0.39088577032089233, + "learning_rate": 0.0004129049462762554, + "loss": 1.9712, + "step": 3723 + }, + { + "epoch": 1.7077170049306272, + "grad_norm": 0.28871458768844604, + "learning_rate": 0.0004126611004246568, + "loss": 1.5851, + "step": 3724 + }, + { + "epoch": 1.70817566792799, + "grad_norm": 0.2573470175266266, + "learning_rate": 0.00041241727599792015, + "loss": 1.0604, + "step": 3725 + }, + { + "epoch": 1.7086343309253524, + "grad_norm": 0.257318913936615, + "learning_rate": 0.00041217347305585707, + "loss": 1.5261, + "step": 3726 + }, + { + "epoch": 1.7090929939227153, + "grad_norm": 0.32521334290504456, + "learning_rate": 0.00041192969165827433, + "loss": 1.9292, + "step": 3727 + }, + { + "epoch": 1.709551656920078, + "grad_norm": 0.2845672369003296, + "learning_rate": 0.00041168593186497317, + "loss": 1.2908, + "step": 3728 + }, + { + "epoch": 1.7100103199174406, + "grad_norm": 0.3328210115432739, + "learning_rate": 0.00041144219373574976, + "loss": 2.0022, + "step": 3729 + }, + { + "epoch": 1.7104689829148034, + "grad_norm": 0.3225518763065338, + "learning_rate": 0.0004111984773303946, + "loss": 1.4616, + "step": 3730 + }, + { + "epoch": 1.710927645912166, + "grad_norm": 0.14123696088790894, + "learning_rate": 0.0004109547827086937, + "loss": 0.4822, + "step": 3731 + }, + { + "epoch": 1.7113863089095287, + "grad_norm": 0.08275225758552551, + "learning_rate": 0.00041071110993042627, + "loss": 0.8438, + "step": 3732 + }, + { + "epoch": 1.7118449719068916, + "grad_norm": 0.2467496544122696, + "learning_rate": 0.0004104674590553675, + "loss": 1.2802, + "step": 3733 + }, + { + "epoch": 1.712303634904254, + "grad_norm": 0.25316253304481506, + "learning_rate": 0.0004102238301432865, + "loss": 0.9283, + "step": 3734 + }, + { + "epoch": 1.7127622979016168, + "grad_norm": 0.23193414509296417, + "learning_rate": 0.00040998022325394723, + "loss": 1.4131, + "step": 3735 + }, + { + "epoch": 1.7132209608989795, + "grad_norm": 0.22135387361049652, + "learning_rate": 0.0004097366384471086, + "loss": 0.8179, + "step": 3736 + }, + { + "epoch": 1.713679623896342, + "grad_norm": 0.26501017808914185, + "learning_rate": 0.00040949307578252314, + "loss": 1.5214, + "step": 3737 + }, + { + "epoch": 1.714138286893705, + "grad_norm": 0.2112581878900528, + "learning_rate": 0.0004092495353199388, + "loss": 1.0725, + "step": 3738 + }, + { + "epoch": 1.7145969498910676, + "grad_norm": 0.21010689437389374, + "learning_rate": 0.0004090060171190977, + "loss": 0.6734, + "step": 3739 + }, + { + "epoch": 1.7150556128884302, + "grad_norm": 0.3008427321910858, + "learning_rate": 0.00040876252123973677, + "loss": 2.1113, + "step": 3740 + }, + { + "epoch": 1.7155142758857929, + "grad_norm": 0.33156007528305054, + "learning_rate": 0.00040851904774158725, + "loss": 1.8834, + "step": 3741 + }, + { + "epoch": 1.7159729388831555, + "grad_norm": 0.2164911925792694, + "learning_rate": 0.0004082755966843752, + "loss": 0.832, + "step": 3742 + }, + { + "epoch": 1.7164316018805184, + "grad_norm": 0.2621867060661316, + "learning_rate": 0.0004080321681278204, + "loss": 0.9056, + "step": 3743 + }, + { + "epoch": 1.716890264877881, + "grad_norm": 0.2789194881916046, + "learning_rate": 0.0004077887621316377, + "loss": 1.7653, + "step": 3744 + }, + { + "epoch": 1.7173489278752436, + "grad_norm": 0.27011173963546753, + "learning_rate": 0.0004075453787555367, + "loss": 1.3661, + "step": 3745 + }, + { + "epoch": 1.7178075908726065, + "grad_norm": 0.29901739954948425, + "learning_rate": 0.00040730201805922096, + "loss": 1.5548, + "step": 3746 + }, + { + "epoch": 1.718266253869969, + "grad_norm": 0.3142026662826538, + "learning_rate": 0.0004070586801023885, + "loss": 1.79, + "step": 3747 + }, + { + "epoch": 1.7187249168673318, + "grad_norm": 0.26456326246261597, + "learning_rate": 0.00040681536494473224, + "loss": 1.305, + "step": 3748 + }, + { + "epoch": 1.7191835798646944, + "grad_norm": 0.35430580377578735, + "learning_rate": 0.0004065720726459385, + "loss": 1.6096, + "step": 3749 + }, + { + "epoch": 1.719642242862057, + "grad_norm": 0.31563955545425415, + "learning_rate": 0.0004063288032656891, + "loss": 1.6702, + "step": 3750 + }, + { + "epoch": 1.7201009058594199, + "grad_norm": 0.31197500228881836, + "learning_rate": 0.00040608555686365966, + "loss": 0.9615, + "step": 3751 + }, + { + "epoch": 1.7205595688567825, + "grad_norm": 0.26841220259666443, + "learning_rate": 0.00040584233349952027, + "loss": 1.7595, + "step": 3752 + }, + { + "epoch": 1.7210182318541452, + "grad_norm": 0.3827781677246094, + "learning_rate": 0.0004055991332329356, + "loss": 1.309, + "step": 3753 + }, + { + "epoch": 1.721476894851508, + "grad_norm": 0.30011874437332153, + "learning_rate": 0.00040535595612356393, + "loss": 1.0917, + "step": 3754 + }, + { + "epoch": 1.7219355578488704, + "grad_norm": 0.23119381070137024, + "learning_rate": 0.00040511280223105866, + "loss": 0.9667, + "step": 3755 + }, + { + "epoch": 1.7223942208462333, + "grad_norm": 0.3517080545425415, + "learning_rate": 0.00040486967161506725, + "loss": 1.3582, + "step": 3756 + }, + { + "epoch": 1.722852883843596, + "grad_norm": 0.12287653982639313, + "learning_rate": 0.0004046265643352313, + "loss": 0.7969, + "step": 3757 + }, + { + "epoch": 1.7233115468409586, + "grad_norm": 0.22328488528728485, + "learning_rate": 0.0004043834804511868, + "loss": 1.2675, + "step": 3758 + }, + { + "epoch": 1.7237702098383214, + "grad_norm": 0.3529665768146515, + "learning_rate": 0.0004041404200225641, + "loss": 1.1346, + "step": 3759 + }, + { + "epoch": 1.7242288728356838, + "grad_norm": 0.283286452293396, + "learning_rate": 0.00040389738310898736, + "loss": 1.2228, + "step": 3760 + }, + { + "epoch": 1.7246875358330467, + "grad_norm": 0.181251659989357, + "learning_rate": 0.0004036543697700756, + "loss": 1.1888, + "step": 3761 + }, + { + "epoch": 1.7251461988304093, + "grad_norm": 0.2636159062385559, + "learning_rate": 0.0004034113800654415, + "loss": 0.6633, + "step": 3762 + }, + { + "epoch": 1.725604861827772, + "grad_norm": 0.2084595412015915, + "learning_rate": 0.0004031684140546924, + "loss": 1.3781, + "step": 3763 + }, + { + "epoch": 1.7260635248251348, + "grad_norm": 0.3345155715942383, + "learning_rate": 0.0004029254717974297, + "loss": 1.3012, + "step": 3764 + }, + { + "epoch": 1.7265221878224974, + "grad_norm": 0.3305032551288605, + "learning_rate": 0.00040268255335324844, + "loss": 1.6357, + "step": 3765 + }, + { + "epoch": 1.72698085081986, + "grad_norm": 0.33602216839790344, + "learning_rate": 0.0004024396587817386, + "loss": 1.7511, + "step": 3766 + }, + { + "epoch": 1.727439513817223, + "grad_norm": 0.3443238437175751, + "learning_rate": 0.00040219678814248396, + "loss": 1.7075, + "step": 3767 + }, + { + "epoch": 1.7278981768145854, + "grad_norm": 0.3635205924510956, + "learning_rate": 0.00040195394149506234, + "loss": 1.413, + "step": 3768 + }, + { + "epoch": 1.7283568398119482, + "grad_norm": 0.2028258591890335, + "learning_rate": 0.00040171111889904584, + "loss": 0.8682, + "step": 3769 + }, + { + "epoch": 1.7288155028093108, + "grad_norm": 0.26344314217567444, + "learning_rate": 0.0004014683204140006, + "loss": 1.6746, + "step": 3770 + }, + { + "epoch": 1.7292741658066735, + "grad_norm": 0.36593350768089294, + "learning_rate": 0.0004012255460994868, + "loss": 1.3928, + "step": 3771 + }, + { + "epoch": 1.7297328288040363, + "grad_norm": 0.2758760154247284, + "learning_rate": 0.0004009827960150587, + "loss": 0.9172, + "step": 3772 + }, + { + "epoch": 1.730191491801399, + "grad_norm": 0.348369836807251, + "learning_rate": 0.00040074007022026473, + "loss": 2.3096, + "step": 3773 + }, + { + "epoch": 1.7306501547987616, + "grad_norm": 0.3024252653121948, + "learning_rate": 0.0004004973687746472, + "loss": 1.2795, + "step": 3774 + }, + { + "epoch": 1.7311088177961242, + "grad_norm": 0.3365992307662964, + "learning_rate": 0.00040025469173774256, + "loss": 1.1777, + "step": 3775 + }, + { + "epoch": 1.7315674807934869, + "grad_norm": 0.2750336229801178, + "learning_rate": 0.0004000120391690814, + "loss": 1.3168, + "step": 3776 + }, + { + "epoch": 1.7320261437908497, + "grad_norm": 0.32935917377471924, + "learning_rate": 0.00039976941112818777, + "loss": 1.9238, + "step": 3777 + }, + { + "epoch": 1.7324848067882124, + "grad_norm": 0.3010925054550171, + "learning_rate": 0.00039952680767458036, + "loss": 1.0422, + "step": 3778 + }, + { + "epoch": 1.732943469785575, + "grad_norm": 0.29777830839157104, + "learning_rate": 0.0003992842288677715, + "loss": 1.7227, + "step": 3779 + }, + { + "epoch": 1.7334021327829379, + "grad_norm": 0.3348708748817444, + "learning_rate": 0.00039904167476726744, + "loss": 1.6201, + "step": 3780 + }, + { + "epoch": 1.7338607957803003, + "grad_norm": 0.2975271940231323, + "learning_rate": 0.00039879914543256863, + "loss": 1.2951, + "step": 3781 + }, + { + "epoch": 1.7343194587776631, + "grad_norm": 0.2370220124721527, + "learning_rate": 0.000398556640923169, + "loss": 0.8168, + "step": 3782 + }, + { + "epoch": 1.7347781217750258, + "grad_norm": 0.3307121992111206, + "learning_rate": 0.0003983141612985569, + "loss": 1.6048, + "step": 3783 + }, + { + "epoch": 1.7352367847723884, + "grad_norm": 0.357295960187912, + "learning_rate": 0.00039807170661821414, + "loss": 1.0067, + "step": 3784 + }, + { + "epoch": 1.7356954477697513, + "grad_norm": 0.28571048378944397, + "learning_rate": 0.0003978292769416167, + "loss": 0.9839, + "step": 3785 + }, + { + "epoch": 1.736154110767114, + "grad_norm": 0.35537266731262207, + "learning_rate": 0.00039758687232823434, + "loss": 1.6417, + "step": 3786 + }, + { + "epoch": 1.7366127737644765, + "grad_norm": 0.28138241171836853, + "learning_rate": 0.0003973444928375307, + "loss": 1.1609, + "step": 3787 + }, + { + "epoch": 1.7370714367618394, + "grad_norm": 0.24497286975383759, + "learning_rate": 0.0003971021385289631, + "loss": 1.7524, + "step": 3788 + }, + { + "epoch": 1.7375300997592018, + "grad_norm": 0.289495587348938, + "learning_rate": 0.0003968598094619828, + "loss": 1.1777, + "step": 3789 + }, + { + "epoch": 1.7379887627565647, + "grad_norm": 0.2697417736053467, + "learning_rate": 0.00039661750569603495, + "loss": 1.42, + "step": 3790 + }, + { + "epoch": 1.7384474257539273, + "grad_norm": 0.3958907127380371, + "learning_rate": 0.00039637522729055836, + "loss": 1.7687, + "step": 3791 + }, + { + "epoch": 1.73890608875129, + "grad_norm": 0.23908482491970062, + "learning_rate": 0.00039613297430498586, + "loss": 0.9498, + "step": 3792 + }, + { + "epoch": 1.7393647517486528, + "grad_norm": 0.262422651052475, + "learning_rate": 0.0003958907467987435, + "loss": 1.0038, + "step": 3793 + }, + { + "epoch": 1.7398234147460152, + "grad_norm": 0.3177359998226166, + "learning_rate": 0.00039564854483125164, + "loss": 1.9203, + "step": 3794 + }, + { + "epoch": 1.740282077743378, + "grad_norm": 0.3895881772041321, + "learning_rate": 0.0003954063684619241, + "loss": 1.788, + "step": 3795 + }, + { + "epoch": 1.7407407407407407, + "grad_norm": 0.2772713005542755, + "learning_rate": 0.00039516421775016863, + "loss": 1.3488, + "step": 3796 + }, + { + "epoch": 1.7411994037381033, + "grad_norm": 0.2620038390159607, + "learning_rate": 0.00039492209275538624, + "loss": 1.4497, + "step": 3797 + }, + { + "epoch": 1.7416580667354662, + "grad_norm": 0.21062573790550232, + "learning_rate": 0.0003946799935369726, + "loss": 0.8097, + "step": 3798 + }, + { + "epoch": 1.7421167297328288, + "grad_norm": 0.313883900642395, + "learning_rate": 0.0003944379201543156, + "loss": 1.6286, + "step": 3799 + }, + { + "epoch": 1.7425753927301915, + "grad_norm": 0.28428205847740173, + "learning_rate": 0.000394195872666798, + "loss": 1.1285, + "step": 3800 + }, + { + "epoch": 1.7430340557275543, + "grad_norm": 0.33933934569358826, + "learning_rate": 0.00039395385113379566, + "loss": 1.5898, + "step": 3801 + }, + { + "epoch": 1.7434927187249167, + "grad_norm": 0.3156382143497467, + "learning_rate": 0.00039371185561467827, + "loss": 0.9087, + "step": 3802 + }, + { + "epoch": 1.7439513817222796, + "grad_norm": 0.38781848549842834, + "learning_rate": 0.0003934698861688093, + "loss": 1.8621, + "step": 3803 + }, + { + "epoch": 1.7444100447196422, + "grad_norm": 0.2756793200969696, + "learning_rate": 0.0003932279428555452, + "loss": 1.5569, + "step": 3804 + }, + { + "epoch": 1.7448687077170049, + "grad_norm": 0.2981017231941223, + "learning_rate": 0.0003929860257342366, + "loss": 1.1586, + "step": 3805 + }, + { + "epoch": 1.7453273707143677, + "grad_norm": 0.32057425379753113, + "learning_rate": 0.0003927441348642274, + "loss": 1.6129, + "step": 3806 + }, + { + "epoch": 1.7457860337117304, + "grad_norm": 0.359298437833786, + "learning_rate": 0.0003925022703048553, + "loss": 1.2742, + "step": 3807 + }, + { + "epoch": 1.746244696709093, + "grad_norm": 0.21599337458610535, + "learning_rate": 0.0003922604321154514, + "loss": 1.2625, + "step": 3808 + }, + { + "epoch": 1.7467033597064556, + "grad_norm": 0.25992512702941895, + "learning_rate": 0.00039201862035534066, + "loss": 1.3849, + "step": 3809 + }, + { + "epoch": 1.7471620227038183, + "grad_norm": 0.342759907245636, + "learning_rate": 0.0003917768350838406, + "loss": 1.7155, + "step": 3810 + }, + { + "epoch": 1.7476206857011811, + "grad_norm": 0.22882609069347382, + "learning_rate": 0.0003915350763602632, + "loss": 1.1857, + "step": 3811 + }, + { + "epoch": 1.7480793486985438, + "grad_norm": 0.2731204032897949, + "learning_rate": 0.0003912933442439137, + "loss": 1.0107, + "step": 3812 + }, + { + "epoch": 1.7485380116959064, + "grad_norm": 0.21852166950702667, + "learning_rate": 0.00039105163879409066, + "loss": 1.2848, + "step": 3813 + }, + { + "epoch": 1.7489966746932692, + "grad_norm": 0.6290088295936584, + "learning_rate": 0.00039080996007008625, + "loss": 1.4191, + "step": 3814 + }, + { + "epoch": 1.7494553376906317, + "grad_norm": 0.2989594042301178, + "learning_rate": 0.0003905683081311861, + "loss": 1.5646, + "step": 3815 + }, + { + "epoch": 1.7499140006879945, + "grad_norm": 0.256778746843338, + "learning_rate": 0.00039032668303666876, + "loss": 1.4916, + "step": 3816 + }, + { + "epoch": 1.7503726636853572, + "grad_norm": 0.19242458045482635, + "learning_rate": 0.00039008508484580684, + "loss": 0.338, + "step": 3817 + }, + { + "epoch": 1.7508313266827198, + "grad_norm": 0.24674712121486664, + "learning_rate": 0.0003898435136178662, + "loss": 1.1446, + "step": 3818 + }, + { + "epoch": 1.7512899896800826, + "grad_norm": 0.2414608746767044, + "learning_rate": 0.000389601969412106, + "loss": 1.5443, + "step": 3819 + }, + { + "epoch": 1.7517486526774453, + "grad_norm": 0.8330361247062683, + "learning_rate": 0.00038936045228777884, + "loss": 1.5558, + "step": 3820 + }, + { + "epoch": 1.752207315674808, + "grad_norm": 0.2938498258590698, + "learning_rate": 0.0003891189623041302, + "loss": 1.0142, + "step": 3821 + }, + { + "epoch": 1.7526659786721708, + "grad_norm": 0.3147178888320923, + "learning_rate": 0.0003888774995203997, + "loss": 1.2997, + "step": 3822 + }, + { + "epoch": 1.7531246416695332, + "grad_norm": 0.26122626662254333, + "learning_rate": 0.0003886360639958198, + "loss": 1.3746, + "step": 3823 + }, + { + "epoch": 1.753583304666896, + "grad_norm": 0.08440568298101425, + "learning_rate": 0.00038839465578961637, + "loss": 0.9028, + "step": 3824 + }, + { + "epoch": 1.7540419676642587, + "grad_norm": 0.27365854382514954, + "learning_rate": 0.00038815327496100863, + "loss": 1.2714, + "step": 3825 + }, + { + "epoch": 1.7545006306616213, + "grad_norm": 0.2890813648700714, + "learning_rate": 0.0003879119215692091, + "loss": 1.3066, + "step": 3826 + }, + { + "epoch": 1.7549592936589842, + "grad_norm": 0.315480500459671, + "learning_rate": 0.00038767059567342325, + "loss": 1.3728, + "step": 3827 + }, + { + "epoch": 1.7554179566563466, + "grad_norm": 0.22030934691429138, + "learning_rate": 0.0003874292973328502, + "loss": 1.0422, + "step": 3828 + }, + { + "epoch": 1.7558766196537094, + "grad_norm": 0.21673201024532318, + "learning_rate": 0.0003871880266066823, + "loss": 1.2688, + "step": 3829 + }, + { + "epoch": 1.756335282651072, + "grad_norm": 0.31848639249801636, + "learning_rate": 0.0003869467835541048, + "loss": 1.376, + "step": 3830 + }, + { + "epoch": 1.7567939456484347, + "grad_norm": 0.08145318180322647, + "learning_rate": 0.0003867055682342966, + "loss": 0.4348, + "step": 3831 + }, + { + "epoch": 1.7572526086457976, + "grad_norm": 0.1764577478170395, + "learning_rate": 0.00038646438070642926, + "loss": 0.7916, + "step": 3832 + }, + { + "epoch": 1.7577112716431602, + "grad_norm": 0.32474926114082336, + "learning_rate": 0.00038622322102966803, + "loss": 1.7656, + "step": 3833 + }, + { + "epoch": 1.7581699346405228, + "grad_norm": 0.2409229725599289, + "learning_rate": 0.00038598208926317096, + "loss": 1.5365, + "step": 3834 + }, + { + "epoch": 1.7586285976378857, + "grad_norm": 0.2720002830028534, + "learning_rate": 0.00038574098546608957, + "loss": 0.9022, + "step": 3835 + }, + { + "epoch": 1.7590872606352481, + "grad_norm": 1.557668924331665, + "learning_rate": 0.0003854999096975683, + "loss": 1.754, + "step": 3836 + }, + { + "epoch": 1.759545923632611, + "grad_norm": 0.3512636721134186, + "learning_rate": 0.00038525886201674485, + "loss": 1.4061, + "step": 3837 + }, + { + "epoch": 1.7600045866299736, + "grad_norm": 0.29787155985832214, + "learning_rate": 0.0003850178424827497, + "loss": 1.8271, + "step": 3838 + }, + { + "epoch": 1.7604632496273362, + "grad_norm": 0.3310008645057678, + "learning_rate": 0.000384776851154707, + "loss": 1.6361, + "step": 3839 + }, + { + "epoch": 1.760921912624699, + "grad_norm": 0.42053020000457764, + "learning_rate": 0.00038453588809173343, + "loss": 1.8173, + "step": 3840 + }, + { + "epoch": 1.7613805756220617, + "grad_norm": 0.41074061393737793, + "learning_rate": 0.00038429495335293905, + "loss": 1.7546, + "step": 3841 + }, + { + "epoch": 1.7618392386194244, + "grad_norm": 0.27962028980255127, + "learning_rate": 0.00038405404699742694, + "loss": 1.4692, + "step": 3842 + }, + { + "epoch": 1.762297901616787, + "grad_norm": 0.3928588628768921, + "learning_rate": 0.0003838131690842932, + "loss": 1.9565, + "step": 3843 + }, + { + "epoch": 1.7627565646141496, + "grad_norm": 0.4200422465801239, + "learning_rate": 0.0003835723196726267, + "loss": 1.8337, + "step": 3844 + }, + { + "epoch": 1.7632152276115125, + "grad_norm": 0.3624529540538788, + "learning_rate": 0.0003833314988215097, + "loss": 1.7949, + "step": 3845 + }, + { + "epoch": 1.7636738906088751, + "grad_norm": 0.4131540358066559, + "learning_rate": 0.00038309070659001723, + "loss": 1.7876, + "step": 3846 + }, + { + "epoch": 1.7641325536062378, + "grad_norm": 0.26267632842063904, + "learning_rate": 0.00038284994303721743, + "loss": 1.3583, + "step": 3847 + }, + { + "epoch": 1.7645912166036006, + "grad_norm": 0.31181371212005615, + "learning_rate": 0.0003826092082221714, + "loss": 1.2529, + "step": 3848 + }, + { + "epoch": 1.765049879600963, + "grad_norm": 0.2337476760149002, + "learning_rate": 0.00038236850220393285, + "loss": 1.1989, + "step": 3849 + }, + { + "epoch": 1.765508542598326, + "grad_norm": 0.2774173319339752, + "learning_rate": 0.000382127825041549, + "loss": 1.6288, + "step": 3850 + }, + { + "epoch": 1.7659672055956885, + "grad_norm": 0.3347560465335846, + "learning_rate": 0.0003818871767940595, + "loss": 1.3356, + "step": 3851 + }, + { + "epoch": 1.7664258685930512, + "grad_norm": 0.339616596698761, + "learning_rate": 0.00038164655752049713, + "loss": 1.9014, + "step": 3852 + }, + { + "epoch": 1.766884531590414, + "grad_norm": 0.3301674425601959, + "learning_rate": 0.0003814059672798876, + "loss": 1.2728, + "step": 3853 + }, + { + "epoch": 1.7673431945877767, + "grad_norm": 0.2250106781721115, + "learning_rate": 0.0003811654061312495, + "loss": 1.7408, + "step": 3854 + }, + { + "epoch": 1.7678018575851393, + "grad_norm": 0.252518892288208, + "learning_rate": 0.00038092487413359405, + "loss": 1.2438, + "step": 3855 + }, + { + "epoch": 1.7682605205825022, + "grad_norm": 0.2107822746038437, + "learning_rate": 0.00038068437134592553, + "loss": 1.3066, + "step": 3856 + }, + { + "epoch": 1.7687191835798646, + "grad_norm": 0.33197417855262756, + "learning_rate": 0.0003804438978272411, + "loss": 1.2697, + "step": 3857 + }, + { + "epoch": 1.7691778465772274, + "grad_norm": 0.22738035023212433, + "learning_rate": 0.0003802034536365305, + "loss": 0.8106, + "step": 3858 + }, + { + "epoch": 1.76963650957459, + "grad_norm": 0.22326631844043732, + "learning_rate": 0.0003799630388327766, + "loss": 1.5657, + "step": 3859 + }, + { + "epoch": 1.7700951725719527, + "grad_norm": 0.30757084488868713, + "learning_rate": 0.00037972265347495474, + "loss": 1.4067, + "step": 3860 + }, + { + "epoch": 1.7705538355693156, + "grad_norm": 0.22894053161144257, + "learning_rate": 0.00037948229762203313, + "loss": 0.7838, + "step": 3861 + }, + { + "epoch": 1.771012498566678, + "grad_norm": 0.3080291152000427, + "learning_rate": 0.0003792419713329729, + "loss": 1.3486, + "step": 3862 + }, + { + "epoch": 1.7714711615640408, + "grad_norm": 0.21413278579711914, + "learning_rate": 0.00037900167466672793, + "loss": 1.0098, + "step": 3863 + }, + { + "epoch": 1.7719298245614035, + "grad_norm": 0.2879568040370941, + "learning_rate": 0.00037876140768224444, + "loss": 1.5512, + "step": 3864 + }, + { + "epoch": 1.772388487558766, + "grad_norm": 0.28043875098228455, + "learning_rate": 0.0003785211704384621, + "loss": 1.2938, + "step": 3865 + }, + { + "epoch": 1.772847150556129, + "grad_norm": 0.3483384847640991, + "learning_rate": 0.0003782809629943124, + "loss": 1.7574, + "step": 3866 + }, + { + "epoch": 1.7733058135534916, + "grad_norm": 0.33012205362319946, + "learning_rate": 0.00037804078540872005, + "loss": 1.2911, + "step": 3867 + }, + { + "epoch": 1.7737644765508542, + "grad_norm": 0.2982887327671051, + "learning_rate": 0.0003778006377406025, + "loss": 1.8078, + "step": 3868 + }, + { + "epoch": 1.774223139548217, + "grad_norm": 0.19081206619739532, + "learning_rate": 0.0003775605200488694, + "loss": 0.4456, + "step": 3869 + }, + { + "epoch": 1.7746818025455795, + "grad_norm": 0.3630902171134949, + "learning_rate": 0.00037732043239242373, + "loss": 1.98, + "step": 3870 + }, + { + "epoch": 1.7751404655429424, + "grad_norm": 0.25359615683555603, + "learning_rate": 0.00037708037483016085, + "loss": 0.9155, + "step": 3871 + }, + { + "epoch": 1.775599128540305, + "grad_norm": 0.3517308533191681, + "learning_rate": 0.000376840347420968, + "loss": 1.8185, + "step": 3872 + }, + { + "epoch": 1.7760577915376676, + "grad_norm": 0.23731131851673126, + "learning_rate": 0.00037660035022372604, + "loss": 0.7537, + "step": 3873 + }, + { + "epoch": 1.7765164545350305, + "grad_norm": 0.2584936320781708, + "learning_rate": 0.0003763603832973077, + "loss": 1.7855, + "step": 3874 + }, + { + "epoch": 1.7769751175323931, + "grad_norm": 0.42461255192756653, + "learning_rate": 0.00037612044670057906, + "loss": 1.1224, + "step": 3875 + }, + { + "epoch": 1.7774337805297558, + "grad_norm": 0.2801918089389801, + "learning_rate": 0.00037588054049239817, + "loss": 1.6647, + "step": 3876 + }, + { + "epoch": 1.7778924435271186, + "grad_norm": 0.3888150751590729, + "learning_rate": 0.0003756406647316155, + "loss": 1.7823, + "step": 3877 + }, + { + "epoch": 1.778351106524481, + "grad_norm": 0.4158638119697571, + "learning_rate": 0.00037540081947707443, + "loss": 1.3085, + "step": 3878 + }, + { + "epoch": 1.7788097695218439, + "grad_norm": 0.055091023445129395, + "learning_rate": 0.0003751610047876106, + "loss": 0.7616, + "step": 3879 + }, + { + "epoch": 1.7792684325192065, + "grad_norm": 0.2197464108467102, + "learning_rate": 0.00037492122072205257, + "loss": 0.7161, + "step": 3880 + }, + { + "epoch": 1.7797270955165692, + "grad_norm": 0.39263108372688293, + "learning_rate": 0.00037468146733922106, + "loss": 1.9539, + "step": 3881 + }, + { + "epoch": 1.780185758513932, + "grad_norm": 0.43456920981407166, + "learning_rate": 0.0003744417446979293, + "loss": 1.8441, + "step": 3882 + }, + { + "epoch": 1.7806444215112944, + "grad_norm": 0.3044719099998474, + "learning_rate": 0.0003742020528569827, + "loss": 1.7751, + "step": 3883 + }, + { + "epoch": 1.7811030845086573, + "grad_norm": 0.44964897632598877, + "learning_rate": 0.0003739623918751795, + "loss": 1.338, + "step": 3884 + }, + { + "epoch": 1.78156174750602, + "grad_norm": 0.3019511103630066, + "learning_rate": 0.00037372276181131043, + "loss": 1.298, + "step": 3885 + }, + { + "epoch": 1.7820204105033826, + "grad_norm": 0.33377906680107117, + "learning_rate": 0.0003734831627241584, + "loss": 2.1521, + "step": 3886 + }, + { + "epoch": 1.7824790735007454, + "grad_norm": 0.4243369996547699, + "learning_rate": 0.000373243594672499, + "loss": 2.0967, + "step": 3887 + }, + { + "epoch": 1.782937736498108, + "grad_norm": 0.41644036769866943, + "learning_rate": 0.0003730040577150995, + "loss": 1.89, + "step": 3888 + }, + { + "epoch": 1.7833963994954707, + "grad_norm": 0.2524632215499878, + "learning_rate": 0.0003727645519107204, + "loss": 1.5137, + "step": 3889 + }, + { + "epoch": 1.7838550624928335, + "grad_norm": 0.4522644281387329, + "learning_rate": 0.0003725250773181141, + "loss": 1.5999, + "step": 3890 + }, + { + "epoch": 1.784313725490196, + "grad_norm": 0.25750890374183655, + "learning_rate": 0.0003722856339960256, + "loss": 1.3307, + "step": 3891 + }, + { + "epoch": 1.7847723884875588, + "grad_norm": 0.37072017788887024, + "learning_rate": 0.0003720462220031918, + "loss": 1.6609, + "step": 3892 + }, + { + "epoch": 1.7852310514849214, + "grad_norm": 0.35205331444740295, + "learning_rate": 0.0003718068413983425, + "loss": 1.0562, + "step": 3893 + }, + { + "epoch": 1.785689714482284, + "grad_norm": 0.05115019530057907, + "learning_rate": 0.00037156749224019923, + "loss": 0.6781, + "step": 3894 + }, + { + "epoch": 1.786148377479647, + "grad_norm": 0.25191041827201843, + "learning_rate": 0.00037132817458747613, + "loss": 1.4272, + "step": 3895 + }, + { + "epoch": 1.7866070404770094, + "grad_norm": 0.27612894773483276, + "learning_rate": 0.00037108888849887966, + "loss": 0.9952, + "step": 3896 + }, + { + "epoch": 1.7870657034743722, + "grad_norm": 0.2937767803668976, + "learning_rate": 0.0003708496340331082, + "loss": 1.6864, + "step": 3897 + }, + { + "epoch": 1.7875243664717348, + "grad_norm": 0.3436873257160187, + "learning_rate": 0.00037061041124885285, + "loss": 1.9355, + "step": 3898 + }, + { + "epoch": 1.7879830294690975, + "grad_norm": 0.35292357206344604, + "learning_rate": 0.00037037122020479665, + "loss": 1.7389, + "step": 3899 + }, + { + "epoch": 1.7884416924664603, + "grad_norm": 0.3429259955883026, + "learning_rate": 0.0003701320609596147, + "loss": 1.8025, + "step": 3900 + }, + { + "epoch": 1.788900355463823, + "grad_norm": 0.38343197107315063, + "learning_rate": 0.00036989293357197464, + "loss": 2.0798, + "step": 3901 + }, + { + "epoch": 1.7893590184611856, + "grad_norm": 0.30104926228523254, + "learning_rate": 0.00036965383810053597, + "loss": 1.3101, + "step": 3902 + }, + { + "epoch": 1.7898176814585485, + "grad_norm": 0.2541302740573883, + "learning_rate": 0.00036941477460395074, + "loss": 1.0455, + "step": 3903 + }, + { + "epoch": 1.7902763444559109, + "grad_norm": 0.2968154847621918, + "learning_rate": 0.000369175743140863, + "loss": 1.9026, + "step": 3904 + }, + { + "epoch": 1.7907350074532737, + "grad_norm": 0.33623620867729187, + "learning_rate": 0.0003689367437699086, + "loss": 1.1528, + "step": 3905 + }, + { + "epoch": 1.7911936704506364, + "grad_norm": 0.3630521297454834, + "learning_rate": 0.00036869777654971594, + "loss": 1.7005, + "step": 3906 + }, + { + "epoch": 1.791652333447999, + "grad_norm": 0.38895827531814575, + "learning_rate": 0.0003684588415389055, + "loss": 1.7465, + "step": 3907 + }, + { + "epoch": 1.7921109964453619, + "grad_norm": 0.38369861245155334, + "learning_rate": 0.0003682199387960896, + "loss": 1.2702, + "step": 3908 + }, + { + "epoch": 1.7925696594427245, + "grad_norm": 0.222330242395401, + "learning_rate": 0.00036798106837987297, + "loss": 1.2868, + "step": 3909 + }, + { + "epoch": 1.7930283224400871, + "grad_norm": 0.2768653631210327, + "learning_rate": 0.00036774223034885236, + "loss": 1.0325, + "step": 3910 + }, + { + "epoch": 1.79348698543745, + "grad_norm": 0.4517102837562561, + "learning_rate": 0.000367503424761616, + "loss": 1.3302, + "step": 3911 + }, + { + "epoch": 1.7939456484348124, + "grad_norm": 0.31973719596862793, + "learning_rate": 0.000367264651676745, + "loss": 1.7516, + "step": 3912 + }, + { + "epoch": 1.7944043114321753, + "grad_norm": 0.36346325278282166, + "learning_rate": 0.000367025911152812, + "loss": 1.7175, + "step": 3913 + }, + { + "epoch": 1.794862974429538, + "grad_norm": 0.3458804190158844, + "learning_rate": 0.00036678720324838176, + "loss": 1.3469, + "step": 3914 + }, + { + "epoch": 1.7953216374269005, + "grad_norm": 0.3012971580028534, + "learning_rate": 0.0003665485280220112, + "loss": 1.5734, + "step": 3915 + }, + { + "epoch": 1.7957803004242634, + "grad_norm": 0.3793402910232544, + "learning_rate": 0.00036630988553224887, + "loss": 1.6147, + "step": 3916 + }, + { + "epoch": 1.7962389634216258, + "grad_norm": 0.4072246253490448, + "learning_rate": 0.00036607127583763554, + "loss": 1.4536, + "step": 3917 + }, + { + "epoch": 1.7966976264189887, + "grad_norm": 0.32000985741615295, + "learning_rate": 0.0003658326989967039, + "loss": 1.438, + "step": 3918 + }, + { + "epoch": 1.7971562894163513, + "grad_norm": 0.2759474217891693, + "learning_rate": 0.00036559415506797865, + "loss": 1.33, + "step": 3919 + }, + { + "epoch": 1.797614952413714, + "grad_norm": 0.32986119389533997, + "learning_rate": 0.0003653556441099762, + "loss": 1.3684, + "step": 3920 + }, + { + "epoch": 1.7980736154110768, + "grad_norm": 0.23913626372814178, + "learning_rate": 0.0003651171661812053, + "loss": 1.3252, + "step": 3921 + }, + { + "epoch": 1.7985322784084394, + "grad_norm": 0.33937355875968933, + "learning_rate": 0.0003648787213401659, + "loss": 1.4227, + "step": 3922 + }, + { + "epoch": 1.798990941405802, + "grad_norm": 0.2144535481929779, + "learning_rate": 0.00036464030964535044, + "loss": 0.9513, + "step": 3923 + }, + { + "epoch": 1.799449604403165, + "grad_norm": 0.2545612156391144, + "learning_rate": 0.00036440193115524306, + "loss": 1.1305, + "step": 3924 + }, + { + "epoch": 1.7999082674005273, + "grad_norm": 0.33486199378967285, + "learning_rate": 0.0003641635859283197, + "loss": 1.4782, + "step": 3925 + }, + { + "epoch": 1.8003669303978902, + "grad_norm": 0.16233354806900024, + "learning_rate": 0.000363925274023048, + "loss": 0.611, + "step": 3926 + }, + { + "epoch": 1.8008255933952528, + "grad_norm": 0.33829358220100403, + "learning_rate": 0.00036368699549788795, + "loss": 1.8271, + "step": 3927 + }, + { + "epoch": 1.8012842563926155, + "grad_norm": 0.2723900079727173, + "learning_rate": 0.00036344875041129066, + "loss": 1.1435, + "step": 3928 + }, + { + "epoch": 1.8017429193899783, + "grad_norm": 0.2544368803501129, + "learning_rate": 0.00036321053882169954, + "loss": 0.8193, + "step": 3929 + }, + { + "epoch": 1.8022015823873407, + "grad_norm": 0.2767103910446167, + "learning_rate": 0.00036297236078754945, + "loss": 1.3754, + "step": 3930 + }, + { + "epoch": 1.8026602453847036, + "grad_norm": 0.1679062396287918, + "learning_rate": 0.00036273421636726723, + "loss": 0.8304, + "step": 3931 + }, + { + "epoch": 1.8031189083820662, + "grad_norm": 0.32275891304016113, + "learning_rate": 0.0003624961056192717, + "loss": 1.2045, + "step": 3932 + }, + { + "epoch": 1.8035775713794289, + "grad_norm": 0.1948852390050888, + "learning_rate": 0.00036225802860197275, + "loss": 0.963, + "step": 3933 + }, + { + "epoch": 1.8040362343767917, + "grad_norm": 0.39679884910583496, + "learning_rate": 0.00036201998537377264, + "loss": 1.5401, + "step": 3934 + }, + { + "epoch": 1.8044948973741544, + "grad_norm": 0.2812747657299042, + "learning_rate": 0.00036178197599306494, + "loss": 2.0823, + "step": 3935 + }, + { + "epoch": 1.804953560371517, + "grad_norm": 0.42034879326820374, + "learning_rate": 0.00036154400051823497, + "loss": 2.0599, + "step": 3936 + }, + { + "epoch": 1.8054122233688799, + "grad_norm": 0.4247654676437378, + "learning_rate": 0.00036130605900766024, + "loss": 1.8334, + "step": 3937 + }, + { + "epoch": 1.8058708863662423, + "grad_norm": 0.35334038734436035, + "learning_rate": 0.0003610681515197094, + "loss": 2.0579, + "step": 3938 + }, + { + "epoch": 1.8063295493636051, + "grad_norm": 0.3598625063896179, + "learning_rate": 0.0003608302781127425, + "loss": 2.2363, + "step": 3939 + }, + { + "epoch": 1.8067882123609678, + "grad_norm": 0.29301023483276367, + "learning_rate": 0.00036059243884511185, + "loss": 1.0839, + "step": 3940 + }, + { + "epoch": 1.8072468753583304, + "grad_norm": 0.281200110912323, + "learning_rate": 0.0003603546337751611, + "loss": 1.0665, + "step": 3941 + }, + { + "epoch": 1.8077055383556933, + "grad_norm": 0.16020144522190094, + "learning_rate": 0.0003601168629612256, + "loss": 1.1812, + "step": 3942 + }, + { + "epoch": 1.8081642013530559, + "grad_norm": 0.3196254372596741, + "learning_rate": 0.00035987912646163247, + "loss": 1.4895, + "step": 3943 + }, + { + "epoch": 1.8086228643504185, + "grad_norm": 0.34358012676239014, + "learning_rate": 0.0003596414243346997, + "loss": 1.4952, + "step": 3944 + }, + { + "epoch": 1.8090815273477814, + "grad_norm": 0.2578059136867523, + "learning_rate": 0.00035940375663873767, + "loss": 1.4763, + "step": 3945 + }, + { + "epoch": 1.8095401903451438, + "grad_norm": 0.33023521304130554, + "learning_rate": 0.0003591661234320477, + "loss": 1.0975, + "step": 3946 + }, + { + "epoch": 1.8099988533425067, + "grad_norm": 0.16670942306518555, + "learning_rate": 0.00035892852477292325, + "loss": 1.1774, + "step": 3947 + }, + { + "epoch": 1.8104575163398693, + "grad_norm": 0.3831387758255005, + "learning_rate": 0.00035869096071964885, + "loss": 2.0361, + "step": 3948 + }, + { + "epoch": 1.810916179337232, + "grad_norm": 0.3592255413532257, + "learning_rate": 0.0003584534313305009, + "loss": 1.2978, + "step": 3949 + }, + { + "epoch": 1.8113748423345948, + "grad_norm": 0.05875217914581299, + "learning_rate": 0.0003582159366637466, + "loss": 0.4578, + "step": 3950 + }, + { + "epoch": 1.8118335053319572, + "grad_norm": 0.2751762270927429, + "learning_rate": 0.00035797847677764526, + "loss": 1.256, + "step": 3951 + }, + { + "epoch": 1.81229216832932, + "grad_norm": 0.20929132401943207, + "learning_rate": 0.0003577410517304477, + "loss": 1.1679, + "step": 3952 + }, + { + "epoch": 1.8127508313266827, + "grad_norm": 0.25931280851364136, + "learning_rate": 0.00035750366158039594, + "loss": 0.9565, + "step": 3953 + }, + { + "epoch": 1.8132094943240453, + "grad_norm": 0.20596548914909363, + "learning_rate": 0.0003572663063857234, + "loss": 1.2343, + "step": 3954 + }, + { + "epoch": 1.8136681573214082, + "grad_norm": 0.2564990818500519, + "learning_rate": 0.0003570289862046553, + "loss": 1.6389, + "step": 3955 + }, + { + "epoch": 1.8141268203187708, + "grad_norm": 0.3115670382976532, + "learning_rate": 0.0003567917010954074, + "loss": 1.24, + "step": 3956 + }, + { + "epoch": 1.8145854833161335, + "grad_norm": 0.29912760853767395, + "learning_rate": 0.0003565544511161879, + "loss": 1.6694, + "step": 3957 + }, + { + "epoch": 1.8150441463134963, + "grad_norm": 0.37836483120918274, + "learning_rate": 0.00035631723632519594, + "loss": 2.1592, + "step": 3958 + }, + { + "epoch": 1.8155028093108587, + "grad_norm": 0.3637576401233673, + "learning_rate": 0.0003560800567806218, + "loss": 1.2883, + "step": 3959 + }, + { + "epoch": 1.8159614723082216, + "grad_norm": 0.2932051420211792, + "learning_rate": 0.0003558429125406476, + "loss": 1.4525, + "step": 3960 + }, + { + "epoch": 1.8164201353055842, + "grad_norm": 0.3286663591861725, + "learning_rate": 0.0003556058036634463, + "loss": 1.7548, + "step": 3961 + }, + { + "epoch": 1.8168787983029469, + "grad_norm": 0.3011263310909271, + "learning_rate": 0.00035536873020718254, + "loss": 1.6652, + "step": 3962 + }, + { + "epoch": 1.8173374613003097, + "grad_norm": 0.381115585565567, + "learning_rate": 0.000355131692230012, + "loss": 1.6675, + "step": 3963 + }, + { + "epoch": 1.8177961242976721, + "grad_norm": 0.2726176083087921, + "learning_rate": 0.00035489468979008195, + "loss": 1.4564, + "step": 3964 + }, + { + "epoch": 1.818254787295035, + "grad_norm": 0.32600149512290955, + "learning_rate": 0.0003546577229455308, + "loss": 1.9954, + "step": 3965 + }, + { + "epoch": 1.8187134502923976, + "grad_norm": 0.2766259014606476, + "learning_rate": 0.0003544207917544882, + "loss": 1.4604, + "step": 3966 + }, + { + "epoch": 1.8191721132897603, + "grad_norm": 0.3894624710083008, + "learning_rate": 0.000354183896275075, + "loss": 0.8529, + "step": 3967 + }, + { + "epoch": 1.819630776287123, + "grad_norm": 0.2296787053346634, + "learning_rate": 0.00035394703656540345, + "loss": 1.3822, + "step": 3968 + }, + { + "epoch": 1.8200894392844857, + "grad_norm": 0.3591334819793701, + "learning_rate": 0.00035371021268357694, + "loss": 1.9995, + "step": 3969 + }, + { + "epoch": 1.8205481022818484, + "grad_norm": 0.33714228868484497, + "learning_rate": 0.00035347342468769, + "loss": 1.4948, + "step": 3970 + }, + { + "epoch": 1.8210067652792112, + "grad_norm": 0.2692127227783203, + "learning_rate": 0.00035323667263582865, + "loss": 1.1647, + "step": 3971 + }, + { + "epoch": 1.8214654282765737, + "grad_norm": 0.1590389609336853, + "learning_rate": 0.00035299995658606963, + "loss": 0.9988, + "step": 3972 + }, + { + "epoch": 1.8219240912739365, + "grad_norm": 0.25324851274490356, + "learning_rate": 0.0003527632765964811, + "loss": 1.0857, + "step": 3973 + }, + { + "epoch": 1.8223827542712991, + "grad_norm": 0.29076528549194336, + "learning_rate": 0.00035252663272512255, + "loss": 1.6969, + "step": 3974 + }, + { + "epoch": 1.8228414172686618, + "grad_norm": 0.2527695894241333, + "learning_rate": 0.0003522900250300443, + "loss": 1.2706, + "step": 3975 + }, + { + "epoch": 1.8233000802660246, + "grad_norm": 0.34270545840263367, + "learning_rate": 0.000352053453569288, + "loss": 1.6218, + "step": 3976 + }, + { + "epoch": 1.8237587432633873, + "grad_norm": 0.2784097194671631, + "learning_rate": 0.00035181691840088623, + "loss": 1.166, + "step": 3977 + }, + { + "epoch": 1.82421740626075, + "grad_norm": 0.30808866024017334, + "learning_rate": 0.0003515804195828629, + "loss": 1.2634, + "step": 3978 + }, + { + "epoch": 1.8246760692581128, + "grad_norm": 0.16882003843784332, + "learning_rate": 0.00035134395717323276, + "loss": 0.8922, + "step": 3979 + }, + { + "epoch": 1.8251347322554752, + "grad_norm": 0.1895928829908371, + "learning_rate": 0.0003511075312300018, + "loss": 0.895, + "step": 3980 + }, + { + "epoch": 1.825593395252838, + "grad_norm": 0.21428383886814117, + "learning_rate": 0.00035087114181116697, + "loss": 1.3183, + "step": 3981 + }, + { + "epoch": 1.8260520582502007, + "grad_norm": 0.28207674622535706, + "learning_rate": 0.0003506347889747164, + "loss": 0.9283, + "step": 3982 + }, + { + "epoch": 1.8265107212475633, + "grad_norm": 0.24229072034358978, + "learning_rate": 0.000350398472778629, + "loss": 1.783, + "step": 3983 + }, + { + "epoch": 1.8269693842449262, + "grad_norm": 0.309413880109787, + "learning_rate": 0.000350162193280875, + "loss": 1.2617, + "step": 3984 + }, + { + "epoch": 1.8274280472422886, + "grad_norm": 0.2385943979024887, + "learning_rate": 0.00034992595053941525, + "loss": 1.2587, + "step": 3985 + }, + { + "epoch": 1.8278867102396514, + "grad_norm": 0.31749603152275085, + "learning_rate": 0.00034968974461220195, + "loss": 1.8345, + "step": 3986 + }, + { + "epoch": 1.828345373237014, + "grad_norm": 0.28810974955558777, + "learning_rate": 0.0003494535755571781, + "loss": 1.673, + "step": 3987 + }, + { + "epoch": 1.8288040362343767, + "grad_norm": 0.310904324054718, + "learning_rate": 0.0003492174434322778, + "loss": 1.5857, + "step": 3988 + }, + { + "epoch": 1.8292626992317396, + "grad_norm": 0.20826320350170135, + "learning_rate": 0.00034898134829542565, + "loss": 0.8103, + "step": 3989 + }, + { + "epoch": 1.8297213622291022, + "grad_norm": 0.3267036974430084, + "learning_rate": 0.0003487452902045377, + "loss": 1.3232, + "step": 3990 + }, + { + "epoch": 1.8301800252264648, + "grad_norm": 0.24371105432510376, + "learning_rate": 0.00034850926921752067, + "loss": 1.2933, + "step": 3991 + }, + { + "epoch": 1.8306386882238277, + "grad_norm": 0.3523206412792206, + "learning_rate": 0.0003482732853922722, + "loss": 1.2762, + "step": 3992 + }, + { + "epoch": 1.83109735122119, + "grad_norm": 0.5423739552497864, + "learning_rate": 0.00034803733878668077, + "loss": 0.9312, + "step": 3993 + }, + { + "epoch": 1.831556014218553, + "grad_norm": 0.31650495529174805, + "learning_rate": 0.0003478014294586261, + "loss": 2.0525, + "step": 3994 + }, + { + "epoch": 1.8320146772159156, + "grad_norm": 0.3087564706802368, + "learning_rate": 0.000347565557465978, + "loss": 1.3784, + "step": 3995 + }, + { + "epoch": 1.8324733402132782, + "grad_norm": 0.2533722519874573, + "learning_rate": 0.0003473297228665978, + "loss": 1.3376, + "step": 3996 + }, + { + "epoch": 1.832932003210641, + "grad_norm": 0.3357933759689331, + "learning_rate": 0.0003470939257183373, + "loss": 1.7561, + "step": 3997 + }, + { + "epoch": 1.8333906662080035, + "grad_norm": 0.36861729621887207, + "learning_rate": 0.0003468581660790393, + "loss": 1.7476, + "step": 3998 + }, + { + "epoch": 1.8338493292053664, + "grad_norm": 0.26520270109176636, + "learning_rate": 0.0003466224440065377, + "loss": 1.5038, + "step": 3999 + }, + { + "epoch": 1.834307992202729, + "grad_norm": 0.3509536385536194, + "learning_rate": 0.0003463867595586562, + "loss": 1.8193, + "step": 4000 + }, + { + "epoch": 1.8347666552000916, + "grad_norm": 0.2574576139450073, + "learning_rate": 0.0003461511127932103, + "loss": 1.2697, + "step": 4001 + }, + { + "epoch": 1.8352253181974545, + "grad_norm": 0.3160921037197113, + "learning_rate": 0.00034591550376800563, + "loss": 1.655, + "step": 4002 + }, + { + "epoch": 1.8356839811948171, + "grad_norm": 0.34427279233932495, + "learning_rate": 0.00034567993254083887, + "loss": 1.3817, + "step": 4003 + }, + { + "epoch": 1.8361426441921798, + "grad_norm": 0.31001847982406616, + "learning_rate": 0.00034544439916949746, + "loss": 1.1495, + "step": 4004 + }, + { + "epoch": 1.8366013071895426, + "grad_norm": 0.23114445805549622, + "learning_rate": 0.0003452089037117595, + "loss": 1.3843, + "step": 4005 + }, + { + "epoch": 1.837059970186905, + "grad_norm": 0.2845550775527954, + "learning_rate": 0.0003449734462253934, + "loss": 1.0899, + "step": 4006 + }, + { + "epoch": 1.837518633184268, + "grad_norm": 0.368184894323349, + "learning_rate": 0.0003447380267681587, + "loss": 1.9531, + "step": 4007 + }, + { + "epoch": 1.8379772961816305, + "grad_norm": 0.27855613827705383, + "learning_rate": 0.0003445026453978054, + "loss": 0.8792, + "step": 4008 + }, + { + "epoch": 1.8384359591789932, + "grad_norm": 0.29736289381980896, + "learning_rate": 0.00034426730217207457, + "loss": 1.3437, + "step": 4009 + }, + { + "epoch": 1.838894622176356, + "grad_norm": 0.26978176832199097, + "learning_rate": 0.0003440319971486976, + "loss": 0.9284, + "step": 4010 + }, + { + "epoch": 1.8393532851737187, + "grad_norm": 0.24370868504047394, + "learning_rate": 0.00034379673038539604, + "loss": 1.8301, + "step": 4011 + }, + { + "epoch": 1.8398119481710813, + "grad_norm": 0.31381699442863464, + "learning_rate": 0.0003435615019398828, + "loss": 1.2786, + "step": 4012 + }, + { + "epoch": 1.8402706111684441, + "grad_norm": 0.36527925729751587, + "learning_rate": 0.00034332631186986107, + "loss": 1.5788, + "step": 4013 + }, + { + "epoch": 1.8407292741658066, + "grad_norm": 0.29045170545578003, + "learning_rate": 0.0003430911602330248, + "loss": 1.7421, + "step": 4014 + }, + { + "epoch": 1.8411879371631694, + "grad_norm": 0.515656590461731, + "learning_rate": 0.0003428560470870583, + "loss": 2.0176, + "step": 4015 + }, + { + "epoch": 1.841646600160532, + "grad_norm": 0.2787885367870331, + "learning_rate": 0.00034262097248963675, + "loss": 1.4663, + "step": 4016 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 0.2862381637096405, + "learning_rate": 0.00034238593649842515, + "loss": 0.9792, + "step": 4017 + }, + { + "epoch": 1.8425639261552575, + "grad_norm": 0.30698710680007935, + "learning_rate": 0.0003421509391710797, + "loss": 1.7229, + "step": 4018 + }, + { + "epoch": 1.84302258915262, + "grad_norm": 0.3188919723033905, + "learning_rate": 0.0003419159805652471, + "loss": 1.5143, + "step": 4019 + }, + { + "epoch": 1.8434812521499828, + "grad_norm": 0.3322117328643799, + "learning_rate": 0.0003416810607385644, + "loss": 1.9916, + "step": 4020 + }, + { + "epoch": 1.8439399151473455, + "grad_norm": 0.3120747208595276, + "learning_rate": 0.00034144617974865896, + "loss": 1.4741, + "step": 4021 + }, + { + "epoch": 1.844398578144708, + "grad_norm": 0.2903960049152374, + "learning_rate": 0.00034121133765314905, + "loss": 1.6743, + "step": 4022 + }, + { + "epoch": 1.844857241142071, + "grad_norm": 0.40729033946990967, + "learning_rate": 0.00034097653450964265, + "loss": 1.9248, + "step": 4023 + }, + { + "epoch": 1.8453159041394336, + "grad_norm": 0.3693930208683014, + "learning_rate": 0.00034074177037573904, + "loss": 2.1533, + "step": 4024 + }, + { + "epoch": 1.8457745671367962, + "grad_norm": 0.2894419729709625, + "learning_rate": 0.00034050704530902756, + "loss": 1.4656, + "step": 4025 + }, + { + "epoch": 1.846233230134159, + "grad_norm": 0.32523804903030396, + "learning_rate": 0.00034027235936708776, + "loss": 1.505, + "step": 4026 + }, + { + "epoch": 1.8466918931315215, + "grad_norm": 0.2615148723125458, + "learning_rate": 0.00034003771260749017, + "loss": 1.1829, + "step": 4027 + }, + { + "epoch": 1.8471505561288843, + "grad_norm": 0.34713688492774963, + "learning_rate": 0.00033980310508779476, + "loss": 1.1238, + "step": 4028 + }, + { + "epoch": 1.847609219126247, + "grad_norm": 0.2878703773021698, + "learning_rate": 0.0003395685368655528, + "loss": 1.0275, + "step": 4029 + }, + { + "epoch": 1.8480678821236096, + "grad_norm": 0.2368079125881195, + "learning_rate": 0.00033933400799830563, + "loss": 1.4276, + "step": 4030 + }, + { + "epoch": 1.8485265451209725, + "grad_norm": 0.20480850338935852, + "learning_rate": 0.0003390995185435847, + "loss": 1.324, + "step": 4031 + }, + { + "epoch": 1.8489852081183349, + "grad_norm": 0.27292463183403015, + "learning_rate": 0.00033886506855891195, + "loss": 1.3118, + "step": 4032 + }, + { + "epoch": 1.8494438711156977, + "grad_norm": 0.2694958448410034, + "learning_rate": 0.00033863065810179986, + "loss": 1.5169, + "step": 4033 + }, + { + "epoch": 1.8499025341130604, + "grad_norm": 0.3515981137752533, + "learning_rate": 0.0003383962872297508, + "loss": 2.0571, + "step": 4034 + }, + { + "epoch": 1.850361197110423, + "grad_norm": 0.3751930594444275, + "learning_rate": 0.0003381619560002577, + "loss": 1.3073, + "step": 4035 + }, + { + "epoch": 1.8508198601077859, + "grad_norm": 0.3277819752693176, + "learning_rate": 0.0003379276644708037, + "loss": 1.6505, + "step": 4036 + }, + { + "epoch": 1.8512785231051485, + "grad_norm": 0.17637912929058075, + "learning_rate": 0.00033769341269886225, + "loss": 0.4892, + "step": 4037 + }, + { + "epoch": 1.8517371861025111, + "grad_norm": 0.2396342158317566, + "learning_rate": 0.00033745920074189703, + "loss": 1.3491, + "step": 4038 + }, + { + "epoch": 1.852195849099874, + "grad_norm": 0.3997967541217804, + "learning_rate": 0.0003372250286573617, + "loss": 2.1116, + "step": 4039 + }, + { + "epoch": 1.8526545120972364, + "grad_norm": 0.27099528908729553, + "learning_rate": 0.00033699089650270054, + "loss": 1.0897, + "step": 4040 + }, + { + "epoch": 1.8531131750945993, + "grad_norm": 0.29823604226112366, + "learning_rate": 0.00033675680433534785, + "loss": 1.5873, + "step": 4041 + }, + { + "epoch": 1.853571838091962, + "grad_norm": 0.30324751138687134, + "learning_rate": 0.0003365227522127281, + "loss": 1.3735, + "step": 4042 + }, + { + "epoch": 1.8540305010893245, + "grad_norm": 0.2946029305458069, + "learning_rate": 0.000336288740192256, + "loss": 1.4114, + "step": 4043 + }, + { + "epoch": 1.8544891640866874, + "grad_norm": 0.2893146872520447, + "learning_rate": 0.0003360547683313363, + "loss": 2.0024, + "step": 4044 + }, + { + "epoch": 1.85494782708405, + "grad_norm": 0.37933430075645447, + "learning_rate": 0.00033582083668736405, + "loss": 1.7002, + "step": 4045 + }, + { + "epoch": 1.8554064900814127, + "grad_norm": 0.27654582262039185, + "learning_rate": 0.0003355869453177244, + "loss": 0.7906, + "step": 4046 + }, + { + "epoch": 1.8558651530787755, + "grad_norm": 0.09229593724012375, + "learning_rate": 0.00033535309427979245, + "loss": 1.09, + "step": 4047 + }, + { + "epoch": 1.856323816076138, + "grad_norm": 0.3510340750217438, + "learning_rate": 0.0003351192836309336, + "loss": 1.3514, + "step": 4048 + }, + { + "epoch": 1.8567824790735008, + "grad_norm": 0.2260064333677292, + "learning_rate": 0.0003348855134285034, + "loss": 0.8711, + "step": 4049 + }, + { + "epoch": 1.8572411420708634, + "grad_norm": 0.27447935938835144, + "learning_rate": 0.0003346517837298474, + "loss": 1.7773, + "step": 4050 + }, + { + "epoch": 1.857699805068226, + "grad_norm": 0.2299553006887436, + "learning_rate": 0.000334418094592301, + "loss": 0.9055, + "step": 4051 + }, + { + "epoch": 1.858158468065589, + "grad_norm": 0.2812572121620178, + "learning_rate": 0.0003341844460731899, + "loss": 1.3986, + "step": 4052 + }, + { + "epoch": 1.8586171310629513, + "grad_norm": 0.18786439299583435, + "learning_rate": 0.0003339508382298297, + "loss": 0.7558, + "step": 4053 + }, + { + "epoch": 1.8590757940603142, + "grad_norm": 0.06244681030511856, + "learning_rate": 0.0003337172711195262, + "loss": 0.9041, + "step": 4054 + }, + { + "epoch": 1.8595344570576768, + "grad_norm": 0.3086715042591095, + "learning_rate": 0.00033348374479957513, + "loss": 1.2524, + "step": 4055 + }, + { + "epoch": 1.8599931200550395, + "grad_norm": 0.1853373497724533, + "learning_rate": 0.000333250259327262, + "loss": 0.4612, + "step": 4056 + }, + { + "epoch": 1.8604517830524023, + "grad_norm": 0.24042657017707825, + "learning_rate": 0.0003330168147598626, + "loss": 1.0255, + "step": 4057 + }, + { + "epoch": 1.860910446049765, + "grad_norm": 0.36561837792396545, + "learning_rate": 0.00033278341115464263, + "loss": 1.3232, + "step": 4058 + }, + { + "epoch": 1.8613691090471276, + "grad_norm": 0.2835361361503601, + "learning_rate": 0.0003325500485688575, + "loss": 1.1592, + "step": 4059 + }, + { + "epoch": 1.8618277720444905, + "grad_norm": 0.3773711919784546, + "learning_rate": 0.0003323167270597528, + "loss": 1.4202, + "step": 4060 + }, + { + "epoch": 1.8622864350418529, + "grad_norm": 0.28732502460479736, + "learning_rate": 0.00033208344668456417, + "loss": 1.6362, + "step": 4061 + }, + { + "epoch": 1.8627450980392157, + "grad_norm": 0.35971352458000183, + "learning_rate": 0.00033185020750051673, + "loss": 1.4979, + "step": 4062 + }, + { + "epoch": 1.8632037610365784, + "grad_norm": 0.22411774098873138, + "learning_rate": 0.00033161700956482574, + "loss": 0.8292, + "step": 4063 + }, + { + "epoch": 1.863662424033941, + "grad_norm": 0.2379181832075119, + "learning_rate": 0.00033138385293469654, + "loss": 1.1412, + "step": 4064 + }, + { + "epoch": 1.8641210870313039, + "grad_norm": 0.2763615548610687, + "learning_rate": 0.00033115073766732376, + "loss": 1.2769, + "step": 4065 + }, + { + "epoch": 1.8645797500286663, + "grad_norm": 0.23881085216999054, + "learning_rate": 0.0003309176638198929, + "loss": 0.9071, + "step": 4066 + }, + { + "epoch": 1.8650384130260291, + "grad_norm": 0.3810456097126007, + "learning_rate": 0.000330684631449578, + "loss": 1.6372, + "step": 4067 + }, + { + "epoch": 1.8654970760233918, + "grad_norm": 0.310472309589386, + "learning_rate": 0.0003304516406135438, + "loss": 1.0613, + "step": 4068 + }, + { + "epoch": 1.8659557390207544, + "grad_norm": 0.15861466526985168, + "learning_rate": 0.0003302186913689448, + "loss": 0.6324, + "step": 4069 + }, + { + "epoch": 1.8664144020181173, + "grad_norm": 0.22283266484737396, + "learning_rate": 0.00032998578377292474, + "loss": 0.9156, + "step": 4070 + }, + { + "epoch": 1.86687306501548, + "grad_norm": 1.139766812324524, + "learning_rate": 0.00032975291788261794, + "loss": 1.3506, + "step": 4071 + }, + { + "epoch": 1.8673317280128425, + "grad_norm": 0.41252392530441284, + "learning_rate": 0.00032952009375514815, + "loss": 1.6586, + "step": 4072 + }, + { + "epoch": 1.8677903910102054, + "grad_norm": 0.1836547702550888, + "learning_rate": 0.00032928731144762837, + "loss": 0.9196, + "step": 4073 + }, + { + "epoch": 1.8682490540075678, + "grad_norm": 0.32911989092826843, + "learning_rate": 0.00032905457101716195, + "loss": 1.861, + "step": 4074 + }, + { + "epoch": 1.8687077170049307, + "grad_norm": 0.3695148825645447, + "learning_rate": 0.00032882187252084185, + "loss": 2.0391, + "step": 4075 + }, + { + "epoch": 1.8691663800022933, + "grad_norm": 0.4213230311870575, + "learning_rate": 0.0003285892160157507, + "loss": 1.6511, + "step": 4076 + }, + { + "epoch": 1.869625042999656, + "grad_norm": 0.24618668854236603, + "learning_rate": 0.0003283566015589608, + "loss": 1.6064, + "step": 4077 + }, + { + "epoch": 1.8700837059970188, + "grad_norm": 0.43131116032600403, + "learning_rate": 0.00032812402920753434, + "loss": 1.8835, + "step": 4078 + }, + { + "epoch": 1.8705423689943814, + "grad_norm": 0.3532734811306, + "learning_rate": 0.00032789149901852265, + "loss": 0.9022, + "step": 4079 + }, + { + "epoch": 1.871001031991744, + "grad_norm": 0.21644118428230286, + "learning_rate": 0.00032765901104896714, + "loss": 1.0367, + "step": 4080 + }, + { + "epoch": 1.871459694989107, + "grad_norm": 0.13179314136505127, + "learning_rate": 0.0003274265653558989, + "loss": 1.123, + "step": 4081 + }, + { + "epoch": 1.8719183579864693, + "grad_norm": 0.3340977430343628, + "learning_rate": 0.00032719416199633843, + "loss": 1.6467, + "step": 4082 + }, + { + "epoch": 1.8723770209838322, + "grad_norm": 0.3839796185493469, + "learning_rate": 0.0003269618010272963, + "loss": 2.147, + "step": 4083 + }, + { + "epoch": 1.8728356839811948, + "grad_norm": 0.42947497963905334, + "learning_rate": 0.0003267294825057719, + "loss": 1.448, + "step": 4084 + }, + { + "epoch": 1.8732943469785575, + "grad_norm": 0.3551345765590668, + "learning_rate": 0.0003264972064887546, + "loss": 1.4087, + "step": 4085 + }, + { + "epoch": 1.8737530099759203, + "grad_norm": 0.20159225165843964, + "learning_rate": 0.0003262649730332237, + "loss": 0.8688, + "step": 4086 + }, + { + "epoch": 1.8742116729732827, + "grad_norm": 0.17780308425426483, + "learning_rate": 0.0003260327821961476, + "loss": 0.6803, + "step": 4087 + }, + { + "epoch": 1.8746703359706456, + "grad_norm": 0.27476388216018677, + "learning_rate": 0.0003258006340344845, + "loss": 1.5988, + "step": 4088 + }, + { + "epoch": 1.8751289989680082, + "grad_norm": 0.2936633229255676, + "learning_rate": 0.000325568528605182, + "loss": 1.5938, + "step": 4089 + }, + { + "epoch": 1.8755876619653709, + "grad_norm": 0.24134798347949982, + "learning_rate": 0.00032533646596517683, + "loss": 0.5465, + "step": 4090 + }, + { + "epoch": 1.8760463249627337, + "grad_norm": 0.23964928090572357, + "learning_rate": 0.0003251044461713961, + "loss": 1.5688, + "step": 4091 + }, + { + "epoch": 1.8765049879600963, + "grad_norm": 0.4487531781196594, + "learning_rate": 0.0003248724692807558, + "loss": 1.3783, + "step": 4092 + }, + { + "epoch": 1.876963650957459, + "grad_norm": 0.2621656358242035, + "learning_rate": 0.00032464053535016145, + "loss": 1.6387, + "step": 4093 + }, + { + "epoch": 1.8774223139548218, + "grad_norm": 0.5254698395729065, + "learning_rate": 0.0003244086444365085, + "loss": 1.5309, + "step": 4094 + }, + { + "epoch": 1.8778809769521843, + "grad_norm": 0.4006554186344147, + "learning_rate": 0.00032417679659668073, + "loss": 1.6442, + "step": 4095 + }, + { + "epoch": 1.8783396399495471, + "grad_norm": 0.44737085700035095, + "learning_rate": 0.00032394499188755267, + "loss": 1.443, + "step": 4096 + }, + { + "epoch": 1.8787983029469097, + "grad_norm": 0.5100827217102051, + "learning_rate": 0.0003237132303659875, + "loss": 2.3247, + "step": 4097 + }, + { + "epoch": 1.8792569659442724, + "grad_norm": 0.43226632475852966, + "learning_rate": 0.00032348151208883805, + "loss": 2.1605, + "step": 4098 + }, + { + "epoch": 1.8797156289416352, + "grad_norm": 0.35672900080680847, + "learning_rate": 0.0003232498371129464, + "loss": 1.7168, + "step": 4099 + }, + { + "epoch": 1.8801742919389977, + "grad_norm": 0.3012166917324066, + "learning_rate": 0.0003230182054951443, + "loss": 1.6538, + "step": 4100 + }, + { + "epoch": 1.8806329549363605, + "grad_norm": 0.31628987193107605, + "learning_rate": 0.00032278661729225234, + "loss": 0.8449, + "step": 4101 + }, + { + "epoch": 1.8810916179337231, + "grad_norm": 0.25368666648864746, + "learning_rate": 0.000322555072561081, + "loss": 1.324, + "step": 4102 + }, + { + "epoch": 1.8815502809310858, + "grad_norm": 0.21288840472698212, + "learning_rate": 0.0003223235713584297, + "loss": 0.9169, + "step": 4103 + }, + { + "epoch": 1.8820089439284486, + "grad_norm": 0.24230335652828217, + "learning_rate": 0.00032209211374108746, + "loss": 1.0717, + "step": 4104 + }, + { + "epoch": 1.8824676069258113, + "grad_norm": 0.24073179066181183, + "learning_rate": 0.0003218606997658326, + "loss": 1.5495, + "step": 4105 + }, + { + "epoch": 1.882926269923174, + "grad_norm": 0.2884657680988312, + "learning_rate": 0.00032162932948943257, + "loss": 0.7274, + "step": 4106 + }, + { + "epoch": 1.8833849329205368, + "grad_norm": 0.2240462303161621, + "learning_rate": 0.0003213980029686441, + "loss": 1.6514, + "step": 4107 + }, + { + "epoch": 1.8838435959178992, + "grad_norm": 0.3176068663597107, + "learning_rate": 0.0003211667202602132, + "loss": 1.1339, + "step": 4108 + }, + { + "epoch": 1.884302258915262, + "grad_norm": 0.2350878119468689, + "learning_rate": 0.0003209354814208754, + "loss": 0.9783, + "step": 4109 + }, + { + "epoch": 1.8847609219126247, + "grad_norm": 0.16942141950130463, + "learning_rate": 0.00032070428650735506, + "loss": 1.2051, + "step": 4110 + }, + { + "epoch": 1.8852195849099873, + "grad_norm": 0.2659427523612976, + "learning_rate": 0.0003204731355763661, + "loss": 1.3427, + "step": 4111 + }, + { + "epoch": 1.8856782479073502, + "grad_norm": 0.2091580033302307, + "learning_rate": 0.00032024202868461137, + "loss": 1.1494, + "step": 4112 + }, + { + "epoch": 1.8861369109047128, + "grad_norm": 0.3054332733154297, + "learning_rate": 0.0003200109658887831, + "loss": 1.0687, + "step": 4113 + }, + { + "epoch": 1.8865955739020754, + "grad_norm": 0.23581813275814056, + "learning_rate": 0.0003197799472455627, + "loss": 1.0554, + "step": 4114 + }, + { + "epoch": 1.8870542368994383, + "grad_norm": 0.3387794494628906, + "learning_rate": 0.0003195489728116207, + "loss": 2.1521, + "step": 4115 + }, + { + "epoch": 1.8875128998968007, + "grad_norm": 0.29953324794769287, + "learning_rate": 0.00031931804264361674, + "loss": 0.8916, + "step": 4116 + }, + { + "epoch": 1.8879715628941636, + "grad_norm": 0.2123539000749588, + "learning_rate": 0.0003190871567981999, + "loss": 1.2987, + "step": 4117 + }, + { + "epoch": 1.8884302258915262, + "grad_norm": 0.40169602632522583, + "learning_rate": 0.00031885631533200775, + "loss": 1.4036, + "step": 4118 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.2332458198070526, + "learning_rate": 0.00031862551830166765, + "loss": 1.162, + "step": 4119 + }, + { + "epoch": 1.8893475518862517, + "grad_norm": 0.27471116185188293, + "learning_rate": 0.0003183947657637957, + "loss": 1.3871, + "step": 4120 + }, + { + "epoch": 1.889806214883614, + "grad_norm": 0.3699858784675598, + "learning_rate": 0.00031816405777499704, + "loss": 1.5227, + "step": 4121 + }, + { + "epoch": 1.890264877880977, + "grad_norm": 0.27009811997413635, + "learning_rate": 0.0003179333943918663, + "loss": 1.3455, + "step": 4122 + }, + { + "epoch": 1.8907235408783396, + "grad_norm": 0.2861959934234619, + "learning_rate": 0.00031770277567098654, + "loss": 1.7307, + "step": 4123 + }, + { + "epoch": 1.8911822038757022, + "grad_norm": 0.31758224964141846, + "learning_rate": 0.0003174722016689303, + "loss": 1.1325, + "step": 4124 + }, + { + "epoch": 1.891640866873065, + "grad_norm": 0.32737424969673157, + "learning_rate": 0.0003172416724422592, + "loss": 1.3532, + "step": 4125 + }, + { + "epoch": 1.8920995298704277, + "grad_norm": 0.14036279916763306, + "learning_rate": 0.00031701118804752353, + "loss": 0.6664, + "step": 4126 + }, + { + "epoch": 1.8925581928677904, + "grad_norm": 0.19789431989192963, + "learning_rate": 0.0003167807485412629, + "loss": 1.0829, + "step": 4127 + }, + { + "epoch": 1.8930168558651532, + "grad_norm": 0.2573801577091217, + "learning_rate": 0.00031655035398000576, + "loss": 1.0196, + "step": 4128 + }, + { + "epoch": 1.8934755188625156, + "grad_norm": 0.20172810554504395, + "learning_rate": 0.00031632000442026947, + "loss": 1.053, + "step": 4129 + }, + { + "epoch": 1.8939341818598785, + "grad_norm": 0.33292561769485474, + "learning_rate": 0.00031608969991856053, + "loss": 1.9376, + "step": 4130 + }, + { + "epoch": 1.8943928448572411, + "grad_norm": 0.2850032150745392, + "learning_rate": 0.00031585944053137417, + "loss": 1.5627, + "step": 4131 + }, + { + "epoch": 1.8948515078546038, + "grad_norm": 0.46692344546318054, + "learning_rate": 0.0003156292263151949, + "loss": 1.7141, + "step": 4132 + }, + { + "epoch": 1.8953101708519666, + "grad_norm": 0.39476048946380615, + "learning_rate": 0.00031539905732649555, + "loss": 2.0542, + "step": 4133 + }, + { + "epoch": 1.8957688338493293, + "grad_norm": 0.4325892925262451, + "learning_rate": 0.00031516893362173884, + "loss": 1.1792, + "step": 4134 + }, + { + "epoch": 1.896227496846692, + "grad_norm": 0.22258636355400085, + "learning_rate": 0.0003149388552573752, + "loss": 1.4821, + "step": 4135 + }, + { + "epoch": 1.8966861598440545, + "grad_norm": 0.47724971175193787, + "learning_rate": 0.00031470882228984475, + "loss": 2.2678, + "step": 4136 + }, + { + "epoch": 1.8971448228414172, + "grad_norm": 0.26698240637779236, + "learning_rate": 0.0003144788347755763, + "loss": 1.0363, + "step": 4137 + }, + { + "epoch": 1.89760348583878, + "grad_norm": 0.3450622856616974, + "learning_rate": 0.0003142488927709871, + "loss": 2.0267, + "step": 4138 + }, + { + "epoch": 1.8980621488361427, + "grad_norm": 0.2508101463317871, + "learning_rate": 0.0003140189963324842, + "loss": 0.9557, + "step": 4139 + }, + { + "epoch": 1.8985208118335053, + "grad_norm": 0.2404715120792389, + "learning_rate": 0.0003137891455164623, + "loss": 1.6057, + "step": 4140 + }, + { + "epoch": 1.8989794748308682, + "grad_norm": 0.3351982533931732, + "learning_rate": 0.00031355934037930567, + "loss": 1.451, + "step": 4141 + }, + { + "epoch": 1.8994381378282306, + "grad_norm": 0.3819112479686737, + "learning_rate": 0.00031332958097738707, + "loss": 1.3151, + "step": 4142 + }, + { + "epoch": 1.8998968008255934, + "grad_norm": 0.25655218958854675, + "learning_rate": 0.00031309986736706826, + "loss": 1.1487, + "step": 4143 + }, + { + "epoch": 1.900355463822956, + "grad_norm": 0.18810351192951202, + "learning_rate": 0.00031287019960469966, + "loss": 1.4159, + "step": 4144 + }, + { + "epoch": 1.9008141268203187, + "grad_norm": 0.3258521854877472, + "learning_rate": 0.00031264057774662044, + "loss": 1.6307, + "step": 4145 + }, + { + "epoch": 1.9012727898176816, + "grad_norm": 0.309843510389328, + "learning_rate": 0.0003124110018491584, + "loss": 1.011, + "step": 4146 + }, + { + "epoch": 1.9017314528150442, + "grad_norm": 0.2843288481235504, + "learning_rate": 0.00031218147196863, + "loss": 1.2536, + "step": 4147 + }, + { + "epoch": 1.9021901158124068, + "grad_norm": 0.08047755807638168, + "learning_rate": 0.00031195198816134093, + "loss": 0.6959, + "step": 4148 + }, + { + "epoch": 1.9026487788097697, + "grad_norm": 0.4606052339076996, + "learning_rate": 0.000311722550483585, + "loss": 1.2046, + "step": 4149 + }, + { + "epoch": 1.903107441807132, + "grad_norm": 0.2677907645702362, + "learning_rate": 0.0003114931589916452, + "loss": 1.6278, + "step": 4150 + }, + { + "epoch": 1.903566104804495, + "grad_norm": 0.32580938935279846, + "learning_rate": 0.0003112638137417925, + "loss": 1.4391, + "step": 4151 + }, + { + "epoch": 1.9040247678018576, + "grad_norm": 0.23725193738937378, + "learning_rate": 0.00031103451479028713, + "loss": 1.2067, + "step": 4152 + }, + { + "epoch": 1.9044834307992202, + "grad_norm": 0.3534078299999237, + "learning_rate": 0.0003108052621933778, + "loss": 1.437, + "step": 4153 + }, + { + "epoch": 1.904942093796583, + "grad_norm": 0.395913690328598, + "learning_rate": 0.0003105760560073018, + "loss": 2.239, + "step": 4154 + }, + { + "epoch": 1.9054007567939455, + "grad_norm": 0.38620731234550476, + "learning_rate": 0.0003103468962882851, + "loss": 1.0384, + "step": 4155 + }, + { + "epoch": 1.9058594197913084, + "grad_norm": 0.27703335881233215, + "learning_rate": 0.00031011778309254247, + "loss": 1.6416, + "step": 4156 + }, + { + "epoch": 1.906318082788671, + "grad_norm": 0.23753665387630463, + "learning_rate": 0.00030988871647627645, + "loss": 0.7991, + "step": 4157 + }, + { + "epoch": 1.9067767457860336, + "grad_norm": 0.39035359025001526, + "learning_rate": 0.0003096596964956791, + "loss": 1.59, + "step": 4158 + }, + { + "epoch": 1.9072354087833965, + "grad_norm": 0.22406819462776184, + "learning_rate": 0.00030943072320693067, + "loss": 0.8857, + "step": 4159 + }, + { + "epoch": 1.9076940717807591, + "grad_norm": 0.36969655752182007, + "learning_rate": 0.00030920179666619986, + "loss": 1.712, + "step": 4160 + }, + { + "epoch": 1.9081527347781218, + "grad_norm": 0.3277752697467804, + "learning_rate": 0.0003089729169296444, + "loss": 1.5408, + "step": 4161 + }, + { + "epoch": 1.9086113977754846, + "grad_norm": 0.4305424392223358, + "learning_rate": 0.0003087440840534093, + "loss": 1.7606, + "step": 4162 + }, + { + "epoch": 1.909070060772847, + "grad_norm": 0.2649680972099304, + "learning_rate": 0.0003085152980936296, + "loss": 1.4221, + "step": 4163 + }, + { + "epoch": 1.9095287237702099, + "grad_norm": 0.34129345417022705, + "learning_rate": 0.00030828655910642794, + "loss": 1.2801, + "step": 4164 + }, + { + "epoch": 1.9099873867675725, + "grad_norm": 0.16658979654312134, + "learning_rate": 0.0003080578671479157, + "loss": 0.7952, + "step": 4165 + }, + { + "epoch": 1.9104460497649352, + "grad_norm": 0.18683701753616333, + "learning_rate": 0.0003078292222741925, + "loss": 1.1949, + "step": 4166 + }, + { + "epoch": 1.910904712762298, + "grad_norm": 0.25396600365638733, + "learning_rate": 0.00030760062454134697, + "loss": 0.8212, + "step": 4167 + }, + { + "epoch": 1.9113633757596606, + "grad_norm": 0.2024562507867813, + "learning_rate": 0.0003073720740054553, + "loss": 1.2571, + "step": 4168 + }, + { + "epoch": 1.9118220387570233, + "grad_norm": 0.3227396309375763, + "learning_rate": 0.0003071435707225828, + "loss": 1.4965, + "step": 4169 + }, + { + "epoch": 1.912280701754386, + "grad_norm": 0.29104283452033997, + "learning_rate": 0.000306915114748783, + "loss": 1.8234, + "step": 4170 + }, + { + "epoch": 1.9127393647517485, + "grad_norm": 0.3544504642486572, + "learning_rate": 0.00030668670614009775, + "loss": 1.3737, + "step": 4171 + }, + { + "epoch": 1.9131980277491114, + "grad_norm": 0.14929145574569702, + "learning_rate": 0.0003064583449525574, + "loss": 1.0649, + "step": 4172 + }, + { + "epoch": 1.913656690746474, + "grad_norm": 0.2932626008987427, + "learning_rate": 0.0003062300312421806, + "loss": 1.3504, + "step": 4173 + }, + { + "epoch": 1.9141153537438367, + "grad_norm": 0.3478389084339142, + "learning_rate": 0.0003060017650649742, + "loss": 1.5471, + "step": 4174 + }, + { + "epoch": 1.9145740167411995, + "grad_norm": 0.30949702858924866, + "learning_rate": 0.00030577354647693354, + "loss": 1.1508, + "step": 4175 + }, + { + "epoch": 1.915032679738562, + "grad_norm": 0.2801455855369568, + "learning_rate": 0.0003055453755340425, + "loss": 1.7807, + "step": 4176 + }, + { + "epoch": 1.9154913427359248, + "grad_norm": 0.360272616147995, + "learning_rate": 0.0003053172522922729, + "loss": 1.7485, + "step": 4177 + }, + { + "epoch": 1.9159500057332874, + "grad_norm": 0.34427425265312195, + "learning_rate": 0.0003050891768075851, + "loss": 1.1602, + "step": 4178 + }, + { + "epoch": 1.91640866873065, + "grad_norm": 0.3189522922039032, + "learning_rate": 0.00030486114913592753, + "loss": 1.6614, + "step": 4179 + }, + { + "epoch": 1.916867331728013, + "grad_norm": 0.3376857340335846, + "learning_rate": 0.00030463316933323717, + "loss": 1.2032, + "step": 4180 + }, + { + "epoch": 1.9173259947253756, + "grad_norm": 0.36120155453681946, + "learning_rate": 0.00030440523745543893, + "loss": 1.8868, + "step": 4181 + }, + { + "epoch": 1.9177846577227382, + "grad_norm": 0.3722556531429291, + "learning_rate": 0.00030417735355844634, + "loss": 1.3034, + "step": 4182 + }, + { + "epoch": 1.918243320720101, + "grad_norm": 0.27059370279312134, + "learning_rate": 0.00030394951769816084, + "loss": 0.9223, + "step": 4183 + }, + { + "epoch": 1.9187019837174635, + "grad_norm": 0.3722488582134247, + "learning_rate": 0.0003037217299304723, + "loss": 2.0471, + "step": 4184 + }, + { + "epoch": 1.9191606467148263, + "grad_norm": 0.42110106348991394, + "learning_rate": 0.00030349399031125856, + "loss": 1.8804, + "step": 4185 + }, + { + "epoch": 1.919619309712189, + "grad_norm": 0.33694103360176086, + "learning_rate": 0.00030326629889638595, + "loss": 1.3992, + "step": 4186 + }, + { + "epoch": 1.9200779727095516, + "grad_norm": 0.26390311121940613, + "learning_rate": 0.00030303865574170876, + "loss": 1.6186, + "step": 4187 + }, + { + "epoch": 1.9205366357069145, + "grad_norm": 0.2693273723125458, + "learning_rate": 0.0003028110609030694, + "loss": 1.4586, + "step": 4188 + }, + { + "epoch": 1.9209952987042769, + "grad_norm": 0.4307970702648163, + "learning_rate": 0.0003025835144362987, + "loss": 1.1955, + "step": 4189 + }, + { + "epoch": 1.9214539617016397, + "grad_norm": 0.11909201741218567, + "learning_rate": 0.00030235601639721534, + "loss": 1.0518, + "step": 4190 + }, + { + "epoch": 1.9219126246990024, + "grad_norm": 0.36401188373565674, + "learning_rate": 0.00030212856684162613, + "loss": 0.9025, + "step": 4191 + }, + { + "epoch": 1.922371287696365, + "grad_norm": 0.22211536765098572, + "learning_rate": 0.00030190116582532627, + "loss": 1.1021, + "step": 4192 + }, + { + "epoch": 1.9228299506937279, + "grad_norm": 0.310380220413208, + "learning_rate": 0.0003016738134040988, + "loss": 1.709, + "step": 4193 + }, + { + "epoch": 1.9232886136910905, + "grad_norm": 0.36067163944244385, + "learning_rate": 0.00030144650963371487, + "loss": 1.153, + "step": 4194 + }, + { + "epoch": 1.9237472766884531, + "grad_norm": 0.2253832072019577, + "learning_rate": 0.00030121925456993396, + "loss": 1.1205, + "step": 4195 + }, + { + "epoch": 1.924205939685816, + "grad_norm": 0.3536555767059326, + "learning_rate": 0.000300992048268503, + "loss": 1.7184, + "step": 4196 + }, + { + "epoch": 1.9246646026831784, + "grad_norm": 0.0917428508400917, + "learning_rate": 0.0003007648907851576, + "loss": 1.0483, + "step": 4197 + }, + { + "epoch": 1.9251232656805413, + "grad_norm": 0.36443841457366943, + "learning_rate": 0.000300537782175621, + "loss": 1.7299, + "step": 4198 + }, + { + "epoch": 1.925581928677904, + "grad_norm": 0.33077698945999146, + "learning_rate": 0.0003003107224956046, + "loss": 1.6121, + "step": 4199 + }, + { + "epoch": 1.9260405916752665, + "grad_norm": 0.40068572759628296, + "learning_rate": 0.00030008371180080773, + "loss": 1.2148, + "step": 4200 + }, + { + "epoch": 1.9264992546726294, + "grad_norm": 0.363941490650177, + "learning_rate": 0.00029985675014691814, + "loss": 1.9509, + "step": 4201 + }, + { + "epoch": 1.926957917669992, + "grad_norm": 0.1912655234336853, + "learning_rate": 0.00029962983758961067, + "loss": 0.9309, + "step": 4202 + }, + { + "epoch": 1.9274165806673547, + "grad_norm": 0.2495778501033783, + "learning_rate": 0.0002994029741845488, + "loss": 1.5195, + "step": 4203 + }, + { + "epoch": 1.9278752436647173, + "grad_norm": 0.29733824729919434, + "learning_rate": 0.00029917615998738365, + "loss": 1.0684, + "step": 4204 + }, + { + "epoch": 1.92833390666208, + "grad_norm": 0.36461958289146423, + "learning_rate": 0.0002989493950537544, + "loss": 1.8616, + "step": 4205 + }, + { + "epoch": 1.9287925696594428, + "grad_norm": 0.3024968206882477, + "learning_rate": 0.0002987226794392885, + "loss": 1.166, + "step": 4206 + }, + { + "epoch": 1.9292512326568054, + "grad_norm": 0.32046911120414734, + "learning_rate": 0.0002984960131996004, + "loss": 1.0956, + "step": 4207 + }, + { + "epoch": 1.929709895654168, + "grad_norm": 0.1437264233827591, + "learning_rate": 0.00029826939639029324, + "loss": 1.0188, + "step": 4208 + }, + { + "epoch": 1.930168558651531, + "grad_norm": 0.342047780752182, + "learning_rate": 0.00029804282906695765, + "loss": 1.4002, + "step": 4209 + }, + { + "epoch": 1.9306272216488933, + "grad_norm": 0.27232903242111206, + "learning_rate": 0.0002978163112851722, + "loss": 1.7488, + "step": 4210 + }, + { + "epoch": 1.9310858846462562, + "grad_norm": 0.30967265367507935, + "learning_rate": 0.00029758984310050354, + "loss": 1.4791, + "step": 4211 + }, + { + "epoch": 1.9315445476436188, + "grad_norm": 0.3274848759174347, + "learning_rate": 0.000297363424568506, + "loss": 1.6986, + "step": 4212 + }, + { + "epoch": 1.9320032106409815, + "grad_norm": 0.3882467746734619, + "learning_rate": 0.0002971370557447213, + "loss": 1.7115, + "step": 4213 + }, + { + "epoch": 1.9324618736383443, + "grad_norm": 0.35071897506713867, + "learning_rate": 0.0002969107366846794, + "loss": 1.2036, + "step": 4214 + }, + { + "epoch": 1.932920536635707, + "grad_norm": 0.3669132590293884, + "learning_rate": 0.0002966844674438982, + "loss": 1.8774, + "step": 4215 + }, + { + "epoch": 1.9333791996330696, + "grad_norm": 0.3220601975917816, + "learning_rate": 0.00029645824807788325, + "loss": 1.8994, + "step": 4216 + }, + { + "epoch": 1.9338378626304324, + "grad_norm": 0.3539913594722748, + "learning_rate": 0.00029623207864212775, + "loss": 1.5546, + "step": 4217 + }, + { + "epoch": 1.9342965256277949, + "grad_norm": 0.2368205189704895, + "learning_rate": 0.00029600595919211247, + "loss": 1.272, + "step": 4218 + }, + { + "epoch": 1.9347551886251577, + "grad_norm": 0.38960981369018555, + "learning_rate": 0.00029577988978330615, + "loss": 2.0374, + "step": 4219 + }, + { + "epoch": 1.9352138516225204, + "grad_norm": 0.3929775059223175, + "learning_rate": 0.00029555387047116547, + "loss": 2.1378, + "step": 4220 + }, + { + "epoch": 1.935672514619883, + "grad_norm": 0.34435439109802246, + "learning_rate": 0.00029532790131113446, + "loss": 1.444, + "step": 4221 + }, + { + "epoch": 1.9361311776172458, + "grad_norm": 0.27234023809432983, + "learning_rate": 0.00029510198235864504, + "loss": 1.2824, + "step": 4222 + }, + { + "epoch": 1.9365898406146083, + "grad_norm": 0.30018359422683716, + "learning_rate": 0.000294876113669117, + "loss": 1.8326, + "step": 4223 + }, + { + "epoch": 1.9370485036119711, + "grad_norm": 0.46480369567871094, + "learning_rate": 0.00029465029529795696, + "loss": 1.6801, + "step": 4224 + }, + { + "epoch": 1.9375071666093338, + "grad_norm": 0.4117535948753357, + "learning_rate": 0.0002944245273005602, + "loss": 1.7745, + "step": 4225 + }, + { + "epoch": 1.9379658296066964, + "grad_norm": 0.19903844594955444, + "learning_rate": 0.00029419880973230916, + "loss": 0.8529, + "step": 4226 + }, + { + "epoch": 1.9384244926040592, + "grad_norm": 0.3728976249694824, + "learning_rate": 0.00029397314264857405, + "loss": 2.2271, + "step": 4227 + }, + { + "epoch": 1.9388831556014219, + "grad_norm": 0.4130072593688965, + "learning_rate": 0.00029374752610471255, + "loss": 1.9492, + "step": 4228 + }, + { + "epoch": 1.9393418185987845, + "grad_norm": 0.09748774021863937, + "learning_rate": 0.00029352196015607014, + "loss": 0.4857, + "step": 4229 + }, + { + "epoch": 1.9398004815961474, + "grad_norm": 0.289890319108963, + "learning_rate": 0.00029329644485797963, + "loss": 1.7548, + "step": 4230 + }, + { + "epoch": 1.9402591445935098, + "grad_norm": 0.33195921778678894, + "learning_rate": 0.00029307098026576156, + "loss": 1.767, + "step": 4231 + }, + { + "epoch": 1.9407178075908726, + "grad_norm": 0.30616116523742676, + "learning_rate": 0.0002928455664347241, + "loss": 1.5225, + "step": 4232 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 0.3174733519554138, + "learning_rate": 0.0002926202034201628, + "loss": 1.3201, + "step": 4233 + }, + { + "epoch": 1.941635133585598, + "grad_norm": 0.3208681643009186, + "learning_rate": 0.00029239489127736107, + "loss": 1.0652, + "step": 4234 + }, + { + "epoch": 1.9420937965829608, + "grad_norm": 0.21303045749664307, + "learning_rate": 0.0002921696300615893, + "loss": 1.0555, + "step": 4235 + }, + { + "epoch": 1.9425524595803234, + "grad_norm": 0.2299261838197708, + "learning_rate": 0.0002919444198281058, + "loss": 1.4859, + "step": 4236 + }, + { + "epoch": 1.943011122577686, + "grad_norm": 0.38345012068748474, + "learning_rate": 0.0002917192606321563, + "loss": 1.7592, + "step": 4237 + }, + { + "epoch": 1.9434697855750487, + "grad_norm": 0.36465907096862793, + "learning_rate": 0.0002914941525289739, + "loss": 1.1496, + "step": 4238 + }, + { + "epoch": 1.9439284485724113, + "grad_norm": 0.4449915885925293, + "learning_rate": 0.00029126909557377923, + "loss": 1.3264, + "step": 4239 + }, + { + "epoch": 1.9443871115697742, + "grad_norm": 0.31518861651420593, + "learning_rate": 0.0002910440898217808, + "loss": 1.6181, + "step": 4240 + }, + { + "epoch": 1.9448457745671368, + "grad_norm": 0.3502573072910309, + "learning_rate": 0.0002908191353281735, + "loss": 1.5842, + "step": 4241 + }, + { + "epoch": 1.9453044375644994, + "grad_norm": 0.3748527467250824, + "learning_rate": 0.00029059423214814053, + "loss": 1.6724, + "step": 4242 + }, + { + "epoch": 1.9457631005618623, + "grad_norm": 0.39455685019493103, + "learning_rate": 0.0002903693803368522, + "loss": 1.4643, + "step": 4243 + }, + { + "epoch": 1.9462217635592247, + "grad_norm": 0.14820732176303864, + "learning_rate": 0.00029014457994946654, + "loss": 0.9309, + "step": 4244 + }, + { + "epoch": 1.9466804265565876, + "grad_norm": 0.18288370966911316, + "learning_rate": 0.00028991983104112874, + "loss": 0.9465, + "step": 4245 + }, + { + "epoch": 1.9471390895539502, + "grad_norm": 0.38087254762649536, + "learning_rate": 0.00028969513366697096, + "loss": 1.2691, + "step": 4246 + }, + { + "epoch": 1.9475977525513128, + "grad_norm": 0.14294399321079254, + "learning_rate": 0.0002894704878821133, + "loss": 1.094, + "step": 4247 + }, + { + "epoch": 1.9480564155486757, + "grad_norm": 0.4238864779472351, + "learning_rate": 0.00028924589374166286, + "loss": 1.7693, + "step": 4248 + }, + { + "epoch": 1.9485150785460383, + "grad_norm": 0.2929639220237732, + "learning_rate": 0.0002890213513007144, + "loss": 1.2274, + "step": 4249 + }, + { + "epoch": 1.948973741543401, + "grad_norm": 0.25756093859672546, + "learning_rate": 0.00028879686061434966, + "loss": 1.2784, + "step": 4250 + }, + { + "epoch": 1.9494324045407638, + "grad_norm": 0.3350880742073059, + "learning_rate": 0.0002885724217376381, + "loss": 2.0134, + "step": 4251 + }, + { + "epoch": 1.9498910675381262, + "grad_norm": 0.2928470969200134, + "learning_rate": 0.00028834803472563574, + "loss": 0.9233, + "step": 4252 + }, + { + "epoch": 1.950349730535489, + "grad_norm": 0.35262230038642883, + "learning_rate": 0.0002881236996333864, + "loss": 2.0552, + "step": 4253 + }, + { + "epoch": 1.9508083935328517, + "grad_norm": 0.3223377466201782, + "learning_rate": 0.00028789941651592135, + "loss": 1.6767, + "step": 4254 + }, + { + "epoch": 1.9512670565302144, + "grad_norm": 0.3945949375629425, + "learning_rate": 0.0002876751854282588, + "loss": 1.5204, + "step": 4255 + }, + { + "epoch": 1.9517257195275772, + "grad_norm": 0.25893688201904297, + "learning_rate": 0.00028745100642540425, + "loss": 1.2777, + "step": 4256 + }, + { + "epoch": 1.9521843825249396, + "grad_norm": 0.34024423360824585, + "learning_rate": 0.0002872268795623505, + "loss": 1.7372, + "step": 4257 + }, + { + "epoch": 1.9526430455223025, + "grad_norm": 0.2754332423210144, + "learning_rate": 0.00028700280489407713, + "loss": 1.1841, + "step": 4258 + }, + { + "epoch": 1.9531017085196651, + "grad_norm": 0.3347947299480438, + "learning_rate": 0.0002867787824755516, + "loss": 1.3132, + "step": 4259 + }, + { + "epoch": 1.9535603715170278, + "grad_norm": 0.21994498372077942, + "learning_rate": 0.00028655481236172813, + "loss": 1.2384, + "step": 4260 + }, + { + "epoch": 1.9540190345143906, + "grad_norm": 0.2327728271484375, + "learning_rate": 0.00028633089460754826, + "loss": 1.3356, + "step": 4261 + }, + { + "epoch": 1.9544776975117533, + "grad_norm": 0.2511197030544281, + "learning_rate": 0.00028610702926794063, + "loss": 0.728, + "step": 4262 + }, + { + "epoch": 1.954936360509116, + "grad_norm": 0.2759820222854614, + "learning_rate": 0.0002858832163978207, + "loss": 1.6011, + "step": 4263 + }, + { + "epoch": 1.9553950235064788, + "grad_norm": 0.3019004166126251, + "learning_rate": 0.00028565945605209177, + "loss": 1.7163, + "step": 4264 + }, + { + "epoch": 1.9558536865038412, + "grad_norm": 0.3434310555458069, + "learning_rate": 0.0002854357482856438, + "loss": 1.338, + "step": 4265 + }, + { + "epoch": 1.956312349501204, + "grad_norm": 0.2556118369102478, + "learning_rate": 0.0002852120931533538, + "loss": 1.6398, + "step": 4266 + }, + { + "epoch": 1.9567710124985667, + "grad_norm": 0.39801543951034546, + "learning_rate": 0.0002849884907100861, + "loss": 1.4964, + "step": 4267 + }, + { + "epoch": 1.9572296754959293, + "grad_norm": 0.30779194831848145, + "learning_rate": 0.0002847649410106921, + "loss": 1.7112, + "step": 4268 + }, + { + "epoch": 1.9576883384932922, + "grad_norm": 0.29870155453681946, + "learning_rate": 0.0002845414441100098, + "loss": 1.6331, + "step": 4269 + }, + { + "epoch": 1.9581470014906548, + "grad_norm": 0.2507461905479431, + "learning_rate": 0.0002843180000628649, + "loss": 1.0191, + "step": 4270 + }, + { + "epoch": 1.9586056644880174, + "grad_norm": 0.25291094183921814, + "learning_rate": 0.00028409460892406957, + "loss": 0.802, + "step": 4271 + }, + { + "epoch": 1.95906432748538, + "grad_norm": 0.06607484817504883, + "learning_rate": 0.00028387127074842357, + "loss": 0.6714, + "step": 4272 + }, + { + "epoch": 1.9595229904827427, + "grad_norm": 0.30349382758140564, + "learning_rate": 0.00028364798559071315, + "loss": 1.3984, + "step": 4273 + }, + { + "epoch": 1.9599816534801056, + "grad_norm": 0.23646502196788788, + "learning_rate": 0.0002834247535057118, + "loss": 1.5465, + "step": 4274 + }, + { + "epoch": 1.9604403164774682, + "grad_norm": 0.32397279143333435, + "learning_rate": 0.00028320157454818015, + "loss": 1.2772, + "step": 4275 + }, + { + "epoch": 1.9608989794748308, + "grad_norm": 0.2407788336277008, + "learning_rate": 0.00028297844877286536, + "loss": 1.336, + "step": 4276 + }, + { + "epoch": 1.9613576424721937, + "grad_norm": 0.3070877194404602, + "learning_rate": 0.00028275537623450187, + "loss": 1.8708, + "step": 4277 + }, + { + "epoch": 1.961816305469556, + "grad_norm": 0.3111575245857239, + "learning_rate": 0.0002825323569878111, + "loss": 1.2437, + "step": 4278 + }, + { + "epoch": 1.962274968466919, + "grad_norm": 0.1594800353050232, + "learning_rate": 0.0002823093910875013, + "loss": 0.7383, + "step": 4279 + }, + { + "epoch": 1.9627336314642816, + "grad_norm": 0.23517662286758423, + "learning_rate": 0.0002820864785882673, + "loss": 1.308, + "step": 4280 + }, + { + "epoch": 1.9631922944616442, + "grad_norm": 0.2781253755092621, + "learning_rate": 0.0002818636195447913, + "loss": 1.3627, + "step": 4281 + }, + { + "epoch": 1.963650957459007, + "grad_norm": 0.2628926634788513, + "learning_rate": 0.0002816408140117424, + "loss": 1.2191, + "step": 4282 + }, + { + "epoch": 1.9641096204563697, + "grad_norm": 0.2107929289340973, + "learning_rate": 0.00028141806204377617, + "loss": 1.2504, + "step": 4283 + }, + { + "epoch": 1.9645682834537324, + "grad_norm": 0.27736082673072815, + "learning_rate": 0.0002811953636955354, + "loss": 1.0677, + "step": 4284 + }, + { + "epoch": 1.9650269464510952, + "grad_norm": 0.23589631915092468, + "learning_rate": 0.00028097271902164967, + "loss": 1.6243, + "step": 4285 + }, + { + "epoch": 1.9654856094484576, + "grad_norm": 0.458365261554718, + "learning_rate": 0.00028075012807673516, + "loss": 1.6266, + "step": 4286 + }, + { + "epoch": 1.9659442724458205, + "grad_norm": 0.25476112961769104, + "learning_rate": 0.0002805275909153951, + "loss": 1.7578, + "step": 4287 + }, + { + "epoch": 1.9664029354431831, + "grad_norm": 0.3246137797832489, + "learning_rate": 0.00028030510759221943, + "loss": 1.1602, + "step": 4288 + }, + { + "epoch": 1.9668615984405458, + "grad_norm": 0.3675731420516968, + "learning_rate": 0.000280082678161785, + "loss": 1.2238, + "step": 4289 + }, + { + "epoch": 1.9673202614379086, + "grad_norm": 0.3241674304008484, + "learning_rate": 0.00027986030267865546, + "loss": 1.2205, + "step": 4290 + }, + { + "epoch": 1.967778924435271, + "grad_norm": 0.25529178977012634, + "learning_rate": 0.00027963798119738084, + "loss": 1.7133, + "step": 4291 + }, + { + "epoch": 1.9682375874326339, + "grad_norm": 0.2785083055496216, + "learning_rate": 0.0002794157137724983, + "loss": 1.4641, + "step": 4292 + }, + { + "epoch": 1.9686962504299965, + "grad_norm": 0.39693623781204224, + "learning_rate": 0.00027919350045853167, + "loss": 2.1711, + "step": 4293 + }, + { + "epoch": 1.9691549134273592, + "grad_norm": 0.3063134253025055, + "learning_rate": 0.0002789713413099914, + "loss": 1.2238, + "step": 4294 + }, + { + "epoch": 1.969613576424722, + "grad_norm": 0.21434158086776733, + "learning_rate": 0.0002787492363813748, + "loss": 1.2683, + "step": 4295 + }, + { + "epoch": 1.9700722394220846, + "grad_norm": 0.26843902468681335, + "learning_rate": 0.00027852718572716594, + "loss": 1.1436, + "step": 4296 + }, + { + "epoch": 1.9705309024194473, + "grad_norm": 0.26620641350746155, + "learning_rate": 0.00027830518940183527, + "loss": 1.2504, + "step": 4297 + }, + { + "epoch": 1.9709895654168101, + "grad_norm": 0.4168534278869629, + "learning_rate": 0.0002780832474598401, + "loss": 1.4581, + "step": 4298 + }, + { + "epoch": 1.9714482284141726, + "grad_norm": 0.5389671325683594, + "learning_rate": 0.00027786135995562446, + "loss": 1.738, + "step": 4299 + }, + { + "epoch": 1.9719068914115354, + "grad_norm": 0.18300795555114746, + "learning_rate": 0.00027763952694361894, + "loss": 0.9293, + "step": 4300 + }, + { + "epoch": 1.972365554408898, + "grad_norm": 0.18415942788124084, + "learning_rate": 0.00027741774847824094, + "loss": 1.2104, + "step": 4301 + }, + { + "epoch": 1.9728242174062607, + "grad_norm": 0.3821175992488861, + "learning_rate": 0.00027719602461389394, + "loss": 1.6418, + "step": 4302 + }, + { + "epoch": 1.9732828804036235, + "grad_norm": 0.34809380769729614, + "learning_rate": 0.0002769743554049686, + "loss": 1.3178, + "step": 4303 + }, + { + "epoch": 1.9737415434009862, + "grad_norm": 0.34898874163627625, + "learning_rate": 0.00027675274090584195, + "loss": 0.9742, + "step": 4304 + }, + { + "epoch": 1.9742002063983488, + "grad_norm": 0.30163076519966125, + "learning_rate": 0.0002765311811708775, + "loss": 1.6653, + "step": 4305 + }, + { + "epoch": 1.9746588693957114, + "grad_norm": 0.37521055340766907, + "learning_rate": 0.0002763096762544258, + "loss": 1.4052, + "step": 4306 + }, + { + "epoch": 1.975117532393074, + "grad_norm": 0.36267635226249695, + "learning_rate": 0.0002760882262108236, + "loss": 1.627, + "step": 4307 + }, + { + "epoch": 1.975576195390437, + "grad_norm": 0.37913310527801514, + "learning_rate": 0.0002758668310943938, + "loss": 1.8204, + "step": 4308 + }, + { + "epoch": 1.9760348583877996, + "grad_norm": 0.29660242795944214, + "learning_rate": 0.00027564549095944636, + "loss": 1.3698, + "step": 4309 + }, + { + "epoch": 1.9764935213851622, + "grad_norm": 0.3772091865539551, + "learning_rate": 0.00027542420586027774, + "loss": 1.7451, + "step": 4310 + }, + { + "epoch": 1.976952184382525, + "grad_norm": 0.32186251878738403, + "learning_rate": 0.0002752029758511707, + "loss": 1.7236, + "step": 4311 + }, + { + "epoch": 1.9774108473798875, + "grad_norm": 0.2753252685070038, + "learning_rate": 0.0002749818009863945, + "loss": 1.0989, + "step": 4312 + }, + { + "epoch": 1.9778695103772503, + "grad_norm": 0.30564045906066895, + "learning_rate": 0.0002747606813202052, + "loss": 1.3082, + "step": 4313 + }, + { + "epoch": 1.978328173374613, + "grad_norm": 0.24932487308979034, + "learning_rate": 0.0002745396169068447, + "loss": 0.9138, + "step": 4314 + }, + { + "epoch": 1.9787868363719756, + "grad_norm": 0.26185494661331177, + "learning_rate": 0.0002743186078005415, + "loss": 1.2475, + "step": 4315 + }, + { + "epoch": 1.9792454993693385, + "grad_norm": 0.2203855961561203, + "learning_rate": 0.00027409765405551136, + "loss": 1.3713, + "step": 4316 + }, + { + "epoch": 1.979704162366701, + "grad_norm": 0.3790103495121002, + "learning_rate": 0.0002738767557259555, + "loss": 1.4261, + "step": 4317 + }, + { + "epoch": 1.9801628253640637, + "grad_norm": 0.3733806312084198, + "learning_rate": 0.0002736559128660621, + "loss": 2.1038, + "step": 4318 + }, + { + "epoch": 1.9806214883614266, + "grad_norm": 0.20003962516784668, + "learning_rate": 0.00027343512553000505, + "loss": 0.7565, + "step": 4319 + }, + { + "epoch": 1.981080151358789, + "grad_norm": 0.2958119511604309, + "learning_rate": 0.00027321439377194537, + "loss": 0.9704, + "step": 4320 + }, + { + "epoch": 1.9815388143561519, + "grad_norm": 0.2841302752494812, + "learning_rate": 0.00027299371764603, + "loss": 1.327, + "step": 4321 + }, + { + "epoch": 1.9819974773535145, + "grad_norm": 0.3038768172264099, + "learning_rate": 0.00027277309720639265, + "loss": 1.2805, + "step": 4322 + }, + { + "epoch": 1.9824561403508771, + "grad_norm": 0.3406788110733032, + "learning_rate": 0.00027255253250715286, + "loss": 1.4304, + "step": 4323 + }, + { + "epoch": 1.98291480334824, + "grad_norm": 0.2855389416217804, + "learning_rate": 0.000272332023602417, + "loss": 1.6539, + "step": 4324 + }, + { + "epoch": 1.9833734663456024, + "grad_norm": 0.23860915005207062, + "learning_rate": 0.000272111570546277, + "loss": 1.3198, + "step": 4325 + }, + { + "epoch": 1.9838321293429653, + "grad_norm": 0.3688766360282898, + "learning_rate": 0.0002718911733928121, + "loss": 1.1564, + "step": 4326 + }, + { + "epoch": 1.984290792340328, + "grad_norm": 0.15739460289478302, + "learning_rate": 0.00027167083219608706, + "loss": 0.6855, + "step": 4327 + }, + { + "epoch": 1.9847494553376905, + "grad_norm": 0.2910972535610199, + "learning_rate": 0.0002714505470101533, + "loss": 1.7732, + "step": 4328 + }, + { + "epoch": 1.9852081183350534, + "grad_norm": 0.3392382562160492, + "learning_rate": 0.0002712303178890484, + "loss": 1.2592, + "step": 4329 + }, + { + "epoch": 1.985666781332416, + "grad_norm": 0.27205178141593933, + "learning_rate": 0.0002710101448867959, + "loss": 1.304, + "step": 4330 + }, + { + "epoch": 1.9861254443297787, + "grad_norm": 0.31658264994621277, + "learning_rate": 0.0002707900280574059, + "loss": 1.6132, + "step": 4331 + }, + { + "epoch": 1.9865841073271415, + "grad_norm": 0.34457242488861084, + "learning_rate": 0.00027056996745487475, + "loss": 1.6532, + "step": 4332 + }, + { + "epoch": 1.987042770324504, + "grad_norm": 0.27797314524650574, + "learning_rate": 0.00027034996313318483, + "loss": 1.3838, + "step": 4333 + }, + { + "epoch": 1.9875014333218668, + "grad_norm": 0.2671056091785431, + "learning_rate": 0.00027013001514630483, + "loss": 1.34, + "step": 4334 + }, + { + "epoch": 1.9879600963192294, + "grad_norm": 0.2947138845920563, + "learning_rate": 0.0002699101235481896, + "loss": 1.0969, + "step": 4335 + }, + { + "epoch": 1.988418759316592, + "grad_norm": 0.3621201515197754, + "learning_rate": 0.00026969028839278, + "loss": 1.7797, + "step": 4336 + }, + { + "epoch": 1.988877422313955, + "grad_norm": 0.2773393988609314, + "learning_rate": 0.00026947050973400333, + "loss": 1.5847, + "step": 4337 + }, + { + "epoch": 1.9893360853113176, + "grad_norm": 0.24329693615436554, + "learning_rate": 0.00026925078762577283, + "loss": 1.2757, + "step": 4338 + }, + { + "epoch": 1.9897947483086802, + "grad_norm": 0.3237564265727997, + "learning_rate": 0.00026903112212198796, + "loss": 1.1135, + "step": 4339 + }, + { + "epoch": 1.9902534113060428, + "grad_norm": 0.21693646907806396, + "learning_rate": 0.0002688115132765344, + "loss": 1.2356, + "step": 4340 + }, + { + "epoch": 1.9907120743034055, + "grad_norm": 0.27011099457740784, + "learning_rate": 0.00026859196114328333, + "loss": 1.6289, + "step": 4341 + }, + { + "epoch": 1.9911707373007683, + "grad_norm": 0.34364983439445496, + "learning_rate": 0.0002683724657760928, + "loss": 1.2663, + "step": 4342 + }, + { + "epoch": 1.991629400298131, + "grad_norm": 0.189200758934021, + "learning_rate": 0.00026815302722880643, + "loss": 0.7483, + "step": 4343 + }, + { + "epoch": 1.9920880632954936, + "grad_norm": 0.23307786881923676, + "learning_rate": 0.00026793364555525426, + "loss": 0.8417, + "step": 4344 + }, + { + "epoch": 1.9925467262928565, + "grad_norm": 0.23828265070915222, + "learning_rate": 0.00026771432080925205, + "loss": 0.7974, + "step": 4345 + }, + { + "epoch": 1.9930053892902189, + "grad_norm": 0.22147458791732788, + "learning_rate": 0.0002674950530446019, + "loss": 0.8344, + "step": 4346 + }, + { + "epoch": 1.9934640522875817, + "grad_norm": 0.2830180525779724, + "learning_rate": 0.0002672758423150916, + "loss": 1.2985, + "step": 4347 + }, + { + "epoch": 1.9939227152849444, + "grad_norm": 0.3324924409389496, + "learning_rate": 0.0002670566886744953, + "loss": 1.7325, + "step": 4348 + }, + { + "epoch": 1.994381378282307, + "grad_norm": 0.18930895626544952, + "learning_rate": 0.0002668375921765728, + "loss": 0.7449, + "step": 4349 + }, + { + "epoch": 1.9948400412796699, + "grad_norm": 0.28275609016418457, + "learning_rate": 0.0002666185528750702, + "loss": 1.7032, + "step": 4350 + }, + { + "epoch": 1.9952987042770325, + "grad_norm": 0.2890438735485077, + "learning_rate": 0.00026639957082371936, + "loss": 1.2205, + "step": 4351 + }, + { + "epoch": 1.9957573672743951, + "grad_norm": 0.36931562423706055, + "learning_rate": 0.0002661806460762383, + "loss": 1.7939, + "step": 4352 + }, + { + "epoch": 1.996216030271758, + "grad_norm": 0.3492710590362549, + "learning_rate": 0.0002659617786863304, + "loss": 1.6342, + "step": 4353 + }, + { + "epoch": 1.9966746932691204, + "grad_norm": 0.29765257239341736, + "learning_rate": 0.00026574296870768575, + "loss": 1.7397, + "step": 4354 + }, + { + "epoch": 1.9971333562664833, + "grad_norm": 0.37656140327453613, + "learning_rate": 0.00026552421619398004, + "loss": 2.0061, + "step": 4355 + }, + { + "epoch": 1.9975920192638459, + "grad_norm": 0.29920002818107605, + "learning_rate": 0.0002653055211988746, + "loss": 1.6586, + "step": 4356 + }, + { + "epoch": 1.9980506822612085, + "grad_norm": 0.34368252754211426, + "learning_rate": 0.0002650868837760172, + "loss": 1.3087, + "step": 4357 + }, + { + "epoch": 1.9985093452585714, + "grad_norm": 0.26091626286506653, + "learning_rate": 0.0002648683039790409, + "loss": 1.244, + "step": 4358 + }, + { + "epoch": 1.9989680082559338, + "grad_norm": 0.18554215133190155, + "learning_rate": 0.0002646497818615651, + "loss": 1.1157, + "step": 4359 + }, + { + "epoch": 1.9994266712532967, + "grad_norm": 0.23204344511032104, + "learning_rate": 0.00026443131747719474, + "loss": 1.2117, + "step": 4360 + }, + { + "epoch": 1.9998853342506593, + "grad_norm": 0.25021374225616455, + "learning_rate": 0.00026421291087952084, + "loss": 1.3657, + "step": 4361 + }, + { + "epoch": 2.0, + "grad_norm": 0.25021374225616455, + "learning_rate": 0.00026421291087952084, + "loss": 0.5557, + "step": 4362 + }, + { + "epoch": 2.000458662997363, + "grad_norm": 0.31650856137275696, + "learning_rate": 0.0002639945621221199, + "loss": 1.4053, + "step": 4363 + }, + { + "epoch": 2.0009173259947253, + "grad_norm": 0.3828623592853546, + "learning_rate": 0.00026377627125855475, + "loss": 1.5593, + "step": 4364 + }, + { + "epoch": 2.001375988992088, + "grad_norm": 0.3297773003578186, + "learning_rate": 0.00026355803834237333, + "loss": 1.6141, + "step": 4365 + }, + { + "epoch": 2.0018346519894505, + "grad_norm": 0.20084944367408752, + "learning_rate": 0.00026333986342711, + "loss": 0.831, + "step": 4366 + }, + { + "epoch": 2.0022933149868134, + "grad_norm": 0.21461676061153412, + "learning_rate": 0.0002631217465662845, + "loss": 1.5271, + "step": 4367 + }, + { + "epoch": 2.0027519779841763, + "grad_norm": 0.3193070590496063, + "learning_rate": 0.00026290368781340263, + "loss": 1.1464, + "step": 4368 + }, + { + "epoch": 2.0032106409815387, + "grad_norm": 0.26611328125, + "learning_rate": 0.00026268568722195564, + "loss": 0.8123, + "step": 4369 + }, + { + "epoch": 2.0036693039789015, + "grad_norm": 0.07296108454465866, + "learning_rate": 0.0002624677448454207, + "loss": 0.8224, + "step": 4370 + }, + { + "epoch": 2.0041279669762644, + "grad_norm": 0.36370766162872314, + "learning_rate": 0.00026224986073726064, + "loss": 1.5906, + "step": 4371 + }, + { + "epoch": 2.004586629973627, + "grad_norm": 0.3442339301109314, + "learning_rate": 0.0002620320349509241, + "loss": 1.3951, + "step": 4372 + }, + { + "epoch": 2.0050452929709897, + "grad_norm": 0.2694573700428009, + "learning_rate": 0.0002618142675398451, + "loss": 0.851, + "step": 4373 + }, + { + "epoch": 2.005503955968352, + "grad_norm": 0.22100800275802612, + "learning_rate": 0.00026159655855744374, + "loss": 1.3334, + "step": 4374 + }, + { + "epoch": 2.005962618965715, + "grad_norm": 0.3315063714981079, + "learning_rate": 0.00026137890805712574, + "loss": 1.7025, + "step": 4375 + }, + { + "epoch": 2.006421281963078, + "grad_norm": 0.5389212369918823, + "learning_rate": 0.0002611613160922819, + "loss": 1.4756, + "step": 4376 + }, + { + "epoch": 2.00687994496044, + "grad_norm": 0.3037826120853424, + "learning_rate": 0.0002609437827162894, + "loss": 1.66, + "step": 4377 + }, + { + "epoch": 2.007338607957803, + "grad_norm": 0.3313737213611603, + "learning_rate": 0.0002607263079825106, + "loss": 1.8962, + "step": 4378 + }, + { + "epoch": 2.007797270955166, + "grad_norm": 0.3897867500782013, + "learning_rate": 0.0002605088919442936, + "loss": 1.5619, + "step": 4379 + }, + { + "epoch": 2.0082559339525283, + "grad_norm": 0.24746671319007874, + "learning_rate": 0.00026029153465497243, + "loss": 0.7236, + "step": 4380 + }, + { + "epoch": 2.008714596949891, + "grad_norm": 0.30122828483581543, + "learning_rate": 0.0002600742361678663, + "loss": 1.7644, + "step": 4381 + }, + { + "epoch": 2.0091732599472536, + "grad_norm": 0.32251477241516113, + "learning_rate": 0.0002598569965362799, + "loss": 1.2259, + "step": 4382 + }, + { + "epoch": 2.0096319229446165, + "grad_norm": 0.219418004155159, + "learning_rate": 0.0002596398158135037, + "loss": 0.5097, + "step": 4383 + }, + { + "epoch": 2.0100905859419793, + "grad_norm": 0.3621459901332855, + "learning_rate": 0.00025942269405281385, + "loss": 1.3002, + "step": 4384 + }, + { + "epoch": 2.0105492489393417, + "grad_norm": 0.06939633190631866, + "learning_rate": 0.00025920563130747167, + "loss": 0.3688, + "step": 4385 + }, + { + "epoch": 2.0110079119367046, + "grad_norm": 0.2349678874015808, + "learning_rate": 0.0002589886276307246, + "loss": 1.1327, + "step": 4386 + }, + { + "epoch": 2.011466574934067, + "grad_norm": 0.3204725384712219, + "learning_rate": 0.0002587716830758048, + "loss": 1.3671, + "step": 4387 + }, + { + "epoch": 2.01192523793143, + "grad_norm": 0.22198036313056946, + "learning_rate": 0.0002585547976959303, + "loss": 1.0752, + "step": 4388 + }, + { + "epoch": 2.0123839009287927, + "grad_norm": 0.30143803358078003, + "learning_rate": 0.00025833797154430494, + "loss": 1.792, + "step": 4389 + }, + { + "epoch": 2.012842563926155, + "grad_norm": 0.3181972801685333, + "learning_rate": 0.0002581212046741177, + "loss": 1.5635, + "step": 4390 + }, + { + "epoch": 2.013301226923518, + "grad_norm": 0.3738367557525635, + "learning_rate": 0.000257904497138543, + "loss": 1.2866, + "step": 4391 + }, + { + "epoch": 2.013759889920881, + "grad_norm": 0.256151020526886, + "learning_rate": 0.00025768784899074087, + "loss": 1.8867, + "step": 4392 + }, + { + "epoch": 2.0142185529182433, + "grad_norm": 0.28460898995399475, + "learning_rate": 0.00025747126028385643, + "loss": 0.8942, + "step": 4393 + }, + { + "epoch": 2.014677215915606, + "grad_norm": 0.27417483925819397, + "learning_rate": 0.0002572547310710205, + "loss": 1.547, + "step": 4394 + }, + { + "epoch": 2.0151358789129685, + "grad_norm": 0.3943521976470947, + "learning_rate": 0.00025703826140534937, + "loss": 1.222, + "step": 4395 + }, + { + "epoch": 2.0155945419103314, + "grad_norm": 0.3956080377101898, + "learning_rate": 0.00025682185133994457, + "loss": 1.679, + "step": 4396 + }, + { + "epoch": 2.0160532049076942, + "grad_norm": 0.41523680090904236, + "learning_rate": 0.0002566055009278932, + "loss": 0.6923, + "step": 4397 + }, + { + "epoch": 2.0165118679050567, + "grad_norm": 0.228208526968956, + "learning_rate": 0.00025638921022226704, + "loss": 1.4105, + "step": 4398 + }, + { + "epoch": 2.0169705309024195, + "grad_norm": 0.27708700299263, + "learning_rate": 0.00025617297927612437, + "loss": 1.4982, + "step": 4399 + }, + { + "epoch": 2.017429193899782, + "grad_norm": 0.34748536348342896, + "learning_rate": 0.0002559568081425079, + "loss": 1.3942, + "step": 4400 + }, + { + "epoch": 2.017887856897145, + "grad_norm": 0.25076958537101746, + "learning_rate": 0.00025574069687444613, + "loss": 1.3903, + "step": 4401 + }, + { + "epoch": 2.0183465198945076, + "grad_norm": 0.34007957577705383, + "learning_rate": 0.00025552464552495253, + "loss": 1.5933, + "step": 4402 + }, + { + "epoch": 2.01880518289187, + "grad_norm": 0.31931763887405396, + "learning_rate": 0.0002553086541470263, + "loss": 0.8987, + "step": 4403 + }, + { + "epoch": 2.019263845889233, + "grad_norm": 0.2598975598812103, + "learning_rate": 0.0002550927227936515, + "loss": 1.6906, + "step": 4404 + }, + { + "epoch": 2.0197225088865958, + "grad_norm": 0.3859318196773529, + "learning_rate": 0.0002548768515177975, + "loss": 2.1326, + "step": 4405 + }, + { + "epoch": 2.020181171883958, + "grad_norm": 0.35015788674354553, + "learning_rate": 0.0002546610403724193, + "loss": 0.8087, + "step": 4406 + }, + { + "epoch": 2.020639834881321, + "grad_norm": 0.25756731629371643, + "learning_rate": 0.0002544452894104569, + "loss": 1.2472, + "step": 4407 + }, + { + "epoch": 2.0210984978786835, + "grad_norm": 0.3442479372024536, + "learning_rate": 0.0002542295986848355, + "loss": 1.7928, + "step": 4408 + }, + { + "epoch": 2.0215571608760463, + "grad_norm": 0.41881293058395386, + "learning_rate": 0.00025401396824846576, + "loss": 1.7277, + "step": 4409 + }, + { + "epoch": 2.022015823873409, + "grad_norm": 0.28389236330986023, + "learning_rate": 0.0002537983981542432, + "loss": 1.1472, + "step": 4410 + }, + { + "epoch": 2.0224744868707716, + "grad_norm": 0.19957682490348816, + "learning_rate": 0.0002535828884550487, + "loss": 0.9713, + "step": 4411 + }, + { + "epoch": 2.0229331498681344, + "grad_norm": 0.20129139721393585, + "learning_rate": 0.0002533674392037485, + "loss": 1.2563, + "step": 4412 + }, + { + "epoch": 2.023391812865497, + "grad_norm": 0.39847332239151, + "learning_rate": 0.0002531520504531938, + "loss": 1.3053, + "step": 4413 + }, + { + "epoch": 2.0238504758628597, + "grad_norm": 0.3507583439350128, + "learning_rate": 0.00025293672225622113, + "loss": 1.7719, + "step": 4414 + }, + { + "epoch": 2.0243091388602226, + "grad_norm": 0.34539008140563965, + "learning_rate": 0.0002527214546656517, + "loss": 1.5484, + "step": 4415 + }, + { + "epoch": 2.024767801857585, + "grad_norm": 0.3002701997756958, + "learning_rate": 0.0002525062477342925, + "loss": 1.8398, + "step": 4416 + }, + { + "epoch": 2.025226464854948, + "grad_norm": 0.39402061700820923, + "learning_rate": 0.00025229110151493516, + "loss": 1.6705, + "step": 4417 + }, + { + "epoch": 2.0256851278523107, + "grad_norm": 0.28095006942749023, + "learning_rate": 0.0002520760160603567, + "loss": 1.7056, + "step": 4418 + }, + { + "epoch": 2.026143790849673, + "grad_norm": 0.33673858642578125, + "learning_rate": 0.0002518609914233192, + "loss": 1.4712, + "step": 4419 + }, + { + "epoch": 2.026602453847036, + "grad_norm": 0.3398224711418152, + "learning_rate": 0.00025164602765656964, + "loss": 1.0774, + "step": 4420 + }, + { + "epoch": 2.0270611168443984, + "grad_norm": 0.3440173268318176, + "learning_rate": 0.00025143112481284017, + "loss": 1.2208, + "step": 4421 + }, + { + "epoch": 2.0275197798417612, + "grad_norm": 0.30769407749176025, + "learning_rate": 0.0002512162829448481, + "loss": 1.2709, + "step": 4422 + }, + { + "epoch": 2.027978442839124, + "grad_norm": 0.2576143443584442, + "learning_rate": 0.00025100150210529565, + "loss": 1.1309, + "step": 4423 + }, + { + "epoch": 2.0284371058364865, + "grad_norm": 0.17252714931964874, + "learning_rate": 0.00025078678234687014, + "loss": 0.61, + "step": 4424 + }, + { + "epoch": 2.0288957688338494, + "grad_norm": 0.21611100435256958, + "learning_rate": 0.000250572123722244, + "loss": 1.1369, + "step": 4425 + }, + { + "epoch": 2.0293544318312122, + "grad_norm": 0.3421436846256256, + "learning_rate": 0.00025035752628407414, + "loss": 1.5566, + "step": 4426 + }, + { + "epoch": 2.0298130948285746, + "grad_norm": 0.29357588291168213, + "learning_rate": 0.00025014299008500315, + "loss": 1.1662, + "step": 4427 + }, + { + "epoch": 2.0302717578259375, + "grad_norm": 0.20734919607639313, + "learning_rate": 0.00024992851517765825, + "loss": 1.5304, + "step": 4428 + }, + { + "epoch": 2.0307304208233, + "grad_norm": 0.41112732887268066, + "learning_rate": 0.0002497141016146517, + "loss": 1.8782, + "step": 4429 + }, + { + "epoch": 2.0311890838206628, + "grad_norm": 0.33853259682655334, + "learning_rate": 0.0002494997494485806, + "loss": 1.2181, + "step": 4430 + }, + { + "epoch": 2.0316477468180256, + "grad_norm": 0.4708898067474365, + "learning_rate": 0.0002492854587320272, + "loss": 1.8342, + "step": 4431 + }, + { + "epoch": 2.032106409815388, + "grad_norm": 0.3891645073890686, + "learning_rate": 0.00024907122951755856, + "loss": 0.9312, + "step": 4432 + }, + { + "epoch": 2.032565072812751, + "grad_norm": 0.304002583026886, + "learning_rate": 0.00024885706185772663, + "loss": 1.071, + "step": 4433 + }, + { + "epoch": 2.0330237358101133, + "grad_norm": 0.26506680250167847, + "learning_rate": 0.00024864295580506816, + "loss": 1.4494, + "step": 4434 + }, + { + "epoch": 2.033482398807476, + "grad_norm": 0.33976414799690247, + "learning_rate": 0.0002484289114121051, + "loss": 1.2593, + "step": 4435 + }, + { + "epoch": 2.033941061804839, + "grad_norm": 0.2222852259874344, + "learning_rate": 0.0002482149287313439, + "loss": 0.7598, + "step": 4436 + }, + { + "epoch": 2.0343997248022014, + "grad_norm": 0.34920746088027954, + "learning_rate": 0.00024800100781527645, + "loss": 1.8142, + "step": 4437 + }, + { + "epoch": 2.0348583877995643, + "grad_norm": 0.3795028328895569, + "learning_rate": 0.00024778714871637853, + "loss": 1.2554, + "step": 4438 + }, + { + "epoch": 2.035317050796927, + "grad_norm": 0.0961453840136528, + "learning_rate": 0.0002475733514871116, + "loss": 0.9809, + "step": 4439 + }, + { + "epoch": 2.0357757137942896, + "grad_norm": 0.2711848020553589, + "learning_rate": 0.00024735961617992165, + "loss": 0.9706, + "step": 4440 + }, + { + "epoch": 2.0362343767916524, + "grad_norm": 0.3385700583457947, + "learning_rate": 0.0002471459428472393, + "loss": 1.4021, + "step": 4441 + }, + { + "epoch": 2.036693039789015, + "grad_norm": 0.2203783392906189, + "learning_rate": 0.00024693233154148063, + "loss": 0.8029, + "step": 4442 + }, + { + "epoch": 2.0371517027863777, + "grad_norm": 0.3024193346500397, + "learning_rate": 0.0002467187823150457, + "loss": 1.1432, + "step": 4443 + }, + { + "epoch": 2.0376103657837406, + "grad_norm": 0.3142333924770355, + "learning_rate": 0.0002465052952203196, + "loss": 1.7221, + "step": 4444 + }, + { + "epoch": 2.038069028781103, + "grad_norm": 0.3084831237792969, + "learning_rate": 0.0002462918703096724, + "loss": 1.2093, + "step": 4445 + }, + { + "epoch": 2.038527691778466, + "grad_norm": 0.38534000515937805, + "learning_rate": 0.0002460785076354588, + "loss": 1.4895, + "step": 4446 + }, + { + "epoch": 2.0389863547758287, + "grad_norm": 0.303943008184433, + "learning_rate": 0.0002458652072500181, + "loss": 0.9302, + "step": 4447 + }, + { + "epoch": 2.039445017773191, + "grad_norm": 0.26607340574264526, + "learning_rate": 0.0002456519692056747, + "loss": 1.4809, + "step": 4448 + }, + { + "epoch": 2.039903680770554, + "grad_norm": 0.3666565418243408, + "learning_rate": 0.0002454387935547369, + "loss": 1.2939, + "step": 4449 + }, + { + "epoch": 2.0403623437679164, + "grad_norm": 0.24224300682544708, + "learning_rate": 0.00024522568034949865, + "loss": 1.0746, + "step": 4450 + }, + { + "epoch": 2.040821006765279, + "grad_norm": 0.2701547145843506, + "learning_rate": 0.0002450126296422377, + "loss": 1.1681, + "step": 4451 + }, + { + "epoch": 2.041279669762642, + "grad_norm": 0.19419100880622864, + "learning_rate": 0.0002447996414852176, + "loss": 1.3045, + "step": 4452 + }, + { + "epoch": 2.0417383327600045, + "grad_norm": 0.30444204807281494, + "learning_rate": 0.00024458671593068564, + "loss": 1.4119, + "step": 4453 + }, + { + "epoch": 2.0421969957573674, + "grad_norm": 0.3836541473865509, + "learning_rate": 0.00024437385303087373, + "loss": 1.7348, + "step": 4454 + }, + { + "epoch": 2.0426556587547298, + "grad_norm": 0.312198281288147, + "learning_rate": 0.0002441610528379988, + "loss": 1.2589, + "step": 4455 + }, + { + "epoch": 2.0431143217520926, + "grad_norm": 0.29707929491996765, + "learning_rate": 0.00024394831540426232, + "loss": 0.8035, + "step": 4456 + }, + { + "epoch": 2.0435729847494555, + "grad_norm": 0.2303507924079895, + "learning_rate": 0.0002437356407818503, + "loss": 1.088, + "step": 4457 + }, + { + "epoch": 2.044031647746818, + "grad_norm": 0.3469330072402954, + "learning_rate": 0.00024352302902293333, + "loss": 1.6418, + "step": 4458 + }, + { + "epoch": 2.0444903107441807, + "grad_norm": 0.21516135334968567, + "learning_rate": 0.00024331048017966683, + "loss": 0.9941, + "step": 4459 + }, + { + "epoch": 2.0449489737415436, + "grad_norm": 0.38784271478652954, + "learning_rate": 0.00024309799430419, + "loss": 2.2803, + "step": 4460 + }, + { + "epoch": 2.045407636738906, + "grad_norm": 0.4177115261554718, + "learning_rate": 0.0002428855714486277, + "loss": 1.9731, + "step": 4461 + }, + { + "epoch": 2.045866299736269, + "grad_norm": 0.3393002152442932, + "learning_rate": 0.00024267321166508867, + "loss": 1.2382, + "step": 4462 + }, + { + "epoch": 2.0463249627336313, + "grad_norm": 0.28032952547073364, + "learning_rate": 0.00024246091500566619, + "loss": 1.1954, + "step": 4463 + }, + { + "epoch": 2.046783625730994, + "grad_norm": 0.28215348720550537, + "learning_rate": 0.00024224868152243823, + "loss": 0.8485, + "step": 4464 + }, + { + "epoch": 2.047242288728357, + "grad_norm": 0.26953309774398804, + "learning_rate": 0.0002420365112674674, + "loss": 1.4311, + "step": 4465 + }, + { + "epoch": 2.0477009517257194, + "grad_norm": 0.20407378673553467, + "learning_rate": 0.0002418244042928001, + "loss": 0.9565, + "step": 4466 + }, + { + "epoch": 2.0481596147230823, + "grad_norm": 0.27266445755958557, + "learning_rate": 0.00024161236065046806, + "loss": 1.2134, + "step": 4467 + }, + { + "epoch": 2.0486182777204447, + "grad_norm": 0.2968637943267822, + "learning_rate": 0.00024140038039248697, + "loss": 1.1802, + "step": 4468 + }, + { + "epoch": 2.0490769407178075, + "grad_norm": 0.34877461194992065, + "learning_rate": 0.00024118846357085717, + "loss": 1.5968, + "step": 4469 + }, + { + "epoch": 2.0495356037151704, + "grad_norm": 0.3103572726249695, + "learning_rate": 0.0002409766102375634, + "loss": 1.218, + "step": 4470 + }, + { + "epoch": 2.049994266712533, + "grad_norm": 0.33419668674468994, + "learning_rate": 0.00024076482044457477, + "loss": 1.781, + "step": 4471 + }, + { + "epoch": 2.0504529297098957, + "grad_norm": 0.3635629415512085, + "learning_rate": 0.00024055309424384486, + "loss": 1.4742, + "step": 4472 + }, + { + "epoch": 2.0509115927072585, + "grad_norm": 0.336867094039917, + "learning_rate": 0.00024034143168731172, + "loss": 1.2226, + "step": 4473 + }, + { + "epoch": 2.051370255704621, + "grad_norm": 0.2931895852088928, + "learning_rate": 0.00024012983282689754, + "loss": 1.1535, + "step": 4474 + }, + { + "epoch": 2.051828918701984, + "grad_norm": 0.36698460578918457, + "learning_rate": 0.00023991829771450912, + "loss": 1.2108, + "step": 4475 + }, + { + "epoch": 2.052287581699346, + "grad_norm": 0.306598037481308, + "learning_rate": 0.00023970682640203782, + "loss": 1.5089, + "step": 4476 + }, + { + "epoch": 2.052746244696709, + "grad_norm": 0.37377047538757324, + "learning_rate": 0.00023949541894135857, + "loss": 1.7544, + "step": 4477 + }, + { + "epoch": 2.053204907694072, + "grad_norm": 0.3563328981399536, + "learning_rate": 0.00023928407538433138, + "loss": 1.3446, + "step": 4478 + }, + { + "epoch": 2.0536635706914343, + "grad_norm": 0.29219722747802734, + "learning_rate": 0.0002390727957828004, + "loss": 1.7413, + "step": 4479 + }, + { + "epoch": 2.054122233688797, + "grad_norm": 0.49362048506736755, + "learning_rate": 0.000238861580188594, + "loss": 1.6321, + "step": 4480 + }, + { + "epoch": 2.0545808966861596, + "grad_norm": 0.4204590916633606, + "learning_rate": 0.00023865042865352487, + "loss": 2.0813, + "step": 4481 + }, + { + "epoch": 2.0550395596835225, + "grad_norm": 0.33447083830833435, + "learning_rate": 0.00023843934122938997, + "loss": 1.4967, + "step": 4482 + }, + { + "epoch": 2.0554982226808853, + "grad_norm": 0.3546378016471863, + "learning_rate": 0.0002382283179679707, + "loss": 0.8393, + "step": 4483 + }, + { + "epoch": 2.0559568856782477, + "grad_norm": 0.2834445536136627, + "learning_rate": 0.00023801735892103244, + "loss": 1.6294, + "step": 4484 + }, + { + "epoch": 2.0564155486756106, + "grad_norm": 0.46338558197021484, + "learning_rate": 0.0002378064641403251, + "loss": 1.1402, + "step": 4485 + }, + { + "epoch": 2.0568742116729735, + "grad_norm": 0.14141368865966797, + "learning_rate": 0.00023759563367758252, + "loss": 1.6619, + "step": 4486 + }, + { + "epoch": 2.057332874670336, + "grad_norm": 0.35884973406791687, + "learning_rate": 0.00023738486758452326, + "loss": 1.1481, + "step": 4487 + }, + { + "epoch": 2.0577915376676987, + "grad_norm": 0.35921135544776917, + "learning_rate": 0.0002371741659128494, + "loss": 1.7199, + "step": 4488 + }, + { + "epoch": 2.058250200665061, + "grad_norm": 0.35381534695625305, + "learning_rate": 0.00023696352871424765, + "loss": 0.8198, + "step": 4489 + }, + { + "epoch": 2.058708863662424, + "grad_norm": 0.29065418243408203, + "learning_rate": 0.00023675295604038893, + "loss": 1.0642, + "step": 4490 + }, + { + "epoch": 2.059167526659787, + "grad_norm": 0.30102601647377014, + "learning_rate": 0.00023654244794292823, + "loss": 1.451, + "step": 4491 + }, + { + "epoch": 2.0596261896571493, + "grad_norm": 0.37403160333633423, + "learning_rate": 0.00023633200447350462, + "loss": 1.8662, + "step": 4492 + }, + { + "epoch": 2.060084852654512, + "grad_norm": 0.45195087790489197, + "learning_rate": 0.00023612162568374147, + "loss": 0.8842, + "step": 4493 + }, + { + "epoch": 2.060543515651875, + "grad_norm": 0.06819970905780792, + "learning_rate": 0.0002359113116252462, + "loss": 0.9856, + "step": 4494 + }, + { + "epoch": 2.0610021786492374, + "grad_norm": 0.31288787722587585, + "learning_rate": 0.00023570106234961036, + "loss": 1.5755, + "step": 4495 + }, + { + "epoch": 2.0614608416466003, + "grad_norm": 0.38856643438339233, + "learning_rate": 0.00023549087790840966, + "loss": 1.3749, + "step": 4496 + }, + { + "epoch": 2.0619195046439627, + "grad_norm": 0.2518812119960785, + "learning_rate": 0.00023528075835320378, + "loss": 1.7864, + "step": 4497 + }, + { + "epoch": 2.0623781676413255, + "grad_norm": 0.4028247892856598, + "learning_rate": 0.0002350707037355368, + "loss": 1.551, + "step": 4498 + }, + { + "epoch": 2.0628368306386884, + "grad_norm": 0.34255698323249817, + "learning_rate": 0.00023486071410693627, + "loss": 1.0178, + "step": 4499 + }, + { + "epoch": 2.063295493636051, + "grad_norm": 0.08957389742136002, + "learning_rate": 0.0002346507895189143, + "loss": 1.2612, + "step": 4500 + }, + { + "epoch": 2.0637541566334137, + "grad_norm": 0.3247790038585663, + "learning_rate": 0.0002344409300229669, + "loss": 1.258, + "step": 4501 + }, + { + "epoch": 2.064212819630776, + "grad_norm": 0.39857447147369385, + "learning_rate": 0.0002342311356705742, + "loss": 1.3753, + "step": 4502 + }, + { + "epoch": 2.064671482628139, + "grad_norm": 0.13461144268512726, + "learning_rate": 0.00023402140651320003, + "loss": 0.8576, + "step": 4503 + }, + { + "epoch": 2.065130145625502, + "grad_norm": 0.33944034576416016, + "learning_rate": 0.000233811742602293, + "loss": 1.5578, + "step": 4504 + }, + { + "epoch": 2.065588808622864, + "grad_norm": 0.3041727542877197, + "learning_rate": 0.0002336021439892846, + "loss": 1.278, + "step": 4505 + }, + { + "epoch": 2.066047471620227, + "grad_norm": 0.28007856011390686, + "learning_rate": 0.00023339261072559116, + "loss": 1.2006, + "step": 4506 + }, + { + "epoch": 2.06650613461759, + "grad_norm": 0.2758924663066864, + "learning_rate": 0.00023318314286261262, + "loss": 1.5737, + "step": 4507 + }, + { + "epoch": 2.0669647976149523, + "grad_norm": 0.318847119808197, + "learning_rate": 0.00023297374045173298, + "loss": 1.3716, + "step": 4508 + }, + { + "epoch": 2.067423460612315, + "grad_norm": 0.3623434007167816, + "learning_rate": 0.00023276440354432038, + "loss": 1.8606, + "step": 4509 + }, + { + "epoch": 2.0678821236096776, + "grad_norm": 0.43447864055633545, + "learning_rate": 0.00023255513219172625, + "loss": 1.9836, + "step": 4510 + }, + { + "epoch": 2.0683407866070405, + "grad_norm": 0.38707858324050903, + "learning_rate": 0.00023234592644528657, + "loss": 1.4044, + "step": 4511 + }, + { + "epoch": 2.0687994496044033, + "grad_norm": 0.43568772077560425, + "learning_rate": 0.00023213678635632102, + "loss": 2.0554, + "step": 4512 + }, + { + "epoch": 2.0692581126017657, + "grad_norm": 0.3717525601387024, + "learning_rate": 0.000231927711976133, + "loss": 1.493, + "step": 4513 + }, + { + "epoch": 2.0697167755991286, + "grad_norm": 0.21989569067955017, + "learning_rate": 0.0002317187033560103, + "loss": 0.6022, + "step": 4514 + }, + { + "epoch": 2.0701754385964914, + "grad_norm": 0.10017600655555725, + "learning_rate": 0.0002315097605472243, + "loss": 1.3052, + "step": 4515 + }, + { + "epoch": 2.070634101593854, + "grad_norm": 0.3896404802799225, + "learning_rate": 0.00023130088360102968, + "loss": 1.0817, + "step": 4516 + }, + { + "epoch": 2.0710927645912167, + "grad_norm": 0.07273133844137192, + "learning_rate": 0.00023109207256866583, + "loss": 0.5404, + "step": 4517 + }, + { + "epoch": 2.071551427588579, + "grad_norm": 0.35691580176353455, + "learning_rate": 0.00023088332750135544, + "loss": 1.6287, + "step": 4518 + }, + { + "epoch": 2.072010090585942, + "grad_norm": 0.42362430691719055, + "learning_rate": 0.00023067464845030527, + "loss": 1.7539, + "step": 4519 + }, + { + "epoch": 2.072468753583305, + "grad_norm": 0.5343214273452759, + "learning_rate": 0.00023046603546670596, + "loss": 2.1514, + "step": 4520 + }, + { + "epoch": 2.0729274165806673, + "grad_norm": 0.4141870439052582, + "learning_rate": 0.0002302574886017314, + "loss": 1.6187, + "step": 4521 + }, + { + "epoch": 2.07338607957803, + "grad_norm": 0.2982878088951111, + "learning_rate": 0.00023004900790653986, + "loss": 1.6875, + "step": 4522 + }, + { + "epoch": 2.0738447425753925, + "grad_norm": 0.28776493668556213, + "learning_rate": 0.00022984059343227292, + "loss": 0.642, + "step": 4523 + }, + { + "epoch": 2.0743034055727554, + "grad_norm": 0.2196163535118103, + "learning_rate": 0.00022963224523005654, + "loss": 1.3851, + "step": 4524 + }, + { + "epoch": 2.0747620685701182, + "grad_norm": 0.3807169795036316, + "learning_rate": 0.00022942396335099986, + "loss": 1.2697, + "step": 4525 + }, + { + "epoch": 2.0752207315674807, + "grad_norm": 0.3772948086261749, + "learning_rate": 0.00022921574784619608, + "loss": 1.7243, + "step": 4526 + }, + { + "epoch": 2.0756793945648435, + "grad_norm": 0.532681405544281, + "learning_rate": 0.00022900759876672168, + "loss": 1.4512, + "step": 4527 + }, + { + "epoch": 2.0761380575622064, + "grad_norm": 0.36544162034988403, + "learning_rate": 0.00022879951616363727, + "loss": 1.5479, + "step": 4528 + }, + { + "epoch": 2.076596720559569, + "grad_norm": 0.2846382260322571, + "learning_rate": 0.0002285915000879869, + "loss": 1.2101, + "step": 4529 + }, + { + "epoch": 2.0770553835569316, + "grad_norm": 0.3465891480445862, + "learning_rate": 0.00022838355059079862, + "loss": 1.9592, + "step": 4530 + }, + { + "epoch": 2.077514046554294, + "grad_norm": 0.3925153315067291, + "learning_rate": 0.00022817566772308378, + "loss": 2.1309, + "step": 4531 + }, + { + "epoch": 2.077972709551657, + "grad_norm": 0.3107575476169586, + "learning_rate": 0.0002279678515358376, + "loss": 0.5652, + "step": 4532 + }, + { + "epoch": 2.0784313725490198, + "grad_norm": 0.19689220190048218, + "learning_rate": 0.00022776010208003895, + "loss": 1.6448, + "step": 4533 + }, + { + "epoch": 2.078890035546382, + "grad_norm": 0.3134257197380066, + "learning_rate": 0.00022755241940665018, + "loss": 1.3657, + "step": 4534 + }, + { + "epoch": 2.079348698543745, + "grad_norm": 0.4296601116657257, + "learning_rate": 0.00022734480356661736, + "loss": 1.34, + "step": 4535 + }, + { + "epoch": 2.0798073615411075, + "grad_norm": 0.3112638294696808, + "learning_rate": 0.00022713725461087015, + "loss": 1.282, + "step": 4536 + }, + { + "epoch": 2.0802660245384703, + "grad_norm": 0.13700056076049805, + "learning_rate": 0.00022692977259032205, + "loss": 1.3737, + "step": 4537 + }, + { + "epoch": 2.080724687535833, + "grad_norm": 0.4667096436023712, + "learning_rate": 0.00022672235755586952, + "loss": 1.8356, + "step": 4538 + }, + { + "epoch": 2.0811833505331956, + "grad_norm": 0.31745702028274536, + "learning_rate": 0.00022651500955839305, + "loss": 0.7115, + "step": 4539 + }, + { + "epoch": 2.0816420135305584, + "grad_norm": 0.3241938352584839, + "learning_rate": 0.0002263077286487567, + "loss": 1.7333, + "step": 4540 + }, + { + "epoch": 2.0821006765279213, + "grad_norm": 0.2541601359844208, + "learning_rate": 0.00022610051487780792, + "loss": 1.3767, + "step": 4541 + }, + { + "epoch": 2.0825593395252837, + "grad_norm": 0.22772157192230225, + "learning_rate": 0.00022589336829637776, + "loss": 0.4693, + "step": 4542 + }, + { + "epoch": 2.0830180025226466, + "grad_norm": 0.283128947019577, + "learning_rate": 0.00022568628895528077, + "loss": 1.7429, + "step": 4543 + }, + { + "epoch": 2.083476665520009, + "grad_norm": 0.4075316786766052, + "learning_rate": 0.000225479276905315, + "loss": 1.6674, + "step": 4544 + }, + { + "epoch": 2.083935328517372, + "grad_norm": 0.36636197566986084, + "learning_rate": 0.00022527233219726202, + "loss": 1.7224, + "step": 4545 + }, + { + "epoch": 2.0843939915147347, + "grad_norm": 0.29077982902526855, + "learning_rate": 0.00022506545488188678, + "loss": 1.3293, + "step": 4546 + }, + { + "epoch": 2.084852654512097, + "grad_norm": 0.2742144465446472, + "learning_rate": 0.0002248586450099379, + "loss": 0.9725, + "step": 4547 + }, + { + "epoch": 2.08531131750946, + "grad_norm": 0.2022017389535904, + "learning_rate": 0.00022465190263214747, + "loss": 1.2256, + "step": 4548 + }, + { + "epoch": 2.0857699805068224, + "grad_norm": 0.2726742625236511, + "learning_rate": 0.00022444522779923044, + "loss": 1.079, + "step": 4549 + }, + { + "epoch": 2.0862286435041852, + "grad_norm": 0.23730774223804474, + "learning_rate": 0.00022423862056188593, + "loss": 1.5887, + "step": 4550 + }, + { + "epoch": 2.086687306501548, + "grad_norm": 1.3285748958587646, + "learning_rate": 0.00022403208097079613, + "loss": 1.9584, + "step": 4551 + }, + { + "epoch": 2.0871459694989105, + "grad_norm": 0.32901179790496826, + "learning_rate": 0.00022382560907662668, + "loss": 1.2246, + "step": 4552 + }, + { + "epoch": 2.0876046324962734, + "grad_norm": 0.260644793510437, + "learning_rate": 0.00022361920493002669, + "loss": 1.4697, + "step": 4553 + }, + { + "epoch": 2.0880632954936362, + "grad_norm": 0.35175642371177673, + "learning_rate": 0.0002234128685816285, + "loss": 1.5698, + "step": 4554 + }, + { + "epoch": 2.0885219584909986, + "grad_norm": 0.44502708315849304, + "learning_rate": 0.00022320660008204795, + "loss": 2.0159, + "step": 4555 + }, + { + "epoch": 2.0889806214883615, + "grad_norm": 0.37410539388656616, + "learning_rate": 0.00022300039948188418, + "loss": 1.7451, + "step": 4556 + }, + { + "epoch": 2.089439284485724, + "grad_norm": 0.4350254237651825, + "learning_rate": 0.0002227942668317197, + "loss": 1.2582, + "step": 4557 + }, + { + "epoch": 2.0898979474830868, + "grad_norm": 0.23693208396434784, + "learning_rate": 0.00022258820218212035, + "loss": 1.162, + "step": 4558 + }, + { + "epoch": 2.0903566104804496, + "grad_norm": 0.33850157260894775, + "learning_rate": 0.0002223822055836352, + "loss": 1.2121, + "step": 4559 + }, + { + "epoch": 2.090815273477812, + "grad_norm": 0.2646215260028839, + "learning_rate": 0.00022217627708679693, + "loss": 0.4118, + "step": 4560 + }, + { + "epoch": 2.091273936475175, + "grad_norm": 0.26649442315101624, + "learning_rate": 0.00022197041674212092, + "loss": 2.1147, + "step": 4561 + }, + { + "epoch": 2.0917325994725378, + "grad_norm": 0.27695992588996887, + "learning_rate": 0.0002217646246001064, + "loss": 1.0989, + "step": 4562 + }, + { + "epoch": 2.0921912624699, + "grad_norm": 0.34253594279289246, + "learning_rate": 0.00022155890071123564, + "loss": 1.6344, + "step": 4563 + }, + { + "epoch": 2.092649925467263, + "grad_norm": 0.32384851574897766, + "learning_rate": 0.0002213532451259742, + "loss": 1.7167, + "step": 4564 + }, + { + "epoch": 2.0931085884646254, + "grad_norm": 0.35032370686531067, + "learning_rate": 0.00022114765789477088, + "loss": 1.7271, + "step": 4565 + }, + { + "epoch": 2.0935672514619883, + "grad_norm": 0.2640658915042877, + "learning_rate": 0.0002209421390680577, + "loss": 1.6772, + "step": 4566 + }, + { + "epoch": 2.094025914459351, + "grad_norm": 0.33520472049713135, + "learning_rate": 0.00022073668869624995, + "loss": 1.4508, + "step": 4567 + }, + { + "epoch": 2.0944845774567136, + "grad_norm": 0.31736597418785095, + "learning_rate": 0.00022053130682974604, + "loss": 1.4152, + "step": 4568 + }, + { + "epoch": 2.0949432404540764, + "grad_norm": 0.31707099080085754, + "learning_rate": 0.00022032599351892764, + "loss": 1.6255, + "step": 4569 + }, + { + "epoch": 2.095401903451439, + "grad_norm": 0.4904292821884155, + "learning_rate": 0.00022012074881415955, + "loss": 1.9502, + "step": 4570 + }, + { + "epoch": 2.0958605664488017, + "grad_norm": 0.3095618188381195, + "learning_rate": 0.00021991557276579, + "loss": 1.5995, + "step": 4571 + }, + { + "epoch": 2.0963192294461646, + "grad_norm": 0.41936981678009033, + "learning_rate": 0.0002197104654241498, + "loss": 1.7164, + "step": 4572 + }, + { + "epoch": 2.096777892443527, + "grad_norm": 0.34843704104423523, + "learning_rate": 0.00021950542683955344, + "loss": 1.2392, + "step": 4573 + }, + { + "epoch": 2.09723655544089, + "grad_norm": 0.19430193305015564, + "learning_rate": 0.00021930045706229835, + "loss": 0.9546, + "step": 4574 + }, + { + "epoch": 2.0976952184382527, + "grad_norm": 0.36258745193481445, + "learning_rate": 0.00021909555614266484, + "loss": 1.447, + "step": 4575 + }, + { + "epoch": 2.098153881435615, + "grad_norm": 0.21773214638233185, + "learning_rate": 0.00021889072413091727, + "loss": 1.3679, + "step": 4576 + }, + { + "epoch": 2.098612544432978, + "grad_norm": 0.2897671163082123, + "learning_rate": 0.00021868596107730176, + "loss": 0.8525, + "step": 4577 + }, + { + "epoch": 2.0990712074303404, + "grad_norm": 0.22055676579475403, + "learning_rate": 0.0002184812670320484, + "loss": 1.4193, + "step": 4578 + }, + { + "epoch": 2.0995298704277032, + "grad_norm": 0.24514396488666534, + "learning_rate": 0.00021827664204537007, + "loss": 0.5204, + "step": 4579 + }, + { + "epoch": 2.099988533425066, + "grad_norm": 0.11148292571306229, + "learning_rate": 0.00021807208616746277, + "loss": 0.7519, + "step": 4580 + }, + { + "epoch": 2.1004471964224285, + "grad_norm": 0.35977640748023987, + "learning_rate": 0.00021786759944850554, + "loss": 1.3285, + "step": 4581 + }, + { + "epoch": 2.1009058594197914, + "grad_norm": 0.33894822001457214, + "learning_rate": 0.00021766318193866064, + "loss": 1.6344, + "step": 4582 + }, + { + "epoch": 2.101364522417154, + "grad_norm": 0.1890052706003189, + "learning_rate": 0.00021745883368807278, + "loss": 0.8127, + "step": 4583 + }, + { + "epoch": 2.1018231854145166, + "grad_norm": 0.3900175094604492, + "learning_rate": 0.00021725455474687027, + "loss": 1.8716, + "step": 4584 + }, + { + "epoch": 2.1022818484118795, + "grad_norm": 0.2661358714103699, + "learning_rate": 0.00021705034516516396, + "loss": 0.8125, + "step": 4585 + }, + { + "epoch": 2.102740511409242, + "grad_norm": 0.34905606508255005, + "learning_rate": 0.00021684620499304836, + "loss": 1.2191, + "step": 4586 + }, + { + "epoch": 2.1031991744066048, + "grad_norm": 0.2521708905696869, + "learning_rate": 0.0002166421342806003, + "loss": 1.4932, + "step": 4587 + }, + { + "epoch": 2.1036578374039676, + "grad_norm": 0.31396129727363586, + "learning_rate": 0.00021643813307788002, + "loss": 0.9659, + "step": 4588 + }, + { + "epoch": 2.10411650040133, + "grad_norm": 0.22702209651470184, + "learning_rate": 0.00021623420143493006, + "loss": 0.6938, + "step": 4589 + }, + { + "epoch": 2.104575163398693, + "grad_norm": 0.10886117815971375, + "learning_rate": 0.00021603033940177657, + "loss": 0.9339, + "step": 4590 + }, + { + "epoch": 2.1050338263960553, + "grad_norm": 0.262203186750412, + "learning_rate": 0.00021582654702842835, + "loss": 1.6595, + "step": 4591 + }, + { + "epoch": 2.105492489393418, + "grad_norm": 0.5163483023643494, + "learning_rate": 0.00021562282436487717, + "loss": 1.8124, + "step": 4592 + }, + { + "epoch": 2.105951152390781, + "grad_norm": 0.31572282314300537, + "learning_rate": 0.0002154191714610978, + "loss": 1.6561, + "step": 4593 + }, + { + "epoch": 2.1064098153881434, + "grad_norm": 0.5046913623809814, + "learning_rate": 0.0002152155883670474, + "loss": 1.4269, + "step": 4594 + }, + { + "epoch": 2.1068684783855063, + "grad_norm": 0.27989131212234497, + "learning_rate": 0.0002150120751326664, + "loss": 0.3858, + "step": 4595 + }, + { + "epoch": 2.107327141382869, + "grad_norm": 0.8256828784942627, + "learning_rate": 0.00021480863180787846, + "loss": 1.5546, + "step": 4596 + }, + { + "epoch": 2.1077858043802316, + "grad_norm": 0.2901069223880768, + "learning_rate": 0.00021460525844258944, + "loss": 1.1568, + "step": 4597 + }, + { + "epoch": 2.1082444673775944, + "grad_norm": 0.49482643604278564, + "learning_rate": 0.00021440195508668836, + "loss": 1.4728, + "step": 4598 + }, + { + "epoch": 2.108703130374957, + "grad_norm": 0.34062066674232483, + "learning_rate": 0.00021419872179004714, + "loss": 1.5667, + "step": 4599 + }, + { + "epoch": 2.1091617933723197, + "grad_norm": 0.3349739909172058, + "learning_rate": 0.00021399555860251995, + "loss": 1.2008, + "step": 4600 + }, + { + "epoch": 2.1096204563696825, + "grad_norm": 0.26807984709739685, + "learning_rate": 0.0002137924655739445, + "loss": 0.8011, + "step": 4601 + }, + { + "epoch": 2.110079119367045, + "grad_norm": 0.17875202000141144, + "learning_rate": 0.0002135894427541409, + "loss": 1.6011, + "step": 4602 + }, + { + "epoch": 2.110537782364408, + "grad_norm": 0.3981230854988098, + "learning_rate": 0.00021338649019291212, + "loss": 1.1253, + "step": 4603 + }, + { + "epoch": 2.1109964453617707, + "grad_norm": 0.24843645095825195, + "learning_rate": 0.00021318360794004388, + "loss": 1.1508, + "step": 4604 + }, + { + "epoch": 2.111455108359133, + "grad_norm": 0.25917336344718933, + "learning_rate": 0.00021298079604530464, + "loss": 1.8315, + "step": 4605 + }, + { + "epoch": 2.111913771356496, + "grad_norm": 0.36518481373786926, + "learning_rate": 0.00021277805455844568, + "loss": 1.3478, + "step": 4606 + }, + { + "epoch": 2.1123724343538584, + "grad_norm": 0.4103847146034241, + "learning_rate": 0.00021257538352920091, + "loss": 1.0914, + "step": 4607 + }, + { + "epoch": 2.112831097351221, + "grad_norm": 0.31975114345550537, + "learning_rate": 0.00021237278300728697, + "loss": 1.1174, + "step": 4608 + }, + { + "epoch": 2.113289760348584, + "grad_norm": 0.23576036095619202, + "learning_rate": 0.00021217025304240327, + "loss": 1.0789, + "step": 4609 + }, + { + "epoch": 2.1137484233459465, + "grad_norm": 0.3097432255744934, + "learning_rate": 0.00021196779368423208, + "loss": 0.7991, + "step": 4610 + }, + { + "epoch": 2.1142070863433093, + "grad_norm": 0.1049700602889061, + "learning_rate": 0.00021176540498243768, + "loss": 1.429, + "step": 4611 + }, + { + "epoch": 2.1146657493406718, + "grad_norm": 0.3536173403263092, + "learning_rate": 0.00021156308698666777, + "loss": 1.2648, + "step": 4612 + }, + { + "epoch": 2.1151244123380346, + "grad_norm": 0.26619645953178406, + "learning_rate": 0.00021136083974655236, + "loss": 1.4557, + "step": 4613 + }, + { + "epoch": 2.1155830753353975, + "grad_norm": 0.308630108833313, + "learning_rate": 0.0002111586633117041, + "loss": 0.9568, + "step": 4614 + }, + { + "epoch": 2.11604173833276, + "grad_norm": 0.26167064905166626, + "learning_rate": 0.0002109565577317184, + "loss": 1.3277, + "step": 4615 + }, + { + "epoch": 2.1165004013301227, + "grad_norm": 0.26667726039886475, + "learning_rate": 0.0002107545230561732, + "loss": 1.1866, + "step": 4616 + }, + { + "epoch": 2.116959064327485, + "grad_norm": 0.35242959856987, + "learning_rate": 0.00021055255933462912, + "loss": 1.6272, + "step": 4617 + }, + { + "epoch": 2.117417727324848, + "grad_norm": 0.3653687536716461, + "learning_rate": 0.0002103506666166292, + "loss": 1.4598, + "step": 4618 + }, + { + "epoch": 2.117876390322211, + "grad_norm": 0.3237496614456177, + "learning_rate": 0.00021014884495169927, + "loss": 1.6118, + "step": 4619 + }, + { + "epoch": 2.1183350533195733, + "grad_norm": 0.4205828011035919, + "learning_rate": 0.00020994709438934756, + "loss": 1.92, + "step": 4620 + }, + { + "epoch": 2.118793716316936, + "grad_norm": 0.3911856710910797, + "learning_rate": 0.00020974541497906525, + "loss": 1.6688, + "step": 4621 + }, + { + "epoch": 2.119252379314299, + "grad_norm": 0.345805823802948, + "learning_rate": 0.00020954380677032526, + "loss": 0.8207, + "step": 4622 + }, + { + "epoch": 2.1197110423116614, + "grad_norm": 0.2564966082572937, + "learning_rate": 0.00020934226981258376, + "loss": 1.2849, + "step": 4623 + }, + { + "epoch": 2.1201697053090243, + "grad_norm": 0.18908265233039856, + "learning_rate": 0.0002091408041552792, + "loss": 1.1646, + "step": 4624 + }, + { + "epoch": 2.1206283683063867, + "grad_norm": 0.32488518953323364, + "learning_rate": 0.00020893940984783262, + "loss": 1.7903, + "step": 4625 + }, + { + "epoch": 2.1210870313037495, + "grad_norm": 0.25629740953445435, + "learning_rate": 0.00020873808693964746, + "loss": 0.9413, + "step": 4626 + }, + { + "epoch": 2.1215456943011124, + "grad_norm": 0.2971414029598236, + "learning_rate": 0.00020853683548010965, + "loss": 1.7205, + "step": 4627 + }, + { + "epoch": 2.122004357298475, + "grad_norm": 0.2897069454193115, + "learning_rate": 0.00020833565551858768, + "loss": 1.1088, + "step": 4628 + }, + { + "epoch": 2.1224630202958377, + "grad_norm": 0.21978043019771576, + "learning_rate": 0.0002081345471044324, + "loss": 1.2353, + "step": 4629 + }, + { + "epoch": 2.1229216832932005, + "grad_norm": 0.3789485692977905, + "learning_rate": 0.0002079335102869772, + "loss": 1.7919, + "step": 4630 + }, + { + "epoch": 2.123380346290563, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.00020773254511553786, + "loss": 0.7957, + "step": 4631 + }, + { + "epoch": 2.123839009287926, + "grad_norm": 0.2726113796234131, + "learning_rate": 0.00020753165163941273, + "loss": 2.3215, + "step": 4632 + }, + { + "epoch": 2.124297672285288, + "grad_norm": 0.3731834888458252, + "learning_rate": 0.00020733082990788204, + "loss": 2.0028, + "step": 4633 + }, + { + "epoch": 2.124756335282651, + "grad_norm": 0.3649563789367676, + "learning_rate": 0.00020713007997020906, + "loss": 1.4407, + "step": 4634 + }, + { + "epoch": 2.125214998280014, + "grad_norm": 0.2045055776834488, + "learning_rate": 0.00020692940187563914, + "loss": 0.942, + "step": 4635 + }, + { + "epoch": 2.1256736612773763, + "grad_norm": 0.39572790265083313, + "learning_rate": 0.0002067287956734001, + "loss": 2.0579, + "step": 4636 + }, + { + "epoch": 2.126132324274739, + "grad_norm": 0.47954314947128296, + "learning_rate": 0.00020652826141270194, + "loss": 1.1333, + "step": 4637 + }, + { + "epoch": 2.1265909872721016, + "grad_norm": 0.30008918046951294, + "learning_rate": 0.00020632779914273757, + "loss": 0.9181, + "step": 4638 + }, + { + "epoch": 2.1270496502694645, + "grad_norm": 0.19218164682388306, + "learning_rate": 0.00020612740891268145, + "loss": 1.7064, + "step": 4639 + }, + { + "epoch": 2.1275083132668273, + "grad_norm": 0.36220747232437134, + "learning_rate": 0.00020592709077169082, + "loss": 1.2349, + "step": 4640 + }, + { + "epoch": 2.1279669762641897, + "grad_norm": 0.2821071445941925, + "learning_rate": 0.00020572684476890518, + "loss": 1.5732, + "step": 4641 + }, + { + "epoch": 2.1284256392615526, + "grad_norm": 0.3287753760814667, + "learning_rate": 0.00020552667095344635, + "loss": 1.9521, + "step": 4642 + }, + { + "epoch": 2.1288843022589155, + "grad_norm": 0.29272401332855225, + "learning_rate": 0.00020532656937441841, + "loss": 1.1461, + "step": 4643 + }, + { + "epoch": 2.129342965256278, + "grad_norm": 0.4012896716594696, + "learning_rate": 0.00020512654008090792, + "loss": 1.6783, + "step": 4644 + }, + { + "epoch": 2.1298016282536407, + "grad_norm": 0.2766939401626587, + "learning_rate": 0.00020492658312198304, + "loss": 0.8703, + "step": 4645 + }, + { + "epoch": 2.130260291251003, + "grad_norm": 0.20289787650108337, + "learning_rate": 0.00020472669854669495, + "loss": 1.5026, + "step": 4646 + }, + { + "epoch": 2.130718954248366, + "grad_norm": 0.37175294756889343, + "learning_rate": 0.00020452688640407656, + "loss": 1.6858, + "step": 4647 + }, + { + "epoch": 2.131177617245729, + "grad_norm": 0.28814128041267395, + "learning_rate": 0.00020432714674314362, + "loss": 1.4595, + "step": 4648 + }, + { + "epoch": 2.1316362802430913, + "grad_norm": 0.4047505557537079, + "learning_rate": 0.00020412747961289364, + "loss": 0.8171, + "step": 4649 + }, + { + "epoch": 2.132094943240454, + "grad_norm": 0.23079468309879303, + "learning_rate": 0.0002039278850623061, + "loss": 0.9852, + "step": 4650 + }, + { + "epoch": 2.132553606237817, + "grad_norm": 0.3248424232006073, + "learning_rate": 0.0002037283631403431, + "loss": 1.979, + "step": 4651 + }, + { + "epoch": 2.1330122692351794, + "grad_norm": 0.4883231222629547, + "learning_rate": 0.0002035289138959489, + "loss": 0.9462, + "step": 4652 + }, + { + "epoch": 2.1334709322325422, + "grad_norm": 0.09132955968379974, + "learning_rate": 0.00020332953737804978, + "loss": 0.8115, + "step": 4653 + }, + { + "epoch": 2.1339295952299047, + "grad_norm": 0.31292226910591125, + "learning_rate": 0.00020313023363555422, + "loss": 1.2776, + "step": 4654 + }, + { + "epoch": 2.1343882582272675, + "grad_norm": 0.32427236437797546, + "learning_rate": 0.00020293100271735303, + "loss": 1.2678, + "step": 4655 + }, + { + "epoch": 2.1348469212246304, + "grad_norm": 0.2879985272884369, + "learning_rate": 0.00020273184467231876, + "loss": 0.938, + "step": 4656 + }, + { + "epoch": 2.135305584221993, + "grad_norm": 0.2097647488117218, + "learning_rate": 0.00020253275954930621, + "loss": 1.1378, + "step": 4657 + }, + { + "epoch": 2.1357642472193556, + "grad_norm": 0.3992927670478821, + "learning_rate": 0.00020233374739715276, + "loss": 1.2597, + "step": 4658 + }, + { + "epoch": 2.136222910216718, + "grad_norm": 0.2745298147201538, + "learning_rate": 0.00020213480826467733, + "loss": 1.3282, + "step": 4659 + }, + { + "epoch": 2.136681573214081, + "grad_norm": 0.3620222806930542, + "learning_rate": 0.00020193594220068134, + "loss": 1.2953, + "step": 4660 + }, + { + "epoch": 2.1371402362114438, + "grad_norm": 0.4318819046020508, + "learning_rate": 0.00020173714925394775, + "loss": 2.0383, + "step": 4661 + }, + { + "epoch": 2.137598899208806, + "grad_norm": 0.4898906946182251, + "learning_rate": 0.00020153842947324196, + "loss": 1.1289, + "step": 4662 + }, + { + "epoch": 2.138057562206169, + "grad_norm": 0.32130667567253113, + "learning_rate": 0.00020133978290731152, + "loss": 1.5143, + "step": 4663 + }, + { + "epoch": 2.138516225203532, + "grad_norm": 0.196794331073761, + "learning_rate": 0.00020114120960488575, + "loss": 1.2338, + "step": 4664 + }, + { + "epoch": 2.1389748882008943, + "grad_norm": 0.3671046197414398, + "learning_rate": 0.00020094270961467614, + "loss": 0.8875, + "step": 4665 + }, + { + "epoch": 2.139433551198257, + "grad_norm": 0.3376992344856262, + "learning_rate": 0.00020074428298537633, + "loss": 1.4139, + "step": 4666 + }, + { + "epoch": 2.1398922141956196, + "grad_norm": 0.24704553186893463, + "learning_rate": 0.00020054592976566132, + "loss": 1.4949, + "step": 4667 + }, + { + "epoch": 2.1403508771929824, + "grad_norm": 0.21429981291294098, + "learning_rate": 0.00020034765000418914, + "loss": 0.863, + "step": 4668 + }, + { + "epoch": 2.1408095401903453, + "grad_norm": 0.2140214592218399, + "learning_rate": 0.0002001494437495989, + "loss": 0.8906, + "step": 4669 + }, + { + "epoch": 2.1412682031877077, + "grad_norm": 0.27707546949386597, + "learning_rate": 0.00019995131105051228, + "loss": 1.3032, + "step": 4670 + }, + { + "epoch": 2.1417268661850706, + "grad_norm": 0.7792873382568359, + "learning_rate": 0.00019975325195553263, + "loss": 1.3045, + "step": 4671 + }, + { + "epoch": 2.1421855291824334, + "grad_norm": 0.3791343569755554, + "learning_rate": 0.00019955526651324495, + "loss": 1.6859, + "step": 4672 + }, + { + "epoch": 2.142644192179796, + "grad_norm": 1.3164491653442383, + "learning_rate": 0.00019935735477221678, + "loss": 1.0486, + "step": 4673 + }, + { + "epoch": 2.1431028551771587, + "grad_norm": 0.318463534116745, + "learning_rate": 0.0001991595167809972, + "loss": 1.8174, + "step": 4674 + }, + { + "epoch": 2.143561518174521, + "grad_norm": 0.4365736246109009, + "learning_rate": 0.00019896175258811734, + "loss": 0.6856, + "step": 4675 + }, + { + "epoch": 2.144020181171884, + "grad_norm": 0.2647281289100647, + "learning_rate": 0.00019876406224209015, + "loss": 1.6011, + "step": 4676 + }, + { + "epoch": 2.144478844169247, + "grad_norm": 0.3120388090610504, + "learning_rate": 0.00019856644579141052, + "loss": 1.074, + "step": 4677 + }, + { + "epoch": 2.1449375071666092, + "grad_norm": 0.3103806674480438, + "learning_rate": 0.0001983689032845552, + "loss": 1.7906, + "step": 4678 + }, + { + "epoch": 2.145396170163972, + "grad_norm": 0.36245760321617126, + "learning_rate": 0.0001981714347699828, + "loss": 1.4742, + "step": 4679 + }, + { + "epoch": 2.1458548331613345, + "grad_norm": 0.36996036767959595, + "learning_rate": 0.00019797404029613368, + "loss": 1.2744, + "step": 4680 + }, + { + "epoch": 2.1463134961586974, + "grad_norm": 0.28153517842292786, + "learning_rate": 0.00019777671991143026, + "loss": 1.4012, + "step": 4681 + }, + { + "epoch": 2.1467721591560602, + "grad_norm": 0.3398634195327759, + "learning_rate": 0.00019757947366427653, + "loss": 1.4422, + "step": 4682 + }, + { + "epoch": 2.1472308221534226, + "grad_norm": 0.17714032530784607, + "learning_rate": 0.0001973823016030587, + "loss": 0.4065, + "step": 4683 + }, + { + "epoch": 2.1476894851507855, + "grad_norm": 0.21079567074775696, + "learning_rate": 0.00019718520377614407, + "loss": 1.353, + "step": 4684 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.46706634759902954, + "learning_rate": 0.00019698818023188236, + "loss": 1.7202, + "step": 4685 + }, + { + "epoch": 2.1486068111455108, + "grad_norm": 0.2902261018753052, + "learning_rate": 0.00019679123101860491, + "loss": 1.7689, + "step": 4686 + }, + { + "epoch": 2.1490654741428736, + "grad_norm": 0.4393427073955536, + "learning_rate": 0.00019659435618462473, + "loss": 2.2651, + "step": 4687 + }, + { + "epoch": 2.149524137140236, + "grad_norm": 0.3855237364768982, + "learning_rate": 0.0001963975557782366, + "loss": 1.5914, + "step": 4688 + }, + { + "epoch": 2.149982800137599, + "grad_norm": 0.6791412234306335, + "learning_rate": 0.00019620082984771715, + "loss": 1.9954, + "step": 4689 + }, + { + "epoch": 2.1504414631349618, + "grad_norm": 0.38182568550109863, + "learning_rate": 0.00019600417844132463, + "loss": 1.1696, + "step": 4690 + }, + { + "epoch": 2.150900126132324, + "grad_norm": 0.2527628540992737, + "learning_rate": 0.0001958076016072991, + "loss": 0.83, + "step": 4691 + }, + { + "epoch": 2.151358789129687, + "grad_norm": 0.29251906275749207, + "learning_rate": 0.00019561109939386217, + "loss": 1.1837, + "step": 4692 + }, + { + "epoch": 2.1518174521270494, + "grad_norm": 0.3355877995491028, + "learning_rate": 0.0001954146718492174, + "loss": 1.3884, + "step": 4693 + }, + { + "epoch": 2.1522761151244123, + "grad_norm": 0.2906731367111206, + "learning_rate": 0.0001952183190215499, + "loss": 1.2325, + "step": 4694 + }, + { + "epoch": 2.152734778121775, + "grad_norm": 0.2701873779296875, + "learning_rate": 0.0001950220409590262, + "loss": 1.6547, + "step": 4695 + }, + { + "epoch": 2.1531934411191376, + "grad_norm": 0.3364790976047516, + "learning_rate": 0.00019482583770979485, + "loss": 1.2307, + "step": 4696 + }, + { + "epoch": 2.1536521041165004, + "grad_norm": 0.22372028231620789, + "learning_rate": 0.00019462970932198592, + "loss": 1.2701, + "step": 4697 + }, + { + "epoch": 2.1541107671138633, + "grad_norm": 0.34942564368247986, + "learning_rate": 0.00019443365584371114, + "loss": 1.3538, + "step": 4698 + }, + { + "epoch": 2.1545694301112257, + "grad_norm": 0.21902301907539368, + "learning_rate": 0.0001942376773230638, + "loss": 0.9939, + "step": 4699 + }, + { + "epoch": 2.1550280931085886, + "grad_norm": 0.3378940224647522, + "learning_rate": 0.00019404177380811895, + "loss": 1.5621, + "step": 4700 + }, + { + "epoch": 2.155486756105951, + "grad_norm": 0.44091933965682983, + "learning_rate": 0.00019384594534693295, + "loss": 1.6776, + "step": 4701 + }, + { + "epoch": 2.155945419103314, + "grad_norm": 0.3093583285808563, + "learning_rate": 0.00019365019198754413, + "loss": 1.556, + "step": 4702 + }, + { + "epoch": 2.1564040821006767, + "grad_norm": 0.30539020895957947, + "learning_rate": 0.00019345451377797207, + "loss": 0.9653, + "step": 4703 + }, + { + "epoch": 2.156862745098039, + "grad_norm": 0.2688155174255371, + "learning_rate": 0.0001932589107662181, + "loss": 1.2151, + "step": 4704 + }, + { + "epoch": 2.157321408095402, + "grad_norm": 0.29725632071495056, + "learning_rate": 0.0001930633830002652, + "loss": 1.8734, + "step": 4705 + }, + { + "epoch": 2.1577800710927644, + "grad_norm": 0.3029474914073944, + "learning_rate": 0.00019286793052807744, + "loss": 0.8146, + "step": 4706 + }, + { + "epoch": 2.1582387340901272, + "grad_norm": 0.4033948481082916, + "learning_rate": 0.00019267255339760082, + "loss": 2.1886, + "step": 4707 + }, + { + "epoch": 2.15869739708749, + "grad_norm": 0.4187861979007721, + "learning_rate": 0.00019247725165676276, + "loss": 1.5054, + "step": 4708 + }, + { + "epoch": 2.1591560600848525, + "grad_norm": 0.22975748777389526, + "learning_rate": 0.00019228202535347212, + "loss": 1.0834, + "step": 4709 + }, + { + "epoch": 2.1596147230822154, + "grad_norm": 0.4031769633293152, + "learning_rate": 0.00019208687453561957, + "loss": 1.534, + "step": 4710 + }, + { + "epoch": 2.160073386079578, + "grad_norm": 0.36004751920700073, + "learning_rate": 0.00019189179925107702, + "loss": 1.5185, + "step": 4711 + }, + { + "epoch": 2.1605320490769406, + "grad_norm": 0.230648472905159, + "learning_rate": 0.00019169679954769754, + "loss": 0.6958, + "step": 4712 + }, + { + "epoch": 2.1609907120743035, + "grad_norm": 0.38399839401245117, + "learning_rate": 0.0001915018754733161, + "loss": 1.9727, + "step": 4713 + }, + { + "epoch": 2.161449375071666, + "grad_norm": 1.3798997402191162, + "learning_rate": 0.00019130702707574905, + "loss": 1.0898, + "step": 4714 + }, + { + "epoch": 2.1619080380690288, + "grad_norm": 0.3109949231147766, + "learning_rate": 0.00019111225440279395, + "loss": 1.5253, + "step": 4715 + }, + { + "epoch": 2.1623667010663916, + "grad_norm": 0.24511021375656128, + "learning_rate": 0.00019091755750223028, + "loss": 1.6026, + "step": 4716 + }, + { + "epoch": 2.162825364063754, + "grad_norm": 0.4369632601737976, + "learning_rate": 0.00019072293642181815, + "loss": 1.6895, + "step": 4717 + }, + { + "epoch": 2.163284027061117, + "grad_norm": 0.4405476748943329, + "learning_rate": 0.00019052839120929977, + "loss": 1.3796, + "step": 4718 + }, + { + "epoch": 2.1637426900584797, + "grad_norm": 0.31771278381347656, + "learning_rate": 0.00019033392191239817, + "loss": 1.1156, + "step": 4719 + }, + { + "epoch": 2.164201353055842, + "grad_norm": 0.28369247913360596, + "learning_rate": 0.0001901395285788186, + "loss": 2.0259, + "step": 4720 + }, + { + "epoch": 2.164660016053205, + "grad_norm": 0.39868542551994324, + "learning_rate": 0.0001899452112562468, + "loss": 1.8127, + "step": 4721 + }, + { + "epoch": 2.1651186790505674, + "grad_norm": 0.24997852742671967, + "learning_rate": 0.00018975096999235052, + "loss": 1.468, + "step": 4722 + }, + { + "epoch": 2.1655773420479303, + "grad_norm": 0.3659355342388153, + "learning_rate": 0.0001895568048347781, + "loss": 1.2206, + "step": 4723 + }, + { + "epoch": 2.166036005045293, + "grad_norm": 0.20338742434978485, + "learning_rate": 0.00018936271583115994, + "loss": 1.3026, + "step": 4724 + }, + { + "epoch": 2.1664946680426556, + "grad_norm": 0.39747223258018494, + "learning_rate": 0.00018916870302910732, + "loss": 1.6281, + "step": 4725 + }, + { + "epoch": 2.1669533310400184, + "grad_norm": 0.33269479870796204, + "learning_rate": 0.00018897476647621308, + "loss": 1.9271, + "step": 4726 + }, + { + "epoch": 2.167411994037381, + "grad_norm": 0.2740723788738251, + "learning_rate": 0.00018878090622005138, + "loss": 1.1646, + "step": 4727 + }, + { + "epoch": 2.1678706570347437, + "grad_norm": 0.3122137188911438, + "learning_rate": 0.00018858712230817727, + "loss": 1.6887, + "step": 4728 + }, + { + "epoch": 2.1683293200321065, + "grad_norm": 0.3537803590297699, + "learning_rate": 0.00018839341478812726, + "loss": 1.1824, + "step": 4729 + }, + { + "epoch": 2.168787983029469, + "grad_norm": 0.273580402135849, + "learning_rate": 0.00018819978370741958, + "loss": 0.7199, + "step": 4730 + }, + { + "epoch": 2.169246646026832, + "grad_norm": 0.25088873505592346, + "learning_rate": 0.00018800622911355314, + "loss": 1.5972, + "step": 4731 + }, + { + "epoch": 2.1697053090241947, + "grad_norm": 0.38248297572135925, + "learning_rate": 0.0001878127510540083, + "loss": 1.4888, + "step": 4732 + }, + { + "epoch": 2.170163972021557, + "grad_norm": 0.23572513461112976, + "learning_rate": 0.00018761934957624675, + "loss": 1.2231, + "step": 4733 + }, + { + "epoch": 2.17062263501892, + "grad_norm": 0.35793864727020264, + "learning_rate": 0.00018742602472771104, + "loss": 1.3505, + "step": 4734 + }, + { + "epoch": 2.1710812980162824, + "grad_norm": 0.5893045663833618, + "learning_rate": 0.00018723277655582516, + "loss": 1.1632, + "step": 4735 + }, + { + "epoch": 2.171539961013645, + "grad_norm": 0.18537980318069458, + "learning_rate": 0.0001870396051079944, + "loss": 0.7823, + "step": 4736 + }, + { + "epoch": 2.171998624011008, + "grad_norm": 0.31003275513648987, + "learning_rate": 0.00018684651043160506, + "loss": 1.2098, + "step": 4737 + }, + { + "epoch": 2.1724572870083705, + "grad_norm": 0.40928885340690613, + "learning_rate": 0.00018665349257402465, + "loss": 1.1833, + "step": 4738 + }, + { + "epoch": 2.1729159500057333, + "grad_norm": 0.27787014842033386, + "learning_rate": 0.00018646055158260189, + "loss": 1.4373, + "step": 4739 + }, + { + "epoch": 2.173374613003096, + "grad_norm": 0.2921246886253357, + "learning_rate": 0.00018626768750466656, + "loss": 1.1901, + "step": 4740 + }, + { + "epoch": 2.1738332760004586, + "grad_norm": 0.36071351170539856, + "learning_rate": 0.00018607490038752956, + "loss": 1.4543, + "step": 4741 + }, + { + "epoch": 2.1742919389978215, + "grad_norm": 0.2633201777935028, + "learning_rate": 0.00018588219027848303, + "loss": 1.1895, + "step": 4742 + }, + { + "epoch": 2.174750601995184, + "grad_norm": 0.30791527032852173, + "learning_rate": 0.0001856895572248002, + "loss": 1.2403, + "step": 4743 + }, + { + "epoch": 2.1752092649925467, + "grad_norm": 0.2025088369846344, + "learning_rate": 0.00018549700127373537, + "loss": 1.7025, + "step": 4744 + }, + { + "epoch": 2.1756679279899096, + "grad_norm": 0.34419453144073486, + "learning_rate": 0.00018530452247252367, + "loss": 1.7213, + "step": 4745 + }, + { + "epoch": 2.176126590987272, + "grad_norm": 0.3504865765571594, + "learning_rate": 0.00018511212086838163, + "loss": 1.312, + "step": 4746 + }, + { + "epoch": 2.176585253984635, + "grad_norm": 0.2538597881793976, + "learning_rate": 0.00018491979650850688, + "loss": 0.6609, + "step": 4747 + }, + { + "epoch": 2.1770439169819973, + "grad_norm": 0.226425439119339, + "learning_rate": 0.00018472754944007786, + "loss": 1.7527, + "step": 4748 + }, + { + "epoch": 2.17750257997936, + "grad_norm": 0.3634589910507202, + "learning_rate": 0.0001845353797102542, + "loss": 0.7849, + "step": 4749 + }, + { + "epoch": 2.177961242976723, + "grad_norm": 0.19483381509780884, + "learning_rate": 0.00018434328736617652, + "loss": 1.1454, + "step": 4750 + }, + { + "epoch": 2.1784199059740854, + "grad_norm": 0.2180272936820984, + "learning_rate": 0.00018415127245496643, + "loss": 1.18, + "step": 4751 + }, + { + "epoch": 2.1788785689714483, + "grad_norm": 0.36905816197395325, + "learning_rate": 0.0001839593350237266, + "loss": 1.0663, + "step": 4752 + }, + { + "epoch": 2.1793372319688107, + "grad_norm": 0.23431743681430817, + "learning_rate": 0.00018376747511954068, + "loss": 1.481, + "step": 4753 + }, + { + "epoch": 2.1797958949661735, + "grad_norm": 0.2120712399482727, + "learning_rate": 0.00018357569278947323, + "loss": 0.5578, + "step": 4754 + }, + { + "epoch": 2.1802545579635364, + "grad_norm": 0.29886701703071594, + "learning_rate": 0.00018338398808057004, + "loss": 1.2928, + "step": 4755 + }, + { + "epoch": 2.180713220960899, + "grad_norm": 0.09233911335468292, + "learning_rate": 0.00018319236103985737, + "loss": 1.2831, + "step": 4756 + }, + { + "epoch": 2.1811718839582617, + "grad_norm": 0.33295729756355286, + "learning_rate": 0.00018300081171434285, + "loss": 1.4729, + "step": 4757 + }, + { + "epoch": 2.1816305469556245, + "grad_norm": 0.42028337717056274, + "learning_rate": 0.00018280934015101486, + "loss": 1.1091, + "step": 4758 + }, + { + "epoch": 2.182089209952987, + "grad_norm": 0.21170879900455475, + "learning_rate": 0.00018261794639684283, + "loss": 0.8437, + "step": 4759 + }, + { + "epoch": 2.18254787295035, + "grad_norm": 0.22131375968456268, + "learning_rate": 0.00018242663049877696, + "loss": 0.491, + "step": 4760 + }, + { + "epoch": 2.183006535947712, + "grad_norm": 0.22741609811782837, + "learning_rate": 0.00018223539250374844, + "loss": 1.4339, + "step": 4761 + }, + { + "epoch": 2.183465198945075, + "grad_norm": 0.353257417678833, + "learning_rate": 0.00018204423245866936, + "loss": 1.5006, + "step": 4762 + }, + { + "epoch": 2.183923861942438, + "grad_norm": 0.21219374239444733, + "learning_rate": 0.00018185315041043267, + "loss": 0.8634, + "step": 4763 + }, + { + "epoch": 2.1843825249398003, + "grad_norm": 0.3175276815891266, + "learning_rate": 0.00018166214640591205, + "loss": 1.6262, + "step": 4764 + }, + { + "epoch": 2.184841187937163, + "grad_norm": 0.4105018377304077, + "learning_rate": 0.0001814712204919623, + "loss": 1.631, + "step": 4765 + }, + { + "epoch": 2.185299850934526, + "grad_norm": 0.36518141627311707, + "learning_rate": 0.0001812803727154189, + "loss": 1.6072, + "step": 4766 + }, + { + "epoch": 2.1857585139318885, + "grad_norm": 0.2514982521533966, + "learning_rate": 0.0001810896031230983, + "loss": 1.1576, + "step": 4767 + }, + { + "epoch": 2.1862171769292513, + "grad_norm": 0.33633479475975037, + "learning_rate": 0.0001808989117617974, + "loss": 1.3829, + "step": 4768 + }, + { + "epoch": 2.1866758399266137, + "grad_norm": 0.2530759274959564, + "learning_rate": 0.00018070829867829425, + "loss": 1.0331, + "step": 4769 + }, + { + "epoch": 2.1871345029239766, + "grad_norm": 0.25773900747299194, + "learning_rate": 0.0001805177639193476, + "loss": 0.5553, + "step": 4770 + }, + { + "epoch": 2.1875931659213395, + "grad_norm": 0.28829655051231384, + "learning_rate": 0.00018032730753169714, + "loss": 2.219, + "step": 4771 + }, + { + "epoch": 2.188051828918702, + "grad_norm": 0.36159905791282654, + "learning_rate": 0.00018013692956206302, + "loss": 1.0755, + "step": 4772 + }, + { + "epoch": 2.1885104919160647, + "grad_norm": 0.40641242265701294, + "learning_rate": 0.00017994663005714646, + "loss": 1.019, + "step": 4773 + }, + { + "epoch": 2.188969154913427, + "grad_norm": 0.2221248745918274, + "learning_rate": 0.00017975640906362923, + "loss": 1.5312, + "step": 4774 + }, + { + "epoch": 2.18942781791079, + "grad_norm": 0.23253923654556274, + "learning_rate": 0.00017956626662817387, + "loss": 0.9588, + "step": 4775 + }, + { + "epoch": 2.189886480908153, + "grad_norm": 0.3233194947242737, + "learning_rate": 0.00017937620279742384, + "loss": 1.1987, + "step": 4776 + }, + { + "epoch": 2.1903451439055153, + "grad_norm": 0.4366668164730072, + "learning_rate": 0.0001791862176180031, + "loss": 1.6294, + "step": 4777 + }, + { + "epoch": 2.190803806902878, + "grad_norm": 0.4189610481262207, + "learning_rate": 0.00017899631113651643, + "loss": 1.3567, + "step": 4778 + }, + { + "epoch": 2.191262469900241, + "grad_norm": 0.28253674507141113, + "learning_rate": 0.00017880648339954914, + "loss": 1.637, + "step": 4779 + }, + { + "epoch": 2.1917211328976034, + "grad_norm": 0.43727073073387146, + "learning_rate": 0.00017861673445366733, + "loss": 1.8063, + "step": 4780 + }, + { + "epoch": 2.1921797958949663, + "grad_norm": 0.2974712550640106, + "learning_rate": 0.00017842706434541785, + "loss": 1.1469, + "step": 4781 + }, + { + "epoch": 2.1926384588923287, + "grad_norm": 0.3751980662345886, + "learning_rate": 0.00017823747312132798, + "loss": 0.908, + "step": 4782 + }, + { + "epoch": 2.1930971218896915, + "grad_norm": 0.15295343101024628, + "learning_rate": 0.00017804796082790636, + "loss": 0.8782, + "step": 4783 + }, + { + "epoch": 2.1935557848870544, + "grad_norm": 0.33498823642730713, + "learning_rate": 0.00017785852751164117, + "loss": 1.3965, + "step": 4784 + }, + { + "epoch": 2.194014447884417, + "grad_norm": 0.3449128568172455, + "learning_rate": 0.0001776691732190019, + "loss": 1.7836, + "step": 4785 + }, + { + "epoch": 2.1944731108817797, + "grad_norm": 0.3249196708202362, + "learning_rate": 0.0001774798979964386, + "loss": 1.1383, + "step": 4786 + }, + { + "epoch": 2.1949317738791425, + "grad_norm": 0.3362160325050354, + "learning_rate": 0.00017729070189038176, + "loss": 1.3211, + "step": 4787 + }, + { + "epoch": 2.195390436876505, + "grad_norm": 0.32171592116355896, + "learning_rate": 0.00017710158494724265, + "loss": 1.9521, + "step": 4788 + }, + { + "epoch": 2.195849099873868, + "grad_norm": 0.39043116569519043, + "learning_rate": 0.0001769125472134131, + "loss": 1.2834, + "step": 4789 + }, + { + "epoch": 2.19630776287123, + "grad_norm": 0.29517433047294617, + "learning_rate": 0.00017672358873526518, + "loss": 1.4635, + "step": 4790 + }, + { + "epoch": 2.196766425868593, + "grad_norm": 0.4014551639556885, + "learning_rate": 0.0001765347095591517, + "loss": 1.3101, + "step": 4791 + }, + { + "epoch": 2.197225088865956, + "grad_norm": 0.247705340385437, + "learning_rate": 0.0001763459097314064, + "loss": 1.5868, + "step": 4792 + }, + { + "epoch": 2.1976837518633183, + "grad_norm": 0.2868292033672333, + "learning_rate": 0.00017615718929834317, + "loss": 1.5704, + "step": 4793 + }, + { + "epoch": 2.198142414860681, + "grad_norm": 0.24143101274967194, + "learning_rate": 0.00017596854830625642, + "loss": 1.4587, + "step": 4794 + }, + { + "epoch": 2.1986010778580436, + "grad_norm": 0.2784029245376587, + "learning_rate": 0.00017577998680142132, + "loss": 1.2386, + "step": 4795 + }, + { + "epoch": 2.1990597408554065, + "grad_norm": 0.336953729391098, + "learning_rate": 0.00017559150483009302, + "loss": 1.296, + "step": 4796 + }, + { + "epoch": 2.1995184038527693, + "grad_norm": 0.39816170930862427, + "learning_rate": 0.0001754031024385077, + "loss": 1.5718, + "step": 4797 + }, + { + "epoch": 2.1999770668501317, + "grad_norm": 0.34481489658355713, + "learning_rate": 0.0001752147796728818, + "loss": 2.1771, + "step": 4798 + }, + { + "epoch": 2.2004357298474946, + "grad_norm": 0.3532600998878479, + "learning_rate": 0.0001750265365794123, + "loss": 1.6757, + "step": 4799 + }, + { + "epoch": 2.2008943928448574, + "grad_norm": 0.4431455135345459, + "learning_rate": 0.0001748383732042767, + "loss": 1.632, + "step": 4800 + }, + { + "epoch": 2.20135305584222, + "grad_norm": 0.3615950047969818, + "learning_rate": 0.00017465028959363238, + "loss": 1.4655, + "step": 4801 + }, + { + "epoch": 2.2018117188395827, + "grad_norm": 0.21566683053970337, + "learning_rate": 0.00017446228579361806, + "loss": 0.9168, + "step": 4802 + }, + { + "epoch": 2.202270381836945, + "grad_norm": 0.3532969057559967, + "learning_rate": 0.00017427436185035234, + "loss": 2.1003, + "step": 4803 + }, + { + "epoch": 2.202729044834308, + "grad_norm": 0.4000427722930908, + "learning_rate": 0.00017408651780993417, + "loss": 2.0688, + "step": 4804 + }, + { + "epoch": 2.203187707831671, + "grad_norm": 0.2674494981765747, + "learning_rate": 0.0001738987537184432, + "loss": 0.7786, + "step": 4805 + }, + { + "epoch": 2.2036463708290333, + "grad_norm": 0.2524220049381256, + "learning_rate": 0.00017371106962193938, + "loss": 1.5247, + "step": 4806 + }, + { + "epoch": 2.204105033826396, + "grad_norm": 0.3025605082511902, + "learning_rate": 0.00017352346556646277, + "loss": 0.6016, + "step": 4807 + }, + { + "epoch": 2.204563696823759, + "grad_norm": 0.07593965530395508, + "learning_rate": 0.00017333594159803397, + "loss": 0.8819, + "step": 4808 + }, + { + "epoch": 2.2050223598211214, + "grad_norm": 0.322405606508255, + "learning_rate": 0.00017314849776265412, + "loss": 0.8851, + "step": 4809 + }, + { + "epoch": 2.2054810228184842, + "grad_norm": 0.25816667079925537, + "learning_rate": 0.0001729611341063045, + "loss": 1.9113, + "step": 4810 + }, + { + "epoch": 2.2059396858158467, + "grad_norm": 0.31706973910331726, + "learning_rate": 0.00017277385067494672, + "loss": 0.929, + "step": 4811 + }, + { + "epoch": 2.2063983488132095, + "grad_norm": 0.16823376715183258, + "learning_rate": 0.0001725866475145228, + "loss": 1.0099, + "step": 4812 + }, + { + "epoch": 2.2068570118105724, + "grad_norm": 0.28849852085113525, + "learning_rate": 0.00017239952467095498, + "loss": 1.4562, + "step": 4813 + }, + { + "epoch": 2.207315674807935, + "grad_norm": 0.3034904897212982, + "learning_rate": 0.00017221248219014595, + "loss": 1.6464, + "step": 4814 + }, + { + "epoch": 2.2077743378052976, + "grad_norm": 0.31821388006210327, + "learning_rate": 0.00017202552011797852, + "loss": 1.2505, + "step": 4815 + }, + { + "epoch": 2.20823300080266, + "grad_norm": 0.3559507429599762, + "learning_rate": 0.00017183863850031572, + "loss": 1.3757, + "step": 4816 + }, + { + "epoch": 2.208691663800023, + "grad_norm": 0.20952913165092468, + "learning_rate": 0.00017165183738300133, + "loss": 0.3598, + "step": 4817 + }, + { + "epoch": 2.2091503267973858, + "grad_norm": 0.0632583424448967, + "learning_rate": 0.0001714651168118585, + "loss": 0.3411, + "step": 4818 + }, + { + "epoch": 2.209608989794748, + "grad_norm": 0.13204383850097656, + "learning_rate": 0.00017127847683269144, + "loss": 1.1918, + "step": 4819 + }, + { + "epoch": 2.210067652792111, + "grad_norm": 0.30054593086242676, + "learning_rate": 0.00017109191749128418, + "loss": 1.1966, + "step": 4820 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.35834217071533203, + "learning_rate": 0.00017090543883340115, + "loss": 2.2354, + "step": 4821 + }, + { + "epoch": 2.2109849787868363, + "grad_norm": 0.3150874674320221, + "learning_rate": 0.00017071904090478686, + "loss": 0.3315, + "step": 4822 + }, + { + "epoch": 2.211443641784199, + "grad_norm": 0.2139630913734436, + "learning_rate": 0.00017053272375116603, + "loss": 1.3162, + "step": 4823 + }, + { + "epoch": 2.2119023047815616, + "grad_norm": 0.35393640398979187, + "learning_rate": 0.00017034648741824366, + "loss": 2.103, + "step": 4824 + }, + { + "epoch": 2.2123609677789244, + "grad_norm": 0.43089064955711365, + "learning_rate": 0.00017016033195170488, + "loss": 1.7612, + "step": 4825 + }, + { + "epoch": 2.2128196307762873, + "grad_norm": 0.3777117431163788, + "learning_rate": 0.00016997425739721488, + "loss": 1.6729, + "step": 4826 + }, + { + "epoch": 2.2132782937736497, + "grad_norm": 0.3245319426059723, + "learning_rate": 0.00016978826380041923, + "loss": 1.1172, + "step": 4827 + }, + { + "epoch": 2.2137369567710126, + "grad_norm": 0.2487240731716156, + "learning_rate": 0.0001696023512069435, + "loss": 1.145, + "step": 4828 + }, + { + "epoch": 2.214195619768375, + "grad_norm": 0.1767636239528656, + "learning_rate": 0.00016941651966239325, + "loss": 0.94, + "step": 4829 + }, + { + "epoch": 2.214654282765738, + "grad_norm": 0.2936687171459198, + "learning_rate": 0.00016923076921235424, + "loss": 1.3027, + "step": 4830 + }, + { + "epoch": 2.2151129457631007, + "grad_norm": 0.5650012493133545, + "learning_rate": 0.00016904509990239258, + "loss": 1.048, + "step": 4831 + }, + { + "epoch": 2.215571608760463, + "grad_norm": 0.266467809677124, + "learning_rate": 0.00016885951177805425, + "loss": 1.7801, + "step": 4832 + }, + { + "epoch": 2.216030271757826, + "grad_norm": 0.39118704199790955, + "learning_rate": 0.00016867400488486528, + "loss": 1.2689, + "step": 4833 + }, + { + "epoch": 2.216488934755189, + "grad_norm": 0.3572444021701813, + "learning_rate": 0.0001684885792683319, + "loss": 2.2139, + "step": 4834 + }, + { + "epoch": 2.2169475977525512, + "grad_norm": 0.351784348487854, + "learning_rate": 0.00016830323497394033, + "loss": 1.1197, + "step": 4835 + }, + { + "epoch": 2.217406260749914, + "grad_norm": 0.19013693928718567, + "learning_rate": 0.0001681179720471569, + "loss": 1.0192, + "step": 4836 + }, + { + "epoch": 2.2178649237472765, + "grad_norm": 0.35627466440200806, + "learning_rate": 0.00016793279053342792, + "loss": 1.6245, + "step": 4837 + }, + { + "epoch": 2.2183235867446394, + "grad_norm": 0.34376806020736694, + "learning_rate": 0.00016774769047817978, + "loss": 1.575, + "step": 4838 + }, + { + "epoch": 2.218782249742002, + "grad_norm": 0.24683015048503876, + "learning_rate": 0.00016756267192681896, + "loss": 0.9045, + "step": 4839 + }, + { + "epoch": 2.2192409127393646, + "grad_norm": 0.19168370962142944, + "learning_rate": 0.00016737773492473152, + "loss": 0.8613, + "step": 4840 + }, + { + "epoch": 2.2196995757367275, + "grad_norm": 0.29889073967933655, + "learning_rate": 0.00016719287951728407, + "loss": 0.8257, + "step": 4841 + }, + { + "epoch": 2.22015823873409, + "grad_norm": 0.0963655561208725, + "learning_rate": 0.00016700810574982294, + "loss": 0.5904, + "step": 4842 + }, + { + "epoch": 2.2206169017314528, + "grad_norm": 0.2194061130285263, + "learning_rate": 0.00016682341366767444, + "loss": 1.1664, + "step": 4843 + }, + { + "epoch": 2.2210755647288156, + "grad_norm": 0.20055797696113586, + "learning_rate": 0.0001666388033161448, + "loss": 1.1049, + "step": 4844 + }, + { + "epoch": 2.221534227726178, + "grad_norm": 0.2792820334434509, + "learning_rate": 0.0001664542747405206, + "loss": 1.011, + "step": 4845 + }, + { + "epoch": 2.221992890723541, + "grad_norm": 0.243804931640625, + "learning_rate": 0.0001662698279860677, + "loss": 1.5193, + "step": 4846 + }, + { + "epoch": 2.2224515537209037, + "grad_norm": 0.42623358964920044, + "learning_rate": 0.00016608546309803229, + "loss": 1.7656, + "step": 4847 + }, + { + "epoch": 2.222910216718266, + "grad_norm": 0.5009300112724304, + "learning_rate": 0.00016590118012164046, + "loss": 1.7704, + "step": 4848 + }, + { + "epoch": 2.223368879715629, + "grad_norm": 0.20968489348888397, + "learning_rate": 0.0001657169791020981, + "loss": 0.8382, + "step": 4849 + }, + { + "epoch": 2.2238275427129914, + "grad_norm": 0.3352392017841339, + "learning_rate": 0.00016553286008459117, + "loss": 1.5792, + "step": 4850 + }, + { + "epoch": 2.2242862057103543, + "grad_norm": 0.27464622259140015, + "learning_rate": 0.00016534882311428523, + "loss": 0.928, + "step": 4851 + }, + { + "epoch": 2.224744868707717, + "grad_norm": 0.1357714831829071, + "learning_rate": 0.00016516486823632586, + "loss": 0.797, + "step": 4852 + }, + { + "epoch": 2.2252035317050796, + "grad_norm": 0.36050960421562195, + "learning_rate": 0.00016498099549583866, + "loss": 1.7069, + "step": 4853 + }, + { + "epoch": 2.2256621947024424, + "grad_norm": 0.367965430021286, + "learning_rate": 0.00016479720493792872, + "loss": 1.2835, + "step": 4854 + }, + { + "epoch": 2.2261208576998053, + "grad_norm": 0.2768895626068115, + "learning_rate": 0.00016461349660768144, + "loss": 1.2358, + "step": 4855 + }, + { + "epoch": 2.2265795206971677, + "grad_norm": 0.3254320025444031, + "learning_rate": 0.00016442987055016194, + "loss": 0.8148, + "step": 4856 + }, + { + "epoch": 2.2270381836945305, + "grad_norm": 0.21533845365047455, + "learning_rate": 0.00016424632681041456, + "loss": 1.7542, + "step": 4857 + }, + { + "epoch": 2.227496846691893, + "grad_norm": 0.2936849594116211, + "learning_rate": 0.00016406286543346415, + "loss": 0.4233, + "step": 4858 + }, + { + "epoch": 2.227955509689256, + "grad_norm": 0.5081586241722107, + "learning_rate": 0.0001638794864643151, + "loss": 1.5443, + "step": 4859 + }, + { + "epoch": 2.2284141726866187, + "grad_norm": 0.24850165843963623, + "learning_rate": 0.00016369618994795156, + "loss": 0.9991, + "step": 4860 + }, + { + "epoch": 2.228872835683981, + "grad_norm": 0.3500426113605499, + "learning_rate": 0.0001635129759293375, + "loss": 1.6708, + "step": 4861 + }, + { + "epoch": 2.229331498681344, + "grad_norm": 0.25747743248939514, + "learning_rate": 0.00016332984445341681, + "loss": 1.2169, + "step": 4862 + }, + { + "epoch": 2.2297901616787064, + "grad_norm": 0.31107261776924133, + "learning_rate": 0.0001631467955651124, + "loss": 1.2371, + "step": 4863 + }, + { + "epoch": 2.230248824676069, + "grad_norm": 0.25597137212753296, + "learning_rate": 0.00016296382930932812, + "loss": 0.7018, + "step": 4864 + }, + { + "epoch": 2.230707487673432, + "grad_norm": 0.21790584921836853, + "learning_rate": 0.00016278094573094666, + "loss": 1.2848, + "step": 4865 + }, + { + "epoch": 2.2311661506707945, + "grad_norm": 0.30176883935928345, + "learning_rate": 0.00016259814487483066, + "loss": 1.7701, + "step": 4866 + }, + { + "epoch": 2.2316248136681573, + "grad_norm": 0.3356066644191742, + "learning_rate": 0.00016241542678582268, + "loss": 1.2003, + "step": 4867 + }, + { + "epoch": 2.23208347666552, + "grad_norm": 0.3005014955997467, + "learning_rate": 0.00016223279150874448, + "loss": 1.0873, + "step": 4868 + }, + { + "epoch": 2.2325421396628826, + "grad_norm": 0.2405785322189331, + "learning_rate": 0.00016205023908839793, + "loss": 1.3601, + "step": 4869 + }, + { + "epoch": 2.2330008026602455, + "grad_norm": 0.3136742115020752, + "learning_rate": 0.00016186776956956451, + "loss": 1.1295, + "step": 4870 + }, + { + "epoch": 2.233459465657608, + "grad_norm": 0.18356893956661224, + "learning_rate": 0.00016168538299700519, + "loss": 1.1707, + "step": 4871 + }, + { + "epoch": 2.2339181286549707, + "grad_norm": 0.2986769676208496, + "learning_rate": 0.00016150307941546088, + "loss": 0.9671, + "step": 4872 + }, + { + "epoch": 2.2343767916523336, + "grad_norm": 0.2441634237766266, + "learning_rate": 0.00016132085886965187, + "loss": 0.5982, + "step": 4873 + }, + { + "epoch": 2.234835454649696, + "grad_norm": 0.18692080676555634, + "learning_rate": 0.00016113872140427815, + "loss": 1.5326, + "step": 4874 + }, + { + "epoch": 2.235294117647059, + "grad_norm": 0.3282175362110138, + "learning_rate": 0.00016095666706401941, + "loss": 1.2403, + "step": 4875 + }, + { + "epoch": 2.2357527806444217, + "grad_norm": 0.33327755331993103, + "learning_rate": 0.00016077469589353483, + "loss": 1.1533, + "step": 4876 + }, + { + "epoch": 2.236211443641784, + "grad_norm": 0.274987131357193, + "learning_rate": 0.00016059280793746333, + "loss": 1.6538, + "step": 4877 + }, + { + "epoch": 2.236670106639147, + "grad_norm": 0.43245795369148254, + "learning_rate": 0.00016041100324042345, + "loss": 1.9604, + "step": 4878 + }, + { + "epoch": 2.2371287696365094, + "grad_norm": 0.3761482238769531, + "learning_rate": 0.00016022928184701286, + "loss": 1.5767, + "step": 4879 + }, + { + "epoch": 2.2375874326338723, + "grad_norm": 0.3432115316390991, + "learning_rate": 0.0001600476438018093, + "loss": 1.6666, + "step": 4880 + }, + { + "epoch": 2.238046095631235, + "grad_norm": 0.3119170367717743, + "learning_rate": 0.00015986608914936995, + "loss": 1.251, + "step": 4881 + }, + { + "epoch": 2.2385047586285975, + "grad_norm": 0.4060390591621399, + "learning_rate": 0.0001596846179342314, + "loss": 0.4879, + "step": 4882 + }, + { + "epoch": 2.2389634216259604, + "grad_norm": 0.20119361579418182, + "learning_rate": 0.00015950323020090984, + "loss": 1.4327, + "step": 4883 + }, + { + "epoch": 2.239422084623323, + "grad_norm": 0.21751080453395844, + "learning_rate": 0.00015932192599390105, + "loss": 1.6125, + "step": 4884 + }, + { + "epoch": 2.2398807476206857, + "grad_norm": 0.40151306986808777, + "learning_rate": 0.00015914070535768022, + "loss": 1.9309, + "step": 4885 + }, + { + "epoch": 2.2403394106180485, + "grad_norm": 0.35669711232185364, + "learning_rate": 0.00015895956833670205, + "loss": 1.8298, + "step": 4886 + }, + { + "epoch": 2.240798073615411, + "grad_norm": 0.3669508993625641, + "learning_rate": 0.00015877851497540085, + "loss": 1.6097, + "step": 4887 + }, + { + "epoch": 2.241256736612774, + "grad_norm": 0.2940203845500946, + "learning_rate": 0.00015859754531819028, + "loss": 0.3973, + "step": 4888 + }, + { + "epoch": 2.241715399610136, + "grad_norm": 0.19039645791053772, + "learning_rate": 0.00015841665940946343, + "loss": 1.3714, + "step": 4889 + }, + { + "epoch": 2.242174062607499, + "grad_norm": 0.3116855025291443, + "learning_rate": 0.00015823585729359314, + "loss": 1.6032, + "step": 4890 + }, + { + "epoch": 2.242632725604862, + "grad_norm": 0.32782799005508423, + "learning_rate": 0.00015805513901493118, + "loss": 1.1902, + "step": 4891 + }, + { + "epoch": 2.2430913886022243, + "grad_norm": 0.1876303255558014, + "learning_rate": 0.0001578745046178091, + "loss": 0.6099, + "step": 4892 + }, + { + "epoch": 2.243550051599587, + "grad_norm": 0.32714807987213135, + "learning_rate": 0.00015769395414653797, + "loss": 1.6815, + "step": 4893 + }, + { + "epoch": 2.24400871459695, + "grad_norm": 0.2183062732219696, + "learning_rate": 0.00015751348764540802, + "loss": 0.6329, + "step": 4894 + }, + { + "epoch": 2.2444673775943125, + "grad_norm": 0.12229790538549423, + "learning_rate": 0.00015733310515868897, + "loss": 1.269, + "step": 4895 + }, + { + "epoch": 2.2449260405916753, + "grad_norm": 0.38500019907951355, + "learning_rate": 0.00015715280673062997, + "loss": 1.7126, + "step": 4896 + }, + { + "epoch": 2.245384703589038, + "grad_norm": 0.33836498856544495, + "learning_rate": 0.00015697259240545958, + "loss": 1.7375, + "step": 4897 + }, + { + "epoch": 2.2458433665864006, + "grad_norm": 0.4215908646583557, + "learning_rate": 0.00015679246222738562, + "loss": 2.0007, + "step": 4898 + }, + { + "epoch": 2.2463020295837635, + "grad_norm": 0.38347312808036804, + "learning_rate": 0.0001566124162405953, + "loss": 1.9561, + "step": 4899 + }, + { + "epoch": 2.246760692581126, + "grad_norm": 0.34876549243927, + "learning_rate": 0.0001564324544892553, + "loss": 1.7222, + "step": 4900 + }, + { + "epoch": 2.2472193555784887, + "grad_norm": 0.29349571466445923, + "learning_rate": 0.00015625257701751155, + "loss": 1.0293, + "step": 4901 + }, + { + "epoch": 2.2476780185758516, + "grad_norm": 0.3368821442127228, + "learning_rate": 0.00015607278386948909, + "loss": 1.2804, + "step": 4902 + }, + { + "epoch": 2.248136681573214, + "grad_norm": 0.3242160975933075, + "learning_rate": 0.00015589307508929258, + "loss": 1.6011, + "step": 4903 + }, + { + "epoch": 2.248595344570577, + "grad_norm": 0.2980966567993164, + "learning_rate": 0.0001557134507210059, + "loss": 1.2891, + "step": 4904 + }, + { + "epoch": 2.2490540075679393, + "grad_norm": 0.32442331314086914, + "learning_rate": 0.00015553391080869218, + "loss": 1.694, + "step": 4905 + }, + { + "epoch": 2.249512670565302, + "grad_norm": 0.328112930059433, + "learning_rate": 0.00015535445539639382, + "loss": 1.6802, + "step": 4906 + }, + { + "epoch": 2.249971333562665, + "grad_norm": 0.275643914937973, + "learning_rate": 0.0001551750845281326, + "loss": 1.1635, + "step": 4907 + }, + { + "epoch": 2.2504299965600274, + "grad_norm": 0.31056198477745056, + "learning_rate": 0.00015499579824790948, + "loss": 1.6135, + "step": 4908 + }, + { + "epoch": 2.2508886595573903, + "grad_norm": 0.31692421436309814, + "learning_rate": 0.0001548165965997047, + "loss": 1.5069, + "step": 4909 + }, + { + "epoch": 2.2513473225547527, + "grad_norm": 0.3236372768878937, + "learning_rate": 0.00015463747962747766, + "loss": 1.6261, + "step": 4910 + }, + { + "epoch": 2.2518059855521155, + "grad_norm": 0.3909819722175598, + "learning_rate": 0.000154458447375167, + "loss": 2.105, + "step": 4911 + }, + { + "epoch": 2.2522646485494784, + "grad_norm": 0.3079122304916382, + "learning_rate": 0.00015427949988669088, + "loss": 1.2053, + "step": 4912 + }, + { + "epoch": 2.252723311546841, + "grad_norm": 0.3336406946182251, + "learning_rate": 0.00015410063720594603, + "loss": 1.9375, + "step": 4913 + }, + { + "epoch": 2.2531819745442037, + "grad_norm": 0.38423246145248413, + "learning_rate": 0.00015392185937680898, + "loss": 1.4344, + "step": 4914 + }, + { + "epoch": 2.2536406375415665, + "grad_norm": 0.3887491822242737, + "learning_rate": 0.00015374316644313512, + "loss": 1.4943, + "step": 4915 + }, + { + "epoch": 2.254099300538929, + "grad_norm": 0.21756817400455475, + "learning_rate": 0.00015356455844875905, + "loss": 0.9165, + "step": 4916 + }, + { + "epoch": 2.254557963536292, + "grad_norm": 0.4763137102127075, + "learning_rate": 0.0001533860354374949, + "loss": 1.7384, + "step": 4917 + }, + { + "epoch": 2.2550166265336546, + "grad_norm": 0.3908590078353882, + "learning_rate": 0.00015320759745313562, + "loss": 1.2142, + "step": 4918 + }, + { + "epoch": 2.255475289531017, + "grad_norm": 0.38368988037109375, + "learning_rate": 0.000153029244539453, + "loss": 1.4749, + "step": 4919 + }, + { + "epoch": 2.25593395252838, + "grad_norm": 0.1392853558063507, + "learning_rate": 0.0001528509767401985, + "loss": 0.4752, + "step": 4920 + }, + { + "epoch": 2.2563926155257423, + "grad_norm": 0.2166876643896103, + "learning_rate": 0.00015267279409910252, + "loss": 1.1606, + "step": 4921 + }, + { + "epoch": 2.256851278523105, + "grad_norm": 0.2793666124343872, + "learning_rate": 0.0001524946966598744, + "loss": 1.2599, + "step": 4922 + }, + { + "epoch": 2.257309941520468, + "grad_norm": 0.2081069052219391, + "learning_rate": 0.0001523166844662031, + "loss": 1.1284, + "step": 4923 + }, + { + "epoch": 2.2577686045178305, + "grad_norm": 0.2551545202732086, + "learning_rate": 0.00015213875756175583, + "loss": 1.6474, + "step": 4924 + }, + { + "epoch": 2.2582272675151933, + "grad_norm": 0.412919819355011, + "learning_rate": 0.00015196091599017951, + "loss": 1.2211, + "step": 4925 + }, + { + "epoch": 2.2586859305125557, + "grad_norm": 0.5170382261276245, + "learning_rate": 0.00015178315979509988, + "loss": 1.3411, + "step": 4926 + }, + { + "epoch": 2.2591445935099186, + "grad_norm": 0.23865391314029694, + "learning_rate": 0.00015160548902012205, + "loss": 1.4869, + "step": 4927 + }, + { + "epoch": 2.2596032565072814, + "grad_norm": 0.31659218668937683, + "learning_rate": 0.00015142790370882987, + "loss": 0.9003, + "step": 4928 + }, + { + "epoch": 2.260061919504644, + "grad_norm": 0.09430176764726639, + "learning_rate": 0.00015125040390478634, + "loss": 0.4091, + "step": 4929 + }, + { + "epoch": 2.2605205825020067, + "grad_norm": 0.2541019320487976, + "learning_rate": 0.0001510729896515332, + "loss": 1.455, + "step": 4930 + }, + { + "epoch": 2.260979245499369, + "grad_norm": 0.25227102637290955, + "learning_rate": 0.00015089566099259162, + "loss": 1.4478, + "step": 4931 + }, + { + "epoch": 2.261437908496732, + "grad_norm": 0.37687572836875916, + "learning_rate": 0.00015071841797146152, + "loss": 1.6046, + "step": 4932 + }, + { + "epoch": 2.261896571494095, + "grad_norm": 0.25936609506607056, + "learning_rate": 0.000150541260631622, + "loss": 1.2579, + "step": 4933 + }, + { + "epoch": 2.2623552344914573, + "grad_norm": 0.3484478294849396, + "learning_rate": 0.0001503641890165311, + "loss": 1.8318, + "step": 4934 + }, + { + "epoch": 2.26281389748882, + "grad_norm": 0.33855557441711426, + "learning_rate": 0.00015018720316962536, + "loss": 1.6309, + "step": 4935 + }, + { + "epoch": 2.2632725604861825, + "grad_norm": 0.3302558958530426, + "learning_rate": 0.00015001030313432107, + "loss": 1.1351, + "step": 4936 + }, + { + "epoch": 2.2637312234835454, + "grad_norm": 0.18633894622325897, + "learning_rate": 0.000149833488954013, + "loss": 1.4874, + "step": 4937 + }, + { + "epoch": 2.2641898864809082, + "grad_norm": 0.3469991981983185, + "learning_rate": 0.00014965676067207496, + "loss": 1.4942, + "step": 4938 + }, + { + "epoch": 2.2646485494782707, + "grad_norm": 0.39552754163742065, + "learning_rate": 0.0001494801183318596, + "loss": 1.8694, + "step": 4939 + }, + { + "epoch": 2.2651072124756335, + "grad_norm": 0.26547738909721375, + "learning_rate": 0.0001493035619766987, + "loss": 0.8757, + "step": 4940 + }, + { + "epoch": 2.2655658754729964, + "grad_norm": 0.39476221799850464, + "learning_rate": 0.00014912709164990263, + "loss": 1.9812, + "step": 4941 + }, + { + "epoch": 2.266024538470359, + "grad_norm": 0.37706848978996277, + "learning_rate": 0.00014895070739476087, + "loss": 0.931, + "step": 4942 + }, + { + "epoch": 2.2664832014677216, + "grad_norm": 0.36269137263298035, + "learning_rate": 0.00014877440925454172, + "loss": 1.2654, + "step": 4943 + }, + { + "epoch": 2.2669418644650845, + "grad_norm": 0.23204423487186432, + "learning_rate": 0.0001485981972724925, + "loss": 1.6955, + "step": 4944 + }, + { + "epoch": 2.267400527462447, + "grad_norm": 0.4347366392612457, + "learning_rate": 0.00014842207149183922, + "loss": 1.5891, + "step": 4945 + }, + { + "epoch": 2.2678591904598098, + "grad_norm": 0.2783931493759155, + "learning_rate": 0.00014824603195578683, + "loss": 1.1357, + "step": 4946 + }, + { + "epoch": 2.268317853457172, + "grad_norm": 0.31447914242744446, + "learning_rate": 0.00014807007870751908, + "loss": 1.8933, + "step": 4947 + }, + { + "epoch": 2.268776516454535, + "grad_norm": 0.41312742233276367, + "learning_rate": 0.00014789421179019858, + "loss": 1.8218, + "step": 4948 + }, + { + "epoch": 2.269235179451898, + "grad_norm": 0.36053013801574707, + "learning_rate": 0.0001477184312469667, + "loss": 1.2607, + "step": 4949 + }, + { + "epoch": 2.2696938424492603, + "grad_norm": 0.28373968601226807, + "learning_rate": 0.00014754273712094373, + "loss": 1.4863, + "step": 4950 + }, + { + "epoch": 2.270152505446623, + "grad_norm": 0.3220798671245575, + "learning_rate": 0.00014736712945522884, + "loss": 1.7179, + "step": 4951 + }, + { + "epoch": 2.2706111684439856, + "grad_norm": 0.29745230078697205, + "learning_rate": 0.00014719160829289958, + "loss": 1.5435, + "step": 4952 + }, + { + "epoch": 2.2710698314413484, + "grad_norm": 0.32250505685806274, + "learning_rate": 0.0001470161736770127, + "loss": 1.0081, + "step": 4953 + }, + { + "epoch": 2.2715284944387113, + "grad_norm": 0.29016774892807007, + "learning_rate": 0.00014684082565060352, + "loss": 1.2257, + "step": 4954 + }, + { + "epoch": 2.2719871574360737, + "grad_norm": 0.3230065107345581, + "learning_rate": 0.00014666556425668625, + "loss": 1.8289, + "step": 4955 + }, + { + "epoch": 2.2724458204334366, + "grad_norm": 0.2680938243865967, + "learning_rate": 0.00014649038953825372, + "loss": 0.6338, + "step": 4956 + }, + { + "epoch": 2.272904483430799, + "grad_norm": 0.1309625804424286, + "learning_rate": 0.00014631530153827755, + "loss": 1.0811, + "step": 4957 + }, + { + "epoch": 2.273363146428162, + "grad_norm": 0.36883193254470825, + "learning_rate": 0.00014614030029970815, + "loss": 1.3616, + "step": 4958 + }, + { + "epoch": 2.2738218094255247, + "grad_norm": 0.2308487594127655, + "learning_rate": 0.00014596538586547454, + "loss": 1.6417, + "step": 4959 + }, + { + "epoch": 2.274280472422887, + "grad_norm": 0.38618066906929016, + "learning_rate": 0.00014579055827848448, + "loss": 1.0165, + "step": 4960 + }, + { + "epoch": 2.27473913542025, + "grad_norm": 0.22514605522155762, + "learning_rate": 0.0001456158175816245, + "loss": 1.1064, + "step": 4961 + }, + { + "epoch": 2.275197798417613, + "grad_norm": 0.34494972229003906, + "learning_rate": 0.00014544116381775985, + "loss": 1.1996, + "step": 4962 + }, + { + "epoch": 2.2756564614149752, + "grad_norm": 0.2711491882801056, + "learning_rate": 0.000145266597029734, + "loss": 1.4014, + "step": 4963 + }, + { + "epoch": 2.276115124412338, + "grad_norm": 0.2802002727985382, + "learning_rate": 0.00014509211726036975, + "loss": 1.1682, + "step": 4964 + }, + { + "epoch": 2.276573787409701, + "grad_norm": 0.2963082194328308, + "learning_rate": 0.0001449177245524681, + "loss": 1.387, + "step": 4965 + }, + { + "epoch": 2.2770324504070634, + "grad_norm": 0.33113130927085876, + "learning_rate": 0.00014474341894880888, + "loss": 1.2594, + "step": 4966 + }, + { + "epoch": 2.2774911134044262, + "grad_norm": 0.37912243604660034, + "learning_rate": 0.00014456920049215054, + "loss": 1.7935, + "step": 4967 + }, + { + "epoch": 2.2779497764017886, + "grad_norm": 0.3738153278827667, + "learning_rate": 0.00014439506922523016, + "loss": 1.233, + "step": 4968 + }, + { + "epoch": 2.2784084393991515, + "grad_norm": 0.280269980430603, + "learning_rate": 0.0001442210251907633, + "loss": 1.2351, + "step": 4969 + }, + { + "epoch": 2.2788671023965144, + "grad_norm": 0.3059045374393463, + "learning_rate": 0.00014404706843144423, + "loss": 1.5538, + "step": 4970 + }, + { + "epoch": 2.2793257653938768, + "grad_norm": 0.22319234907627106, + "learning_rate": 0.0001438731989899459, + "loss": 0.9915, + "step": 4971 + }, + { + "epoch": 2.2797844283912396, + "grad_norm": 0.2013273686170578, + "learning_rate": 0.00014369941690891959, + "loss": 0.5437, + "step": 4972 + }, + { + "epoch": 2.280243091388602, + "grad_norm": 0.33141711354255676, + "learning_rate": 0.00014352572223099542, + "loss": 1.1842, + "step": 4973 + }, + { + "epoch": 2.280701754385965, + "grad_norm": 0.24733178317546844, + "learning_rate": 0.00014335211499878203, + "loss": 1.5018, + "step": 4974 + }, + { + "epoch": 2.2811604173833278, + "grad_norm": 0.32721972465515137, + "learning_rate": 0.00014317859525486625, + "loss": 1.2133, + "step": 4975 + }, + { + "epoch": 2.28161908038069, + "grad_norm": 0.3684854209423065, + "learning_rate": 0.00014300516304181389, + "loss": 1.7316, + "step": 4976 + }, + { + "epoch": 2.282077743378053, + "grad_norm": 0.22372929751873016, + "learning_rate": 0.0001428318184021691, + "loss": 0.6982, + "step": 4977 + }, + { + "epoch": 2.2825364063754154, + "grad_norm": 0.31195321679115295, + "learning_rate": 0.00014265856137845434, + "loss": 1.6587, + "step": 4978 + }, + { + "epoch": 2.2829950693727783, + "grad_norm": 0.2723373472690582, + "learning_rate": 0.0001424853920131714, + "loss": 1.2561, + "step": 4979 + }, + { + "epoch": 2.283453732370141, + "grad_norm": 0.4274025559425354, + "learning_rate": 0.0001423123103487995, + "loss": 2.2205, + "step": 4980 + }, + { + "epoch": 2.2839123953675036, + "grad_norm": 0.3065417408943176, + "learning_rate": 0.00014213931642779686, + "loss": 0.9494, + "step": 4981 + }, + { + "epoch": 2.2843710583648664, + "grad_norm": 0.21199515461921692, + "learning_rate": 0.00014196641029260026, + "loss": 0.624, + "step": 4982 + }, + { + "epoch": 2.2848297213622293, + "grad_norm": 0.23199045658111572, + "learning_rate": 0.00014179359198562475, + "loss": 1.1652, + "step": 4983 + }, + { + "epoch": 2.2852883843595917, + "grad_norm": 0.29177722334861755, + "learning_rate": 0.00014162086154926397, + "loss": 1.4468, + "step": 4984 + }, + { + "epoch": 2.2857470473569546, + "grad_norm": 0.37286636233329773, + "learning_rate": 0.00014144821902589, + "loss": 1.9711, + "step": 4985 + }, + { + "epoch": 2.2862057103543174, + "grad_norm": 0.4224682152271271, + "learning_rate": 0.00014127566445785306, + "loss": 1.3417, + "step": 4986 + }, + { + "epoch": 2.28666437335168, + "grad_norm": 0.3329372704029083, + "learning_rate": 0.00014110319788748215, + "loss": 1.4227, + "step": 4987 + }, + { + "epoch": 2.2871230363490427, + "grad_norm": 0.30778220295906067, + "learning_rate": 0.00014093081935708445, + "loss": 1.3458, + "step": 4988 + }, + { + "epoch": 2.287581699346405, + "grad_norm": 0.34832048416137695, + "learning_rate": 0.0001407585289089459, + "loss": 1.2986, + "step": 4989 + }, + { + "epoch": 2.288040362343768, + "grad_norm": 0.32428908348083496, + "learning_rate": 0.00014058632658533072, + "loss": 1.9873, + "step": 4990 + }, + { + "epoch": 2.288499025341131, + "grad_norm": 0.39319339394569397, + "learning_rate": 0.0001404142124284809, + "loss": 1.6913, + "step": 4991 + }, + { + "epoch": 2.2889576883384932, + "grad_norm": 0.3040050268173218, + "learning_rate": 0.00014024218648061755, + "loss": 1.1154, + "step": 4992 + }, + { + "epoch": 2.289416351335856, + "grad_norm": 0.4540446102619171, + "learning_rate": 0.00014007024878393983, + "loss": 1.297, + "step": 4993 + }, + { + "epoch": 2.2898750143332185, + "grad_norm": 0.29186972975730896, + "learning_rate": 0.0001398983993806253, + "loss": 1.6358, + "step": 4994 + }, + { + "epoch": 2.2903336773305814, + "grad_norm": 0.25946202874183655, + "learning_rate": 0.0001397266383128299, + "loss": 1.522, + "step": 4995 + }, + { + "epoch": 2.290792340327944, + "grad_norm": 0.27808359265327454, + "learning_rate": 0.00013955496562268794, + "loss": 1.1537, + "step": 4996 + }, + { + "epoch": 2.2912510033253066, + "grad_norm": 0.34630027413368225, + "learning_rate": 0.0001393833813523117, + "loss": 1.5734, + "step": 4997 + }, + { + "epoch": 2.2917096663226695, + "grad_norm": 0.2861468195915222, + "learning_rate": 0.0001392118855437921, + "loss": 0.5451, + "step": 4998 + }, + { + "epoch": 2.292168329320032, + "grad_norm": 0.3561016917228699, + "learning_rate": 0.0001390404782391985, + "loss": 1.2978, + "step": 4999 + }, + { + "epoch": 2.2926269923173948, + "grad_norm": 0.2726607024669647, + "learning_rate": 0.00013886915948057825, + "loss": 1.256, + "step": 5000 + }, + { + "epoch": 2.2930856553147576, + "grad_norm": 0.41800275444984436, + "learning_rate": 0.00013869792930995724, + "loss": 1.5729, + "step": 5001 + }, + { + "epoch": 2.29354431831212, + "grad_norm": 0.3518107533454895, + "learning_rate": 0.00013852678776933914, + "loss": 1.1977, + "step": 5002 + }, + { + "epoch": 2.294002981309483, + "grad_norm": 0.14994703233242035, + "learning_rate": 0.0001383557349007063, + "loss": 0.6909, + "step": 5003 + }, + { + "epoch": 2.2944616443068457, + "grad_norm": 0.2874891459941864, + "learning_rate": 0.00013818477074601933, + "loss": 2.2263, + "step": 5004 + }, + { + "epoch": 2.294920307304208, + "grad_norm": 0.41052594780921936, + "learning_rate": 0.00013801389534721692, + "loss": 1.6545, + "step": 5005 + }, + { + "epoch": 2.295378970301571, + "grad_norm": 0.6844239830970764, + "learning_rate": 0.00013784310874621604, + "loss": 1.9092, + "step": 5006 + }, + { + "epoch": 2.2958376332989334, + "grad_norm": 0.21883147954940796, + "learning_rate": 0.00013767241098491185, + "loss": 0.4407, + "step": 5007 + }, + { + "epoch": 2.2962962962962963, + "grad_norm": 0.06891786307096481, + "learning_rate": 0.00013750180210517777, + "loss": 0.7355, + "step": 5008 + }, + { + "epoch": 2.296754959293659, + "grad_norm": 0.2810133099555969, + "learning_rate": 0.00013733128214886536, + "loss": 0.9875, + "step": 5009 + }, + { + "epoch": 2.2972136222910216, + "grad_norm": 0.28389817476272583, + "learning_rate": 0.00013716085115780447, + "loss": 1.6359, + "step": 5010 + }, + { + "epoch": 2.2976722852883844, + "grad_norm": 0.43318691849708557, + "learning_rate": 0.00013699050917380295, + "loss": 1.6229, + "step": 5011 + }, + { + "epoch": 2.2981309482857473, + "grad_norm": 0.36424124240875244, + "learning_rate": 0.00013682025623864698, + "loss": 1.8982, + "step": 5012 + }, + { + "epoch": 2.2985896112831097, + "grad_norm": 0.3689767122268677, + "learning_rate": 0.00013665009239410098, + "loss": 1.2045, + "step": 5013 + }, + { + "epoch": 2.2990482742804725, + "grad_norm": 0.28414589166641235, + "learning_rate": 0.00013648001768190699, + "loss": 1.5854, + "step": 5014 + }, + { + "epoch": 2.299506937277835, + "grad_norm": 0.3238101601600647, + "learning_rate": 0.00013631003214378584, + "loss": 0.8048, + "step": 5015 + }, + { + "epoch": 2.299965600275198, + "grad_norm": 0.25493547320365906, + "learning_rate": 0.00013614013582143614, + "loss": 1.5388, + "step": 5016 + }, + { + "epoch": 2.3004242632725607, + "grad_norm": 0.3472574055194855, + "learning_rate": 0.0001359703287565347, + "loss": 1.3253, + "step": 5017 + }, + { + "epoch": 2.300882926269923, + "grad_norm": 0.21940115094184875, + "learning_rate": 0.00013580061099073638, + "loss": 1.473, + "step": 5018 + }, + { + "epoch": 2.301341589267286, + "grad_norm": 0.33092114329338074, + "learning_rate": 0.0001356309825656742, + "loss": 1.5323, + "step": 5019 + }, + { + "epoch": 2.3018002522646483, + "grad_norm": 0.3832414150238037, + "learning_rate": 0.0001354614435229592, + "loss": 1.3107, + "step": 5020 + }, + { + "epoch": 2.302258915262011, + "grad_norm": 0.27697277069091797, + "learning_rate": 0.0001352919939041806, + "loss": 1.5306, + "step": 5021 + }, + { + "epoch": 2.302717578259374, + "grad_norm": 0.30245643854141235, + "learning_rate": 0.00013512263375090562, + "loss": 1.4718, + "step": 5022 + }, + { + "epoch": 2.3031762412567365, + "grad_norm": 0.3190750777721405, + "learning_rate": 0.00013495336310467943, + "loss": 1.4829, + "step": 5023 + }, + { + "epoch": 2.3036349042540993, + "grad_norm": 0.2573975622653961, + "learning_rate": 0.00013478418200702552, + "loss": 0.5235, + "step": 5024 + }, + { + "epoch": 2.3040935672514617, + "grad_norm": 0.13736076653003693, + "learning_rate": 0.00013461509049944497, + "loss": 0.5812, + "step": 5025 + }, + { + "epoch": 2.3045522302488246, + "grad_norm": 0.19605790078639984, + "learning_rate": 0.00013444608862341734, + "loss": 1.2568, + "step": 5026 + }, + { + "epoch": 2.3050108932461875, + "grad_norm": 0.25617191195487976, + "learning_rate": 0.00013427717642039988, + "loss": 1.5799, + "step": 5027 + }, + { + "epoch": 2.30546955624355, + "grad_norm": 0.3717952370643616, + "learning_rate": 0.00013410835393182807, + "loss": 1.3069, + "step": 5028 + }, + { + "epoch": 2.3059282192409127, + "grad_norm": 0.37081441283226013, + "learning_rate": 0.00013393962119911528, + "loss": 1.7627, + "step": 5029 + }, + { + "epoch": 2.3063868822382756, + "grad_norm": 0.3451120853424072, + "learning_rate": 0.0001337709782636528, + "loss": 1.7584, + "step": 5030 + }, + { + "epoch": 2.306845545235638, + "grad_norm": 0.326267272233963, + "learning_rate": 0.00013360242516681004, + "loss": 1.0427, + "step": 5031 + }, + { + "epoch": 2.307304208233001, + "grad_norm": 0.2787007987499237, + "learning_rate": 0.00013343396194993423, + "loss": 1.3569, + "step": 5032 + }, + { + "epoch": 2.3077628712303637, + "grad_norm": 0.2687190771102905, + "learning_rate": 0.0001332655886543506, + "loss": 1.8645, + "step": 5033 + }, + { + "epoch": 2.308221534227726, + "grad_norm": 0.3988434970378876, + "learning_rate": 0.00013309730532136245, + "loss": 2.0012, + "step": 5034 + }, + { + "epoch": 2.308680197225089, + "grad_norm": 0.1921907663345337, + "learning_rate": 0.0001329291119922509, + "loss": 1.2577, + "step": 5035 + }, + { + "epoch": 2.3091388602224514, + "grad_norm": 0.4280322790145874, + "learning_rate": 0.00013276100870827473, + "loss": 1.693, + "step": 5036 + }, + { + "epoch": 2.3095975232198143, + "grad_norm": 0.34726113080978394, + "learning_rate": 0.00013259299551067106, + "loss": 0.8334, + "step": 5037 + }, + { + "epoch": 2.310056186217177, + "grad_norm": 0.1919124275445938, + "learning_rate": 0.00013242507244065477, + "loss": 1.77, + "step": 5038 + }, + { + "epoch": 2.3105148492145395, + "grad_norm": 0.3817395865917206, + "learning_rate": 0.00013225723953941854, + "loss": 1.9272, + "step": 5039 + }, + { + "epoch": 2.3109735122119024, + "grad_norm": 0.41375109553337097, + "learning_rate": 0.00013208949684813287, + "loss": 1.5177, + "step": 5040 + }, + { + "epoch": 2.311432175209265, + "grad_norm": 0.29578983783721924, + "learning_rate": 0.00013192184440794668, + "loss": 1.3288, + "step": 5041 + }, + { + "epoch": 2.3118908382066277, + "grad_norm": 0.19934503734111786, + "learning_rate": 0.00013175428225998593, + "loss": 0.9299, + "step": 5042 + }, + { + "epoch": 2.3123495012039905, + "grad_norm": 0.2743481397628784, + "learning_rate": 0.00013158681044535487, + "loss": 1.5061, + "step": 5043 + }, + { + "epoch": 2.312808164201353, + "grad_norm": 0.44560080766677856, + "learning_rate": 0.00013141942900513564, + "loss": 0.9513, + "step": 5044 + }, + { + "epoch": 2.313266827198716, + "grad_norm": 0.21826134622097015, + "learning_rate": 0.0001312521379803881, + "loss": 0.941, + "step": 5045 + }, + { + "epoch": 2.313725490196078, + "grad_norm": 0.16389012336730957, + "learning_rate": 0.00013108493741215, + "loss": 1.3627, + "step": 5046 + }, + { + "epoch": 2.314184153193441, + "grad_norm": 0.3645002543926239, + "learning_rate": 0.00013091782734143671, + "loss": 1.5707, + "step": 5047 + }, + { + "epoch": 2.314642816190804, + "grad_norm": 0.3624444901943207, + "learning_rate": 0.00013075080780924154, + "loss": 2.1348, + "step": 5048 + }, + { + "epoch": 2.3151014791881663, + "grad_norm": 0.3850797712802887, + "learning_rate": 0.00013058387885653562, + "loss": 1.4394, + "step": 5049 + }, + { + "epoch": 2.315560142185529, + "grad_norm": 0.281230092048645, + "learning_rate": 0.00013041704052426772, + "loss": 1.6204, + "step": 5050 + }, + { + "epoch": 2.316018805182892, + "grad_norm": 0.2876693308353424, + "learning_rate": 0.00013025029285336476, + "loss": 1.1374, + "step": 5051 + }, + { + "epoch": 2.3164774681802545, + "grad_norm": 0.3920785188674927, + "learning_rate": 0.00013008363588473115, + "loss": 1.639, + "step": 5052 + }, + { + "epoch": 2.3169361311776173, + "grad_norm": 0.2921794652938843, + "learning_rate": 0.00012991706965924876, + "loss": 1.5347, + "step": 5053 + }, + { + "epoch": 2.31739479417498, + "grad_norm": 0.26020175218582153, + "learning_rate": 0.00012975059421777759, + "loss": 1.1937, + "step": 5054 + }, + { + "epoch": 2.3178534571723426, + "grad_norm": 0.36081263422966003, + "learning_rate": 0.0001295842096011553, + "loss": 1.3285, + "step": 5055 + }, + { + "epoch": 2.3183121201697054, + "grad_norm": 0.48956963419914246, + "learning_rate": 0.00012941791585019725, + "loss": 1.4641, + "step": 5056 + }, + { + "epoch": 2.318770783167068, + "grad_norm": 0.1845049411058426, + "learning_rate": 0.0001292517130056966, + "loss": 1.1712, + "step": 5057 + }, + { + "epoch": 2.3192294461644307, + "grad_norm": 0.35714197158813477, + "learning_rate": 0.00012908560110842383, + "loss": 1.4028, + "step": 5058 + }, + { + "epoch": 2.3196881091617936, + "grad_norm": 0.2765589654445648, + "learning_rate": 0.00012891958019912758, + "loss": 0.852, + "step": 5059 + }, + { + "epoch": 2.320146772159156, + "grad_norm": 0.32246285676956177, + "learning_rate": 0.00012875365031853376, + "loss": 1.725, + "step": 5060 + }, + { + "epoch": 2.320605435156519, + "grad_norm": 0.3229038715362549, + "learning_rate": 0.0001285878115073465, + "loss": 1.4902, + "step": 5061 + }, + { + "epoch": 2.3210640981538813, + "grad_norm": 0.24635767936706543, + "learning_rate": 0.0001284220638062471, + "loss": 0.9351, + "step": 5062 + }, + { + "epoch": 2.321522761151244, + "grad_norm": 0.2740919589996338, + "learning_rate": 0.00012825640725589477, + "loss": 1.7063, + "step": 5063 + }, + { + "epoch": 2.321981424148607, + "grad_norm": 0.3636001646518707, + "learning_rate": 0.00012809084189692604, + "loss": 1.3257, + "step": 5064 + }, + { + "epoch": 2.3224400871459694, + "grad_norm": 0.27342599630355835, + "learning_rate": 0.0001279253677699554, + "loss": 1.2865, + "step": 5065 + }, + { + "epoch": 2.3228987501433322, + "grad_norm": 0.3146760165691376, + "learning_rate": 0.00012775998491557485, + "loss": 1.28, + "step": 5066 + }, + { + "epoch": 2.3233574131406947, + "grad_norm": 0.22414685785770416, + "learning_rate": 0.00012759469337435397, + "loss": 1.2673, + "step": 5067 + }, + { + "epoch": 2.3238160761380575, + "grad_norm": 0.42230096459388733, + "learning_rate": 0.00012742949318684, + "loss": 1.0508, + "step": 5068 + }, + { + "epoch": 2.3242747391354204, + "grad_norm": 0.38766834139823914, + "learning_rate": 0.00012726438439355787, + "loss": 1.6228, + "step": 5069 + }, + { + "epoch": 2.324733402132783, + "grad_norm": 0.37745577096939087, + "learning_rate": 0.00012709936703500947, + "loss": 1.8116, + "step": 5070 + }, + { + "epoch": 2.3251920651301456, + "grad_norm": 0.4257441759109497, + "learning_rate": 0.0001269344411516753, + "loss": 1.1618, + "step": 5071 + }, + { + "epoch": 2.3256507281275085, + "grad_norm": 0.3758509159088135, + "learning_rate": 0.00012676960678401262, + "loss": 1.6102, + "step": 5072 + }, + { + "epoch": 2.326109391124871, + "grad_norm": 0.27729111909866333, + "learning_rate": 0.0001266048639724565, + "loss": 0.6324, + "step": 5073 + }, + { + "epoch": 2.3265680541222338, + "grad_norm": 0.27736055850982666, + "learning_rate": 0.0001264402127574198, + "loss": 1.5171, + "step": 5074 + }, + { + "epoch": 2.327026717119596, + "grad_norm": 0.2820827066898346, + "learning_rate": 0.0001262756531792922, + "loss": 1.6448, + "step": 5075 + }, + { + "epoch": 2.327485380116959, + "grad_norm": 0.31878018379211426, + "learning_rate": 0.0001261111852784416, + "loss": 0.9581, + "step": 5076 + }, + { + "epoch": 2.327944043114322, + "grad_norm": 0.32956355810165405, + "learning_rate": 0.0001259468090952131, + "loss": 1.3363, + "step": 5077 + }, + { + "epoch": 2.3284027061116843, + "grad_norm": 0.33068135380744934, + "learning_rate": 0.0001257825246699294, + "loss": 0.8995, + "step": 5078 + }, + { + "epoch": 2.328861369109047, + "grad_norm": 0.2901584506034851, + "learning_rate": 0.0001256183320428907, + "loss": 1.6243, + "step": 5079 + }, + { + "epoch": 2.32932003210641, + "grad_norm": 0.2884989380836487, + "learning_rate": 0.0001254542312543745, + "loss": 1.683, + "step": 5080 + }, + { + "epoch": 2.3297786951037724, + "grad_norm": 0.38220369815826416, + "learning_rate": 0.00012529022234463604, + "loss": 1.7488, + "step": 5081 + }, + { + "epoch": 2.3302373581011353, + "grad_norm": 0.3322943150997162, + "learning_rate": 0.00012512630535390783, + "loss": 1.9446, + "step": 5082 + }, + { + "epoch": 2.3306960210984977, + "grad_norm": 0.3463505804538727, + "learning_rate": 0.00012496248032239988, + "loss": 1.0688, + "step": 5083 + }, + { + "epoch": 2.3311546840958606, + "grad_norm": 0.240933358669281, + "learning_rate": 0.00012479874729029968, + "loss": 1.6491, + "step": 5084 + }, + { + "epoch": 2.3316133470932234, + "grad_norm": 0.4334370195865631, + "learning_rate": 0.00012463510629777226, + "loss": 1.3899, + "step": 5085 + }, + { + "epoch": 2.332072010090586, + "grad_norm": 0.2391756922006607, + "learning_rate": 0.00012447155738495963, + "loss": 0.608, + "step": 5086 + }, + { + "epoch": 2.3325306730879487, + "grad_norm": 0.2066376507282257, + "learning_rate": 0.00012430810059198166, + "loss": 1.1892, + "step": 5087 + }, + { + "epoch": 2.332989336085311, + "grad_norm": 0.33126476407051086, + "learning_rate": 0.0001241447359589355, + "loss": 1.22, + "step": 5088 + }, + { + "epoch": 2.333447999082674, + "grad_norm": 0.29312899708747864, + "learning_rate": 0.00012398146352589568, + "loss": 1.2554, + "step": 5089 + }, + { + "epoch": 2.333906662080037, + "grad_norm": 0.24564377963542938, + "learning_rate": 0.0001238182833329141, + "loss": 1.3128, + "step": 5090 + }, + { + "epoch": 2.3343653250773992, + "grad_norm": 0.3976064920425415, + "learning_rate": 0.00012365519542002, + "loss": 1.4039, + "step": 5091 + }, + { + "epoch": 2.334823988074762, + "grad_norm": 0.31696027517318726, + "learning_rate": 0.0001234921998272201, + "loss": 2.1157, + "step": 5092 + }, + { + "epoch": 2.3352826510721245, + "grad_norm": 0.3862619400024414, + "learning_rate": 0.00012332929659449827, + "loss": 1.1534, + "step": 5093 + }, + { + "epoch": 2.3357413140694874, + "grad_norm": 0.2641201317310333, + "learning_rate": 0.000123166485761816, + "loss": 0.9848, + "step": 5094 + }, + { + "epoch": 2.3361999770668502, + "grad_norm": 0.3019377291202545, + "learning_rate": 0.00012300376736911183, + "loss": 1.233, + "step": 5095 + }, + { + "epoch": 2.3366586400642126, + "grad_norm": 0.280788779258728, + "learning_rate": 0.00012284114145630183, + "loss": 1.6123, + "step": 5096 + }, + { + "epoch": 2.3371173030615755, + "grad_norm": 0.3243066370487213, + "learning_rate": 0.0001226786080632794, + "loss": 0.5487, + "step": 5097 + }, + { + "epoch": 2.3375759660589384, + "grad_norm": 0.18734464049339294, + "learning_rate": 0.00012251616722991492, + "loss": 0.9184, + "step": 5098 + }, + { + "epoch": 2.3380346290563008, + "grad_norm": 0.17624834179878235, + "learning_rate": 0.0001223538189960564, + "loss": 1.5011, + "step": 5099 + }, + { + "epoch": 2.3384932920536636, + "grad_norm": 0.2665694057941437, + "learning_rate": 0.00012219156340152908, + "loss": 0.8928, + "step": 5100 + }, + { + "epoch": 2.3389519550510265, + "grad_norm": 0.2912820279598236, + "learning_rate": 0.00012202940048613542, + "loss": 1.5328, + "step": 5101 + }, + { + "epoch": 2.339410618048389, + "grad_norm": 0.3863449692726135, + "learning_rate": 0.00012186733028965507, + "loss": 1.4951, + "step": 5102 + }, + { + "epoch": 2.3398692810457518, + "grad_norm": 0.40611687302589417, + "learning_rate": 0.00012170535285184509, + "loss": 2.1716, + "step": 5103 + }, + { + "epoch": 2.340327944043114, + "grad_norm": 0.4273900091648102, + "learning_rate": 0.00012154346821243972, + "loss": 1.21, + "step": 5104 + }, + { + "epoch": 2.340786607040477, + "grad_norm": 0.3556171953678131, + "learning_rate": 0.0001213816764111504, + "loss": 1.3158, + "step": 5105 + }, + { + "epoch": 2.34124527003784, + "grad_norm": 0.28761526942253113, + "learning_rate": 0.00012121997748766583, + "loss": 1.6339, + "step": 5106 + }, + { + "epoch": 2.3417039330352023, + "grad_norm": 0.2865532338619232, + "learning_rate": 0.00012105837148165194, + "loss": 1.3912, + "step": 5107 + }, + { + "epoch": 2.342162596032565, + "grad_norm": 0.30358991026878357, + "learning_rate": 0.000120896858432752, + "loss": 1.1263, + "step": 5108 + }, + { + "epoch": 2.3426212590299276, + "grad_norm": 0.07187612354755402, + "learning_rate": 0.00012073543838058598, + "loss": 1.1824, + "step": 5109 + }, + { + "epoch": 2.3430799220272904, + "grad_norm": 0.2775716483592987, + "learning_rate": 0.00012057411136475161, + "loss": 1.1338, + "step": 5110 + }, + { + "epoch": 2.3435385850246533, + "grad_norm": 0.370094358921051, + "learning_rate": 0.00012041287742482348, + "loss": 0.8284, + "step": 5111 + }, + { + "epoch": 2.3439972480220157, + "grad_norm": 0.5262327790260315, + "learning_rate": 0.00012025173660035338, + "loss": 1.6775, + "step": 5112 + }, + { + "epoch": 2.3444559110193786, + "grad_norm": 0.36328423023223877, + "learning_rate": 0.00012009068893087067, + "loss": 1.453, + "step": 5113 + }, + { + "epoch": 2.344914574016741, + "grad_norm": 1.2442697286605835, + "learning_rate": 0.00011992973445588112, + "loss": 2.0186, + "step": 5114 + }, + { + "epoch": 2.345373237014104, + "grad_norm": 0.26644742488861084, + "learning_rate": 0.00011976887321486813, + "loss": 0.9504, + "step": 5115 + }, + { + "epoch": 2.3458319000114667, + "grad_norm": 0.31766730546951294, + "learning_rate": 0.00011960810524729222, + "loss": 1.277, + "step": 5116 + }, + { + "epoch": 2.346290563008829, + "grad_norm": 0.3006284236907959, + "learning_rate": 0.00011944743059259078, + "loss": 0.7456, + "step": 5117 + }, + { + "epoch": 2.346749226006192, + "grad_norm": 0.2082628756761551, + "learning_rate": 0.00011928684929017859, + "loss": 0.9155, + "step": 5118 + }, + { + "epoch": 2.347207889003555, + "grad_norm": 0.2231052815914154, + "learning_rate": 0.0001191263613794475, + "loss": 0.882, + "step": 5119 + }, + { + "epoch": 2.3476665520009172, + "grad_norm": 0.2642306983470917, + "learning_rate": 0.00011896596689976602, + "loss": 1.3383, + "step": 5120 + }, + { + "epoch": 2.34812521499828, + "grad_norm": 0.26055455207824707, + "learning_rate": 0.00011880566589048031, + "loss": 1.2426, + "step": 5121 + }, + { + "epoch": 2.348583877995643, + "grad_norm": 0.20619313418865204, + "learning_rate": 0.00011864545839091312, + "loss": 0.7382, + "step": 5122 + }, + { + "epoch": 2.3490425409930054, + "grad_norm": 0.19531986117362976, + "learning_rate": 0.00011848534444036485, + "loss": 1.091, + "step": 5123 + }, + { + "epoch": 2.349501203990368, + "grad_norm": 0.3989899158477783, + "learning_rate": 0.00011832532407811247, + "loss": 0.8412, + "step": 5124 + }, + { + "epoch": 2.3499598669877306, + "grad_norm": 0.2260282337665558, + "learning_rate": 0.00011816539734341025, + "loss": 1.4639, + "step": 5125 + }, + { + "epoch": 2.3504185299850935, + "grad_norm": 0.26231110095977783, + "learning_rate": 0.00011800556427548908, + "loss": 1.1874, + "step": 5126 + }, + { + "epoch": 2.3508771929824563, + "grad_norm": 0.19405876100063324, + "learning_rate": 0.00011784582491355727, + "loss": 0.5713, + "step": 5127 + }, + { + "epoch": 2.3513358559798188, + "grad_norm": 0.2558598220348358, + "learning_rate": 0.00011768617929680014, + "loss": 1.4579, + "step": 5128 + }, + { + "epoch": 2.3517945189771816, + "grad_norm": 0.2595175504684448, + "learning_rate": 0.00011752662746437986, + "loss": 1.4692, + "step": 5129 + }, + { + "epoch": 2.352253181974544, + "grad_norm": 0.4527866244316101, + "learning_rate": 0.0001173671694554358, + "loss": 1.7926, + "step": 5130 + }, + { + "epoch": 2.352711844971907, + "grad_norm": 0.24965371191501617, + "learning_rate": 0.00011720780530908381, + "loss": 0.972, + "step": 5131 + }, + { + "epoch": 2.3531705079692697, + "grad_norm": 0.21504342555999756, + "learning_rate": 0.0001170485350644171, + "loss": 0.7318, + "step": 5132 + }, + { + "epoch": 2.353629170966632, + "grad_norm": 0.46009358763694763, + "learning_rate": 0.00011688935876050616, + "loss": 1.8917, + "step": 5133 + }, + { + "epoch": 2.354087833963995, + "grad_norm": 0.35450685024261475, + "learning_rate": 0.00011673027643639784, + "loss": 0.7528, + "step": 5134 + }, + { + "epoch": 2.3545464969613574, + "grad_norm": 0.10655711591243744, + "learning_rate": 0.00011657128813111622, + "loss": 0.8409, + "step": 5135 + }, + { + "epoch": 2.3550051599587203, + "grad_norm": 0.40161076188087463, + "learning_rate": 0.00011641239388366249, + "loss": 1.6137, + "step": 5136 + }, + { + "epoch": 2.355463822956083, + "grad_norm": 0.34276244044303894, + "learning_rate": 0.00011625359373301414, + "loss": 0.9094, + "step": 5137 + }, + { + "epoch": 2.3559224859534456, + "grad_norm": 0.3367175757884979, + "learning_rate": 0.00011609488771812621, + "loss": 1.2706, + "step": 5138 + }, + { + "epoch": 2.3563811489508084, + "grad_norm": 0.27182361483573914, + "learning_rate": 0.00011593627587793043, + "loss": 1.489, + "step": 5139 + }, + { + "epoch": 2.3568398119481713, + "grad_norm": 0.3773139715194702, + "learning_rate": 0.00011577775825133546, + "loss": 1.0298, + "step": 5140 + }, + { + "epoch": 2.3572984749455337, + "grad_norm": 0.11614841967821121, + "learning_rate": 0.00011561933487722687, + "loss": 0.7662, + "step": 5141 + }, + { + "epoch": 2.3577571379428965, + "grad_norm": 0.24605746567249298, + "learning_rate": 0.00011546100579446672, + "loss": 0.8525, + "step": 5142 + }, + { + "epoch": 2.358215800940259, + "grad_norm": 0.4239916503429413, + "learning_rate": 0.00011530277104189463, + "loss": 1.5721, + "step": 5143 + }, + { + "epoch": 2.358674463937622, + "grad_norm": 0.2336055487394333, + "learning_rate": 0.00011514463065832665, + "loss": 1.2539, + "step": 5144 + }, + { + "epoch": 2.3591331269349847, + "grad_norm": 0.333873987197876, + "learning_rate": 0.00011498658468255568, + "loss": 1.3782, + "step": 5145 + }, + { + "epoch": 2.359591789932347, + "grad_norm": 0.35513144731521606, + "learning_rate": 0.00011482863315335157, + "loss": 1.3682, + "step": 5146 + }, + { + "epoch": 2.36005045292971, + "grad_norm": 0.2521505057811737, + "learning_rate": 0.00011467077610946113, + "loss": 1.2587, + "step": 5147 + }, + { + "epoch": 2.360509115927073, + "grad_norm": 0.30736634135246277, + "learning_rate": 0.00011451301358960758, + "loss": 1.3173, + "step": 5148 + }, + { + "epoch": 2.360967778924435, + "grad_norm": 0.3339071273803711, + "learning_rate": 0.00011435534563249123, + "loss": 1.3926, + "step": 5149 + }, + { + "epoch": 2.361426441921798, + "grad_norm": 0.3305213749408722, + "learning_rate": 0.00011419777227678929, + "loss": 1.4092, + "step": 5150 + }, + { + "epoch": 2.3618851049191605, + "grad_norm": 0.30224788188934326, + "learning_rate": 0.00011404029356115558, + "loss": 1.4058, + "step": 5151 + }, + { + "epoch": 2.3623437679165233, + "grad_norm": 0.3267022967338562, + "learning_rate": 0.00011388290952422075, + "loss": 1.3408, + "step": 5152 + }, + { + "epoch": 2.362802430913886, + "grad_norm": 0.17730341851711273, + "learning_rate": 0.00011372562020459231, + "loss": 0.8203, + "step": 5153 + }, + { + "epoch": 2.3632610939112486, + "grad_norm": 0.2721312344074249, + "learning_rate": 0.00011356842564085434, + "loss": 0.8142, + "step": 5154 + }, + { + "epoch": 2.3637197569086115, + "grad_norm": 0.34186965227127075, + "learning_rate": 0.00011341132587156793, + "loss": 1.9951, + "step": 5155 + }, + { + "epoch": 2.364178419905974, + "grad_norm": 0.36209332942962646, + "learning_rate": 0.00011325432093527077, + "loss": 2.0428, + "step": 5156 + }, + { + "epoch": 2.3646370829033367, + "grad_norm": 0.3819536864757538, + "learning_rate": 0.0001130974108704772, + "loss": 1.578, + "step": 5157 + }, + { + "epoch": 2.3650957459006996, + "grad_norm": 0.36597928404808044, + "learning_rate": 0.00011294059571567861, + "loss": 1.5025, + "step": 5158 + }, + { + "epoch": 2.365554408898062, + "grad_norm": 0.328401654958725, + "learning_rate": 0.0001127838755093426, + "loss": 0.9453, + "step": 5159 + }, + { + "epoch": 2.366013071895425, + "grad_norm": 0.29384055733680725, + "learning_rate": 0.00011262725028991388, + "loss": 1.6815, + "step": 5160 + }, + { + "epoch": 2.3664717348927873, + "grad_norm": 0.3777378499507904, + "learning_rate": 0.00011247072009581383, + "loss": 1.2265, + "step": 5161 + }, + { + "epoch": 2.36693039789015, + "grad_norm": 0.7261742949485779, + "learning_rate": 0.00011231428496544033, + "loss": 1.8345, + "step": 5162 + }, + { + "epoch": 2.367389060887513, + "grad_norm": 0.2863733172416687, + "learning_rate": 0.00011215794493716808, + "loss": 0.9066, + "step": 5163 + }, + { + "epoch": 2.3678477238848754, + "grad_norm": 0.3603268265724182, + "learning_rate": 0.00011200170004934839, + "loss": 1.6718, + "step": 5164 + }, + { + "epoch": 2.3683063868822383, + "grad_norm": 0.2777169346809387, + "learning_rate": 0.00011184555034030936, + "loss": 1.1851, + "step": 5165 + }, + { + "epoch": 2.368765049879601, + "grad_norm": 0.24623733758926392, + "learning_rate": 0.00011168949584835553, + "loss": 1.5477, + "step": 5166 + }, + { + "epoch": 2.3692237128769635, + "grad_norm": 0.2616453766822815, + "learning_rate": 0.00011153353661176824, + "loss": 0.8338, + "step": 5167 + }, + { + "epoch": 2.3696823758743264, + "grad_norm": 0.24914288520812988, + "learning_rate": 0.00011137767266880538, + "loss": 0.859, + "step": 5168 + }, + { + "epoch": 2.3701410388716893, + "grad_norm": 0.21932218968868256, + "learning_rate": 0.00011122190405770172, + "loss": 1.8062, + "step": 5169 + }, + { + "epoch": 2.3705997018690517, + "grad_norm": 0.38597074151039124, + "learning_rate": 0.00011106623081666806, + "loss": 1.8027, + "step": 5170 + }, + { + "epoch": 2.3710583648664145, + "grad_norm": 0.33088138699531555, + "learning_rate": 0.0001109106529838923, + "loss": 1.2009, + "step": 5171 + }, + { + "epoch": 2.371517027863777, + "grad_norm": 0.2947171628475189, + "learning_rate": 0.00011075517059753892, + "loss": 1.1661, + "step": 5172 + }, + { + "epoch": 2.37197569086114, + "grad_norm": 0.26863881945610046, + "learning_rate": 0.00011059978369574875, + "loss": 1.145, + "step": 5173 + }, + { + "epoch": 2.3724343538585027, + "grad_norm": 0.2579619586467743, + "learning_rate": 0.00011044449231663939, + "loss": 1.1743, + "step": 5174 + }, + { + "epoch": 2.372893016855865, + "grad_norm": 0.4377444088459015, + "learning_rate": 0.00011028929649830489, + "loss": 1.72, + "step": 5175 + }, + { + "epoch": 2.373351679853228, + "grad_norm": 0.6905438303947449, + "learning_rate": 0.00011013419627881587, + "loss": 0.7064, + "step": 5176 + }, + { + "epoch": 2.3738103428505903, + "grad_norm": 0.1522587239742279, + "learning_rate": 0.00010997919169621962, + "loss": 1.4874, + "step": 5177 + }, + { + "epoch": 2.374269005847953, + "grad_norm": 0.3481024205684662, + "learning_rate": 0.00010982428278853984, + "loss": 0.9126, + "step": 5178 + }, + { + "epoch": 2.374727668845316, + "grad_norm": 0.24200594425201416, + "learning_rate": 0.00010966946959377682, + "loss": 1.7862, + "step": 5179 + }, + { + "epoch": 2.3751863318426785, + "grad_norm": 0.40860190987586975, + "learning_rate": 0.00010951475214990747, + "loss": 1.0853, + "step": 5180 + }, + { + "epoch": 2.3756449948400413, + "grad_norm": 0.09399288147687912, + "learning_rate": 0.0001093601304948848, + "loss": 1.2584, + "step": 5181 + }, + { + "epoch": 2.3761036578374037, + "grad_norm": 0.31636422872543335, + "learning_rate": 0.00010920560466663882, + "loss": 0.8112, + "step": 5182 + }, + { + "epoch": 2.3765623208347666, + "grad_norm": 0.3569824695587158, + "learning_rate": 0.00010905117470307573, + "loss": 1.3749, + "step": 5183 + }, + { + "epoch": 2.3770209838321295, + "grad_norm": 0.07064448297023773, + "learning_rate": 0.00010889684064207845, + "loss": 0.731, + "step": 5184 + }, + { + "epoch": 2.377479646829492, + "grad_norm": 0.2785203456878662, + "learning_rate": 0.00010874260252150598, + "loss": 1.1004, + "step": 5185 + }, + { + "epoch": 2.3779383098268547, + "grad_norm": 0.32780954241752625, + "learning_rate": 0.00010858846037919451, + "loss": 1.3779, + "step": 5186 + }, + { + "epoch": 2.3783969728242176, + "grad_norm": 0.23089632391929626, + "learning_rate": 0.00010843441425295575, + "loss": 0.9364, + "step": 5187 + }, + { + "epoch": 2.37885563582158, + "grad_norm": 0.2271614968776703, + "learning_rate": 0.00010828046418057858, + "loss": 1.3, + "step": 5188 + }, + { + "epoch": 2.379314298818943, + "grad_norm": 0.32289209961891174, + "learning_rate": 0.00010812661019982795, + "loss": 1.4732, + "step": 5189 + }, + { + "epoch": 2.3797729618163057, + "grad_norm": 0.2017325609922409, + "learning_rate": 0.00010797285234844539, + "loss": 0.9344, + "step": 5190 + }, + { + "epoch": 2.380231624813668, + "grad_norm": 0.2721211910247803, + "learning_rate": 0.00010781919066414881, + "loss": 1.6788, + "step": 5191 + }, + { + "epoch": 2.380690287811031, + "grad_norm": 0.3644427955150604, + "learning_rate": 0.00010766562518463268, + "loss": 1.307, + "step": 5192 + }, + { + "epoch": 2.3811489508083934, + "grad_norm": 0.35269877314567566, + "learning_rate": 0.00010751215594756741, + "loss": 1.3381, + "step": 5193 + }, + { + "epoch": 2.3816076138057563, + "grad_norm": 0.3497646450996399, + "learning_rate": 0.00010735878299060014, + "loss": 1.5481, + "step": 5194 + }, + { + "epoch": 2.382066276803119, + "grad_norm": 0.28884097933769226, + "learning_rate": 0.00010720550635135467, + "loss": 1.1808, + "step": 5195 + }, + { + "epoch": 2.3825249398004815, + "grad_norm": 0.43060022592544556, + "learning_rate": 0.00010705232606743066, + "loss": 1.9484, + "step": 5196 + }, + { + "epoch": 2.3829836027978444, + "grad_norm": 0.25279608368873596, + "learning_rate": 0.00010689924217640445, + "loss": 1.1418, + "step": 5197 + }, + { + "epoch": 2.383442265795207, + "grad_norm": 0.36996960639953613, + "learning_rate": 0.00010674625471582849, + "loss": 1.4287, + "step": 5198 + }, + { + "epoch": 2.3839009287925697, + "grad_norm": 0.3359069228172302, + "learning_rate": 0.0001065933637232317, + "loss": 1.0623, + "step": 5199 + }, + { + "epoch": 2.3843595917899325, + "grad_norm": 0.2985363006591797, + "learning_rate": 0.00010644056923611939, + "loss": 1.366, + "step": 5200 + }, + { + "epoch": 2.384818254787295, + "grad_norm": 0.5279024243354797, + "learning_rate": 0.0001062878712919732, + "loss": 2.0491, + "step": 5201 + }, + { + "epoch": 2.385276917784658, + "grad_norm": 0.3357357680797577, + "learning_rate": 0.00010613526992825096, + "loss": 1.1142, + "step": 5202 + }, + { + "epoch": 2.38573558078202, + "grad_norm": 0.2649446427822113, + "learning_rate": 0.00010598276518238709, + "loss": 0.9735, + "step": 5203 + }, + { + "epoch": 2.386194243779383, + "grad_norm": 0.23111626505851746, + "learning_rate": 0.00010583035709179162, + "loss": 0.9959, + "step": 5204 + }, + { + "epoch": 2.386652906776746, + "grad_norm": 0.18565475940704346, + "learning_rate": 0.0001056780456938518, + "loss": 1.4027, + "step": 5205 + }, + { + "epoch": 2.3871115697741083, + "grad_norm": 0.3480309247970581, + "learning_rate": 0.00010552583102593061, + "loss": 2.1028, + "step": 5206 + }, + { + "epoch": 2.387570232771471, + "grad_norm": 0.2654421329498291, + "learning_rate": 0.00010537371312536736, + "loss": 1.1671, + "step": 5207 + }, + { + "epoch": 2.388028895768834, + "grad_norm": 0.4123840928077698, + "learning_rate": 0.00010522169202947784, + "loss": 1.4777, + "step": 5208 + }, + { + "epoch": 2.3884875587661965, + "grad_norm": 0.28358355164527893, + "learning_rate": 0.0001050697677755536, + "loss": 0.8817, + "step": 5209 + }, + { + "epoch": 2.3889462217635593, + "grad_norm": 0.29822635650634766, + "learning_rate": 0.00010491794040086289, + "loss": 1.6391, + "step": 5210 + }, + { + "epoch": 2.3894048847609217, + "grad_norm": 0.34754717350006104, + "learning_rate": 0.00010476620994265013, + "loss": 0.8802, + "step": 5211 + }, + { + "epoch": 2.3898635477582846, + "grad_norm": 0.08379904180765152, + "learning_rate": 0.00010461457643813588, + "loss": 0.3658, + "step": 5212 + }, + { + "epoch": 2.3903222107556474, + "grad_norm": 0.06315892189741135, + "learning_rate": 0.00010446303992451689, + "loss": 0.8891, + "step": 5213 + }, + { + "epoch": 2.39078087375301, + "grad_norm": 0.31999555230140686, + "learning_rate": 0.00010431160043896615, + "loss": 1.1747, + "step": 5214 + }, + { + "epoch": 2.3912395367503727, + "grad_norm": 0.2988559901714325, + "learning_rate": 0.00010416025801863289, + "loss": 1.2582, + "step": 5215 + }, + { + "epoch": 2.3916981997477356, + "grad_norm": 0.342385858297348, + "learning_rate": 0.0001040090127006425, + "loss": 1.0219, + "step": 5216 + }, + { + "epoch": 2.392156862745098, + "grad_norm": 0.21291863918304443, + "learning_rate": 0.00010385786452209656, + "loss": 1.1656, + "step": 5217 + }, + { + "epoch": 2.392615525742461, + "grad_norm": 0.29049792885780334, + "learning_rate": 0.00010370681352007272, + "loss": 1.4101, + "step": 5218 + }, + { + "epoch": 2.3930741887398232, + "grad_norm": 0.26569485664367676, + "learning_rate": 0.00010355585973162501, + "loss": 1.0166, + "step": 5219 + }, + { + "epoch": 2.393532851737186, + "grad_norm": 0.325291246175766, + "learning_rate": 0.00010340500319378348, + "loss": 1.544, + "step": 5220 + }, + { + "epoch": 2.393991514734549, + "grad_norm": 0.30292245745658875, + "learning_rate": 0.00010325424394355421, + "loss": 1.1687, + "step": 5221 + }, + { + "epoch": 2.3944501777319114, + "grad_norm": 0.28847742080688477, + "learning_rate": 0.00010310358201791953, + "loss": 1.7224, + "step": 5222 + }, + { + "epoch": 2.3949088407292742, + "grad_norm": 0.36918914318084717, + "learning_rate": 0.000102953017453838, + "loss": 1.5133, + "step": 5223 + }, + { + "epoch": 2.3953675037266366, + "grad_norm": 0.32062050700187683, + "learning_rate": 0.00010280255028824414, + "loss": 1.5388, + "step": 5224 + }, + { + "epoch": 2.3958261667239995, + "grad_norm": 0.4374137222766876, + "learning_rate": 0.0001026521805580486, + "loss": 1.0036, + "step": 5225 + }, + { + "epoch": 2.3962848297213624, + "grad_norm": 0.24663524329662323, + "learning_rate": 0.00010250190830013823, + "loss": 1.4458, + "step": 5226 + }, + { + "epoch": 2.3967434927187248, + "grad_norm": 0.39408281445503235, + "learning_rate": 0.00010235173355137583, + "loss": 1.0691, + "step": 5227 + }, + { + "epoch": 2.3972021557160876, + "grad_norm": 0.33270618319511414, + "learning_rate": 0.00010220165634860041, + "loss": 1.4243, + "step": 5228 + }, + { + "epoch": 2.39766081871345, + "grad_norm": 0.4075172543525696, + "learning_rate": 0.00010205167672862691, + "loss": 1.5062, + "step": 5229 + }, + { + "epoch": 2.398119481710813, + "grad_norm": 0.28742605447769165, + "learning_rate": 0.00010190179472824651, + "loss": 1.4183, + "step": 5230 + }, + { + "epoch": 2.3985781447081758, + "grad_norm": 0.35692864656448364, + "learning_rate": 0.00010175201038422644, + "loss": 1.2006, + "step": 5231 + }, + { + "epoch": 2.399036807705538, + "grad_norm": 0.20757196843624115, + "learning_rate": 0.00010160232373330963, + "loss": 0.9988, + "step": 5232 + }, + { + "epoch": 2.399495470702901, + "grad_norm": 0.3633536696434021, + "learning_rate": 0.00010145273481221534, + "loss": 1.744, + "step": 5233 + }, + { + "epoch": 2.399954133700264, + "grad_norm": 0.3276228904724121, + "learning_rate": 0.00010130324365763894, + "loss": 0.7654, + "step": 5234 + }, + { + "epoch": 2.4004127966976263, + "grad_norm": 0.2344532608985901, + "learning_rate": 0.00010115385030625157, + "loss": 1.82, + "step": 5235 + }, + { + "epoch": 2.400871459694989, + "grad_norm": 0.4215475022792816, + "learning_rate": 0.00010100455479470055, + "loss": 1.3953, + "step": 5236 + }, + { + "epoch": 2.401330122692352, + "grad_norm": 0.3673091232776642, + "learning_rate": 0.0001008553571596092, + "loss": 2.074, + "step": 5237 + }, + { + "epoch": 2.4017887856897144, + "grad_norm": 0.34490370750427246, + "learning_rate": 0.00010070625743757666, + "loss": 1.7086, + "step": 5238 + }, + { + "epoch": 2.4022474486870773, + "grad_norm": 0.35969072580337524, + "learning_rate": 0.00010055725566517826, + "loss": 1.1673, + "step": 5239 + }, + { + "epoch": 2.4027061116844397, + "grad_norm": 0.36554619669914246, + "learning_rate": 0.00010040835187896513, + "loss": 1.6442, + "step": 5240 + }, + { + "epoch": 2.4031647746818026, + "grad_norm": 0.39475345611572266, + "learning_rate": 0.00010025954611546457, + "loss": 0.4012, + "step": 5241 + }, + { + "epoch": 2.4036234376791654, + "grad_norm": 0.1467778980731964, + "learning_rate": 0.00010011083841117968, + "loss": 0.8082, + "step": 5242 + }, + { + "epoch": 2.404082100676528, + "grad_norm": 0.13162125647068024, + "learning_rate": 9.996222880258937e-05, + "loss": 0.8181, + "step": 5243 + }, + { + "epoch": 2.4045407636738907, + "grad_norm": 0.20284073054790497, + "learning_rate": 9.98137173261488e-05, + "loss": 1.213, + "step": 5244 + }, + { + "epoch": 2.404999426671253, + "grad_norm": 0.303290456533432, + "learning_rate": 9.966530401828883e-05, + "loss": 1.3629, + "step": 5245 + }, + { + "epoch": 2.405458089668616, + "grad_norm": 0.4616430997848511, + "learning_rate": 9.951698891541633e-05, + "loss": 1.5586, + "step": 5246 + }, + { + "epoch": 2.405916752665979, + "grad_norm": 0.3878755569458008, + "learning_rate": 9.936877205391398e-05, + "loss": 1.2528, + "step": 5247 + }, + { + "epoch": 2.4063754156633412, + "grad_norm": 0.29884567856788635, + "learning_rate": 9.922065347014075e-05, + "loss": 0.9254, + "step": 5248 + }, + { + "epoch": 2.406834078660704, + "grad_norm": 0.2206033170223236, + "learning_rate": 9.907263320043092e-05, + "loss": 1.3166, + "step": 5249 + }, + { + "epoch": 2.4072927416580665, + "grad_norm": 0.3127782642841339, + "learning_rate": 9.892471128109498e-05, + "loss": 1.0651, + "step": 5250 + }, + { + "epoch": 2.4077514046554294, + "grad_norm": 0.3565352261066437, + "learning_rate": 9.877688774841931e-05, + "loss": 1.317, + "step": 5251 + }, + { + "epoch": 2.408210067652792, + "grad_norm": 0.35081884264945984, + "learning_rate": 9.8629162638666e-05, + "loss": 0.9689, + "step": 5252 + }, + { + "epoch": 2.4086687306501546, + "grad_norm": 0.2946298122406006, + "learning_rate": 9.848153598807324e-05, + "loss": 2.0137, + "step": 5253 + }, + { + "epoch": 2.4091273936475175, + "grad_norm": 0.3904908001422882, + "learning_rate": 9.833400783285474e-05, + "loss": 1.0215, + "step": 5254 + }, + { + "epoch": 2.4095860566448803, + "grad_norm": 0.4543945789337158, + "learning_rate": 9.818657820920019e-05, + "loss": 1.4427, + "step": 5255 + }, + { + "epoch": 2.4100447196422428, + "grad_norm": 0.2723967730998993, + "learning_rate": 9.803924715327528e-05, + "loss": 1.3513, + "step": 5256 + }, + { + "epoch": 2.4105033826396056, + "grad_norm": 0.2895095646381378, + "learning_rate": 9.789201470122121e-05, + "loss": 0.9585, + "step": 5257 + }, + { + "epoch": 2.4109620456369685, + "grad_norm": 0.27330470085144043, + "learning_rate": 9.774488088915539e-05, + "loss": 1.174, + "step": 5258 + }, + { + "epoch": 2.411420708634331, + "grad_norm": 0.31435030698776245, + "learning_rate": 9.759784575317082e-05, + "loss": 1.9413, + "step": 5259 + }, + { + "epoch": 2.4118793716316937, + "grad_norm": 0.34424829483032227, + "learning_rate": 9.74509093293361e-05, + "loss": 1.6773, + "step": 5260 + }, + { + "epoch": 2.412338034629056, + "grad_norm": 0.37348756194114685, + "learning_rate": 9.730407165369576e-05, + "loss": 1.3452, + "step": 5261 + }, + { + "epoch": 2.412796697626419, + "grad_norm": 0.3043566346168518, + "learning_rate": 9.715733276227029e-05, + "loss": 1.5723, + "step": 5262 + }, + { + "epoch": 2.413255360623782, + "grad_norm": 0.3890668451786041, + "learning_rate": 9.701069269105567e-05, + "loss": 1.2312, + "step": 5263 + }, + { + "epoch": 2.4137140236211443, + "grad_norm": 0.30525264143943787, + "learning_rate": 9.686415147602401e-05, + "loss": 1.858, + "step": 5264 + }, + { + "epoch": 2.414172686618507, + "grad_norm": 0.33891257643699646, + "learning_rate": 9.671770915312267e-05, + "loss": 0.9398, + "step": 5265 + }, + { + "epoch": 2.4146313496158696, + "grad_norm": 0.2957022488117218, + "learning_rate": 9.657136575827491e-05, + "loss": 1.884, + "step": 5266 + }, + { + "epoch": 2.4150900126132324, + "grad_norm": 0.5537558197975159, + "learning_rate": 9.642512132738012e-05, + "loss": 1.1761, + "step": 5267 + }, + { + "epoch": 2.4155486756105953, + "grad_norm": 0.39335304498672485, + "learning_rate": 9.627897589631301e-05, + "loss": 1.5337, + "step": 5268 + }, + { + "epoch": 2.4160073386079577, + "grad_norm": 0.23258255422115326, + "learning_rate": 9.613292950092406e-05, + "loss": 1.4903, + "step": 5269 + }, + { + "epoch": 2.4164660016053205, + "grad_norm": 0.39902931451797485, + "learning_rate": 9.598698217703972e-05, + "loss": 1.5256, + "step": 5270 + }, + { + "epoch": 2.416924664602683, + "grad_norm": 0.3279167711734772, + "learning_rate": 9.584113396046157e-05, + "loss": 1.2243, + "step": 5271 + }, + { + "epoch": 2.417383327600046, + "grad_norm": 0.44774487614631653, + "learning_rate": 9.569538488696744e-05, + "loss": 1.1367, + "step": 5272 + }, + { + "epoch": 2.4178419905974087, + "grad_norm": 0.29976382851600647, + "learning_rate": 9.55497349923105e-05, + "loss": 1.3715, + "step": 5273 + }, + { + "epoch": 2.418300653594771, + "grad_norm": 0.3518228232860565, + "learning_rate": 9.540418431221986e-05, + "loss": 1.4482, + "step": 5274 + }, + { + "epoch": 2.418759316592134, + "grad_norm": 0.3567742109298706, + "learning_rate": 9.525873288240011e-05, + "loss": 1.937, + "step": 5275 + }, + { + "epoch": 2.419217979589497, + "grad_norm": 0.35031193494796753, + "learning_rate": 9.511338073853149e-05, + "loss": 1.551, + "step": 5276 + }, + { + "epoch": 2.419676642586859, + "grad_norm": 0.1989692896604538, + "learning_rate": 9.496812791626996e-05, + "loss": 1.1373, + "step": 5277 + }, + { + "epoch": 2.420135305584222, + "grad_norm": 0.3707393705844879, + "learning_rate": 9.48229744512471e-05, + "loss": 1.5035, + "step": 5278 + }, + { + "epoch": 2.4205939685815845, + "grad_norm": 0.3178713023662567, + "learning_rate": 9.467792037907008e-05, + "loss": 1.8657, + "step": 5279 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.30721351504325867, + "learning_rate": 9.453296573532172e-05, + "loss": 1.3541, + "step": 5280 + }, + { + "epoch": 2.42151129457631, + "grad_norm": 0.4439692199230194, + "learning_rate": 9.438811055556057e-05, + "loss": 1.243, + "step": 5281 + }, + { + "epoch": 2.4219699575736726, + "grad_norm": 0.206447571516037, + "learning_rate": 9.424335487532037e-05, + "loss": 0.8625, + "step": 5282 + }, + { + "epoch": 2.4224286205710355, + "grad_norm": 0.3932112157344818, + "learning_rate": 9.40986987301109e-05, + "loss": 1.208, + "step": 5283 + }, + { + "epoch": 2.4228872835683983, + "grad_norm": 0.28267911076545715, + "learning_rate": 9.395414215541731e-05, + "loss": 1.3602, + "step": 5284 + }, + { + "epoch": 2.4233459465657607, + "grad_norm": 0.3156595528125763, + "learning_rate": 9.380968518670036e-05, + "loss": 1.6733, + "step": 5285 + }, + { + "epoch": 2.4238046095631236, + "grad_norm": 0.3946542739868164, + "learning_rate": 9.366532785939647e-05, + "loss": 1.978, + "step": 5286 + }, + { + "epoch": 2.424263272560486, + "grad_norm": 0.4107709527015686, + "learning_rate": 9.352107020891743e-05, + "loss": 2.0332, + "step": 5287 + }, + { + "epoch": 2.424721935557849, + "grad_norm": 0.3392653167247772, + "learning_rate": 9.337691227065075e-05, + "loss": 1.1757, + "step": 5288 + }, + { + "epoch": 2.4251805985552117, + "grad_norm": 0.3073945939540863, + "learning_rate": 9.32328540799594e-05, + "loss": 1.5089, + "step": 5289 + }, + { + "epoch": 2.425639261552574, + "grad_norm": 0.37497302889823914, + "learning_rate": 9.308889567218193e-05, + "loss": 1.3739, + "step": 5290 + }, + { + "epoch": 2.426097924549937, + "grad_norm": 0.2676681578159332, + "learning_rate": 9.29450370826323e-05, + "loss": 0.9049, + "step": 5291 + }, + { + "epoch": 2.4265565875472994, + "grad_norm": 0.3354664742946625, + "learning_rate": 9.280127834660019e-05, + "loss": 1.2768, + "step": 5292 + }, + { + "epoch": 2.4270152505446623, + "grad_norm": 0.2157263159751892, + "learning_rate": 9.265761949935048e-05, + "loss": 1.1697, + "step": 5293 + }, + { + "epoch": 2.427473913542025, + "grad_norm": 0.323390394449234, + "learning_rate": 9.251406057612378e-05, + "loss": 1.2113, + "step": 5294 + }, + { + "epoch": 2.4279325765393875, + "grad_norm": 0.2757461965084076, + "learning_rate": 9.237060161213612e-05, + "loss": 1.727, + "step": 5295 + }, + { + "epoch": 2.4283912395367504, + "grad_norm": 0.33096444606781006, + "learning_rate": 9.222724264257904e-05, + "loss": 1.4423, + "step": 5296 + }, + { + "epoch": 2.428849902534113, + "grad_norm": 0.31503209471702576, + "learning_rate": 9.208398370261956e-05, + "loss": 1.6621, + "step": 5297 + }, + { + "epoch": 2.4293085655314757, + "grad_norm": 0.41407549381256104, + "learning_rate": 9.194082482740012e-05, + "loss": 1.9802, + "step": 5298 + }, + { + "epoch": 2.4297672285288385, + "grad_norm": 0.31064707040786743, + "learning_rate": 9.17977660520386e-05, + "loss": 0.7767, + "step": 5299 + }, + { + "epoch": 2.430225891526201, + "grad_norm": 0.26733309030532837, + "learning_rate": 9.165480741162829e-05, + "loss": 1.0861, + "step": 5300 + }, + { + "epoch": 2.430684554523564, + "grad_norm": 0.19575932621955872, + "learning_rate": 9.151194894123815e-05, + "loss": 1.223, + "step": 5301 + }, + { + "epoch": 2.4311432175209267, + "grad_norm": 0.22654877603054047, + "learning_rate": 9.13691906759122e-05, + "loss": 0.4944, + "step": 5302 + }, + { + "epoch": 2.431601880518289, + "grad_norm": 0.35672181844711304, + "learning_rate": 9.122653265067022e-05, + "loss": 1.9861, + "step": 5303 + }, + { + "epoch": 2.432060543515652, + "grad_norm": 0.32909905910491943, + "learning_rate": 9.10839749005073e-05, + "loss": 0.9556, + "step": 5304 + }, + { + "epoch": 2.432519206513015, + "grad_norm": 0.32880130410194397, + "learning_rate": 9.094151746039364e-05, + "loss": 1.7028, + "step": 5305 + }, + { + "epoch": 2.432977869510377, + "grad_norm": 0.3756924867630005, + "learning_rate": 9.079916036527519e-05, + "loss": 1.2567, + "step": 5306 + }, + { + "epoch": 2.43343653250774, + "grad_norm": 0.39229616522789, + "learning_rate": 9.065690365007323e-05, + "loss": 1.4464, + "step": 5307 + }, + { + "epoch": 2.4338951955051025, + "grad_norm": 0.19489213824272156, + "learning_rate": 9.051474734968429e-05, + "loss": 1.6805, + "step": 5308 + }, + { + "epoch": 2.4343538585024653, + "grad_norm": 0.35982856154441833, + "learning_rate": 9.037269149898036e-05, + "loss": 0.8735, + "step": 5309 + }, + { + "epoch": 2.434812521499828, + "grad_norm": 0.30394065380096436, + "learning_rate": 9.02307361328088e-05, + "loss": 1.3712, + "step": 5310 + }, + { + "epoch": 2.4352711844971906, + "grad_norm": 0.28870540857315063, + "learning_rate": 9.008888128599224e-05, + "loss": 2.0938, + "step": 5311 + }, + { + "epoch": 2.4357298474945535, + "grad_norm": 0.37780362367630005, + "learning_rate": 8.994712699332875e-05, + "loss": 1.53, + "step": 5312 + }, + { + "epoch": 2.436188510491916, + "grad_norm": 0.41944536566734314, + "learning_rate": 8.98054732895916e-05, + "loss": 1.4752, + "step": 5313 + }, + { + "epoch": 2.4366471734892787, + "grad_norm": 0.2813193202018738, + "learning_rate": 8.966392020952952e-05, + "loss": 1.2044, + "step": 5314 + }, + { + "epoch": 2.4371058364866416, + "grad_norm": 0.38517066836357117, + "learning_rate": 8.95224677878666e-05, + "loss": 1.5339, + "step": 5315 + }, + { + "epoch": 2.437564499484004, + "grad_norm": 0.38846316933631897, + "learning_rate": 8.938111605930194e-05, + "loss": 0.9716, + "step": 5316 + }, + { + "epoch": 2.438023162481367, + "grad_norm": 0.30371803045272827, + "learning_rate": 8.923986505851023e-05, + "loss": 1.2411, + "step": 5317 + }, + { + "epoch": 2.4384818254787293, + "grad_norm": 0.28628620505332947, + "learning_rate": 8.909871482014132e-05, + "loss": 0.8832, + "step": 5318 + }, + { + "epoch": 2.438940488476092, + "grad_norm": 0.17390215396881104, + "learning_rate": 8.895766537882027e-05, + "loss": 1.0545, + "step": 5319 + }, + { + "epoch": 2.439399151473455, + "grad_norm": 0.3960818946361542, + "learning_rate": 8.8816716769148e-05, + "loss": 1.9539, + "step": 5320 + }, + { + "epoch": 2.4398578144708174, + "grad_norm": 0.4454388916492462, + "learning_rate": 8.867586902569968e-05, + "loss": 1.5828, + "step": 5321 + }, + { + "epoch": 2.4403164774681803, + "grad_norm": 0.33164292573928833, + "learning_rate": 8.85351221830265e-05, + "loss": 1.5117, + "step": 5322 + }, + { + "epoch": 2.440775140465543, + "grad_norm": 0.31885433197021484, + "learning_rate": 8.839447627565472e-05, + "loss": 1.6736, + "step": 5323 + }, + { + "epoch": 2.4412338034629055, + "grad_norm": 0.28047341108322144, + "learning_rate": 8.825393133808574e-05, + "loss": 0.3669, + "step": 5324 + }, + { + "epoch": 2.4416924664602684, + "grad_norm": 0.09215112030506134, + "learning_rate": 8.811348740479619e-05, + "loss": 1.2191, + "step": 5325 + }, + { + "epoch": 2.4421511294576312, + "grad_norm": 0.33795610070228577, + "learning_rate": 8.797314451023819e-05, + "loss": 1.0369, + "step": 5326 + }, + { + "epoch": 2.4426097924549937, + "grad_norm": 0.34105727076530457, + "learning_rate": 8.783290268883859e-05, + "loss": 1.9551, + "step": 5327 + }, + { + "epoch": 2.4430684554523565, + "grad_norm": 0.2725188732147217, + "learning_rate": 8.76927619749998e-05, + "loss": 0.8592, + "step": 5328 + }, + { + "epoch": 2.443527118449719, + "grad_norm": 0.36431559920310974, + "learning_rate": 8.75527224030993e-05, + "loss": 1.7697, + "step": 5329 + }, + { + "epoch": 2.443985781447082, + "grad_norm": 0.38820746541023254, + "learning_rate": 8.741278400749003e-05, + "loss": 0.8046, + "step": 5330 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.25825342535972595, + "learning_rate": 8.72729468224997e-05, + "loss": 1.3749, + "step": 5331 + }, + { + "epoch": 2.444903107441807, + "grad_norm": 0.300819993019104, + "learning_rate": 8.713321088243159e-05, + "loss": 0.8929, + "step": 5332 + }, + { + "epoch": 2.44536177043917, + "grad_norm": 0.0780341699719429, + "learning_rate": 8.699357622156368e-05, + "loss": 0.6362, + "step": 5333 + }, + { + "epoch": 2.4458204334365323, + "grad_norm": 0.28823330998420715, + "learning_rate": 8.685404287414939e-05, + "loss": 1.5396, + "step": 5334 + }, + { + "epoch": 2.446279096433895, + "grad_norm": 0.3040931522846222, + "learning_rate": 8.671461087441735e-05, + "loss": 0.73, + "step": 5335 + }, + { + "epoch": 2.446737759431258, + "grad_norm": 0.4103958010673523, + "learning_rate": 8.657528025657118e-05, + "loss": 2.0576, + "step": 5336 + }, + { + "epoch": 2.4471964224286205, + "grad_norm": 0.33495497703552246, + "learning_rate": 8.643605105478986e-05, + "loss": 0.8664, + "step": 5337 + }, + { + "epoch": 2.4476550854259833, + "grad_norm": 0.2784813344478607, + "learning_rate": 8.629692330322691e-05, + "loss": 1.8088, + "step": 5338 + }, + { + "epoch": 2.4481137484233457, + "grad_norm": 0.3490314185619354, + "learning_rate": 8.615789703601179e-05, + "loss": 1.2238, + "step": 5339 + }, + { + "epoch": 2.4485724114207086, + "grad_norm": 0.2916855812072754, + "learning_rate": 8.601897228724842e-05, + "loss": 1.0304, + "step": 5340 + }, + { + "epoch": 2.4490310744180714, + "grad_norm": 0.3161444365978241, + "learning_rate": 8.588014909101616e-05, + "loss": 1.833, + "step": 5341 + }, + { + "epoch": 2.449489737415434, + "grad_norm": 0.3349079191684723, + "learning_rate": 8.574142748136926e-05, + "loss": 0.7148, + "step": 5342 + }, + { + "epoch": 2.4499484004127967, + "grad_norm": 0.19072647392749786, + "learning_rate": 8.560280749233729e-05, + "loss": 0.9673, + "step": 5343 + }, + { + "epoch": 2.4504070634101596, + "grad_norm": 0.09794793277978897, + "learning_rate": 8.546428915792449e-05, + "loss": 0.7698, + "step": 5344 + }, + { + "epoch": 2.450865726407522, + "grad_norm": 0.209795743227005, + "learning_rate": 8.53258725121105e-05, + "loss": 1.0825, + "step": 5345 + }, + { + "epoch": 2.451324389404885, + "grad_norm": 0.39282190799713135, + "learning_rate": 8.518755758884988e-05, + "loss": 2.1384, + "step": 5346 + }, + { + "epoch": 2.4517830524022477, + "grad_norm": 0.3572220504283905, + "learning_rate": 8.504934442207241e-05, + "loss": 0.8121, + "step": 5347 + }, + { + "epoch": 2.45224171539961, + "grad_norm": 0.20404981076717377, + "learning_rate": 8.491123304568271e-05, + "loss": 0.3737, + "step": 5348 + }, + { + "epoch": 2.452700378396973, + "grad_norm": 0.1945522129535675, + "learning_rate": 8.477322349356042e-05, + "loss": 1.6915, + "step": 5349 + }, + { + "epoch": 2.4531590413943354, + "grad_norm": 0.4479086101055145, + "learning_rate": 8.46353157995604e-05, + "loss": 2.0293, + "step": 5350 + }, + { + "epoch": 2.4536177043916982, + "grad_norm": 0.3536776602268219, + "learning_rate": 8.449750999751238e-05, + "loss": 1.1595, + "step": 5351 + }, + { + "epoch": 2.454076367389061, + "grad_norm": 0.23635387420654297, + "learning_rate": 8.435980612122101e-05, + "loss": 0.7237, + "step": 5352 + }, + { + "epoch": 2.4545350303864235, + "grad_norm": 0.32154580950737, + "learning_rate": 8.422220420446613e-05, + "loss": 1.7249, + "step": 5353 + }, + { + "epoch": 2.4549936933837864, + "grad_norm": 0.34665876626968384, + "learning_rate": 8.408470428100262e-05, + "loss": 1.2853, + "step": 5354 + }, + { + "epoch": 2.455452356381149, + "grad_norm": 0.5096401572227478, + "learning_rate": 8.394730638455994e-05, + "loss": 0.9551, + "step": 5355 + }, + { + "epoch": 2.4559110193785116, + "grad_norm": 0.28207308053970337, + "learning_rate": 8.381001054884291e-05, + "loss": 1.1176, + "step": 5356 + }, + { + "epoch": 2.4563696823758745, + "grad_norm": 0.23310762643814087, + "learning_rate": 8.367281680753114e-05, + "loss": 0.9396, + "step": 5357 + }, + { + "epoch": 2.456828345373237, + "grad_norm": 0.29276517033576965, + "learning_rate": 8.353572519427932e-05, + "loss": 1.1319, + "step": 5358 + }, + { + "epoch": 2.4572870083705998, + "grad_norm": 0.18109527230262756, + "learning_rate": 8.339873574271694e-05, + "loss": 0.9314, + "step": 5359 + }, + { + "epoch": 2.457745671367962, + "grad_norm": 0.26322728395462036, + "learning_rate": 8.326184848644852e-05, + "loss": 1.3008, + "step": 5360 + }, + { + "epoch": 2.458204334365325, + "grad_norm": 0.39521169662475586, + "learning_rate": 8.312506345905358e-05, + "loss": 1.3013, + "step": 5361 + }, + { + "epoch": 2.458662997362688, + "grad_norm": 0.2466443032026291, + "learning_rate": 8.298838069408632e-05, + "loss": 1.2078, + "step": 5362 + }, + { + "epoch": 2.4591216603600503, + "grad_norm": 0.31641414761543274, + "learning_rate": 8.28518002250761e-05, + "loss": 1.4739, + "step": 5363 + }, + { + "epoch": 2.459580323357413, + "grad_norm": 0.34621062874794006, + "learning_rate": 8.271532208552712e-05, + "loss": 1.1296, + "step": 5364 + }, + { + "epoch": 2.4600389863547756, + "grad_norm": 0.2962487041950226, + "learning_rate": 8.25789463089185e-05, + "loss": 1.2483, + "step": 5365 + }, + { + "epoch": 2.4604976493521384, + "grad_norm": 0.1835666447877884, + "learning_rate": 8.2442672928704e-05, + "loss": 0.8519, + "step": 5366 + }, + { + "epoch": 2.4609563123495013, + "grad_norm": 0.290197491645813, + "learning_rate": 8.230650197831252e-05, + "loss": 0.886, + "step": 5367 + }, + { + "epoch": 2.4614149753468637, + "grad_norm": 0.27010378241539, + "learning_rate": 8.217043349114789e-05, + "loss": 1.2361, + "step": 5368 + }, + { + "epoch": 2.4618736383442266, + "grad_norm": 0.5618923306465149, + "learning_rate": 8.203446750058862e-05, + "loss": 2.2034, + "step": 5369 + }, + { + "epoch": 2.4623323013415894, + "grad_norm": 0.3638753294944763, + "learning_rate": 8.189860403998816e-05, + "loss": 1.4188, + "step": 5370 + }, + { + "epoch": 2.462790964338952, + "grad_norm": 0.22352086007595062, + "learning_rate": 8.176284314267479e-05, + "loss": 0.8657, + "step": 5371 + }, + { + "epoch": 2.4632496273363147, + "grad_norm": 0.32366839051246643, + "learning_rate": 8.162718484195169e-05, + "loss": 1.392, + "step": 5372 + }, + { + "epoch": 2.4637082903336776, + "grad_norm": 0.25489479303359985, + "learning_rate": 8.14916291710967e-05, + "loss": 1.4828, + "step": 5373 + }, + { + "epoch": 2.46416695333104, + "grad_norm": 0.34520161151885986, + "learning_rate": 8.135617616336272e-05, + "loss": 1.5386, + "step": 5374 + }, + { + "epoch": 2.464625616328403, + "grad_norm": 0.4345172643661499, + "learning_rate": 8.122082585197732e-05, + "loss": 0.7834, + "step": 5375 + }, + { + "epoch": 2.4650842793257652, + "grad_norm": 0.24464713037014008, + "learning_rate": 8.108557827014295e-05, + "loss": 0.9882, + "step": 5376 + }, + { + "epoch": 2.465542942323128, + "grad_norm": 0.2222648561000824, + "learning_rate": 8.09504334510367e-05, + "loss": 1.5798, + "step": 5377 + }, + { + "epoch": 2.466001605320491, + "grad_norm": 0.2998507618904114, + "learning_rate": 8.081539142781058e-05, + "loss": 1.2544, + "step": 5378 + }, + { + "epoch": 2.4664602683178534, + "grad_norm": 0.3242281973361969, + "learning_rate": 8.068045223359144e-05, + "loss": 0.8808, + "step": 5379 + }, + { + "epoch": 2.4669189313152162, + "grad_norm": 0.30032265186309814, + "learning_rate": 8.054561590148085e-05, + "loss": 1.9047, + "step": 5380 + }, + { + "epoch": 2.4673775943125786, + "grad_norm": 0.3668994903564453, + "learning_rate": 8.041088246455492e-05, + "loss": 0.7418, + "step": 5381 + }, + { + "epoch": 2.4678362573099415, + "grad_norm": 0.18076832592487335, + "learning_rate": 8.027625195586519e-05, + "loss": 1.2765, + "step": 5382 + }, + { + "epoch": 2.4682949203073044, + "grad_norm": 0.32863399386405945, + "learning_rate": 8.014172440843714e-05, + "loss": 1.4399, + "step": 5383 + }, + { + "epoch": 2.4687535833046668, + "grad_norm": 0.27445659041404724, + "learning_rate": 8.000729985527139e-05, + "loss": 0.8303, + "step": 5384 + }, + { + "epoch": 2.4692122463020296, + "grad_norm": 0.23894087970256805, + "learning_rate": 7.987297832934326e-05, + "loss": 1.2864, + "step": 5385 + }, + { + "epoch": 2.469670909299392, + "grad_norm": 0.2814047336578369, + "learning_rate": 7.973875986360285e-05, + "loss": 1.5797, + "step": 5386 + }, + { + "epoch": 2.470129572296755, + "grad_norm": 0.4583660364151001, + "learning_rate": 7.960464449097498e-05, + "loss": 1.522, + "step": 5387 + }, + { + "epoch": 2.4705882352941178, + "grad_norm": 0.3099602460861206, + "learning_rate": 7.947063224435897e-05, + "loss": 1.2623, + "step": 5388 + }, + { + "epoch": 2.47104689829148, + "grad_norm": 0.2514623701572418, + "learning_rate": 7.933672315662898e-05, + "loss": 1.2364, + "step": 5389 + }, + { + "epoch": 2.471505561288843, + "grad_norm": 0.27281129360198975, + "learning_rate": 7.920291726063395e-05, + "loss": 1.3347, + "step": 5390 + }, + { + "epoch": 2.471964224286206, + "grad_norm": 0.2575758099555969, + "learning_rate": 7.906921458919731e-05, + "loss": 1.0359, + "step": 5391 + }, + { + "epoch": 2.4724228872835683, + "grad_norm": 0.3928264081478119, + "learning_rate": 7.893561517511754e-05, + "loss": 1.5787, + "step": 5392 + }, + { + "epoch": 2.472881550280931, + "grad_norm": 0.2101087123155594, + "learning_rate": 7.880211905116747e-05, + "loss": 0.6325, + "step": 5393 + }, + { + "epoch": 2.473340213278294, + "grad_norm": 0.22780464589595795, + "learning_rate": 7.866872625009453e-05, + "loss": 1.2313, + "step": 5394 + }, + { + "epoch": 2.4737988762756564, + "grad_norm": 0.19158299267292023, + "learning_rate": 7.853543680462094e-05, + "loss": 0.5635, + "step": 5395 + }, + { + "epoch": 2.4742575392730193, + "grad_norm": 0.14477090537548065, + "learning_rate": 7.840225074744367e-05, + "loss": 1.1157, + "step": 5396 + }, + { + "epoch": 2.4747162022703817, + "grad_norm": 0.34960320591926575, + "learning_rate": 7.826916811123414e-05, + "loss": 1.5063, + "step": 5397 + }, + { + "epoch": 2.4751748652677446, + "grad_norm": 0.510556161403656, + "learning_rate": 7.813618892863849e-05, + "loss": 1.4665, + "step": 5398 + }, + { + "epoch": 2.4756335282651074, + "grad_norm": 0.31196311116218567, + "learning_rate": 7.80033132322776e-05, + "loss": 1.184, + "step": 5399 + }, + { + "epoch": 2.47609219126247, + "grad_norm": 0.3243480920791626, + "learning_rate": 7.787054105474667e-05, + "loss": 1.3827, + "step": 5400 + }, + { + "epoch": 2.4765508542598327, + "grad_norm": 0.21474260091781616, + "learning_rate": 7.773787242861557e-05, + "loss": 0.7858, + "step": 5401 + }, + { + "epoch": 2.477009517257195, + "grad_norm": 0.07359279692173004, + "learning_rate": 7.760530738642918e-05, + "loss": 0.9702, + "step": 5402 + }, + { + "epoch": 2.477468180254558, + "grad_norm": 0.34640324115753174, + "learning_rate": 7.747284596070647e-05, + "loss": 1.6594, + "step": 5403 + }, + { + "epoch": 2.477926843251921, + "grad_norm": 0.217728853225708, + "learning_rate": 7.734048818394141e-05, + "loss": 1.1982, + "step": 5404 + }, + { + "epoch": 2.478385506249283, + "grad_norm": 0.3077888488769531, + "learning_rate": 7.720823408860195e-05, + "loss": 1.1781, + "step": 5405 + }, + { + "epoch": 2.478844169246646, + "grad_norm": 0.39549943804740906, + "learning_rate": 7.707608370713116e-05, + "loss": 1.9365, + "step": 5406 + }, + { + "epoch": 2.4793028322440085, + "grad_norm": 0.3485013246536255, + "learning_rate": 7.694403707194647e-05, + "loss": 1.3334, + "step": 5407 + }, + { + "epoch": 2.4797614952413713, + "grad_norm": 0.290386438369751, + "learning_rate": 7.681209421543994e-05, + "loss": 1.2493, + "step": 5408 + }, + { + "epoch": 2.480220158238734, + "grad_norm": 0.3037647306919098, + "learning_rate": 7.668025516997795e-05, + "loss": 1.694, + "step": 5409 + }, + { + "epoch": 2.4806788212360966, + "grad_norm": 0.4137989282608032, + "learning_rate": 7.654851996790174e-05, + "loss": 2.085, + "step": 5410 + }, + { + "epoch": 2.4811374842334595, + "grad_norm": 0.3738190531730652, + "learning_rate": 7.64168886415268e-05, + "loss": 1.1558, + "step": 5411 + }, + { + "epoch": 2.4815961472308223, + "grad_norm": 0.3116551339626312, + "learning_rate": 7.628536122314328e-05, + "loss": 1.9504, + "step": 5412 + }, + { + "epoch": 2.4820548102281847, + "grad_norm": 0.30240756273269653, + "learning_rate": 7.615393774501578e-05, + "loss": 0.7979, + "step": 5413 + }, + { + "epoch": 2.4825134732255476, + "grad_norm": 0.29710403084754944, + "learning_rate": 7.602261823938339e-05, + "loss": 1.5649, + "step": 5414 + }, + { + "epoch": 2.4829721362229105, + "grad_norm": 0.3587922155857086, + "learning_rate": 7.589140273845995e-05, + "loss": 1.4416, + "step": 5415 + }, + { + "epoch": 2.483430799220273, + "grad_norm": 0.3009377121925354, + "learning_rate": 7.576029127443329e-05, + "loss": 1.2025, + "step": 5416 + }, + { + "epoch": 2.4838894622176357, + "grad_norm": 0.3110191226005554, + "learning_rate": 7.562928387946611e-05, + "loss": 1.5738, + "step": 5417 + }, + { + "epoch": 2.484348125214998, + "grad_norm": 0.5595056414604187, + "learning_rate": 7.549838058569542e-05, + "loss": 1.5994, + "step": 5418 + }, + { + "epoch": 2.484806788212361, + "grad_norm": 0.37332621216773987, + "learning_rate": 7.536758142523281e-05, + "loss": 1.8284, + "step": 5419 + }, + { + "epoch": 2.485265451209724, + "grad_norm": 0.34219086170196533, + "learning_rate": 7.523688643016424e-05, + "loss": 0.7883, + "step": 5420 + }, + { + "epoch": 2.4857241142070863, + "grad_norm": 0.2129797488451004, + "learning_rate": 7.510629563255017e-05, + "loss": 1.3984, + "step": 5421 + }, + { + "epoch": 2.486182777204449, + "grad_norm": 0.2923283874988556, + "learning_rate": 7.497580906442537e-05, + "loss": 1.4247, + "step": 5422 + }, + { + "epoch": 2.4866414402018115, + "grad_norm": 0.3322497606277466, + "learning_rate": 7.48454267577992e-05, + "loss": 1.6427, + "step": 5423 + }, + { + "epoch": 2.4871001031991744, + "grad_norm": 0.3204174041748047, + "learning_rate": 7.471514874465535e-05, + "loss": 0.9109, + "step": 5424 + }, + { + "epoch": 2.4875587661965373, + "grad_norm": 0.23341026902198792, + "learning_rate": 7.4584975056952e-05, + "loss": 0.7723, + "step": 5425 + }, + { + "epoch": 2.4880174291938997, + "grad_norm": 0.29962819814682007, + "learning_rate": 7.445490572662168e-05, + "loss": 1.5294, + "step": 5426 + }, + { + "epoch": 2.4884760921912625, + "grad_norm": 0.30557799339294434, + "learning_rate": 7.432494078557145e-05, + "loss": 0.6204, + "step": 5427 + }, + { + "epoch": 2.488934755188625, + "grad_norm": 0.14737457036972046, + "learning_rate": 7.41950802656824e-05, + "loss": 1.0, + "step": 5428 + }, + { + "epoch": 2.489393418185988, + "grad_norm": 0.29757335782051086, + "learning_rate": 7.406532419881035e-05, + "loss": 1.3304, + "step": 5429 + }, + { + "epoch": 2.4898520811833507, + "grad_norm": 0.3414275348186493, + "learning_rate": 7.39356726167854e-05, + "loss": 1.3593, + "step": 5430 + }, + { + "epoch": 2.490310744180713, + "grad_norm": 0.1296243965625763, + "learning_rate": 7.380612555141209e-05, + "loss": 0.3876, + "step": 5431 + }, + { + "epoch": 2.490769407178076, + "grad_norm": 0.2734202444553375, + "learning_rate": 7.367668303446917e-05, + "loss": 1.3786, + "step": 5432 + }, + { + "epoch": 2.4912280701754383, + "grad_norm": 0.35316675901412964, + "learning_rate": 7.354734509770983e-05, + "loss": 1.3578, + "step": 5433 + }, + { + "epoch": 2.491686733172801, + "grad_norm": 0.33932507038116455, + "learning_rate": 7.341811177286167e-05, + "loss": 1.4916, + "step": 5434 + }, + { + "epoch": 2.492145396170164, + "grad_norm": 0.535605788230896, + "learning_rate": 7.328898309162652e-05, + "loss": 0.992, + "step": 5435 + }, + { + "epoch": 2.4926040591675265, + "grad_norm": 0.31163036823272705, + "learning_rate": 7.315995908568051e-05, + "loss": 1.804, + "step": 5436 + }, + { + "epoch": 2.4930627221648893, + "grad_norm": 0.29368847608566284, + "learning_rate": 7.303103978667425e-05, + "loss": 1.1411, + "step": 5437 + }, + { + "epoch": 2.493521385162252, + "grad_norm": 0.40923842787742615, + "learning_rate": 7.290222522623263e-05, + "loss": 1.6904, + "step": 5438 + }, + { + "epoch": 2.4939800481596146, + "grad_norm": 0.18885545432567596, + "learning_rate": 7.277351543595457e-05, + "loss": 0.9873, + "step": 5439 + }, + { + "epoch": 2.4944387111569775, + "grad_norm": 0.31630420684814453, + "learning_rate": 7.264491044741367e-05, + "loss": 0.9632, + "step": 5440 + }, + { + "epoch": 2.4948973741543403, + "grad_norm": 0.0943269282579422, + "learning_rate": 7.251641029215761e-05, + "loss": 0.6806, + "step": 5441 + }, + { + "epoch": 2.4953560371517027, + "grad_norm": 0.38258495926856995, + "learning_rate": 7.238801500170838e-05, + "loss": 1.7886, + "step": 5442 + }, + { + "epoch": 2.4958147001490656, + "grad_norm": 0.2949416935443878, + "learning_rate": 7.225972460756236e-05, + "loss": 1.0203, + "step": 5443 + }, + { + "epoch": 2.496273363146428, + "grad_norm": 0.3290545344352722, + "learning_rate": 7.213153914119008e-05, + "loss": 1.5944, + "step": 5444 + }, + { + "epoch": 2.496732026143791, + "grad_norm": 0.19472959637641907, + "learning_rate": 7.200345863403629e-05, + "loss": 1.22, + "step": 5445 + }, + { + "epoch": 2.4971906891411537, + "grad_norm": 0.27619174122810364, + "learning_rate": 7.187548311752012e-05, + "loss": 1.4291, + "step": 5446 + }, + { + "epoch": 2.497649352138516, + "grad_norm": 0.3692777156829834, + "learning_rate": 7.174761262303492e-05, + "loss": 1.534, + "step": 5447 + }, + { + "epoch": 2.498108015135879, + "grad_norm": 0.4094838798046112, + "learning_rate": 7.161984718194819e-05, + "loss": 1.1218, + "step": 5448 + }, + { + "epoch": 2.4985666781332414, + "grad_norm": 0.4053116738796234, + "learning_rate": 7.149218682560182e-05, + "loss": 1.2167, + "step": 5449 + }, + { + "epoch": 2.4990253411306043, + "grad_norm": 0.32053858041763306, + "learning_rate": 7.136463158531159e-05, + "loss": 1.58, + "step": 5450 + }, + { + "epoch": 2.499484004127967, + "grad_norm": 0.287165105342865, + "learning_rate": 7.123718149236791e-05, + "loss": 1.4514, + "step": 5451 + }, + { + "epoch": 2.4999426671253295, + "grad_norm": 0.2848725914955139, + "learning_rate": 7.11098365780351e-05, + "loss": 1.2621, + "step": 5452 + }, + { + "epoch": 2.5004013301226924, + "grad_norm": 0.398199200630188, + "learning_rate": 7.098259687355174e-05, + "loss": 1.1001, + "step": 5453 + }, + { + "epoch": 2.500859993120055, + "grad_norm": 0.29740533232688904, + "learning_rate": 7.085546241013085e-05, + "loss": 1.9265, + "step": 5454 + }, + { + "epoch": 2.5013186561174177, + "grad_norm": 0.25716161727905273, + "learning_rate": 7.07284332189595e-05, + "loss": 0.3225, + "step": 5455 + }, + { + "epoch": 2.5017773191147805, + "grad_norm": 0.19730542600154877, + "learning_rate": 7.060150933119852e-05, + "loss": 1.6379, + "step": 5456 + }, + { + "epoch": 2.5022359821121434, + "grad_norm": 0.3309273421764374, + "learning_rate": 7.04746907779834e-05, + "loss": 1.517, + "step": 5457 + }, + { + "epoch": 2.502694645109506, + "grad_norm": 0.32735908031463623, + "learning_rate": 7.034797759042371e-05, + "loss": 1.1667, + "step": 5458 + }, + { + "epoch": 2.503153308106868, + "grad_norm": 0.3031359910964966, + "learning_rate": 7.022136979960303e-05, + "loss": 1.3101, + "step": 5459 + }, + { + "epoch": 2.503611971104231, + "grad_norm": 0.18799692392349243, + "learning_rate": 7.009486743657934e-05, + "loss": 0.7399, + "step": 5460 + }, + { + "epoch": 2.504070634101594, + "grad_norm": 0.3134501576423645, + "learning_rate": 6.996847053238437e-05, + "loss": 1.1102, + "step": 5461 + }, + { + "epoch": 2.5045292970989568, + "grad_norm": 0.2856585681438446, + "learning_rate": 6.98421791180242e-05, + "loss": 1.4576, + "step": 5462 + }, + { + "epoch": 2.504987960096319, + "grad_norm": 0.34674420952796936, + "learning_rate": 6.971599322447903e-05, + "loss": 1.7148, + "step": 5463 + }, + { + "epoch": 2.505446623093682, + "grad_norm": 0.3734150528907776, + "learning_rate": 6.958991288270334e-05, + "loss": 0.9374, + "step": 5464 + }, + { + "epoch": 2.5059052860910445, + "grad_norm": 0.27259308099746704, + "learning_rate": 6.94639381236254e-05, + "loss": 1.8166, + "step": 5465 + }, + { + "epoch": 2.5063639490884073, + "grad_norm": 0.3674778640270233, + "learning_rate": 6.933806897814787e-05, + "loss": 1.4604, + "step": 5466 + }, + { + "epoch": 2.50682261208577, + "grad_norm": 0.45165976881980896, + "learning_rate": 6.921230547714719e-05, + "loss": 1.4612, + "step": 5467 + }, + { + "epoch": 2.5072812750831326, + "grad_norm": 0.2723102271556854, + "learning_rate": 6.90866476514741e-05, + "loss": 0.8123, + "step": 5468 + }, + { + "epoch": 2.5077399380804954, + "grad_norm": 0.27866679430007935, + "learning_rate": 6.896109553195334e-05, + "loss": 1.6746, + "step": 5469 + }, + { + "epoch": 2.508198601077858, + "grad_norm": 0.4871840476989746, + "learning_rate": 6.883564914938384e-05, + "loss": 2.0107, + "step": 5470 + }, + { + "epoch": 2.5086572640752207, + "grad_norm": 0.27734771370887756, + "learning_rate": 6.871030853453857e-05, + "loss": 0.4191, + "step": 5471 + }, + { + "epoch": 2.5091159270725836, + "grad_norm": 0.3640304207801819, + "learning_rate": 6.858507371816424e-05, + "loss": 1.7487, + "step": 5472 + }, + { + "epoch": 2.509574590069946, + "grad_norm": 0.42865926027297974, + "learning_rate": 6.845994473098194e-05, + "loss": 1.7012, + "step": 5473 + }, + { + "epoch": 2.510033253067309, + "grad_norm": 0.27769333124160767, + "learning_rate": 6.833492160368681e-05, + "loss": 0.8013, + "step": 5474 + }, + { + "epoch": 2.5104919160646713, + "grad_norm": 0.08679754287004471, + "learning_rate": 6.821000436694791e-05, + "loss": 1.0261, + "step": 5475 + }, + { + "epoch": 2.510950579062034, + "grad_norm": 0.41627833247184753, + "learning_rate": 6.808519305140831e-05, + "loss": 1.1582, + "step": 5476 + }, + { + "epoch": 2.511409242059397, + "grad_norm": 0.19535160064697266, + "learning_rate": 6.79604876876852e-05, + "loss": 1.0494, + "step": 5477 + }, + { + "epoch": 2.5118679050567594, + "grad_norm": 0.31459343433380127, + "learning_rate": 6.783588830636956e-05, + "loss": 1.9631, + "step": 5478 + }, + { + "epoch": 2.5123265680541222, + "grad_norm": 0.3410641551017761, + "learning_rate": 6.771139493802653e-05, + "loss": 1.2182, + "step": 5479 + }, + { + "epoch": 2.5127852310514847, + "grad_norm": 0.3488866090774536, + "learning_rate": 6.75870076131953e-05, + "loss": 1.1641, + "step": 5480 + }, + { + "epoch": 2.5132438940488475, + "grad_norm": 0.34782347083091736, + "learning_rate": 6.746272636238898e-05, + "loss": 1.5015, + "step": 5481 + }, + { + "epoch": 2.5137025570462104, + "grad_norm": 0.41583070158958435, + "learning_rate": 6.733855121609467e-05, + "loss": 1.5594, + "step": 5482 + }, + { + "epoch": 2.5141612200435732, + "grad_norm": 0.3468913435935974, + "learning_rate": 6.721448220477333e-05, + "loss": 1.8156, + "step": 5483 + }, + { + "epoch": 2.5146198830409356, + "grad_norm": 0.324745237827301, + "learning_rate": 6.709051935886007e-05, + "loss": 1.3654, + "step": 5484 + }, + { + "epoch": 2.5150785460382985, + "grad_norm": 0.31781327724456787, + "learning_rate": 6.696666270876389e-05, + "loss": 1.3649, + "step": 5485 + }, + { + "epoch": 2.515537209035661, + "grad_norm": 0.3324166238307953, + "learning_rate": 6.684291228486761e-05, + "loss": 0.8118, + "step": 5486 + }, + { + "epoch": 2.5159958720330238, + "grad_norm": 0.3521929085254669, + "learning_rate": 6.67192681175282e-05, + "loss": 2.1743, + "step": 5487 + }, + { + "epoch": 2.5164545350303866, + "grad_norm": 0.43346625566482544, + "learning_rate": 6.65957302370766e-05, + "loss": 1.2647, + "step": 5488 + }, + { + "epoch": 2.516913198027749, + "grad_norm": 0.10324136167764664, + "learning_rate": 6.647229867381722e-05, + "loss": 0.815, + "step": 5489 + }, + { + "epoch": 2.517371861025112, + "grad_norm": 0.23839758336544037, + "learning_rate": 6.634897345802888e-05, + "loss": 1.7404, + "step": 5490 + }, + { + "epoch": 2.5178305240224743, + "grad_norm": 0.39846575260162354, + "learning_rate": 6.62257546199641e-05, + "loss": 0.859, + "step": 5491 + }, + { + "epoch": 2.518289187019837, + "grad_norm": 0.2599179148674011, + "learning_rate": 6.610264218984946e-05, + "loss": 1.4614, + "step": 5492 + }, + { + "epoch": 2.5187478500172, + "grad_norm": 0.3531731963157654, + "learning_rate": 6.597963619788521e-05, + "loss": 1.4722, + "step": 5493 + }, + { + "epoch": 2.5192065130145624, + "grad_norm": 0.28518784046173096, + "learning_rate": 6.585673667424563e-05, + "loss": 0.9644, + "step": 5494 + }, + { + "epoch": 2.5196651760119253, + "grad_norm": 0.1243680790066719, + "learning_rate": 6.573394364907892e-05, + "loss": 0.4391, + "step": 5495 + }, + { + "epoch": 2.5201238390092877, + "grad_norm": 0.11469584703445435, + "learning_rate": 6.561125715250704e-05, + "loss": 1.0112, + "step": 5496 + }, + { + "epoch": 2.5205825020066506, + "grad_norm": 0.3533564507961273, + "learning_rate": 6.548867721462587e-05, + "loss": 1.1036, + "step": 5497 + }, + { + "epoch": 2.5210411650040134, + "grad_norm": 0.23048144578933716, + "learning_rate": 6.536620386550518e-05, + "loss": 1.0601, + "step": 5498 + }, + { + "epoch": 2.521499828001376, + "grad_norm": 0.20298047363758087, + "learning_rate": 6.524383713518867e-05, + "loss": 0.7747, + "step": 5499 + }, + { + "epoch": 2.5219584909987387, + "grad_norm": 0.3512571156024933, + "learning_rate": 6.512157705369354e-05, + "loss": 1.4583, + "step": 5500 + }, + { + "epoch": 2.522417153996101, + "grad_norm": 0.3375270962715149, + "learning_rate": 6.49994236510112e-05, + "loss": 1.4264, + "step": 5501 + }, + { + "epoch": 2.522875816993464, + "grad_norm": 0.27176031470298767, + "learning_rate": 6.487737695710677e-05, + "loss": 1.6927, + "step": 5502 + }, + { + "epoch": 2.523334479990827, + "grad_norm": 0.3321675956249237, + "learning_rate": 6.475543700191916e-05, + "loss": 0.9206, + "step": 5503 + }, + { + "epoch": 2.5237931429881897, + "grad_norm": 0.19732095301151276, + "learning_rate": 6.46336038153611e-05, + "loss": 1.333, + "step": 5504 + }, + { + "epoch": 2.524251805985552, + "grad_norm": 0.3470333218574524, + "learning_rate": 6.451187742731923e-05, + "loss": 1.6497, + "step": 5505 + }, + { + "epoch": 2.524710468982915, + "grad_norm": 0.3208187222480774, + "learning_rate": 6.43902578676538e-05, + "loss": 1.3061, + "step": 5506 + }, + { + "epoch": 2.5251691319802774, + "grad_norm": 0.353569358587265, + "learning_rate": 6.426874516619907e-05, + "loss": 2.2356, + "step": 5507 + }, + { + "epoch": 2.5256277949776402, + "grad_norm": 0.3554728031158447, + "learning_rate": 6.41473393527629e-05, + "loss": 1.2403, + "step": 5508 + }, + { + "epoch": 2.526086457975003, + "grad_norm": 0.3198685050010681, + "learning_rate": 6.402604045712707e-05, + "loss": 0.8093, + "step": 5509 + }, + { + "epoch": 2.5265451209723655, + "grad_norm": 0.20116372406482697, + "learning_rate": 6.39048485090471e-05, + "loss": 1.1317, + "step": 5510 + }, + { + "epoch": 2.5270037839697284, + "grad_norm": 0.33361703157424927, + "learning_rate": 6.378376353825222e-05, + "loss": 2.0852, + "step": 5511 + }, + { + "epoch": 2.5274624469670908, + "grad_norm": 0.3906664252281189, + "learning_rate": 6.366278557444538e-05, + "loss": 1.202, + "step": 5512 + }, + { + "epoch": 2.5279211099644536, + "grad_norm": 0.2128971815109253, + "learning_rate": 6.354191464730335e-05, + "loss": 0.8657, + "step": 5513 + }, + { + "epoch": 2.5283797729618165, + "grad_norm": 0.2909519672393799, + "learning_rate": 6.34211507864767e-05, + "loss": 1.7382, + "step": 5514 + }, + { + "epoch": 2.528838435959179, + "grad_norm": 0.4050029218196869, + "learning_rate": 6.330049402158955e-05, + "loss": 1.5528, + "step": 5515 + }, + { + "epoch": 2.5292970989565418, + "grad_norm": 0.27755218744277954, + "learning_rate": 6.317994438224023e-05, + "loss": 1.2415, + "step": 5516 + }, + { + "epoch": 2.529755761953904, + "grad_norm": 0.3861069083213806, + "learning_rate": 6.305950189800003e-05, + "loss": 2.1422, + "step": 5517 + }, + { + "epoch": 2.530214424951267, + "grad_norm": 0.5897046327590942, + "learning_rate": 6.293916659841454e-05, + "loss": 2.0242, + "step": 5518 + }, + { + "epoch": 2.53067308794863, + "grad_norm": 0.27490806579589844, + "learning_rate": 6.281893851300285e-05, + "loss": 0.8, + "step": 5519 + }, + { + "epoch": 2.5311317509459923, + "grad_norm": 0.35904571413993835, + "learning_rate": 6.269881767125778e-05, + "loss": 1.6329, + "step": 5520 + }, + { + "epoch": 2.531590413943355, + "grad_norm": 0.3965526819229126, + "learning_rate": 6.257880410264589e-05, + "loss": 1.6531, + "step": 5521 + }, + { + "epoch": 2.5320490769407176, + "grad_norm": 0.3628177046775818, + "learning_rate": 6.245889783660735e-05, + "loss": 1.5156, + "step": 5522 + }, + { + "epoch": 2.5325077399380804, + "grad_norm": 0.19028045237064362, + "learning_rate": 6.233909890255596e-05, + "loss": 0.7489, + "step": 5523 + }, + { + "epoch": 2.5329664029354433, + "grad_norm": 0.30474984645843506, + "learning_rate": 6.22194073298793e-05, + "loss": 1.6149, + "step": 5524 + }, + { + "epoch": 2.533425065932806, + "grad_norm": 0.3913409411907196, + "learning_rate": 6.209982314793845e-05, + "loss": 1.6556, + "step": 5525 + }, + { + "epoch": 2.5338837289301686, + "grad_norm": 0.3876248896121979, + "learning_rate": 6.198034638606854e-05, + "loss": 1.5082, + "step": 5526 + }, + { + "epoch": 2.5343423919275314, + "grad_norm": 0.324945867061615, + "learning_rate": 6.186097707357802e-05, + "loss": 1.6067, + "step": 5527 + }, + { + "epoch": 2.534801054924894, + "grad_norm": 0.26667168736457825, + "learning_rate": 6.174171523974886e-05, + "loss": 1.201, + "step": 5528 + }, + { + "epoch": 2.5352597179222567, + "grad_norm": 0.2764081656932831, + "learning_rate": 6.1622560913837e-05, + "loss": 1.2445, + "step": 5529 + }, + { + "epoch": 2.5357183809196195, + "grad_norm": 0.39166367053985596, + "learning_rate": 6.150351412507176e-05, + "loss": 1.3779, + "step": 5530 + }, + { + "epoch": 2.536177043916982, + "grad_norm": 0.2029440701007843, + "learning_rate": 6.138457490265625e-05, + "loss": 1.1893, + "step": 5531 + }, + { + "epoch": 2.536635706914345, + "grad_norm": 0.3429381847381592, + "learning_rate": 6.126574327576711e-05, + "loss": 1.2501, + "step": 5532 + }, + { + "epoch": 2.5370943699117072, + "grad_norm": 0.41793501377105713, + "learning_rate": 6.11470192735547e-05, + "loss": 1.7427, + "step": 5533 + }, + { + "epoch": 2.53755303290907, + "grad_norm": 0.3555110692977905, + "learning_rate": 6.1028402925142665e-05, + "loss": 1.5948, + "step": 5534 + }, + { + "epoch": 2.538011695906433, + "grad_norm": 0.26082339882850647, + "learning_rate": 6.090989425962839e-05, + "loss": 1.725, + "step": 5535 + }, + { + "epoch": 2.5384703589037954, + "grad_norm": 0.3338443338871002, + "learning_rate": 6.079149330608319e-05, + "loss": 1.4976, + "step": 5536 + }, + { + "epoch": 2.538929021901158, + "grad_norm": 0.35809195041656494, + "learning_rate": 6.067320009355148e-05, + "loss": 1.532, + "step": 5537 + }, + { + "epoch": 2.5393876848985206, + "grad_norm": 0.29855048656463623, + "learning_rate": 6.055501465105168e-05, + "loss": 1.3347, + "step": 5538 + }, + { + "epoch": 2.5398463478958835, + "grad_norm": 0.3272113800048828, + "learning_rate": 6.043693700757519e-05, + "loss": 0.8456, + "step": 5539 + }, + { + "epoch": 2.5403050108932463, + "grad_norm": 0.06610861420631409, + "learning_rate": 6.0318967192087516e-05, + "loss": 1.141, + "step": 5540 + }, + { + "epoch": 2.5407636738906088, + "grad_norm": 0.3634183406829834, + "learning_rate": 6.0201105233527453e-05, + "loss": 0.9582, + "step": 5541 + }, + { + "epoch": 2.5412223368879716, + "grad_norm": 0.13566255569458008, + "learning_rate": 6.008335116080732e-05, + "loss": 1.0286, + "step": 5542 + }, + { + "epoch": 2.541680999885334, + "grad_norm": 0.33389878273010254, + "learning_rate": 5.9965705002813174e-05, + "loss": 1.2472, + "step": 5543 + }, + { + "epoch": 2.542139662882697, + "grad_norm": 0.3005581796169281, + "learning_rate": 5.984816678840449e-05, + "loss": 1.3406, + "step": 5544 + }, + { + "epoch": 2.5425983258800597, + "grad_norm": 0.44604259729385376, + "learning_rate": 5.973073654641392e-05, + "loss": 1.6216, + "step": 5545 + }, + { + "epoch": 2.543056988877422, + "grad_norm": 0.365333616733551, + "learning_rate": 5.9613414305648314e-05, + "loss": 1.1402, + "step": 5546 + }, + { + "epoch": 2.543515651874785, + "grad_norm": 0.32972466945648193, + "learning_rate": 5.9496200094887546e-05, + "loss": 1.101, + "step": 5547 + }, + { + "epoch": 2.5439743148721474, + "grad_norm": 0.24824364483356476, + "learning_rate": 5.937909394288504e-05, + "loss": 1.7358, + "step": 5548 + }, + { + "epoch": 2.5444329778695103, + "grad_norm": 0.433464378118515, + "learning_rate": 5.92620958783679e-05, + "loss": 1.2081, + "step": 5549 + }, + { + "epoch": 2.544891640866873, + "grad_norm": 0.07062580436468124, + "learning_rate": 5.9145205930036583e-05, + "loss": 0.7539, + "step": 5550 + }, + { + "epoch": 2.545350303864236, + "grad_norm": 0.3250384032726288, + "learning_rate": 5.9028424126564894e-05, + "loss": 0.9884, + "step": 5551 + }, + { + "epoch": 2.5458089668615984, + "grad_norm": 0.3027406930923462, + "learning_rate": 5.89117504966003e-05, + "loss": 1.7088, + "step": 5552 + }, + { + "epoch": 2.5462676298589613, + "grad_norm": 0.23532333970069885, + "learning_rate": 5.879518506876375e-05, + "loss": 0.5515, + "step": 5553 + }, + { + "epoch": 2.5467262928563237, + "grad_norm": 0.25035008788108826, + "learning_rate": 5.867872787164952e-05, + "loss": 1.5544, + "step": 5554 + }, + { + "epoch": 2.5471849558536865, + "grad_norm": 0.38063761591911316, + "learning_rate": 5.8562378933825486e-05, + "loss": 1.6796, + "step": 5555 + }, + { + "epoch": 2.5476436188510494, + "grad_norm": 0.35923126339912415, + "learning_rate": 5.8446138283832826e-05, + "loss": 1.0818, + "step": 5556 + }, + { + "epoch": 2.548102281848412, + "grad_norm": 0.19541509449481964, + "learning_rate": 5.833000595018617e-05, + "loss": 1.6249, + "step": 5557 + }, + { + "epoch": 2.5485609448457747, + "grad_norm": 0.38436034321784973, + "learning_rate": 5.821398196137368e-05, + "loss": 1.3776, + "step": 5558 + }, + { + "epoch": 2.549019607843137, + "grad_norm": 0.2822701930999756, + "learning_rate": 5.8098066345856846e-05, + "loss": 0.4237, + "step": 5559 + }, + { + "epoch": 2.5494782708405, + "grad_norm": 0.15349756181240082, + "learning_rate": 5.798225913207067e-05, + "loss": 0.5825, + "step": 5560 + }, + { + "epoch": 2.549936933837863, + "grad_norm": 0.13313986361026764, + "learning_rate": 5.786656034842347e-05, + "loss": 1.3864, + "step": 5561 + }, + { + "epoch": 2.550395596835225, + "grad_norm": 0.45757579803466797, + "learning_rate": 5.775097002329693e-05, + "loss": 0.76, + "step": 5562 + }, + { + "epoch": 2.550854259832588, + "grad_norm": 0.1999332308769226, + "learning_rate": 5.763548818504616e-05, + "loss": 1.2987, + "step": 5563 + }, + { + "epoch": 2.5513129228299505, + "grad_norm": 0.30660200119018555, + "learning_rate": 5.752011486199982e-05, + "loss": 1.3302, + "step": 5564 + }, + { + "epoch": 2.5517715858273133, + "grad_norm": 0.27582883834838867, + "learning_rate": 5.740485008245966e-05, + "loss": 1.0434, + "step": 5565 + }, + { + "epoch": 2.552230248824676, + "grad_norm": 0.24701207876205444, + "learning_rate": 5.728969387470112e-05, + "loss": 0.4937, + "step": 5566 + }, + { + "epoch": 2.5526889118220386, + "grad_norm": 0.25263074040412903, + "learning_rate": 5.7174646266972776e-05, + "loss": 1.2341, + "step": 5567 + }, + { + "epoch": 2.5531475748194015, + "grad_norm": 0.3954515755176544, + "learning_rate": 5.70597072874966e-05, + "loss": 1.4972, + "step": 5568 + }, + { + "epoch": 2.553606237816764, + "grad_norm": 0.4183090031147003, + "learning_rate": 5.694487696446804e-05, + "loss": 2.063, + "step": 5569 + }, + { + "epoch": 2.5540649008141267, + "grad_norm": 0.3172152638435364, + "learning_rate": 5.683015532605573e-05, + "loss": 0.9099, + "step": 5570 + }, + { + "epoch": 2.5545235638114896, + "grad_norm": 0.3616786301136017, + "learning_rate": 5.671554240040172e-05, + "loss": 1.481, + "step": 5571 + }, + { + "epoch": 2.5549822268088525, + "grad_norm": 0.30609598755836487, + "learning_rate": 5.660103821562151e-05, + "loss": 1.6557, + "step": 5572 + }, + { + "epoch": 2.555440889806215, + "grad_norm": 0.4670886993408203, + "learning_rate": 5.648664279980365e-05, + "loss": 1.2957, + "step": 5573 + }, + { + "epoch": 2.5558995528035777, + "grad_norm": 0.2856581211090088, + "learning_rate": 5.6372356181010164e-05, + "loss": 1.1547, + "step": 5574 + }, + { + "epoch": 2.55635821580094, + "grad_norm": 0.21106205880641937, + "learning_rate": 5.625817838727643e-05, + "loss": 1.188, + "step": 5575 + }, + { + "epoch": 2.556816878798303, + "grad_norm": 0.1674429029226303, + "learning_rate": 5.614410944661108e-05, + "loss": 0.5747, + "step": 5576 + }, + { + "epoch": 2.557275541795666, + "grad_norm": 0.15634135901927948, + "learning_rate": 5.603014938699602e-05, + "loss": 1.5959, + "step": 5577 + }, + { + "epoch": 2.5577342047930283, + "grad_norm": 0.2876671552658081, + "learning_rate": 5.591629823638655e-05, + "loss": 0.8638, + "step": 5578 + }, + { + "epoch": 2.558192867790391, + "grad_norm": 0.3428886830806732, + "learning_rate": 5.5802556022711115e-05, + "loss": 1.4761, + "step": 5579 + }, + { + "epoch": 2.5586515307877535, + "grad_norm": 0.33712685108184814, + "learning_rate": 5.5688922773871555e-05, + "loss": 1.5991, + "step": 5580 + }, + { + "epoch": 2.5591101937851164, + "grad_norm": 0.3278370797634125, + "learning_rate": 5.557539851774285e-05, + "loss": 1.3031, + "step": 5581 + }, + { + "epoch": 2.5595688567824793, + "grad_norm": 0.30777499079704285, + "learning_rate": 5.546198328217333e-05, + "loss": 1.088, + "step": 5582 + }, + { + "epoch": 2.5600275197798417, + "grad_norm": 0.16568246483802795, + "learning_rate": 5.5348677094984755e-05, + "loss": 0.9488, + "step": 5583 + }, + { + "epoch": 2.5604861827772045, + "grad_norm": 0.3086935877799988, + "learning_rate": 5.523547998397166e-05, + "loss": 1.6018, + "step": 5584 + }, + { + "epoch": 2.560944845774567, + "grad_norm": 0.3688352406024933, + "learning_rate": 5.512239197690222e-05, + "loss": 1.5702, + "step": 5585 + }, + { + "epoch": 2.56140350877193, + "grad_norm": 0.34201377630233765, + "learning_rate": 5.5009413101517804e-05, + "loss": 1.51, + "step": 5586 + }, + { + "epoch": 2.5618621717692927, + "grad_norm": 0.5058449506759644, + "learning_rate": 5.4896543385532904e-05, + "loss": 2.0178, + "step": 5587 + }, + { + "epoch": 2.562320834766655, + "grad_norm": 0.34568533301353455, + "learning_rate": 5.4783782856635156e-05, + "loss": 1.5913, + "step": 5588 + }, + { + "epoch": 2.562779497764018, + "grad_norm": 0.27001887559890747, + "learning_rate": 5.467113154248582e-05, + "loss": 0.7074, + "step": 5589 + }, + { + "epoch": 2.5632381607613803, + "grad_norm": 0.3905376195907593, + "learning_rate": 5.455858947071885e-05, + "loss": 2.2896, + "step": 5590 + }, + { + "epoch": 2.563696823758743, + "grad_norm": 0.34636062383651733, + "learning_rate": 5.444615666894165e-05, + "loss": 1.4116, + "step": 5591 + }, + { + "epoch": 2.564155486756106, + "grad_norm": 0.3349077105522156, + "learning_rate": 5.433383316473484e-05, + "loss": 0.8179, + "step": 5592 + }, + { + "epoch": 2.564614149753469, + "grad_norm": 0.20421668887138367, + "learning_rate": 5.42216189856522e-05, + "loss": 1.3837, + "step": 5593 + }, + { + "epoch": 2.5650728127508313, + "grad_norm": 0.36774739623069763, + "learning_rate": 5.410951415922072e-05, + "loss": 1.0772, + "step": 5594 + }, + { + "epoch": 2.565531475748194, + "grad_norm": 0.27548274397850037, + "learning_rate": 5.399751871294034e-05, + "loss": 0.7288, + "step": 5595 + }, + { + "epoch": 2.5659901387455566, + "grad_norm": 0.27638116478919983, + "learning_rate": 5.388563267428448e-05, + "loss": 1.6965, + "step": 5596 + }, + { + "epoch": 2.5664488017429194, + "grad_norm": 0.3485362231731415, + "learning_rate": 5.377385607069951e-05, + "loss": 1.7012, + "step": 5597 + }, + { + "epoch": 2.5669074647402823, + "grad_norm": 0.4059440493583679, + "learning_rate": 5.366218892960517e-05, + "loss": 1.7341, + "step": 5598 + }, + { + "epoch": 2.5673661277376447, + "grad_norm": 0.3992423415184021, + "learning_rate": 5.3550631278394156e-05, + "loss": 1.6401, + "step": 5599 + }, + { + "epoch": 2.5678247907350076, + "grad_norm": 0.34082409739494324, + "learning_rate": 5.3439183144432414e-05, + "loss": 1.1353, + "step": 5600 + }, + { + "epoch": 2.56828345373237, + "grad_norm": 0.2575600743293762, + "learning_rate": 5.33278445550589e-05, + "loss": 0.4625, + "step": 5601 + }, + { + "epoch": 2.568742116729733, + "grad_norm": 0.30971759557724, + "learning_rate": 5.321661553758572e-05, + "loss": 1.5112, + "step": 5602 + }, + { + "epoch": 2.5692007797270957, + "grad_norm": 0.16957223415374756, + "learning_rate": 5.3105496119298266e-05, + "loss": 0.7928, + "step": 5603 + }, + { + "epoch": 2.569659442724458, + "grad_norm": 0.22056140005588531, + "learning_rate": 5.299448632745485e-05, + "loss": 0.8559, + "step": 5604 + }, + { + "epoch": 2.570118105721821, + "grad_norm": 0.2757347524166107, + "learning_rate": 5.288358618928707e-05, + "loss": 1.0507, + "step": 5605 + }, + { + "epoch": 2.5705767687191834, + "grad_norm": 0.2410385012626648, + "learning_rate": 5.27727957319995e-05, + "loss": 1.358, + "step": 5606 + }, + { + "epoch": 2.5710354317165462, + "grad_norm": 0.39340847730636597, + "learning_rate": 5.266211498276968e-05, + "loss": 1.5098, + "step": 5607 + }, + { + "epoch": 2.571494094713909, + "grad_norm": 0.23687437176704407, + "learning_rate": 5.2551543968748564e-05, + "loss": 1.2888, + "step": 5608 + }, + { + "epoch": 2.5719527577112715, + "grad_norm": 0.32233813405036926, + "learning_rate": 5.244108271706005e-05, + "loss": 1.6709, + "step": 5609 + }, + { + "epoch": 2.5724114207086344, + "grad_norm": 0.40911930799484253, + "learning_rate": 5.233073125480098e-05, + "loss": 1.2322, + "step": 5610 + }, + { + "epoch": 2.572870083705997, + "grad_norm": 0.24620671570301056, + "learning_rate": 5.2220489609041466e-05, + "loss": 0.8972, + "step": 5611 + }, + { + "epoch": 2.5733287467033596, + "grad_norm": 0.2210768610239029, + "learning_rate": 5.2110357806824494e-05, + "loss": 1.2861, + "step": 5612 + }, + { + "epoch": 2.5737874097007225, + "grad_norm": 0.9936086535453796, + "learning_rate": 5.2000335875166117e-05, + "loss": 1.4037, + "step": 5613 + }, + { + "epoch": 2.574246072698085, + "grad_norm": 0.22487474977970123, + "learning_rate": 5.1890423841055636e-05, + "loss": 1.1986, + "step": 5614 + }, + { + "epoch": 2.5747047356954478, + "grad_norm": 0.36320793628692627, + "learning_rate": 5.17806217314552e-05, + "loss": 1.7546, + "step": 5615 + }, + { + "epoch": 2.57516339869281, + "grad_norm": 0.3797517418861389, + "learning_rate": 5.167092957330011e-05, + "loss": 0.8319, + "step": 5616 + }, + { + "epoch": 2.575622061690173, + "grad_norm": 0.1391095519065857, + "learning_rate": 5.15613473934986e-05, + "loss": 0.9581, + "step": 5617 + }, + { + "epoch": 2.576080724687536, + "grad_norm": 0.3511468470096588, + "learning_rate": 5.145187521893202e-05, + "loss": 1.5688, + "step": 5618 + }, + { + "epoch": 2.5765393876848988, + "grad_norm": 0.33020275831222534, + "learning_rate": 5.13425130764546e-05, + "loss": 1.8931, + "step": 5619 + }, + { + "epoch": 2.576998050682261, + "grad_norm": 0.3568331003189087, + "learning_rate": 5.1233260992893715e-05, + "loss": 1.1766, + "step": 5620 + }, + { + "epoch": 2.577456713679624, + "grad_norm": 0.3786485195159912, + "learning_rate": 5.112411899504971e-05, + "loss": 1.5772, + "step": 5621 + }, + { + "epoch": 2.5779153766769864, + "grad_norm": 0.3674635887145996, + "learning_rate": 5.1015087109695934e-05, + "loss": 2.0232, + "step": 5622 + }, + { + "epoch": 2.5783740396743493, + "grad_norm": 0.428074449300766, + "learning_rate": 5.090616536357856e-05, + "loss": 0.9435, + "step": 5623 + }, + { + "epoch": 2.578832702671712, + "grad_norm": 0.10987308621406555, + "learning_rate": 5.0797353783416986e-05, + "loss": 0.4183, + "step": 5624 + }, + { + "epoch": 2.5792913656690746, + "grad_norm": 0.29732468724250793, + "learning_rate": 5.068865239590342e-05, + "loss": 0.8677, + "step": 5625 + }, + { + "epoch": 2.5797500286664374, + "grad_norm": 0.272393137216568, + "learning_rate": 5.0580061227703145e-05, + "loss": 0.9888, + "step": 5626 + }, + { + "epoch": 2.5802086916638, + "grad_norm": 0.23563776910305023, + "learning_rate": 5.047158030545434e-05, + "loss": 1.5809, + "step": 5627 + }, + { + "epoch": 2.5806673546611627, + "grad_norm": 0.4073021411895752, + "learning_rate": 5.036320965576813e-05, + "loss": 1.1274, + "step": 5628 + }, + { + "epoch": 2.5811260176585256, + "grad_norm": 0.4311738610267639, + "learning_rate": 5.0254949305228614e-05, + "loss": 0.9954, + "step": 5629 + }, + { + "epoch": 2.581584680655888, + "grad_norm": 0.2439291924238205, + "learning_rate": 5.01467992803929e-05, + "loss": 1.2027, + "step": 5630 + }, + { + "epoch": 2.582043343653251, + "grad_norm": 0.241475909948349, + "learning_rate": 5.003875960779097e-05, + "loss": 1.0108, + "step": 5631 + }, + { + "epoch": 2.5825020066506132, + "grad_norm": 0.25994980335235596, + "learning_rate": 4.9930830313925645e-05, + "loss": 1.0152, + "step": 5632 + }, + { + "epoch": 2.582960669647976, + "grad_norm": 0.234562948346138, + "learning_rate": 4.982301142527279e-05, + "loss": 0.7739, + "step": 5633 + }, + { + "epoch": 2.583419332645339, + "grad_norm": 0.2599080801010132, + "learning_rate": 4.971530296828131e-05, + "loss": 1.2073, + "step": 5634 + }, + { + "epoch": 2.5838779956427014, + "grad_norm": 0.3854179382324219, + "learning_rate": 4.960770496937267e-05, + "loss": 1.1806, + "step": 5635 + }, + { + "epoch": 2.5843366586400642, + "grad_norm": 0.21886670589447021, + "learning_rate": 4.9500217454941434e-05, + "loss": 1.6209, + "step": 5636 + }, + { + "epoch": 2.5847953216374266, + "grad_norm": 0.4453312158584595, + "learning_rate": 4.939284045135517e-05, + "loss": 1.5825, + "step": 5637 + }, + { + "epoch": 2.5852539846347895, + "grad_norm": 0.24376074969768524, + "learning_rate": 4.9285573984954144e-05, + "loss": 1.1272, + "step": 5638 + }, + { + "epoch": 2.5857126476321524, + "grad_norm": 0.3159853219985962, + "learning_rate": 4.91784180820517e-05, + "loss": 0.9751, + "step": 5639 + }, + { + "epoch": 2.586171310629515, + "grad_norm": 0.39112159609794617, + "learning_rate": 4.907137276893381e-05, + "loss": 1.7409, + "step": 5640 + }, + { + "epoch": 2.5866299736268776, + "grad_norm": 0.39343714714050293, + "learning_rate": 4.896443807185963e-05, + "loss": 1.5584, + "step": 5641 + }, + { + "epoch": 2.5870886366242405, + "grad_norm": 0.361751526594162, + "learning_rate": 4.88576140170609e-05, + "loss": 1.7249, + "step": 5642 + }, + { + "epoch": 2.587547299621603, + "grad_norm": 0.273359090089798, + "learning_rate": 4.875090063074233e-05, + "loss": 0.8665, + "step": 5643 + }, + { + "epoch": 2.5880059626189658, + "grad_norm": 0.11545918881893158, + "learning_rate": 4.864429793908154e-05, + "loss": 1.1998, + "step": 5644 + }, + { + "epoch": 2.5884646256163286, + "grad_norm": 1.5149635076522827, + "learning_rate": 4.8537805968228985e-05, + "loss": 1.7397, + "step": 5645 + }, + { + "epoch": 2.588923288613691, + "grad_norm": 0.37379226088523865, + "learning_rate": 4.843142474430773e-05, + "loss": 1.7961, + "step": 5646 + }, + { + "epoch": 2.589381951611054, + "grad_norm": 0.41186100244522095, + "learning_rate": 4.8325154293414e-05, + "loss": 1.182, + "step": 5647 + }, + { + "epoch": 2.5898406146084163, + "grad_norm": 0.23608848452568054, + "learning_rate": 4.8218994641616606e-05, + "loss": 1.4241, + "step": 5648 + }, + { + "epoch": 2.590299277605779, + "grad_norm": 0.2904841899871826, + "learning_rate": 4.8112945814957396e-05, + "loss": 0.8572, + "step": 5649 + }, + { + "epoch": 2.590757940603142, + "grad_norm": 0.20750272274017334, + "learning_rate": 4.800700783945083e-05, + "loss": 1.0704, + "step": 5650 + }, + { + "epoch": 2.5912166036005044, + "grad_norm": 0.2611089050769806, + "learning_rate": 4.790118074108429e-05, + "loss": 1.7778, + "step": 5651 + }, + { + "epoch": 2.5916752665978673, + "grad_norm": 0.4068661332130432, + "learning_rate": 4.77954645458179e-05, + "loss": 1.3848, + "step": 5652 + }, + { + "epoch": 2.5921339295952297, + "grad_norm": 0.2891179919242859, + "learning_rate": 4.768985927958464e-05, + "loss": 1.014, + "step": 5653 + }, + { + "epoch": 2.5925925925925926, + "grad_norm": 0.3051091432571411, + "learning_rate": 4.758436496829016e-05, + "loss": 1.1055, + "step": 5654 + }, + { + "epoch": 2.5930512555899554, + "grad_norm": 0.3727298080921173, + "learning_rate": 4.747898163781311e-05, + "loss": 1.7048, + "step": 5655 + }, + { + "epoch": 2.593509918587318, + "grad_norm": 0.26991981267929077, + "learning_rate": 4.737370931400475e-05, + "loss": 0.955, + "step": 5656 + }, + { + "epoch": 2.5939685815846807, + "grad_norm": 0.29294559359550476, + "learning_rate": 4.726854802268909e-05, + "loss": 0.7957, + "step": 5657 + }, + { + "epoch": 2.594427244582043, + "grad_norm": 0.28209206461906433, + "learning_rate": 4.716349778966289e-05, + "loss": 1.6527, + "step": 5658 + }, + { + "epoch": 2.594885907579406, + "grad_norm": 0.2640170454978943, + "learning_rate": 4.7058558640695805e-05, + "loss": 0.8565, + "step": 5659 + }, + { + "epoch": 2.595344570576769, + "grad_norm": 0.13128653168678284, + "learning_rate": 4.695373060153013e-05, + "loss": 0.6372, + "step": 5660 + }, + { + "epoch": 2.5958032335741317, + "grad_norm": 0.22628602385520935, + "learning_rate": 4.684901369788097e-05, + "loss": 1.334, + "step": 5661 + }, + { + "epoch": 2.596261896571494, + "grad_norm": 0.33850499987602234, + "learning_rate": 4.674440795543633e-05, + "loss": 1.685, + "step": 5662 + }, + { + "epoch": 2.596720559568857, + "grad_norm": 0.3774524927139282, + "learning_rate": 4.663991339985641e-05, + "loss": 1.4136, + "step": 5663 + }, + { + "epoch": 2.5971792225662194, + "grad_norm": 0.38456571102142334, + "learning_rate": 4.653553005677463e-05, + "loss": 1.7705, + "step": 5664 + }, + { + "epoch": 2.597637885563582, + "grad_norm": 0.2402615249156952, + "learning_rate": 4.643125795179698e-05, + "loss": 0.9066, + "step": 5665 + }, + { + "epoch": 2.598096548560945, + "grad_norm": 0.31829625368118286, + "learning_rate": 4.632709711050215e-05, + "loss": 1.3141, + "step": 5666 + }, + { + "epoch": 2.5985552115583075, + "grad_norm": 0.2783154547214508, + "learning_rate": 4.622304755844164e-05, + "loss": 1.2182, + "step": 5667 + }, + { + "epoch": 2.5990138745556703, + "grad_norm": 0.2189006507396698, + "learning_rate": 4.6119109321139384e-05, + "loss": 0.6295, + "step": 5668 + }, + { + "epoch": 2.5994725375530328, + "grad_norm": 0.20123249292373657, + "learning_rate": 4.6015282424092196e-05, + "loss": 1.3485, + "step": 5669 + }, + { + "epoch": 2.5999312005503956, + "grad_norm": 0.4019489884376526, + "learning_rate": 4.591156689276971e-05, + "loss": 1.5854, + "step": 5670 + }, + { + "epoch": 2.6003898635477585, + "grad_norm": 0.335580438375473, + "learning_rate": 4.580796275261395e-05, + "loss": 1.5316, + "step": 5671 + }, + { + "epoch": 2.600848526545121, + "grad_norm": 0.31390464305877686, + "learning_rate": 4.570447002903988e-05, + "loss": 1.3737, + "step": 5672 + }, + { + "epoch": 2.6013071895424837, + "grad_norm": 0.34774190187454224, + "learning_rate": 4.560108874743507e-05, + "loss": 1.2221, + "step": 5673 + }, + { + "epoch": 2.601765852539846, + "grad_norm": 0.13695718348026276, + "learning_rate": 4.5497818933159406e-05, + "loss": 1.0302, + "step": 5674 + }, + { + "epoch": 2.602224515537209, + "grad_norm": 0.28607991337776184, + "learning_rate": 4.5394660611545955e-05, + "loss": 0.8901, + "step": 5675 + }, + { + "epoch": 2.602683178534572, + "grad_norm": 0.2588864266872406, + "learning_rate": 4.529161380790009e-05, + "loss": 1.4887, + "step": 5676 + }, + { + "epoch": 2.6031418415319343, + "grad_norm": 0.2673766314983368, + "learning_rate": 4.5188678547499976e-05, + "loss": 1.3279, + "step": 5677 + }, + { + "epoch": 2.603600504529297, + "grad_norm": 0.22360926866531372, + "learning_rate": 4.508585485559652e-05, + "loss": 1.3532, + "step": 5678 + }, + { + "epoch": 2.6040591675266596, + "grad_norm": 0.35530173778533936, + "learning_rate": 4.498314275741272e-05, + "loss": 1.1174, + "step": 5679 + }, + { + "epoch": 2.6045178305240224, + "grad_norm": 0.20542040467262268, + "learning_rate": 4.488054227814497e-05, + "loss": 1.0656, + "step": 5680 + }, + { + "epoch": 2.6049764935213853, + "grad_norm": 0.3311353325843811, + "learning_rate": 4.4778053442961764e-05, + "loss": 0.7792, + "step": 5681 + }, + { + "epoch": 2.6054351565187477, + "grad_norm": 0.09570083022117615, + "learning_rate": 4.467567627700436e-05, + "loss": 1.1345, + "step": 5682 + }, + { + "epoch": 2.6058938195161105, + "grad_norm": 0.26272979378700256, + "learning_rate": 4.4573410805386624e-05, + "loss": 1.1146, + "step": 5683 + }, + { + "epoch": 2.606352482513473, + "grad_norm": 0.368933767080307, + "learning_rate": 4.447125705319516e-05, + "loss": 1.2422, + "step": 5684 + }, + { + "epoch": 2.606811145510836, + "grad_norm": 0.3198513686656952, + "learning_rate": 4.436921504548874e-05, + "loss": 1.2469, + "step": 5685 + }, + { + "epoch": 2.6072698085081987, + "grad_norm": 0.3850035071372986, + "learning_rate": 4.426728480729914e-05, + "loss": 1.6652, + "step": 5686 + }, + { + "epoch": 2.6077284715055615, + "grad_norm": 0.35521817207336426, + "learning_rate": 4.4165466363630556e-05, + "loss": 1.4731, + "step": 5687 + }, + { + "epoch": 2.608187134502924, + "grad_norm": 0.2965492308139801, + "learning_rate": 4.406375973945981e-05, + "loss": 1.5766, + "step": 5688 + }, + { + "epoch": 2.608645797500287, + "grad_norm": 0.3511826694011688, + "learning_rate": 4.396216495973632e-05, + "loss": 1.9038, + "step": 5689 + }, + { + "epoch": 2.609104460497649, + "grad_norm": 0.3754914104938507, + "learning_rate": 4.386068204938193e-05, + "loss": 1.5048, + "step": 5690 + }, + { + "epoch": 2.609563123495012, + "grad_norm": 0.374553382396698, + "learning_rate": 4.375931103329117e-05, + "loss": 1.7387, + "step": 5691 + }, + { + "epoch": 2.610021786492375, + "grad_norm": 0.3759397268295288, + "learning_rate": 4.36580519363311e-05, + "loss": 1.6533, + "step": 5692 + }, + { + "epoch": 2.6104804494897373, + "grad_norm": 0.3354843258857727, + "learning_rate": 4.355690478334129e-05, + "loss": 0.8031, + "step": 5693 + }, + { + "epoch": 2.6109391124871, + "grad_norm": 0.209818497300148, + "learning_rate": 4.3455869599133834e-05, + "loss": 1.2783, + "step": 5694 + }, + { + "epoch": 2.6113977754844626, + "grad_norm": 0.36011189222335815, + "learning_rate": 4.3354946408493524e-05, + "loss": 2.0714, + "step": 5695 + }, + { + "epoch": 2.6118564384818255, + "grad_norm": 0.2897484302520752, + "learning_rate": 4.325413523617733e-05, + "loss": 1.3004, + "step": 5696 + }, + { + "epoch": 2.6123151014791883, + "grad_norm": 0.4306742548942566, + "learning_rate": 4.315343610691508e-05, + "loss": 1.3275, + "step": 5697 + }, + { + "epoch": 2.6127737644765507, + "grad_norm": 0.14981167018413544, + "learning_rate": 4.305284904540901e-05, + "loss": 1.0299, + "step": 5698 + }, + { + "epoch": 2.6132324274739136, + "grad_norm": 0.3096173405647278, + "learning_rate": 4.295237407633379e-05, + "loss": 1.6353, + "step": 5699 + }, + { + "epoch": 2.613691090471276, + "grad_norm": 0.33780768513679504, + "learning_rate": 4.285201122433674e-05, + "loss": 0.4902, + "step": 5700 + }, + { + "epoch": 2.614149753468639, + "grad_norm": 0.06309907138347626, + "learning_rate": 4.2751760514037506e-05, + "loss": 1.1826, + "step": 5701 + }, + { + "epoch": 2.6146084164660017, + "grad_norm": 0.39696255326271057, + "learning_rate": 4.2651621970028366e-05, + "loss": 1.5219, + "step": 5702 + }, + { + "epoch": 2.615067079463364, + "grad_norm": 0.4694381058216095, + "learning_rate": 4.2551595616874e-05, + "loss": 1.8362, + "step": 5703 + }, + { + "epoch": 2.615525742460727, + "grad_norm": 0.29091694951057434, + "learning_rate": 4.24516814791116e-05, + "loss": 1.6509, + "step": 5704 + }, + { + "epoch": 2.6159844054580894, + "grad_norm": 0.40303540229797363, + "learning_rate": 4.235187958125086e-05, + "loss": 1.3851, + "step": 5705 + }, + { + "epoch": 2.6164430684554523, + "grad_norm": 0.3498489558696747, + "learning_rate": 4.225218994777397e-05, + "loss": 1.513, + "step": 5706 + }, + { + "epoch": 2.616901731452815, + "grad_norm": 0.3207775354385376, + "learning_rate": 4.2152612603135244e-05, + "loss": 1.484, + "step": 5707 + }, + { + "epoch": 2.617360394450178, + "grad_norm": 0.2937675416469574, + "learning_rate": 4.2053147571761985e-05, + "loss": 1.2656, + "step": 5708 + }, + { + "epoch": 2.6178190574475404, + "grad_norm": 0.3286793529987335, + "learning_rate": 4.1953794878053565e-05, + "loss": 1.202, + "step": 5709 + }, + { + "epoch": 2.6182777204449033, + "grad_norm": 0.22947460412979126, + "learning_rate": 4.185455454638193e-05, + "loss": 1.2922, + "step": 5710 + }, + { + "epoch": 2.6187363834422657, + "grad_norm": 0.22816713154315948, + "learning_rate": 4.17554266010915e-05, + "loss": 0.9779, + "step": 5711 + }, + { + "epoch": 2.6191950464396285, + "grad_norm": 0.34349822998046875, + "learning_rate": 4.165641106649898e-05, + "loss": 1.5968, + "step": 5712 + }, + { + "epoch": 2.6196537094369914, + "grad_norm": 0.3354540169239044, + "learning_rate": 4.155750796689373e-05, + "loss": 0.8374, + "step": 5713 + }, + { + "epoch": 2.620112372434354, + "grad_norm": 0.3276011645793915, + "learning_rate": 4.1458717326537276e-05, + "loss": 1.5134, + "step": 5714 + }, + { + "epoch": 2.6205710354317167, + "grad_norm": 0.3458406925201416, + "learning_rate": 4.1360039169663685e-05, + "loss": 1.4265, + "step": 5715 + }, + { + "epoch": 2.621029698429079, + "grad_norm": 0.3710658550262451, + "learning_rate": 4.1261473520479483e-05, + "loss": 1.4791, + "step": 5716 + }, + { + "epoch": 2.621488361426442, + "grad_norm": 0.3279964029788971, + "learning_rate": 4.116302040316361e-05, + "loss": 1.191, + "step": 5717 + }, + { + "epoch": 2.621947024423805, + "grad_norm": 0.2667905390262604, + "learning_rate": 4.106467984186707e-05, + "loss": 1.1352, + "step": 5718 + }, + { + "epoch": 2.622405687421167, + "grad_norm": 0.3724762201309204, + "learning_rate": 4.0966451860713736e-05, + "loss": 1.4844, + "step": 5719 + }, + { + "epoch": 2.62286435041853, + "grad_norm": 0.3980143070220947, + "learning_rate": 4.0868336483799505e-05, + "loss": 1.5843, + "step": 5720 + }, + { + "epoch": 2.6233230134158925, + "grad_norm": 0.2862567603588104, + "learning_rate": 4.07703337351929e-05, + "loss": 1.5024, + "step": 5721 + }, + { + "epoch": 2.6237816764132553, + "grad_norm": 0.3262464106082916, + "learning_rate": 4.067244363893452e-05, + "loss": 1.1802, + "step": 5722 + }, + { + "epoch": 2.624240339410618, + "grad_norm": 0.3230985999107361, + "learning_rate": 4.0574666219037824e-05, + "loss": 1.7435, + "step": 5723 + }, + { + "epoch": 2.6246990024079806, + "grad_norm": 0.3618335425853729, + "learning_rate": 4.047700149948802e-05, + "loss": 1.3806, + "step": 5724 + }, + { + "epoch": 2.6251576654053435, + "grad_norm": 0.33226069808006287, + "learning_rate": 4.037944950424311e-05, + "loss": 1.197, + "step": 5725 + }, + { + "epoch": 2.625616328402706, + "grad_norm": 0.22331880033016205, + "learning_rate": 4.028201025723327e-05, + "loss": 1.3807, + "step": 5726 + }, + { + "epoch": 2.6260749914000687, + "grad_norm": 0.27581825852394104, + "learning_rate": 4.018468378236106e-05, + "loss": 1.0424, + "step": 5727 + }, + { + "epoch": 2.6265336543974316, + "grad_norm": 0.3891744315624237, + "learning_rate": 4.008747010350133e-05, + "loss": 1.9856, + "step": 5728 + }, + { + "epoch": 2.6269923173947944, + "grad_norm": 0.3294212222099304, + "learning_rate": 3.9990369244501445e-05, + "loss": 1.2145, + "step": 5729 + }, + { + "epoch": 2.627450980392157, + "grad_norm": 0.35058024525642395, + "learning_rate": 3.989338122918068e-05, + "loss": 1.1814, + "step": 5730 + }, + { + "epoch": 2.6279096433895197, + "grad_norm": 0.3072217106819153, + "learning_rate": 3.979650608133112e-05, + "loss": 1.6852, + "step": 5731 + }, + { + "epoch": 2.628368306386882, + "grad_norm": 0.3541480004787445, + "learning_rate": 3.969974382471669e-05, + "loss": 1.1888, + "step": 5732 + }, + { + "epoch": 2.628826969384245, + "grad_norm": 0.26909348368644714, + "learning_rate": 3.96030944830742e-05, + "loss": 1.2612, + "step": 5733 + }, + { + "epoch": 2.629285632381608, + "grad_norm": 0.33335986733436584, + "learning_rate": 3.950655808011233e-05, + "loss": 1.7208, + "step": 5734 + }, + { + "epoch": 2.6297442953789703, + "grad_norm": 0.3109550178050995, + "learning_rate": 3.9410134639511986e-05, + "loss": 1.0018, + "step": 5735 + }, + { + "epoch": 2.630202958376333, + "grad_norm": 0.3982895612716675, + "learning_rate": 3.931382418492663e-05, + "loss": 1.1909, + "step": 5736 + }, + { + "epoch": 2.6306616213736955, + "grad_norm": 0.31409791111946106, + "learning_rate": 3.921762673998197e-05, + "loss": 1.0483, + "step": 5737 + }, + { + "epoch": 2.6311202843710584, + "grad_norm": 0.24062570929527283, + "learning_rate": 3.912154232827581e-05, + "loss": 0.5005, + "step": 5738 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.33497846126556396, + "learning_rate": 3.90255709733785e-05, + "loss": 1.5151, + "step": 5739 + }, + { + "epoch": 2.6320376103657837, + "grad_norm": 0.2941315770149231, + "learning_rate": 3.8929712698832445e-05, + "loss": 0.8872, + "step": 5740 + }, + { + "epoch": 2.6324962733631465, + "grad_norm": 0.23103603720664978, + "learning_rate": 3.883396752815221e-05, + "loss": 0.6996, + "step": 5741 + }, + { + "epoch": 2.632954936360509, + "grad_norm": 0.2940506935119629, + "learning_rate": 3.873833548482503e-05, + "loss": 1.7249, + "step": 5742 + }, + { + "epoch": 2.633413599357872, + "grad_norm": 0.24942685663700104, + "learning_rate": 3.864281659231001e-05, + "loss": 0.5719, + "step": 5743 + }, + { + "epoch": 2.6338722623552346, + "grad_norm": 0.1974136382341385, + "learning_rate": 3.8547410874038625e-05, + "loss": 1.5941, + "step": 5744 + }, + { + "epoch": 2.634330925352597, + "grad_norm": 0.31970781087875366, + "learning_rate": 3.845211835341472e-05, + "loss": 1.0022, + "step": 5745 + }, + { + "epoch": 2.63478958834996, + "grad_norm": 0.32989394664764404, + "learning_rate": 3.835693905381399e-05, + "loss": 1.4321, + "step": 5746 + }, + { + "epoch": 2.6352482513473223, + "grad_norm": 0.3487594723701477, + "learning_rate": 3.826187299858474e-05, + "loss": 1.421, + "step": 5747 + }, + { + "epoch": 2.635706914344685, + "grad_norm": 0.2510157823562622, + "learning_rate": 3.816692021104734e-05, + "loss": 0.8154, + "step": 5748 + }, + { + "epoch": 2.636165577342048, + "grad_norm": 0.10467905551195145, + "learning_rate": 3.807208071449442e-05, + "loss": 1.0183, + "step": 5749 + }, + { + "epoch": 2.6366242403394105, + "grad_norm": 0.34240782260894775, + "learning_rate": 3.7977354532190754e-05, + "loss": 1.3793, + "step": 5750 + }, + { + "epoch": 2.6370829033367733, + "grad_norm": 0.33239027857780457, + "learning_rate": 3.788274168737338e-05, + "loss": 1.5253, + "step": 5751 + }, + { + "epoch": 2.6375415663341357, + "grad_norm": 0.3215150535106659, + "learning_rate": 3.77882422032515e-05, + "loss": 1.173, + "step": 5752 + }, + { + "epoch": 2.6380002293314986, + "grad_norm": 0.3687779903411865, + "learning_rate": 3.7693856103006574e-05, + "loss": 1.6017, + "step": 5753 + }, + { + "epoch": 2.6384588923288614, + "grad_norm": 0.3637235164642334, + "learning_rate": 3.759958340979208e-05, + "loss": 1.7386, + "step": 5754 + }, + { + "epoch": 2.6389175553262243, + "grad_norm": 0.3436271846294403, + "learning_rate": 3.750542414673391e-05, + "loss": 0.7148, + "step": 5755 + }, + { + "epoch": 2.6393762183235867, + "grad_norm": 0.10530371963977814, + "learning_rate": 3.7411378336929925e-05, + "loss": 1.4356, + "step": 5756 + }, + { + "epoch": 2.6398348813209496, + "grad_norm": 0.4608929753303528, + "learning_rate": 3.731744600345038e-05, + "loss": 2.1367, + "step": 5757 + }, + { + "epoch": 2.640293544318312, + "grad_norm": 0.31003305315971375, + "learning_rate": 3.722362716933736e-05, + "loss": 1.2718, + "step": 5758 + }, + { + "epoch": 2.640752207315675, + "grad_norm": 0.34914591908454895, + "learning_rate": 3.712992185760533e-05, + "loss": 1.3382, + "step": 5759 + }, + { + "epoch": 2.6412108703130377, + "grad_norm": 0.4381144344806671, + "learning_rate": 3.703633009124102e-05, + "loss": 1.5305, + "step": 5760 + }, + { + "epoch": 2.6416695333104, + "grad_norm": 0.20248155295848846, + "learning_rate": 3.694285189320301e-05, + "loss": 1.1224, + "step": 5761 + }, + { + "epoch": 2.642128196307763, + "grad_norm": 0.3816075325012207, + "learning_rate": 3.684948728642229e-05, + "loss": 1.5889, + "step": 5762 + }, + { + "epoch": 2.6425868593051254, + "grad_norm": 0.3584928810596466, + "learning_rate": 3.675623629380181e-05, + "loss": 1.4312, + "step": 5763 + }, + { + "epoch": 2.6430455223024882, + "grad_norm": 0.4676406681537628, + "learning_rate": 3.6663098938216734e-05, + "loss": 1.2666, + "step": 5764 + }, + { + "epoch": 2.643504185299851, + "grad_norm": 0.2964949309825897, + "learning_rate": 3.657007524251427e-05, + "loss": 1.8092, + "step": 5765 + }, + { + "epoch": 2.6439628482972135, + "grad_norm": 0.28791379928588867, + "learning_rate": 3.6477165229513885e-05, + "loss": 0.9804, + "step": 5766 + }, + { + "epoch": 2.6444215112945764, + "grad_norm": 0.3603127598762512, + "learning_rate": 3.638436892200708e-05, + "loss": 1.5343, + "step": 5767 + }, + { + "epoch": 2.644880174291939, + "grad_norm": 0.28618577122688293, + "learning_rate": 3.629168634275748e-05, + "loss": 0.933, + "step": 5768 + }, + { + "epoch": 2.6453388372893016, + "grad_norm": 0.31322944164276123, + "learning_rate": 3.619911751450061e-05, + "loss": 1.6573, + "step": 5769 + }, + { + "epoch": 2.6457975002866645, + "grad_norm": 0.3839041590690613, + "learning_rate": 3.610666245994443e-05, + "loss": 0.8116, + "step": 5770 + }, + { + "epoch": 2.646256163284027, + "grad_norm": 0.11088380962610245, + "learning_rate": 3.601432120176879e-05, + "loss": 0.9275, + "step": 5771 + }, + { + "epoch": 2.6467148262813898, + "grad_norm": 0.29426756501197815, + "learning_rate": 3.5922093762625674e-05, + "loss": 0.9716, + "step": 5772 + }, + { + "epoch": 2.647173489278752, + "grad_norm": 0.21911796927452087, + "learning_rate": 3.582998016513911e-05, + "loss": 1.243, + "step": 5773 + }, + { + "epoch": 2.647632152276115, + "grad_norm": 0.40464484691619873, + "learning_rate": 3.573798043190524e-05, + "loss": 1.4725, + "step": 5774 + }, + { + "epoch": 2.648090815273478, + "grad_norm": 0.32122787833213806, + "learning_rate": 3.564609458549234e-05, + "loss": 1.5024, + "step": 5775 + }, + { + "epoch": 2.6485494782708408, + "grad_norm": 0.28623196482658386, + "learning_rate": 3.555432264844055e-05, + "loss": 1.4161, + "step": 5776 + }, + { + "epoch": 2.649008141268203, + "grad_norm": 0.28445377945899963, + "learning_rate": 3.5462664643262255e-05, + "loss": 1.1441, + "step": 5777 + }, + { + "epoch": 2.649466804265566, + "grad_norm": 0.27459508180618286, + "learning_rate": 3.53711205924418e-05, + "loss": 1.0607, + "step": 5778 + }, + { + "epoch": 2.6499254672629284, + "grad_norm": 0.3308379352092743, + "learning_rate": 3.5279690518435656e-05, + "loss": 0.8458, + "step": 5779 + }, + { + "epoch": 2.6503841302602913, + "grad_norm": 0.23437727987766266, + "learning_rate": 3.518837444367223e-05, + "loss": 1.3428, + "step": 5780 + }, + { + "epoch": 2.650842793257654, + "grad_norm": 0.30645015835762024, + "learning_rate": 3.509717239055199e-05, + "loss": 0.9823, + "step": 5781 + }, + { + "epoch": 2.6513014562550166, + "grad_norm": 0.20318692922592163, + "learning_rate": 3.500608438144748e-05, + "loss": 1.3997, + "step": 5782 + }, + { + "epoch": 2.6517601192523794, + "grad_norm": 0.3624488115310669, + "learning_rate": 3.491511043870321e-05, + "loss": 0.991, + "step": 5783 + }, + { + "epoch": 2.652218782249742, + "grad_norm": 0.3760741949081421, + "learning_rate": 3.482425058463579e-05, + "loss": 1.937, + "step": 5784 + }, + { + "epoch": 2.6526774452471047, + "grad_norm": 0.4173126816749573, + "learning_rate": 3.473350484153387e-05, + "loss": 1.5058, + "step": 5785 + }, + { + "epoch": 2.6531361082444676, + "grad_norm": 0.2894061803817749, + "learning_rate": 3.4642873231657934e-05, + "loss": 1.5995, + "step": 5786 + }, + { + "epoch": 2.65359477124183, + "grad_norm": 0.34869733452796936, + "learning_rate": 3.4552355777240585e-05, + "loss": 1.4747, + "step": 5787 + }, + { + "epoch": 2.654053434239193, + "grad_norm": 0.3714545965194702, + "learning_rate": 3.446195250048639e-05, + "loss": 1.4193, + "step": 5788 + }, + { + "epoch": 2.6545120972365552, + "grad_norm": 0.28376656770706177, + "learning_rate": 3.437166342357195e-05, + "loss": 1.3256, + "step": 5789 + }, + { + "epoch": 2.654970760233918, + "grad_norm": 0.37970438599586487, + "learning_rate": 3.4281488568645934e-05, + "loss": 1.584, + "step": 5790 + }, + { + "epoch": 2.655429423231281, + "grad_norm": 0.3399360179901123, + "learning_rate": 3.4191427957828705e-05, + "loss": 1.4299, + "step": 5791 + }, + { + "epoch": 2.6558880862286434, + "grad_norm": 0.4417398273944855, + "learning_rate": 3.41014816132128e-05, + "loss": 1.6428, + "step": 5792 + }, + { + "epoch": 2.656346749226006, + "grad_norm": 0.3878077268600464, + "learning_rate": 3.401164955686281e-05, + "loss": 1.587, + "step": 5793 + }, + { + "epoch": 2.6568054122233686, + "grad_norm": 0.5019458532333374, + "learning_rate": 3.392193181081504e-05, + "loss": 1.2179, + "step": 5794 + }, + { + "epoch": 2.6572640752207315, + "grad_norm": 0.3489948809146881, + "learning_rate": 3.383232839707806e-05, + "loss": 1.608, + "step": 5795 + }, + { + "epoch": 2.6577227382180943, + "grad_norm": 0.43875566124916077, + "learning_rate": 3.3742839337632223e-05, + "loss": 1.3773, + "step": 5796 + }, + { + "epoch": 2.658181401215457, + "grad_norm": 0.2078205943107605, + "learning_rate": 3.365346465442976e-05, + "loss": 1.2278, + "step": 5797 + }, + { + "epoch": 2.6586400642128196, + "grad_norm": 0.36126843094825745, + "learning_rate": 3.356420436939489e-05, + "loss": 1.799, + "step": 5798 + }, + { + "epoch": 2.6590987272101825, + "grad_norm": 0.40285730361938477, + "learning_rate": 3.347505850442395e-05, + "loss": 1.5258, + "step": 5799 + }, + { + "epoch": 2.659557390207545, + "grad_norm": 0.29165950417518616, + "learning_rate": 3.3386027081384886e-05, + "loss": 1.3616, + "step": 5800 + }, + { + "epoch": 2.6600160532049077, + "grad_norm": 0.2867632210254669, + "learning_rate": 3.3297110122118005e-05, + "loss": 1.2755, + "step": 5801 + }, + { + "epoch": 2.6604747162022706, + "grad_norm": 0.33592167496681213, + "learning_rate": 3.3208307648434964e-05, + "loss": 1.5153, + "step": 5802 + }, + { + "epoch": 2.660933379199633, + "grad_norm": 0.2837589979171753, + "learning_rate": 3.311961968211979e-05, + "loss": 1.204, + "step": 5803 + }, + { + "epoch": 2.661392042196996, + "grad_norm": 0.2693648040294647, + "learning_rate": 3.303104624492825e-05, + "loss": 1.2056, + "step": 5804 + }, + { + "epoch": 2.6618507051943583, + "grad_norm": 0.206869438290596, + "learning_rate": 3.294258735858818e-05, + "loss": 0.9919, + "step": 5805 + }, + { + "epoch": 2.662309368191721, + "grad_norm": 0.3209756314754486, + "learning_rate": 3.2854243044799056e-05, + "loss": 1.6161, + "step": 5806 + }, + { + "epoch": 2.662768031189084, + "grad_norm": 0.301139235496521, + "learning_rate": 3.276601332523249e-05, + "loss": 1.2684, + "step": 5807 + }, + { + "epoch": 2.6632266941864464, + "grad_norm": 0.2972935140132904, + "learning_rate": 3.267789822153172e-05, + "loss": 1.5424, + "step": 5808 + }, + { + "epoch": 2.6636853571838093, + "grad_norm": 0.3825678527355194, + "learning_rate": 3.258989775531212e-05, + "loss": 1.5092, + "step": 5809 + }, + { + "epoch": 2.6641440201811717, + "grad_norm": 0.22569845616817474, + "learning_rate": 3.250201194816077e-05, + "loss": 1.0415, + "step": 5810 + }, + { + "epoch": 2.6646026831785345, + "grad_norm": 0.2539108991622925, + "learning_rate": 3.2414240821636685e-05, + "loss": 1.3038, + "step": 5811 + }, + { + "epoch": 2.6650613461758974, + "grad_norm": 0.3397371470928192, + "learning_rate": 3.232658439727082e-05, + "loss": 1.1019, + "step": 5812 + }, + { + "epoch": 2.66552000917326, + "grad_norm": 0.35667502880096436, + "learning_rate": 3.2239042696565915e-05, + "loss": 1.9956, + "step": 5813 + }, + { + "epoch": 2.6659786721706227, + "grad_norm": 0.37884020805358887, + "learning_rate": 3.2151615740996565e-05, + "loss": 1.2626, + "step": 5814 + }, + { + "epoch": 2.666437335167985, + "grad_norm": 0.38707906007766724, + "learning_rate": 3.2064303552009236e-05, + "loss": 1.2571, + "step": 5815 + }, + { + "epoch": 2.666895998165348, + "grad_norm": 0.3102846145629883, + "learning_rate": 3.1977106151022226e-05, + "loss": 1.2002, + "step": 5816 + }, + { + "epoch": 2.667354661162711, + "grad_norm": 0.28075534105300903, + "learning_rate": 3.189002355942572e-05, + "loss": 1.3118, + "step": 5817 + }, + { + "epoch": 2.6678133241600737, + "grad_norm": 0.2957073748111725, + "learning_rate": 3.1803055798581725e-05, + "loss": 1.1588, + "step": 5818 + }, + { + "epoch": 2.668271987157436, + "grad_norm": 0.24751010537147522, + "learning_rate": 3.1716202889823966e-05, + "loss": 0.9748, + "step": 5819 + }, + { + "epoch": 2.6687306501547985, + "grad_norm": 0.301859587430954, + "learning_rate": 3.162946485445817e-05, + "loss": 1.3465, + "step": 5820 + }, + { + "epoch": 2.6691893131521613, + "grad_norm": 0.39302027225494385, + "learning_rate": 3.15428417137617e-05, + "loss": 1.4904, + "step": 5821 + }, + { + "epoch": 2.669647976149524, + "grad_norm": 0.27386486530303955, + "learning_rate": 3.145633348898397e-05, + "loss": 1.6713, + "step": 5822 + }, + { + "epoch": 2.670106639146887, + "grad_norm": 0.33279913663864136, + "learning_rate": 3.136994020134598e-05, + "loss": 1.2917, + "step": 5823 + }, + { + "epoch": 2.6705653021442495, + "grad_norm": 0.3797074258327484, + "learning_rate": 3.1283661872040626e-05, + "loss": 1.4031, + "step": 5824 + }, + { + "epoch": 2.6710239651416123, + "grad_norm": 0.3788340091705322, + "learning_rate": 3.119749852223269e-05, + "loss": 1.2273, + "step": 5825 + }, + { + "epoch": 2.6714826281389747, + "grad_norm": 0.2667097747325897, + "learning_rate": 3.1111450173058553e-05, + "loss": 0.9149, + "step": 5826 + }, + { + "epoch": 2.6719412911363376, + "grad_norm": 0.3300883173942566, + "learning_rate": 3.10255168456266e-05, + "loss": 1.7291, + "step": 5827 + }, + { + "epoch": 2.6723999541337005, + "grad_norm": 0.33332276344299316, + "learning_rate": 3.093969856101686e-05, + "loss": 1.4482, + "step": 5828 + }, + { + "epoch": 2.672858617131063, + "grad_norm": 0.27086135745048523, + "learning_rate": 3.085399534028116e-05, + "loss": 0.9669, + "step": 5829 + }, + { + "epoch": 2.6733172801284257, + "grad_norm": 0.10022089630365372, + "learning_rate": 3.0768407204443126e-05, + "loss": 0.5217, + "step": 5830 + }, + { + "epoch": 2.673775943125788, + "grad_norm": 0.1253892332315445, + "learning_rate": 3.068293417449808e-05, + "loss": 0.8534, + "step": 5831 + }, + { + "epoch": 2.674234606123151, + "grad_norm": 0.3180748224258423, + "learning_rate": 3.05975762714133e-05, + "loss": 1.609, + "step": 5832 + }, + { + "epoch": 2.674693269120514, + "grad_norm": 0.3440166413784027, + "learning_rate": 3.051233351612759e-05, + "loss": 0.7612, + "step": 5833 + }, + { + "epoch": 2.6751519321178763, + "grad_norm": 0.3188319802284241, + "learning_rate": 3.0427205929551615e-05, + "loss": 2.0381, + "step": 5834 + }, + { + "epoch": 2.675610595115239, + "grad_norm": 0.30568626523017883, + "learning_rate": 3.0342193532567842e-05, + "loss": 0.9399, + "step": 5835 + }, + { + "epoch": 2.6760692581126015, + "grad_norm": 0.373977392911911, + "learning_rate": 3.0257296346030416e-05, + "loss": 1.6466, + "step": 5836 + }, + { + "epoch": 2.6765279211099644, + "grad_norm": 0.372895210981369, + "learning_rate": 3.0172514390765238e-05, + "loss": 1.7245, + "step": 5837 + }, + { + "epoch": 2.6769865841073273, + "grad_norm": 0.37782806158065796, + "learning_rate": 3.0087847687569893e-05, + "loss": 1.6417, + "step": 5838 + }, + { + "epoch": 2.6774452471046897, + "grad_norm": 0.3993247449398041, + "learning_rate": 3.0003296257213708e-05, + "loss": 1.9526, + "step": 5839 + }, + { + "epoch": 2.6779039101020525, + "grad_norm": 0.3877170979976654, + "learning_rate": 2.9918860120437873e-05, + "loss": 1.5638, + "step": 5840 + }, + { + "epoch": 2.678362573099415, + "grad_norm": 0.38056743144989014, + "learning_rate": 2.9834539297955156e-05, + "loss": 1.2691, + "step": 5841 + }, + { + "epoch": 2.678821236096778, + "grad_norm": 0.3735964298248291, + "learning_rate": 2.975033381044995e-05, + "loss": 2.0476, + "step": 5842 + }, + { + "epoch": 2.6792798990941407, + "grad_norm": 0.37376493215560913, + "learning_rate": 2.966624367857851e-05, + "loss": 1.5699, + "step": 5843 + }, + { + "epoch": 2.6797385620915035, + "grad_norm": 0.3251396119594574, + "learning_rate": 2.958226892296878e-05, + "loss": 0.6622, + "step": 5844 + }, + { + "epoch": 2.680197225088866, + "grad_norm": 0.20608735084533691, + "learning_rate": 2.9498409564220396e-05, + "loss": 0.7508, + "step": 5845 + }, + { + "epoch": 2.680655888086229, + "grad_norm": 0.2004690021276474, + "learning_rate": 2.9414665622904623e-05, + "loss": 0.8976, + "step": 5846 + }, + { + "epoch": 2.681114551083591, + "grad_norm": 0.30501410365104675, + "learning_rate": 2.9331037119564473e-05, + "loss": 1.0064, + "step": 5847 + }, + { + "epoch": 2.681573214080954, + "grad_norm": 0.2096373289823532, + "learning_rate": 2.9247524074714594e-05, + "loss": 1.6792, + "step": 5848 + }, + { + "epoch": 2.682031877078317, + "grad_norm": 0.4180261790752411, + "learning_rate": 2.9164126508841428e-05, + "loss": 1.6902, + "step": 5849 + }, + { + "epoch": 2.6824905400756793, + "grad_norm": 0.2524012327194214, + "learning_rate": 2.9080844442402887e-05, + "loss": 0.4348, + "step": 5850 + }, + { + "epoch": 2.682949203073042, + "grad_norm": 0.2333446592092514, + "learning_rate": 2.8997677895828688e-05, + "loss": 1.4499, + "step": 5851 + }, + { + "epoch": 2.6834078660704046, + "grad_norm": 0.4264712631702423, + "learning_rate": 2.891462688952029e-05, + "loss": 1.3736, + "step": 5852 + }, + { + "epoch": 2.6838665290677675, + "grad_norm": 0.3079976737499237, + "learning_rate": 2.883169144385056e-05, + "loss": 1.2166, + "step": 5853 + }, + { + "epoch": 2.6843251920651303, + "grad_norm": 0.18179012835025787, + "learning_rate": 2.874887157916417e-05, + "loss": 0.6641, + "step": 5854 + }, + { + "epoch": 2.6847838550624927, + "grad_norm": 0.32719552516937256, + "learning_rate": 2.8666167315777535e-05, + "loss": 1.2209, + "step": 5855 + }, + { + "epoch": 2.6852425180598556, + "grad_norm": 0.30747801065444946, + "learning_rate": 2.8583578673978483e-05, + "loss": 1.7491, + "step": 5856 + }, + { + "epoch": 2.685701181057218, + "grad_norm": 0.39954906702041626, + "learning_rate": 2.8501105674026808e-05, + "loss": 1.2025, + "step": 5857 + }, + { + "epoch": 2.686159844054581, + "grad_norm": 0.29626455903053284, + "learning_rate": 2.841874833615349e-05, + "loss": 1.8599, + "step": 5858 + }, + { + "epoch": 2.6866185070519437, + "grad_norm": 0.44469526410102844, + "learning_rate": 2.833650668056148e-05, + "loss": 1.537, + "step": 5859 + }, + { + "epoch": 2.687077170049306, + "grad_norm": 0.325447142124176, + "learning_rate": 2.8254380727425255e-05, + "loss": 1.2097, + "step": 5860 + }, + { + "epoch": 2.687535833046669, + "grad_norm": 0.32134348154067993, + "learning_rate": 2.817237049689092e-05, + "loss": 1.6572, + "step": 5861 + }, + { + "epoch": 2.6879944960440314, + "grad_norm": 0.2648102045059204, + "learning_rate": 2.809047600907616e-05, + "loss": 0.9762, + "step": 5862 + }, + { + "epoch": 2.6884531590413943, + "grad_norm": 0.3223074972629547, + "learning_rate": 2.800869728407035e-05, + "loss": 1.5692, + "step": 5863 + }, + { + "epoch": 2.688911822038757, + "grad_norm": 0.2671777307987213, + "learning_rate": 2.792703434193422e-05, + "loss": 1.4353, + "step": 5864 + }, + { + "epoch": 2.68937048503612, + "grad_norm": 0.3055770695209503, + "learning_rate": 2.7845487202700416e-05, + "loss": 0.8293, + "step": 5865 + }, + { + "epoch": 2.6898291480334824, + "grad_norm": 0.2750590145587921, + "learning_rate": 2.7764055886372987e-05, + "loss": 1.4748, + "step": 5866 + }, + { + "epoch": 2.6902878110308452, + "grad_norm": 0.3781544268131256, + "learning_rate": 2.768274041292762e-05, + "loss": 0.7972, + "step": 5867 + }, + { + "epoch": 2.6907464740282077, + "grad_norm": 0.18787944316864014, + "learning_rate": 2.760154080231175e-05, + "loss": 1.6368, + "step": 5868 + }, + { + "epoch": 2.6912051370255705, + "grad_norm": 0.41110900044441223, + "learning_rate": 2.7520457074444005e-05, + "loss": 1.1579, + "step": 5869 + }, + { + "epoch": 2.6916638000229334, + "grad_norm": 0.28122058510780334, + "learning_rate": 2.743948924921491e-05, + "loss": 0.9766, + "step": 5870 + }, + { + "epoch": 2.692122463020296, + "grad_norm": 0.1648864597082138, + "learning_rate": 2.7358637346486414e-05, + "loss": 1.1961, + "step": 5871 + }, + { + "epoch": 2.6925811260176586, + "grad_norm": 0.45734626054763794, + "learning_rate": 2.7277901386092096e-05, + "loss": 1.5893, + "step": 5872 + }, + { + "epoch": 2.693039789015021, + "grad_norm": 0.31361323595046997, + "learning_rate": 2.7197281387837114e-05, + "loss": 0.8256, + "step": 5873 + }, + { + "epoch": 2.693498452012384, + "grad_norm": 0.22070448100566864, + "learning_rate": 2.7116777371498145e-05, + "loss": 1.4343, + "step": 5874 + }, + { + "epoch": 2.6939571150097468, + "grad_norm": 0.428138792514801, + "learning_rate": 2.703638935682323e-05, + "loss": 1.6013, + "step": 5875 + }, + { + "epoch": 2.694415778007109, + "grad_norm": 0.3450068235397339, + "learning_rate": 2.6956117363532207e-05, + "loss": 1.2018, + "step": 5876 + }, + { + "epoch": 2.694874441004472, + "grad_norm": 0.34051015973091125, + "learning_rate": 2.687596141131654e-05, + "loss": 1.6506, + "step": 5877 + }, + { + "epoch": 2.6953331040018345, + "grad_norm": 0.31475841999053955, + "learning_rate": 2.6795921519838895e-05, + "loss": 1.5417, + "step": 5878 + }, + { + "epoch": 2.6957917669991973, + "grad_norm": 0.216132253408432, + "learning_rate": 2.6715997708733675e-05, + "loss": 0.3737, + "step": 5879 + }, + { + "epoch": 2.69625042999656, + "grad_norm": 0.2257094830274582, + "learning_rate": 2.6636189997606864e-05, + "loss": 1.4002, + "step": 5880 + }, + { + "epoch": 2.6967090929939226, + "grad_norm": 0.3286268413066864, + "learning_rate": 2.655649840603569e-05, + "loss": 1.5927, + "step": 5881 + }, + { + "epoch": 2.6971677559912854, + "grad_norm": 0.32692962884902954, + "learning_rate": 2.6476922953569127e-05, + "loss": 0.7494, + "step": 5882 + }, + { + "epoch": 2.697626418988648, + "grad_norm": 0.29675719141960144, + "learning_rate": 2.6397463659727672e-05, + "loss": 1.6597, + "step": 5883 + }, + { + "epoch": 2.6980850819860107, + "grad_norm": 0.3485512137413025, + "learning_rate": 2.6318120544003234e-05, + "loss": 1.2645, + "step": 5884 + }, + { + "epoch": 2.6985437449833736, + "grad_norm": 0.41130709648132324, + "learning_rate": 2.623889362585924e-05, + "loss": 1.9763, + "step": 5885 + }, + { + "epoch": 2.6990024079807364, + "grad_norm": 0.2589356005191803, + "learning_rate": 2.6159782924730647e-05, + "loss": 0.3383, + "step": 5886 + }, + { + "epoch": 2.699461070978099, + "grad_norm": 0.15324239432811737, + "learning_rate": 2.6080788460023875e-05, + "loss": 1.1049, + "step": 5887 + }, + { + "epoch": 2.6999197339754613, + "grad_norm": 0.2607985734939575, + "learning_rate": 2.6001910251116812e-05, + "loss": 1.1836, + "step": 5888 + }, + { + "epoch": 2.700378396972824, + "grad_norm": 0.29069429636001587, + "learning_rate": 2.5923148317358926e-05, + "loss": 0.8517, + "step": 5889 + }, + { + "epoch": 2.700837059970187, + "grad_norm": 0.2796778976917267, + "learning_rate": 2.584450267807098e-05, + "loss": 0.8593, + "step": 5890 + }, + { + "epoch": 2.70129572296755, + "grad_norm": 0.2640216648578644, + "learning_rate": 2.5765973352545436e-05, + "loss": 1.5907, + "step": 5891 + }, + { + "epoch": 2.7017543859649122, + "grad_norm": 0.39561182260513306, + "learning_rate": 2.5687560360045935e-05, + "loss": 1.3107, + "step": 5892 + }, + { + "epoch": 2.702213048962275, + "grad_norm": 0.3599591553211212, + "learning_rate": 2.5609263719807875e-05, + "loss": 1.8921, + "step": 5893 + }, + { + "epoch": 2.7026717119596375, + "grad_norm": 0.37715888023376465, + "learning_rate": 2.553108345103794e-05, + "loss": 1.3452, + "step": 5894 + }, + { + "epoch": 2.7031303749570004, + "grad_norm": 0.3551565110683441, + "learning_rate": 2.545301957291435e-05, + "loss": 0.9477, + "step": 5895 + }, + { + "epoch": 2.7035890379543632, + "grad_norm": 0.12380225211381912, + "learning_rate": 2.5375072104586726e-05, + "loss": 0.5937, + "step": 5896 + }, + { + "epoch": 2.7040477009517256, + "grad_norm": 0.21938225626945496, + "learning_rate": 2.5297241065176168e-05, + "loss": 1.0974, + "step": 5897 + }, + { + "epoch": 2.7045063639490885, + "grad_norm": 0.3853704333305359, + "learning_rate": 2.5219526473775122e-05, + "loss": 1.6407, + "step": 5898 + }, + { + "epoch": 2.704965026946451, + "grad_norm": 0.2925291657447815, + "learning_rate": 2.5141928349447563e-05, + "loss": 1.2876, + "step": 5899 + }, + { + "epoch": 2.7054236899438138, + "grad_norm": 0.21706360578536987, + "learning_rate": 2.5064446711228872e-05, + "loss": 0.7466, + "step": 5900 + }, + { + "epoch": 2.7058823529411766, + "grad_norm": 0.32736656069755554, + "learning_rate": 2.4987081578125904e-05, + "loss": 2.0911, + "step": 5901 + }, + { + "epoch": 2.706341015938539, + "grad_norm": 0.4860953390598297, + "learning_rate": 2.4909832969116918e-05, + "loss": 1.9189, + "step": 5902 + }, + { + "epoch": 2.706799678935902, + "grad_norm": 0.3114350736141205, + "learning_rate": 2.4832700903151374e-05, + "loss": 0.5055, + "step": 5903 + }, + { + "epoch": 2.7072583419332643, + "grad_norm": 0.12829411029815674, + "learning_rate": 2.4755685399150463e-05, + "loss": 0.6061, + "step": 5904 + }, + { + "epoch": 2.707717004930627, + "grad_norm": 0.3164483308792114, + "learning_rate": 2.467878647600663e-05, + "loss": 1.7654, + "step": 5905 + }, + { + "epoch": 2.70817566792799, + "grad_norm": 0.37488433718681335, + "learning_rate": 2.460200415258368e-05, + "loss": 1.6296, + "step": 5906 + }, + { + "epoch": 2.7086343309253524, + "grad_norm": 0.28757867217063904, + "learning_rate": 2.4525338447716928e-05, + "loss": 1.1265, + "step": 5907 + }, + { + "epoch": 2.7090929939227153, + "grad_norm": 0.32620272040367126, + "learning_rate": 2.4448789380213e-05, + "loss": 1.1835, + "step": 5908 + }, + { + "epoch": 2.7095516569200777, + "grad_norm": 0.293837308883667, + "learning_rate": 2.437235696884993e-05, + "loss": 1.1319, + "step": 5909 + }, + { + "epoch": 2.7100103199174406, + "grad_norm": 0.1874661147594452, + "learning_rate": 2.4296041232377165e-05, + "loss": 0.5175, + "step": 5910 + }, + { + "epoch": 2.7104689829148034, + "grad_norm": 0.2423352599143982, + "learning_rate": 2.421984218951545e-05, + "loss": 1.2948, + "step": 5911 + }, + { + "epoch": 2.7109276459121663, + "grad_norm": 0.32054775953292847, + "learning_rate": 2.414375985895706e-05, + "loss": 1.2799, + "step": 5912 + }, + { + "epoch": 2.7113863089095287, + "grad_norm": 0.3801712989807129, + "learning_rate": 2.4067794259365504e-05, + "loss": 1.63, + "step": 5913 + }, + { + "epoch": 2.7118449719068916, + "grad_norm": 0.29861852526664734, + "learning_rate": 2.3991945409375604e-05, + "loss": 1.3125, + "step": 5914 + }, + { + "epoch": 2.712303634904254, + "grad_norm": 0.3143099248409271, + "learning_rate": 2.3916213327593694e-05, + "loss": 0.8409, + "step": 5915 + }, + { + "epoch": 2.712762297901617, + "grad_norm": 0.09029946476221085, + "learning_rate": 2.3840598032597417e-05, + "loss": 0.7455, + "step": 5916 + }, + { + "epoch": 2.7132209608989797, + "grad_norm": 0.4967743158340454, + "learning_rate": 2.376509954293571e-05, + "loss": 1.5552, + "step": 5917 + }, + { + "epoch": 2.713679623896342, + "grad_norm": 0.37226957082748413, + "learning_rate": 2.3689717877128815e-05, + "loss": 1.7788, + "step": 5918 + }, + { + "epoch": 2.714138286893705, + "grad_norm": 0.41915664076805115, + "learning_rate": 2.3614453053668716e-05, + "loss": 1.508, + "step": 5919 + }, + { + "epoch": 2.7145969498910674, + "grad_norm": 0.30586564540863037, + "learning_rate": 2.3539305091018038e-05, + "loss": 1.7762, + "step": 5920 + }, + { + "epoch": 2.7150556128884302, + "grad_norm": 0.4077855944633484, + "learning_rate": 2.3464274007611364e-05, + "loss": 1.6046, + "step": 5921 + }, + { + "epoch": 2.715514275885793, + "grad_norm": 4.286491870880127, + "learning_rate": 2.338935982185425e-05, + "loss": 1.3848, + "step": 5922 + }, + { + "epoch": 2.7159729388831555, + "grad_norm": 0.2541790306568146, + "learning_rate": 2.3314562552123663e-05, + "loss": 1.7035, + "step": 5923 + }, + { + "epoch": 2.7164316018805184, + "grad_norm": 0.4110478162765503, + "learning_rate": 2.3239882216768093e-05, + "loss": 2.1335, + "step": 5924 + }, + { + "epoch": 2.7168902648778808, + "grad_norm": 0.4424358606338501, + "learning_rate": 2.3165318834106942e-05, + "loss": 1.9265, + "step": 5925 + }, + { + "epoch": 2.7173489278752436, + "grad_norm": 0.3610834777355194, + "learning_rate": 2.309087242243124e-05, + "loss": 1.1285, + "step": 5926 + }, + { + "epoch": 2.7178075908726065, + "grad_norm": 0.2572648525238037, + "learning_rate": 2.3016543000003222e-05, + "loss": 0.8428, + "step": 5927 + }, + { + "epoch": 2.718266253869969, + "grad_norm": 0.36045536398887634, + "learning_rate": 2.2942330585056347e-05, + "loss": 1.8323, + "step": 5928 + }, + { + "epoch": 2.7187249168673318, + "grad_norm": 0.21581125259399414, + "learning_rate": 2.2868235195795672e-05, + "loss": 1.3904, + "step": 5929 + }, + { + "epoch": 2.719183579864694, + "grad_norm": 0.38794851303100586, + "learning_rate": 2.279425685039721e-05, + "loss": 1.0128, + "step": 5930 + }, + { + "epoch": 2.719642242862057, + "grad_norm": 0.3242373466491699, + "learning_rate": 2.2720395567008334e-05, + "loss": 1.4001, + "step": 5931 + }, + { + "epoch": 2.72010090585942, + "grad_norm": 0.130849227309227, + "learning_rate": 2.2646651363747773e-05, + "loss": 1.1965, + "step": 5932 + }, + { + "epoch": 2.7205595688567827, + "grad_norm": 0.3577413856983185, + "learning_rate": 2.2573024258705554e-05, + "loss": 0.9914, + "step": 5933 + }, + { + "epoch": 2.721018231854145, + "grad_norm": 0.2112027406692505, + "learning_rate": 2.24995142699429e-05, + "loss": 1.7675, + "step": 5934 + }, + { + "epoch": 2.721476894851508, + "grad_norm": 0.28808677196502686, + "learning_rate": 2.242612141549233e-05, + "loss": 0.3857, + "step": 5935 + }, + { + "epoch": 2.7219355578488704, + "grad_norm": 0.2923356294631958, + "learning_rate": 2.2352845713357772e-05, + "loss": 1.5071, + "step": 5936 + }, + { + "epoch": 2.7223942208462333, + "grad_norm": 0.24712756276130676, + "learning_rate": 2.2279687181514076e-05, + "loss": 1.0919, + "step": 5937 + }, + { + "epoch": 2.722852883843596, + "grad_norm": 0.3377127945423126, + "learning_rate": 2.2206645837907602e-05, + "loss": 1.8531, + "step": 5938 + }, + { + "epoch": 2.7233115468409586, + "grad_norm": 0.5375214219093323, + "learning_rate": 2.2133721700456022e-05, + "loss": 1.3341, + "step": 5939 + }, + { + "epoch": 2.7237702098383214, + "grad_norm": 0.37174880504608154, + "learning_rate": 2.2060914787048136e-05, + "loss": 1.8732, + "step": 5940 + }, + { + "epoch": 2.724228872835684, + "grad_norm": 0.319526344537735, + "learning_rate": 2.198822511554399e-05, + "loss": 0.9168, + "step": 5941 + }, + { + "epoch": 2.7246875358330467, + "grad_norm": 0.32301315665245056, + "learning_rate": 2.1915652703774824e-05, + "loss": 1.5364, + "step": 5942 + }, + { + "epoch": 2.7251461988304095, + "grad_norm": 0.2971673309803009, + "learning_rate": 2.184319756954323e-05, + "loss": 0.6956, + "step": 5943 + }, + { + "epoch": 2.725604861827772, + "grad_norm": 0.1800968199968338, + "learning_rate": 2.177085973062293e-05, + "loss": 0.9738, + "step": 5944 + }, + { + "epoch": 2.726063524825135, + "grad_norm": 0.3155420124530792, + "learning_rate": 2.1698639204759006e-05, + "loss": 1.2188, + "step": 5945 + }, + { + "epoch": 2.7265221878224972, + "grad_norm": 0.2110133171081543, + "learning_rate": 2.1626536009667575e-05, + "loss": 1.0003, + "step": 5946 + }, + { + "epoch": 2.72698085081986, + "grad_norm": 0.45827966928482056, + "learning_rate": 2.1554550163036145e-05, + "loss": 1.7184, + "step": 5947 + }, + { + "epoch": 2.727439513817223, + "grad_norm": 0.25445568561553955, + "learning_rate": 2.1482681682523263e-05, + "loss": 0.6922, + "step": 5948 + }, + { + "epoch": 2.7278981768145854, + "grad_norm": 0.3437955379486084, + "learning_rate": 2.141093058575888e-05, + "loss": 1.2655, + "step": 5949 + }, + { + "epoch": 2.728356839811948, + "grad_norm": 0.12270909547805786, + "learning_rate": 2.133929689034403e-05, + "loss": 0.6745, + "step": 5950 + }, + { + "epoch": 2.7288155028093106, + "grad_norm": 0.31163090467453003, + "learning_rate": 2.1267780613850986e-05, + "loss": 2.0465, + "step": 5951 + }, + { + "epoch": 2.7292741658066735, + "grad_norm": 0.2993125021457672, + "learning_rate": 2.119638177382327e-05, + "loss": 1.076, + "step": 5952 + }, + { + "epoch": 2.7297328288040363, + "grad_norm": 0.3309202790260315, + "learning_rate": 2.1125100387775375e-05, + "loss": 0.924, + "step": 5953 + }, + { + "epoch": 2.730191491801399, + "grad_norm": 0.35655951499938965, + "learning_rate": 2.105393647319326e-05, + "loss": 1.8235, + "step": 5954 + }, + { + "epoch": 2.7306501547987616, + "grad_norm": 0.3149603307247162, + "learning_rate": 2.0982890047533898e-05, + "loss": 1.5205, + "step": 5955 + }, + { + "epoch": 2.731108817796124, + "grad_norm": 0.43384552001953125, + "learning_rate": 2.0911961128225466e-05, + "loss": 1.6622, + "step": 5956 + }, + { + "epoch": 2.731567480793487, + "grad_norm": 0.30674198269844055, + "learning_rate": 2.0841149732667375e-05, + "loss": 1.2803, + "step": 5957 + }, + { + "epoch": 2.7320261437908497, + "grad_norm": 0.2231689691543579, + "learning_rate": 2.0770455878230178e-05, + "loss": 0.6584, + "step": 5958 + }, + { + "epoch": 2.7324848067882126, + "grad_norm": 0.23369106650352478, + "learning_rate": 2.069987958225561e-05, + "loss": 1.7311, + "step": 5959 + }, + { + "epoch": 2.732943469785575, + "grad_norm": 0.4220520853996277, + "learning_rate": 2.062942086205649e-05, + "loss": 1.9563, + "step": 5960 + }, + { + "epoch": 2.733402132782938, + "grad_norm": 0.4430352747440338, + "learning_rate": 2.055907973491683e-05, + "loss": 1.5979, + "step": 5961 + }, + { + "epoch": 2.7338607957803003, + "grad_norm": 0.22902828454971313, + "learning_rate": 2.0488856218091935e-05, + "loss": 1.2991, + "step": 5962 + }, + { + "epoch": 2.734319458777663, + "grad_norm": 0.2558428943157196, + "learning_rate": 2.0418750328808024e-05, + "loss": 1.2835, + "step": 5963 + }, + { + "epoch": 2.734778121775026, + "grad_norm": 0.5350182056427002, + "learning_rate": 2.034876208426267e-05, + "loss": 0.9716, + "step": 5964 + }, + { + "epoch": 2.7352367847723884, + "grad_norm": 0.13574962317943573, + "learning_rate": 2.0278891501624375e-05, + "loss": 1.4147, + "step": 5965 + }, + { + "epoch": 2.7356954477697513, + "grad_norm": 0.3386705815792084, + "learning_rate": 2.0209138598033026e-05, + "loss": 1.1691, + "step": 5966 + }, + { + "epoch": 2.7361541107671137, + "grad_norm": 0.34220030903816223, + "learning_rate": 2.013950339059939e-05, + "loss": 2.25, + "step": 5967 + }, + { + "epoch": 2.7366127737644765, + "grad_norm": 0.47397106885910034, + "learning_rate": 2.0069985896405574e-05, + "loss": 1.6315, + "step": 5968 + }, + { + "epoch": 2.7370714367618394, + "grad_norm": 0.3405340015888214, + "learning_rate": 2.0000586132504662e-05, + "loss": 1.6804, + "step": 5969 + }, + { + "epoch": 2.737530099759202, + "grad_norm": 0.3932589292526245, + "learning_rate": 1.993130411592098e-05, + "loss": 1.1376, + "step": 5970 + }, + { + "epoch": 2.7379887627565647, + "grad_norm": 0.3252299427986145, + "learning_rate": 1.986213986364982e-05, + "loss": 1.9539, + "step": 5971 + }, + { + "epoch": 2.738447425753927, + "grad_norm": 0.38866326212882996, + "learning_rate": 1.979309339265778e-05, + "loss": 1.2502, + "step": 5972 + }, + { + "epoch": 2.73890608875129, + "grad_norm": 0.2286403626203537, + "learning_rate": 1.9724164719882367e-05, + "loss": 1.0532, + "step": 5973 + }, + { + "epoch": 2.739364751748653, + "grad_norm": 0.35474932193756104, + "learning_rate": 1.9655353862232326e-05, + "loss": 2.0171, + "step": 5974 + }, + { + "epoch": 2.739823414746015, + "grad_norm": 0.3784157633781433, + "learning_rate": 1.9586660836587554e-05, + "loss": 1.5926, + "step": 5975 + }, + { + "epoch": 2.740282077743378, + "grad_norm": 0.4135417640209198, + "learning_rate": 1.9518085659798734e-05, + "loss": 1.5249, + "step": 5976 + }, + { + "epoch": 2.7407407407407405, + "grad_norm": 0.37199753522872925, + "learning_rate": 1.9449628348687964e-05, + "loss": 1.5783, + "step": 5977 + }, + { + "epoch": 2.7411994037381033, + "grad_norm": 0.29575780034065247, + "learning_rate": 1.9381288920048255e-05, + "loss": 1.1942, + "step": 5978 + }, + { + "epoch": 2.741658066735466, + "grad_norm": 0.2783096730709076, + "learning_rate": 1.9313067390643866e-05, + "loss": 1.7344, + "step": 5979 + }, + { + "epoch": 2.742116729732829, + "grad_norm": 0.37723588943481445, + "learning_rate": 1.9244963777209967e-05, + "loss": 1.3625, + "step": 5980 + }, + { + "epoch": 2.7425753927301915, + "grad_norm": 0.26153478026390076, + "learning_rate": 1.917697809645291e-05, + "loss": 0.8029, + "step": 5981 + }, + { + "epoch": 2.7430340557275543, + "grad_norm": 0.28612053394317627, + "learning_rate": 1.910911036505003e-05, + "loss": 1.6521, + "step": 5982 + }, + { + "epoch": 2.7434927187249167, + "grad_norm": 0.23514768481254578, + "learning_rate": 1.9041360599649725e-05, + "loss": 1.1054, + "step": 5983 + }, + { + "epoch": 2.7439513817222796, + "grad_norm": 0.44542109966278076, + "learning_rate": 1.8973728816871592e-05, + "loss": 1.3016, + "step": 5984 + }, + { + "epoch": 2.7444100447196424, + "grad_norm": 0.19931384921073914, + "learning_rate": 1.8906215033306196e-05, + "loss": 1.5551, + "step": 5985 + }, + { + "epoch": 2.744868707717005, + "grad_norm": 0.2757704555988312, + "learning_rate": 1.8838819265515117e-05, + "loss": 1.2303, + "step": 5986 + }, + { + "epoch": 2.7453273707143677, + "grad_norm": 0.39500927925109863, + "learning_rate": 1.8771541530031023e-05, + "loss": 1.5353, + "step": 5987 + }, + { + "epoch": 2.74578603371173, + "grad_norm": 0.350881963968277, + "learning_rate": 1.8704381843357598e-05, + "loss": 1.248, + "step": 5988 + }, + { + "epoch": 2.746244696709093, + "grad_norm": 0.30257681012153625, + "learning_rate": 1.8637340221969613e-05, + "loss": 0.8883, + "step": 5989 + }, + { + "epoch": 2.746703359706456, + "grad_norm": 0.258948415517807, + "learning_rate": 1.8570416682312908e-05, + "loss": 1.0292, + "step": 5990 + }, + { + "epoch": 2.7471620227038183, + "grad_norm": 0.16734470427036285, + "learning_rate": 1.8503611240804186e-05, + "loss": 1.1654, + "step": 5991 + }, + { + "epoch": 2.747620685701181, + "grad_norm": 0.2848394811153412, + "learning_rate": 1.8436923913831506e-05, + "loss": 1.6372, + "step": 5992 + }, + { + "epoch": 2.7480793486985435, + "grad_norm": 0.3137653172016144, + "learning_rate": 1.8370354717753612e-05, + "loss": 0.653, + "step": 5993 + }, + { + "epoch": 2.7485380116959064, + "grad_norm": 0.21349652111530304, + "learning_rate": 1.8303903668900446e-05, + "loss": 1.3882, + "step": 5994 + }, + { + "epoch": 2.7489966746932692, + "grad_norm": 0.3199220895767212, + "learning_rate": 1.823757078357291e-05, + "loss": 0.9641, + "step": 5995 + }, + { + "epoch": 2.7494553376906317, + "grad_norm": 0.31056568026542664, + "learning_rate": 1.8171356078042932e-05, + "loss": 1.7107, + "step": 5996 + }, + { + "epoch": 2.7499140006879945, + "grad_norm": 0.3857615888118744, + "learning_rate": 1.8105259568553524e-05, + "loss": 1.0108, + "step": 5997 + }, + { + "epoch": 2.750372663685357, + "grad_norm": 0.2920821011066437, + "learning_rate": 1.803928127131854e-05, + "loss": 1.8806, + "step": 5998 + }, + { + "epoch": 2.75083132668272, + "grad_norm": 0.28004190325737, + "learning_rate": 1.797342120252299e-05, + "loss": 0.4187, + "step": 5999 + }, + { + "epoch": 2.7512899896800826, + "grad_norm": 0.26144489645957947, + "learning_rate": 1.7907679378322716e-05, + "loss": 1.446, + "step": 6000 + }, + { + "epoch": 2.7517486526774455, + "grad_norm": 0.2627753019332886, + "learning_rate": 1.7842055814844828e-05, + "loss": 1.212, + "step": 6001 + }, + { + "epoch": 2.752207315674808, + "grad_norm": 0.3469284772872925, + "learning_rate": 1.777655052818722e-05, + "loss": 0.7107, + "step": 6002 + }, + { + "epoch": 2.7526659786721708, + "grad_norm": 0.06406933069229126, + "learning_rate": 1.771116353441876e-05, + "loss": 0.7278, + "step": 6003 + }, + { + "epoch": 2.753124641669533, + "grad_norm": 0.2524787485599518, + "learning_rate": 1.7645894849579403e-05, + "loss": 0.8452, + "step": 6004 + }, + { + "epoch": 2.753583304666896, + "grad_norm": 0.2804563343524933, + "learning_rate": 1.7580744489679945e-05, + "loss": 0.7473, + "step": 6005 + }, + { + "epoch": 2.754041967664259, + "grad_norm": 0.2500944435596466, + "learning_rate": 1.7515712470702272e-05, + "loss": 1.0894, + "step": 6006 + }, + { + "epoch": 2.7545006306616213, + "grad_norm": 0.2973553538322449, + "learning_rate": 1.7450798808599234e-05, + "loss": 1.3237, + "step": 6007 + }, + { + "epoch": 2.754959293658984, + "grad_norm": 0.201811283826828, + "learning_rate": 1.73860035192947e-05, + "loss": 1.2607, + "step": 6008 + }, + { + "epoch": 2.7554179566563466, + "grad_norm": 0.4674420952796936, + "learning_rate": 1.7321326618683243e-05, + "loss": 1.6664, + "step": 6009 + }, + { + "epoch": 2.7558766196537094, + "grad_norm": 0.37687787413597107, + "learning_rate": 1.7256768122630607e-05, + "loss": 1.213, + "step": 6010 + }, + { + "epoch": 2.7563352826510723, + "grad_norm": 0.20660851895809174, + "learning_rate": 1.7192328046973572e-05, + "loss": 0.5505, + "step": 6011 + }, + { + "epoch": 2.7567939456484347, + "grad_norm": 0.573073148727417, + "learning_rate": 1.712800640751966e-05, + "loss": 1.4566, + "step": 6012 + }, + { + "epoch": 2.7572526086457976, + "grad_norm": 0.20739376544952393, + "learning_rate": 1.7063803220047524e-05, + "loss": 1.18, + "step": 6013 + }, + { + "epoch": 2.75771127164316, + "grad_norm": 0.3631076514720917, + "learning_rate": 1.6999718500306626e-05, + "loss": 1.2751, + "step": 6014 + }, + { + "epoch": 2.758169934640523, + "grad_norm": 0.3393692076206207, + "learning_rate": 1.6935752264017334e-05, + "loss": 1.3281, + "step": 6015 + }, + { + "epoch": 2.7586285976378857, + "grad_norm": 0.435544490814209, + "learning_rate": 1.6871904526871096e-05, + "loss": 1.7262, + "step": 6016 + }, + { + "epoch": 2.759087260635248, + "grad_norm": 0.20737430453300476, + "learning_rate": 1.680817530453016e-05, + "loss": 0.6926, + "step": 6017 + }, + { + "epoch": 2.759545923632611, + "grad_norm": 0.2829097807407379, + "learning_rate": 1.674456461262791e-05, + "loss": 1.6014, + "step": 6018 + }, + { + "epoch": 2.7600045866299734, + "grad_norm": 0.3296710252761841, + "learning_rate": 1.6681072466768367e-05, + "loss": 2.1915, + "step": 6019 + }, + { + "epoch": 2.7604632496273362, + "grad_norm": 0.3263249099254608, + "learning_rate": 1.6617698882526623e-05, + "loss": 1.3439, + "step": 6020 + }, + { + "epoch": 2.760921912624699, + "grad_norm": 0.34194350242614746, + "learning_rate": 1.6554443875448744e-05, + "loss": 1.4912, + "step": 6021 + }, + { + "epoch": 2.761380575622062, + "grad_norm": 0.36899879574775696, + "learning_rate": 1.6491307461051595e-05, + "loss": 1.7578, + "step": 6022 + }, + { + "epoch": 2.7618392386194244, + "grad_norm": 0.3341805934906006, + "learning_rate": 1.6428289654823014e-05, + "loss": 1.1788, + "step": 6023 + }, + { + "epoch": 2.762297901616787, + "grad_norm": 0.32973915338516235, + "learning_rate": 1.6365390472221742e-05, + "loss": 1.2469, + "step": 6024 + }, + { + "epoch": 2.7627565646141496, + "grad_norm": 0.463471919298172, + "learning_rate": 1.6302609928677382e-05, + "loss": 1.6849, + "step": 6025 + }, + { + "epoch": 2.7632152276115125, + "grad_norm": 0.40361595153808594, + "learning_rate": 1.6239948039590393e-05, + "loss": 1.0661, + "step": 6026 + }, + { + "epoch": 2.7636738906088754, + "grad_norm": 0.2925053834915161, + "learning_rate": 1.6177404820332253e-05, + "loss": 1.342, + "step": 6027 + }, + { + "epoch": 2.7641325536062378, + "grad_norm": 0.30632370710372925, + "learning_rate": 1.611498028624525e-05, + "loss": 1.151, + "step": 6028 + }, + { + "epoch": 2.7645912166036006, + "grad_norm": 0.22111092507839203, + "learning_rate": 1.605267445264258e-05, + "loss": 0.4576, + "step": 6029 + }, + { + "epoch": 2.765049879600963, + "grad_norm": 0.14749830961227417, + "learning_rate": 1.5990487334808292e-05, + "loss": 1.0352, + "step": 6030 + }, + { + "epoch": 2.765508542598326, + "grad_norm": 0.2582720220088959, + "learning_rate": 1.59284189479974e-05, + "loss": 1.6917, + "step": 6031 + }, + { + "epoch": 2.7659672055956888, + "grad_norm": 0.41496822237968445, + "learning_rate": 1.5866469307435626e-05, + "loss": 1.5566, + "step": 6032 + }, + { + "epoch": 2.766425868593051, + "grad_norm": 0.3769316077232361, + "learning_rate": 1.5804638428319694e-05, + "loss": 1.65, + "step": 6033 + }, + { + "epoch": 2.766884531590414, + "grad_norm": 0.23421497642993927, + "learning_rate": 1.5742926325817253e-05, + "loss": 1.1492, + "step": 6034 + }, + { + "epoch": 2.7673431945877764, + "grad_norm": 0.33073118329048157, + "learning_rate": 1.5681333015066635e-05, + "loss": 1.5674, + "step": 6035 + }, + { + "epoch": 2.7678018575851393, + "grad_norm": 0.48587143421173096, + "learning_rate": 1.56198585111772e-05, + "loss": 0.745, + "step": 6036 + }, + { + "epoch": 2.768260520582502, + "grad_norm": 0.20380260050296783, + "learning_rate": 1.5558502829228937e-05, + "loss": 0.9425, + "step": 6037 + }, + { + "epoch": 2.7687191835798646, + "grad_norm": 0.2204909473657608, + "learning_rate": 1.549726598427298e-05, + "loss": 1.633, + "step": 6038 + }, + { + "epoch": 2.7691778465772274, + "grad_norm": 0.34915247559547424, + "learning_rate": 1.5436147991331083e-05, + "loss": 0.5378, + "step": 6039 + }, + { + "epoch": 2.76963650957459, + "grad_norm": 0.24728168547153473, + "learning_rate": 1.5375148865396038e-05, + "loss": 1.2788, + "step": 6040 + }, + { + "epoch": 2.7700951725719527, + "grad_norm": 0.24167189002037048, + "learning_rate": 1.531426862143126e-05, + "loss": 1.3767, + "step": 6041 + }, + { + "epoch": 2.7705538355693156, + "grad_norm": 0.3961116075515747, + "learning_rate": 1.5253507274371137e-05, + "loss": 1.0845, + "step": 6042 + }, + { + "epoch": 2.771012498566678, + "grad_norm": 0.28995588421821594, + "learning_rate": 1.5192864839120912e-05, + "loss": 1.2602, + "step": 6043 + }, + { + "epoch": 2.771471161564041, + "grad_norm": 0.2699783742427826, + "learning_rate": 1.5132341330556576e-05, + "loss": 0.9651, + "step": 6044 + }, + { + "epoch": 2.7719298245614032, + "grad_norm": 0.2684682011604309, + "learning_rate": 1.5071936763524974e-05, + "loss": 1.4155, + "step": 6045 + }, + { + "epoch": 2.772388487558766, + "grad_norm": 0.27157121896743774, + "learning_rate": 1.5011651152843809e-05, + "loss": 1.6052, + "step": 6046 + }, + { + "epoch": 2.772847150556129, + "grad_norm": 0.3431642949581146, + "learning_rate": 1.4951484513301583e-05, + "loss": 1.454, + "step": 6047 + }, + { + "epoch": 2.773305813553492, + "grad_norm": 0.32993775606155396, + "learning_rate": 1.4891436859657604e-05, + "loss": 0.9063, + "step": 6048 + }, + { + "epoch": 2.7737644765508542, + "grad_norm": 0.28404250741004944, + "learning_rate": 1.483150820664192e-05, + "loss": 1.5258, + "step": 6049 + }, + { + "epoch": 2.774223139548217, + "grad_norm": 0.4539187252521515, + "learning_rate": 1.477169856895555e-05, + "loss": 1.155, + "step": 6050 + }, + { + "epoch": 2.7746818025455795, + "grad_norm": 0.20708614587783813, + "learning_rate": 1.4712007961270146e-05, + "loss": 0.9088, + "step": 6051 + }, + { + "epoch": 2.7751404655429424, + "grad_norm": 0.09351902455091476, + "learning_rate": 1.4652436398228385e-05, + "loss": 0.8234, + "step": 6052 + }, + { + "epoch": 2.775599128540305, + "grad_norm": 0.2884185016155243, + "learning_rate": 1.4592983894443468e-05, + "loss": 1.8303, + "step": 6053 + }, + { + "epoch": 2.7760577915376676, + "grad_norm": 0.3106091618537903, + "learning_rate": 1.4533650464499559e-05, + "loss": 1.0048, + "step": 6054 + }, + { + "epoch": 2.7765164545350305, + "grad_norm": 0.42278361320495605, + "learning_rate": 1.4474436122951572e-05, + "loss": 1.63, + "step": 6055 + }, + { + "epoch": 2.776975117532393, + "grad_norm": 0.37238556146621704, + "learning_rate": 1.4415340884325223e-05, + "loss": 1.6514, + "step": 6056 + }, + { + "epoch": 2.7774337805297558, + "grad_norm": 0.28060218691825867, + "learning_rate": 1.4356364763117024e-05, + "loss": 0.8222, + "step": 6057 + }, + { + "epoch": 2.7778924435271186, + "grad_norm": 0.27165743708610535, + "learning_rate": 1.4297507773794239e-05, + "loss": 0.7359, + "step": 6058 + }, + { + "epoch": 2.778351106524481, + "grad_norm": 0.22083696722984314, + "learning_rate": 1.4238769930794926e-05, + "loss": 1.2175, + "step": 6059 + }, + { + "epoch": 2.778809769521844, + "grad_norm": 0.3628489375114441, + "learning_rate": 1.4180151248527784e-05, + "loss": 2.0398, + "step": 6060 + }, + { + "epoch": 2.7792684325192063, + "grad_norm": 0.36967116594314575, + "learning_rate": 1.4121651741372533e-05, + "loss": 1.1404, + "step": 6061 + }, + { + "epoch": 2.779727095516569, + "grad_norm": 0.30434438586235046, + "learning_rate": 1.4063271423679469e-05, + "loss": 1.418, + "step": 6062 + }, + { + "epoch": 2.780185758513932, + "grad_norm": 0.34148743748664856, + "learning_rate": 1.4005010309769638e-05, + "loss": 1.3456, + "step": 6063 + }, + { + "epoch": 2.7806444215112944, + "grad_norm": 0.2640988528728485, + "learning_rate": 1.3946868413935055e-05, + "loss": 1.0164, + "step": 6064 + }, + { + "epoch": 2.7811030845086573, + "grad_norm": 0.2465277761220932, + "learning_rate": 1.3888845750438306e-05, + "loss": 0.7947, + "step": 6065 + }, + { + "epoch": 2.7815617475060197, + "grad_norm": 0.35097736120224, + "learning_rate": 1.3830942333512675e-05, + "loss": 1.9631, + "step": 6066 + }, + { + "epoch": 2.7820204105033826, + "grad_norm": 0.409355103969574, + "learning_rate": 1.3773158177362356e-05, + "loss": 1.48, + "step": 6067 + }, + { + "epoch": 2.7824790735007454, + "grad_norm": 0.324785441160202, + "learning_rate": 1.371549329616223e-05, + "loss": 1.4276, + "step": 6068 + }, + { + "epoch": 2.7829377364981083, + "grad_norm": 0.372164249420166, + "learning_rate": 1.3657947704057872e-05, + "loss": 1.144, + "step": 6069 + }, + { + "epoch": 2.7833963994954707, + "grad_norm": 0.160455122590065, + "learning_rate": 1.3600521415165712e-05, + "loss": 1.3252, + "step": 6070 + }, + { + "epoch": 2.7838550624928335, + "grad_norm": 0.3352322280406952, + "learning_rate": 1.354321444357276e-05, + "loss": 1.2853, + "step": 6071 + }, + { + "epoch": 2.784313725490196, + "grad_norm": 0.2882045805454254, + "learning_rate": 1.3486026803336715e-05, + "loss": 1.1088, + "step": 6072 + }, + { + "epoch": 2.784772388487559, + "grad_norm": 0.28998827934265137, + "learning_rate": 1.3428958508486355e-05, + "loss": 1.6669, + "step": 6073 + }, + { + "epoch": 2.7852310514849217, + "grad_norm": 0.6466904282569885, + "learning_rate": 1.3372009573020816e-05, + "loss": 1.6118, + "step": 6074 + }, + { + "epoch": 2.785689714482284, + "grad_norm": 0.3447767496109009, + "learning_rate": 1.3315180010910145e-05, + "loss": 1.3989, + "step": 6075 + }, + { + "epoch": 2.786148377479647, + "grad_norm": 0.24244464933872223, + "learning_rate": 1.3258469836094911e-05, + "loss": 0.4131, + "step": 6076 + }, + { + "epoch": 2.7866070404770094, + "grad_norm": 0.264919251203537, + "learning_rate": 1.3201879062486655e-05, + "loss": 1.6393, + "step": 6077 + }, + { + "epoch": 2.787065703474372, + "grad_norm": 0.266245037317276, + "learning_rate": 1.3145407703967438e-05, + "loss": 1.474, + "step": 6078 + }, + { + "epoch": 2.787524366471735, + "grad_norm": 0.3023591935634613, + "learning_rate": 1.3089055774390124e-05, + "loss": 1.1628, + "step": 6079 + }, + { + "epoch": 2.7879830294690975, + "grad_norm": 0.28299427032470703, + "learning_rate": 1.3032823287578266e-05, + "loss": 1.1443, + "step": 6080 + }, + { + "epoch": 2.7884416924664603, + "grad_norm": 0.30756518244743347, + "learning_rate": 1.2976710257326053e-05, + "loss": 1.1689, + "step": 6081 + }, + { + "epoch": 2.7889003554638228, + "grad_norm": 0.2923178970813751, + "learning_rate": 1.2920716697398416e-05, + "loss": 1.3088, + "step": 6082 + }, + { + "epoch": 2.7893590184611856, + "grad_norm": 0.3415941894054413, + "learning_rate": 1.2864842621530982e-05, + "loss": 1.1528, + "step": 6083 + }, + { + "epoch": 2.7898176814585485, + "grad_norm": 0.37786003947257996, + "learning_rate": 1.2809088043430116e-05, + "loss": 1.2742, + "step": 6084 + }, + { + "epoch": 2.790276344455911, + "grad_norm": 0.27256593108177185, + "learning_rate": 1.2753452976772773e-05, + "loss": 1.4647, + "step": 6085 + }, + { + "epoch": 2.7907350074532737, + "grad_norm": 0.3987545073032379, + "learning_rate": 1.2697937435206642e-05, + "loss": 1.9492, + "step": 6086 + }, + { + "epoch": 2.791193670450636, + "grad_norm": 0.36392930150032043, + "learning_rate": 1.2642541432350108e-05, + "loss": 1.2538, + "step": 6087 + }, + { + "epoch": 2.791652333447999, + "grad_norm": 0.31756749749183655, + "learning_rate": 1.258726498179219e-05, + "loss": 0.9692, + "step": 6088 + }, + { + "epoch": 2.792110996445362, + "grad_norm": 0.20181064307689667, + "learning_rate": 1.2532108097092598e-05, + "loss": 0.8022, + "step": 6089 + }, + { + "epoch": 2.7925696594427247, + "grad_norm": 0.1365479826927185, + "learning_rate": 1.2477070791781675e-05, + "loss": 0.9823, + "step": 6090 + }, + { + "epoch": 2.793028322440087, + "grad_norm": 0.20917746424674988, + "learning_rate": 1.242215307936051e-05, + "loss": 0.7463, + "step": 6091 + }, + { + "epoch": 2.79348698543745, + "grad_norm": 0.3799673318862915, + "learning_rate": 1.2367354973300881e-05, + "loss": 1.5071, + "step": 6092 + }, + { + "epoch": 2.7939456484348124, + "grad_norm": 0.3432566523551941, + "learning_rate": 1.2312676487045038e-05, + "loss": 1.736, + "step": 6093 + }, + { + "epoch": 2.7944043114321753, + "grad_norm": 0.33574342727661133, + "learning_rate": 1.2258117634006028e-05, + "loss": 0.94, + "step": 6094 + }, + { + "epoch": 2.794862974429538, + "grad_norm": 0.35387691855430603, + "learning_rate": 1.2203678427567588e-05, + "loss": 1.8061, + "step": 6095 + }, + { + "epoch": 2.7953216374269005, + "grad_norm": 0.2692587971687317, + "learning_rate": 1.2149358881084039e-05, + "loss": 1.1821, + "step": 6096 + }, + { + "epoch": 2.7957803004242634, + "grad_norm": 0.36804336309432983, + "learning_rate": 1.2095159007880385e-05, + "loss": 1.5841, + "step": 6097 + }, + { + "epoch": 2.796238963421626, + "grad_norm": 0.36808767914772034, + "learning_rate": 1.204107882125216e-05, + "loss": 1.8313, + "step": 6098 + }, + { + "epoch": 2.7966976264189887, + "grad_norm": 0.33622780442237854, + "learning_rate": 1.1987118334465696e-05, + "loss": 1.2008, + "step": 6099 + }, + { + "epoch": 2.7971562894163515, + "grad_norm": 0.27367860078811646, + "learning_rate": 1.1933277560757793e-05, + "loss": 1.3219, + "step": 6100 + }, + { + "epoch": 2.797614952413714, + "grad_norm": 0.28815925121307373, + "learning_rate": 1.187955651333611e-05, + "loss": 0.7678, + "step": 6101 + }, + { + "epoch": 2.798073615411077, + "grad_norm": 0.27640852332115173, + "learning_rate": 1.1825955205378713e-05, + "loss": 1.75, + "step": 6102 + }, + { + "epoch": 2.798532278408439, + "grad_norm": 0.32954832911491394, + "learning_rate": 1.1772473650034421e-05, + "loss": 1.0438, + "step": 6103 + }, + { + "epoch": 2.798990941405802, + "grad_norm": 0.3560253381729126, + "learning_rate": 1.1719111860422627e-05, + "loss": 1.1265, + "step": 6104 + }, + { + "epoch": 2.799449604403165, + "grad_norm": 0.11905957758426666, + "learning_rate": 1.1665869849633414e-05, + "loss": 1.2286, + "step": 6105 + }, + { + "epoch": 2.7999082674005273, + "grad_norm": 0.2836627960205078, + "learning_rate": 1.1612747630727394e-05, + "loss": 1.1157, + "step": 6106 + }, + { + "epoch": 2.80036693039789, + "grad_norm": 0.33813512325286865, + "learning_rate": 1.1559745216735806e-05, + "loss": 0.8221, + "step": 6107 + }, + { + "epoch": 2.8008255933952526, + "grad_norm": 0.24317127466201782, + "learning_rate": 1.1506862620660586e-05, + "loss": 1.5408, + "step": 6108 + }, + { + "epoch": 2.8012842563926155, + "grad_norm": 0.33810973167419434, + "learning_rate": 1.1454099855474242e-05, + "loss": 1.2968, + "step": 6109 + }, + { + "epoch": 2.8017429193899783, + "grad_norm": 0.31549063324928284, + "learning_rate": 1.1401456934119703e-05, + "loss": 1.26, + "step": 6110 + }, + { + "epoch": 2.8022015823873407, + "grad_norm": 0.2274772822856903, + "learning_rate": 1.1348933869510802e-05, + "loss": 0.9997, + "step": 6111 + }, + { + "epoch": 2.8026602453847036, + "grad_norm": 0.3890027403831482, + "learning_rate": 1.1296530674531735e-05, + "loss": 1.1818, + "step": 6112 + }, + { + "epoch": 2.803118908382066, + "grad_norm": 0.2179887443780899, + "learning_rate": 1.1244247362037496e-05, + "loss": 1.7811, + "step": 6113 + }, + { + "epoch": 2.803577571379429, + "grad_norm": 0.3312167227268219, + "learning_rate": 1.1192083944853438e-05, + "loss": 1.1515, + "step": 6114 + }, + { + "epoch": 2.8040362343767917, + "grad_norm": 0.24871665239334106, + "learning_rate": 1.1140040435775655e-05, + "loss": 1.3264, + "step": 6115 + }, + { + "epoch": 2.8044948973741546, + "grad_norm": 0.30252861976623535, + "learning_rate": 1.1088116847570885e-05, + "loss": 1.4769, + "step": 6116 + }, + { + "epoch": 2.804953560371517, + "grad_norm": 0.31379151344299316, + "learning_rate": 1.1036313192976266e-05, + "loss": 1.2689, + "step": 6117 + }, + { + "epoch": 2.80541222336888, + "grad_norm": 0.30472010374069214, + "learning_rate": 1.0984629484699582e-05, + "loss": 0.8448, + "step": 6118 + }, + { + "epoch": 2.8058708863662423, + "grad_norm": 0.3143567144870758, + "learning_rate": 1.09330657354193e-05, + "loss": 1.4705, + "step": 6119 + }, + { + "epoch": 2.806329549363605, + "grad_norm": 0.34770259261131287, + "learning_rate": 1.0881621957784416e-05, + "loss": 1.1561, + "step": 6120 + }, + { + "epoch": 2.806788212360968, + "grad_norm": 0.09801855683326721, + "learning_rate": 1.0830298164414331e-05, + "loss": 0.8337, + "step": 6121 + }, + { + "epoch": 2.8072468753583304, + "grad_norm": 0.2806740701198578, + "learning_rate": 1.0779094367899201e-05, + "loss": 1.6565, + "step": 6122 + }, + { + "epoch": 2.8077055383556933, + "grad_norm": 0.7073934674263, + "learning_rate": 1.0728010580799696e-05, + "loss": 1.5767, + "step": 6123 + }, + { + "epoch": 2.8081642013530557, + "grad_norm": 0.32200196385383606, + "learning_rate": 1.0677046815647018e-05, + "loss": 0.7768, + "step": 6124 + }, + { + "epoch": 2.8086228643504185, + "grad_norm": 0.08174686133861542, + "learning_rate": 1.0626203084942886e-05, + "loss": 1.385, + "step": 6125 + }, + { + "epoch": 2.8090815273477814, + "grad_norm": 0.3870648741722107, + "learning_rate": 1.0575479401159827e-05, + "loss": 1.4463, + "step": 6126 + }, + { + "epoch": 2.809540190345144, + "grad_norm": 0.289341539144516, + "learning_rate": 1.052487577674055e-05, + "loss": 1.167, + "step": 6127 + }, + { + "epoch": 2.8099988533425067, + "grad_norm": 0.31475889682769775, + "learning_rate": 1.0474392224098572e-05, + "loss": 0.841, + "step": 6128 + }, + { + "epoch": 2.810457516339869, + "grad_norm": 0.29922276735305786, + "learning_rate": 1.042402875561782e-05, + "loss": 2.0278, + "step": 6129 + }, + { + "epoch": 2.810916179337232, + "grad_norm": 0.4082167148590088, + "learning_rate": 1.0373785383652856e-05, + "loss": 1.6776, + "step": 6130 + }, + { + "epoch": 2.811374842334595, + "grad_norm": 0.3589903712272644, + "learning_rate": 1.0323662120528765e-05, + "loss": 1.1015, + "step": 6131 + }, + { + "epoch": 2.811833505331957, + "grad_norm": 0.24516572058200836, + "learning_rate": 1.0273658978541044e-05, + "loss": 1.232, + "step": 6132 + }, + { + "epoch": 2.81229216832932, + "grad_norm": 0.21401365101337433, + "learning_rate": 1.0223775969955883e-05, + "loss": 1.2669, + "step": 6133 + }, + { + "epoch": 2.8127508313266825, + "grad_norm": 0.4826924800872803, + "learning_rate": 1.0174013107009938e-05, + "loss": 1.4506, + "step": 6134 + }, + { + "epoch": 2.8132094943240453, + "grad_norm": 0.5444715023040771, + "learning_rate": 1.0124370401910388e-05, + "loss": 1.0889, + "step": 6135 + }, + { + "epoch": 2.813668157321408, + "grad_norm": 0.32295656204223633, + "learning_rate": 1.0074847866834991e-05, + "loss": 1.4926, + "step": 6136 + }, + { + "epoch": 2.814126820318771, + "grad_norm": 0.3280322253704071, + "learning_rate": 1.002544551393203e-05, + "loss": 1.8701, + "step": 6137 + }, + { + "epoch": 2.8145854833161335, + "grad_norm": 0.3465189039707184, + "learning_rate": 9.976163355320089e-06, + "loss": 1.6473, + "step": 6138 + }, + { + "epoch": 2.8150441463134963, + "grad_norm": 0.406650573015213, + "learning_rate": 9.92700140308861e-06, + "loss": 1.2048, + "step": 6139 + }, + { + "epoch": 2.8155028093108587, + "grad_norm": 0.32793253660202026, + "learning_rate": 9.87795966929722e-06, + "loss": 1.0226, + "step": 6140 + }, + { + "epoch": 2.8159614723082216, + "grad_norm": 0.24950462579727173, + "learning_rate": 9.8290381659763e-06, + "loss": 1.614, + "step": 6141 + }, + { + "epoch": 2.8164201353055844, + "grad_norm": 0.27943155169487, + "learning_rate": 9.780236905126694e-06, + "loss": 1.1877, + "step": 6142 + }, + { + "epoch": 2.816878798302947, + "grad_norm": 0.33849433064460754, + "learning_rate": 9.731555898719601e-06, + "loss": 1.5759, + "step": 6143 + }, + { + "epoch": 2.8173374613003097, + "grad_norm": 0.28267404437065125, + "learning_rate": 9.682995158696806e-06, + "loss": 0.7854, + "step": 6144 + }, + { + "epoch": 2.817796124297672, + "grad_norm": 0.28812381625175476, + "learning_rate": 9.63455469697072e-06, + "loss": 1.5585, + "step": 6145 + }, + { + "epoch": 2.818254787295035, + "grad_norm": 0.3284319341182709, + "learning_rate": 9.58623452542412e-06, + "loss": 1.6185, + "step": 6146 + }, + { + "epoch": 2.818713450292398, + "grad_norm": 0.391230046749115, + "learning_rate": 9.538034655910189e-06, + "loss": 1.822, + "step": 6147 + }, + { + "epoch": 2.8191721132897603, + "grad_norm": 0.38372287154197693, + "learning_rate": 9.489955100252855e-06, + "loss": 1.2379, + "step": 6148 + }, + { + "epoch": 2.819630776287123, + "grad_norm": 0.37333911657333374, + "learning_rate": 9.441995870246244e-06, + "loss": 1.638, + "step": 6149 + }, + { + "epoch": 2.8200894392844855, + "grad_norm": 0.33912524580955505, + "learning_rate": 9.394156977655165e-06, + "loss": 1.0467, + "step": 6150 + }, + { + "epoch": 2.8205481022818484, + "grad_norm": 0.24765615165233612, + "learning_rate": 9.346438434214843e-06, + "loss": 1.575, + "step": 6151 + }, + { + "epoch": 2.8210067652792112, + "grad_norm": 0.6433941125869751, + "learning_rate": 9.298840251630913e-06, + "loss": 1.5814, + "step": 6152 + }, + { + "epoch": 2.8214654282765737, + "grad_norm": 0.2799464464187622, + "learning_rate": 9.251362441579646e-06, + "loss": 1.0583, + "step": 6153 + }, + { + "epoch": 2.8219240912739365, + "grad_norm": 0.21018120646476746, + "learning_rate": 9.204005015707673e-06, + "loss": 1.2, + "step": 6154 + }, + { + "epoch": 2.822382754271299, + "grad_norm": 0.2773264944553375, + "learning_rate": 9.15676798563203e-06, + "loss": 1.0197, + "step": 6155 + }, + { + "epoch": 2.822841417268662, + "grad_norm": 0.34957289695739746, + "learning_rate": 9.109651362940397e-06, + "loss": 1.3116, + "step": 6156 + }, + { + "epoch": 2.8233000802660246, + "grad_norm": 0.3279551863670349, + "learning_rate": 9.062655159190802e-06, + "loss": 2.0532, + "step": 6157 + }, + { + "epoch": 2.8237587432633875, + "grad_norm": 0.4215746521949768, + "learning_rate": 9.015779385911748e-06, + "loss": 1.4172, + "step": 6158 + }, + { + "epoch": 2.82421740626075, + "grad_norm": 0.41953399777412415, + "learning_rate": 8.969024054602204e-06, + "loss": 1.0516, + "step": 6159 + }, + { + "epoch": 2.8246760692581128, + "grad_norm": 0.08408641070127487, + "learning_rate": 8.922389176731549e-06, + "loss": 1.189, + "step": 6160 + }, + { + "epoch": 2.825134732255475, + "grad_norm": 0.3760605752468109, + "learning_rate": 8.875874763739633e-06, + "loss": 1.4748, + "step": 6161 + }, + { + "epoch": 2.825593395252838, + "grad_norm": 0.21087396144866943, + "learning_rate": 8.829480827036884e-06, + "loss": 0.7344, + "step": 6162 + }, + { + "epoch": 2.826052058250201, + "grad_norm": 0.19473138451576233, + "learning_rate": 8.783207378003977e-06, + "loss": 0.9306, + "step": 6163 + }, + { + "epoch": 2.8265107212475633, + "grad_norm": 0.3104762136936188, + "learning_rate": 8.737054427992164e-06, + "loss": 1.9001, + "step": 6164 + }, + { + "epoch": 2.826969384244926, + "grad_norm": 0.4818089008331299, + "learning_rate": 8.691021988323111e-06, + "loss": 1.6207, + "step": 6165 + }, + { + "epoch": 2.8274280472422886, + "grad_norm": 0.31389549374580383, + "learning_rate": 8.645110070288897e-06, + "loss": 1.3694, + "step": 6166 + }, + { + "epoch": 2.8278867102396514, + "grad_norm": 0.352982759475708, + "learning_rate": 8.599318685152014e-06, + "loss": 1.1738, + "step": 6167 + }, + { + "epoch": 2.8283453732370143, + "grad_norm": 0.20497311651706696, + "learning_rate": 8.553647844145418e-06, + "loss": 0.7585, + "step": 6168 + }, + { + "epoch": 2.8288040362343767, + "grad_norm": 0.30926766991615295, + "learning_rate": 8.508097558472538e-06, + "loss": 1.0177, + "step": 6169 + }, + { + "epoch": 2.8292626992317396, + "grad_norm": 0.3021996319293976, + "learning_rate": 8.462667839307159e-06, + "loss": 1.1897, + "step": 6170 + }, + { + "epoch": 2.829721362229102, + "grad_norm": 0.2463819533586502, + "learning_rate": 8.417358697793587e-06, + "loss": 1.6089, + "step": 6171 + }, + { + "epoch": 2.830180025226465, + "grad_norm": 0.3797639012336731, + "learning_rate": 8.372170145046376e-06, + "loss": 1.718, + "step": 6172 + }, + { + "epoch": 2.8306386882238277, + "grad_norm": 0.26537811756134033, + "learning_rate": 8.327102192150604e-06, + "loss": 1.5723, + "step": 6173 + }, + { + "epoch": 2.83109735122119, + "grad_norm": 0.26074454188346863, + "learning_rate": 8.282154850161871e-06, + "loss": 0.9977, + "step": 6174 + }, + { + "epoch": 2.831556014218553, + "grad_norm": 0.29502686858177185, + "learning_rate": 8.23732813010597e-06, + "loss": 1.4637, + "step": 6175 + }, + { + "epoch": 2.8320146772159154, + "grad_norm": 0.42862004041671753, + "learning_rate": 8.192622042979325e-06, + "loss": 1.2069, + "step": 6176 + }, + { + "epoch": 2.8324733402132782, + "grad_norm": 0.197200745344162, + "learning_rate": 8.148036599748554e-06, + "loss": 1.193, + "step": 6177 + }, + { + "epoch": 2.832932003210641, + "grad_norm": 0.34668564796447754, + "learning_rate": 8.103571811350851e-06, + "loss": 1.3786, + "step": 6178 + }, + { + "epoch": 2.8333906662080035, + "grad_norm": 0.22151388227939606, + "learning_rate": 8.059227688693771e-06, + "loss": 1.4785, + "step": 6179 + }, + { + "epoch": 2.8338493292053664, + "grad_norm": 0.38517090678215027, + "learning_rate": 8.015004242655222e-06, + "loss": 1.0836, + "step": 6180 + }, + { + "epoch": 2.8343079922027288, + "grad_norm": 0.40751755237579346, + "learning_rate": 7.970901484083471e-06, + "loss": 2.0908, + "step": 6181 + }, + { + "epoch": 2.8347666552000916, + "grad_norm": 0.32812926173210144, + "learning_rate": 7.926919423797362e-06, + "loss": 0.83, + "step": 6182 + }, + { + "epoch": 2.8352253181974545, + "grad_norm": 0.36086252331733704, + "learning_rate": 7.883058072585935e-06, + "loss": 1.5337, + "step": 6183 + }, + { + "epoch": 2.8356839811948173, + "grad_norm": 0.3034031391143799, + "learning_rate": 7.839317441208693e-06, + "loss": 1.334, + "step": 6184 + }, + { + "epoch": 2.8361426441921798, + "grad_norm": 0.16254828870296478, + "learning_rate": 7.795697540395552e-06, + "loss": 0.5699, + "step": 6185 + }, + { + "epoch": 2.8366013071895426, + "grad_norm": 0.30455073714256287, + "learning_rate": 7.752198380846787e-06, + "loss": 1.2536, + "step": 6186 + }, + { + "epoch": 2.837059970186905, + "grad_norm": 0.21652016043663025, + "learning_rate": 7.708819973233028e-06, + "loss": 1.4794, + "step": 6187 + }, + { + "epoch": 2.837518633184268, + "grad_norm": 0.2286727875471115, + "learning_rate": 7.665562328195375e-06, + "loss": 0.6623, + "step": 6188 + }, + { + "epoch": 2.8379772961816307, + "grad_norm": 0.31880277395248413, + "learning_rate": 7.622425456345172e-06, + "loss": 1.2148, + "step": 6189 + }, + { + "epoch": 2.838435959178993, + "grad_norm": 0.2918204367160797, + "learning_rate": 7.5794093682641785e-06, + "loss": 1.5525, + "step": 6190 + }, + { + "epoch": 2.838894622176356, + "grad_norm": 0.3420119881629944, + "learning_rate": 7.536514074504675e-06, + "loss": 1.3251, + "step": 6191 + }, + { + "epoch": 2.8393532851737184, + "grad_norm": 0.3030669391155243, + "learning_rate": 7.4937395855890765e-06, + "loss": 1.1624, + "step": 6192 + }, + { + "epoch": 2.8398119481710813, + "grad_norm": 0.22030462324619293, + "learning_rate": 7.45108591201038e-06, + "loss": 1.4805, + "step": 6193 + }, + { + "epoch": 2.840270611168444, + "grad_norm": 0.2560308873653412, + "learning_rate": 7.408553064231716e-06, + "loss": 0.6927, + "step": 6194 + }, + { + "epoch": 2.8407292741658066, + "grad_norm": 0.28317221999168396, + "learning_rate": 7.366141052686737e-06, + "loss": 1.3214, + "step": 6195 + }, + { + "epoch": 2.8411879371631694, + "grad_norm": 0.29174163937568665, + "learning_rate": 7.3238498877794544e-06, + "loss": 1.5961, + "step": 6196 + }, + { + "epoch": 2.841646600160532, + "grad_norm": 0.4265110492706299, + "learning_rate": 7.281679579884126e-06, + "loss": 1.8915, + "step": 6197 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.36179837584495544, + "learning_rate": 7.239630139345532e-06, + "loss": 0.9835, + "step": 6198 + }, + { + "epoch": 2.8425639261552575, + "grad_norm": 0.1963047832250595, + "learning_rate": 7.197701576478699e-06, + "loss": 1.0536, + "step": 6199 + }, + { + "epoch": 2.84302258915262, + "grad_norm": 0.26563286781311035, + "learning_rate": 7.1558939015689e-06, + "loss": 1.058, + "step": 6200 + }, + { + "epoch": 2.843481252149983, + "grad_norm": 0.295461505651474, + "learning_rate": 7.114207124871874e-06, + "loss": 1.3314, + "step": 6201 + }, + { + "epoch": 2.8439399151473452, + "grad_norm": 0.287113219499588, + "learning_rate": 7.072641256613777e-06, + "loss": 0.7959, + "step": 6202 + }, + { + "epoch": 2.844398578144708, + "grad_norm": 0.2837468385696411, + "learning_rate": 7.031196306991005e-06, + "loss": 1.5568, + "step": 6203 + }, + { + "epoch": 2.844857241142071, + "grad_norm": 0.2658827602863312, + "learning_rate": 6.989872286170262e-06, + "loss": 0.6964, + "step": 6204 + }, + { + "epoch": 2.845315904139434, + "grad_norm": 0.3154827654361725, + "learning_rate": 6.948669204288604e-06, + "loss": 1.7673, + "step": 6205 + }, + { + "epoch": 2.845774567136796, + "grad_norm": 0.3644176423549652, + "learning_rate": 6.907587071453447e-06, + "loss": 0.5235, + "step": 6206 + }, + { + "epoch": 2.846233230134159, + "grad_norm": 0.2641429603099823, + "learning_rate": 6.866625897742562e-06, + "loss": 1.2719, + "step": 6207 + }, + { + "epoch": 2.8466918931315215, + "grad_norm": 0.34545841813087463, + "learning_rate": 6.825785693204023e-06, + "loss": 1.8203, + "step": 6208 + }, + { + "epoch": 2.8471505561288843, + "grad_norm": 0.34669947624206543, + "learning_rate": 6.78506646785626e-06, + "loss": 1.2168, + "step": 6209 + }, + { + "epoch": 2.847609219126247, + "grad_norm": 0.2993127405643463, + "learning_rate": 6.744468231688006e-06, + "loss": 1.5106, + "step": 6210 + }, + { + "epoch": 2.8480678821236096, + "grad_norm": 0.36481523513793945, + "learning_rate": 6.7039909946581825e-06, + "loss": 0.9908, + "step": 6211 + }, + { + "epoch": 2.8485265451209725, + "grad_norm": 0.24906761944293976, + "learning_rate": 6.663634766696236e-06, + "loss": 1.5602, + "step": 6212 + }, + { + "epoch": 2.848985208118335, + "grad_norm": 0.3981691002845764, + "learning_rate": 6.623399557701803e-06, + "loss": 1.237, + "step": 6213 + }, + { + "epoch": 2.8494438711156977, + "grad_norm": 0.25347137451171875, + "learning_rate": 6.5832853775448784e-06, + "loss": 1.5596, + "step": 6214 + }, + { + "epoch": 2.8499025341130606, + "grad_norm": 0.33858397603034973, + "learning_rate": 6.543292236065812e-06, + "loss": 1.1393, + "step": 6215 + }, + { + "epoch": 2.850361197110423, + "grad_norm": 0.31756460666656494, + "learning_rate": 6.50342014307509e-06, + "loss": 2.2029, + "step": 6216 + }, + { + "epoch": 2.850819860107786, + "grad_norm": 0.4458785653114319, + "learning_rate": 6.463669108353776e-06, + "loss": 1.5674, + "step": 6217 + }, + { + "epoch": 2.8512785231051483, + "grad_norm": 0.3402140438556671, + "learning_rate": 6.42403914165296e-06, + "loss": 1.3496, + "step": 6218 + }, + { + "epoch": 2.851737186102511, + "grad_norm": 0.3040880858898163, + "learning_rate": 6.384530252694254e-06, + "loss": 1.3495, + "step": 6219 + }, + { + "epoch": 2.852195849099874, + "grad_norm": 0.4113820493221283, + "learning_rate": 6.345142451169405e-06, + "loss": 1.2961, + "step": 6220 + }, + { + "epoch": 2.8526545120972364, + "grad_norm": 0.3598925471305847, + "learning_rate": 6.305875746740574e-06, + "loss": 1.4615, + "step": 6221 + }, + { + "epoch": 2.8531131750945993, + "grad_norm": 0.3074336051940918, + "learning_rate": 6.266730149040112e-06, + "loss": 1.2253, + "step": 6222 + }, + { + "epoch": 2.8535718380919617, + "grad_norm": 0.3465123474597931, + "learning_rate": 6.22770566767078e-06, + "loss": 1.5122, + "step": 6223 + }, + { + "epoch": 2.8540305010893245, + "grad_norm": 0.3549046218395233, + "learning_rate": 6.188802312205477e-06, + "loss": 1.9436, + "step": 6224 + }, + { + "epoch": 2.8544891640866874, + "grad_norm": 0.30973848700523376, + "learning_rate": 6.1500200921875695e-06, + "loss": 0.8639, + "step": 6225 + }, + { + "epoch": 2.8549478270840503, + "grad_norm": 0.21493889391422272, + "learning_rate": 6.111359017130558e-06, + "loss": 1.3644, + "step": 6226 + }, + { + "epoch": 2.8554064900814127, + "grad_norm": 0.2825007140636444, + "learning_rate": 6.072819096518301e-06, + "loss": 1.3103, + "step": 6227 + }, + { + "epoch": 2.8558651530787755, + "grad_norm": 0.3511542081832886, + "learning_rate": 6.034400339804902e-06, + "loss": 1.46, + "step": 6228 + }, + { + "epoch": 2.856323816076138, + "grad_norm": 0.27357274293899536, + "learning_rate": 5.996102756414823e-06, + "loss": 0.7502, + "step": 6229 + }, + { + "epoch": 2.856782479073501, + "grad_norm": 0.22305287420749664, + "learning_rate": 5.95792635574266e-06, + "loss": 1.3862, + "step": 6230 + }, + { + "epoch": 2.8572411420708637, + "grad_norm": 0.3165452778339386, + "learning_rate": 5.919871147153422e-06, + "loss": 1.2199, + "step": 6231 + }, + { + "epoch": 2.857699805068226, + "grad_norm": 0.2813020646572113, + "learning_rate": 5.881937139982307e-06, + "loss": 0.8148, + "step": 6232 + }, + { + "epoch": 2.858158468065589, + "grad_norm": 0.32087957859039307, + "learning_rate": 5.844124343534707e-06, + "loss": 1.5054, + "step": 6233 + }, + { + "epoch": 2.8586171310629513, + "grad_norm": 0.22067861258983612, + "learning_rate": 5.806432767086534e-06, + "loss": 0.9947, + "step": 6234 + }, + { + "epoch": 2.859075794060314, + "grad_norm": 0.28397810459136963, + "learning_rate": 5.768862419883669e-06, + "loss": 1.1807, + "step": 6235 + }, + { + "epoch": 2.859534457057677, + "grad_norm": 0.28341740369796753, + "learning_rate": 5.731413311142464e-06, + "loss": 0.7978, + "step": 6236 + }, + { + "epoch": 2.8599931200550395, + "grad_norm": 0.30934879183769226, + "learning_rate": 5.694085450049402e-06, + "loss": 1.3267, + "step": 6237 + }, + { + "epoch": 2.8604517830524023, + "grad_norm": 0.32292166352272034, + "learning_rate": 5.656878845761326e-06, + "loss": 1.7622, + "step": 6238 + }, + { + "epoch": 2.8609104460497647, + "grad_norm": 0.41772255301475525, + "learning_rate": 5.619793507405324e-06, + "loss": 1.4691, + "step": 6239 + }, + { + "epoch": 2.8613691090471276, + "grad_norm": 0.3389241695404053, + "learning_rate": 5.582829444078563e-06, + "loss": 1.1824, + "step": 6240 + }, + { + "epoch": 2.8618277720444905, + "grad_norm": 0.15310680866241455, + "learning_rate": 5.5459866648487345e-06, + "loss": 0.7682, + "step": 6241 + }, + { + "epoch": 2.862286435041853, + "grad_norm": 0.33451229333877563, + "learning_rate": 5.509265178753497e-06, + "loss": 1.092, + "step": 6242 + }, + { + "epoch": 2.8627450980392157, + "grad_norm": 0.23663324117660522, + "learning_rate": 5.472664994801091e-06, + "loss": 1.3781, + "step": 6243 + }, + { + "epoch": 2.863203761036578, + "grad_norm": 0.3743632137775421, + "learning_rate": 5.436186121969611e-06, + "loss": 1.5989, + "step": 6244 + }, + { + "epoch": 2.863662424033941, + "grad_norm": 0.3161230683326721, + "learning_rate": 5.3998285692076765e-06, + "loss": 1.1666, + "step": 6245 + }, + { + "epoch": 2.864121087031304, + "grad_norm": 0.16574889421463013, + "learning_rate": 5.363592345434043e-06, + "loss": 1.4562, + "step": 6246 + }, + { + "epoch": 2.8645797500286663, + "grad_norm": 0.35472193360328674, + "learning_rate": 5.327477459537711e-06, + "loss": 0.7368, + "step": 6247 + }, + { + "epoch": 2.865038413026029, + "grad_norm": 0.2824570834636688, + "learning_rate": 5.291483920377926e-06, + "loss": 1.7105, + "step": 6248 + }, + { + "epoch": 2.8654970760233915, + "grad_norm": 0.2135975956916809, + "learning_rate": 5.255611736784183e-06, + "loss": 1.493, + "step": 6249 + }, + { + "epoch": 2.8659557390207544, + "grad_norm": 0.29121702909469604, + "learning_rate": 5.219860917556163e-06, + "loss": 1.2036, + "step": 6250 + }, + { + "epoch": 2.8664144020181173, + "grad_norm": 0.43849441409111023, + "learning_rate": 5.184231471463852e-06, + "loss": 1.9182, + "step": 6251 + }, + { + "epoch": 2.86687306501548, + "grad_norm": 0.3722185492515564, + "learning_rate": 5.1487234072473135e-06, + "loss": 1.6022, + "step": 6252 + }, + { + "epoch": 2.8673317280128425, + "grad_norm": 0.3918931782245636, + "learning_rate": 5.1133367336170245e-06, + "loss": 1.9431, + "step": 6253 + }, + { + "epoch": 2.8677903910102054, + "grad_norm": 0.4557853937149048, + "learning_rate": 5.078071459253541e-06, + "loss": 1.2453, + "step": 6254 + }, + { + "epoch": 2.868249054007568, + "grad_norm": 0.11661559343338013, + "learning_rate": 5.042927592807722e-06, + "loss": 0.6514, + "step": 6255 + }, + { + "epoch": 2.8687077170049307, + "grad_norm": 0.3350408673286438, + "learning_rate": 5.00790514290056e-06, + "loss": 1.8324, + "step": 6256 + }, + { + "epoch": 2.8691663800022935, + "grad_norm": 0.37260258197784424, + "learning_rate": 4.973004118123348e-06, + "loss": 2.1526, + "step": 6257 + }, + { + "epoch": 2.869625042999656, + "grad_norm": 0.6885807514190674, + "learning_rate": 4.938224527037516e-06, + "loss": 1.2802, + "step": 6258 + }, + { + "epoch": 2.870083705997019, + "grad_norm": 0.27935993671417236, + "learning_rate": 4.903566378174795e-06, + "loss": 1.3413, + "step": 6259 + }, + { + "epoch": 2.870542368994381, + "grad_norm": 0.3012205958366394, + "learning_rate": 4.869029680037162e-06, + "loss": 1.4973, + "step": 6260 + }, + { + "epoch": 2.871001031991744, + "grad_norm": 0.4477826654911041, + "learning_rate": 4.834614441096563e-06, + "loss": 1.712, + "step": 6261 + }, + { + "epoch": 2.871459694989107, + "grad_norm": 0.3502928912639618, + "learning_rate": 4.800320669795355e-06, + "loss": 1.2868, + "step": 6262 + }, + { + "epoch": 2.8719183579864693, + "grad_norm": 0.19043685495853424, + "learning_rate": 4.766148374546087e-06, + "loss": 1.1846, + "step": 6263 + }, + { + "epoch": 2.872377020983832, + "grad_norm": 0.27802857756614685, + "learning_rate": 4.7320975637314415e-06, + "loss": 1.6084, + "step": 6264 + }, + { + "epoch": 2.8728356839811946, + "grad_norm": 0.3813520669937134, + "learning_rate": 4.698168245704349e-06, + "loss": 1.7174, + "step": 6265 + }, + { + "epoch": 2.8732943469785575, + "grad_norm": 0.3918079137802124, + "learning_rate": 4.6643604287878726e-06, + "loss": 1.5721, + "step": 6266 + }, + { + "epoch": 2.8737530099759203, + "grad_norm": 0.2761102020740509, + "learning_rate": 4.630674121275325e-06, + "loss": 1.1836, + "step": 6267 + }, + { + "epoch": 2.8742116729732827, + "grad_norm": 0.33837902545928955, + "learning_rate": 4.59710933143026e-06, + "loss": 1.2286, + "step": 6268 + }, + { + "epoch": 2.8746703359706456, + "grad_norm": 0.24910743534564972, + "learning_rate": 4.56366606748626e-06, + "loss": 1.6427, + "step": 6269 + }, + { + "epoch": 2.875128998968008, + "grad_norm": 0.37364476919174194, + "learning_rate": 4.5303443376472635e-06, + "loss": 1.169, + "step": 6270 + }, + { + "epoch": 2.875587661965371, + "grad_norm": 0.38252532482147217, + "learning_rate": 4.49714415008734e-06, + "loss": 1.5502, + "step": 6271 + }, + { + "epoch": 2.8760463249627337, + "grad_norm": 0.35861456394195557, + "learning_rate": 4.464065512950754e-06, + "loss": 1.1111, + "step": 6272 + }, + { + "epoch": 2.8765049879600966, + "grad_norm": 0.3056200444698334, + "learning_rate": 4.4311084343518496e-06, + "loss": 1.8339, + "step": 6273 + }, + { + "epoch": 2.876963650957459, + "grad_norm": 0.2743782103061676, + "learning_rate": 4.398272922375268e-06, + "loss": 1.0009, + "step": 6274 + }, + { + "epoch": 2.877422313954822, + "grad_norm": 0.3063773214817047, + "learning_rate": 4.365558985075846e-06, + "loss": 1.2168, + "step": 6275 + }, + { + "epoch": 2.8778809769521843, + "grad_norm": 0.2390686422586441, + "learning_rate": 4.332966630478497e-06, + "loss": 0.8118, + "step": 6276 + }, + { + "epoch": 2.878339639949547, + "grad_norm": 0.3186044991016388, + "learning_rate": 4.300495866578435e-06, + "loss": 1.162, + "step": 6277 + }, + { + "epoch": 2.87879830294691, + "grad_norm": 0.32464686036109924, + "learning_rate": 4.268146701340847e-06, + "loss": 1.9541, + "step": 6278 + }, + { + "epoch": 2.8792569659442724, + "grad_norm": 0.3670261800289154, + "learning_rate": 4.235919142701272e-06, + "loss": 1.6052, + "step": 6279 + }, + { + "epoch": 2.8797156289416352, + "grad_norm": 0.2734290361404419, + "learning_rate": 4.203813198565387e-06, + "loss": 0.9119, + "step": 6280 + }, + { + "epoch": 2.8801742919389977, + "grad_norm": 0.20262974500656128, + "learning_rate": 4.171828876809003e-06, + "loss": 1.2017, + "step": 6281 + }, + { + "epoch": 2.8806329549363605, + "grad_norm": 0.3488999307155609, + "learning_rate": 4.1399661852781764e-06, + "loss": 1.2915, + "step": 6282 + }, + { + "epoch": 2.8810916179337234, + "grad_norm": 0.39077028632164, + "learning_rate": 4.108225131788934e-06, + "loss": 2.073, + "step": 6283 + }, + { + "epoch": 2.881550280931086, + "grad_norm": 0.3423274755477905, + "learning_rate": 4.076605724127602e-06, + "loss": 1.2798, + "step": 6284 + }, + { + "epoch": 2.8820089439284486, + "grad_norm": 0.3173692226409912, + "learning_rate": 4.045107970050699e-06, + "loss": 1.0493, + "step": 6285 + }, + { + "epoch": 2.882467606925811, + "grad_norm": 0.3519616425037384, + "learning_rate": 4.0137318772848205e-06, + "loss": 1.5614, + "step": 6286 + }, + { + "epoch": 2.882926269923174, + "grad_norm": 0.3195357024669647, + "learning_rate": 3.982477453526756e-06, + "loss": 1.1449, + "step": 6287 + }, + { + "epoch": 2.8833849329205368, + "grad_norm": 0.21202579140663147, + "learning_rate": 3.951344706443427e-06, + "loss": 1.2399, + "step": 6288 + }, + { + "epoch": 2.883843595917899, + "grad_norm": 0.28453898429870605, + "learning_rate": 3.920333643672003e-06, + "loss": 1.2978, + "step": 6289 + }, + { + "epoch": 2.884302258915262, + "grad_norm": 0.3411079943180084, + "learning_rate": 3.88944427281962e-06, + "loss": 1.5203, + "step": 6290 + }, + { + "epoch": 2.8847609219126245, + "grad_norm": 0.32296404242515564, + "learning_rate": 3.858676601463662e-06, + "loss": 1.1908, + "step": 6291 + }, + { + "epoch": 2.8852195849099873, + "grad_norm": 0.3648724853992462, + "learning_rate": 3.828030637151758e-06, + "loss": 1.6689, + "step": 6292 + }, + { + "epoch": 2.88567824790735, + "grad_norm": 0.26379936933517456, + "learning_rate": 3.797506387401506e-06, + "loss": 1.2486, + "step": 6293 + }, + { + "epoch": 2.886136910904713, + "grad_norm": 0.3017299175262451, + "learning_rate": 3.767103859700749e-06, + "loss": 0.9213, + "step": 6294 + }, + { + "epoch": 2.8865955739020754, + "grad_norm": 0.3231751322746277, + "learning_rate": 3.7368230615074105e-06, + "loss": 1.9851, + "step": 6295 + }, + { + "epoch": 2.8870542368994383, + "grad_norm": 0.401769757270813, + "learning_rate": 3.70666400024966e-06, + "loss": 1.3682, + "step": 6296 + }, + { + "epoch": 2.8875128998968007, + "grad_norm": 0.3910331428050995, + "learning_rate": 3.6766266833256346e-06, + "loss": 1.2714, + "step": 6297 + }, + { + "epoch": 2.8879715628941636, + "grad_norm": 0.3329823911190033, + "learning_rate": 3.646711118103774e-06, + "loss": 1.8126, + "step": 6298 + }, + { + "epoch": 2.8884302258915264, + "grad_norm": 0.4459758996963501, + "learning_rate": 3.616917311922596e-06, + "loss": 1.793, + "step": 6299 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.4213806986808777, + "learning_rate": 3.5872452720907e-06, + "loss": 1.9231, + "step": 6300 + }, + { + "epoch": 2.8893475518862517, + "grad_norm": 0.4440397620201111, + "learning_rate": 3.557695005886874e-06, + "loss": 1.5308, + "step": 6301 + }, + { + "epoch": 2.889806214883614, + "grad_norm": 0.263126939535141, + "learning_rate": 3.5282665205599306e-06, + "loss": 0.9987, + "step": 6302 + }, + { + "epoch": 2.890264877880977, + "grad_norm": 0.2880323529243469, + "learning_rate": 3.49895982332904e-06, + "loss": 1.6559, + "step": 6303 + }, + { + "epoch": 2.89072354087834, + "grad_norm": 0.9521461725234985, + "learning_rate": 3.4697749213832284e-06, + "loss": 1.5062, + "step": 6304 + }, + { + "epoch": 2.8911822038757022, + "grad_norm": 0.28019800782203674, + "learning_rate": 3.4407118218818256e-06, + "loss": 1.2073, + "step": 6305 + }, + { + "epoch": 2.891640866873065, + "grad_norm": 0.35234352946281433, + "learning_rate": 3.411770531954128e-06, + "loss": 1.5915, + "step": 6306 + }, + { + "epoch": 2.8920995298704275, + "grad_norm": 0.37154096364974976, + "learning_rate": 3.382951058699735e-06, + "loss": 0.9094, + "step": 6307 + }, + { + "epoch": 2.8925581928677904, + "grad_norm": 0.08573618531227112, + "learning_rate": 3.354253409188268e-06, + "loss": 1.1077, + "step": 6308 + }, + { + "epoch": 2.8930168558651532, + "grad_norm": 0.3723006844520569, + "learning_rate": 3.3256775904594307e-06, + "loss": 0.845, + "step": 6309 + }, + { + "epoch": 2.8934755188625156, + "grad_norm": 0.10502085089683533, + "learning_rate": 3.297223609523059e-06, + "loss": 1.2752, + "step": 6310 + }, + { + "epoch": 2.8939341818598785, + "grad_norm": 0.4659182131290436, + "learning_rate": 3.2688914733591814e-06, + "loss": 0.976, + "step": 6311 + }, + { + "epoch": 2.894392844857241, + "grad_norm": 0.3109420835971832, + "learning_rate": 3.2406811889177933e-06, + "loss": 0.9553, + "step": 6312 + }, + { + "epoch": 2.8948515078546038, + "grad_norm": 0.2608180344104767, + "learning_rate": 3.2125927631191933e-06, + "loss": 0.8474, + "step": 6313 + }, + { + "epoch": 2.8953101708519666, + "grad_norm": 0.21614636480808258, + "learning_rate": 3.184626202853591e-06, + "loss": 1.3879, + "step": 6314 + }, + { + "epoch": 2.8957688338493295, + "grad_norm": 0.3109268546104431, + "learning_rate": 3.1567815149813885e-06, + "loss": 1.1218, + "step": 6315 + }, + { + "epoch": 2.896227496846692, + "grad_norm": 0.2876843214035034, + "learning_rate": 3.129058706333121e-06, + "loss": 1.6882, + "step": 6316 + }, + { + "epoch": 2.8966861598440543, + "grad_norm": 0.38865727186203003, + "learning_rate": 3.1014577837093496e-06, + "loss": 0.799, + "step": 6317 + }, + { + "epoch": 2.897144822841417, + "grad_norm": 0.21412920951843262, + "learning_rate": 3.073978753880824e-06, + "loss": 1.5184, + "step": 6318 + }, + { + "epoch": 2.89760348583878, + "grad_norm": 0.4019043743610382, + "learning_rate": 3.046621623588375e-06, + "loss": 1.6321, + "step": 6319 + }, + { + "epoch": 2.898062148836143, + "grad_norm": 0.18866117298603058, + "learning_rate": 3.0193863995428005e-06, + "loss": 1.1956, + "step": 6320 + }, + { + "epoch": 2.8985208118335053, + "grad_norm": 0.3983370065689087, + "learning_rate": 2.992273088425146e-06, + "loss": 0.9323, + "step": 6321 + }, + { + "epoch": 2.898979474830868, + "grad_norm": 0.17893198132514954, + "learning_rate": 2.96528169688659e-06, + "loss": 1.5333, + "step": 6322 + }, + { + "epoch": 2.8994381378282306, + "grad_norm": 0.3908742368221283, + "learning_rate": 2.938412231548171e-06, + "loss": 1.3581, + "step": 6323 + }, + { + "epoch": 2.8998968008255934, + "grad_norm": 0.28429678082466125, + "learning_rate": 2.911664699001282e-06, + "loss": 1.7076, + "step": 6324 + }, + { + "epoch": 2.9003554638229563, + "grad_norm": 0.36891356110572815, + "learning_rate": 2.8850391058071747e-06, + "loss": 1.2607, + "step": 6325 + }, + { + "epoch": 2.9008141268203187, + "grad_norm": 0.2793447971343994, + "learning_rate": 2.8585354584974022e-06, + "loss": 1.29, + "step": 6326 + }, + { + "epoch": 2.9012727898176816, + "grad_norm": 0.29457083344459534, + "learning_rate": 2.832153763573486e-06, + "loss": 0.7886, + "step": 6327 + }, + { + "epoch": 2.901731452815044, + "grad_norm": 0.3590116798877716, + "learning_rate": 2.8058940275069722e-06, + "loss": 1.5443, + "step": 6328 + }, + { + "epoch": 2.902190115812407, + "grad_norm": 0.43476438522338867, + "learning_rate": 2.7797562567395963e-06, + "loss": 1.8802, + "step": 6329 + }, + { + "epoch": 2.9026487788097697, + "grad_norm": 0.3066580891609192, + "learning_rate": 2.7537404576831737e-06, + "loss": 0.789, + "step": 6330 + }, + { + "epoch": 2.903107441807132, + "grad_norm": 0.07547781616449356, + "learning_rate": 2.727846636719544e-06, + "loss": 0.803, + "step": 6331 + }, + { + "epoch": 2.903566104804495, + "grad_norm": 0.38495829701423645, + "learning_rate": 2.7020748002006824e-06, + "loss": 2.0801, + "step": 6332 + }, + { + "epoch": 2.9040247678018574, + "grad_norm": 0.5015888810157776, + "learning_rate": 2.676424954448531e-06, + "loss": 1.3515, + "step": 6333 + }, + { + "epoch": 2.9044834307992202, + "grad_norm": 0.4295569658279419, + "learning_rate": 2.650897105755279e-06, + "loss": 1.6154, + "step": 6334 + }, + { + "epoch": 2.904942093796583, + "grad_norm": 0.24834929406642914, + "learning_rate": 2.625491260382973e-06, + "loss": 0.7925, + "step": 6335 + }, + { + "epoch": 2.9054007567939455, + "grad_norm": 0.3073466420173645, + "learning_rate": 2.600207424563961e-06, + "loss": 1.0556, + "step": 6336 + }, + { + "epoch": 2.9058594197913084, + "grad_norm": 0.21239478886127472, + "learning_rate": 2.5750456045005035e-06, + "loss": 1.2195, + "step": 6337 + }, + { + "epoch": 2.9063180827886708, + "grad_norm": 0.32616496086120605, + "learning_rate": 2.5500058063649965e-06, + "loss": 0.7554, + "step": 6338 + }, + { + "epoch": 2.9067767457860336, + "grad_norm": 0.08231624960899353, + "learning_rate": 2.5250880362998607e-06, + "loss": 1.3146, + "step": 6339 + }, + { + "epoch": 2.9072354087833965, + "grad_norm": 0.4305974245071411, + "learning_rate": 2.500292300417595e-06, + "loss": 1.6447, + "step": 6340 + }, + { + "epoch": 2.9076940717807593, + "grad_norm": 0.27547237277030945, + "learning_rate": 2.4756186048007225e-06, + "loss": 1.3123, + "step": 6341 + }, + { + "epoch": 2.9081527347781218, + "grad_norm": 0.25122833251953125, + "learning_rate": 2.4510669555020125e-06, + "loss": 1.1544, + "step": 6342 + }, + { + "epoch": 2.9086113977754846, + "grad_norm": 0.27697306871414185, + "learning_rate": 2.4266373585440924e-06, + "loss": 0.4727, + "step": 6343 + }, + { + "epoch": 2.909070060772847, + "grad_norm": 0.3418446481227875, + "learning_rate": 2.402329819919724e-06, + "loss": 1.6129, + "step": 6344 + }, + { + "epoch": 2.90952872377021, + "grad_norm": 0.4216833710670471, + "learning_rate": 2.3781443455916927e-06, + "loss": 1.2566, + "step": 6345 + }, + { + "epoch": 2.9099873867675727, + "grad_norm": 0.3174903690814972, + "learning_rate": 2.3540809414929196e-06, + "loss": 1.3995, + "step": 6346 + }, + { + "epoch": 2.910446049764935, + "grad_norm": 0.2971450984477997, + "learning_rate": 2.3301396135262387e-06, + "loss": 1.087, + "step": 6347 + }, + { + "epoch": 2.910904712762298, + "grad_norm": 0.41538989543914795, + "learning_rate": 2.3063203675647848e-06, + "loss": 1.3513, + "step": 6348 + }, + { + "epoch": 2.9113633757596604, + "grad_norm": 0.36953091621398926, + "learning_rate": 2.28262320945144e-06, + "loss": 1.7742, + "step": 6349 + }, + { + "epoch": 2.9118220387570233, + "grad_norm": 0.31279754638671875, + "learning_rate": 2.259048144999387e-06, + "loss": 1.3542, + "step": 6350 + }, + { + "epoch": 2.912280701754386, + "grad_norm": 0.2468571960926056, + "learning_rate": 2.2355951799916674e-06, + "loss": 1.3647, + "step": 6351 + }, + { + "epoch": 2.9127393647517485, + "grad_norm": 0.577455461025238, + "learning_rate": 2.212264320181567e-06, + "loss": 1.2788, + "step": 6352 + }, + { + "epoch": 2.9131980277491114, + "grad_norm": 0.3301372230052948, + "learning_rate": 2.1890555712922313e-06, + "loss": 1.6881, + "step": 6353 + }, + { + "epoch": 2.913656690746474, + "grad_norm": 0.3213047385215759, + "learning_rate": 2.1659689390169934e-06, + "loss": 1.3643, + "step": 6354 + }, + { + "epoch": 2.9141153537438367, + "grad_norm": 0.3952171504497528, + "learning_rate": 2.1430044290191573e-06, + "loss": 1.7974, + "step": 6355 + }, + { + "epoch": 2.9145740167411995, + "grad_norm": 0.3204817771911621, + "learning_rate": 2.1201620469320503e-06, + "loss": 1.4843, + "step": 6356 + }, + { + "epoch": 2.915032679738562, + "grad_norm": 0.4052104949951172, + "learning_rate": 2.0974417983590787e-06, + "loss": 1.7517, + "step": 6357 + }, + { + "epoch": 2.915491342735925, + "grad_norm": 0.28286999464035034, + "learning_rate": 2.0748436888737286e-06, + "loss": 1.0675, + "step": 6358 + }, + { + "epoch": 2.915950005733287, + "grad_norm": 0.4025648534297943, + "learning_rate": 2.0523677240193994e-06, + "loss": 1.6526, + "step": 6359 + }, + { + "epoch": 2.91640866873065, + "grad_norm": 0.41483741998672485, + "learning_rate": 2.030013909309736e-06, + "loss": 2.1431, + "step": 6360 + }, + { + "epoch": 2.916867331728013, + "grad_norm": 0.36023521423339844, + "learning_rate": 2.0077822502281295e-06, + "loss": 1.3888, + "step": 6361 + }, + { + "epoch": 2.917325994725376, + "grad_norm": 0.3408292233943939, + "learning_rate": 1.9856727522282734e-06, + "loss": 1.2604, + "step": 6362 + }, + { + "epoch": 2.917784657722738, + "grad_norm": 0.2545235753059387, + "learning_rate": 1.963685420733774e-06, + "loss": 1.4949, + "step": 6363 + }, + { + "epoch": 2.918243320720101, + "grad_norm": 0.27326932549476624, + "learning_rate": 1.9418202611382607e-06, + "loss": 0.7846, + "step": 6364 + }, + { + "epoch": 2.9187019837174635, + "grad_norm": 0.33998316526412964, + "learning_rate": 1.920077278805443e-06, + "loss": 1.5005, + "step": 6365 + }, + { + "epoch": 2.9191606467148263, + "grad_norm": 0.3685235381126404, + "learning_rate": 1.8984564790689996e-06, + "loss": 1.4185, + "step": 6366 + }, + { + "epoch": 2.919619309712189, + "grad_norm": 0.18980297446250916, + "learning_rate": 1.8769578672326316e-06, + "loss": 1.2739, + "step": 6367 + }, + { + "epoch": 2.9200779727095516, + "grad_norm": 0.30294135212898254, + "learning_rate": 1.8555814485702316e-06, + "loss": 1.1948, + "step": 6368 + }, + { + "epoch": 2.9205366357069145, + "grad_norm": 0.3762916922569275, + "learning_rate": 1.8343272283254386e-06, + "loss": 1.7822, + "step": 6369 + }, + { + "epoch": 2.920995298704277, + "grad_norm": 0.3765801787376404, + "learning_rate": 1.813195211712193e-06, + "loss": 1.1016, + "step": 6370 + }, + { + "epoch": 2.9214539617016397, + "grad_norm": 0.22130340337753296, + "learning_rate": 1.792185403914237e-06, + "loss": 1.6736, + "step": 6371 + }, + { + "epoch": 2.9219126246990026, + "grad_norm": 0.33724433183670044, + "learning_rate": 1.7712978100854482e-06, + "loss": 1.2495, + "step": 6372 + }, + { + "epoch": 2.922371287696365, + "grad_norm": 0.30301743745803833, + "learning_rate": 1.7505324353497831e-06, + "loss": 1.7449, + "step": 6373 + }, + { + "epoch": 2.922829950693728, + "grad_norm": 0.2647053003311157, + "learning_rate": 1.7298892848010006e-06, + "loss": 1.3491, + "step": 6374 + }, + { + "epoch": 2.9232886136910903, + "grad_norm": 0.47797808051109314, + "learning_rate": 1.7093683635031609e-06, + "loss": 1.2687, + "step": 6375 + }, + { + "epoch": 2.923747276688453, + "grad_norm": 0.32065433263778687, + "learning_rate": 1.6889696764900708e-06, + "loss": 1.3747, + "step": 6376 + }, + { + "epoch": 2.924205939685816, + "grad_norm": 0.24311882257461548, + "learning_rate": 1.668693228765783e-06, + "loss": 0.6366, + "step": 6377 + }, + { + "epoch": 2.9246646026831784, + "grad_norm": 0.15887467563152313, + "learning_rate": 1.6485390253041521e-06, + "loss": 1.2029, + "step": 6378 + }, + { + "epoch": 2.9251232656805413, + "grad_norm": 0.33973756432533264, + "learning_rate": 1.6285070710492233e-06, + "loss": 1.6896, + "step": 6379 + }, + { + "epoch": 2.9255819286779037, + "grad_norm": 0.38102298974990845, + "learning_rate": 1.6085973709149548e-06, + "loss": 1.0677, + "step": 6380 + }, + { + "epoch": 2.9260405916752665, + "grad_norm": 0.3507750332355499, + "learning_rate": 1.5888099297853288e-06, + "loss": 1.2086, + "step": 6381 + }, + { + "epoch": 2.9264992546726294, + "grad_norm": 0.1782960146665573, + "learning_rate": 1.5691447525143509e-06, + "loss": 1.05, + "step": 6382 + }, + { + "epoch": 2.9269579176699922, + "grad_norm": 0.25808510184288025, + "learning_rate": 1.5496018439260518e-06, + "loss": 0.8826, + "step": 6383 + }, + { + "epoch": 2.9274165806673547, + "grad_norm": 0.2699086368083954, + "learning_rate": 1.5301812088144296e-06, + "loss": 0.7635, + "step": 6384 + }, + { + "epoch": 2.927875243664717, + "grad_norm": 0.12165196985006332, + "learning_rate": 1.510882851943507e-06, + "loss": 1.2339, + "step": 6385 + }, + { + "epoch": 2.92833390666208, + "grad_norm": 0.27769702672958374, + "learning_rate": 1.4917067780473304e-06, + "loss": 0.799, + "step": 6386 + }, + { + "epoch": 2.928792569659443, + "grad_norm": 0.29953867197036743, + "learning_rate": 1.4726529918299148e-06, + "loss": 1.9141, + "step": 6387 + }, + { + "epoch": 2.9292512326568056, + "grad_norm": 0.39282310009002686, + "learning_rate": 1.4537214979652435e-06, + "loss": 1.6138, + "step": 6388 + }, + { + "epoch": 2.929709895654168, + "grad_norm": 0.28682559728622437, + "learning_rate": 1.4349123010974908e-06, + "loss": 0.9948, + "step": 6389 + }, + { + "epoch": 2.930168558651531, + "grad_norm": 0.3742808699607849, + "learning_rate": 1.416225405840521e-06, + "loss": 1.5709, + "step": 6390 + }, + { + "epoch": 2.9306272216488933, + "grad_norm": 0.36167392134666443, + "learning_rate": 1.39766081677839e-06, + "loss": 1.3771, + "step": 6391 + }, + { + "epoch": 2.931085884646256, + "grad_norm": 0.33344385027885437, + "learning_rate": 1.3792185384652322e-06, + "loss": 1.4467, + "step": 6392 + }, + { + "epoch": 2.931544547643619, + "grad_norm": 0.3055780231952667, + "learning_rate": 1.3608985754249848e-06, + "loss": 1.4431, + "step": 6393 + }, + { + "epoch": 2.9320032106409815, + "grad_norm": 0.39428263902664185, + "learning_rate": 1.3427009321517191e-06, + "loss": 2.0354, + "step": 6394 + }, + { + "epoch": 2.9324618736383443, + "grad_norm": 0.4126293361186981, + "learning_rate": 1.3246256131093647e-06, + "loss": 1.7002, + "step": 6395 + }, + { + "epoch": 2.9329205366357067, + "grad_norm": 0.2903102934360504, + "learning_rate": 1.3066726227320414e-06, + "loss": 1.148, + "step": 6396 + }, + { + "epoch": 2.9333791996330696, + "grad_norm": 0.35992351174354553, + "learning_rate": 1.2888419654236706e-06, + "loss": 1.39, + "step": 6397 + }, + { + "epoch": 2.9338378626304324, + "grad_norm": 0.37300676107406616, + "learning_rate": 1.2711336455582533e-06, + "loss": 1.2245, + "step": 6398 + }, + { + "epoch": 2.934296525627795, + "grad_norm": 0.21308903396129608, + "learning_rate": 1.2535476674797596e-06, + "loss": 1.2061, + "step": 6399 + }, + { + "epoch": 2.9347551886251577, + "grad_norm": 0.2780902683734894, + "learning_rate": 1.2360840355022386e-06, + "loss": 1.5712, + "step": 6400 + }, + { + "epoch": 2.93521385162252, + "grad_norm": 0.36661916971206665, + "learning_rate": 1.2187427539094854e-06, + "loss": 1.1625, + "step": 6401 + }, + { + "epoch": 2.935672514619883, + "grad_norm": 0.28895169496536255, + "learning_rate": 1.2015238269555972e-06, + "loss": 0.7739, + "step": 6402 + }, + { + "epoch": 2.936131177617246, + "grad_norm": 0.2886188328266144, + "learning_rate": 1.184427258864418e-06, + "loss": 1.7198, + "step": 6403 + }, + { + "epoch": 2.9365898406146083, + "grad_norm": 0.36322250962257385, + "learning_rate": 1.16745305382987e-06, + "loss": 0.7532, + "step": 6404 + }, + { + "epoch": 2.937048503611971, + "grad_norm": 0.3152855634689331, + "learning_rate": 1.1506012160158452e-06, + "loss": 1.7031, + "step": 6405 + }, + { + "epoch": 2.9375071666093335, + "grad_norm": 0.39350810647010803, + "learning_rate": 1.133871749556259e-06, + "loss": 1.6578, + "step": 6406 + }, + { + "epoch": 2.9379658296066964, + "grad_norm": 0.23590238392353058, + "learning_rate": 1.1172646585549396e-06, + "loss": 1.6073, + "step": 6407 + }, + { + "epoch": 2.9384244926040592, + "grad_norm": 0.3189743161201477, + "learning_rate": 1.1007799470857393e-06, + "loss": 1.1992, + "step": 6408 + }, + { + "epoch": 2.938883155601422, + "grad_norm": 0.45767655968666077, + "learning_rate": 1.0844176191924237e-06, + "loss": 1.8853, + "step": 6409 + }, + { + "epoch": 2.9393418185987845, + "grad_norm": 0.3135172128677368, + "learning_rate": 1.068177678888893e-06, + "loss": 1.7242, + "step": 6410 + }, + { + "epoch": 2.9398004815961474, + "grad_norm": 0.3678114712238312, + "learning_rate": 1.0520601301588494e-06, + "loss": 1.4546, + "step": 6411 + }, + { + "epoch": 2.94025914459351, + "grad_norm": 0.33061683177948, + "learning_rate": 1.0360649769560193e-06, + "loss": 1.214, + "step": 6412 + }, + { + "epoch": 2.9407178075908726, + "grad_norm": 0.31986305117607117, + "learning_rate": 1.0201922232041528e-06, + "loss": 1.6663, + "step": 6413 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 0.4389913082122803, + "learning_rate": 1.0044418727970238e-06, + "loss": 1.4183, + "step": 6414 + }, + { + "epoch": 2.941635133585598, + "grad_norm": 0.22724516689777374, + "learning_rate": 9.88813929598209e-07, + "loss": 1.3397, + "step": 6415 + }, + { + "epoch": 2.9420937965829608, + "grad_norm": 0.41958609223365784, + "learning_rate": 9.733083974414193e-07, + "loss": 1.3082, + "step": 6416 + }, + { + "epoch": 2.942552459580323, + "grad_norm": 0.3317757546901703, + "learning_rate": 9.579252801302785e-07, + "loss": 1.5914, + "step": 6417 + }, + { + "epoch": 2.943011122577686, + "grad_norm": 0.2724321782588959, + "learning_rate": 9.426645814382683e-07, + "loss": 1.1695, + "step": 6418 + }, + { + "epoch": 2.943469785575049, + "grad_norm": 0.3204740285873413, + "learning_rate": 9.27526305109061e-07, + "loss": 1.5457, + "step": 6419 + }, + { + "epoch": 2.9439284485724113, + "grad_norm": 0.34586411714553833, + "learning_rate": 9.125104548561857e-07, + "loss": 1.2657, + "step": 6420 + }, + { + "epoch": 2.944387111569774, + "grad_norm": 0.3183816075325012, + "learning_rate": 8.976170343630297e-07, + "loss": 0.9479, + "step": 6421 + }, + { + "epoch": 2.9448457745671366, + "grad_norm": 0.23148377239704132, + "learning_rate": 8.828460472832256e-07, + "loss": 1.7264, + "step": 6422 + }, + { + "epoch": 2.9453044375644994, + "grad_norm": 0.37857529520988464, + "learning_rate": 8.68197497240042e-07, + "loss": 0.9766, + "step": 6423 + }, + { + "epoch": 2.9457631005618623, + "grad_norm": 0.10237754881381989, + "learning_rate": 8.536713878269376e-07, + "loss": 0.4709, + "step": 6424 + }, + { + "epoch": 2.9462217635592247, + "grad_norm": 0.21351243555545807, + "learning_rate": 8.392677226072843e-07, + "loss": 1.2244, + "step": 6425 + }, + { + "epoch": 2.9466804265565876, + "grad_norm": 0.2921064794063568, + "learning_rate": 8.249865051143668e-07, + "loss": 1.2155, + "step": 6426 + }, + { + "epoch": 2.94713908955395, + "grad_norm": 0.3035675585269928, + "learning_rate": 8.108277388515495e-07, + "loss": 1.0669, + "step": 6427 + }, + { + "epoch": 2.947597752551313, + "grad_norm": 0.2804781198501587, + "learning_rate": 7.967914272919985e-07, + "loss": 0.9862, + "step": 6428 + }, + { + "epoch": 2.9480564155486757, + "grad_norm": 0.34943005442619324, + "learning_rate": 7.828775738789595e-07, + "loss": 1.194, + "step": 6429 + }, + { + "epoch": 2.9485150785460386, + "grad_norm": 0.13486827909946442, + "learning_rate": 7.690861820255912e-07, + "loss": 0.9005, + "step": 6430 + }, + { + "epoch": 2.948973741543401, + "grad_norm": 0.3738475739955902, + "learning_rate": 7.554172551150206e-07, + "loss": 1.5659, + "step": 6431 + }, + { + "epoch": 2.949432404540764, + "grad_norm": 0.2654702961444855, + "learning_rate": 7.418707965003435e-07, + "loss": 1.1194, + "step": 6432 + }, + { + "epoch": 2.9498910675381262, + "grad_norm": 0.36275702714920044, + "learning_rate": 7.284468095045682e-07, + "loss": 1.5873, + "step": 6433 + }, + { + "epoch": 2.950349730535489, + "grad_norm": 0.386787474155426, + "learning_rate": 7.151452974207828e-07, + "loss": 1.2502, + "step": 6434 + }, + { + "epoch": 2.950808393532852, + "grad_norm": 0.1252823770046234, + "learning_rate": 7.019662635118218e-07, + "loss": 0.8411, + "step": 6435 + }, + { + "epoch": 2.9512670565302144, + "grad_norm": 0.24427084624767303, + "learning_rate": 6.8890971101071e-07, + "loss": 0.7842, + "step": 6436 + }, + { + "epoch": 2.9517257195275772, + "grad_norm": 0.26967909932136536, + "learning_rate": 6.759756431202746e-07, + "loss": 1.2764, + "step": 6437 + }, + { + "epoch": 2.9521843825249396, + "grad_norm": 0.3875824511051178, + "learning_rate": 6.631640630133106e-07, + "loss": 1.6353, + "step": 6438 + }, + { + "epoch": 2.9526430455223025, + "grad_norm": 0.3367568850517273, + "learning_rate": 6.504749738325822e-07, + "loss": 1.6357, + "step": 6439 + }, + { + "epoch": 2.9531017085196654, + "grad_norm": 0.36924004554748535, + "learning_rate": 6.379083786908768e-07, + "loss": 1.1762, + "step": 6440 + }, + { + "epoch": 2.9535603715170278, + "grad_norm": 0.09012544900178909, + "learning_rate": 6.254642806707845e-07, + "loss": 0.7179, + "step": 6441 + }, + { + "epoch": 2.9540190345143906, + "grad_norm": 0.40828028321266174, + "learning_rate": 6.131426828250297e-07, + "loss": 2.0659, + "step": 6442 + }, + { + "epoch": 2.954477697511753, + "grad_norm": 0.7955999374389648, + "learning_rate": 6.009435881760838e-07, + "loss": 1.0877, + "step": 6443 + }, + { + "epoch": 2.954936360509116, + "grad_norm": 0.1659400314092636, + "learning_rate": 5.888669997165529e-07, + "loss": 1.1334, + "step": 6444 + }, + { + "epoch": 2.9553950235064788, + "grad_norm": 0.3118785619735718, + "learning_rate": 5.769129204089007e-07, + "loss": 1.781, + "step": 6445 + }, + { + "epoch": 2.955853686503841, + "grad_norm": 0.4559882879257202, + "learning_rate": 5.650813531855592e-07, + "loss": 2.0151, + "step": 6446 + }, + { + "epoch": 2.956312349501204, + "grad_norm": 0.4162604808807373, + "learning_rate": 5.533723009488734e-07, + "loss": 1.5276, + "step": 6447 + }, + { + "epoch": 2.9567710124985664, + "grad_norm": 0.3055538237094879, + "learning_rate": 5.417857665711012e-07, + "loss": 0.8876, + "step": 6448 + }, + { + "epoch": 2.9572296754959293, + "grad_norm": 0.28854599595069885, + "learning_rate": 5.303217528945802e-07, + "loss": 1.6246, + "step": 6449 + }, + { + "epoch": 2.957688338493292, + "grad_norm": 0.3452959358692169, + "learning_rate": 5.18980262731561e-07, + "loss": 0.9966, + "step": 6450 + }, + { + "epoch": 2.958147001490655, + "grad_norm": 0.16069626808166504, + "learning_rate": 5.077612988640401e-07, + "loss": 0.626, + "step": 6451 + }, + { + "epoch": 2.9586056644880174, + "grad_norm": 0.2646504342556, + "learning_rate": 4.966648640442606e-07, + "loss": 1.5773, + "step": 6452 + }, + { + "epoch": 2.95906432748538, + "grad_norm": 0.40957656502723694, + "learning_rate": 4.856909609941562e-07, + "loss": 1.1276, + "step": 6453 + }, + { + "epoch": 2.9595229904827427, + "grad_norm": 0.27915382385253906, + "learning_rate": 4.7483959240574025e-07, + "loss": 1.4488, + "step": 6454 + }, + { + "epoch": 2.9599816534801056, + "grad_norm": 0.36528947949409485, + "learning_rate": 4.6411076094099447e-07, + "loss": 1.6633, + "step": 6455 + }, + { + "epoch": 2.9604403164774684, + "grad_norm": 0.3213283121585846, + "learning_rate": 4.53504469231647e-07, + "loss": 1.697, + "step": 6456 + }, + { + "epoch": 2.960898979474831, + "grad_norm": 0.3525945842266083, + "learning_rate": 4.430207198796166e-07, + "loss": 1.1332, + "step": 6457 + }, + { + "epoch": 2.9613576424721937, + "grad_norm": 0.3523452877998352, + "learning_rate": 4.3265951545656823e-07, + "loss": 1.9775, + "step": 6458 + }, + { + "epoch": 2.961816305469556, + "grad_norm": 0.3788760006427765, + "learning_rate": 4.224208585042466e-07, + "loss": 1.3544, + "step": 6459 + }, + { + "epoch": 2.962274968466919, + "grad_norm": 0.3480757772922516, + "learning_rate": 4.123047515341982e-07, + "loss": 1.3392, + "step": 6460 + }, + { + "epoch": 2.962733631464282, + "grad_norm": 0.33295026421546936, + "learning_rate": 4.0231119702799355e-07, + "loss": 1.4835, + "step": 6461 + }, + { + "epoch": 2.9631922944616442, + "grad_norm": 0.32605263590812683, + "learning_rate": 3.924401974371716e-07, + "loss": 1.1706, + "step": 6462 + }, + { + "epoch": 2.963650957459007, + "grad_norm": 0.3713498115539551, + "learning_rate": 3.826917551831288e-07, + "loss": 1.5077, + "step": 6463 + }, + { + "epoch": 2.9641096204563695, + "grad_norm": 0.41880208253860474, + "learning_rate": 3.7306587265717453e-07, + "loss": 1.3178, + "step": 6464 + }, + { + "epoch": 2.9645682834537324, + "grad_norm": 0.19970370829105377, + "learning_rate": 3.6356255222069756e-07, + "loss": 0.8712, + "step": 6465 + }, + { + "epoch": 2.965026946451095, + "grad_norm": 0.27967116236686707, + "learning_rate": 3.5418179620488876e-07, + "loss": 1.4177, + "step": 6466 + }, + { + "epoch": 2.9654856094484576, + "grad_norm": 0.43477344512939453, + "learning_rate": 3.449236069109074e-07, + "loss": 1.1676, + "step": 6467 + }, + { + "epoch": 2.9659442724458205, + "grad_norm": 0.274976909160614, + "learning_rate": 3.357879866098812e-07, + "loss": 1.6171, + "step": 6468 + }, + { + "epoch": 2.966402935443183, + "grad_norm": 0.28893306851387024, + "learning_rate": 3.267749375427953e-07, + "loss": 0.8573, + "step": 6469 + }, + { + "epoch": 2.9668615984405458, + "grad_norm": 0.12806662917137146, + "learning_rate": 3.178844619207144e-07, + "loss": 0.8589, + "step": 6470 + }, + { + "epoch": 2.9673202614379086, + "grad_norm": 0.24989382922649384, + "learning_rate": 3.091165619243941e-07, + "loss": 1.3807, + "step": 6471 + }, + { + "epoch": 2.967778924435271, + "grad_norm": 0.2839709520339966, + "learning_rate": 3.004712397047804e-07, + "loss": 1.2, + "step": 6472 + }, + { + "epoch": 2.968237587432634, + "grad_norm": 0.4096415638923645, + "learning_rate": 2.919484973826214e-07, + "loss": 1.9077, + "step": 6473 + }, + { + "epoch": 2.9686962504299963, + "grad_norm": 0.32572391629219055, + "learning_rate": 2.8354833704852257e-07, + "loss": 1.0639, + "step": 6474 + }, + { + "epoch": 2.969154913427359, + "grad_norm": 0.3826916515827179, + "learning_rate": 2.752707607631688e-07, + "loss": 1.9023, + "step": 6475 + }, + { + "epoch": 2.969613576424722, + "grad_norm": 0.48622024059295654, + "learning_rate": 2.6711577055710255e-07, + "loss": 1.61, + "step": 6476 + }, + { + "epoch": 2.970072239422085, + "grad_norm": 0.262005478143692, + "learning_rate": 2.590833684307792e-07, + "loss": 1.0063, + "step": 6477 + }, + { + "epoch": 2.9705309024194473, + "grad_norm": 0.21147648990154266, + "learning_rate": 2.51173556354678e-07, + "loss": 0.8564, + "step": 6478 + }, + { + "epoch": 2.97098956541681, + "grad_norm": 0.34185197949409485, + "learning_rate": 2.433863362690247e-07, + "loss": 1.8838, + "step": 6479 + }, + { + "epoch": 2.9714482284141726, + "grad_norm": 0.334324449300766, + "learning_rate": 2.357217100841802e-07, + "loss": 0.827, + "step": 6480 + }, + { + "epoch": 2.9719068914115354, + "grad_norm": 0.28424781560897827, + "learning_rate": 2.2817967968025155e-07, + "loss": 1.6332, + "step": 6481 + }, + { + "epoch": 2.9723655544088983, + "grad_norm": 0.31317272782325745, + "learning_rate": 2.2076024690742546e-07, + "loss": 1.1931, + "step": 6482 + }, + { + "epoch": 2.9728242174062607, + "grad_norm": 0.24583421647548676, + "learning_rate": 2.1346341358569054e-07, + "loss": 0.6048, + "step": 6483 + }, + { + "epoch": 2.9732828804036235, + "grad_norm": 0.38179898262023926, + "learning_rate": 2.0628918150500387e-07, + "loss": 1.2948, + "step": 6484 + }, + { + "epoch": 2.973741543400986, + "grad_norm": 0.18613789975643158, + "learning_rate": 1.9923755242529094e-07, + "loss": 1.0493, + "step": 6485 + }, + { + "epoch": 2.974200206398349, + "grad_norm": 0.25649017095565796, + "learning_rate": 1.9230852807639031e-07, + "loss": 1.1181, + "step": 6486 + }, + { + "epoch": 2.9746588693957117, + "grad_norm": 0.30037227272987366, + "learning_rate": 1.8550211015794239e-07, + "loss": 1.7121, + "step": 6487 + }, + { + "epoch": 2.975117532393074, + "grad_norm": 0.31024396419525146, + "learning_rate": 1.7881830033972258e-07, + "loss": 1.5497, + "step": 6488 + }, + { + "epoch": 2.975576195390437, + "grad_norm": 0.4017513692378998, + "learning_rate": 1.7225710026125275e-07, + "loss": 1.1522, + "step": 6489 + }, + { + "epoch": 2.9760348583877994, + "grad_norm": 0.34826523065567017, + "learning_rate": 1.658185115320787e-07, + "loss": 1.6506, + "step": 6490 + }, + { + "epoch": 2.976493521385162, + "grad_norm": 0.36130425333976746, + "learning_rate": 1.5950253573160377e-07, + "loss": 1.4875, + "step": 6491 + }, + { + "epoch": 2.976952184382525, + "grad_norm": 0.31285083293914795, + "learning_rate": 1.5330917440919967e-07, + "loss": 1.5449, + "step": 6492 + }, + { + "epoch": 2.9774108473798875, + "grad_norm": 0.3980162441730499, + "learning_rate": 1.472384290841511e-07, + "loss": 1.8258, + "step": 6493 + }, + { + "epoch": 2.9778695103772503, + "grad_norm": 0.3458695709705353, + "learning_rate": 1.4129030124560016e-07, + "loss": 0.7553, + "step": 6494 + }, + { + "epoch": 2.9783281733746128, + "grad_norm": 0.2076312005519867, + "learning_rate": 1.3546479235276854e-07, + "loss": 1.4727, + "step": 6495 + }, + { + "epoch": 2.9787868363719756, + "grad_norm": 0.3255481421947479, + "learning_rate": 1.2976190383456876e-07, + "loss": 0.87, + "step": 6496 + }, + { + "epoch": 2.9792454993693385, + "grad_norm": 0.2653532326221466, + "learning_rate": 1.2418163709004836e-07, + "loss": 1.4395, + "step": 6497 + }, + { + "epoch": 2.9797041623667013, + "grad_norm": 0.22563296556472778, + "learning_rate": 1.1872399348805685e-07, + "loss": 1.1741, + "step": 6498 + }, + { + "epoch": 2.9801628253640637, + "grad_norm": 0.3683829605579376, + "learning_rate": 1.1338897436741213e-07, + "loss": 0.9402, + "step": 6499 + }, + { + "epoch": 2.9806214883614266, + "grad_norm": 0.10145165026187897, + "learning_rate": 1.0817658103684513e-07, + "loss": 0.6497, + "step": 6500 + }, + { + "epoch": 2.981080151358789, + "grad_norm": 0.24345941841602325, + "learning_rate": 1.030868147749442e-07, + "loss": 1.7396, + "step": 6501 + }, + { + "epoch": 2.981538814356152, + "grad_norm": 0.37198102474212646, + "learning_rate": 9.811967683026613e-08, + "loss": 1.5526, + "step": 6502 + }, + { + "epoch": 2.9819974773535147, + "grad_norm": 0.3141441345214844, + "learning_rate": 9.32751684213362e-08, + "loss": 1.2018, + "step": 6503 + }, + { + "epoch": 2.982456140350877, + "grad_norm": 0.40361887216567993, + "learning_rate": 8.855329073653717e-08, + "loss": 1.7252, + "step": 6504 + }, + { + "epoch": 2.98291480334824, + "grad_norm": 0.4085117280483246, + "learning_rate": 8.395404493410919e-08, + "loss": 1.1441, + "step": 6505 + }, + { + "epoch": 2.9833734663456024, + "grad_norm": 0.20205476880073547, + "learning_rate": 7.947743214237191e-08, + "loss": 0.9854, + "step": 6506 + }, + { + "epoch": 2.9838321293429653, + "grad_norm": 0.31978803873062134, + "learning_rate": 7.512345345944693e-08, + "loss": 1.3225, + "step": 6507 + }, + { + "epoch": 2.984290792340328, + "grad_norm": 0.30714917182922363, + "learning_rate": 7.089210995336881e-08, + "loss": 1.1396, + "step": 6508 + }, + { + "epoch": 2.9847494553376905, + "grad_norm": 0.07466486096382141, + "learning_rate": 6.678340266214056e-08, + "loss": 1.0693, + "step": 6509 + }, + { + "epoch": 2.9852081183350534, + "grad_norm": 0.3598880469799042, + "learning_rate": 6.279733259362264e-08, + "loss": 0.9477, + "step": 6510 + }, + { + "epoch": 2.985666781332416, + "grad_norm": 0.12736907601356506, + "learning_rate": 5.8933900725699486e-08, + "loss": 0.9232, + "step": 6511 + }, + { + "epoch": 2.9861254443297787, + "grad_norm": 0.20195496082305908, + "learning_rate": 5.519310800600197e-08, + "loss": 0.8135, + "step": 6512 + }, + { + "epoch": 2.9865841073271415, + "grad_norm": 0.2941180169582367, + "learning_rate": 5.157495535229595e-08, + "loss": 1.6885, + "step": 6513 + }, + { + "epoch": 2.987042770324504, + "grad_norm": 0.5080846548080444, + "learning_rate": 4.807944365198269e-08, + "loss": 1.3666, + "step": 6514 + }, + { + "epoch": 2.987501433321867, + "grad_norm": 0.2823677659034729, + "learning_rate": 4.470657376265397e-08, + "loss": 0.9075, + "step": 6515 + }, + { + "epoch": 2.987960096319229, + "grad_norm": 0.18029870092868805, + "learning_rate": 4.145634651170349e-08, + "loss": 0.8622, + "step": 6516 + }, + { + "epoch": 2.988418759316592, + "grad_norm": 0.267039954662323, + "learning_rate": 3.83287626963269e-08, + "loss": 1.145, + "step": 6517 + }, + { + "epoch": 2.988877422313955, + "grad_norm": 0.28512319922447205, + "learning_rate": 3.5323823083854845e-08, + "loss": 1.2058, + "step": 6518 + }, + { + "epoch": 2.989336085311318, + "grad_norm": 0.27092644572257996, + "learning_rate": 3.24415284114199e-08, + "loss": 1.3474, + "step": 6519 + }, + { + "epoch": 2.98979474830868, + "grad_norm": 0.34065067768096924, + "learning_rate": 2.9681879385956566e-08, + "loss": 2.1089, + "step": 6520 + }, + { + "epoch": 2.9902534113060426, + "grad_norm": 0.4673296809196472, + "learning_rate": 2.704487668453437e-08, + "loss": 2.0068, + "step": 6521 + }, + { + "epoch": 2.9907120743034055, + "grad_norm": 0.2979157865047455, + "learning_rate": 2.4530520954024748e-08, + "loss": 0.7067, + "step": 6522 + }, + { + "epoch": 2.9911707373007683, + "grad_norm": 0.2104611098766327, + "learning_rate": 2.2138812811156596e-08, + "loss": 1.7247, + "step": 6523 + }, + { + "epoch": 2.991629400298131, + "grad_norm": 0.4868721067905426, + "learning_rate": 1.9869752842682777e-08, + "loss": 1.6491, + "step": 6524 + }, + { + "epoch": 2.9920880632954936, + "grad_norm": 0.33822518587112427, + "learning_rate": 1.7723341605158095e-08, + "loss": 2.0029, + "step": 6525 + }, + { + "epoch": 2.9925467262928565, + "grad_norm": 0.3121846616268158, + "learning_rate": 1.5699579625216842e-08, + "loss": 1.4298, + "step": 6526 + }, + { + "epoch": 2.993005389290219, + "grad_norm": 0.3756405711174011, + "learning_rate": 1.3798467399184222e-08, + "loss": 1.7493, + "step": 6527 + }, + { + "epoch": 2.9934640522875817, + "grad_norm": 0.37064328789711, + "learning_rate": 1.2020005393520439e-08, + "loss": 1.5358, + "step": 6528 + }, + { + "epoch": 2.9939227152849446, + "grad_norm": 0.2886911928653717, + "learning_rate": 1.0364194044432117e-08, + "loss": 1.3199, + "step": 6529 + }, + { + "epoch": 2.994381378282307, + "grad_norm": 0.3759532868862152, + "learning_rate": 8.831033758149865e-09, + "loss": 1.3483, + "step": 6530 + }, + { + "epoch": 2.99484004127967, + "grad_norm": 0.21223415434360504, + "learning_rate": 7.420524910706217e-09, + "loss": 0.9441, + "step": 6531 + }, + { + "epoch": 2.9952987042770323, + "grad_norm": 0.2493351846933365, + "learning_rate": 6.13266784810218e-09, + "loss": 0.9465, + "step": 6532 + }, + { + "epoch": 2.995757367274395, + "grad_norm": 0.27161523699760437, + "learning_rate": 4.967462886362739e-09, + "loss": 1.5159, + "step": 6533 + }, + { + "epoch": 2.996216030271758, + "grad_norm": 0.40845414996147156, + "learning_rate": 3.924910311203789e-09, + "loss": 1.468, + "step": 6534 + }, + { + "epoch": 2.9966746932691204, + "grad_norm": 0.281252920627594, + "learning_rate": 3.005010378476225e-09, + "loss": 0.8934, + "step": 6535 + }, + { + "epoch": 2.9971333562664833, + "grad_norm": 0.27226901054382324, + "learning_rate": 2.207763313777367e-09, + "loss": 0.8065, + "step": 6536 + }, + { + "epoch": 2.9975920192638457, + "grad_norm": 0.0754655972123146, + "learning_rate": 1.5331693126174884e-09, + "loss": 0.4985, + "step": 6537 + }, + { + "epoch": 2.9980506822612085, + "grad_norm": 0.20777086913585663, + "learning_rate": 9.812285406418653e-10, + "loss": 1.425, + "step": 6538 + }, + { + "epoch": 2.9985093452585714, + "grad_norm": 0.3428349196910858, + "learning_rate": 5.519411330756619e-10, + "loss": 1.5652, + "step": 6539 + }, + { + "epoch": 2.998968008255934, + "grad_norm": 0.3928774297237396, + "learning_rate": 2.4530719533455425e-10, + "loss": 1.4103, + "step": 6540 + }, + { + "epoch": 2.998968008255934, + "step": 6540, + "total_flos": 4.3151621005372293e+18, + "train_loss": 1.4276331583658854, + "train_runtime": 125872.0926, + "train_samples_per_second": 13.302, + "train_steps_per_second": 0.052 + } + ], + "logging_steps": 1.0, + "max_steps": 6540, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.3151621005372293e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}