{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 677, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014771048744460858, "grad_norm": 2.1287364959716797, "learning_rate": 0.0002, "loss": 1.4334, "step": 1 }, { "epoch": 0.0029542097488921715, "grad_norm": 1.5033221244812012, "learning_rate": 0.0002, "loss": 1.1642, "step": 2 }, { "epoch": 0.004431314623338257, "grad_norm": 1.5286452770233154, "learning_rate": 0.0002, "loss": 0.9292, "step": 3 }, { "epoch": 0.005908419497784343, "grad_norm": 1.2362889051437378, "learning_rate": 0.0002, "loss": 0.7608, "step": 4 }, { "epoch": 0.007385524372230428, "grad_norm": 1.0593241453170776, "learning_rate": 0.0002, "loss": 0.6653, "step": 5 }, { "epoch": 0.008862629246676515, "grad_norm": 1.0034171342849731, "learning_rate": 0.0002, "loss": 0.5738, "step": 6 }, { "epoch": 0.0103397341211226, "grad_norm": 0.723822832107544, "learning_rate": 0.0002, "loss": 0.5328, "step": 7 }, { "epoch": 0.011816838995568686, "grad_norm": 0.7309075593948364, "learning_rate": 0.0002, "loss": 0.5088, "step": 8 }, { "epoch": 0.013293943870014771, "grad_norm": 0.6442256569862366, "learning_rate": 0.0002, "loss": 0.4999, "step": 9 }, { "epoch": 0.014771048744460856, "grad_norm": 0.6145352721214294, "learning_rate": 0.0002, "loss": 0.5046, "step": 10 }, { "epoch": 0.01624815361890694, "grad_norm": 0.5789129734039307, "learning_rate": 0.0002, "loss": 0.489, "step": 11 }, { "epoch": 0.01772525849335303, "grad_norm": 0.5824376940727234, "learning_rate": 0.0002, "loss": 0.5328, "step": 12 }, { "epoch": 0.019202363367799114, "grad_norm": 0.5699394941329956, "learning_rate": 0.0002, "loss": 0.4755, "step": 13 }, { "epoch": 0.0206794682422452, "grad_norm": 0.5292893052101135, "learning_rate": 0.0002, "loss": 0.4108, "step": 14 }, { "epoch": 0.022156573116691284, "grad_norm": 0.5537489056587219, "learning_rate": 0.0002, "loss": 0.4807, "step": 15 }, { "epoch": 0.023633677991137372, "grad_norm": 0.546784520149231, "learning_rate": 0.0002, "loss": 0.4427, "step": 16 }, { "epoch": 0.025110782865583457, "grad_norm": 0.5094020962715149, "learning_rate": 0.0002, "loss": 0.4617, "step": 17 }, { "epoch": 0.026587887740029542, "grad_norm": 0.549403190612793, "learning_rate": 0.0002, "loss": 0.452, "step": 18 }, { "epoch": 0.028064992614475627, "grad_norm": 0.47281214594841003, "learning_rate": 0.0002, "loss": 0.3916, "step": 19 }, { "epoch": 0.029542097488921712, "grad_norm": 0.4933842122554779, "learning_rate": 0.0002, "loss": 0.4344, "step": 20 }, { "epoch": 0.0310192023633678, "grad_norm": 0.5650342106819153, "learning_rate": 0.0002, "loss": 0.5192, "step": 21 }, { "epoch": 0.03249630723781388, "grad_norm": 0.5102580189704895, "learning_rate": 0.0002, "loss": 0.4521, "step": 22 }, { "epoch": 0.033973412112259974, "grad_norm": 0.47124335169792175, "learning_rate": 0.0002, "loss": 0.3719, "step": 23 }, { "epoch": 0.03545051698670606, "grad_norm": 0.4769236445426941, "learning_rate": 0.0002, "loss": 0.4359, "step": 24 }, { "epoch": 0.03692762186115214, "grad_norm": 0.49603205919265747, "learning_rate": 0.0002, "loss": 0.438, "step": 25 }, { "epoch": 0.03840472673559823, "grad_norm": 0.42155203223228455, "learning_rate": 0.0002, "loss": 0.3311, "step": 26 }, { "epoch": 0.03988183161004431, "grad_norm": 0.4394625723361969, "learning_rate": 0.0002, "loss": 0.4033, "step": 27 }, { "epoch": 0.0413589364844904, "grad_norm": 0.4578387141227722, "learning_rate": 0.0002, "loss": 0.399, "step": 28 }, { "epoch": 0.04283604135893648, "grad_norm": 0.4147898256778717, "learning_rate": 0.0002, "loss": 0.3599, "step": 29 }, { "epoch": 0.04431314623338257, "grad_norm": 0.47084635496139526, "learning_rate": 0.0002, "loss": 0.4668, "step": 30 }, { "epoch": 0.04579025110782865, "grad_norm": 0.399994820356369, "learning_rate": 0.0002, "loss": 0.3108, "step": 31 }, { "epoch": 0.047267355982274745, "grad_norm": 0.4256761074066162, "learning_rate": 0.0002, "loss": 0.3928, "step": 32 }, { "epoch": 0.04874446085672083, "grad_norm": 0.4237106442451477, "learning_rate": 0.0002, "loss": 0.4036, "step": 33 }, { "epoch": 0.050221565731166914, "grad_norm": 0.4622955024242401, "learning_rate": 0.0002, "loss": 0.4394, "step": 34 }, { "epoch": 0.051698670605613, "grad_norm": 0.8845525979995728, "learning_rate": 0.0002, "loss": 0.3714, "step": 35 }, { "epoch": 0.053175775480059084, "grad_norm": 0.3846614360809326, "learning_rate": 0.0002, "loss": 0.3625, "step": 36 }, { "epoch": 0.05465288035450517, "grad_norm": 0.41804981231689453, "learning_rate": 0.0002, "loss": 0.4027, "step": 37 }, { "epoch": 0.056129985228951254, "grad_norm": 0.3947773575782776, "learning_rate": 0.0002, "loss": 0.3523, "step": 38 }, { "epoch": 0.05760709010339734, "grad_norm": 0.3716173470020294, "learning_rate": 0.0002, "loss": 0.3333, "step": 39 }, { "epoch": 0.059084194977843424, "grad_norm": 0.4511498808860779, "learning_rate": 0.0002, "loss": 0.4104, "step": 40 }, { "epoch": 0.060561299852289516, "grad_norm": 0.4428117573261261, "learning_rate": 0.0002, "loss": 0.4217, "step": 41 }, { "epoch": 0.0620384047267356, "grad_norm": 0.4312277138233185, "learning_rate": 0.0002, "loss": 0.4458, "step": 42 }, { "epoch": 0.06351550960118169, "grad_norm": 0.4207220673561096, "learning_rate": 0.0002, "loss": 0.4206, "step": 43 }, { "epoch": 0.06499261447562776, "grad_norm": 0.4463505744934082, "learning_rate": 0.0002, "loss": 0.3911, "step": 44 }, { "epoch": 0.06646971935007386, "grad_norm": 0.4605293869972229, "learning_rate": 0.0002, "loss": 0.4154, "step": 45 }, { "epoch": 0.06794682422451995, "grad_norm": 0.380751371383667, "learning_rate": 0.0002, "loss": 0.3556, "step": 46 }, { "epoch": 0.06942392909896603, "grad_norm": 0.3776094615459442, "learning_rate": 0.0002, "loss": 0.3275, "step": 47 }, { "epoch": 0.07090103397341212, "grad_norm": 0.39152535796165466, "learning_rate": 0.0002, "loss": 0.3749, "step": 48 }, { "epoch": 0.0723781388478582, "grad_norm": 0.4888671338558197, "learning_rate": 0.0002, "loss": 0.4408, "step": 49 }, { "epoch": 0.07385524372230429, "grad_norm": 0.38958850502967834, "learning_rate": 0.0002, "loss": 0.3551, "step": 50 }, { "epoch": 0.07533234859675036, "grad_norm": 0.39890560507774353, "learning_rate": 0.0002, "loss": 0.387, "step": 51 }, { "epoch": 0.07680945347119646, "grad_norm": 0.4128841757774353, "learning_rate": 0.0002, "loss": 0.3945, "step": 52 }, { "epoch": 0.07828655834564253, "grad_norm": 0.45516759157180786, "learning_rate": 0.0002, "loss": 0.4049, "step": 53 }, { "epoch": 0.07976366322008863, "grad_norm": 0.4038144648075104, "learning_rate": 0.0002, "loss": 0.3789, "step": 54 }, { "epoch": 0.08124076809453472, "grad_norm": 0.37849175930023193, "learning_rate": 0.0002, "loss": 0.3955, "step": 55 }, { "epoch": 0.0827178729689808, "grad_norm": 0.4295189082622528, "learning_rate": 0.0002, "loss": 0.4112, "step": 56 }, { "epoch": 0.08419497784342689, "grad_norm": 0.4347020387649536, "learning_rate": 0.0002, "loss": 0.4542, "step": 57 }, { "epoch": 0.08567208271787297, "grad_norm": 0.41407692432403564, "learning_rate": 0.0002, "loss": 0.4035, "step": 58 }, { "epoch": 0.08714918759231906, "grad_norm": 0.33283814787864685, "learning_rate": 0.0002, "loss": 0.2851, "step": 59 }, { "epoch": 0.08862629246676514, "grad_norm": 0.39427581429481506, "learning_rate": 0.0002, "loss": 0.4494, "step": 60 }, { "epoch": 0.09010339734121123, "grad_norm": 1.3357727527618408, "learning_rate": 0.0002, "loss": 0.3335, "step": 61 }, { "epoch": 0.0915805022156573, "grad_norm": 0.37050360441207886, "learning_rate": 0.0002, "loss": 0.3224, "step": 62 }, { "epoch": 0.0930576070901034, "grad_norm": 0.36000698804855347, "learning_rate": 0.0002, "loss": 0.3679, "step": 63 }, { "epoch": 0.09453471196454949, "grad_norm": 0.3739371597766876, "learning_rate": 0.0002, "loss": 0.4041, "step": 64 }, { "epoch": 0.09601181683899557, "grad_norm": 0.3365491032600403, "learning_rate": 0.0002, "loss": 0.3462, "step": 65 }, { "epoch": 0.09748892171344166, "grad_norm": 0.3357471823692322, "learning_rate": 0.0002, "loss": 0.3416, "step": 66 }, { "epoch": 0.09896602658788774, "grad_norm": 0.38020288944244385, "learning_rate": 0.0002, "loss": 0.3521, "step": 67 }, { "epoch": 0.10044313146233383, "grad_norm": 0.37143656611442566, "learning_rate": 0.0002, "loss": 0.3873, "step": 68 }, { "epoch": 0.1019202363367799, "grad_norm": 0.3613298535346985, "learning_rate": 0.0002, "loss": 0.3695, "step": 69 }, { "epoch": 0.103397341211226, "grad_norm": 0.3881225287914276, "learning_rate": 0.0002, "loss": 0.3686, "step": 70 }, { "epoch": 0.10487444608567208, "grad_norm": 0.35213181376457214, "learning_rate": 0.0002, "loss": 0.3276, "step": 71 }, { "epoch": 0.10635155096011817, "grad_norm": 0.3477317988872528, "learning_rate": 0.0002, "loss": 0.3261, "step": 72 }, { "epoch": 0.10782865583456426, "grad_norm": 0.326730340719223, "learning_rate": 0.0002, "loss": 0.2784, "step": 73 }, { "epoch": 0.10930576070901034, "grad_norm": 0.3316071629524231, "learning_rate": 0.0002, "loss": 0.3317, "step": 74 }, { "epoch": 0.11078286558345643, "grad_norm": 0.37388283014297485, "learning_rate": 0.0002, "loss": 0.3845, "step": 75 }, { "epoch": 0.11225997045790251, "grad_norm": 0.39761313796043396, "learning_rate": 0.0002, "loss": 0.4043, "step": 76 }, { "epoch": 0.1137370753323486, "grad_norm": 0.35033172369003296, "learning_rate": 0.0002, "loss": 0.3212, "step": 77 }, { "epoch": 0.11521418020679468, "grad_norm": 0.7551948428153992, "learning_rate": 0.0002, "loss": 0.3387, "step": 78 }, { "epoch": 0.11669128508124077, "grad_norm": 0.2940291166305542, "learning_rate": 0.0002, "loss": 0.2742, "step": 79 }, { "epoch": 0.11816838995568685, "grad_norm": 0.4048764407634735, "learning_rate": 0.0002, "loss": 0.4176, "step": 80 }, { "epoch": 0.11964549483013294, "grad_norm": 0.36520177125930786, "learning_rate": 0.0002, "loss": 0.317, "step": 81 }, { "epoch": 0.12112259970457903, "grad_norm": 0.3602144718170166, "learning_rate": 0.0002, "loss": 0.3648, "step": 82 }, { "epoch": 0.12259970457902511, "grad_norm": 0.34669214487075806, "learning_rate": 0.0002, "loss": 0.3389, "step": 83 }, { "epoch": 0.1240768094534712, "grad_norm": 0.34198257327079773, "learning_rate": 0.0002, "loss": 0.3174, "step": 84 }, { "epoch": 0.1255539143279173, "grad_norm": 0.3409755825996399, "learning_rate": 0.0002, "loss": 0.3376, "step": 85 }, { "epoch": 0.12703101920236337, "grad_norm": 0.38363194465637207, "learning_rate": 0.0002, "loss": 0.4002, "step": 86 }, { "epoch": 0.12850812407680945, "grad_norm": 0.35614731907844543, "learning_rate": 0.0002, "loss": 0.3581, "step": 87 }, { "epoch": 0.12998522895125553, "grad_norm": 0.3808327615261078, "learning_rate": 0.0002, "loss": 0.3966, "step": 88 }, { "epoch": 0.13146233382570163, "grad_norm": 0.3924517035484314, "learning_rate": 0.0002, "loss": 0.4161, "step": 89 }, { "epoch": 0.1329394387001477, "grad_norm": 0.3589531183242798, "learning_rate": 0.0002, "loss": 0.3233, "step": 90 }, { "epoch": 0.1344165435745938, "grad_norm": 0.37429341673851013, "learning_rate": 0.0002, "loss": 0.3778, "step": 91 }, { "epoch": 0.1358936484490399, "grad_norm": 0.3594294488430023, "learning_rate": 0.0002, "loss": 0.3472, "step": 92 }, { "epoch": 0.13737075332348597, "grad_norm": 0.3481505215167999, "learning_rate": 0.0002, "loss": 0.2961, "step": 93 }, { "epoch": 0.13884785819793205, "grad_norm": 0.3697575330734253, "learning_rate": 0.0002, "loss": 0.3954, "step": 94 }, { "epoch": 0.14032496307237813, "grad_norm": 0.3154103457927704, "learning_rate": 0.0002, "loss": 0.3148, "step": 95 }, { "epoch": 0.14180206794682423, "grad_norm": 0.32966312766075134, "learning_rate": 0.0002, "loss": 0.3211, "step": 96 }, { "epoch": 0.1432791728212703, "grad_norm": 0.3409123718738556, "learning_rate": 0.0002, "loss": 0.3318, "step": 97 }, { "epoch": 0.1447562776957164, "grad_norm": 0.346122682094574, "learning_rate": 0.0002, "loss": 0.3296, "step": 98 }, { "epoch": 0.14623338257016247, "grad_norm": 0.35875195264816284, "learning_rate": 0.0002, "loss": 0.3884, "step": 99 }, { "epoch": 0.14771048744460857, "grad_norm": 0.3223486542701721, "learning_rate": 0.0002, "loss": 0.3282, "step": 100 }, { "epoch": 0.14918759231905465, "grad_norm": 0.34657180309295654, "learning_rate": 0.0002, "loss": 0.364, "step": 101 }, { "epoch": 0.15066469719350073, "grad_norm": 0.34456005692481995, "learning_rate": 0.0002, "loss": 0.3541, "step": 102 }, { "epoch": 0.15214180206794684, "grad_norm": 0.3482792377471924, "learning_rate": 0.0002, "loss": 0.3435, "step": 103 }, { "epoch": 0.1536189069423929, "grad_norm": 0.37781214714050293, "learning_rate": 0.0002, "loss": 0.3716, "step": 104 }, { "epoch": 0.155096011816839, "grad_norm": 0.46567779779434204, "learning_rate": 0.0002, "loss": 0.3143, "step": 105 }, { "epoch": 0.15657311669128507, "grad_norm": 0.32534581422805786, "learning_rate": 0.0002, "loss": 0.3561, "step": 106 }, { "epoch": 0.15805022156573117, "grad_norm": 0.3262612521648407, "learning_rate": 0.0002, "loss": 0.3396, "step": 107 }, { "epoch": 0.15952732644017725, "grad_norm": 0.3691346049308777, "learning_rate": 0.0002, "loss": 0.4014, "step": 108 }, { "epoch": 0.16100443131462333, "grad_norm": 0.36267197132110596, "learning_rate": 0.0002, "loss": 0.3768, "step": 109 }, { "epoch": 0.16248153618906944, "grad_norm": 0.3206377923488617, "learning_rate": 0.0002, "loss": 0.326, "step": 110 }, { "epoch": 0.16395864106351551, "grad_norm": 0.32631710171699524, "learning_rate": 0.0002, "loss": 0.3438, "step": 111 }, { "epoch": 0.1654357459379616, "grad_norm": 0.33969393372535706, "learning_rate": 0.0002, "loss": 0.3602, "step": 112 }, { "epoch": 0.16691285081240767, "grad_norm": 0.361987829208374, "learning_rate": 0.0002, "loss": 0.3378, "step": 113 }, { "epoch": 0.16838995568685378, "grad_norm": 0.33116045594215393, "learning_rate": 0.0002, "loss": 0.3444, "step": 114 }, { "epoch": 0.16986706056129985, "grad_norm": 0.3474065363407135, "learning_rate": 0.0002, "loss": 0.3717, "step": 115 }, { "epoch": 0.17134416543574593, "grad_norm": 0.3335750699043274, "learning_rate": 0.0002, "loss": 0.351, "step": 116 }, { "epoch": 0.172821270310192, "grad_norm": 0.34676527976989746, "learning_rate": 0.0002, "loss": 0.3536, "step": 117 }, { "epoch": 0.17429837518463812, "grad_norm": 0.36145490407943726, "learning_rate": 0.0002, "loss": 0.407, "step": 118 }, { "epoch": 0.1757754800590842, "grad_norm": 0.3694964647293091, "learning_rate": 0.0002, "loss": 0.4143, "step": 119 }, { "epoch": 0.17725258493353027, "grad_norm": 0.31005293130874634, "learning_rate": 0.0002, "loss": 0.3306, "step": 120 }, { "epoch": 0.17872968980797638, "grad_norm": 0.32366085052490234, "learning_rate": 0.0002, "loss": 0.3342, "step": 121 }, { "epoch": 0.18020679468242246, "grad_norm": 0.3252504765987396, "learning_rate": 0.0002, "loss": 0.3503, "step": 122 }, { "epoch": 0.18168389955686853, "grad_norm": 0.32292550802230835, "learning_rate": 0.0002, "loss": 0.3694, "step": 123 }, { "epoch": 0.1831610044313146, "grad_norm": 0.32740291953086853, "learning_rate": 0.0002, "loss": 0.3296, "step": 124 }, { "epoch": 0.18463810930576072, "grad_norm": 0.3438139855861664, "learning_rate": 0.0002, "loss": 0.3107, "step": 125 }, { "epoch": 0.1861152141802068, "grad_norm": 0.33904099464416504, "learning_rate": 0.0002, "loss": 0.39, "step": 126 }, { "epoch": 0.18759231905465287, "grad_norm": 0.3464205265045166, "learning_rate": 0.0002, "loss": 0.3679, "step": 127 }, { "epoch": 0.18906942392909898, "grad_norm": 0.3387203514575958, "learning_rate": 0.0002, "loss": 0.3375, "step": 128 }, { "epoch": 0.19054652880354506, "grad_norm": 0.40050801634788513, "learning_rate": 0.0002, "loss": 0.3965, "step": 129 }, { "epoch": 0.19202363367799113, "grad_norm": 0.31067872047424316, "learning_rate": 0.0002, "loss": 0.3108, "step": 130 }, { "epoch": 0.1935007385524372, "grad_norm": 0.35977062582969666, "learning_rate": 0.0002, "loss": 0.4023, "step": 131 }, { "epoch": 0.19497784342688332, "grad_norm": 0.3153740167617798, "learning_rate": 0.0002, "loss": 0.3317, "step": 132 }, { "epoch": 0.1964549483013294, "grad_norm": 0.3306857645511627, "learning_rate": 0.0002, "loss": 0.3408, "step": 133 }, { "epoch": 0.19793205317577547, "grad_norm": 0.32012930512428284, "learning_rate": 0.0002, "loss": 0.3218, "step": 134 }, { "epoch": 0.19940915805022155, "grad_norm": 0.3159703314304352, "learning_rate": 0.0002, "loss": 0.3481, "step": 135 }, { "epoch": 0.20088626292466766, "grad_norm": 0.3230080306529999, "learning_rate": 0.0002, "loss": 0.3779, "step": 136 }, { "epoch": 0.20236336779911374, "grad_norm": 0.34753701090812683, "learning_rate": 0.0002, "loss": 0.3775, "step": 137 }, { "epoch": 0.2038404726735598, "grad_norm": 0.3315640687942505, "learning_rate": 0.0002, "loss": 0.339, "step": 138 }, { "epoch": 0.20531757754800592, "grad_norm": 0.33685439825057983, "learning_rate": 0.0002, "loss": 0.3575, "step": 139 }, { "epoch": 0.206794682422452, "grad_norm": 0.3179871439933777, "learning_rate": 0.0002, "loss": 0.3338, "step": 140 }, { "epoch": 0.20827178729689808, "grad_norm": 0.32391220331192017, "learning_rate": 0.0002, "loss": 0.3665, "step": 141 }, { "epoch": 0.20974889217134415, "grad_norm": 0.3102681338787079, "learning_rate": 0.0002, "loss": 0.2948, "step": 142 }, { "epoch": 0.21122599704579026, "grad_norm": 0.33224979043006897, "learning_rate": 0.0002, "loss": 0.392, "step": 143 }, { "epoch": 0.21270310192023634, "grad_norm": 0.30173906683921814, "learning_rate": 0.0002, "loss": 0.2775, "step": 144 }, { "epoch": 0.21418020679468242, "grad_norm": 0.3212149739265442, "learning_rate": 0.0002, "loss": 0.3408, "step": 145 }, { "epoch": 0.21565731166912852, "grad_norm": 0.3113839328289032, "learning_rate": 0.0002, "loss": 0.314, "step": 146 }, { "epoch": 0.2171344165435746, "grad_norm": 0.3435472548007965, "learning_rate": 0.0002, "loss": 0.3617, "step": 147 }, { "epoch": 0.21861152141802068, "grad_norm": 0.3423033058643341, "learning_rate": 0.0002, "loss": 0.3523, "step": 148 }, { "epoch": 0.22008862629246675, "grad_norm": 0.3202575445175171, "learning_rate": 0.0002, "loss": 0.349, "step": 149 }, { "epoch": 0.22156573116691286, "grad_norm": 0.2999582886695862, "learning_rate": 0.0002, "loss": 0.2906, "step": 150 }, { "epoch": 0.22304283604135894, "grad_norm": 0.33576205372810364, "learning_rate": 0.0002, "loss": 0.329, "step": 151 }, { "epoch": 0.22451994091580502, "grad_norm": 0.31811273097991943, "learning_rate": 0.0002, "loss": 0.3151, "step": 152 }, { "epoch": 0.2259970457902511, "grad_norm": 0.34126049280166626, "learning_rate": 0.0002, "loss": 0.335, "step": 153 }, { "epoch": 0.2274741506646972, "grad_norm": 0.29068347811698914, "learning_rate": 0.0002, "loss": 0.2996, "step": 154 }, { "epoch": 0.22895125553914328, "grad_norm": 0.3677709698677063, "learning_rate": 0.0002, "loss": 0.357, "step": 155 }, { "epoch": 0.23042836041358936, "grad_norm": 0.319380521774292, "learning_rate": 0.0002, "loss": 0.3283, "step": 156 }, { "epoch": 0.23190546528803546, "grad_norm": 0.2935948669910431, "learning_rate": 0.0002, "loss": 0.2755, "step": 157 }, { "epoch": 0.23338257016248154, "grad_norm": 0.30784815549850464, "learning_rate": 0.0002, "loss": 0.3171, "step": 158 }, { "epoch": 0.23485967503692762, "grad_norm": 0.3345930874347687, "learning_rate": 0.0002, "loss": 0.3526, "step": 159 }, { "epoch": 0.2363367799113737, "grad_norm": 0.3269497752189636, "learning_rate": 0.0002, "loss": 0.3492, "step": 160 }, { "epoch": 0.2378138847858198, "grad_norm": 0.32217973470687866, "learning_rate": 0.0002, "loss": 0.36, "step": 161 }, { "epoch": 0.23929098966026588, "grad_norm": 0.3381323516368866, "learning_rate": 0.0002, "loss": 0.3534, "step": 162 }, { "epoch": 0.24076809453471196, "grad_norm": 0.3131888210773468, "learning_rate": 0.0002, "loss": 0.3224, "step": 163 }, { "epoch": 0.24224519940915806, "grad_norm": 0.30917319655418396, "learning_rate": 0.0002, "loss": 0.3132, "step": 164 }, { "epoch": 0.24372230428360414, "grad_norm": 0.31469786167144775, "learning_rate": 0.0002, "loss": 0.3218, "step": 165 }, { "epoch": 0.24519940915805022, "grad_norm": 0.31420794129371643, "learning_rate": 0.0002, "loss": 0.3471, "step": 166 }, { "epoch": 0.2466765140324963, "grad_norm": 0.31471043825149536, "learning_rate": 0.0002, "loss": 0.3056, "step": 167 }, { "epoch": 0.2481536189069424, "grad_norm": 0.30315864086151123, "learning_rate": 0.0002, "loss": 0.3355, "step": 168 }, { "epoch": 0.24963072378138848, "grad_norm": 0.29710718989372253, "learning_rate": 0.0002, "loss": 0.3077, "step": 169 }, { "epoch": 0.2511078286558346, "grad_norm": 0.30408531427383423, "learning_rate": 0.0002, "loss": 0.3087, "step": 170 }, { "epoch": 0.25258493353028066, "grad_norm": 0.29702916741371155, "learning_rate": 0.0002, "loss": 0.2993, "step": 171 }, { "epoch": 0.25406203840472674, "grad_norm": 0.2939663827419281, "learning_rate": 0.0002, "loss": 0.2996, "step": 172 }, { "epoch": 0.2555391432791728, "grad_norm": 0.36591342091560364, "learning_rate": 0.0002, "loss": 0.356, "step": 173 }, { "epoch": 0.2570162481536189, "grad_norm": 0.30867043137550354, "learning_rate": 0.0002, "loss": 0.2961, "step": 174 }, { "epoch": 0.258493353028065, "grad_norm": 0.34252026677131653, "learning_rate": 0.0002, "loss": 0.3849, "step": 175 }, { "epoch": 0.25997045790251105, "grad_norm": 0.34753838181495667, "learning_rate": 0.0002, "loss": 0.3838, "step": 176 }, { "epoch": 0.2614475627769572, "grad_norm": 0.31399980187416077, "learning_rate": 0.0002, "loss": 0.33, "step": 177 }, { "epoch": 0.26292466765140327, "grad_norm": 0.32648637890815735, "learning_rate": 0.0002, "loss": 0.3678, "step": 178 }, { "epoch": 0.26440177252584934, "grad_norm": 0.2866675853729248, "learning_rate": 0.0002, "loss": 0.295, "step": 179 }, { "epoch": 0.2658788774002954, "grad_norm": 0.32054954767227173, "learning_rate": 0.0002, "loss": 0.3342, "step": 180 }, { "epoch": 0.2673559822747415, "grad_norm": 0.30476486682891846, "learning_rate": 0.0002, "loss": 0.3381, "step": 181 }, { "epoch": 0.2688330871491876, "grad_norm": 0.2891450524330139, "learning_rate": 0.0002, "loss": 0.2984, "step": 182 }, { "epoch": 0.27031019202363366, "grad_norm": 0.3023356795310974, "learning_rate": 0.0002, "loss": 0.2991, "step": 183 }, { "epoch": 0.2717872968980798, "grad_norm": 0.31025779247283936, "learning_rate": 0.0002, "loss": 0.3198, "step": 184 }, { "epoch": 0.27326440177252587, "grad_norm": 0.27903226017951965, "learning_rate": 0.0002, "loss": 0.274, "step": 185 }, { "epoch": 0.27474150664697194, "grad_norm": 0.2925949692726135, "learning_rate": 0.0002, "loss": 0.3051, "step": 186 }, { "epoch": 0.276218611521418, "grad_norm": 0.3387667238712311, "learning_rate": 0.0002, "loss": 0.3677, "step": 187 }, { "epoch": 0.2776957163958641, "grad_norm": 0.316540390253067, "learning_rate": 0.0002, "loss": 0.3196, "step": 188 }, { "epoch": 0.2791728212703102, "grad_norm": 0.3089348375797272, "learning_rate": 0.0002, "loss": 0.3338, "step": 189 }, { "epoch": 0.28064992614475626, "grad_norm": 0.313431054353714, "learning_rate": 0.0002, "loss": 0.3178, "step": 190 }, { "epoch": 0.2821270310192024, "grad_norm": 0.30025985836982727, "learning_rate": 0.0002, "loss": 0.3086, "step": 191 }, { "epoch": 0.28360413589364847, "grad_norm": 0.3058534860610962, "learning_rate": 0.0002, "loss": 0.3128, "step": 192 }, { "epoch": 0.28508124076809455, "grad_norm": 0.334710031747818, "learning_rate": 0.0002, "loss": 0.3418, "step": 193 }, { "epoch": 0.2865583456425406, "grad_norm": 0.3021548092365265, "learning_rate": 0.0002, "loss": 0.2995, "step": 194 }, { "epoch": 0.2880354505169867, "grad_norm": 0.27398747205734253, "learning_rate": 0.0002, "loss": 0.2743, "step": 195 }, { "epoch": 0.2895125553914328, "grad_norm": 0.33194372057914734, "learning_rate": 0.0002, "loss": 0.2824, "step": 196 }, { "epoch": 0.29098966026587886, "grad_norm": 0.3193664848804474, "learning_rate": 0.0002, "loss": 0.3361, "step": 197 }, { "epoch": 0.29246676514032494, "grad_norm": 0.3320102393627167, "learning_rate": 0.0002, "loss": 0.3154, "step": 198 }, { "epoch": 0.29394387001477107, "grad_norm": 0.2951314449310303, "learning_rate": 0.0002, "loss": 0.2699, "step": 199 }, { "epoch": 0.29542097488921715, "grad_norm": 0.3117165267467499, "learning_rate": 0.0002, "loss": 0.3359, "step": 200 }, { "epoch": 0.2968980797636632, "grad_norm": 0.30885782837867737, "learning_rate": 0.0002, "loss": 0.3181, "step": 201 }, { "epoch": 0.2983751846381093, "grad_norm": 0.3114778399467468, "learning_rate": 0.0002, "loss": 0.3409, "step": 202 }, { "epoch": 0.2998522895125554, "grad_norm": 0.32142388820648193, "learning_rate": 0.0002, "loss": 0.3491, "step": 203 }, { "epoch": 0.30132939438700146, "grad_norm": 0.3159630000591278, "learning_rate": 0.0002, "loss": 0.3176, "step": 204 }, { "epoch": 0.30280649926144754, "grad_norm": 0.2813749313354492, "learning_rate": 0.0002, "loss": 0.2745, "step": 205 }, { "epoch": 0.30428360413589367, "grad_norm": 0.3174036145210266, "learning_rate": 0.0002, "loss": 0.3527, "step": 206 }, { "epoch": 0.30576070901033975, "grad_norm": 0.311678409576416, "learning_rate": 0.0002, "loss": 0.3075, "step": 207 }, { "epoch": 0.3072378138847858, "grad_norm": 0.2867993712425232, "learning_rate": 0.0002, "loss": 0.32, "step": 208 }, { "epoch": 0.3087149187592319, "grad_norm": 0.29298824071884155, "learning_rate": 0.0002, "loss": 0.3226, "step": 209 }, { "epoch": 0.310192023633678, "grad_norm": 0.3173938989639282, "learning_rate": 0.0002, "loss": 0.32, "step": 210 }, { "epoch": 0.31166912850812406, "grad_norm": 0.27944210171699524, "learning_rate": 0.0002, "loss": 0.2825, "step": 211 }, { "epoch": 0.31314623338257014, "grad_norm": 0.3196215331554413, "learning_rate": 0.0002, "loss": 0.3321, "step": 212 }, { "epoch": 0.31462333825701627, "grad_norm": 0.3193184733390808, "learning_rate": 0.0002, "loss": 0.3394, "step": 213 }, { "epoch": 0.31610044313146235, "grad_norm": 0.2783777713775635, "learning_rate": 0.0002, "loss": 0.3134, "step": 214 }, { "epoch": 0.3175775480059084, "grad_norm": 0.35627251863479614, "learning_rate": 0.0002, "loss": 0.3973, "step": 215 }, { "epoch": 0.3190546528803545, "grad_norm": 0.32312896847724915, "learning_rate": 0.0002, "loss": 0.3388, "step": 216 }, { "epoch": 0.3205317577548006, "grad_norm": 0.2931472659111023, "learning_rate": 0.0002, "loss": 0.3134, "step": 217 }, { "epoch": 0.32200886262924666, "grad_norm": 0.3059196174144745, "learning_rate": 0.0002, "loss": 0.3249, "step": 218 }, { "epoch": 0.32348596750369274, "grad_norm": 0.3171478807926178, "learning_rate": 0.0002, "loss": 0.3459, "step": 219 }, { "epoch": 0.3249630723781389, "grad_norm": 0.31810346245765686, "learning_rate": 0.0002, "loss": 0.3455, "step": 220 }, { "epoch": 0.32644017725258495, "grad_norm": 0.30696892738342285, "learning_rate": 0.0002, "loss": 0.3037, "step": 221 }, { "epoch": 0.32791728212703103, "grad_norm": 0.3519222140312195, "learning_rate": 0.0002, "loss": 0.3585, "step": 222 }, { "epoch": 0.3293943870014771, "grad_norm": 0.2762470245361328, "learning_rate": 0.0002, "loss": 0.2615, "step": 223 }, { "epoch": 0.3308714918759232, "grad_norm": 0.2909640967845917, "learning_rate": 0.0002, "loss": 0.2994, "step": 224 }, { "epoch": 0.33234859675036926, "grad_norm": 0.3310638964176178, "learning_rate": 0.0002, "loss": 0.368, "step": 225 }, { "epoch": 0.33382570162481534, "grad_norm": 0.337228387594223, "learning_rate": 0.0002, "loss": 0.358, "step": 226 }, { "epoch": 0.3353028064992615, "grad_norm": 0.3182266652584076, "learning_rate": 0.0002, "loss": 0.3425, "step": 227 }, { "epoch": 0.33677991137370755, "grad_norm": 0.32053616642951965, "learning_rate": 0.0002, "loss": 0.3604, "step": 228 }, { "epoch": 0.33825701624815363, "grad_norm": 0.3377324342727661, "learning_rate": 0.0002, "loss": 0.3783, "step": 229 }, { "epoch": 0.3397341211225997, "grad_norm": 0.28743067383766174, "learning_rate": 0.0002, "loss": 0.304, "step": 230 }, { "epoch": 0.3412112259970458, "grad_norm": 0.30108213424682617, "learning_rate": 0.0002, "loss": 0.3129, "step": 231 }, { "epoch": 0.34268833087149186, "grad_norm": 0.3191213607788086, "learning_rate": 0.0002, "loss": 0.3331, "step": 232 }, { "epoch": 0.34416543574593794, "grad_norm": 0.2999110519886017, "learning_rate": 0.0002, "loss": 0.3074, "step": 233 }, { "epoch": 0.345642540620384, "grad_norm": 0.2682500183582306, "learning_rate": 0.0002, "loss": 0.2635, "step": 234 }, { "epoch": 0.34711964549483015, "grad_norm": 0.2817941904067993, "learning_rate": 0.0002, "loss": 0.3048, "step": 235 }, { "epoch": 0.34859675036927623, "grad_norm": 0.3110464811325073, "learning_rate": 0.0002, "loss": 0.3228, "step": 236 }, { "epoch": 0.3500738552437223, "grad_norm": 0.3088606297969818, "learning_rate": 0.0002, "loss": 0.3161, "step": 237 }, { "epoch": 0.3515509601181684, "grad_norm": 0.2990322411060333, "learning_rate": 0.0002, "loss": 0.3085, "step": 238 }, { "epoch": 0.35302806499261447, "grad_norm": 0.33097386360168457, "learning_rate": 0.0002, "loss": 0.3615, "step": 239 }, { "epoch": 0.35450516986706054, "grad_norm": 0.3397606313228607, "learning_rate": 0.0002, "loss": 0.3957, "step": 240 }, { "epoch": 0.3559822747415066, "grad_norm": 0.2756197452545166, "learning_rate": 0.0002, "loss": 0.2731, "step": 241 }, { "epoch": 0.35745937961595275, "grad_norm": 0.3435852825641632, "learning_rate": 0.0002, "loss": 0.3855, "step": 242 }, { "epoch": 0.35893648449039883, "grad_norm": 0.33727383613586426, "learning_rate": 0.0002, "loss": 0.3101, "step": 243 }, { "epoch": 0.3604135893648449, "grad_norm": 0.3684369921684265, "learning_rate": 0.0002, "loss": 0.3378, "step": 244 }, { "epoch": 0.361890694239291, "grad_norm": 0.3006575107574463, "learning_rate": 0.0002, "loss": 0.3295, "step": 245 }, { "epoch": 0.36336779911373707, "grad_norm": 0.31223273277282715, "learning_rate": 0.0002, "loss": 0.2977, "step": 246 }, { "epoch": 0.36484490398818314, "grad_norm": 0.3001905381679535, "learning_rate": 0.0002, "loss": 0.294, "step": 247 }, { "epoch": 0.3663220088626292, "grad_norm": 0.2907404899597168, "learning_rate": 0.0002, "loss": 0.2839, "step": 248 }, { "epoch": 0.36779911373707536, "grad_norm": 0.31060346961021423, "learning_rate": 0.0002, "loss": 0.3333, "step": 249 }, { "epoch": 0.36927621861152143, "grad_norm": 0.3394862413406372, "learning_rate": 0.0002, "loss": 0.3217, "step": 250 }, { "epoch": 0.3707533234859675, "grad_norm": 0.2912856340408325, "learning_rate": 0.0002, "loss": 0.3072, "step": 251 }, { "epoch": 0.3722304283604136, "grad_norm": 0.2991478741168976, "learning_rate": 0.0002, "loss": 0.3349, "step": 252 }, { "epoch": 0.37370753323485967, "grad_norm": 0.304868221282959, "learning_rate": 0.0002, "loss": 0.3142, "step": 253 }, { "epoch": 0.37518463810930575, "grad_norm": 0.3008173704147339, "learning_rate": 0.0002, "loss": 0.3166, "step": 254 }, { "epoch": 0.3766617429837518, "grad_norm": 0.290526807308197, "learning_rate": 0.0002, "loss": 0.3228, "step": 255 }, { "epoch": 0.37813884785819796, "grad_norm": 0.2846904695034027, "learning_rate": 0.0002, "loss": 0.31, "step": 256 }, { "epoch": 0.37961595273264404, "grad_norm": 0.306904137134552, "learning_rate": 0.0002, "loss": 0.3238, "step": 257 }, { "epoch": 0.3810930576070901, "grad_norm": 0.30683666467666626, "learning_rate": 0.0002, "loss": 0.3327, "step": 258 }, { "epoch": 0.3825701624815362, "grad_norm": 0.2824447751045227, "learning_rate": 0.0002, "loss": 0.2962, "step": 259 }, { "epoch": 0.38404726735598227, "grad_norm": 0.29804757237434387, "learning_rate": 0.0002, "loss": 0.3025, "step": 260 }, { "epoch": 0.38552437223042835, "grad_norm": 0.3133246600627899, "learning_rate": 0.0002, "loss": 0.3095, "step": 261 }, { "epoch": 0.3870014771048744, "grad_norm": 0.3098774254322052, "learning_rate": 0.0002, "loss": 0.3031, "step": 262 }, { "epoch": 0.38847858197932056, "grad_norm": 0.3248344361782074, "learning_rate": 0.0002, "loss": 0.3402, "step": 263 }, { "epoch": 0.38995568685376664, "grad_norm": 0.30645236372947693, "learning_rate": 0.0002, "loss": 0.3277, "step": 264 }, { "epoch": 0.3914327917282127, "grad_norm": 0.29753726720809937, "learning_rate": 0.0002, "loss": 0.3322, "step": 265 }, { "epoch": 0.3929098966026588, "grad_norm": 0.33593639731407166, "learning_rate": 0.0002, "loss": 0.337, "step": 266 }, { "epoch": 0.39438700147710487, "grad_norm": 0.3059685528278351, "learning_rate": 0.0002, "loss": 0.2896, "step": 267 }, { "epoch": 0.39586410635155095, "grad_norm": 0.30055829882621765, "learning_rate": 0.0002, "loss": 0.3385, "step": 268 }, { "epoch": 0.397341211225997, "grad_norm": 0.27567949891090393, "learning_rate": 0.0002, "loss": 0.2799, "step": 269 }, { "epoch": 0.3988183161004431, "grad_norm": 0.33319681882858276, "learning_rate": 0.0002, "loss": 0.3735, "step": 270 }, { "epoch": 0.40029542097488924, "grad_norm": 0.28851690888404846, "learning_rate": 0.0002, "loss": 0.2934, "step": 271 }, { "epoch": 0.4017725258493353, "grad_norm": 0.3188093900680542, "learning_rate": 0.0002, "loss": 0.3276, "step": 272 }, { "epoch": 0.4032496307237814, "grad_norm": 0.29944342374801636, "learning_rate": 0.0002, "loss": 0.3351, "step": 273 }, { "epoch": 0.40472673559822747, "grad_norm": 0.31611138582229614, "learning_rate": 0.0002, "loss": 0.3616, "step": 274 }, { "epoch": 0.40620384047267355, "grad_norm": 0.3243541419506073, "learning_rate": 0.0002, "loss": 0.3394, "step": 275 }, { "epoch": 0.4076809453471196, "grad_norm": 0.31130653619766235, "learning_rate": 0.0002, "loss": 0.3182, "step": 276 }, { "epoch": 0.4091580502215657, "grad_norm": 0.2761830687522888, "learning_rate": 0.0002, "loss": 0.23, "step": 277 }, { "epoch": 0.41063515509601184, "grad_norm": 0.3256094455718994, "learning_rate": 0.0002, "loss": 0.3921, "step": 278 }, { "epoch": 0.4121122599704579, "grad_norm": 0.30812302231788635, "learning_rate": 0.0002, "loss": 0.3559, "step": 279 }, { "epoch": 0.413589364844904, "grad_norm": 0.28198400139808655, "learning_rate": 0.0002, "loss": 0.285, "step": 280 }, { "epoch": 0.4150664697193501, "grad_norm": 0.2873023450374603, "learning_rate": 0.0002, "loss": 0.2963, "step": 281 }, { "epoch": 0.41654357459379615, "grad_norm": 0.29413530230522156, "learning_rate": 0.0002, "loss": 0.2885, "step": 282 }, { "epoch": 0.41802067946824223, "grad_norm": 0.2963588237762451, "learning_rate": 0.0002, "loss": 0.2996, "step": 283 }, { "epoch": 0.4194977843426883, "grad_norm": 0.2581465542316437, "learning_rate": 0.0002, "loss": 0.255, "step": 284 }, { "epoch": 0.42097488921713444, "grad_norm": 0.3365771472454071, "learning_rate": 0.0002, "loss": 0.3473, "step": 285 }, { "epoch": 0.4224519940915805, "grad_norm": 0.3092253804206848, "learning_rate": 0.0002, "loss": 0.29, "step": 286 }, { "epoch": 0.4239290989660266, "grad_norm": 0.300626277923584, "learning_rate": 0.0002, "loss": 0.3183, "step": 287 }, { "epoch": 0.4254062038404727, "grad_norm": 0.3320425748825073, "learning_rate": 0.0002, "loss": 0.3322, "step": 288 }, { "epoch": 0.42688330871491875, "grad_norm": 0.2749597728252411, "learning_rate": 0.0002, "loss": 0.2995, "step": 289 }, { "epoch": 0.42836041358936483, "grad_norm": 0.280134916305542, "learning_rate": 0.0002, "loss": 0.2729, "step": 290 }, { "epoch": 0.4298375184638109, "grad_norm": 0.27060407400131226, "learning_rate": 0.0002, "loss": 0.2694, "step": 291 }, { "epoch": 0.43131462333825704, "grad_norm": 0.28500011563301086, "learning_rate": 0.0002, "loss": 0.2852, "step": 292 }, { "epoch": 0.4327917282127031, "grad_norm": 0.2733040452003479, "learning_rate": 0.0002, "loss": 0.276, "step": 293 }, { "epoch": 0.4342688330871492, "grad_norm": 0.30365538597106934, "learning_rate": 0.0002, "loss": 0.2806, "step": 294 }, { "epoch": 0.4357459379615953, "grad_norm": 0.3079434335231781, "learning_rate": 0.0002, "loss": 0.3014, "step": 295 }, { "epoch": 0.43722304283604135, "grad_norm": 0.2746562659740448, "learning_rate": 0.0002, "loss": 0.2601, "step": 296 }, { "epoch": 0.43870014771048743, "grad_norm": 0.3027852475643158, "learning_rate": 0.0002, "loss": 0.2978, "step": 297 }, { "epoch": 0.4401772525849335, "grad_norm": 0.2862493395805359, "learning_rate": 0.0002, "loss": 0.2712, "step": 298 }, { "epoch": 0.44165435745937964, "grad_norm": 0.30820953845977783, "learning_rate": 0.0002, "loss": 0.3198, "step": 299 }, { "epoch": 0.4431314623338257, "grad_norm": 0.2891389727592468, "learning_rate": 0.0002, "loss": 0.2914, "step": 300 }, { "epoch": 0.4446085672082718, "grad_norm": 0.29976293444633484, "learning_rate": 0.0002, "loss": 0.3155, "step": 301 }, { "epoch": 0.4460856720827179, "grad_norm": 0.26029616594314575, "learning_rate": 0.0002, "loss": 0.2585, "step": 302 }, { "epoch": 0.44756277695716395, "grad_norm": 0.2925141751766205, "learning_rate": 0.0002, "loss": 0.2799, "step": 303 }, { "epoch": 0.44903988183161003, "grad_norm": 0.3378995358943939, "learning_rate": 0.0002, "loss": 0.3397, "step": 304 }, { "epoch": 0.4505169867060561, "grad_norm": 0.3140377104282379, "learning_rate": 0.0002, "loss": 0.3065, "step": 305 }, { "epoch": 0.4519940915805022, "grad_norm": 0.30882659554481506, "learning_rate": 0.0002, "loss": 0.2976, "step": 306 }, { "epoch": 0.4534711964549483, "grad_norm": 0.2986995577812195, "learning_rate": 0.0002, "loss": 0.3086, "step": 307 }, { "epoch": 0.4549483013293944, "grad_norm": 0.37128734588623047, "learning_rate": 0.0002, "loss": 0.3018, "step": 308 }, { "epoch": 0.4564254062038405, "grad_norm": 0.2967352271080017, "learning_rate": 0.0002, "loss": 0.2855, "step": 309 }, { "epoch": 0.45790251107828656, "grad_norm": 0.3116573691368103, "learning_rate": 0.0002, "loss": 0.3248, "step": 310 }, { "epoch": 0.45937961595273263, "grad_norm": 0.27394649386405945, "learning_rate": 0.0002, "loss": 0.2894, "step": 311 }, { "epoch": 0.4608567208271787, "grad_norm": 0.31190183758735657, "learning_rate": 0.0002, "loss": 0.3311, "step": 312 }, { "epoch": 0.4623338257016248, "grad_norm": 0.28978461027145386, "learning_rate": 0.0002, "loss": 0.2896, "step": 313 }, { "epoch": 0.4638109305760709, "grad_norm": 0.29586443305015564, "learning_rate": 0.0002, "loss": 0.3062, "step": 314 }, { "epoch": 0.465288035450517, "grad_norm": 0.3034004271030426, "learning_rate": 0.0002, "loss": 0.2797, "step": 315 }, { "epoch": 0.4667651403249631, "grad_norm": 0.3083277940750122, "learning_rate": 0.0002, "loss": 0.2912, "step": 316 }, { "epoch": 0.46824224519940916, "grad_norm": 0.31153154373168945, "learning_rate": 0.0002, "loss": 0.3403, "step": 317 }, { "epoch": 0.46971935007385524, "grad_norm": 0.26065292954444885, "learning_rate": 0.0002, "loss": 0.2289, "step": 318 }, { "epoch": 0.4711964549483013, "grad_norm": 0.29736757278442383, "learning_rate": 0.0002, "loss": 0.3202, "step": 319 }, { "epoch": 0.4726735598227474, "grad_norm": 0.362541526556015, "learning_rate": 0.0002, "loss": 0.393, "step": 320 }, { "epoch": 0.4741506646971935, "grad_norm": 0.3045463263988495, "learning_rate": 0.0002, "loss": 0.2843, "step": 321 }, { "epoch": 0.4756277695716396, "grad_norm": 0.33905521035194397, "learning_rate": 0.0002, "loss": 0.368, "step": 322 }, { "epoch": 0.4771048744460857, "grad_norm": 0.3574953079223633, "learning_rate": 0.0002, "loss": 0.2273, "step": 323 }, { "epoch": 0.47858197932053176, "grad_norm": 0.3134016990661621, "learning_rate": 0.0002, "loss": 0.3134, "step": 324 }, { "epoch": 0.48005908419497784, "grad_norm": 0.32262158393859863, "learning_rate": 0.0002, "loss": 0.3028, "step": 325 }, { "epoch": 0.4815361890694239, "grad_norm": 0.26441511511802673, "learning_rate": 0.0002, "loss": 0.2447, "step": 326 }, { "epoch": 0.48301329394387, "grad_norm": 0.3419596552848816, "learning_rate": 0.0002, "loss": 0.3258, "step": 327 }, { "epoch": 0.4844903988183161, "grad_norm": 0.3031555116176605, "learning_rate": 0.0002, "loss": 0.322, "step": 328 }, { "epoch": 0.4859675036927622, "grad_norm": 0.29226183891296387, "learning_rate": 0.0002, "loss": 0.2803, "step": 329 }, { "epoch": 0.4874446085672083, "grad_norm": 0.2874895930290222, "learning_rate": 0.0002, "loss": 0.2824, "step": 330 }, { "epoch": 0.48892171344165436, "grad_norm": 0.31009188294410706, "learning_rate": 0.0002, "loss": 0.3218, "step": 331 }, { "epoch": 0.49039881831610044, "grad_norm": 0.31250134110450745, "learning_rate": 0.0002, "loss": 0.2972, "step": 332 }, { "epoch": 0.4918759231905465, "grad_norm": 0.30231741070747375, "learning_rate": 0.0002, "loss": 0.3255, "step": 333 }, { "epoch": 0.4933530280649926, "grad_norm": 0.32139065861701965, "learning_rate": 0.0002, "loss": 0.3712, "step": 334 }, { "epoch": 0.4948301329394387, "grad_norm": 0.2788805365562439, "learning_rate": 0.0002, "loss": 0.3069, "step": 335 }, { "epoch": 0.4963072378138848, "grad_norm": 0.3206048011779785, "learning_rate": 0.0002, "loss": 0.3519, "step": 336 }, { "epoch": 0.4977843426883309, "grad_norm": 0.316514253616333, "learning_rate": 0.0002, "loss": 0.3554, "step": 337 }, { "epoch": 0.49926144756277696, "grad_norm": 0.3080296516418457, "learning_rate": 0.0002, "loss": 0.3366, "step": 338 }, { "epoch": 0.5007385524372231, "grad_norm": 0.3183678090572357, "learning_rate": 0.0002, "loss": 0.2967, "step": 339 }, { "epoch": 0.5022156573116692, "grad_norm": 0.31313014030456543, "learning_rate": 0.0002, "loss": 0.3188, "step": 340 }, { "epoch": 0.5036927621861153, "grad_norm": 0.2989446520805359, "learning_rate": 0.0002, "loss": 0.3235, "step": 341 }, { "epoch": 0.5051698670605613, "grad_norm": 0.2817307412624359, "learning_rate": 0.0002, "loss": 0.2959, "step": 342 }, { "epoch": 0.5066469719350074, "grad_norm": 0.487758606672287, "learning_rate": 0.0002, "loss": 0.3308, "step": 343 }, { "epoch": 0.5081240768094535, "grad_norm": 0.26448920369148254, "learning_rate": 0.0002, "loss": 0.2942, "step": 344 }, { "epoch": 0.5096011816838996, "grad_norm": 0.3182467818260193, "learning_rate": 0.0002, "loss": 0.291, "step": 345 }, { "epoch": 0.5110782865583456, "grad_norm": 0.2950560450553894, "learning_rate": 0.0002, "loss": 0.3014, "step": 346 }, { "epoch": 0.5125553914327917, "grad_norm": 0.3176344633102417, "learning_rate": 0.0002, "loss": 0.3425, "step": 347 }, { "epoch": 0.5140324963072378, "grad_norm": 0.30496424436569214, "learning_rate": 0.0002, "loss": 0.3447, "step": 348 }, { "epoch": 0.5155096011816839, "grad_norm": 0.28272292017936707, "learning_rate": 0.0002, "loss": 0.2645, "step": 349 }, { "epoch": 0.51698670605613, "grad_norm": 0.2600267231464386, "learning_rate": 0.0002, "loss": 0.2525, "step": 350 }, { "epoch": 0.518463810930576, "grad_norm": 0.2765870988368988, "learning_rate": 0.0002, "loss": 0.2907, "step": 351 }, { "epoch": 0.5199409158050221, "grad_norm": 0.30320316553115845, "learning_rate": 0.0002, "loss": 0.3404, "step": 352 }, { "epoch": 0.5214180206794683, "grad_norm": 0.33050844073295593, "learning_rate": 0.0002, "loss": 0.3436, "step": 353 }, { "epoch": 0.5228951255539144, "grad_norm": 0.2716812193393707, "learning_rate": 0.0002, "loss": 0.2912, "step": 354 }, { "epoch": 0.5243722304283605, "grad_norm": 0.2944520115852356, "learning_rate": 0.0002, "loss": 0.3212, "step": 355 }, { "epoch": 0.5258493353028065, "grad_norm": 0.334228515625, "learning_rate": 0.0002, "loss": 0.3675, "step": 356 }, { "epoch": 0.5273264401772526, "grad_norm": 0.27948203682899475, "learning_rate": 0.0002, "loss": 0.2648, "step": 357 }, { "epoch": 0.5288035450516987, "grad_norm": 0.32159537076950073, "learning_rate": 0.0002, "loss": 0.3659, "step": 358 }, { "epoch": 0.5302806499261448, "grad_norm": 0.29499179124832153, "learning_rate": 0.0002, "loss": 0.2718, "step": 359 }, { "epoch": 0.5317577548005908, "grad_norm": 0.3503305912017822, "learning_rate": 0.0002, "loss": 0.2972, "step": 360 }, { "epoch": 0.5332348596750369, "grad_norm": 0.29388928413391113, "learning_rate": 0.0002, "loss": 0.3063, "step": 361 }, { "epoch": 0.534711964549483, "grad_norm": 0.2753749191761017, "learning_rate": 0.0002, "loss": 0.2706, "step": 362 }, { "epoch": 0.5361890694239291, "grad_norm": 0.2902815341949463, "learning_rate": 0.0002, "loss": 0.2918, "step": 363 }, { "epoch": 0.5376661742983752, "grad_norm": 0.2991829216480255, "learning_rate": 0.0002, "loss": 0.3148, "step": 364 }, { "epoch": 0.5391432791728212, "grad_norm": 0.3151837885379791, "learning_rate": 0.0002, "loss": 0.3187, "step": 365 }, { "epoch": 0.5406203840472673, "grad_norm": 0.2935662865638733, "learning_rate": 0.0002, "loss": 0.3065, "step": 366 }, { "epoch": 0.5420974889217134, "grad_norm": 0.2787752151489258, "learning_rate": 0.0002, "loss": 0.2677, "step": 367 }, { "epoch": 0.5435745937961596, "grad_norm": 0.2826704680919647, "learning_rate": 0.0002, "loss": 0.2673, "step": 368 }, { "epoch": 0.5450516986706057, "grad_norm": 0.3015994429588318, "learning_rate": 0.0002, "loss": 0.3377, "step": 369 }, { "epoch": 0.5465288035450517, "grad_norm": 0.27995777130126953, "learning_rate": 0.0002, "loss": 0.2672, "step": 370 }, { "epoch": 0.5480059084194978, "grad_norm": 0.2902574837207794, "learning_rate": 0.0002, "loss": 0.2684, "step": 371 }, { "epoch": 0.5494830132939439, "grad_norm": 0.2957216501235962, "learning_rate": 0.0002, "loss": 0.3061, "step": 372 }, { "epoch": 0.55096011816839, "grad_norm": 0.2945306599140167, "learning_rate": 0.0002, "loss": 0.3248, "step": 373 }, { "epoch": 0.552437223042836, "grad_norm": 0.2922048568725586, "learning_rate": 0.0002, "loss": 0.2987, "step": 374 }, { "epoch": 0.5539143279172821, "grad_norm": 0.30333656072616577, "learning_rate": 0.0002, "loss": 0.3072, "step": 375 }, { "epoch": 0.5553914327917282, "grad_norm": 0.2855093479156494, "learning_rate": 0.0002, "loss": 0.2758, "step": 376 }, { "epoch": 0.5568685376661743, "grad_norm": 0.2911272943019867, "learning_rate": 0.0002, "loss": 0.2722, "step": 377 }, { "epoch": 0.5583456425406204, "grad_norm": 0.289193332195282, "learning_rate": 0.0002, "loss": 0.3035, "step": 378 }, { "epoch": 0.5598227474150664, "grad_norm": 0.2716032564640045, "learning_rate": 0.0002, "loss": 0.2692, "step": 379 }, { "epoch": 0.5612998522895125, "grad_norm": 0.33022886514663696, "learning_rate": 0.0002, "loss": 0.3139, "step": 380 }, { "epoch": 0.5627769571639586, "grad_norm": 0.27433738112449646, "learning_rate": 0.0002, "loss": 0.262, "step": 381 }, { "epoch": 0.5642540620384048, "grad_norm": 0.27598345279693604, "learning_rate": 0.0002, "loss": 0.2657, "step": 382 }, { "epoch": 0.5657311669128509, "grad_norm": 0.28790509700775146, "learning_rate": 0.0002, "loss": 0.3024, "step": 383 }, { "epoch": 0.5672082717872969, "grad_norm": 0.2914026379585266, "learning_rate": 0.0002, "loss": 0.2972, "step": 384 }, { "epoch": 0.568685376661743, "grad_norm": 0.3148682117462158, "learning_rate": 0.0002, "loss": 0.2982, "step": 385 }, { "epoch": 0.5701624815361891, "grad_norm": 0.29025575518608093, "learning_rate": 0.0002, "loss": 0.2821, "step": 386 }, { "epoch": 0.5716395864106352, "grad_norm": 0.267362117767334, "learning_rate": 0.0002, "loss": 0.244, "step": 387 }, { "epoch": 0.5731166912850812, "grad_norm": 0.32638978958129883, "learning_rate": 0.0002, "loss": 0.3058, "step": 388 }, { "epoch": 0.5745937961595273, "grad_norm": 0.31582197546958923, "learning_rate": 0.0002, "loss": 0.3285, "step": 389 }, { "epoch": 0.5760709010339734, "grad_norm": 0.2933168113231659, "learning_rate": 0.0002, "loss": 0.2794, "step": 390 }, { "epoch": 0.5775480059084195, "grad_norm": 0.29435229301452637, "learning_rate": 0.0002, "loss": 0.287, "step": 391 }, { "epoch": 0.5790251107828656, "grad_norm": 0.29208388924598694, "learning_rate": 0.0002, "loss": 0.2786, "step": 392 }, { "epoch": 0.5805022156573116, "grad_norm": 0.2712183892726898, "learning_rate": 0.0002, "loss": 0.2708, "step": 393 }, { "epoch": 0.5819793205317577, "grad_norm": 0.27574923634529114, "learning_rate": 0.0002, "loss": 0.269, "step": 394 }, { "epoch": 0.5834564254062038, "grad_norm": 0.30967944860458374, "learning_rate": 0.0002, "loss": 0.2827, "step": 395 }, { "epoch": 0.5849335302806499, "grad_norm": 0.29655173420906067, "learning_rate": 0.0002, "loss": 0.306, "step": 396 }, { "epoch": 0.5864106351550961, "grad_norm": 1.6516242027282715, "learning_rate": 0.0002, "loss": 0.3294, "step": 397 }, { "epoch": 0.5878877400295421, "grad_norm": 0.2701549828052521, "learning_rate": 0.0002, "loss": 0.2451, "step": 398 }, { "epoch": 0.5893648449039882, "grad_norm": 0.2530956268310547, "learning_rate": 0.0002, "loss": 0.2341, "step": 399 }, { "epoch": 0.5908419497784343, "grad_norm": 0.3096421957015991, "learning_rate": 0.0002, "loss": 0.3, "step": 400 }, { "epoch": 0.5923190546528804, "grad_norm": 0.3079342842102051, "learning_rate": 0.0002, "loss": 0.3291, "step": 401 }, { "epoch": 0.5937961595273265, "grad_norm": 0.29586726427078247, "learning_rate": 0.0002, "loss": 0.3094, "step": 402 }, { "epoch": 0.5952732644017725, "grad_norm": 0.28764981031417847, "learning_rate": 0.0002, "loss": 0.2961, "step": 403 }, { "epoch": 0.5967503692762186, "grad_norm": 0.30434954166412354, "learning_rate": 0.0002, "loss": 0.2936, "step": 404 }, { "epoch": 0.5982274741506647, "grad_norm": 0.2840517461299896, "learning_rate": 0.0002, "loss": 0.2964, "step": 405 }, { "epoch": 0.5997045790251108, "grad_norm": 0.2927243113517761, "learning_rate": 0.0002, "loss": 0.313, "step": 406 }, { "epoch": 0.6011816838995568, "grad_norm": 0.26455628871917725, "learning_rate": 0.0002, "loss": 0.244, "step": 407 }, { "epoch": 0.6026587887740029, "grad_norm": 0.327934592962265, "learning_rate": 0.0002, "loss": 0.3271, "step": 408 }, { "epoch": 0.604135893648449, "grad_norm": 0.28486961126327515, "learning_rate": 0.0002, "loss": 0.2742, "step": 409 }, { "epoch": 0.6056129985228951, "grad_norm": 0.3310534656047821, "learning_rate": 0.0002, "loss": 0.2888, "step": 410 }, { "epoch": 0.6070901033973413, "grad_norm": 0.32391390204429626, "learning_rate": 0.0002, "loss": 0.3123, "step": 411 }, { "epoch": 0.6085672082717873, "grad_norm": 0.5019936561584473, "learning_rate": 0.0002, "loss": 0.3494, "step": 412 }, { "epoch": 0.6100443131462334, "grad_norm": 0.2915607988834381, "learning_rate": 0.0002, "loss": 0.2845, "step": 413 }, { "epoch": 0.6115214180206795, "grad_norm": 0.34125831723213196, "learning_rate": 0.0002, "loss": 0.2985, "step": 414 }, { "epoch": 0.6129985228951256, "grad_norm": 0.28235796093940735, "learning_rate": 0.0002, "loss": 0.29, "step": 415 }, { "epoch": 0.6144756277695717, "grad_norm": 0.30712956190109253, "learning_rate": 0.0002, "loss": 0.2863, "step": 416 }, { "epoch": 0.6159527326440177, "grad_norm": 0.3005330562591553, "learning_rate": 0.0002, "loss": 0.3186, "step": 417 }, { "epoch": 0.6174298375184638, "grad_norm": 0.4083673655986786, "learning_rate": 0.0002, "loss": 0.31, "step": 418 }, { "epoch": 0.6189069423929099, "grad_norm": 0.2704838812351227, "learning_rate": 0.0002, "loss": 0.2649, "step": 419 }, { "epoch": 0.620384047267356, "grad_norm": 0.29053810238838196, "learning_rate": 0.0002, "loss": 0.2789, "step": 420 }, { "epoch": 0.621861152141802, "grad_norm": 0.329973429441452, "learning_rate": 0.0002, "loss": 0.3313, "step": 421 }, { "epoch": 0.6233382570162481, "grad_norm": 0.31070685386657715, "learning_rate": 0.0002, "loss": 0.3045, "step": 422 }, { "epoch": 0.6248153618906942, "grad_norm": 0.3487679958343506, "learning_rate": 0.0002, "loss": 0.3286, "step": 423 }, { "epoch": 0.6262924667651403, "grad_norm": 0.3269588351249695, "learning_rate": 0.0002, "loss": 0.3326, "step": 424 }, { "epoch": 0.6277695716395865, "grad_norm": 0.26015186309814453, "learning_rate": 0.0002, "loss": 0.2457, "step": 425 }, { "epoch": 0.6292466765140325, "grad_norm": 0.2547609508037567, "learning_rate": 0.0002, "loss": 0.262, "step": 426 }, { "epoch": 0.6307237813884786, "grad_norm": 0.2524930238723755, "learning_rate": 0.0002, "loss": 0.23, "step": 427 }, { "epoch": 0.6322008862629247, "grad_norm": 0.3031904101371765, "learning_rate": 0.0002, "loss": 0.3427, "step": 428 }, { "epoch": 0.6336779911373708, "grad_norm": 0.3007690906524658, "learning_rate": 0.0002, "loss": 0.2974, "step": 429 }, { "epoch": 0.6351550960118169, "grad_norm": 0.28696200251579285, "learning_rate": 0.0002, "loss": 0.2911, "step": 430 }, { "epoch": 0.6366322008862629, "grad_norm": 0.2805304229259491, "learning_rate": 0.0002, "loss": 0.2745, "step": 431 }, { "epoch": 0.638109305760709, "grad_norm": 0.2757206857204437, "learning_rate": 0.0002, "loss": 0.2517, "step": 432 }, { "epoch": 0.6395864106351551, "grad_norm": 0.26851919293403625, "learning_rate": 0.0002, "loss": 0.2537, "step": 433 }, { "epoch": 0.6410635155096012, "grad_norm": 0.28059712052345276, "learning_rate": 0.0002, "loss": 0.2616, "step": 434 }, { "epoch": 0.6425406203840472, "grad_norm": 0.2718868553638458, "learning_rate": 0.0002, "loss": 0.2652, "step": 435 }, { "epoch": 0.6440177252584933, "grad_norm": 0.28253173828125, "learning_rate": 0.0002, "loss": 0.2866, "step": 436 }, { "epoch": 0.6454948301329394, "grad_norm": 0.3183034658432007, "learning_rate": 0.0002, "loss": 0.3485, "step": 437 }, { "epoch": 0.6469719350073855, "grad_norm": 0.2451733946800232, "learning_rate": 0.0002, "loss": 0.2312, "step": 438 }, { "epoch": 0.6484490398818316, "grad_norm": 0.3208939731121063, "learning_rate": 0.0002, "loss": 0.3245, "step": 439 }, { "epoch": 0.6499261447562777, "grad_norm": 0.26186874508857727, "learning_rate": 0.0002, "loss": 0.2485, "step": 440 }, { "epoch": 0.6514032496307238, "grad_norm": 0.27923303842544556, "learning_rate": 0.0002, "loss": 0.3221, "step": 441 }, { "epoch": 0.6528803545051699, "grad_norm": 0.28155946731567383, "learning_rate": 0.0002, "loss": 0.2843, "step": 442 }, { "epoch": 0.654357459379616, "grad_norm": 0.28456977009773254, "learning_rate": 0.0002, "loss": 0.296, "step": 443 }, { "epoch": 0.6558345642540621, "grad_norm": 0.27252209186553955, "learning_rate": 0.0002, "loss": 0.2765, "step": 444 }, { "epoch": 0.6573116691285081, "grad_norm": 0.30992233753204346, "learning_rate": 0.0002, "loss": 0.3055, "step": 445 }, { "epoch": 0.6587887740029542, "grad_norm": 0.30148544907569885, "learning_rate": 0.0002, "loss": 0.3059, "step": 446 }, { "epoch": 0.6602658788774003, "grad_norm": 0.29087716341018677, "learning_rate": 0.0002, "loss": 0.257, "step": 447 }, { "epoch": 0.6617429837518464, "grad_norm": 0.30917656421661377, "learning_rate": 0.0002, "loss": 0.3096, "step": 448 }, { "epoch": 0.6632200886262924, "grad_norm": 0.311759352684021, "learning_rate": 0.0002, "loss": 0.2842, "step": 449 }, { "epoch": 0.6646971935007385, "grad_norm": 0.2612153887748718, "learning_rate": 0.0002, "loss": 0.2659, "step": 450 }, { "epoch": 0.6661742983751846, "grad_norm": 0.2954850196838379, "learning_rate": 0.0002, "loss": 0.2755, "step": 451 }, { "epoch": 0.6676514032496307, "grad_norm": 0.3181207776069641, "learning_rate": 0.0002, "loss": 0.3163, "step": 452 }, { "epoch": 0.6691285081240768, "grad_norm": 0.2802172899246216, "learning_rate": 0.0002, "loss": 0.3007, "step": 453 }, { "epoch": 0.670605612998523, "grad_norm": 0.2662009298801422, "learning_rate": 0.0002, "loss": 0.2571, "step": 454 }, { "epoch": 0.672082717872969, "grad_norm": 0.2844826579093933, "learning_rate": 0.0002, "loss": 0.3074, "step": 455 }, { "epoch": 0.6735598227474151, "grad_norm": 0.2758782207965851, "learning_rate": 0.0002, "loss": 0.2773, "step": 456 }, { "epoch": 0.6750369276218612, "grad_norm": 0.2567600607872009, "learning_rate": 0.0002, "loss": 0.2742, "step": 457 }, { "epoch": 0.6765140324963073, "grad_norm": 0.34004896879196167, "learning_rate": 0.0002, "loss": 0.2288, "step": 458 }, { "epoch": 0.6779911373707533, "grad_norm": 0.2983347475528717, "learning_rate": 0.0002, "loss": 0.2812, "step": 459 }, { "epoch": 0.6794682422451994, "grad_norm": 0.29728880524635315, "learning_rate": 0.0002, "loss": 0.307, "step": 460 }, { "epoch": 0.6809453471196455, "grad_norm": 0.31359198689460754, "learning_rate": 0.0002, "loss": 0.3101, "step": 461 }, { "epoch": 0.6824224519940916, "grad_norm": 0.27619168162345886, "learning_rate": 0.0002, "loss": 0.2779, "step": 462 }, { "epoch": 0.6838995568685377, "grad_norm": 0.3086981773376465, "learning_rate": 0.0002, "loss": 0.3097, "step": 463 }, { "epoch": 0.6853766617429837, "grad_norm": 0.25216472148895264, "learning_rate": 0.0002, "loss": 0.2468, "step": 464 }, { "epoch": 0.6868537666174298, "grad_norm": 0.26497989892959595, "learning_rate": 0.0002, "loss": 0.2528, "step": 465 }, { "epoch": 0.6883308714918759, "grad_norm": 0.27617159485816956, "learning_rate": 0.0002, "loss": 0.2749, "step": 466 }, { "epoch": 0.689807976366322, "grad_norm": 0.30501970648765564, "learning_rate": 0.0002, "loss": 0.3091, "step": 467 }, { "epoch": 0.691285081240768, "grad_norm": 0.3360370099544525, "learning_rate": 0.0002, "loss": 0.3565, "step": 468 }, { "epoch": 0.6927621861152142, "grad_norm": 0.27070116996765137, "learning_rate": 0.0002, "loss": 0.2705, "step": 469 }, { "epoch": 0.6942392909896603, "grad_norm": 0.29874977469444275, "learning_rate": 0.0002, "loss": 0.2996, "step": 470 }, { "epoch": 0.6957163958641064, "grad_norm": 0.294386088848114, "learning_rate": 0.0002, "loss": 0.2894, "step": 471 }, { "epoch": 0.6971935007385525, "grad_norm": 0.3233067989349365, "learning_rate": 0.0002, "loss": 0.3024, "step": 472 }, { "epoch": 0.6986706056129985, "grad_norm": 0.31051644682884216, "learning_rate": 0.0002, "loss": 0.3339, "step": 473 }, { "epoch": 0.7001477104874446, "grad_norm": 0.28541213274002075, "learning_rate": 0.0002, "loss": 0.3097, "step": 474 }, { "epoch": 0.7016248153618907, "grad_norm": 0.30758950114250183, "learning_rate": 0.0002, "loss": 0.3221, "step": 475 }, { "epoch": 0.7031019202363368, "grad_norm": 0.37882164120674133, "learning_rate": 0.0002, "loss": 0.2922, "step": 476 }, { "epoch": 0.7045790251107829, "grad_norm": 0.2521478533744812, "learning_rate": 0.0002, "loss": 0.2535, "step": 477 }, { "epoch": 0.7060561299852289, "grad_norm": 0.36088013648986816, "learning_rate": 0.0002, "loss": 0.2863, "step": 478 }, { "epoch": 0.707533234859675, "grad_norm": 0.31090793013572693, "learning_rate": 0.0002, "loss": 0.311, "step": 479 }, { "epoch": 0.7090103397341211, "grad_norm": 0.2360762357711792, "learning_rate": 0.0002, "loss": 0.216, "step": 480 }, { "epoch": 0.7104874446085672, "grad_norm": 0.34354060888290405, "learning_rate": 0.0002, "loss": 0.2838, "step": 481 }, { "epoch": 0.7119645494830132, "grad_norm": 0.2607513666152954, "learning_rate": 0.0002, "loss": 0.2527, "step": 482 }, { "epoch": 0.7134416543574594, "grad_norm": 0.3016189634799957, "learning_rate": 0.0002, "loss": 0.2782, "step": 483 }, { "epoch": 0.7149187592319055, "grad_norm": 3.6188247203826904, "learning_rate": 0.0002, "loss": 0.2884, "step": 484 }, { "epoch": 0.7163958641063516, "grad_norm": 0.3072677552700043, "learning_rate": 0.0002, "loss": 0.3263, "step": 485 }, { "epoch": 0.7178729689807977, "grad_norm": 0.28074517846107483, "learning_rate": 0.0002, "loss": 0.298, "step": 486 }, { "epoch": 0.7193500738552437, "grad_norm": 0.3235277831554413, "learning_rate": 0.0002, "loss": 0.2615, "step": 487 }, { "epoch": 0.7208271787296898, "grad_norm": 2.001945734024048, "learning_rate": 0.0002, "loss": 0.3925, "step": 488 }, { "epoch": 0.7223042836041359, "grad_norm": 0.29725533723831177, "learning_rate": 0.0002, "loss": 0.2874, "step": 489 }, { "epoch": 0.723781388478582, "grad_norm": 0.28706061840057373, "learning_rate": 0.0002, "loss": 0.2898, "step": 490 }, { "epoch": 0.725258493353028, "grad_norm": 0.2864967882633209, "learning_rate": 0.0002, "loss": 0.2894, "step": 491 }, { "epoch": 0.7267355982274741, "grad_norm": 0.2824801802635193, "learning_rate": 0.0002, "loss": 0.2706, "step": 492 }, { "epoch": 0.7282127031019202, "grad_norm": 0.27492067217826843, "learning_rate": 0.0002, "loss": 0.2516, "step": 493 }, { "epoch": 0.7296898079763663, "grad_norm": 0.2876488268375397, "learning_rate": 0.0002, "loss": 0.2862, "step": 494 }, { "epoch": 0.7311669128508124, "grad_norm": 0.30311787128448486, "learning_rate": 0.0002, "loss": 0.2953, "step": 495 }, { "epoch": 0.7326440177252584, "grad_norm": 0.277235209941864, "learning_rate": 0.0002, "loss": 0.2981, "step": 496 }, { "epoch": 0.7341211225997046, "grad_norm": 0.30590546131134033, "learning_rate": 0.0002, "loss": 0.3557, "step": 497 }, { "epoch": 0.7355982274741507, "grad_norm": 0.3205493986606598, "learning_rate": 0.0002, "loss": 0.3304, "step": 498 }, { "epoch": 0.7370753323485968, "grad_norm": 0.2640839219093323, "learning_rate": 0.0002, "loss": 0.2807, "step": 499 }, { "epoch": 0.7385524372230429, "grad_norm": 0.27507102489471436, "learning_rate": 0.0002, "loss": 0.2565, "step": 500 }, { "epoch": 0.740029542097489, "grad_norm": 0.2716003954410553, "learning_rate": 0.0002, "loss": 0.2684, "step": 501 }, { "epoch": 0.741506646971935, "grad_norm": 0.2893518805503845, "learning_rate": 0.0002, "loss": 0.2708, "step": 502 }, { "epoch": 0.7429837518463811, "grad_norm": 0.2790103256702423, "learning_rate": 0.0002, "loss": 0.2809, "step": 503 }, { "epoch": 0.7444608567208272, "grad_norm": 0.29344794154167175, "learning_rate": 0.0002, "loss": 0.2961, "step": 504 }, { "epoch": 0.7459379615952733, "grad_norm": 0.3118347227573395, "learning_rate": 0.0002, "loss": 0.3184, "step": 505 }, { "epoch": 0.7474150664697193, "grad_norm": 0.29491183161735535, "learning_rate": 0.0002, "loss": 0.3194, "step": 506 }, { "epoch": 0.7488921713441654, "grad_norm": 0.3007814586162567, "learning_rate": 0.0002, "loss": 0.3, "step": 507 }, { "epoch": 0.7503692762186115, "grad_norm": 0.3303704261779785, "learning_rate": 0.0002, "loss": 0.2544, "step": 508 }, { "epoch": 0.7518463810930576, "grad_norm": 0.28095510601997375, "learning_rate": 0.0002, "loss": 0.2774, "step": 509 }, { "epoch": 0.7533234859675036, "grad_norm": 0.2669844329357147, "learning_rate": 0.0002, "loss": 0.2575, "step": 510 }, { "epoch": 0.7548005908419497, "grad_norm": 0.29896053671836853, "learning_rate": 0.0002, "loss": 0.2823, "step": 511 }, { "epoch": 0.7562776957163959, "grad_norm": 0.27470019459724426, "learning_rate": 0.0002, "loss": 0.2843, "step": 512 }, { "epoch": 0.757754800590842, "grad_norm": 0.27731189131736755, "learning_rate": 0.0002, "loss": 0.2504, "step": 513 }, { "epoch": 0.7592319054652881, "grad_norm": 0.2816368639469147, "learning_rate": 0.0002, "loss": 0.2877, "step": 514 }, { "epoch": 0.7607090103397341, "grad_norm": 0.2858635485172272, "learning_rate": 0.0002, "loss": 0.3348, "step": 515 }, { "epoch": 0.7621861152141802, "grad_norm": 0.2964169979095459, "learning_rate": 0.0002, "loss": 0.2775, "step": 516 }, { "epoch": 0.7636632200886263, "grad_norm": 0.2534787058830261, "learning_rate": 0.0002, "loss": 0.2274, "step": 517 }, { "epoch": 0.7651403249630724, "grad_norm": 0.28982672095298767, "learning_rate": 0.0002, "loss": 0.2939, "step": 518 }, { "epoch": 0.7666174298375185, "grad_norm": 0.27323317527770996, "learning_rate": 0.0002, "loss": 0.2842, "step": 519 }, { "epoch": 0.7680945347119645, "grad_norm": 0.27642300724983215, "learning_rate": 0.0002, "loss": 0.2966, "step": 520 }, { "epoch": 0.7695716395864106, "grad_norm": 0.26599329710006714, "learning_rate": 0.0002, "loss": 0.2326, "step": 521 }, { "epoch": 0.7710487444608567, "grad_norm": 0.2631528079509735, "learning_rate": 0.0002, "loss": 0.2771, "step": 522 }, { "epoch": 0.7725258493353028, "grad_norm": 0.2790911793708801, "learning_rate": 0.0002, "loss": 0.2898, "step": 523 }, { "epoch": 0.7740029542097489, "grad_norm": 0.266379714012146, "learning_rate": 0.0002, "loss": 0.2685, "step": 524 }, { "epoch": 0.7754800590841949, "grad_norm": 0.30508288741111755, "learning_rate": 0.0002, "loss": 0.2909, "step": 525 }, { "epoch": 0.7769571639586411, "grad_norm": 0.2602393329143524, "learning_rate": 0.0002, "loss": 0.2305, "step": 526 }, { "epoch": 0.7784342688330872, "grad_norm": 0.3033619523048401, "learning_rate": 0.0002, "loss": 0.2689, "step": 527 }, { "epoch": 0.7799113737075333, "grad_norm": 0.2758871614933014, "learning_rate": 0.0002, "loss": 0.2631, "step": 528 }, { "epoch": 0.7813884785819794, "grad_norm": 0.2910580039024353, "learning_rate": 0.0002, "loss": 0.2844, "step": 529 }, { "epoch": 0.7828655834564254, "grad_norm": 0.33454883098602295, "learning_rate": 0.0002, "loss": 0.301, "step": 530 }, { "epoch": 0.7843426883308715, "grad_norm": 0.31416234374046326, "learning_rate": 0.0002, "loss": 0.2948, "step": 531 }, { "epoch": 0.7858197932053176, "grad_norm": 0.3144732117652893, "learning_rate": 0.0002, "loss": 0.2649, "step": 532 }, { "epoch": 0.7872968980797637, "grad_norm": 0.2666049599647522, "learning_rate": 0.0002, "loss": 0.2602, "step": 533 }, { "epoch": 0.7887740029542097, "grad_norm": 0.26852795481681824, "learning_rate": 0.0002, "loss": 0.2761, "step": 534 }, { "epoch": 0.7902511078286558, "grad_norm": 0.2828836143016815, "learning_rate": 0.0002, "loss": 0.2643, "step": 535 }, { "epoch": 0.7917282127031019, "grad_norm": 0.24941638112068176, "learning_rate": 0.0002, "loss": 0.2715, "step": 536 }, { "epoch": 0.793205317577548, "grad_norm": 0.28167465329170227, "learning_rate": 0.0002, "loss": 0.2886, "step": 537 }, { "epoch": 0.794682422451994, "grad_norm": 0.27295514941215515, "learning_rate": 0.0002, "loss": 0.2838, "step": 538 }, { "epoch": 0.7961595273264401, "grad_norm": 0.28401198983192444, "learning_rate": 0.0002, "loss": 0.3027, "step": 539 }, { "epoch": 0.7976366322008862, "grad_norm": 0.36002475023269653, "learning_rate": 0.0002, "loss": 0.2743, "step": 540 }, { "epoch": 0.7991137370753324, "grad_norm": 0.24884235858917236, "learning_rate": 0.0002, "loss": 0.2236, "step": 541 }, { "epoch": 0.8005908419497785, "grad_norm": 0.29792970418930054, "learning_rate": 0.0002, "loss": 0.2685, "step": 542 }, { "epoch": 0.8020679468242246, "grad_norm": 0.293630450963974, "learning_rate": 0.0002, "loss": 0.3121, "step": 543 }, { "epoch": 0.8035450516986706, "grad_norm": 0.30826666951179504, "learning_rate": 0.0002, "loss": 0.2886, "step": 544 }, { "epoch": 0.8050221565731167, "grad_norm": 0.2855941355228424, "learning_rate": 0.0002, "loss": 0.2947, "step": 545 }, { "epoch": 0.8064992614475628, "grad_norm": 0.2649870812892914, "learning_rate": 0.0002, "loss": 0.2655, "step": 546 }, { "epoch": 0.8079763663220089, "grad_norm": 0.27176880836486816, "learning_rate": 0.0002, "loss": 0.2715, "step": 547 }, { "epoch": 0.8094534711964549, "grad_norm": 0.3225911557674408, "learning_rate": 0.0002, "loss": 0.3404, "step": 548 }, { "epoch": 0.810930576070901, "grad_norm": 0.30113476514816284, "learning_rate": 0.0002, "loss": 0.3347, "step": 549 }, { "epoch": 0.8124076809453471, "grad_norm": 0.2784980535507202, "learning_rate": 0.0002, "loss": 0.2599, "step": 550 }, { "epoch": 0.8138847858197932, "grad_norm": 0.2825387716293335, "learning_rate": 0.0002, "loss": 0.2759, "step": 551 }, { "epoch": 0.8153618906942393, "grad_norm": 0.26612088084220886, "learning_rate": 0.0002, "loss": 0.2464, "step": 552 }, { "epoch": 0.8168389955686853, "grad_norm": 0.2672181725502014, "learning_rate": 0.0002, "loss": 0.2182, "step": 553 }, { "epoch": 0.8183161004431314, "grad_norm": 0.28279784321784973, "learning_rate": 0.0002, "loss": 0.2612, "step": 554 }, { "epoch": 0.8197932053175776, "grad_norm": 0.277281790971756, "learning_rate": 0.0002, "loss": 0.2647, "step": 555 }, { "epoch": 0.8212703101920237, "grad_norm": 0.2784774899482727, "learning_rate": 0.0002, "loss": 0.2738, "step": 556 }, { "epoch": 0.8227474150664698, "grad_norm": 0.2438610941171646, "learning_rate": 0.0002, "loss": 0.2274, "step": 557 }, { "epoch": 0.8242245199409158, "grad_norm": 0.28168389201164246, "learning_rate": 0.0002, "loss": 0.2604, "step": 558 }, { "epoch": 0.8257016248153619, "grad_norm": 0.26112061738967896, "learning_rate": 0.0002, "loss": 0.2215, "step": 559 }, { "epoch": 0.827178729689808, "grad_norm": 0.25962984561920166, "learning_rate": 0.0002, "loss": 0.2834, "step": 560 }, { "epoch": 0.8286558345642541, "grad_norm": 0.27150726318359375, "learning_rate": 0.0002, "loss": 0.2877, "step": 561 }, { "epoch": 0.8301329394387001, "grad_norm": 0.2753923237323761, "learning_rate": 0.0002, "loss": 0.298, "step": 562 }, { "epoch": 0.8316100443131462, "grad_norm": 0.37228959798812866, "learning_rate": 0.0002, "loss": 0.333, "step": 563 }, { "epoch": 0.8330871491875923, "grad_norm": 0.27188584208488464, "learning_rate": 0.0002, "loss": 0.2578, "step": 564 }, { "epoch": 0.8345642540620384, "grad_norm": 0.2894970178604126, "learning_rate": 0.0002, "loss": 0.3051, "step": 565 }, { "epoch": 0.8360413589364845, "grad_norm": 0.2769443690776825, "learning_rate": 0.0002, "loss": 0.2833, "step": 566 }, { "epoch": 0.8375184638109305, "grad_norm": 0.25693845748901367, "learning_rate": 0.0002, "loss": 0.2571, "step": 567 }, { "epoch": 0.8389955686853766, "grad_norm": 0.27856937050819397, "learning_rate": 0.0002, "loss": 0.2942, "step": 568 }, { "epoch": 0.8404726735598228, "grad_norm": 0.2575175166130066, "learning_rate": 0.0002, "loss": 0.2733, "step": 569 }, { "epoch": 0.8419497784342689, "grad_norm": 0.27574828267097473, "learning_rate": 0.0002, "loss": 0.2642, "step": 570 }, { "epoch": 0.843426883308715, "grad_norm": 0.2522878646850586, "learning_rate": 0.0002, "loss": 0.2377, "step": 571 }, { "epoch": 0.844903988183161, "grad_norm": 0.26878973841667175, "learning_rate": 0.0002, "loss": 0.2635, "step": 572 }, { "epoch": 0.8463810930576071, "grad_norm": 0.25874340534210205, "learning_rate": 0.0002, "loss": 0.2622, "step": 573 }, { "epoch": 0.8478581979320532, "grad_norm": 0.2808675765991211, "learning_rate": 0.0002, "loss": 0.2679, "step": 574 }, { "epoch": 0.8493353028064993, "grad_norm": 0.3035877048969269, "learning_rate": 0.0002, "loss": 0.3097, "step": 575 }, { "epoch": 0.8508124076809453, "grad_norm": 0.2748059928417206, "learning_rate": 0.0002, "loss": 0.2986, "step": 576 }, { "epoch": 0.8522895125553914, "grad_norm": 0.2966136932373047, "learning_rate": 0.0002, "loss": 0.2799, "step": 577 }, { "epoch": 0.8537666174298375, "grad_norm": 1.3606016635894775, "learning_rate": 0.0002, "loss": 0.2808, "step": 578 }, { "epoch": 0.8552437223042836, "grad_norm": 0.2695050835609436, "learning_rate": 0.0002, "loss": 0.2417, "step": 579 }, { "epoch": 0.8567208271787297, "grad_norm": 0.26403385400772095, "learning_rate": 0.0002, "loss": 0.2474, "step": 580 }, { "epoch": 0.8581979320531757, "grad_norm": 0.2719348669052124, "learning_rate": 0.0002, "loss": 0.2558, "step": 581 }, { "epoch": 0.8596750369276218, "grad_norm": 0.2620692253112793, "learning_rate": 0.0002, "loss": 0.2704, "step": 582 }, { "epoch": 0.8611521418020679, "grad_norm": 0.3160097897052765, "learning_rate": 0.0002, "loss": 0.2967, "step": 583 }, { "epoch": 0.8626292466765141, "grad_norm": 0.27527111768722534, "learning_rate": 0.0002, "loss": 0.2508, "step": 584 }, { "epoch": 0.8641063515509602, "grad_norm": 0.27846094965934753, "learning_rate": 0.0002, "loss": 0.2766, "step": 585 }, { "epoch": 0.8655834564254062, "grad_norm": 0.2789734899997711, "learning_rate": 0.0002, "loss": 0.2857, "step": 586 }, { "epoch": 0.8670605612998523, "grad_norm": 0.30942806601524353, "learning_rate": 0.0002, "loss": 0.3216, "step": 587 }, { "epoch": 0.8685376661742984, "grad_norm": 0.27556589245796204, "learning_rate": 0.0002, "loss": 0.257, "step": 588 }, { "epoch": 0.8700147710487445, "grad_norm": 0.2209852784872055, "learning_rate": 0.0002, "loss": 0.1862, "step": 589 }, { "epoch": 0.8714918759231906, "grad_norm": 0.29638856649398804, "learning_rate": 0.0002, "loss": 0.3049, "step": 590 }, { "epoch": 0.8729689807976366, "grad_norm": 0.309600830078125, "learning_rate": 0.0002, "loss": 0.2675, "step": 591 }, { "epoch": 0.8744460856720827, "grad_norm": 0.28644561767578125, "learning_rate": 0.0002, "loss": 0.2859, "step": 592 }, { "epoch": 0.8759231905465288, "grad_norm": 0.25827983021736145, "learning_rate": 0.0002, "loss": 0.2543, "step": 593 }, { "epoch": 0.8774002954209749, "grad_norm": 0.2538520395755768, "learning_rate": 0.0002, "loss": 0.2523, "step": 594 }, { "epoch": 0.8788774002954209, "grad_norm": 0.26979878544807434, "learning_rate": 0.0002, "loss": 0.2379, "step": 595 }, { "epoch": 0.880354505169867, "grad_norm": 0.2815455496311188, "learning_rate": 0.0002, "loss": 0.2793, "step": 596 }, { "epoch": 0.8818316100443131, "grad_norm": 0.2549828588962555, "learning_rate": 0.0002, "loss": 0.2743, "step": 597 }, { "epoch": 0.8833087149187593, "grad_norm": 0.24497728049755096, "learning_rate": 0.0002, "loss": 0.2234, "step": 598 }, { "epoch": 0.8847858197932054, "grad_norm": 0.2854422628879547, "learning_rate": 0.0002, "loss": 0.2858, "step": 599 }, { "epoch": 0.8862629246676514, "grad_norm": 0.2807024121284485, "learning_rate": 0.0002, "loss": 0.2608, "step": 600 }, { "epoch": 0.8877400295420975, "grad_norm": 0.2663458585739136, "learning_rate": 0.0002, "loss": 0.2399, "step": 601 }, { "epoch": 0.8892171344165436, "grad_norm": 0.2760714888572693, "learning_rate": 0.0002, "loss": 0.2934, "step": 602 }, { "epoch": 0.8906942392909897, "grad_norm": 0.3003925383090973, "learning_rate": 0.0002, "loss": 0.2765, "step": 603 }, { "epoch": 0.8921713441654358, "grad_norm": 0.32742151618003845, "learning_rate": 0.0002, "loss": 0.3116, "step": 604 }, { "epoch": 0.8936484490398818, "grad_norm": 0.29396241903305054, "learning_rate": 0.0002, "loss": 0.2548, "step": 605 }, { "epoch": 0.8951255539143279, "grad_norm": 0.28835952281951904, "learning_rate": 0.0002, "loss": 0.2665, "step": 606 }, { "epoch": 0.896602658788774, "grad_norm": 0.2689400017261505, "learning_rate": 0.0002, "loss": 0.2554, "step": 607 }, { "epoch": 0.8980797636632201, "grad_norm": 0.27350932359695435, "learning_rate": 0.0002, "loss": 0.2474, "step": 608 }, { "epoch": 0.8995568685376661, "grad_norm": 0.26769059896469116, "learning_rate": 0.0002, "loss": 0.286, "step": 609 }, { "epoch": 0.9010339734121122, "grad_norm": 0.25921839475631714, "learning_rate": 0.0002, "loss": 0.2438, "step": 610 }, { "epoch": 0.9025110782865583, "grad_norm": 0.26628950238227844, "learning_rate": 0.0002, "loss": 0.2713, "step": 611 }, { "epoch": 0.9039881831610044, "grad_norm": 0.26283326745033264, "learning_rate": 0.0002, "loss": 0.2696, "step": 612 }, { "epoch": 0.9054652880354506, "grad_norm": 0.29980388283729553, "learning_rate": 0.0002, "loss": 0.2581, "step": 613 }, { "epoch": 0.9069423929098966, "grad_norm": 0.2768777310848236, "learning_rate": 0.0002, "loss": 0.2853, "step": 614 }, { "epoch": 0.9084194977843427, "grad_norm": 0.27376455068588257, "learning_rate": 0.0002, "loss": 0.2836, "step": 615 }, { "epoch": 0.9098966026587888, "grad_norm": 0.28933191299438477, "learning_rate": 0.0002, "loss": 0.3071, "step": 616 }, { "epoch": 0.9113737075332349, "grad_norm": 0.3081536293029785, "learning_rate": 0.0002, "loss": 0.2461, "step": 617 }, { "epoch": 0.912850812407681, "grad_norm": 0.2886345386505127, "learning_rate": 0.0002, "loss": 0.3, "step": 618 }, { "epoch": 0.914327917282127, "grad_norm": 0.2829267680644989, "learning_rate": 0.0002, "loss": 0.2746, "step": 619 }, { "epoch": 0.9158050221565731, "grad_norm": 0.2512478232383728, "learning_rate": 0.0002, "loss": 0.2435, "step": 620 }, { "epoch": 0.9172821270310192, "grad_norm": 0.4229198694229126, "learning_rate": 0.0002, "loss": 0.2847, "step": 621 }, { "epoch": 0.9187592319054653, "grad_norm": 0.2993115186691284, "learning_rate": 0.0002, "loss": 0.2869, "step": 622 }, { "epoch": 0.9202363367799113, "grad_norm": 0.2935909330844879, "learning_rate": 0.0002, "loss": 0.2691, "step": 623 }, { "epoch": 0.9217134416543574, "grad_norm": 0.3156206011772156, "learning_rate": 0.0002, "loss": 0.2936, "step": 624 }, { "epoch": 0.9231905465288035, "grad_norm": 0.2829430401325226, "learning_rate": 0.0002, "loss": 0.2782, "step": 625 }, { "epoch": 0.9246676514032496, "grad_norm": 0.2769679129123688, "learning_rate": 0.0002, "loss": 0.2751, "step": 626 }, { "epoch": 0.9261447562776958, "grad_norm": 0.2695547044277191, "learning_rate": 0.0002, "loss": 0.2768, "step": 627 }, { "epoch": 0.9276218611521418, "grad_norm": 0.2564750909805298, "learning_rate": 0.0002, "loss": 0.2691, "step": 628 }, { "epoch": 0.9290989660265879, "grad_norm": 0.3216243386268616, "learning_rate": 0.0002, "loss": 0.2895, "step": 629 }, { "epoch": 0.930576070901034, "grad_norm": 0.26920050382614136, "learning_rate": 0.0002, "loss": 0.2943, "step": 630 }, { "epoch": 0.9320531757754801, "grad_norm": 0.23926717042922974, "learning_rate": 0.0002, "loss": 0.2114, "step": 631 }, { "epoch": 0.9335302806499262, "grad_norm": 0.3015134036540985, "learning_rate": 0.0002, "loss": 0.2889, "step": 632 }, { "epoch": 0.9350073855243722, "grad_norm": 0.29262953996658325, "learning_rate": 0.0002, "loss": 0.2977, "step": 633 }, { "epoch": 0.9364844903988183, "grad_norm": 0.27330338954925537, "learning_rate": 0.0002, "loss": 0.2706, "step": 634 }, { "epoch": 0.9379615952732644, "grad_norm": 0.2691650092601776, "learning_rate": 0.0002, "loss": 0.2471, "step": 635 }, { "epoch": 0.9394387001477105, "grad_norm": 0.30574268102645874, "learning_rate": 0.0002, "loss": 0.2977, "step": 636 }, { "epoch": 0.9409158050221565, "grad_norm": 0.2739352881908417, "learning_rate": 0.0002, "loss": 0.2825, "step": 637 }, { "epoch": 0.9423929098966026, "grad_norm": 0.3041648268699646, "learning_rate": 0.0002, "loss": 0.3344, "step": 638 }, { "epoch": 0.9438700147710487, "grad_norm": 0.2827674150466919, "learning_rate": 0.0002, "loss": 0.2618, "step": 639 }, { "epoch": 0.9453471196454948, "grad_norm": 0.25939705967903137, "learning_rate": 0.0002, "loss": 0.2279, "step": 640 }, { "epoch": 0.946824224519941, "grad_norm": 0.3013932406902313, "learning_rate": 0.0002, "loss": 0.3114, "step": 641 }, { "epoch": 0.948301329394387, "grad_norm": 4.885525703430176, "learning_rate": 0.0002, "loss": 0.2818, "step": 642 }, { "epoch": 0.9497784342688331, "grad_norm": 0.25590044260025024, "learning_rate": 0.0002, "loss": 0.2427, "step": 643 }, { "epoch": 0.9512555391432792, "grad_norm": 0.2372172772884369, "learning_rate": 0.0002, "loss": 0.2397, "step": 644 }, { "epoch": 0.9527326440177253, "grad_norm": 0.26376283168792725, "learning_rate": 0.0002, "loss": 0.2624, "step": 645 }, { "epoch": 0.9542097488921714, "grad_norm": 0.27342459559440613, "learning_rate": 0.0002, "loss": 0.2652, "step": 646 }, { "epoch": 0.9556868537666174, "grad_norm": 0.260745644569397, "learning_rate": 0.0002, "loss": 0.2523, "step": 647 }, { "epoch": 0.9571639586410635, "grad_norm": 0.24873754382133484, "learning_rate": 0.0002, "loss": 0.2238, "step": 648 }, { "epoch": 0.9586410635155096, "grad_norm": 0.281990110874176, "learning_rate": 0.0002, "loss": 0.235, "step": 649 }, { "epoch": 0.9601181683899557, "grad_norm": 0.25676026940345764, "learning_rate": 0.0002, "loss": 0.215, "step": 650 }, { "epoch": 0.9615952732644018, "grad_norm": 0.2927687466144562, "learning_rate": 0.0002, "loss": 0.2764, "step": 651 }, { "epoch": 0.9630723781388478, "grad_norm": 0.26322099566459656, "learning_rate": 0.0002, "loss": 0.2511, "step": 652 }, { "epoch": 0.9645494830132939, "grad_norm": 0.2764233350753784, "learning_rate": 0.0002, "loss": 0.2439, "step": 653 }, { "epoch": 0.96602658788774, "grad_norm": 0.29849788546562195, "learning_rate": 0.0002, "loss": 0.2689, "step": 654 }, { "epoch": 0.9675036927621861, "grad_norm": 0.2834247648715973, "learning_rate": 0.0002, "loss": 0.3398, "step": 655 }, { "epoch": 0.9689807976366323, "grad_norm": 0.28436174988746643, "learning_rate": 0.0002, "loss": 0.3086, "step": 656 }, { "epoch": 0.9704579025110783, "grad_norm": 0.24340803921222687, "learning_rate": 0.0002, "loss": 0.2288, "step": 657 }, { "epoch": 0.9719350073855244, "grad_norm": 0.2577742338180542, "learning_rate": 0.0002, "loss": 0.2598, "step": 658 }, { "epoch": 0.9734121122599705, "grad_norm": 0.28326281905174255, "learning_rate": 0.0002, "loss": 0.2862, "step": 659 }, { "epoch": 0.9748892171344166, "grad_norm": 0.27066269516944885, "learning_rate": 0.0002, "loss": 0.2585, "step": 660 }, { "epoch": 0.9763663220088626, "grad_norm": 0.26694634556770325, "learning_rate": 0.0002, "loss": 0.2788, "step": 661 }, { "epoch": 0.9778434268833087, "grad_norm": 0.2890130281448364, "learning_rate": 0.0002, "loss": 0.3073, "step": 662 }, { "epoch": 0.9793205317577548, "grad_norm": 0.26095882058143616, "learning_rate": 0.0002, "loss": 0.2184, "step": 663 }, { "epoch": 0.9807976366322009, "grad_norm": 0.2648635506629944, "learning_rate": 0.0002, "loss": 0.2465, "step": 664 }, { "epoch": 0.982274741506647, "grad_norm": 0.2354656457901001, "learning_rate": 0.0002, "loss": 0.2457, "step": 665 }, { "epoch": 0.983751846381093, "grad_norm": 0.2767215669155121, "learning_rate": 0.0002, "loss": 0.2735, "step": 666 }, { "epoch": 0.9852289512555391, "grad_norm": 0.27141231298446655, "learning_rate": 0.0002, "loss": 0.2589, "step": 667 }, { "epoch": 0.9867060561299852, "grad_norm": 0.254549115896225, "learning_rate": 0.0002, "loss": 0.2785, "step": 668 }, { "epoch": 0.9881831610044313, "grad_norm": 0.2712014317512512, "learning_rate": 0.0002, "loss": 0.2579, "step": 669 }, { "epoch": 0.9896602658788775, "grad_norm": 0.26712852716445923, "learning_rate": 0.0002, "loss": 0.3165, "step": 670 }, { "epoch": 0.9911373707533235, "grad_norm": 0.2829815447330475, "learning_rate": 0.0002, "loss": 0.2438, "step": 671 }, { "epoch": 0.9926144756277696, "grad_norm": 0.27326712012290955, "learning_rate": 0.0002, "loss": 0.2622, "step": 672 }, { "epoch": 0.9940915805022157, "grad_norm": 0.2569233179092407, "learning_rate": 0.0002, "loss": 0.2353, "step": 673 }, { "epoch": 0.9955686853766618, "grad_norm": 0.28441140055656433, "learning_rate": 0.0002, "loss": 0.2732, "step": 674 }, { "epoch": 0.9970457902511078, "grad_norm": 0.2831505835056305, "learning_rate": 0.0002, "loss": 0.2605, "step": 675 }, { "epoch": 0.9985228951255539, "grad_norm": 0.26520466804504395, "learning_rate": 0.0002, "loss": 0.2614, "step": 676 }, { "epoch": 1.0, "grad_norm": 0.7976667881011963, "learning_rate": 0.0002, "loss": 0.3423, "step": 677 }, { "epoch": 1.0, "step": 677, "total_flos": 1.1323313955746611e+17, "train_loss": 0.3172708253432588, "train_runtime": 2971.8662, "train_samples_per_second": 1.82, "train_steps_per_second": 0.228 } ], "logging_steps": 1, "max_steps": 677, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1323313955746611e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }