{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 677,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014771048744460858,
"grad_norm": 2.1287364959716797,
"learning_rate": 0.0002,
"loss": 1.4334,
"step": 1
},
{
"epoch": 0.0029542097488921715,
"grad_norm": 1.5033221244812012,
"learning_rate": 0.0002,
"loss": 1.1642,
"step": 2
},
{
"epoch": 0.004431314623338257,
"grad_norm": 1.5286452770233154,
"learning_rate": 0.0002,
"loss": 0.9292,
"step": 3
},
{
"epoch": 0.005908419497784343,
"grad_norm": 1.2362889051437378,
"learning_rate": 0.0002,
"loss": 0.7608,
"step": 4
},
{
"epoch": 0.007385524372230428,
"grad_norm": 1.0593241453170776,
"learning_rate": 0.0002,
"loss": 0.6653,
"step": 5
},
{
"epoch": 0.008862629246676515,
"grad_norm": 1.0034171342849731,
"learning_rate": 0.0002,
"loss": 0.5738,
"step": 6
},
{
"epoch": 0.0103397341211226,
"grad_norm": 0.723822832107544,
"learning_rate": 0.0002,
"loss": 0.5328,
"step": 7
},
{
"epoch": 0.011816838995568686,
"grad_norm": 0.7309075593948364,
"learning_rate": 0.0002,
"loss": 0.5088,
"step": 8
},
{
"epoch": 0.013293943870014771,
"grad_norm": 0.6442256569862366,
"learning_rate": 0.0002,
"loss": 0.4999,
"step": 9
},
{
"epoch": 0.014771048744460856,
"grad_norm": 0.6145352721214294,
"learning_rate": 0.0002,
"loss": 0.5046,
"step": 10
},
{
"epoch": 0.01624815361890694,
"grad_norm": 0.5789129734039307,
"learning_rate": 0.0002,
"loss": 0.489,
"step": 11
},
{
"epoch": 0.01772525849335303,
"grad_norm": 0.5824376940727234,
"learning_rate": 0.0002,
"loss": 0.5328,
"step": 12
},
{
"epoch": 0.019202363367799114,
"grad_norm": 0.5699394941329956,
"learning_rate": 0.0002,
"loss": 0.4755,
"step": 13
},
{
"epoch": 0.0206794682422452,
"grad_norm": 0.5292893052101135,
"learning_rate": 0.0002,
"loss": 0.4108,
"step": 14
},
{
"epoch": 0.022156573116691284,
"grad_norm": 0.5537489056587219,
"learning_rate": 0.0002,
"loss": 0.4807,
"step": 15
},
{
"epoch": 0.023633677991137372,
"grad_norm": 0.546784520149231,
"learning_rate": 0.0002,
"loss": 0.4427,
"step": 16
},
{
"epoch": 0.025110782865583457,
"grad_norm": 0.5094020962715149,
"learning_rate": 0.0002,
"loss": 0.4617,
"step": 17
},
{
"epoch": 0.026587887740029542,
"grad_norm": 0.549403190612793,
"learning_rate": 0.0002,
"loss": 0.452,
"step": 18
},
{
"epoch": 0.028064992614475627,
"grad_norm": 0.47281214594841003,
"learning_rate": 0.0002,
"loss": 0.3916,
"step": 19
},
{
"epoch": 0.029542097488921712,
"grad_norm": 0.4933842122554779,
"learning_rate": 0.0002,
"loss": 0.4344,
"step": 20
},
{
"epoch": 0.0310192023633678,
"grad_norm": 0.5650342106819153,
"learning_rate": 0.0002,
"loss": 0.5192,
"step": 21
},
{
"epoch": 0.03249630723781388,
"grad_norm": 0.5102580189704895,
"learning_rate": 0.0002,
"loss": 0.4521,
"step": 22
},
{
"epoch": 0.033973412112259974,
"grad_norm": 0.47124335169792175,
"learning_rate": 0.0002,
"loss": 0.3719,
"step": 23
},
{
"epoch": 0.03545051698670606,
"grad_norm": 0.4769236445426941,
"learning_rate": 0.0002,
"loss": 0.4359,
"step": 24
},
{
"epoch": 0.03692762186115214,
"grad_norm": 0.49603205919265747,
"learning_rate": 0.0002,
"loss": 0.438,
"step": 25
},
{
"epoch": 0.03840472673559823,
"grad_norm": 0.42155203223228455,
"learning_rate": 0.0002,
"loss": 0.3311,
"step": 26
},
{
"epoch": 0.03988183161004431,
"grad_norm": 0.4394625723361969,
"learning_rate": 0.0002,
"loss": 0.4033,
"step": 27
},
{
"epoch": 0.0413589364844904,
"grad_norm": 0.4578387141227722,
"learning_rate": 0.0002,
"loss": 0.399,
"step": 28
},
{
"epoch": 0.04283604135893648,
"grad_norm": 0.4147898256778717,
"learning_rate": 0.0002,
"loss": 0.3599,
"step": 29
},
{
"epoch": 0.04431314623338257,
"grad_norm": 0.47084635496139526,
"learning_rate": 0.0002,
"loss": 0.4668,
"step": 30
},
{
"epoch": 0.04579025110782865,
"grad_norm": 0.399994820356369,
"learning_rate": 0.0002,
"loss": 0.3108,
"step": 31
},
{
"epoch": 0.047267355982274745,
"grad_norm": 0.4256761074066162,
"learning_rate": 0.0002,
"loss": 0.3928,
"step": 32
},
{
"epoch": 0.04874446085672083,
"grad_norm": 0.4237106442451477,
"learning_rate": 0.0002,
"loss": 0.4036,
"step": 33
},
{
"epoch": 0.050221565731166914,
"grad_norm": 0.4622955024242401,
"learning_rate": 0.0002,
"loss": 0.4394,
"step": 34
},
{
"epoch": 0.051698670605613,
"grad_norm": 0.8845525979995728,
"learning_rate": 0.0002,
"loss": 0.3714,
"step": 35
},
{
"epoch": 0.053175775480059084,
"grad_norm": 0.3846614360809326,
"learning_rate": 0.0002,
"loss": 0.3625,
"step": 36
},
{
"epoch": 0.05465288035450517,
"grad_norm": 0.41804981231689453,
"learning_rate": 0.0002,
"loss": 0.4027,
"step": 37
},
{
"epoch": 0.056129985228951254,
"grad_norm": 0.3947773575782776,
"learning_rate": 0.0002,
"loss": 0.3523,
"step": 38
},
{
"epoch": 0.05760709010339734,
"grad_norm": 0.3716173470020294,
"learning_rate": 0.0002,
"loss": 0.3333,
"step": 39
},
{
"epoch": 0.059084194977843424,
"grad_norm": 0.4511498808860779,
"learning_rate": 0.0002,
"loss": 0.4104,
"step": 40
},
{
"epoch": 0.060561299852289516,
"grad_norm": 0.4428117573261261,
"learning_rate": 0.0002,
"loss": 0.4217,
"step": 41
},
{
"epoch": 0.0620384047267356,
"grad_norm": 0.4312277138233185,
"learning_rate": 0.0002,
"loss": 0.4458,
"step": 42
},
{
"epoch": 0.06351550960118169,
"grad_norm": 0.4207220673561096,
"learning_rate": 0.0002,
"loss": 0.4206,
"step": 43
},
{
"epoch": 0.06499261447562776,
"grad_norm": 0.4463505744934082,
"learning_rate": 0.0002,
"loss": 0.3911,
"step": 44
},
{
"epoch": 0.06646971935007386,
"grad_norm": 0.4605293869972229,
"learning_rate": 0.0002,
"loss": 0.4154,
"step": 45
},
{
"epoch": 0.06794682422451995,
"grad_norm": 0.380751371383667,
"learning_rate": 0.0002,
"loss": 0.3556,
"step": 46
},
{
"epoch": 0.06942392909896603,
"grad_norm": 0.3776094615459442,
"learning_rate": 0.0002,
"loss": 0.3275,
"step": 47
},
{
"epoch": 0.07090103397341212,
"grad_norm": 0.39152535796165466,
"learning_rate": 0.0002,
"loss": 0.3749,
"step": 48
},
{
"epoch": 0.0723781388478582,
"grad_norm": 0.4888671338558197,
"learning_rate": 0.0002,
"loss": 0.4408,
"step": 49
},
{
"epoch": 0.07385524372230429,
"grad_norm": 0.38958850502967834,
"learning_rate": 0.0002,
"loss": 0.3551,
"step": 50
},
{
"epoch": 0.07533234859675036,
"grad_norm": 0.39890560507774353,
"learning_rate": 0.0002,
"loss": 0.387,
"step": 51
},
{
"epoch": 0.07680945347119646,
"grad_norm": 0.4128841757774353,
"learning_rate": 0.0002,
"loss": 0.3945,
"step": 52
},
{
"epoch": 0.07828655834564253,
"grad_norm": 0.45516759157180786,
"learning_rate": 0.0002,
"loss": 0.4049,
"step": 53
},
{
"epoch": 0.07976366322008863,
"grad_norm": 0.4038144648075104,
"learning_rate": 0.0002,
"loss": 0.3789,
"step": 54
},
{
"epoch": 0.08124076809453472,
"grad_norm": 0.37849175930023193,
"learning_rate": 0.0002,
"loss": 0.3955,
"step": 55
},
{
"epoch": 0.0827178729689808,
"grad_norm": 0.4295189082622528,
"learning_rate": 0.0002,
"loss": 0.4112,
"step": 56
},
{
"epoch": 0.08419497784342689,
"grad_norm": 0.4347020387649536,
"learning_rate": 0.0002,
"loss": 0.4542,
"step": 57
},
{
"epoch": 0.08567208271787297,
"grad_norm": 0.41407692432403564,
"learning_rate": 0.0002,
"loss": 0.4035,
"step": 58
},
{
"epoch": 0.08714918759231906,
"grad_norm": 0.33283814787864685,
"learning_rate": 0.0002,
"loss": 0.2851,
"step": 59
},
{
"epoch": 0.08862629246676514,
"grad_norm": 0.39427581429481506,
"learning_rate": 0.0002,
"loss": 0.4494,
"step": 60
},
{
"epoch": 0.09010339734121123,
"grad_norm": 1.3357727527618408,
"learning_rate": 0.0002,
"loss": 0.3335,
"step": 61
},
{
"epoch": 0.0915805022156573,
"grad_norm": 0.37050360441207886,
"learning_rate": 0.0002,
"loss": 0.3224,
"step": 62
},
{
"epoch": 0.0930576070901034,
"grad_norm": 0.36000698804855347,
"learning_rate": 0.0002,
"loss": 0.3679,
"step": 63
},
{
"epoch": 0.09453471196454949,
"grad_norm": 0.3739371597766876,
"learning_rate": 0.0002,
"loss": 0.4041,
"step": 64
},
{
"epoch": 0.09601181683899557,
"grad_norm": 0.3365491032600403,
"learning_rate": 0.0002,
"loss": 0.3462,
"step": 65
},
{
"epoch": 0.09748892171344166,
"grad_norm": 0.3357471823692322,
"learning_rate": 0.0002,
"loss": 0.3416,
"step": 66
},
{
"epoch": 0.09896602658788774,
"grad_norm": 0.38020288944244385,
"learning_rate": 0.0002,
"loss": 0.3521,
"step": 67
},
{
"epoch": 0.10044313146233383,
"grad_norm": 0.37143656611442566,
"learning_rate": 0.0002,
"loss": 0.3873,
"step": 68
},
{
"epoch": 0.1019202363367799,
"grad_norm": 0.3613298535346985,
"learning_rate": 0.0002,
"loss": 0.3695,
"step": 69
},
{
"epoch": 0.103397341211226,
"grad_norm": 0.3881225287914276,
"learning_rate": 0.0002,
"loss": 0.3686,
"step": 70
},
{
"epoch": 0.10487444608567208,
"grad_norm": 0.35213181376457214,
"learning_rate": 0.0002,
"loss": 0.3276,
"step": 71
},
{
"epoch": 0.10635155096011817,
"grad_norm": 0.3477317988872528,
"learning_rate": 0.0002,
"loss": 0.3261,
"step": 72
},
{
"epoch": 0.10782865583456426,
"grad_norm": 0.326730340719223,
"learning_rate": 0.0002,
"loss": 0.2784,
"step": 73
},
{
"epoch": 0.10930576070901034,
"grad_norm": 0.3316071629524231,
"learning_rate": 0.0002,
"loss": 0.3317,
"step": 74
},
{
"epoch": 0.11078286558345643,
"grad_norm": 0.37388283014297485,
"learning_rate": 0.0002,
"loss": 0.3845,
"step": 75
},
{
"epoch": 0.11225997045790251,
"grad_norm": 0.39761313796043396,
"learning_rate": 0.0002,
"loss": 0.4043,
"step": 76
},
{
"epoch": 0.1137370753323486,
"grad_norm": 0.35033172369003296,
"learning_rate": 0.0002,
"loss": 0.3212,
"step": 77
},
{
"epoch": 0.11521418020679468,
"grad_norm": 0.7551948428153992,
"learning_rate": 0.0002,
"loss": 0.3387,
"step": 78
},
{
"epoch": 0.11669128508124077,
"grad_norm": 0.2940291166305542,
"learning_rate": 0.0002,
"loss": 0.2742,
"step": 79
},
{
"epoch": 0.11816838995568685,
"grad_norm": 0.4048764407634735,
"learning_rate": 0.0002,
"loss": 0.4176,
"step": 80
},
{
"epoch": 0.11964549483013294,
"grad_norm": 0.36520177125930786,
"learning_rate": 0.0002,
"loss": 0.317,
"step": 81
},
{
"epoch": 0.12112259970457903,
"grad_norm": 0.3602144718170166,
"learning_rate": 0.0002,
"loss": 0.3648,
"step": 82
},
{
"epoch": 0.12259970457902511,
"grad_norm": 0.34669214487075806,
"learning_rate": 0.0002,
"loss": 0.3389,
"step": 83
},
{
"epoch": 0.1240768094534712,
"grad_norm": 0.34198257327079773,
"learning_rate": 0.0002,
"loss": 0.3174,
"step": 84
},
{
"epoch": 0.1255539143279173,
"grad_norm": 0.3409755825996399,
"learning_rate": 0.0002,
"loss": 0.3376,
"step": 85
},
{
"epoch": 0.12703101920236337,
"grad_norm": 0.38363194465637207,
"learning_rate": 0.0002,
"loss": 0.4002,
"step": 86
},
{
"epoch": 0.12850812407680945,
"grad_norm": 0.35614731907844543,
"learning_rate": 0.0002,
"loss": 0.3581,
"step": 87
},
{
"epoch": 0.12998522895125553,
"grad_norm": 0.3808327615261078,
"learning_rate": 0.0002,
"loss": 0.3966,
"step": 88
},
{
"epoch": 0.13146233382570163,
"grad_norm": 0.3924517035484314,
"learning_rate": 0.0002,
"loss": 0.4161,
"step": 89
},
{
"epoch": 0.1329394387001477,
"grad_norm": 0.3589531183242798,
"learning_rate": 0.0002,
"loss": 0.3233,
"step": 90
},
{
"epoch": 0.1344165435745938,
"grad_norm": 0.37429341673851013,
"learning_rate": 0.0002,
"loss": 0.3778,
"step": 91
},
{
"epoch": 0.1358936484490399,
"grad_norm": 0.3594294488430023,
"learning_rate": 0.0002,
"loss": 0.3472,
"step": 92
},
{
"epoch": 0.13737075332348597,
"grad_norm": 0.3481505215167999,
"learning_rate": 0.0002,
"loss": 0.2961,
"step": 93
},
{
"epoch": 0.13884785819793205,
"grad_norm": 0.3697575330734253,
"learning_rate": 0.0002,
"loss": 0.3954,
"step": 94
},
{
"epoch": 0.14032496307237813,
"grad_norm": 0.3154103457927704,
"learning_rate": 0.0002,
"loss": 0.3148,
"step": 95
},
{
"epoch": 0.14180206794682423,
"grad_norm": 0.32966312766075134,
"learning_rate": 0.0002,
"loss": 0.3211,
"step": 96
},
{
"epoch": 0.1432791728212703,
"grad_norm": 0.3409123718738556,
"learning_rate": 0.0002,
"loss": 0.3318,
"step": 97
},
{
"epoch": 0.1447562776957164,
"grad_norm": 0.346122682094574,
"learning_rate": 0.0002,
"loss": 0.3296,
"step": 98
},
{
"epoch": 0.14623338257016247,
"grad_norm": 0.35875195264816284,
"learning_rate": 0.0002,
"loss": 0.3884,
"step": 99
},
{
"epoch": 0.14771048744460857,
"grad_norm": 0.3223486542701721,
"learning_rate": 0.0002,
"loss": 0.3282,
"step": 100
},
{
"epoch": 0.14918759231905465,
"grad_norm": 0.34657180309295654,
"learning_rate": 0.0002,
"loss": 0.364,
"step": 101
},
{
"epoch": 0.15066469719350073,
"grad_norm": 0.34456005692481995,
"learning_rate": 0.0002,
"loss": 0.3541,
"step": 102
},
{
"epoch": 0.15214180206794684,
"grad_norm": 0.3482792377471924,
"learning_rate": 0.0002,
"loss": 0.3435,
"step": 103
},
{
"epoch": 0.1536189069423929,
"grad_norm": 0.37781214714050293,
"learning_rate": 0.0002,
"loss": 0.3716,
"step": 104
},
{
"epoch": 0.155096011816839,
"grad_norm": 0.46567779779434204,
"learning_rate": 0.0002,
"loss": 0.3143,
"step": 105
},
{
"epoch": 0.15657311669128507,
"grad_norm": 0.32534581422805786,
"learning_rate": 0.0002,
"loss": 0.3561,
"step": 106
},
{
"epoch": 0.15805022156573117,
"grad_norm": 0.3262612521648407,
"learning_rate": 0.0002,
"loss": 0.3396,
"step": 107
},
{
"epoch": 0.15952732644017725,
"grad_norm": 0.3691346049308777,
"learning_rate": 0.0002,
"loss": 0.4014,
"step": 108
},
{
"epoch": 0.16100443131462333,
"grad_norm": 0.36267197132110596,
"learning_rate": 0.0002,
"loss": 0.3768,
"step": 109
},
{
"epoch": 0.16248153618906944,
"grad_norm": 0.3206377923488617,
"learning_rate": 0.0002,
"loss": 0.326,
"step": 110
},
{
"epoch": 0.16395864106351551,
"grad_norm": 0.32631710171699524,
"learning_rate": 0.0002,
"loss": 0.3438,
"step": 111
},
{
"epoch": 0.1654357459379616,
"grad_norm": 0.33969393372535706,
"learning_rate": 0.0002,
"loss": 0.3602,
"step": 112
},
{
"epoch": 0.16691285081240767,
"grad_norm": 0.361987829208374,
"learning_rate": 0.0002,
"loss": 0.3378,
"step": 113
},
{
"epoch": 0.16838995568685378,
"grad_norm": 0.33116045594215393,
"learning_rate": 0.0002,
"loss": 0.3444,
"step": 114
},
{
"epoch": 0.16986706056129985,
"grad_norm": 0.3474065363407135,
"learning_rate": 0.0002,
"loss": 0.3717,
"step": 115
},
{
"epoch": 0.17134416543574593,
"grad_norm": 0.3335750699043274,
"learning_rate": 0.0002,
"loss": 0.351,
"step": 116
},
{
"epoch": 0.172821270310192,
"grad_norm": 0.34676527976989746,
"learning_rate": 0.0002,
"loss": 0.3536,
"step": 117
},
{
"epoch": 0.17429837518463812,
"grad_norm": 0.36145490407943726,
"learning_rate": 0.0002,
"loss": 0.407,
"step": 118
},
{
"epoch": 0.1757754800590842,
"grad_norm": 0.3694964647293091,
"learning_rate": 0.0002,
"loss": 0.4143,
"step": 119
},
{
"epoch": 0.17725258493353027,
"grad_norm": 0.31005293130874634,
"learning_rate": 0.0002,
"loss": 0.3306,
"step": 120
},
{
"epoch": 0.17872968980797638,
"grad_norm": 0.32366085052490234,
"learning_rate": 0.0002,
"loss": 0.3342,
"step": 121
},
{
"epoch": 0.18020679468242246,
"grad_norm": 0.3252504765987396,
"learning_rate": 0.0002,
"loss": 0.3503,
"step": 122
},
{
"epoch": 0.18168389955686853,
"grad_norm": 0.32292550802230835,
"learning_rate": 0.0002,
"loss": 0.3694,
"step": 123
},
{
"epoch": 0.1831610044313146,
"grad_norm": 0.32740291953086853,
"learning_rate": 0.0002,
"loss": 0.3296,
"step": 124
},
{
"epoch": 0.18463810930576072,
"grad_norm": 0.3438139855861664,
"learning_rate": 0.0002,
"loss": 0.3107,
"step": 125
},
{
"epoch": 0.1861152141802068,
"grad_norm": 0.33904099464416504,
"learning_rate": 0.0002,
"loss": 0.39,
"step": 126
},
{
"epoch": 0.18759231905465287,
"grad_norm": 0.3464205265045166,
"learning_rate": 0.0002,
"loss": 0.3679,
"step": 127
},
{
"epoch": 0.18906942392909898,
"grad_norm": 0.3387203514575958,
"learning_rate": 0.0002,
"loss": 0.3375,
"step": 128
},
{
"epoch": 0.19054652880354506,
"grad_norm": 0.40050801634788513,
"learning_rate": 0.0002,
"loss": 0.3965,
"step": 129
},
{
"epoch": 0.19202363367799113,
"grad_norm": 0.31067872047424316,
"learning_rate": 0.0002,
"loss": 0.3108,
"step": 130
},
{
"epoch": 0.1935007385524372,
"grad_norm": 0.35977062582969666,
"learning_rate": 0.0002,
"loss": 0.4023,
"step": 131
},
{
"epoch": 0.19497784342688332,
"grad_norm": 0.3153740167617798,
"learning_rate": 0.0002,
"loss": 0.3317,
"step": 132
},
{
"epoch": 0.1964549483013294,
"grad_norm": 0.3306857645511627,
"learning_rate": 0.0002,
"loss": 0.3408,
"step": 133
},
{
"epoch": 0.19793205317577547,
"grad_norm": 0.32012930512428284,
"learning_rate": 0.0002,
"loss": 0.3218,
"step": 134
},
{
"epoch": 0.19940915805022155,
"grad_norm": 0.3159703314304352,
"learning_rate": 0.0002,
"loss": 0.3481,
"step": 135
},
{
"epoch": 0.20088626292466766,
"grad_norm": 0.3230080306529999,
"learning_rate": 0.0002,
"loss": 0.3779,
"step": 136
},
{
"epoch": 0.20236336779911374,
"grad_norm": 0.34753701090812683,
"learning_rate": 0.0002,
"loss": 0.3775,
"step": 137
},
{
"epoch": 0.2038404726735598,
"grad_norm": 0.3315640687942505,
"learning_rate": 0.0002,
"loss": 0.339,
"step": 138
},
{
"epoch": 0.20531757754800592,
"grad_norm": 0.33685439825057983,
"learning_rate": 0.0002,
"loss": 0.3575,
"step": 139
},
{
"epoch": 0.206794682422452,
"grad_norm": 0.3179871439933777,
"learning_rate": 0.0002,
"loss": 0.3338,
"step": 140
},
{
"epoch": 0.20827178729689808,
"grad_norm": 0.32391220331192017,
"learning_rate": 0.0002,
"loss": 0.3665,
"step": 141
},
{
"epoch": 0.20974889217134415,
"grad_norm": 0.3102681338787079,
"learning_rate": 0.0002,
"loss": 0.2948,
"step": 142
},
{
"epoch": 0.21122599704579026,
"grad_norm": 0.33224979043006897,
"learning_rate": 0.0002,
"loss": 0.392,
"step": 143
},
{
"epoch": 0.21270310192023634,
"grad_norm": 0.30173906683921814,
"learning_rate": 0.0002,
"loss": 0.2775,
"step": 144
},
{
"epoch": 0.21418020679468242,
"grad_norm": 0.3212149739265442,
"learning_rate": 0.0002,
"loss": 0.3408,
"step": 145
},
{
"epoch": 0.21565731166912852,
"grad_norm": 0.3113839328289032,
"learning_rate": 0.0002,
"loss": 0.314,
"step": 146
},
{
"epoch": 0.2171344165435746,
"grad_norm": 0.3435472548007965,
"learning_rate": 0.0002,
"loss": 0.3617,
"step": 147
},
{
"epoch": 0.21861152141802068,
"grad_norm": 0.3423033058643341,
"learning_rate": 0.0002,
"loss": 0.3523,
"step": 148
},
{
"epoch": 0.22008862629246675,
"grad_norm": 0.3202575445175171,
"learning_rate": 0.0002,
"loss": 0.349,
"step": 149
},
{
"epoch": 0.22156573116691286,
"grad_norm": 0.2999582886695862,
"learning_rate": 0.0002,
"loss": 0.2906,
"step": 150
},
{
"epoch": 0.22304283604135894,
"grad_norm": 0.33576205372810364,
"learning_rate": 0.0002,
"loss": 0.329,
"step": 151
},
{
"epoch": 0.22451994091580502,
"grad_norm": 0.31811273097991943,
"learning_rate": 0.0002,
"loss": 0.3151,
"step": 152
},
{
"epoch": 0.2259970457902511,
"grad_norm": 0.34126049280166626,
"learning_rate": 0.0002,
"loss": 0.335,
"step": 153
},
{
"epoch": 0.2274741506646972,
"grad_norm": 0.29068347811698914,
"learning_rate": 0.0002,
"loss": 0.2996,
"step": 154
},
{
"epoch": 0.22895125553914328,
"grad_norm": 0.3677709698677063,
"learning_rate": 0.0002,
"loss": 0.357,
"step": 155
},
{
"epoch": 0.23042836041358936,
"grad_norm": 0.319380521774292,
"learning_rate": 0.0002,
"loss": 0.3283,
"step": 156
},
{
"epoch": 0.23190546528803546,
"grad_norm": 0.2935948669910431,
"learning_rate": 0.0002,
"loss": 0.2755,
"step": 157
},
{
"epoch": 0.23338257016248154,
"grad_norm": 0.30784815549850464,
"learning_rate": 0.0002,
"loss": 0.3171,
"step": 158
},
{
"epoch": 0.23485967503692762,
"grad_norm": 0.3345930874347687,
"learning_rate": 0.0002,
"loss": 0.3526,
"step": 159
},
{
"epoch": 0.2363367799113737,
"grad_norm": 0.3269497752189636,
"learning_rate": 0.0002,
"loss": 0.3492,
"step": 160
},
{
"epoch": 0.2378138847858198,
"grad_norm": 0.32217973470687866,
"learning_rate": 0.0002,
"loss": 0.36,
"step": 161
},
{
"epoch": 0.23929098966026588,
"grad_norm": 0.3381323516368866,
"learning_rate": 0.0002,
"loss": 0.3534,
"step": 162
},
{
"epoch": 0.24076809453471196,
"grad_norm": 0.3131888210773468,
"learning_rate": 0.0002,
"loss": 0.3224,
"step": 163
},
{
"epoch": 0.24224519940915806,
"grad_norm": 0.30917319655418396,
"learning_rate": 0.0002,
"loss": 0.3132,
"step": 164
},
{
"epoch": 0.24372230428360414,
"grad_norm": 0.31469786167144775,
"learning_rate": 0.0002,
"loss": 0.3218,
"step": 165
},
{
"epoch": 0.24519940915805022,
"grad_norm": 0.31420794129371643,
"learning_rate": 0.0002,
"loss": 0.3471,
"step": 166
},
{
"epoch": 0.2466765140324963,
"grad_norm": 0.31471043825149536,
"learning_rate": 0.0002,
"loss": 0.3056,
"step": 167
},
{
"epoch": 0.2481536189069424,
"grad_norm": 0.30315864086151123,
"learning_rate": 0.0002,
"loss": 0.3355,
"step": 168
},
{
"epoch": 0.24963072378138848,
"grad_norm": 0.29710718989372253,
"learning_rate": 0.0002,
"loss": 0.3077,
"step": 169
},
{
"epoch": 0.2511078286558346,
"grad_norm": 0.30408531427383423,
"learning_rate": 0.0002,
"loss": 0.3087,
"step": 170
},
{
"epoch": 0.25258493353028066,
"grad_norm": 0.29702916741371155,
"learning_rate": 0.0002,
"loss": 0.2993,
"step": 171
},
{
"epoch": 0.25406203840472674,
"grad_norm": 0.2939663827419281,
"learning_rate": 0.0002,
"loss": 0.2996,
"step": 172
},
{
"epoch": 0.2555391432791728,
"grad_norm": 0.36591342091560364,
"learning_rate": 0.0002,
"loss": 0.356,
"step": 173
},
{
"epoch": 0.2570162481536189,
"grad_norm": 0.30867043137550354,
"learning_rate": 0.0002,
"loss": 0.2961,
"step": 174
},
{
"epoch": 0.258493353028065,
"grad_norm": 0.34252026677131653,
"learning_rate": 0.0002,
"loss": 0.3849,
"step": 175
},
{
"epoch": 0.25997045790251105,
"grad_norm": 0.34753838181495667,
"learning_rate": 0.0002,
"loss": 0.3838,
"step": 176
},
{
"epoch": 0.2614475627769572,
"grad_norm": 0.31399980187416077,
"learning_rate": 0.0002,
"loss": 0.33,
"step": 177
},
{
"epoch": 0.26292466765140327,
"grad_norm": 0.32648637890815735,
"learning_rate": 0.0002,
"loss": 0.3678,
"step": 178
},
{
"epoch": 0.26440177252584934,
"grad_norm": 0.2866675853729248,
"learning_rate": 0.0002,
"loss": 0.295,
"step": 179
},
{
"epoch": 0.2658788774002954,
"grad_norm": 0.32054954767227173,
"learning_rate": 0.0002,
"loss": 0.3342,
"step": 180
},
{
"epoch": 0.2673559822747415,
"grad_norm": 0.30476486682891846,
"learning_rate": 0.0002,
"loss": 0.3381,
"step": 181
},
{
"epoch": 0.2688330871491876,
"grad_norm": 0.2891450524330139,
"learning_rate": 0.0002,
"loss": 0.2984,
"step": 182
},
{
"epoch": 0.27031019202363366,
"grad_norm": 0.3023356795310974,
"learning_rate": 0.0002,
"loss": 0.2991,
"step": 183
},
{
"epoch": 0.2717872968980798,
"grad_norm": 0.31025779247283936,
"learning_rate": 0.0002,
"loss": 0.3198,
"step": 184
},
{
"epoch": 0.27326440177252587,
"grad_norm": 0.27903226017951965,
"learning_rate": 0.0002,
"loss": 0.274,
"step": 185
},
{
"epoch": 0.27474150664697194,
"grad_norm": 0.2925949692726135,
"learning_rate": 0.0002,
"loss": 0.3051,
"step": 186
},
{
"epoch": 0.276218611521418,
"grad_norm": 0.3387667238712311,
"learning_rate": 0.0002,
"loss": 0.3677,
"step": 187
},
{
"epoch": 0.2776957163958641,
"grad_norm": 0.316540390253067,
"learning_rate": 0.0002,
"loss": 0.3196,
"step": 188
},
{
"epoch": 0.2791728212703102,
"grad_norm": 0.3089348375797272,
"learning_rate": 0.0002,
"loss": 0.3338,
"step": 189
},
{
"epoch": 0.28064992614475626,
"grad_norm": 0.313431054353714,
"learning_rate": 0.0002,
"loss": 0.3178,
"step": 190
},
{
"epoch": 0.2821270310192024,
"grad_norm": 0.30025985836982727,
"learning_rate": 0.0002,
"loss": 0.3086,
"step": 191
},
{
"epoch": 0.28360413589364847,
"grad_norm": 0.3058534860610962,
"learning_rate": 0.0002,
"loss": 0.3128,
"step": 192
},
{
"epoch": 0.28508124076809455,
"grad_norm": 0.334710031747818,
"learning_rate": 0.0002,
"loss": 0.3418,
"step": 193
},
{
"epoch": 0.2865583456425406,
"grad_norm": 0.3021548092365265,
"learning_rate": 0.0002,
"loss": 0.2995,
"step": 194
},
{
"epoch": 0.2880354505169867,
"grad_norm": 0.27398747205734253,
"learning_rate": 0.0002,
"loss": 0.2743,
"step": 195
},
{
"epoch": 0.2895125553914328,
"grad_norm": 0.33194372057914734,
"learning_rate": 0.0002,
"loss": 0.2824,
"step": 196
},
{
"epoch": 0.29098966026587886,
"grad_norm": 0.3193664848804474,
"learning_rate": 0.0002,
"loss": 0.3361,
"step": 197
},
{
"epoch": 0.29246676514032494,
"grad_norm": 0.3320102393627167,
"learning_rate": 0.0002,
"loss": 0.3154,
"step": 198
},
{
"epoch": 0.29394387001477107,
"grad_norm": 0.2951314449310303,
"learning_rate": 0.0002,
"loss": 0.2699,
"step": 199
},
{
"epoch": 0.29542097488921715,
"grad_norm": 0.3117165267467499,
"learning_rate": 0.0002,
"loss": 0.3359,
"step": 200
},
{
"epoch": 0.2968980797636632,
"grad_norm": 0.30885782837867737,
"learning_rate": 0.0002,
"loss": 0.3181,
"step": 201
},
{
"epoch": 0.2983751846381093,
"grad_norm": 0.3114778399467468,
"learning_rate": 0.0002,
"loss": 0.3409,
"step": 202
},
{
"epoch": 0.2998522895125554,
"grad_norm": 0.32142388820648193,
"learning_rate": 0.0002,
"loss": 0.3491,
"step": 203
},
{
"epoch": 0.30132939438700146,
"grad_norm": 0.3159630000591278,
"learning_rate": 0.0002,
"loss": 0.3176,
"step": 204
},
{
"epoch": 0.30280649926144754,
"grad_norm": 0.2813749313354492,
"learning_rate": 0.0002,
"loss": 0.2745,
"step": 205
},
{
"epoch": 0.30428360413589367,
"grad_norm": 0.3174036145210266,
"learning_rate": 0.0002,
"loss": 0.3527,
"step": 206
},
{
"epoch": 0.30576070901033975,
"grad_norm": 0.311678409576416,
"learning_rate": 0.0002,
"loss": 0.3075,
"step": 207
},
{
"epoch": 0.3072378138847858,
"grad_norm": 0.2867993712425232,
"learning_rate": 0.0002,
"loss": 0.32,
"step": 208
},
{
"epoch": 0.3087149187592319,
"grad_norm": 0.29298824071884155,
"learning_rate": 0.0002,
"loss": 0.3226,
"step": 209
},
{
"epoch": 0.310192023633678,
"grad_norm": 0.3173938989639282,
"learning_rate": 0.0002,
"loss": 0.32,
"step": 210
},
{
"epoch": 0.31166912850812406,
"grad_norm": 0.27944210171699524,
"learning_rate": 0.0002,
"loss": 0.2825,
"step": 211
},
{
"epoch": 0.31314623338257014,
"grad_norm": 0.3196215331554413,
"learning_rate": 0.0002,
"loss": 0.3321,
"step": 212
},
{
"epoch": 0.31462333825701627,
"grad_norm": 0.3193184733390808,
"learning_rate": 0.0002,
"loss": 0.3394,
"step": 213
},
{
"epoch": 0.31610044313146235,
"grad_norm": 0.2783777713775635,
"learning_rate": 0.0002,
"loss": 0.3134,
"step": 214
},
{
"epoch": 0.3175775480059084,
"grad_norm": 0.35627251863479614,
"learning_rate": 0.0002,
"loss": 0.3973,
"step": 215
},
{
"epoch": 0.3190546528803545,
"grad_norm": 0.32312896847724915,
"learning_rate": 0.0002,
"loss": 0.3388,
"step": 216
},
{
"epoch": 0.3205317577548006,
"grad_norm": 0.2931472659111023,
"learning_rate": 0.0002,
"loss": 0.3134,
"step": 217
},
{
"epoch": 0.32200886262924666,
"grad_norm": 0.3059196174144745,
"learning_rate": 0.0002,
"loss": 0.3249,
"step": 218
},
{
"epoch": 0.32348596750369274,
"grad_norm": 0.3171478807926178,
"learning_rate": 0.0002,
"loss": 0.3459,
"step": 219
},
{
"epoch": 0.3249630723781389,
"grad_norm": 0.31810346245765686,
"learning_rate": 0.0002,
"loss": 0.3455,
"step": 220
},
{
"epoch": 0.32644017725258495,
"grad_norm": 0.30696892738342285,
"learning_rate": 0.0002,
"loss": 0.3037,
"step": 221
},
{
"epoch": 0.32791728212703103,
"grad_norm": 0.3519222140312195,
"learning_rate": 0.0002,
"loss": 0.3585,
"step": 222
},
{
"epoch": 0.3293943870014771,
"grad_norm": 0.2762470245361328,
"learning_rate": 0.0002,
"loss": 0.2615,
"step": 223
},
{
"epoch": 0.3308714918759232,
"grad_norm": 0.2909640967845917,
"learning_rate": 0.0002,
"loss": 0.2994,
"step": 224
},
{
"epoch": 0.33234859675036926,
"grad_norm": 0.3310638964176178,
"learning_rate": 0.0002,
"loss": 0.368,
"step": 225
},
{
"epoch": 0.33382570162481534,
"grad_norm": 0.337228387594223,
"learning_rate": 0.0002,
"loss": 0.358,
"step": 226
},
{
"epoch": 0.3353028064992615,
"grad_norm": 0.3182266652584076,
"learning_rate": 0.0002,
"loss": 0.3425,
"step": 227
},
{
"epoch": 0.33677991137370755,
"grad_norm": 0.32053616642951965,
"learning_rate": 0.0002,
"loss": 0.3604,
"step": 228
},
{
"epoch": 0.33825701624815363,
"grad_norm": 0.3377324342727661,
"learning_rate": 0.0002,
"loss": 0.3783,
"step": 229
},
{
"epoch": 0.3397341211225997,
"grad_norm": 0.28743067383766174,
"learning_rate": 0.0002,
"loss": 0.304,
"step": 230
},
{
"epoch": 0.3412112259970458,
"grad_norm": 0.30108213424682617,
"learning_rate": 0.0002,
"loss": 0.3129,
"step": 231
},
{
"epoch": 0.34268833087149186,
"grad_norm": 0.3191213607788086,
"learning_rate": 0.0002,
"loss": 0.3331,
"step": 232
},
{
"epoch": 0.34416543574593794,
"grad_norm": 0.2999110519886017,
"learning_rate": 0.0002,
"loss": 0.3074,
"step": 233
},
{
"epoch": 0.345642540620384,
"grad_norm": 0.2682500183582306,
"learning_rate": 0.0002,
"loss": 0.2635,
"step": 234
},
{
"epoch": 0.34711964549483015,
"grad_norm": 0.2817941904067993,
"learning_rate": 0.0002,
"loss": 0.3048,
"step": 235
},
{
"epoch": 0.34859675036927623,
"grad_norm": 0.3110464811325073,
"learning_rate": 0.0002,
"loss": 0.3228,
"step": 236
},
{
"epoch": 0.3500738552437223,
"grad_norm": 0.3088606297969818,
"learning_rate": 0.0002,
"loss": 0.3161,
"step": 237
},
{
"epoch": 0.3515509601181684,
"grad_norm": 0.2990322411060333,
"learning_rate": 0.0002,
"loss": 0.3085,
"step": 238
},
{
"epoch": 0.35302806499261447,
"grad_norm": 0.33097386360168457,
"learning_rate": 0.0002,
"loss": 0.3615,
"step": 239
},
{
"epoch": 0.35450516986706054,
"grad_norm": 0.3397606313228607,
"learning_rate": 0.0002,
"loss": 0.3957,
"step": 240
},
{
"epoch": 0.3559822747415066,
"grad_norm": 0.2756197452545166,
"learning_rate": 0.0002,
"loss": 0.2731,
"step": 241
},
{
"epoch": 0.35745937961595275,
"grad_norm": 0.3435852825641632,
"learning_rate": 0.0002,
"loss": 0.3855,
"step": 242
},
{
"epoch": 0.35893648449039883,
"grad_norm": 0.33727383613586426,
"learning_rate": 0.0002,
"loss": 0.3101,
"step": 243
},
{
"epoch": 0.3604135893648449,
"grad_norm": 0.3684369921684265,
"learning_rate": 0.0002,
"loss": 0.3378,
"step": 244
},
{
"epoch": 0.361890694239291,
"grad_norm": 0.3006575107574463,
"learning_rate": 0.0002,
"loss": 0.3295,
"step": 245
},
{
"epoch": 0.36336779911373707,
"grad_norm": 0.31223273277282715,
"learning_rate": 0.0002,
"loss": 0.2977,
"step": 246
},
{
"epoch": 0.36484490398818314,
"grad_norm": 0.3001905381679535,
"learning_rate": 0.0002,
"loss": 0.294,
"step": 247
},
{
"epoch": 0.3663220088626292,
"grad_norm": 0.2907404899597168,
"learning_rate": 0.0002,
"loss": 0.2839,
"step": 248
},
{
"epoch": 0.36779911373707536,
"grad_norm": 0.31060346961021423,
"learning_rate": 0.0002,
"loss": 0.3333,
"step": 249
},
{
"epoch": 0.36927621861152143,
"grad_norm": 0.3394862413406372,
"learning_rate": 0.0002,
"loss": 0.3217,
"step": 250
},
{
"epoch": 0.3707533234859675,
"grad_norm": 0.2912856340408325,
"learning_rate": 0.0002,
"loss": 0.3072,
"step": 251
},
{
"epoch": 0.3722304283604136,
"grad_norm": 0.2991478741168976,
"learning_rate": 0.0002,
"loss": 0.3349,
"step": 252
},
{
"epoch": 0.37370753323485967,
"grad_norm": 0.304868221282959,
"learning_rate": 0.0002,
"loss": 0.3142,
"step": 253
},
{
"epoch": 0.37518463810930575,
"grad_norm": 0.3008173704147339,
"learning_rate": 0.0002,
"loss": 0.3166,
"step": 254
},
{
"epoch": 0.3766617429837518,
"grad_norm": 0.290526807308197,
"learning_rate": 0.0002,
"loss": 0.3228,
"step": 255
},
{
"epoch": 0.37813884785819796,
"grad_norm": 0.2846904695034027,
"learning_rate": 0.0002,
"loss": 0.31,
"step": 256
},
{
"epoch": 0.37961595273264404,
"grad_norm": 0.306904137134552,
"learning_rate": 0.0002,
"loss": 0.3238,
"step": 257
},
{
"epoch": 0.3810930576070901,
"grad_norm": 0.30683666467666626,
"learning_rate": 0.0002,
"loss": 0.3327,
"step": 258
},
{
"epoch": 0.3825701624815362,
"grad_norm": 0.2824447751045227,
"learning_rate": 0.0002,
"loss": 0.2962,
"step": 259
},
{
"epoch": 0.38404726735598227,
"grad_norm": 0.29804757237434387,
"learning_rate": 0.0002,
"loss": 0.3025,
"step": 260
},
{
"epoch": 0.38552437223042835,
"grad_norm": 0.3133246600627899,
"learning_rate": 0.0002,
"loss": 0.3095,
"step": 261
},
{
"epoch": 0.3870014771048744,
"grad_norm": 0.3098774254322052,
"learning_rate": 0.0002,
"loss": 0.3031,
"step": 262
},
{
"epoch": 0.38847858197932056,
"grad_norm": 0.3248344361782074,
"learning_rate": 0.0002,
"loss": 0.3402,
"step": 263
},
{
"epoch": 0.38995568685376664,
"grad_norm": 0.30645236372947693,
"learning_rate": 0.0002,
"loss": 0.3277,
"step": 264
},
{
"epoch": 0.3914327917282127,
"grad_norm": 0.29753726720809937,
"learning_rate": 0.0002,
"loss": 0.3322,
"step": 265
},
{
"epoch": 0.3929098966026588,
"grad_norm": 0.33593639731407166,
"learning_rate": 0.0002,
"loss": 0.337,
"step": 266
},
{
"epoch": 0.39438700147710487,
"grad_norm": 0.3059685528278351,
"learning_rate": 0.0002,
"loss": 0.2896,
"step": 267
},
{
"epoch": 0.39586410635155095,
"grad_norm": 0.30055829882621765,
"learning_rate": 0.0002,
"loss": 0.3385,
"step": 268
},
{
"epoch": 0.397341211225997,
"grad_norm": 0.27567949891090393,
"learning_rate": 0.0002,
"loss": 0.2799,
"step": 269
},
{
"epoch": 0.3988183161004431,
"grad_norm": 0.33319681882858276,
"learning_rate": 0.0002,
"loss": 0.3735,
"step": 270
},
{
"epoch": 0.40029542097488924,
"grad_norm": 0.28851690888404846,
"learning_rate": 0.0002,
"loss": 0.2934,
"step": 271
},
{
"epoch": 0.4017725258493353,
"grad_norm": 0.3188093900680542,
"learning_rate": 0.0002,
"loss": 0.3276,
"step": 272
},
{
"epoch": 0.4032496307237814,
"grad_norm": 0.29944342374801636,
"learning_rate": 0.0002,
"loss": 0.3351,
"step": 273
},
{
"epoch": 0.40472673559822747,
"grad_norm": 0.31611138582229614,
"learning_rate": 0.0002,
"loss": 0.3616,
"step": 274
},
{
"epoch": 0.40620384047267355,
"grad_norm": 0.3243541419506073,
"learning_rate": 0.0002,
"loss": 0.3394,
"step": 275
},
{
"epoch": 0.4076809453471196,
"grad_norm": 0.31130653619766235,
"learning_rate": 0.0002,
"loss": 0.3182,
"step": 276
},
{
"epoch": 0.4091580502215657,
"grad_norm": 0.2761830687522888,
"learning_rate": 0.0002,
"loss": 0.23,
"step": 277
},
{
"epoch": 0.41063515509601184,
"grad_norm": 0.3256094455718994,
"learning_rate": 0.0002,
"loss": 0.3921,
"step": 278
},
{
"epoch": 0.4121122599704579,
"grad_norm": 0.30812302231788635,
"learning_rate": 0.0002,
"loss": 0.3559,
"step": 279
},
{
"epoch": 0.413589364844904,
"grad_norm": 0.28198400139808655,
"learning_rate": 0.0002,
"loss": 0.285,
"step": 280
},
{
"epoch": 0.4150664697193501,
"grad_norm": 0.2873023450374603,
"learning_rate": 0.0002,
"loss": 0.2963,
"step": 281
},
{
"epoch": 0.41654357459379615,
"grad_norm": 0.29413530230522156,
"learning_rate": 0.0002,
"loss": 0.2885,
"step": 282
},
{
"epoch": 0.41802067946824223,
"grad_norm": 0.2963588237762451,
"learning_rate": 0.0002,
"loss": 0.2996,
"step": 283
},
{
"epoch": 0.4194977843426883,
"grad_norm": 0.2581465542316437,
"learning_rate": 0.0002,
"loss": 0.255,
"step": 284
},
{
"epoch": 0.42097488921713444,
"grad_norm": 0.3365771472454071,
"learning_rate": 0.0002,
"loss": 0.3473,
"step": 285
},
{
"epoch": 0.4224519940915805,
"grad_norm": 0.3092253804206848,
"learning_rate": 0.0002,
"loss": 0.29,
"step": 286
},
{
"epoch": 0.4239290989660266,
"grad_norm": 0.300626277923584,
"learning_rate": 0.0002,
"loss": 0.3183,
"step": 287
},
{
"epoch": 0.4254062038404727,
"grad_norm": 0.3320425748825073,
"learning_rate": 0.0002,
"loss": 0.3322,
"step": 288
},
{
"epoch": 0.42688330871491875,
"grad_norm": 0.2749597728252411,
"learning_rate": 0.0002,
"loss": 0.2995,
"step": 289
},
{
"epoch": 0.42836041358936483,
"grad_norm": 0.280134916305542,
"learning_rate": 0.0002,
"loss": 0.2729,
"step": 290
},
{
"epoch": 0.4298375184638109,
"grad_norm": 0.27060407400131226,
"learning_rate": 0.0002,
"loss": 0.2694,
"step": 291
},
{
"epoch": 0.43131462333825704,
"grad_norm": 0.28500011563301086,
"learning_rate": 0.0002,
"loss": 0.2852,
"step": 292
},
{
"epoch": 0.4327917282127031,
"grad_norm": 0.2733040452003479,
"learning_rate": 0.0002,
"loss": 0.276,
"step": 293
},
{
"epoch": 0.4342688330871492,
"grad_norm": 0.30365538597106934,
"learning_rate": 0.0002,
"loss": 0.2806,
"step": 294
},
{
"epoch": 0.4357459379615953,
"grad_norm": 0.3079434335231781,
"learning_rate": 0.0002,
"loss": 0.3014,
"step": 295
},
{
"epoch": 0.43722304283604135,
"grad_norm": 0.2746562659740448,
"learning_rate": 0.0002,
"loss": 0.2601,
"step": 296
},
{
"epoch": 0.43870014771048743,
"grad_norm": 0.3027852475643158,
"learning_rate": 0.0002,
"loss": 0.2978,
"step": 297
},
{
"epoch": 0.4401772525849335,
"grad_norm": 0.2862493395805359,
"learning_rate": 0.0002,
"loss": 0.2712,
"step": 298
},
{
"epoch": 0.44165435745937964,
"grad_norm": 0.30820953845977783,
"learning_rate": 0.0002,
"loss": 0.3198,
"step": 299
},
{
"epoch": 0.4431314623338257,
"grad_norm": 0.2891389727592468,
"learning_rate": 0.0002,
"loss": 0.2914,
"step": 300
},
{
"epoch": 0.4446085672082718,
"grad_norm": 0.29976293444633484,
"learning_rate": 0.0002,
"loss": 0.3155,
"step": 301
},
{
"epoch": 0.4460856720827179,
"grad_norm": 0.26029616594314575,
"learning_rate": 0.0002,
"loss": 0.2585,
"step": 302
},
{
"epoch": 0.44756277695716395,
"grad_norm": 0.2925141751766205,
"learning_rate": 0.0002,
"loss": 0.2799,
"step": 303
},
{
"epoch": 0.44903988183161003,
"grad_norm": 0.3378995358943939,
"learning_rate": 0.0002,
"loss": 0.3397,
"step": 304
},
{
"epoch": 0.4505169867060561,
"grad_norm": 0.3140377104282379,
"learning_rate": 0.0002,
"loss": 0.3065,
"step": 305
},
{
"epoch": 0.4519940915805022,
"grad_norm": 0.30882659554481506,
"learning_rate": 0.0002,
"loss": 0.2976,
"step": 306
},
{
"epoch": 0.4534711964549483,
"grad_norm": 0.2986995577812195,
"learning_rate": 0.0002,
"loss": 0.3086,
"step": 307
},
{
"epoch": 0.4549483013293944,
"grad_norm": 0.37128734588623047,
"learning_rate": 0.0002,
"loss": 0.3018,
"step": 308
},
{
"epoch": 0.4564254062038405,
"grad_norm": 0.2967352271080017,
"learning_rate": 0.0002,
"loss": 0.2855,
"step": 309
},
{
"epoch": 0.45790251107828656,
"grad_norm": 0.3116573691368103,
"learning_rate": 0.0002,
"loss": 0.3248,
"step": 310
},
{
"epoch": 0.45937961595273263,
"grad_norm": 0.27394649386405945,
"learning_rate": 0.0002,
"loss": 0.2894,
"step": 311
},
{
"epoch": 0.4608567208271787,
"grad_norm": 0.31190183758735657,
"learning_rate": 0.0002,
"loss": 0.3311,
"step": 312
},
{
"epoch": 0.4623338257016248,
"grad_norm": 0.28978461027145386,
"learning_rate": 0.0002,
"loss": 0.2896,
"step": 313
},
{
"epoch": 0.4638109305760709,
"grad_norm": 0.29586443305015564,
"learning_rate": 0.0002,
"loss": 0.3062,
"step": 314
},
{
"epoch": 0.465288035450517,
"grad_norm": 0.3034004271030426,
"learning_rate": 0.0002,
"loss": 0.2797,
"step": 315
},
{
"epoch": 0.4667651403249631,
"grad_norm": 0.3083277940750122,
"learning_rate": 0.0002,
"loss": 0.2912,
"step": 316
},
{
"epoch": 0.46824224519940916,
"grad_norm": 0.31153154373168945,
"learning_rate": 0.0002,
"loss": 0.3403,
"step": 317
},
{
"epoch": 0.46971935007385524,
"grad_norm": 0.26065292954444885,
"learning_rate": 0.0002,
"loss": 0.2289,
"step": 318
},
{
"epoch": 0.4711964549483013,
"grad_norm": 0.29736757278442383,
"learning_rate": 0.0002,
"loss": 0.3202,
"step": 319
},
{
"epoch": 0.4726735598227474,
"grad_norm": 0.362541526556015,
"learning_rate": 0.0002,
"loss": 0.393,
"step": 320
},
{
"epoch": 0.4741506646971935,
"grad_norm": 0.3045463263988495,
"learning_rate": 0.0002,
"loss": 0.2843,
"step": 321
},
{
"epoch": 0.4756277695716396,
"grad_norm": 0.33905521035194397,
"learning_rate": 0.0002,
"loss": 0.368,
"step": 322
},
{
"epoch": 0.4771048744460857,
"grad_norm": 0.3574953079223633,
"learning_rate": 0.0002,
"loss": 0.2273,
"step": 323
},
{
"epoch": 0.47858197932053176,
"grad_norm": 0.3134016990661621,
"learning_rate": 0.0002,
"loss": 0.3134,
"step": 324
},
{
"epoch": 0.48005908419497784,
"grad_norm": 0.32262158393859863,
"learning_rate": 0.0002,
"loss": 0.3028,
"step": 325
},
{
"epoch": 0.4815361890694239,
"grad_norm": 0.26441511511802673,
"learning_rate": 0.0002,
"loss": 0.2447,
"step": 326
},
{
"epoch": 0.48301329394387,
"grad_norm": 0.3419596552848816,
"learning_rate": 0.0002,
"loss": 0.3258,
"step": 327
},
{
"epoch": 0.4844903988183161,
"grad_norm": 0.3031555116176605,
"learning_rate": 0.0002,
"loss": 0.322,
"step": 328
},
{
"epoch": 0.4859675036927622,
"grad_norm": 0.29226183891296387,
"learning_rate": 0.0002,
"loss": 0.2803,
"step": 329
},
{
"epoch": 0.4874446085672083,
"grad_norm": 0.2874895930290222,
"learning_rate": 0.0002,
"loss": 0.2824,
"step": 330
},
{
"epoch": 0.48892171344165436,
"grad_norm": 0.31009188294410706,
"learning_rate": 0.0002,
"loss": 0.3218,
"step": 331
},
{
"epoch": 0.49039881831610044,
"grad_norm": 0.31250134110450745,
"learning_rate": 0.0002,
"loss": 0.2972,
"step": 332
},
{
"epoch": 0.4918759231905465,
"grad_norm": 0.30231741070747375,
"learning_rate": 0.0002,
"loss": 0.3255,
"step": 333
},
{
"epoch": 0.4933530280649926,
"grad_norm": 0.32139065861701965,
"learning_rate": 0.0002,
"loss": 0.3712,
"step": 334
},
{
"epoch": 0.4948301329394387,
"grad_norm": 0.2788805365562439,
"learning_rate": 0.0002,
"loss": 0.3069,
"step": 335
},
{
"epoch": 0.4963072378138848,
"grad_norm": 0.3206048011779785,
"learning_rate": 0.0002,
"loss": 0.3519,
"step": 336
},
{
"epoch": 0.4977843426883309,
"grad_norm": 0.316514253616333,
"learning_rate": 0.0002,
"loss": 0.3554,
"step": 337
},
{
"epoch": 0.49926144756277696,
"grad_norm": 0.3080296516418457,
"learning_rate": 0.0002,
"loss": 0.3366,
"step": 338
},
{
"epoch": 0.5007385524372231,
"grad_norm": 0.3183678090572357,
"learning_rate": 0.0002,
"loss": 0.2967,
"step": 339
},
{
"epoch": 0.5022156573116692,
"grad_norm": 0.31313014030456543,
"learning_rate": 0.0002,
"loss": 0.3188,
"step": 340
},
{
"epoch": 0.5036927621861153,
"grad_norm": 0.2989446520805359,
"learning_rate": 0.0002,
"loss": 0.3235,
"step": 341
},
{
"epoch": 0.5051698670605613,
"grad_norm": 0.2817307412624359,
"learning_rate": 0.0002,
"loss": 0.2959,
"step": 342
},
{
"epoch": 0.5066469719350074,
"grad_norm": 0.487758606672287,
"learning_rate": 0.0002,
"loss": 0.3308,
"step": 343
},
{
"epoch": 0.5081240768094535,
"grad_norm": 0.26448920369148254,
"learning_rate": 0.0002,
"loss": 0.2942,
"step": 344
},
{
"epoch": 0.5096011816838996,
"grad_norm": 0.3182467818260193,
"learning_rate": 0.0002,
"loss": 0.291,
"step": 345
},
{
"epoch": 0.5110782865583456,
"grad_norm": 0.2950560450553894,
"learning_rate": 0.0002,
"loss": 0.3014,
"step": 346
},
{
"epoch": 0.5125553914327917,
"grad_norm": 0.3176344633102417,
"learning_rate": 0.0002,
"loss": 0.3425,
"step": 347
},
{
"epoch": 0.5140324963072378,
"grad_norm": 0.30496424436569214,
"learning_rate": 0.0002,
"loss": 0.3447,
"step": 348
},
{
"epoch": 0.5155096011816839,
"grad_norm": 0.28272292017936707,
"learning_rate": 0.0002,
"loss": 0.2645,
"step": 349
},
{
"epoch": 0.51698670605613,
"grad_norm": 0.2600267231464386,
"learning_rate": 0.0002,
"loss": 0.2525,
"step": 350
},
{
"epoch": 0.518463810930576,
"grad_norm": 0.2765870988368988,
"learning_rate": 0.0002,
"loss": 0.2907,
"step": 351
},
{
"epoch": 0.5199409158050221,
"grad_norm": 0.30320316553115845,
"learning_rate": 0.0002,
"loss": 0.3404,
"step": 352
},
{
"epoch": 0.5214180206794683,
"grad_norm": 0.33050844073295593,
"learning_rate": 0.0002,
"loss": 0.3436,
"step": 353
},
{
"epoch": 0.5228951255539144,
"grad_norm": 0.2716812193393707,
"learning_rate": 0.0002,
"loss": 0.2912,
"step": 354
},
{
"epoch": 0.5243722304283605,
"grad_norm": 0.2944520115852356,
"learning_rate": 0.0002,
"loss": 0.3212,
"step": 355
},
{
"epoch": 0.5258493353028065,
"grad_norm": 0.334228515625,
"learning_rate": 0.0002,
"loss": 0.3675,
"step": 356
},
{
"epoch": 0.5273264401772526,
"grad_norm": 0.27948203682899475,
"learning_rate": 0.0002,
"loss": 0.2648,
"step": 357
},
{
"epoch": 0.5288035450516987,
"grad_norm": 0.32159537076950073,
"learning_rate": 0.0002,
"loss": 0.3659,
"step": 358
},
{
"epoch": 0.5302806499261448,
"grad_norm": 0.29499179124832153,
"learning_rate": 0.0002,
"loss": 0.2718,
"step": 359
},
{
"epoch": 0.5317577548005908,
"grad_norm": 0.3503305912017822,
"learning_rate": 0.0002,
"loss": 0.2972,
"step": 360
},
{
"epoch": 0.5332348596750369,
"grad_norm": 0.29388928413391113,
"learning_rate": 0.0002,
"loss": 0.3063,
"step": 361
},
{
"epoch": 0.534711964549483,
"grad_norm": 0.2753749191761017,
"learning_rate": 0.0002,
"loss": 0.2706,
"step": 362
},
{
"epoch": 0.5361890694239291,
"grad_norm": 0.2902815341949463,
"learning_rate": 0.0002,
"loss": 0.2918,
"step": 363
},
{
"epoch": 0.5376661742983752,
"grad_norm": 0.2991829216480255,
"learning_rate": 0.0002,
"loss": 0.3148,
"step": 364
},
{
"epoch": 0.5391432791728212,
"grad_norm": 0.3151837885379791,
"learning_rate": 0.0002,
"loss": 0.3187,
"step": 365
},
{
"epoch": 0.5406203840472673,
"grad_norm": 0.2935662865638733,
"learning_rate": 0.0002,
"loss": 0.3065,
"step": 366
},
{
"epoch": 0.5420974889217134,
"grad_norm": 0.2787752151489258,
"learning_rate": 0.0002,
"loss": 0.2677,
"step": 367
},
{
"epoch": 0.5435745937961596,
"grad_norm": 0.2826704680919647,
"learning_rate": 0.0002,
"loss": 0.2673,
"step": 368
},
{
"epoch": 0.5450516986706057,
"grad_norm": 0.3015994429588318,
"learning_rate": 0.0002,
"loss": 0.3377,
"step": 369
},
{
"epoch": 0.5465288035450517,
"grad_norm": 0.27995777130126953,
"learning_rate": 0.0002,
"loss": 0.2672,
"step": 370
},
{
"epoch": 0.5480059084194978,
"grad_norm": 0.2902574837207794,
"learning_rate": 0.0002,
"loss": 0.2684,
"step": 371
},
{
"epoch": 0.5494830132939439,
"grad_norm": 0.2957216501235962,
"learning_rate": 0.0002,
"loss": 0.3061,
"step": 372
},
{
"epoch": 0.55096011816839,
"grad_norm": 0.2945306599140167,
"learning_rate": 0.0002,
"loss": 0.3248,
"step": 373
},
{
"epoch": 0.552437223042836,
"grad_norm": 0.2922048568725586,
"learning_rate": 0.0002,
"loss": 0.2987,
"step": 374
},
{
"epoch": 0.5539143279172821,
"grad_norm": 0.30333656072616577,
"learning_rate": 0.0002,
"loss": 0.3072,
"step": 375
},
{
"epoch": 0.5553914327917282,
"grad_norm": 0.2855093479156494,
"learning_rate": 0.0002,
"loss": 0.2758,
"step": 376
},
{
"epoch": 0.5568685376661743,
"grad_norm": 0.2911272943019867,
"learning_rate": 0.0002,
"loss": 0.2722,
"step": 377
},
{
"epoch": 0.5583456425406204,
"grad_norm": 0.289193332195282,
"learning_rate": 0.0002,
"loss": 0.3035,
"step": 378
},
{
"epoch": 0.5598227474150664,
"grad_norm": 0.2716032564640045,
"learning_rate": 0.0002,
"loss": 0.2692,
"step": 379
},
{
"epoch": 0.5612998522895125,
"grad_norm": 0.33022886514663696,
"learning_rate": 0.0002,
"loss": 0.3139,
"step": 380
},
{
"epoch": 0.5627769571639586,
"grad_norm": 0.27433738112449646,
"learning_rate": 0.0002,
"loss": 0.262,
"step": 381
},
{
"epoch": 0.5642540620384048,
"grad_norm": 0.27598345279693604,
"learning_rate": 0.0002,
"loss": 0.2657,
"step": 382
},
{
"epoch": 0.5657311669128509,
"grad_norm": 0.28790509700775146,
"learning_rate": 0.0002,
"loss": 0.3024,
"step": 383
},
{
"epoch": 0.5672082717872969,
"grad_norm": 0.2914026379585266,
"learning_rate": 0.0002,
"loss": 0.2972,
"step": 384
},
{
"epoch": 0.568685376661743,
"grad_norm": 0.3148682117462158,
"learning_rate": 0.0002,
"loss": 0.2982,
"step": 385
},
{
"epoch": 0.5701624815361891,
"grad_norm": 0.29025575518608093,
"learning_rate": 0.0002,
"loss": 0.2821,
"step": 386
},
{
"epoch": 0.5716395864106352,
"grad_norm": 0.267362117767334,
"learning_rate": 0.0002,
"loss": 0.244,
"step": 387
},
{
"epoch": 0.5731166912850812,
"grad_norm": 0.32638978958129883,
"learning_rate": 0.0002,
"loss": 0.3058,
"step": 388
},
{
"epoch": 0.5745937961595273,
"grad_norm": 0.31582197546958923,
"learning_rate": 0.0002,
"loss": 0.3285,
"step": 389
},
{
"epoch": 0.5760709010339734,
"grad_norm": 0.2933168113231659,
"learning_rate": 0.0002,
"loss": 0.2794,
"step": 390
},
{
"epoch": 0.5775480059084195,
"grad_norm": 0.29435229301452637,
"learning_rate": 0.0002,
"loss": 0.287,
"step": 391
},
{
"epoch": 0.5790251107828656,
"grad_norm": 0.29208388924598694,
"learning_rate": 0.0002,
"loss": 0.2786,
"step": 392
},
{
"epoch": 0.5805022156573116,
"grad_norm": 0.2712183892726898,
"learning_rate": 0.0002,
"loss": 0.2708,
"step": 393
},
{
"epoch": 0.5819793205317577,
"grad_norm": 0.27574923634529114,
"learning_rate": 0.0002,
"loss": 0.269,
"step": 394
},
{
"epoch": 0.5834564254062038,
"grad_norm": 0.30967944860458374,
"learning_rate": 0.0002,
"loss": 0.2827,
"step": 395
},
{
"epoch": 0.5849335302806499,
"grad_norm": 0.29655173420906067,
"learning_rate": 0.0002,
"loss": 0.306,
"step": 396
},
{
"epoch": 0.5864106351550961,
"grad_norm": 1.6516242027282715,
"learning_rate": 0.0002,
"loss": 0.3294,
"step": 397
},
{
"epoch": 0.5878877400295421,
"grad_norm": 0.2701549828052521,
"learning_rate": 0.0002,
"loss": 0.2451,
"step": 398
},
{
"epoch": 0.5893648449039882,
"grad_norm": 0.2530956268310547,
"learning_rate": 0.0002,
"loss": 0.2341,
"step": 399
},
{
"epoch": 0.5908419497784343,
"grad_norm": 0.3096421957015991,
"learning_rate": 0.0002,
"loss": 0.3,
"step": 400
},
{
"epoch": 0.5923190546528804,
"grad_norm": 0.3079342842102051,
"learning_rate": 0.0002,
"loss": 0.3291,
"step": 401
},
{
"epoch": 0.5937961595273265,
"grad_norm": 0.29586726427078247,
"learning_rate": 0.0002,
"loss": 0.3094,
"step": 402
},
{
"epoch": 0.5952732644017725,
"grad_norm": 0.28764981031417847,
"learning_rate": 0.0002,
"loss": 0.2961,
"step": 403
},
{
"epoch": 0.5967503692762186,
"grad_norm": 0.30434954166412354,
"learning_rate": 0.0002,
"loss": 0.2936,
"step": 404
},
{
"epoch": 0.5982274741506647,
"grad_norm": 0.2840517461299896,
"learning_rate": 0.0002,
"loss": 0.2964,
"step": 405
},
{
"epoch": 0.5997045790251108,
"grad_norm": 0.2927243113517761,
"learning_rate": 0.0002,
"loss": 0.313,
"step": 406
},
{
"epoch": 0.6011816838995568,
"grad_norm": 0.26455628871917725,
"learning_rate": 0.0002,
"loss": 0.244,
"step": 407
},
{
"epoch": 0.6026587887740029,
"grad_norm": 0.327934592962265,
"learning_rate": 0.0002,
"loss": 0.3271,
"step": 408
},
{
"epoch": 0.604135893648449,
"grad_norm": 0.28486961126327515,
"learning_rate": 0.0002,
"loss": 0.2742,
"step": 409
},
{
"epoch": 0.6056129985228951,
"grad_norm": 0.3310534656047821,
"learning_rate": 0.0002,
"loss": 0.2888,
"step": 410
},
{
"epoch": 0.6070901033973413,
"grad_norm": 0.32391390204429626,
"learning_rate": 0.0002,
"loss": 0.3123,
"step": 411
},
{
"epoch": 0.6085672082717873,
"grad_norm": 0.5019936561584473,
"learning_rate": 0.0002,
"loss": 0.3494,
"step": 412
},
{
"epoch": 0.6100443131462334,
"grad_norm": 0.2915607988834381,
"learning_rate": 0.0002,
"loss": 0.2845,
"step": 413
},
{
"epoch": 0.6115214180206795,
"grad_norm": 0.34125831723213196,
"learning_rate": 0.0002,
"loss": 0.2985,
"step": 414
},
{
"epoch": 0.6129985228951256,
"grad_norm": 0.28235796093940735,
"learning_rate": 0.0002,
"loss": 0.29,
"step": 415
},
{
"epoch": 0.6144756277695717,
"grad_norm": 0.30712956190109253,
"learning_rate": 0.0002,
"loss": 0.2863,
"step": 416
},
{
"epoch": 0.6159527326440177,
"grad_norm": 0.3005330562591553,
"learning_rate": 0.0002,
"loss": 0.3186,
"step": 417
},
{
"epoch": 0.6174298375184638,
"grad_norm": 0.4083673655986786,
"learning_rate": 0.0002,
"loss": 0.31,
"step": 418
},
{
"epoch": 0.6189069423929099,
"grad_norm": 0.2704838812351227,
"learning_rate": 0.0002,
"loss": 0.2649,
"step": 419
},
{
"epoch": 0.620384047267356,
"grad_norm": 0.29053810238838196,
"learning_rate": 0.0002,
"loss": 0.2789,
"step": 420
},
{
"epoch": 0.621861152141802,
"grad_norm": 0.329973429441452,
"learning_rate": 0.0002,
"loss": 0.3313,
"step": 421
},
{
"epoch": 0.6233382570162481,
"grad_norm": 0.31070685386657715,
"learning_rate": 0.0002,
"loss": 0.3045,
"step": 422
},
{
"epoch": 0.6248153618906942,
"grad_norm": 0.3487679958343506,
"learning_rate": 0.0002,
"loss": 0.3286,
"step": 423
},
{
"epoch": 0.6262924667651403,
"grad_norm": 0.3269588351249695,
"learning_rate": 0.0002,
"loss": 0.3326,
"step": 424
},
{
"epoch": 0.6277695716395865,
"grad_norm": 0.26015186309814453,
"learning_rate": 0.0002,
"loss": 0.2457,
"step": 425
},
{
"epoch": 0.6292466765140325,
"grad_norm": 0.2547609508037567,
"learning_rate": 0.0002,
"loss": 0.262,
"step": 426
},
{
"epoch": 0.6307237813884786,
"grad_norm": 0.2524930238723755,
"learning_rate": 0.0002,
"loss": 0.23,
"step": 427
},
{
"epoch": 0.6322008862629247,
"grad_norm": 0.3031904101371765,
"learning_rate": 0.0002,
"loss": 0.3427,
"step": 428
},
{
"epoch": 0.6336779911373708,
"grad_norm": 0.3007690906524658,
"learning_rate": 0.0002,
"loss": 0.2974,
"step": 429
},
{
"epoch": 0.6351550960118169,
"grad_norm": 0.28696200251579285,
"learning_rate": 0.0002,
"loss": 0.2911,
"step": 430
},
{
"epoch": 0.6366322008862629,
"grad_norm": 0.2805304229259491,
"learning_rate": 0.0002,
"loss": 0.2745,
"step": 431
},
{
"epoch": 0.638109305760709,
"grad_norm": 0.2757206857204437,
"learning_rate": 0.0002,
"loss": 0.2517,
"step": 432
},
{
"epoch": 0.6395864106351551,
"grad_norm": 0.26851919293403625,
"learning_rate": 0.0002,
"loss": 0.2537,
"step": 433
},
{
"epoch": 0.6410635155096012,
"grad_norm": 0.28059712052345276,
"learning_rate": 0.0002,
"loss": 0.2616,
"step": 434
},
{
"epoch": 0.6425406203840472,
"grad_norm": 0.2718868553638458,
"learning_rate": 0.0002,
"loss": 0.2652,
"step": 435
},
{
"epoch": 0.6440177252584933,
"grad_norm": 0.28253173828125,
"learning_rate": 0.0002,
"loss": 0.2866,
"step": 436
},
{
"epoch": 0.6454948301329394,
"grad_norm": 0.3183034658432007,
"learning_rate": 0.0002,
"loss": 0.3485,
"step": 437
},
{
"epoch": 0.6469719350073855,
"grad_norm": 0.2451733946800232,
"learning_rate": 0.0002,
"loss": 0.2312,
"step": 438
},
{
"epoch": 0.6484490398818316,
"grad_norm": 0.3208939731121063,
"learning_rate": 0.0002,
"loss": 0.3245,
"step": 439
},
{
"epoch": 0.6499261447562777,
"grad_norm": 0.26186874508857727,
"learning_rate": 0.0002,
"loss": 0.2485,
"step": 440
},
{
"epoch": 0.6514032496307238,
"grad_norm": 0.27923303842544556,
"learning_rate": 0.0002,
"loss": 0.3221,
"step": 441
},
{
"epoch": 0.6528803545051699,
"grad_norm": 0.28155946731567383,
"learning_rate": 0.0002,
"loss": 0.2843,
"step": 442
},
{
"epoch": 0.654357459379616,
"grad_norm": 0.28456977009773254,
"learning_rate": 0.0002,
"loss": 0.296,
"step": 443
},
{
"epoch": 0.6558345642540621,
"grad_norm": 0.27252209186553955,
"learning_rate": 0.0002,
"loss": 0.2765,
"step": 444
},
{
"epoch": 0.6573116691285081,
"grad_norm": 0.30992233753204346,
"learning_rate": 0.0002,
"loss": 0.3055,
"step": 445
},
{
"epoch": 0.6587887740029542,
"grad_norm": 0.30148544907569885,
"learning_rate": 0.0002,
"loss": 0.3059,
"step": 446
},
{
"epoch": 0.6602658788774003,
"grad_norm": 0.29087716341018677,
"learning_rate": 0.0002,
"loss": 0.257,
"step": 447
},
{
"epoch": 0.6617429837518464,
"grad_norm": 0.30917656421661377,
"learning_rate": 0.0002,
"loss": 0.3096,
"step": 448
},
{
"epoch": 0.6632200886262924,
"grad_norm": 0.311759352684021,
"learning_rate": 0.0002,
"loss": 0.2842,
"step": 449
},
{
"epoch": 0.6646971935007385,
"grad_norm": 0.2612153887748718,
"learning_rate": 0.0002,
"loss": 0.2659,
"step": 450
},
{
"epoch": 0.6661742983751846,
"grad_norm": 0.2954850196838379,
"learning_rate": 0.0002,
"loss": 0.2755,
"step": 451
},
{
"epoch": 0.6676514032496307,
"grad_norm": 0.3181207776069641,
"learning_rate": 0.0002,
"loss": 0.3163,
"step": 452
},
{
"epoch": 0.6691285081240768,
"grad_norm": 0.2802172899246216,
"learning_rate": 0.0002,
"loss": 0.3007,
"step": 453
},
{
"epoch": 0.670605612998523,
"grad_norm": 0.2662009298801422,
"learning_rate": 0.0002,
"loss": 0.2571,
"step": 454
},
{
"epoch": 0.672082717872969,
"grad_norm": 0.2844826579093933,
"learning_rate": 0.0002,
"loss": 0.3074,
"step": 455
},
{
"epoch": 0.6735598227474151,
"grad_norm": 0.2758782207965851,
"learning_rate": 0.0002,
"loss": 0.2773,
"step": 456
},
{
"epoch": 0.6750369276218612,
"grad_norm": 0.2567600607872009,
"learning_rate": 0.0002,
"loss": 0.2742,
"step": 457
},
{
"epoch": 0.6765140324963073,
"grad_norm": 0.34004896879196167,
"learning_rate": 0.0002,
"loss": 0.2288,
"step": 458
},
{
"epoch": 0.6779911373707533,
"grad_norm": 0.2983347475528717,
"learning_rate": 0.0002,
"loss": 0.2812,
"step": 459
},
{
"epoch": 0.6794682422451994,
"grad_norm": 0.29728880524635315,
"learning_rate": 0.0002,
"loss": 0.307,
"step": 460
},
{
"epoch": 0.6809453471196455,
"grad_norm": 0.31359198689460754,
"learning_rate": 0.0002,
"loss": 0.3101,
"step": 461
},
{
"epoch": 0.6824224519940916,
"grad_norm": 0.27619168162345886,
"learning_rate": 0.0002,
"loss": 0.2779,
"step": 462
},
{
"epoch": 0.6838995568685377,
"grad_norm": 0.3086981773376465,
"learning_rate": 0.0002,
"loss": 0.3097,
"step": 463
},
{
"epoch": 0.6853766617429837,
"grad_norm": 0.25216472148895264,
"learning_rate": 0.0002,
"loss": 0.2468,
"step": 464
},
{
"epoch": 0.6868537666174298,
"grad_norm": 0.26497989892959595,
"learning_rate": 0.0002,
"loss": 0.2528,
"step": 465
},
{
"epoch": 0.6883308714918759,
"grad_norm": 0.27617159485816956,
"learning_rate": 0.0002,
"loss": 0.2749,
"step": 466
},
{
"epoch": 0.689807976366322,
"grad_norm": 0.30501970648765564,
"learning_rate": 0.0002,
"loss": 0.3091,
"step": 467
},
{
"epoch": 0.691285081240768,
"grad_norm": 0.3360370099544525,
"learning_rate": 0.0002,
"loss": 0.3565,
"step": 468
},
{
"epoch": 0.6927621861152142,
"grad_norm": 0.27070116996765137,
"learning_rate": 0.0002,
"loss": 0.2705,
"step": 469
},
{
"epoch": 0.6942392909896603,
"grad_norm": 0.29874977469444275,
"learning_rate": 0.0002,
"loss": 0.2996,
"step": 470
},
{
"epoch": 0.6957163958641064,
"grad_norm": 0.294386088848114,
"learning_rate": 0.0002,
"loss": 0.2894,
"step": 471
},
{
"epoch": 0.6971935007385525,
"grad_norm": 0.3233067989349365,
"learning_rate": 0.0002,
"loss": 0.3024,
"step": 472
},
{
"epoch": 0.6986706056129985,
"grad_norm": 0.31051644682884216,
"learning_rate": 0.0002,
"loss": 0.3339,
"step": 473
},
{
"epoch": 0.7001477104874446,
"grad_norm": 0.28541213274002075,
"learning_rate": 0.0002,
"loss": 0.3097,
"step": 474
},
{
"epoch": 0.7016248153618907,
"grad_norm": 0.30758950114250183,
"learning_rate": 0.0002,
"loss": 0.3221,
"step": 475
},
{
"epoch": 0.7031019202363368,
"grad_norm": 0.37882164120674133,
"learning_rate": 0.0002,
"loss": 0.2922,
"step": 476
},
{
"epoch": 0.7045790251107829,
"grad_norm": 0.2521478533744812,
"learning_rate": 0.0002,
"loss": 0.2535,
"step": 477
},
{
"epoch": 0.7060561299852289,
"grad_norm": 0.36088013648986816,
"learning_rate": 0.0002,
"loss": 0.2863,
"step": 478
},
{
"epoch": 0.707533234859675,
"grad_norm": 0.31090793013572693,
"learning_rate": 0.0002,
"loss": 0.311,
"step": 479
},
{
"epoch": 0.7090103397341211,
"grad_norm": 0.2360762357711792,
"learning_rate": 0.0002,
"loss": 0.216,
"step": 480
},
{
"epoch": 0.7104874446085672,
"grad_norm": 0.34354060888290405,
"learning_rate": 0.0002,
"loss": 0.2838,
"step": 481
},
{
"epoch": 0.7119645494830132,
"grad_norm": 0.2607513666152954,
"learning_rate": 0.0002,
"loss": 0.2527,
"step": 482
},
{
"epoch": 0.7134416543574594,
"grad_norm": 0.3016189634799957,
"learning_rate": 0.0002,
"loss": 0.2782,
"step": 483
},
{
"epoch": 0.7149187592319055,
"grad_norm": 3.6188247203826904,
"learning_rate": 0.0002,
"loss": 0.2884,
"step": 484
},
{
"epoch": 0.7163958641063516,
"grad_norm": 0.3072677552700043,
"learning_rate": 0.0002,
"loss": 0.3263,
"step": 485
},
{
"epoch": 0.7178729689807977,
"grad_norm": 0.28074517846107483,
"learning_rate": 0.0002,
"loss": 0.298,
"step": 486
},
{
"epoch": 0.7193500738552437,
"grad_norm": 0.3235277831554413,
"learning_rate": 0.0002,
"loss": 0.2615,
"step": 487
},
{
"epoch": 0.7208271787296898,
"grad_norm": 2.001945734024048,
"learning_rate": 0.0002,
"loss": 0.3925,
"step": 488
},
{
"epoch": 0.7223042836041359,
"grad_norm": 0.29725533723831177,
"learning_rate": 0.0002,
"loss": 0.2874,
"step": 489
},
{
"epoch": 0.723781388478582,
"grad_norm": 0.28706061840057373,
"learning_rate": 0.0002,
"loss": 0.2898,
"step": 490
},
{
"epoch": 0.725258493353028,
"grad_norm": 0.2864967882633209,
"learning_rate": 0.0002,
"loss": 0.2894,
"step": 491
},
{
"epoch": 0.7267355982274741,
"grad_norm": 0.2824801802635193,
"learning_rate": 0.0002,
"loss": 0.2706,
"step": 492
},
{
"epoch": 0.7282127031019202,
"grad_norm": 0.27492067217826843,
"learning_rate": 0.0002,
"loss": 0.2516,
"step": 493
},
{
"epoch": 0.7296898079763663,
"grad_norm": 0.2876488268375397,
"learning_rate": 0.0002,
"loss": 0.2862,
"step": 494
},
{
"epoch": 0.7311669128508124,
"grad_norm": 0.30311787128448486,
"learning_rate": 0.0002,
"loss": 0.2953,
"step": 495
},
{
"epoch": 0.7326440177252584,
"grad_norm": 0.277235209941864,
"learning_rate": 0.0002,
"loss": 0.2981,
"step": 496
},
{
"epoch": 0.7341211225997046,
"grad_norm": 0.30590546131134033,
"learning_rate": 0.0002,
"loss": 0.3557,
"step": 497
},
{
"epoch": 0.7355982274741507,
"grad_norm": 0.3205493986606598,
"learning_rate": 0.0002,
"loss": 0.3304,
"step": 498
},
{
"epoch": 0.7370753323485968,
"grad_norm": 0.2640839219093323,
"learning_rate": 0.0002,
"loss": 0.2807,
"step": 499
},
{
"epoch": 0.7385524372230429,
"grad_norm": 0.27507102489471436,
"learning_rate": 0.0002,
"loss": 0.2565,
"step": 500
},
{
"epoch": 0.740029542097489,
"grad_norm": 0.2716003954410553,
"learning_rate": 0.0002,
"loss": 0.2684,
"step": 501
},
{
"epoch": 0.741506646971935,
"grad_norm": 0.2893518805503845,
"learning_rate": 0.0002,
"loss": 0.2708,
"step": 502
},
{
"epoch": 0.7429837518463811,
"grad_norm": 0.2790103256702423,
"learning_rate": 0.0002,
"loss": 0.2809,
"step": 503
},
{
"epoch": 0.7444608567208272,
"grad_norm": 0.29344794154167175,
"learning_rate": 0.0002,
"loss": 0.2961,
"step": 504
},
{
"epoch": 0.7459379615952733,
"grad_norm": 0.3118347227573395,
"learning_rate": 0.0002,
"loss": 0.3184,
"step": 505
},
{
"epoch": 0.7474150664697193,
"grad_norm": 0.29491183161735535,
"learning_rate": 0.0002,
"loss": 0.3194,
"step": 506
},
{
"epoch": 0.7488921713441654,
"grad_norm": 0.3007814586162567,
"learning_rate": 0.0002,
"loss": 0.3,
"step": 507
},
{
"epoch": 0.7503692762186115,
"grad_norm": 0.3303704261779785,
"learning_rate": 0.0002,
"loss": 0.2544,
"step": 508
},
{
"epoch": 0.7518463810930576,
"grad_norm": 0.28095510601997375,
"learning_rate": 0.0002,
"loss": 0.2774,
"step": 509
},
{
"epoch": 0.7533234859675036,
"grad_norm": 0.2669844329357147,
"learning_rate": 0.0002,
"loss": 0.2575,
"step": 510
},
{
"epoch": 0.7548005908419497,
"grad_norm": 0.29896053671836853,
"learning_rate": 0.0002,
"loss": 0.2823,
"step": 511
},
{
"epoch": 0.7562776957163959,
"grad_norm": 0.27470019459724426,
"learning_rate": 0.0002,
"loss": 0.2843,
"step": 512
},
{
"epoch": 0.757754800590842,
"grad_norm": 0.27731189131736755,
"learning_rate": 0.0002,
"loss": 0.2504,
"step": 513
},
{
"epoch": 0.7592319054652881,
"grad_norm": 0.2816368639469147,
"learning_rate": 0.0002,
"loss": 0.2877,
"step": 514
},
{
"epoch": 0.7607090103397341,
"grad_norm": 0.2858635485172272,
"learning_rate": 0.0002,
"loss": 0.3348,
"step": 515
},
{
"epoch": 0.7621861152141802,
"grad_norm": 0.2964169979095459,
"learning_rate": 0.0002,
"loss": 0.2775,
"step": 516
},
{
"epoch": 0.7636632200886263,
"grad_norm": 0.2534787058830261,
"learning_rate": 0.0002,
"loss": 0.2274,
"step": 517
},
{
"epoch": 0.7651403249630724,
"grad_norm": 0.28982672095298767,
"learning_rate": 0.0002,
"loss": 0.2939,
"step": 518
},
{
"epoch": 0.7666174298375185,
"grad_norm": 0.27323317527770996,
"learning_rate": 0.0002,
"loss": 0.2842,
"step": 519
},
{
"epoch": 0.7680945347119645,
"grad_norm": 0.27642300724983215,
"learning_rate": 0.0002,
"loss": 0.2966,
"step": 520
},
{
"epoch": 0.7695716395864106,
"grad_norm": 0.26599329710006714,
"learning_rate": 0.0002,
"loss": 0.2326,
"step": 521
},
{
"epoch": 0.7710487444608567,
"grad_norm": 0.2631528079509735,
"learning_rate": 0.0002,
"loss": 0.2771,
"step": 522
},
{
"epoch": 0.7725258493353028,
"grad_norm": 0.2790911793708801,
"learning_rate": 0.0002,
"loss": 0.2898,
"step": 523
},
{
"epoch": 0.7740029542097489,
"grad_norm": 0.266379714012146,
"learning_rate": 0.0002,
"loss": 0.2685,
"step": 524
},
{
"epoch": 0.7754800590841949,
"grad_norm": 0.30508288741111755,
"learning_rate": 0.0002,
"loss": 0.2909,
"step": 525
},
{
"epoch": 0.7769571639586411,
"grad_norm": 0.2602393329143524,
"learning_rate": 0.0002,
"loss": 0.2305,
"step": 526
},
{
"epoch": 0.7784342688330872,
"grad_norm": 0.3033619523048401,
"learning_rate": 0.0002,
"loss": 0.2689,
"step": 527
},
{
"epoch": 0.7799113737075333,
"grad_norm": 0.2758871614933014,
"learning_rate": 0.0002,
"loss": 0.2631,
"step": 528
},
{
"epoch": 0.7813884785819794,
"grad_norm": 0.2910580039024353,
"learning_rate": 0.0002,
"loss": 0.2844,
"step": 529
},
{
"epoch": 0.7828655834564254,
"grad_norm": 0.33454883098602295,
"learning_rate": 0.0002,
"loss": 0.301,
"step": 530
},
{
"epoch": 0.7843426883308715,
"grad_norm": 0.31416234374046326,
"learning_rate": 0.0002,
"loss": 0.2948,
"step": 531
},
{
"epoch": 0.7858197932053176,
"grad_norm": 0.3144732117652893,
"learning_rate": 0.0002,
"loss": 0.2649,
"step": 532
},
{
"epoch": 0.7872968980797637,
"grad_norm": 0.2666049599647522,
"learning_rate": 0.0002,
"loss": 0.2602,
"step": 533
},
{
"epoch": 0.7887740029542097,
"grad_norm": 0.26852795481681824,
"learning_rate": 0.0002,
"loss": 0.2761,
"step": 534
},
{
"epoch": 0.7902511078286558,
"grad_norm": 0.2828836143016815,
"learning_rate": 0.0002,
"loss": 0.2643,
"step": 535
},
{
"epoch": 0.7917282127031019,
"grad_norm": 0.24941638112068176,
"learning_rate": 0.0002,
"loss": 0.2715,
"step": 536
},
{
"epoch": 0.793205317577548,
"grad_norm": 0.28167465329170227,
"learning_rate": 0.0002,
"loss": 0.2886,
"step": 537
},
{
"epoch": 0.794682422451994,
"grad_norm": 0.27295514941215515,
"learning_rate": 0.0002,
"loss": 0.2838,
"step": 538
},
{
"epoch": 0.7961595273264401,
"grad_norm": 0.28401198983192444,
"learning_rate": 0.0002,
"loss": 0.3027,
"step": 539
},
{
"epoch": 0.7976366322008862,
"grad_norm": 0.36002475023269653,
"learning_rate": 0.0002,
"loss": 0.2743,
"step": 540
},
{
"epoch": 0.7991137370753324,
"grad_norm": 0.24884235858917236,
"learning_rate": 0.0002,
"loss": 0.2236,
"step": 541
},
{
"epoch": 0.8005908419497785,
"grad_norm": 0.29792970418930054,
"learning_rate": 0.0002,
"loss": 0.2685,
"step": 542
},
{
"epoch": 0.8020679468242246,
"grad_norm": 0.293630450963974,
"learning_rate": 0.0002,
"loss": 0.3121,
"step": 543
},
{
"epoch": 0.8035450516986706,
"grad_norm": 0.30826666951179504,
"learning_rate": 0.0002,
"loss": 0.2886,
"step": 544
},
{
"epoch": 0.8050221565731167,
"grad_norm": 0.2855941355228424,
"learning_rate": 0.0002,
"loss": 0.2947,
"step": 545
},
{
"epoch": 0.8064992614475628,
"grad_norm": 0.2649870812892914,
"learning_rate": 0.0002,
"loss": 0.2655,
"step": 546
},
{
"epoch": 0.8079763663220089,
"grad_norm": 0.27176880836486816,
"learning_rate": 0.0002,
"loss": 0.2715,
"step": 547
},
{
"epoch": 0.8094534711964549,
"grad_norm": 0.3225911557674408,
"learning_rate": 0.0002,
"loss": 0.3404,
"step": 548
},
{
"epoch": 0.810930576070901,
"grad_norm": 0.30113476514816284,
"learning_rate": 0.0002,
"loss": 0.3347,
"step": 549
},
{
"epoch": 0.8124076809453471,
"grad_norm": 0.2784980535507202,
"learning_rate": 0.0002,
"loss": 0.2599,
"step": 550
},
{
"epoch": 0.8138847858197932,
"grad_norm": 0.2825387716293335,
"learning_rate": 0.0002,
"loss": 0.2759,
"step": 551
},
{
"epoch": 0.8153618906942393,
"grad_norm": 0.26612088084220886,
"learning_rate": 0.0002,
"loss": 0.2464,
"step": 552
},
{
"epoch": 0.8168389955686853,
"grad_norm": 0.2672181725502014,
"learning_rate": 0.0002,
"loss": 0.2182,
"step": 553
},
{
"epoch": 0.8183161004431314,
"grad_norm": 0.28279784321784973,
"learning_rate": 0.0002,
"loss": 0.2612,
"step": 554
},
{
"epoch": 0.8197932053175776,
"grad_norm": 0.277281790971756,
"learning_rate": 0.0002,
"loss": 0.2647,
"step": 555
},
{
"epoch": 0.8212703101920237,
"grad_norm": 0.2784774899482727,
"learning_rate": 0.0002,
"loss": 0.2738,
"step": 556
},
{
"epoch": 0.8227474150664698,
"grad_norm": 0.2438610941171646,
"learning_rate": 0.0002,
"loss": 0.2274,
"step": 557
},
{
"epoch": 0.8242245199409158,
"grad_norm": 0.28168389201164246,
"learning_rate": 0.0002,
"loss": 0.2604,
"step": 558
},
{
"epoch": 0.8257016248153619,
"grad_norm": 0.26112061738967896,
"learning_rate": 0.0002,
"loss": 0.2215,
"step": 559
},
{
"epoch": 0.827178729689808,
"grad_norm": 0.25962984561920166,
"learning_rate": 0.0002,
"loss": 0.2834,
"step": 560
},
{
"epoch": 0.8286558345642541,
"grad_norm": 0.27150726318359375,
"learning_rate": 0.0002,
"loss": 0.2877,
"step": 561
},
{
"epoch": 0.8301329394387001,
"grad_norm": 0.2753923237323761,
"learning_rate": 0.0002,
"loss": 0.298,
"step": 562
},
{
"epoch": 0.8316100443131462,
"grad_norm": 0.37228959798812866,
"learning_rate": 0.0002,
"loss": 0.333,
"step": 563
},
{
"epoch": 0.8330871491875923,
"grad_norm": 0.27188584208488464,
"learning_rate": 0.0002,
"loss": 0.2578,
"step": 564
},
{
"epoch": 0.8345642540620384,
"grad_norm": 0.2894970178604126,
"learning_rate": 0.0002,
"loss": 0.3051,
"step": 565
},
{
"epoch": 0.8360413589364845,
"grad_norm": 0.2769443690776825,
"learning_rate": 0.0002,
"loss": 0.2833,
"step": 566
},
{
"epoch": 0.8375184638109305,
"grad_norm": 0.25693845748901367,
"learning_rate": 0.0002,
"loss": 0.2571,
"step": 567
},
{
"epoch": 0.8389955686853766,
"grad_norm": 0.27856937050819397,
"learning_rate": 0.0002,
"loss": 0.2942,
"step": 568
},
{
"epoch": 0.8404726735598228,
"grad_norm": 0.2575175166130066,
"learning_rate": 0.0002,
"loss": 0.2733,
"step": 569
},
{
"epoch": 0.8419497784342689,
"grad_norm": 0.27574828267097473,
"learning_rate": 0.0002,
"loss": 0.2642,
"step": 570
},
{
"epoch": 0.843426883308715,
"grad_norm": 0.2522878646850586,
"learning_rate": 0.0002,
"loss": 0.2377,
"step": 571
},
{
"epoch": 0.844903988183161,
"grad_norm": 0.26878973841667175,
"learning_rate": 0.0002,
"loss": 0.2635,
"step": 572
},
{
"epoch": 0.8463810930576071,
"grad_norm": 0.25874340534210205,
"learning_rate": 0.0002,
"loss": 0.2622,
"step": 573
},
{
"epoch": 0.8478581979320532,
"grad_norm": 0.2808675765991211,
"learning_rate": 0.0002,
"loss": 0.2679,
"step": 574
},
{
"epoch": 0.8493353028064993,
"grad_norm": 0.3035877048969269,
"learning_rate": 0.0002,
"loss": 0.3097,
"step": 575
},
{
"epoch": 0.8508124076809453,
"grad_norm": 0.2748059928417206,
"learning_rate": 0.0002,
"loss": 0.2986,
"step": 576
},
{
"epoch": 0.8522895125553914,
"grad_norm": 0.2966136932373047,
"learning_rate": 0.0002,
"loss": 0.2799,
"step": 577
},
{
"epoch": 0.8537666174298375,
"grad_norm": 1.3606016635894775,
"learning_rate": 0.0002,
"loss": 0.2808,
"step": 578
},
{
"epoch": 0.8552437223042836,
"grad_norm": 0.2695050835609436,
"learning_rate": 0.0002,
"loss": 0.2417,
"step": 579
},
{
"epoch": 0.8567208271787297,
"grad_norm": 0.26403385400772095,
"learning_rate": 0.0002,
"loss": 0.2474,
"step": 580
},
{
"epoch": 0.8581979320531757,
"grad_norm": 0.2719348669052124,
"learning_rate": 0.0002,
"loss": 0.2558,
"step": 581
},
{
"epoch": 0.8596750369276218,
"grad_norm": 0.2620692253112793,
"learning_rate": 0.0002,
"loss": 0.2704,
"step": 582
},
{
"epoch": 0.8611521418020679,
"grad_norm": 0.3160097897052765,
"learning_rate": 0.0002,
"loss": 0.2967,
"step": 583
},
{
"epoch": 0.8626292466765141,
"grad_norm": 0.27527111768722534,
"learning_rate": 0.0002,
"loss": 0.2508,
"step": 584
},
{
"epoch": 0.8641063515509602,
"grad_norm": 0.27846094965934753,
"learning_rate": 0.0002,
"loss": 0.2766,
"step": 585
},
{
"epoch": 0.8655834564254062,
"grad_norm": 0.2789734899997711,
"learning_rate": 0.0002,
"loss": 0.2857,
"step": 586
},
{
"epoch": 0.8670605612998523,
"grad_norm": 0.30942806601524353,
"learning_rate": 0.0002,
"loss": 0.3216,
"step": 587
},
{
"epoch": 0.8685376661742984,
"grad_norm": 0.27556589245796204,
"learning_rate": 0.0002,
"loss": 0.257,
"step": 588
},
{
"epoch": 0.8700147710487445,
"grad_norm": 0.2209852784872055,
"learning_rate": 0.0002,
"loss": 0.1862,
"step": 589
},
{
"epoch": 0.8714918759231906,
"grad_norm": 0.29638856649398804,
"learning_rate": 0.0002,
"loss": 0.3049,
"step": 590
},
{
"epoch": 0.8729689807976366,
"grad_norm": 0.309600830078125,
"learning_rate": 0.0002,
"loss": 0.2675,
"step": 591
},
{
"epoch": 0.8744460856720827,
"grad_norm": 0.28644561767578125,
"learning_rate": 0.0002,
"loss": 0.2859,
"step": 592
},
{
"epoch": 0.8759231905465288,
"grad_norm": 0.25827983021736145,
"learning_rate": 0.0002,
"loss": 0.2543,
"step": 593
},
{
"epoch": 0.8774002954209749,
"grad_norm": 0.2538520395755768,
"learning_rate": 0.0002,
"loss": 0.2523,
"step": 594
},
{
"epoch": 0.8788774002954209,
"grad_norm": 0.26979878544807434,
"learning_rate": 0.0002,
"loss": 0.2379,
"step": 595
},
{
"epoch": 0.880354505169867,
"grad_norm": 0.2815455496311188,
"learning_rate": 0.0002,
"loss": 0.2793,
"step": 596
},
{
"epoch": 0.8818316100443131,
"grad_norm": 0.2549828588962555,
"learning_rate": 0.0002,
"loss": 0.2743,
"step": 597
},
{
"epoch": 0.8833087149187593,
"grad_norm": 0.24497728049755096,
"learning_rate": 0.0002,
"loss": 0.2234,
"step": 598
},
{
"epoch": 0.8847858197932054,
"grad_norm": 0.2854422628879547,
"learning_rate": 0.0002,
"loss": 0.2858,
"step": 599
},
{
"epoch": 0.8862629246676514,
"grad_norm": 0.2807024121284485,
"learning_rate": 0.0002,
"loss": 0.2608,
"step": 600
},
{
"epoch": 0.8877400295420975,
"grad_norm": 0.2663458585739136,
"learning_rate": 0.0002,
"loss": 0.2399,
"step": 601
},
{
"epoch": 0.8892171344165436,
"grad_norm": 0.2760714888572693,
"learning_rate": 0.0002,
"loss": 0.2934,
"step": 602
},
{
"epoch": 0.8906942392909897,
"grad_norm": 0.3003925383090973,
"learning_rate": 0.0002,
"loss": 0.2765,
"step": 603
},
{
"epoch": 0.8921713441654358,
"grad_norm": 0.32742151618003845,
"learning_rate": 0.0002,
"loss": 0.3116,
"step": 604
},
{
"epoch": 0.8936484490398818,
"grad_norm": 0.29396241903305054,
"learning_rate": 0.0002,
"loss": 0.2548,
"step": 605
},
{
"epoch": 0.8951255539143279,
"grad_norm": 0.28835952281951904,
"learning_rate": 0.0002,
"loss": 0.2665,
"step": 606
},
{
"epoch": 0.896602658788774,
"grad_norm": 0.2689400017261505,
"learning_rate": 0.0002,
"loss": 0.2554,
"step": 607
},
{
"epoch": 0.8980797636632201,
"grad_norm": 0.27350932359695435,
"learning_rate": 0.0002,
"loss": 0.2474,
"step": 608
},
{
"epoch": 0.8995568685376661,
"grad_norm": 0.26769059896469116,
"learning_rate": 0.0002,
"loss": 0.286,
"step": 609
},
{
"epoch": 0.9010339734121122,
"grad_norm": 0.25921839475631714,
"learning_rate": 0.0002,
"loss": 0.2438,
"step": 610
},
{
"epoch": 0.9025110782865583,
"grad_norm": 0.26628950238227844,
"learning_rate": 0.0002,
"loss": 0.2713,
"step": 611
},
{
"epoch": 0.9039881831610044,
"grad_norm": 0.26283326745033264,
"learning_rate": 0.0002,
"loss": 0.2696,
"step": 612
},
{
"epoch": 0.9054652880354506,
"grad_norm": 0.29980388283729553,
"learning_rate": 0.0002,
"loss": 0.2581,
"step": 613
},
{
"epoch": 0.9069423929098966,
"grad_norm": 0.2768777310848236,
"learning_rate": 0.0002,
"loss": 0.2853,
"step": 614
},
{
"epoch": 0.9084194977843427,
"grad_norm": 0.27376455068588257,
"learning_rate": 0.0002,
"loss": 0.2836,
"step": 615
},
{
"epoch": 0.9098966026587888,
"grad_norm": 0.28933191299438477,
"learning_rate": 0.0002,
"loss": 0.3071,
"step": 616
},
{
"epoch": 0.9113737075332349,
"grad_norm": 0.3081536293029785,
"learning_rate": 0.0002,
"loss": 0.2461,
"step": 617
},
{
"epoch": 0.912850812407681,
"grad_norm": 0.2886345386505127,
"learning_rate": 0.0002,
"loss": 0.3,
"step": 618
},
{
"epoch": 0.914327917282127,
"grad_norm": 0.2829267680644989,
"learning_rate": 0.0002,
"loss": 0.2746,
"step": 619
},
{
"epoch": 0.9158050221565731,
"grad_norm": 0.2512478232383728,
"learning_rate": 0.0002,
"loss": 0.2435,
"step": 620
},
{
"epoch": 0.9172821270310192,
"grad_norm": 0.4229198694229126,
"learning_rate": 0.0002,
"loss": 0.2847,
"step": 621
},
{
"epoch": 0.9187592319054653,
"grad_norm": 0.2993115186691284,
"learning_rate": 0.0002,
"loss": 0.2869,
"step": 622
},
{
"epoch": 0.9202363367799113,
"grad_norm": 0.2935909330844879,
"learning_rate": 0.0002,
"loss": 0.2691,
"step": 623
},
{
"epoch": 0.9217134416543574,
"grad_norm": 0.3156206011772156,
"learning_rate": 0.0002,
"loss": 0.2936,
"step": 624
},
{
"epoch": 0.9231905465288035,
"grad_norm": 0.2829430401325226,
"learning_rate": 0.0002,
"loss": 0.2782,
"step": 625
},
{
"epoch": 0.9246676514032496,
"grad_norm": 0.2769679129123688,
"learning_rate": 0.0002,
"loss": 0.2751,
"step": 626
},
{
"epoch": 0.9261447562776958,
"grad_norm": 0.2695547044277191,
"learning_rate": 0.0002,
"loss": 0.2768,
"step": 627
},
{
"epoch": 0.9276218611521418,
"grad_norm": 0.2564750909805298,
"learning_rate": 0.0002,
"loss": 0.2691,
"step": 628
},
{
"epoch": 0.9290989660265879,
"grad_norm": 0.3216243386268616,
"learning_rate": 0.0002,
"loss": 0.2895,
"step": 629
},
{
"epoch": 0.930576070901034,
"grad_norm": 0.26920050382614136,
"learning_rate": 0.0002,
"loss": 0.2943,
"step": 630
},
{
"epoch": 0.9320531757754801,
"grad_norm": 0.23926717042922974,
"learning_rate": 0.0002,
"loss": 0.2114,
"step": 631
},
{
"epoch": 0.9335302806499262,
"grad_norm": 0.3015134036540985,
"learning_rate": 0.0002,
"loss": 0.2889,
"step": 632
},
{
"epoch": 0.9350073855243722,
"grad_norm": 0.29262953996658325,
"learning_rate": 0.0002,
"loss": 0.2977,
"step": 633
},
{
"epoch": 0.9364844903988183,
"grad_norm": 0.27330338954925537,
"learning_rate": 0.0002,
"loss": 0.2706,
"step": 634
},
{
"epoch": 0.9379615952732644,
"grad_norm": 0.2691650092601776,
"learning_rate": 0.0002,
"loss": 0.2471,
"step": 635
},
{
"epoch": 0.9394387001477105,
"grad_norm": 0.30574268102645874,
"learning_rate": 0.0002,
"loss": 0.2977,
"step": 636
},
{
"epoch": 0.9409158050221565,
"grad_norm": 0.2739352881908417,
"learning_rate": 0.0002,
"loss": 0.2825,
"step": 637
},
{
"epoch": 0.9423929098966026,
"grad_norm": 0.3041648268699646,
"learning_rate": 0.0002,
"loss": 0.3344,
"step": 638
},
{
"epoch": 0.9438700147710487,
"grad_norm": 0.2827674150466919,
"learning_rate": 0.0002,
"loss": 0.2618,
"step": 639
},
{
"epoch": 0.9453471196454948,
"grad_norm": 0.25939705967903137,
"learning_rate": 0.0002,
"loss": 0.2279,
"step": 640
},
{
"epoch": 0.946824224519941,
"grad_norm": 0.3013932406902313,
"learning_rate": 0.0002,
"loss": 0.3114,
"step": 641
},
{
"epoch": 0.948301329394387,
"grad_norm": 4.885525703430176,
"learning_rate": 0.0002,
"loss": 0.2818,
"step": 642
},
{
"epoch": 0.9497784342688331,
"grad_norm": 0.25590044260025024,
"learning_rate": 0.0002,
"loss": 0.2427,
"step": 643
},
{
"epoch": 0.9512555391432792,
"grad_norm": 0.2372172772884369,
"learning_rate": 0.0002,
"loss": 0.2397,
"step": 644
},
{
"epoch": 0.9527326440177253,
"grad_norm": 0.26376283168792725,
"learning_rate": 0.0002,
"loss": 0.2624,
"step": 645
},
{
"epoch": 0.9542097488921714,
"grad_norm": 0.27342459559440613,
"learning_rate": 0.0002,
"loss": 0.2652,
"step": 646
},
{
"epoch": 0.9556868537666174,
"grad_norm": 0.260745644569397,
"learning_rate": 0.0002,
"loss": 0.2523,
"step": 647
},
{
"epoch": 0.9571639586410635,
"grad_norm": 0.24873754382133484,
"learning_rate": 0.0002,
"loss": 0.2238,
"step": 648
},
{
"epoch": 0.9586410635155096,
"grad_norm": 0.281990110874176,
"learning_rate": 0.0002,
"loss": 0.235,
"step": 649
},
{
"epoch": 0.9601181683899557,
"grad_norm": 0.25676026940345764,
"learning_rate": 0.0002,
"loss": 0.215,
"step": 650
},
{
"epoch": 0.9615952732644018,
"grad_norm": 0.2927687466144562,
"learning_rate": 0.0002,
"loss": 0.2764,
"step": 651
},
{
"epoch": 0.9630723781388478,
"grad_norm": 0.26322099566459656,
"learning_rate": 0.0002,
"loss": 0.2511,
"step": 652
},
{
"epoch": 0.9645494830132939,
"grad_norm": 0.2764233350753784,
"learning_rate": 0.0002,
"loss": 0.2439,
"step": 653
},
{
"epoch": 0.96602658788774,
"grad_norm": 0.29849788546562195,
"learning_rate": 0.0002,
"loss": 0.2689,
"step": 654
},
{
"epoch": 0.9675036927621861,
"grad_norm": 0.2834247648715973,
"learning_rate": 0.0002,
"loss": 0.3398,
"step": 655
},
{
"epoch": 0.9689807976366323,
"grad_norm": 0.28436174988746643,
"learning_rate": 0.0002,
"loss": 0.3086,
"step": 656
},
{
"epoch": 0.9704579025110783,
"grad_norm": 0.24340803921222687,
"learning_rate": 0.0002,
"loss": 0.2288,
"step": 657
},
{
"epoch": 0.9719350073855244,
"grad_norm": 0.2577742338180542,
"learning_rate": 0.0002,
"loss": 0.2598,
"step": 658
},
{
"epoch": 0.9734121122599705,
"grad_norm": 0.28326281905174255,
"learning_rate": 0.0002,
"loss": 0.2862,
"step": 659
},
{
"epoch": 0.9748892171344166,
"grad_norm": 0.27066269516944885,
"learning_rate": 0.0002,
"loss": 0.2585,
"step": 660
},
{
"epoch": 0.9763663220088626,
"grad_norm": 0.26694634556770325,
"learning_rate": 0.0002,
"loss": 0.2788,
"step": 661
},
{
"epoch": 0.9778434268833087,
"grad_norm": 0.2890130281448364,
"learning_rate": 0.0002,
"loss": 0.3073,
"step": 662
},
{
"epoch": 0.9793205317577548,
"grad_norm": 0.26095882058143616,
"learning_rate": 0.0002,
"loss": 0.2184,
"step": 663
},
{
"epoch": 0.9807976366322009,
"grad_norm": 0.2648635506629944,
"learning_rate": 0.0002,
"loss": 0.2465,
"step": 664
},
{
"epoch": 0.982274741506647,
"grad_norm": 0.2354656457901001,
"learning_rate": 0.0002,
"loss": 0.2457,
"step": 665
},
{
"epoch": 0.983751846381093,
"grad_norm": 0.2767215669155121,
"learning_rate": 0.0002,
"loss": 0.2735,
"step": 666
},
{
"epoch": 0.9852289512555391,
"grad_norm": 0.27141231298446655,
"learning_rate": 0.0002,
"loss": 0.2589,
"step": 667
},
{
"epoch": 0.9867060561299852,
"grad_norm": 0.254549115896225,
"learning_rate": 0.0002,
"loss": 0.2785,
"step": 668
},
{
"epoch": 0.9881831610044313,
"grad_norm": 0.2712014317512512,
"learning_rate": 0.0002,
"loss": 0.2579,
"step": 669
},
{
"epoch": 0.9896602658788775,
"grad_norm": 0.26712852716445923,
"learning_rate": 0.0002,
"loss": 0.3165,
"step": 670
},
{
"epoch": 0.9911373707533235,
"grad_norm": 0.2829815447330475,
"learning_rate": 0.0002,
"loss": 0.2438,
"step": 671
},
{
"epoch": 0.9926144756277696,
"grad_norm": 0.27326712012290955,
"learning_rate": 0.0002,
"loss": 0.2622,
"step": 672
},
{
"epoch": 0.9940915805022157,
"grad_norm": 0.2569233179092407,
"learning_rate": 0.0002,
"loss": 0.2353,
"step": 673
},
{
"epoch": 0.9955686853766618,
"grad_norm": 0.28441140055656433,
"learning_rate": 0.0002,
"loss": 0.2732,
"step": 674
},
{
"epoch": 0.9970457902511078,
"grad_norm": 0.2831505835056305,
"learning_rate": 0.0002,
"loss": 0.2605,
"step": 675
},
{
"epoch": 0.9985228951255539,
"grad_norm": 0.26520466804504395,
"learning_rate": 0.0002,
"loss": 0.2614,
"step": 676
},
{
"epoch": 1.0,
"grad_norm": 0.7976667881011963,
"learning_rate": 0.0002,
"loss": 0.3423,
"step": 677
},
{
"epoch": 1.0,
"step": 677,
"total_flos": 1.1323313955746611e+17,
"train_loss": 0.3172708253432588,
"train_runtime": 2971.8662,
"train_samples_per_second": 1.82,
"train_steps_per_second": 0.228
}
],
"logging_steps": 1,
"max_steps": 677,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1323313955746611e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}