llava-next-video-7b-vqa-3000 / trainer_state.json
{
"best_metric": 0.12199707,
"best_model_checkpoint": "/data1/tzz/VQA/ckpt/llava_next_video/v2-20250226-080739/checkpoint-185",
"epoch": 0.9966329966329966,
"eval_steps": 500,
"global_step": 185,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0053872053872053875,
"grad_norm": 14.159545000064247,
"learning_rate": 1.0000000000000002e-06,
"loss": 31.90625,
"memory(GiB)": 22.53,
"step": 1,
"train_speed(iter/s)": 0.022985
},
{
"epoch": 0.010774410774410775,
"grad_norm": 14.616283963493206,
"learning_rate": 2.0000000000000003e-06,
"loss": 31.5234375,
"memory(GiB)": 22.53,
"step": 2,
"train_speed(iter/s)": 0.028711
},
{
"epoch": 0.01616161616161616,
"grad_norm": 13.121864716464238,
"learning_rate": 3e-06,
"loss": 33.6796875,
"memory(GiB)": 22.53,
"step": 3,
"train_speed(iter/s)": 0.031289
},
{
"epoch": 0.02154882154882155,
"grad_norm": 11.258740067609244,
"learning_rate": 4.000000000000001e-06,
"loss": 31.8203125,
"memory(GiB)": 22.53,
"step": 4,
"train_speed(iter/s)": 0.032739
},
{
"epoch": 0.026936026936026935,
"grad_norm": 13.170936715126654,
"learning_rate": 5e-06,
"loss": 29.2109375,
"memory(GiB)": 22.55,
"step": 5,
"train_speed(iter/s)": 0.033213
},
{
"epoch": 0.03232323232323232,
"grad_norm": 14.330929445232412,
"learning_rate": 6e-06,
"loss": 28.078125,
"memory(GiB)": 22.55,
"step": 6,
"train_speed(iter/s)": 0.033152
},
{
"epoch": 0.03771043771043771,
"grad_norm": 13.548506738998086,
"learning_rate": 7e-06,
"loss": 27.5078125,
"memory(GiB)": 22.55,
"step": 7,
"train_speed(iter/s)": 0.033486
},
{
"epoch": 0.0430976430976431,
"grad_norm": 8.666929263748118,
"learning_rate": 8.000000000000001e-06,
"loss": 24.9609375,
"memory(GiB)": 22.56,
"step": 8,
"train_speed(iter/s)": 0.033686
},
{
"epoch": 0.048484848484848485,
"grad_norm": 11.066925048714,
"learning_rate": 9e-06,
"loss": 19.890625,
"memory(GiB)": 22.57,
"step": 9,
"train_speed(iter/s)": 0.0337
},
{
"epoch": 0.05387205387205387,
"grad_norm": 8.973276554829988,
"learning_rate": 1e-05,
"loss": 14.328125,
"memory(GiB)": 22.58,
"step": 10,
"train_speed(iter/s)": 0.03368
},
{
"epoch": 0.05925925925925926,
"grad_norm": 4.92025256078084,
"learning_rate": 9.999194339645292e-06,
"loss": 11.275390625,
"memory(GiB)": 22.58,
"step": 11,
"train_speed(iter/s)": 0.033773
},
{
"epoch": 0.06464646464646465,
"grad_norm": 2.5239985209180706,
"learning_rate": 9.996777618216608e-06,
"loss": 9.6875,
"memory(GiB)": 22.58,
"step": 12,
"train_speed(iter/s)": 0.034222
},
{
"epoch": 0.07003367003367003,
"grad_norm": 3.3965201854332046,
"learning_rate": 9.992750614536606e-06,
"loss": 7.869140625,
"memory(GiB)": 22.58,
"step": 13,
"train_speed(iter/s)": 0.034559
},
{
"epoch": 0.07542087542087542,
"grad_norm": 3.83999730322345,
"learning_rate": 9.987114626364172e-06,
"loss": 7.22265625,
"memory(GiB)": 22.58,
"step": 14,
"train_speed(iter/s)": 0.034521
},
{
"epoch": 0.08080808080808081,
"grad_norm": 4.824626769554233,
"learning_rate": 9.979871469976197e-06,
"loss": 7.0576171875,
"memory(GiB)": 22.58,
"step": 15,
"train_speed(iter/s)": 0.03441
},
{
"epoch": 0.0861952861952862,
"grad_norm": 3.043760096951554,
"learning_rate": 9.971023479582258e-06,
"loss": 5.4990234375,
"memory(GiB)": 22.58,
"step": 16,
"train_speed(iter/s)": 0.034356
},
{
"epoch": 0.09158249158249158,
"grad_norm": 1.3971212531371173,
"learning_rate": 9.960573506572391e-06,
"loss": 4.044921875,
"memory(GiB)": 22.58,
"step": 17,
"train_speed(iter/s)": 0.03432
},
{
"epoch": 0.09696969696969697,
"grad_norm": 1.5367962214559587,
"learning_rate": 9.948524918598175e-06,
"loss": 3.44189453125,
"memory(GiB)": 22.58,
"step": 18,
"train_speed(iter/s)": 0.034225
},
{
"epoch": 0.10235690235690235,
"grad_norm": 1.2329087385122603,
"learning_rate": 9.934881598487478e-06,
"loss": 3.4072265625,
"memory(GiB)": 22.58,
"step": 19,
"train_speed(iter/s)": 0.034123
},
{
"epoch": 0.10774410774410774,
"grad_norm": 0.8648810367159049,
"learning_rate": 9.91964794299315e-06,
"loss": 3.0048828125,
"memory(GiB)": 22.58,
"step": 20,
"train_speed(iter/s)": 0.03406
},
{
"epoch": 0.11313131313131314,
"grad_norm": 1.1333084548737522,
"learning_rate": 9.902828861376101e-06,
"loss": 2.973876953125,
"memory(GiB)": 22.58,
"step": 21,
"train_speed(iter/s)": 0.03407
},
{
"epoch": 0.11851851851851852,
"grad_norm": 1.67328747436259,
"learning_rate": 9.884429773823238e-06,
"loss": 2.460693359375,
"memory(GiB)": 22.58,
"step": 22,
"train_speed(iter/s)": 0.033985
},
{
"epoch": 0.12390572390572391,
"grad_norm": 0.8370283899907709,
"learning_rate": 9.864456609700726e-06,
"loss": 2.162109375,
"memory(GiB)": 22.58,
"step": 23,
"train_speed(iter/s)": 0.033817
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.7984037408374535,
"learning_rate": 9.842915805643156e-06,
"loss": 2.711669921875,
"memory(GiB)": 22.58,
"step": 24,
"train_speed(iter/s)": 0.033701
},
{
"epoch": 0.13468013468013468,
"grad_norm": 0.5877571918682093,
"learning_rate": 9.819814303479268e-06,
"loss": 1.707275390625,
"memory(GiB)": 22.58,
"step": 25,
"train_speed(iter/s)": 0.033872
},
{
"epoch": 0.14006734006734006,
"grad_norm": 1.4800629465642858,
"learning_rate": 9.79515954799483e-06,
"loss": 2.813720703125,
"memory(GiB)": 22.58,
"step": 26,
"train_speed(iter/s)": 0.034031
},
{
"epoch": 0.14545454545454545,
"grad_norm": 2.1222533390916443,
"learning_rate": 9.768959484533461e-06,
"loss": 3.59912109375,
"memory(GiB)": 22.58,
"step": 27,
"train_speed(iter/s)": 0.034169
},
{
"epoch": 0.15084175084175083,
"grad_norm": 0.8369490081884605,
"learning_rate": 9.741222556436132e-06,
"loss": 1.89404296875,
"memory(GiB)": 22.58,
"step": 28,
"train_speed(iter/s)": 0.034295
},
{
"epoch": 0.15622895622895622,
"grad_norm": 0.5854633514891076,
"learning_rate": 9.711957702320176e-06,
"loss": 1.986328125,
"memory(GiB)": 22.58,
"step": 29,
"train_speed(iter/s)": 0.034448
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.35782476089852655,
"learning_rate": 9.681174353198687e-06,
"loss": 2.087890625,
"memory(GiB)": 22.58,
"step": 30,
"train_speed(iter/s)": 0.034568
},
{
"epoch": 0.16700336700336701,
"grad_norm": 0.7861618699933016,
"learning_rate": 9.648882429441258e-06,
"loss": 2.669921875,
"memory(GiB)": 22.58,
"step": 31,
"train_speed(iter/s)": 0.034675
},
{
"epoch": 0.1723905723905724,
"grad_norm": 0.536791680824106,
"learning_rate": 9.615092337576987e-06,
"loss": 2.203125,
"memory(GiB)": 22.58,
"step": 32,
"train_speed(iter/s)": 0.034758
},
{
"epoch": 0.17777777777777778,
"grad_norm": 1.3726808261834198,
"learning_rate": 9.579814966940833e-06,
"loss": 2.114501953125,
"memory(GiB)": 22.58,
"step": 33,
"train_speed(iter/s)": 0.034839
},
{
"epoch": 0.18316498316498317,
"grad_norm": 0.8535138723050261,
"learning_rate": 9.543061686164374e-06,
"loss": 2.1591796875,
"memory(GiB)": 22.58,
"step": 34,
"train_speed(iter/s)": 0.034969
},
{
"epoch": 0.18855218855218855,
"grad_norm": 0.6726334477065563,
"learning_rate": 9.504844339512096e-06,
"loss": 2.35791015625,
"memory(GiB)": 22.58,
"step": 35,
"train_speed(iter/s)": 0.035076
},
{
"epoch": 0.19393939393939394,
"grad_norm": 0.7227226956981251,
"learning_rate": 9.465175243064428e-06,
"loss": 2.400390625,
"memory(GiB)": 22.58,
"step": 36,
"train_speed(iter/s)": 0.035195
},
{
"epoch": 0.19932659932659932,
"grad_norm": 0.7075241914063357,
"learning_rate": 9.424067180748692e-06,
"loss": 1.476318359375,
"memory(GiB)": 22.58,
"step": 37,
"train_speed(iter/s)": 0.035278
},
{
"epoch": 0.2047138047138047,
"grad_norm": 0.8285808812880359,
"learning_rate": 9.381533400219319e-06,
"loss": 2.50634765625,
"memory(GiB)": 22.58,
"step": 38,
"train_speed(iter/s)": 0.035354
},
{
"epoch": 0.2101010101010101,
"grad_norm": 0.747109858212397,
"learning_rate": 9.337587608588588e-06,
"loss": 2.397216796875,
"memory(GiB)": 22.58,
"step": 39,
"train_speed(iter/s)": 0.035434
},
{
"epoch": 0.21548821548821548,
"grad_norm": 0.8997236382866319,
"learning_rate": 9.292243968009332e-06,
"loss": 2.3466796875,
"memory(GiB)": 22.58,
"step": 40,
"train_speed(iter/s)": 0.035447
},
{
"epoch": 0.22087542087542086,
"grad_norm": 0.3854506877674985,
"learning_rate": 9.24551709111097e-06,
"loss": 1.607421875,
"memory(GiB)": 22.58,
"step": 41,
"train_speed(iter/s)": 0.035398
},
{
"epoch": 0.22626262626262628,
"grad_norm": 0.4259732475000951,
"learning_rate": 9.197422036290386e-06,
"loss": 1.921630859375,
"memory(GiB)": 22.58,
"step": 42,
"train_speed(iter/s)": 0.035349
},
{
"epoch": 0.23164983164983166,
"grad_norm": 0.46150408574103824,
"learning_rate": 9.147974302859158e-06,
"loss": 1.41650390625,
"memory(GiB)": 22.58,
"step": 43,
"train_speed(iter/s)": 0.035321
},
{
"epoch": 0.23703703703703705,
"grad_norm": 0.5918291232050616,
"learning_rate": 9.09718982604866e-06,
"loss": 1.58154296875,
"memory(GiB)": 22.58,
"step": 44,
"train_speed(iter/s)": 0.03529
},
{
"epoch": 0.24242424242424243,
"grad_norm": 1.1984794966626473,
"learning_rate": 9.045084971874738e-06,
"loss": 2.67236328125,
"memory(GiB)": 22.58,
"step": 45,
"train_speed(iter/s)": 0.035244
},
{
"epoch": 0.24781144781144782,
"grad_norm": 0.7304425352094286,
"learning_rate": 8.991676531863507e-06,
"loss": 1.993408203125,
"memory(GiB)": 22.58,
"step": 46,
"train_speed(iter/s)": 0.0352
},
{
"epoch": 0.2531986531986532,
"grad_norm": 0.8247667804924503,
"learning_rate": 8.936981717640061e-06,
"loss": 2.8740234375,
"memory(GiB)": 22.58,
"step": 47,
"train_speed(iter/s)": 0.035111
},
{
"epoch": 0.2585858585858586,
"grad_norm": 1.072788633508109,
"learning_rate": 8.881018155381766e-06,
"loss": 1.845458984375,
"memory(GiB)": 22.58,
"step": 48,
"train_speed(iter/s)": 0.035139
},
{
"epoch": 0.26397306397306397,
"grad_norm": 0.6949566674892941,
"learning_rate": 8.823803880137993e-06,
"loss": 2.345458984375,
"memory(GiB)": 22.58,
"step": 49,
"train_speed(iter/s)": 0.035224
},
{
"epoch": 0.26936026936026936,
"grad_norm": 0.3214051528089464,
"learning_rate": 8.765357330018056e-06,
"loss": 1.640869140625,
"memory(GiB)": 22.58,
"step": 50,
"train_speed(iter/s)": 0.035311
},
{
"epoch": 0.27474747474747474,
"grad_norm": 0.8127331172569063,
"learning_rate": 8.705697340249275e-06,
"loss": 2.334716796875,
"memory(GiB)": 22.58,
"step": 51,
"train_speed(iter/s)": 0.035368
},
{
"epoch": 0.2801346801346801,
"grad_norm": 0.6993353179443554,
"learning_rate": 8.644843137107058e-06,
"loss": 2.2666015625,
"memory(GiB)": 22.58,
"step": 52,
"train_speed(iter/s)": 0.03541
},
{
"epoch": 0.2855218855218855,
"grad_norm": 0.7930646229400613,
"learning_rate": 8.582814331718961e-06,
"loss": 1.73876953125,
"memory(GiB)": 22.58,
"step": 53,
"train_speed(iter/s)": 0.035443
},
{
"epoch": 0.2909090909090909,
"grad_norm": 0.47348696234661886,
"learning_rate": 8.519630913744726e-06,
"loss": 1.8544921875,
"memory(GiB)": 22.58,
"step": 54,
"train_speed(iter/s)": 0.035485
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.5105789152298116,
"learning_rate": 8.455313244934324e-06,
"loss": 2.10107421875,
"memory(GiB)": 22.58,
"step": 55,
"train_speed(iter/s)": 0.03552
},
{
"epoch": 0.30168350168350166,
"grad_norm": 0.48874730617457113,
"learning_rate": 8.389882052566106e-06,
"loss": 2.19189453125,
"memory(GiB)": 22.58,
"step": 56,
"train_speed(iter/s)": 0.035547
},
{
"epoch": 0.30707070707070705,
"grad_norm": 0.7017590448005361,
"learning_rate": 8.32335842276713e-06,
"loss": 1.605224609375,
"memory(GiB)": 22.58,
"step": 57,
"train_speed(iter/s)": 0.035484
},
{
"epoch": 0.31245791245791243,
"grad_norm": 0.7736924894631574,
"learning_rate": 8.255763793717868e-06,
"loss": 2.123779296875,
"memory(GiB)": 22.58,
"step": 58,
"train_speed(iter/s)": 0.035432
},
{
"epoch": 0.3178451178451178,
"grad_norm": 0.6091631207035194,
"learning_rate": 8.18711994874345e-06,
"loss": 1.8798828125,
"memory(GiB)": 22.58,
"step": 59,
"train_speed(iter/s)": 0.035351
},
{
"epoch": 0.32323232323232326,
"grad_norm": 0.6745360872937951,
"learning_rate": 8.117449009293668e-06,
"loss": 2.36767578125,
"memory(GiB)": 22.58,
"step": 60,
"train_speed(iter/s)": 0.035291
},
{
"epoch": 0.32861952861952864,
"grad_norm": 1.1170607516843722,
"learning_rate": 8.046773427814043e-06,
"loss": 2.153076171875,
"memory(GiB)": 22.58,
"step": 61,
"train_speed(iter/s)": 0.035255
},
{
"epoch": 0.33400673400673403,
"grad_norm": 0.42517306211931166,
"learning_rate": 7.975115980510187e-06,
"loss": 1.717041015625,
"memory(GiB)": 22.58,
"step": 62,
"train_speed(iter/s)": 0.035224
},
{
"epoch": 0.3393939393939394,
"grad_norm": 0.8043024113222557,
"learning_rate": 7.902499760007867e-06,
"loss": 1.85888671875,
"memory(GiB)": 22.58,
"step": 63,
"train_speed(iter/s)": 0.035142
},
{
"epoch": 0.3447811447811448,
"grad_norm": 0.9761638945939747,
"learning_rate": 7.828948167911073e-06,
"loss": 1.906005859375,
"memory(GiB)": 22.58,
"step": 64,
"train_speed(iter/s)": 0.035063
},
{
"epoch": 0.3501683501683502,
"grad_norm": 0.4137734068293326,
"learning_rate": 7.754484907260513e-06,
"loss": 2.05712890625,
"memory(GiB)": 22.58,
"step": 65,
"train_speed(iter/s)": 0.034992
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.6313489954771672,
"learning_rate": 7.679133974894984e-06,
"loss": 1.56591796875,
"memory(GiB)": 22.58,
"step": 66,
"train_speed(iter/s)": 0.035062
},
{
"epoch": 0.36094276094276095,
"grad_norm": 0.7916770866661113,
"learning_rate": 7.602919653718044e-06,
"loss": 1.32373046875,
"memory(GiB)": 22.58,
"step": 67,
"train_speed(iter/s)": 0.035123
},
{
"epoch": 0.36632996632996634,
"grad_norm": 0.7005145101509135,
"learning_rate": 7.5258665048725065e-06,
"loss": 1.677490234375,
"memory(GiB)": 22.58,
"step": 68,
"train_speed(iter/s)": 0.035192
},
{
"epoch": 0.3717171717171717,
"grad_norm": 0.5600472715983401,
"learning_rate": 7.447999359825263e-06,
"loss": 1.8934326171875,
"memory(GiB)": 22.58,
"step": 69,
"train_speed(iter/s)": 0.035242
},
{
"epoch": 0.3771043771043771,
"grad_norm": 0.7799156688047453,
"learning_rate": 7.369343312364994e-06,
"loss": 1.737060546875,
"memory(GiB)": 22.58,
"step": 70,
"train_speed(iter/s)": 0.035303
},
{
"epoch": 0.3824915824915825,
"grad_norm": 1.0088361337375438,
"learning_rate": 7.289923710515338e-06,
"loss": 2.55859375,
"memory(GiB)": 22.58,
"step": 71,
"train_speed(iter/s)": 0.035339
},
{
"epoch": 0.3878787878787879,
"grad_norm": 0.7778606766770365,
"learning_rate": 7.2097661483661355e-06,
"loss": 2.3927001953125,
"memory(GiB)": 22.58,
"step": 72,
"train_speed(iter/s)": 0.035398
},
{
"epoch": 0.39326599326599326,
"grad_norm": 0.7503526567701239,
"learning_rate": 7.128896457825364e-06,
"loss": 2.4095458984375,
"memory(GiB)": 22.58,
"step": 73,
"train_speed(iter/s)": 0.035435
},
{
"epoch": 0.39865319865319865,
"grad_norm": 0.9293852718192778,
"learning_rate": 7.047340700294454e-06,
"loss": 2.0943603515625,
"memory(GiB)": 22.58,
"step": 74,
"train_speed(iter/s)": 0.035473
},
{
"epoch": 0.40404040404040403,
"grad_norm": 1.2981158494810365,
"learning_rate": 6.965125158269619e-06,
"loss": 2.36279296875,
"memory(GiB)": 22.58,
"step": 75,
"train_speed(iter/s)": 0.035498
},
{
"epoch": 0.4094276094276094,
"grad_norm": 0.5915357318010657,
"learning_rate": 6.88227632687196e-06,
"loss": 1.13037109375,
"memory(GiB)": 22.58,
"step": 76,
"train_speed(iter/s)": 0.035521
},
{
"epoch": 0.4148148148148148,
"grad_norm": 0.8289109263502568,
"learning_rate": 6.798820905309036e-06,
"loss": 2.245849609375,
"memory(GiB)": 22.58,
"step": 77,
"train_speed(iter/s)": 0.035549
},
{
"epoch": 0.4202020202020202,
"grad_norm": 0.7332772758108902,
"learning_rate": 6.714785788270658e-06,
"loss": 1.794189453125,
"memory(GiB)": 22.58,
"step": 78,
"train_speed(iter/s)": 0.035574
},
{
"epoch": 0.4255892255892256,
"grad_norm": 0.8695389561000924,
"learning_rate": 6.63019805726171e-06,
"loss": 2.107177734375,
"memory(GiB)": 22.58,
"step": 79,
"train_speed(iter/s)": 0.035564
},
{
"epoch": 0.43097643097643096,
"grad_norm": 1.0578963540355828,
"learning_rate": 6.545084971874738e-06,
"loss": 2.2099609375,
"memory(GiB)": 22.58,
"step": 80,
"train_speed(iter/s)": 0.035518
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.5355473518839581,
"learning_rate": 6.459473961005168e-06,
"loss": 1.679931640625,
"memory(GiB)": 22.58,
"step": 81,
"train_speed(iter/s)": 0.035449
},
{
"epoch": 0.4417508417508417,
"grad_norm": 0.47562295475695077,
"learning_rate": 6.373392614011952e-06,
"loss": 1.548828125,
"memory(GiB)": 22.58,
"step": 82,
"train_speed(iter/s)": 0.03541
},
{
"epoch": 0.4471380471380471,
"grad_norm": 1.1873250939202482,
"learning_rate": 6.286868671826513e-06,
"loss": 2.3310546875,
"memory(GiB)": 22.58,
"step": 83,
"train_speed(iter/s)": 0.035383
},
{
"epoch": 0.45252525252525255,
"grad_norm": 0.6325848523967413,
"learning_rate": 6.19993001801283e-06,
"loss": 1.63232421875,
"memory(GiB)": 22.58,
"step": 84,
"train_speed(iter/s)": 0.035357
},
{
"epoch": 0.45791245791245794,
"grad_norm": 0.6180246232374331,
"learning_rate": 6.112604669781572e-06,
"loss": 2.5283203125,
"memory(GiB)": 22.58,
"step": 85,
"train_speed(iter/s)": 0.035328
},
{
"epoch": 0.4632996632996633,
"grad_norm": 0.9254342636136799,
"learning_rate": 6.024920768961153e-06,
"loss": 2.09814453125,
"memory(GiB)": 22.58,
"step": 86,
"train_speed(iter/s)": 0.03531
},
{
"epoch": 0.4686868686868687,
"grad_norm": 1.0220943585915119,
"learning_rate": 5.936906572928625e-06,
"loss": 1.8603515625,
"memory(GiB)": 22.58,
"step": 87,
"train_speed(iter/s)": 0.035243
},
{
"epoch": 0.4740740740740741,
"grad_norm": 0.547874150160307,
"learning_rate": 5.848590445503345e-06,
"loss": 2.2890625,
"memory(GiB)": 22.58,
"step": 88,
"train_speed(iter/s)": 0.03516
},
{
"epoch": 0.4794612794612795,
"grad_norm": 0.7203446700675221,
"learning_rate": 5.760000847806337e-06,
"loss": 1.68115234375,
"memory(GiB)": 22.58,
"step": 89,
"train_speed(iter/s)": 0.035117
},
{
"epoch": 0.48484848484848486,
"grad_norm": 0.7628245708662847,
"learning_rate": 5.671166329088278e-06,
"loss": 2.126953125,
"memory(GiB)": 22.58,
"step": 90,
"train_speed(iter/s)": 0.035147
},
{
"epoch": 0.49023569023569025,
"grad_norm": 0.8089999734614459,
"learning_rate": 5.582115517529114e-06,
"loss": 1.948486328125,
"memory(GiB)": 22.58,
"step": 91,
"train_speed(iter/s)": 0.035179
},
{
"epoch": 0.49562289562289563,
"grad_norm": 0.5039876551970663,
"learning_rate": 5.4928771110122185e-06,
"loss": 1.849853515625,
"memory(GiB)": 22.58,
"step": 92,
"train_speed(iter/s)": 0.035212
},
{
"epoch": 0.501010101010101,
"grad_norm": 0.9008917409254343,
"learning_rate": 5.403479867876087e-06,
"loss": 2.642578125,
"memory(GiB)": 22.58,
"step": 93,
"train_speed(iter/s)": 0.035235
},
{
"epoch": 0.5063973063973064,
"grad_norm": 1.1384096826151604,
"learning_rate": 5.3139525976465675e-06,
"loss": 2.49365234375,
"memory(GiB)": 22.58,
"step": 94,
"train_speed(iter/s)": 0.035265
},
{
"epoch": 0.5117845117845118,
"grad_norm": 0.7491826485818727,
"learning_rate": 5.224324151752575e-06,
"loss": 1.88037109375,
"memory(GiB)": 22.58,
"step": 95,
"train_speed(iter/s)": 0.035291
},
{
"epoch": 0.5171717171717172,
"grad_norm": 0.6169314437426718,
"learning_rate": 5.134623414228315e-06,
"loss": 1.485595703125,
"memory(GiB)": 22.58,
"step": 96,
"train_speed(iter/s)": 0.035322
},
{
"epoch": 0.5225589225589226,
"grad_norm": 0.7458411085328407,
"learning_rate": 5.04487929240499e-06,
"loss": 2.030517578125,
"memory(GiB)": 22.58,
"step": 97,
"train_speed(iter/s)": 0.035358
},
{
"epoch": 0.5279461279461279,
"grad_norm": 0.36969067992245414,
"learning_rate": 4.955120707595011e-06,
"loss": 1.82421875,
"memory(GiB)": 22.58,
"step": 98,
"train_speed(iter/s)": 0.03539
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.7184526746731991,
"learning_rate": 4.865376585771687e-06,
"loss": 2.1650390625,
"memory(GiB)": 22.58,
"step": 99,
"train_speed(iter/s)": 0.035417
},
{
"epoch": 0.5387205387205387,
"grad_norm": 0.5860047275017632,
"learning_rate": 4.775675848247427e-06,
"loss": 2.016845703125,
"memory(GiB)": 22.58,
"step": 100,
"train_speed(iter/s)": 0.035451
},
{
"epoch": 0.5441077441077441,
"grad_norm": 0.6740666234718802,
"learning_rate": 4.686047402353433e-06,
"loss": 1.481689453125,
"memory(GiB)": 22.58,
"step": 101,
"train_speed(iter/s)": 0.035484
},
{
"epoch": 0.5494949494949495,
"grad_norm": 0.5962985498733315,
"learning_rate": 4.596520132123915e-06,
"loss": 2.225341796875,
"memory(GiB)": 22.58,
"step": 102,
"train_speed(iter/s)": 0.035522
},
{
"epoch": 0.5548821548821549,
"grad_norm": 0.6185754487719404,
"learning_rate": 4.507122888987782e-06,
"loss": 2.630615234375,
"memory(GiB)": 22.58,
"step": 103,
"train_speed(iter/s)": 0.035566
},
{
"epoch": 0.5602693602693603,
"grad_norm": 0.8891703200104817,
"learning_rate": 4.417884482470887e-06,
"loss": 1.98291015625,
"memory(GiB)": 22.58,
"step": 104,
"train_speed(iter/s)": 0.03558
},
{
"epoch": 0.5656565656565656,
"grad_norm": 0.5620520767612842,
"learning_rate": 4.3288336709117246e-06,
"loss": 1.933349609375,
"memory(GiB)": 22.58,
"step": 105,
"train_speed(iter/s)": 0.035549
},
{
"epoch": 0.571043771043771,
"grad_norm": 1.3690550098042635,
"learning_rate": 4.239999152193664e-06,
"loss": 2.217529296875,
"memory(GiB)": 22.58,
"step": 106,
"train_speed(iter/s)": 0.035527
},
{
"epoch": 0.5764309764309764,
"grad_norm": 0.4160377433886458,
"learning_rate": 4.1514095544966556e-06,
"loss": 1.737060546875,
"memory(GiB)": 22.58,
"step": 107,
"train_speed(iter/s)": 0.03551
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.8209806760015574,
"learning_rate": 4.063093427071376e-06,
"loss": 2.782470703125,
"memory(GiB)": 22.58,
"step": 108,
"train_speed(iter/s)": 0.035486
},
{
"epoch": 0.5872053872053872,
"grad_norm": 0.726795857048424,
"learning_rate": 3.975079231038848e-06,
"loss": 2.009521484375,
"memory(GiB)": 22.58,
"step": 109,
"train_speed(iter/s)": 0.035449
},
{
"epoch": 0.5925925925925926,
"grad_norm": 1.2624010183388914,
"learning_rate": 3.887395330218429e-06,
"loss": 2.59814453125,
"memory(GiB)": 22.58,
"step": 110,
"train_speed(iter/s)": 0.035431
},
{
"epoch": 0.597979797979798,
"grad_norm": 0.7513165711048129,
"learning_rate": 3.8000699819871704e-06,
"loss": 1.6396484375,
"memory(GiB)": 22.58,
"step": 111,
"train_speed(iter/s)": 0.035402
},
{
"epoch": 0.6033670033670033,
"grad_norm": 0.4587115862936887,
"learning_rate": 3.7131313281734895e-06,
"loss": 2.044189453125,
"memory(GiB)": 22.58,
"step": 112,
"train_speed(iter/s)": 0.035373
},
{
"epoch": 0.6087542087542087,
"grad_norm": 0.41256540620865373,
"learning_rate": 3.62660738598805e-06,
"loss": 1.9287109375,
"memory(GiB)": 22.58,
"step": 113,
"train_speed(iter/s)": 0.03534
},
{
"epoch": 0.6141414141414141,
"grad_norm": 0.4286929355926436,
"learning_rate": 3.540526038994834e-06,
"loss": 1.646728515625,
"memory(GiB)": 22.58,
"step": 114,
"train_speed(iter/s)": 0.035359
},
{
"epoch": 0.6195286195286195,
"grad_norm": 0.8246295061207459,
"learning_rate": 3.4549150281252635e-06,
"loss": 1.7587890625,
"memory(GiB)": 22.58,
"step": 115,
"train_speed(iter/s)": 0.035387
},
{
"epoch": 0.6249158249158249,
"grad_norm": 0.653674454928138,
"learning_rate": 3.3698019427382912e-06,
"loss": 1.9765625,
"memory(GiB)": 22.58,
"step": 116,
"train_speed(iter/s)": 0.035417
},
{
"epoch": 0.6303030303030303,
"grad_norm": 0.6402748838297282,
"learning_rate": 3.2852142117293435e-06,
"loss": 1.94970703125,
"memory(GiB)": 22.58,
"step": 117,
"train_speed(iter/s)": 0.035431
},
{
"epoch": 0.6356902356902356,
"grad_norm": 0.5582058394376362,
"learning_rate": 3.2011790946909673e-06,
"loss": 1.9755859375,
"memory(GiB)": 22.58,
"step": 118,
"train_speed(iter/s)": 0.03546
},
{
"epoch": 0.641077441077441,
"grad_norm": 0.8447371297083311,
"learning_rate": 3.11772367312804e-06,
"loss": 1.784423828125,
"memory(GiB)": 22.58,
"step": 119,
"train_speed(iter/s)": 0.035493
},
{
"epoch": 0.6464646464646465,
"grad_norm": 0.7640836687261319,
"learning_rate": 3.0348748417303826e-06,
"loss": 1.76171875,
"memory(GiB)": 22.58,
"step": 120,
"train_speed(iter/s)": 0.035513
},
{
"epoch": 0.6518518518518519,
"grad_norm": 0.6689239585125656,
"learning_rate": 2.9526592997055488e-06,
"loss": 2.076904296875,
"memory(GiB)": 22.58,
"step": 121,
"train_speed(iter/s)": 0.03554
},
{
"epoch": 0.6572390572390573,
"grad_norm": 0.8205443169011045,
"learning_rate": 2.871103542174637e-06,
"loss": 2.4423828125,
"memory(GiB)": 22.58,
"step": 122,
"train_speed(iter/s)": 0.035564
},
{
"epoch": 0.6626262626262627,
"grad_norm": 0.3861380215034983,
"learning_rate": 2.790233851633868e-06,
"loss": 1.405517578125,
"memory(GiB)": 22.58,
"step": 123,
"train_speed(iter/s)": 0.035589
},
{
"epoch": 0.6680134680134681,
"grad_norm": 0.9319720706049784,
"learning_rate": 2.7100762894846633e-06,
"loss": 1.884033203125,
"memory(GiB)": 22.58,
"step": 124,
"train_speed(iter/s)": 0.035611
},
{
"epoch": 0.6734006734006734,
"grad_norm": 0.4894495365923113,
"learning_rate": 2.6306566876350072e-06,
"loss": 1.992431640625,
"memory(GiB)": 22.58,
"step": 125,
"train_speed(iter/s)": 0.035618
},
{
"epoch": 0.6787878787878788,
"grad_norm": 0.5156966779296556,
"learning_rate": 2.55200064017474e-06,
"loss": 1.7987060546875,
"memory(GiB)": 22.58,
"step": 126,
"train_speed(iter/s)": 0.035593
},
{
"epoch": 0.6841750841750842,
"grad_norm": 0.39627149470201456,
"learning_rate": 2.4741334951274948e-06,
"loss": 1.779541015625,
"memory(GiB)": 22.58,
"step": 127,
"train_speed(iter/s)": 0.035563
},
{
"epoch": 0.6895622895622896,
"grad_norm": 0.7990132228587018,
"learning_rate": 2.3970803462819586e-06,
"loss": 2.385498046875,
"memory(GiB)": 22.58,
"step": 128,
"train_speed(iter/s)": 0.035533
},
{
"epoch": 0.694949494949495,
"grad_norm": 0.542336867995926,
"learning_rate": 2.320866025105016e-06,
"loss": 1.775390625,
"memory(GiB)": 22.58,
"step": 129,
"train_speed(iter/s)": 0.035487
},
{
"epoch": 0.7003367003367004,
"grad_norm": 0.40553603638944413,
"learning_rate": 2.245515092739488e-06,
"loss": 1.65771484375,
"memory(GiB)": 22.58,
"step": 130,
"train_speed(iter/s)": 0.035457
},
{
"epoch": 0.7057239057239058,
"grad_norm": 0.5705311307759141,
"learning_rate": 2.171051832088928e-06,
"loss": 1.392578125,
"memory(GiB)": 22.58,
"step": 131,
"train_speed(iter/s)": 0.035439
},
{
"epoch": 0.7111111111111111,
"grad_norm": 0.5637194621292295,
"learning_rate": 2.097500239992132e-06,
"loss": 1.808349609375,
"memory(GiB)": 22.58,
"step": 132,
"train_speed(iter/s)": 0.035412
},
{
"epoch": 0.7164983164983165,
"grad_norm": 1.0166298249729564,
"learning_rate": 2.0248840194898155e-06,
"loss": 1.88232421875,
"memory(GiB)": 22.58,
"step": 133,
"train_speed(iter/s)": 0.035367
},
{
"epoch": 0.7218855218855219,
"grad_norm": 0.365517442677317,
"learning_rate": 1.95322657218596e-06,
"loss": 1.8359375,
"memory(GiB)": 22.58,
"step": 134,
"train_speed(iter/s)": 0.035304
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.5937921378630181,
"learning_rate": 1.8825509907063328e-06,
"loss": 2.16162109375,
"memory(GiB)": 22.58,
"step": 135,
"train_speed(iter/s)": 0.035244
},
{
"epoch": 0.7326599326599327,
"grad_norm": 0.5630691840328598,
"learning_rate": 1.8128800512565514e-06,
"loss": 1.953369140625,
"memory(GiB)": 22.58,
"step": 136,
"train_speed(iter/s)": 0.035272
},
{
"epoch": 0.7380471380471381,
"grad_norm": 0.9036946278139879,
"learning_rate": 1.7442362062821323e-06,
"loss": 3.1923828125,
"memory(GiB)": 22.58,
"step": 137,
"train_speed(iter/s)": 0.035287
},
{
"epoch": 0.7434343434343434,
"grad_norm": 0.5335511498935785,
"learning_rate": 1.6766415772328732e-06,
"loss": 1.705322265625,
"memory(GiB)": 22.58,
"step": 138,
"train_speed(iter/s)": 0.035295
},
{
"epoch": 0.7488215488215488,
"grad_norm": 0.8149099249815346,
"learning_rate": 1.610117947433897e-06,
"loss": 2.81689453125,
"memory(GiB)": 22.58,
"step": 139,
"train_speed(iter/s)": 0.035308
},
{
"epoch": 0.7542087542087542,
"grad_norm": 0.5287002241309334,
"learning_rate": 1.544686755065677e-06,
"loss": 1.266357421875,
"memory(GiB)": 22.58,
"step": 140,
"train_speed(iter/s)": 0.035318
},
{
"epoch": 0.7595959595959596,
"grad_norm": 0.6139302197140588,
"learning_rate": 1.4803690862552755e-06,
"loss": 1.817626953125,
"memory(GiB)": 22.58,
"step": 141,
"train_speed(iter/s)": 0.035343
},
{
"epoch": 0.764983164983165,
"grad_norm": 0.6333656991964685,
"learning_rate": 1.4171856682810386e-06,
"loss": 2.101806640625,
"memory(GiB)": 22.58,
"step": 142,
"train_speed(iter/s)": 0.035364
},
{
"epoch": 0.7703703703703704,
"grad_norm": 0.8829740683592863,
"learning_rate": 1.3551568628929434e-06,
"loss": 2.508056640625,
"memory(GiB)": 22.58,
"step": 143,
"train_speed(iter/s)": 0.03539
},
{
"epoch": 0.7757575757575758,
"grad_norm": 0.5801508492146695,
"learning_rate": 1.2943026597507268e-06,
"loss": 1.6142578125,
"memory(GiB)": 22.58,
"step": 144,
"train_speed(iter/s)": 0.035413
},
{
"epoch": 0.7811447811447811,
"grad_norm": 0.48056036748223746,
"learning_rate": 1.234642669981946e-06,
"loss": 1.942138671875,
"memory(GiB)": 22.58,
"step": 145,
"train_speed(iter/s)": 0.035431
},
{
"epoch": 0.7865319865319865,
"grad_norm": 0.5473637984491948,
"learning_rate": 1.1761961198620081e-06,
"loss": 1.748779296875,
"memory(GiB)": 22.58,
"step": 146,
"train_speed(iter/s)": 0.035455
},
{
"epoch": 0.7919191919191919,
"grad_norm": 0.7226102542834439,
"learning_rate": 1.118981844618236e-06,
"loss": 1.657470703125,
"memory(GiB)": 22.58,
"step": 147,
"train_speed(iter/s)": 0.035472
},
{
"epoch": 0.7973063973063973,
"grad_norm": 0.677002948688539,
"learning_rate": 1.06301828235994e-06,
"loss": 1.730224609375,
"memory(GiB)": 22.58,
"step": 148,
"train_speed(iter/s)": 0.035492
},
{
"epoch": 0.8026936026936027,
"grad_norm": 0.4690206204454014,
"learning_rate": 1.0083234681364934e-06,
"loss": 1.97509765625,
"memory(GiB)": 22.58,
"step": 149,
"train_speed(iter/s)": 0.035513
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.38431237166068455,
"learning_rate": 9.549150281252633e-07,
"loss": 1.977783203125,
"memory(GiB)": 22.58,
"step": 150,
"train_speed(iter/s)": 0.035533
},
{
"epoch": 0.8134680134680135,
"grad_norm": 1.4318443328161967,
"learning_rate": 9.028101739513406e-07,
"loss": 2.696533203125,
"memory(GiB)": 22.58,
"step": 151,
"train_speed(iter/s)": 0.035549
},
{
"epoch": 0.8188552188552188,
"grad_norm": 0.39825000243591335,
"learning_rate": 8.520256971408453e-07,
"loss": 1.52294921875,
"memory(GiB)": 22.58,
"step": 152,
"train_speed(iter/s)": 0.035566
},
{
"epoch": 0.8242424242424242,
"grad_norm": 0.403223921534723,
"learning_rate": 8.025779637096138e-07,
"loss": 2.0869140625,
"memory(GiB)": 22.58,
"step": 153,
"train_speed(iter/s)": 0.035581
},
{
"epoch": 0.8296296296296296,
"grad_norm": 0.39408518518211616,
"learning_rate": 7.544829088890326e-07,
"loss": 2.085693359375,
"memory(GiB)": 22.58,
"step": 154,
"train_speed(iter/s)": 0.035601
},
{
"epoch": 0.835016835016835,
"grad_norm": 0.6580639598152973,
"learning_rate": 7.077560319906696e-07,
"loss": 1.58740234375,
"memory(GiB)": 22.58,
"step": 155,
"train_speed(iter/s)": 0.035601
},
{
"epoch": 0.8404040404040404,
"grad_norm": 0.5455643216936216,
"learning_rate": 6.624123914114122e-07,
"loss": 1.76953125,
"memory(GiB)": 22.58,
"step": 156,
"train_speed(iter/s)": 0.035584
},
{
"epoch": 0.8457912457912458,
"grad_norm": 0.9580661740362665,
"learning_rate": 6.184665997806832e-07,
"loss": 2.3505859375,
"memory(GiB)": 22.58,
"step": 157,
"train_speed(iter/s)": 0.035562
},
{
"epoch": 0.8511784511784511,
"grad_norm": 0.49273093322057226,
"learning_rate": 5.759328192513075e-07,
"loss": 1.632080078125,
"memory(GiB)": 22.58,
"step": 158,
"train_speed(iter/s)": 0.035543
},
{
"epoch": 0.8565656565656565,
"grad_norm": 0.5074137587596991,
"learning_rate": 5.348247569355736e-07,
"loss": 1.71240234375,
"memory(GiB)": 22.58,
"step": 159,
"train_speed(iter/s)": 0.03552
},
{
"epoch": 0.8619528619528619,
"grad_norm": 0.7185716029221749,
"learning_rate": 4.951556604879049e-07,
"loss": 2.36669921875,
"memory(GiB)": 22.58,
"step": 160,
"train_speed(iter/s)": 0.035503
},
{
"epoch": 0.8673400673400673,
"grad_norm": 0.7811542866299452,
"learning_rate": 4.569383138356276e-07,
"loss": 1.678955078125,
"memory(GiB)": 22.58,
"step": 161,
"train_speed(iter/s)": 0.035485
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.5128778192942757,
"learning_rate": 4.201850330591678e-07,
"loss": 2.072998046875,
"memory(GiB)": 22.58,
"step": 162,
"train_speed(iter/s)": 0.035459
},
{
"epoch": 0.8781144781144781,
"grad_norm": 0.6851552480944826,
"learning_rate": 3.8490766242301356e-07,
"loss": 1.55322265625,
"memory(GiB)": 22.58,
"step": 163,
"train_speed(iter/s)": 0.035422
},
{
"epoch": 0.8835016835016835,
"grad_norm": 1.0656634793505568,
"learning_rate": 3.511175705587433e-07,
"loss": 2.09228515625,
"memory(GiB)": 22.58,
"step": 164,
"train_speed(iter/s)": 0.035412
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.4704867924767551,
"learning_rate": 3.18825646801314e-07,
"loss": 2.178955078125,
"memory(GiB)": 22.58,
"step": 165,
"train_speed(iter/s)": 0.03542
},
{
"epoch": 0.8942760942760942,
"grad_norm": 0.3438531193817133,
"learning_rate": 2.8804229767982637e-07,
"loss": 1.828125,
"memory(GiB)": 22.58,
"step": 166,
"train_speed(iter/s)": 0.035441
},
{
"epoch": 0.8996632996632996,
"grad_norm": 0.9072486327466182,
"learning_rate": 2.587774435638679e-07,
"loss": 1.902099609375,
"memory(GiB)": 22.58,
"step": 167,
"train_speed(iter/s)": 0.035458
},
{
"epoch": 0.9050505050505051,
"grad_norm": 0.40209833194248146,
"learning_rate": 2.3104051546654016e-07,
"loss": 1.72314453125,
"memory(GiB)": 22.58,
"step": 168,
"train_speed(iter/s)": 0.035472
},
{
"epoch": 0.9104377104377105,
"grad_norm": 0.6534758670706157,
"learning_rate": 2.0484045200517222e-07,
"loss": 1.73095703125,
"memory(GiB)": 22.58,
"step": 169,
"train_speed(iter/s)": 0.03548
},
{
"epoch": 0.9158249158249159,
"grad_norm": 0.36229213242531244,
"learning_rate": 1.801856965207338e-07,
"loss": 1.954345703125,
"memory(GiB)": 22.58,
"step": 170,
"train_speed(iter/s)": 0.035499
},
{
"epoch": 0.9212121212121213,
"grad_norm": 0.41462023840060064,
"learning_rate": 1.5708419435684463e-07,
"loss": 1.726318359375,
"memory(GiB)": 22.58,
"step": 171,
"train_speed(iter/s)": 0.035512
},
{
"epoch": 0.9265993265993266,
"grad_norm": 0.793282464162017,
"learning_rate": 1.3554339029927532e-07,
"loss": 2.07861328125,
"memory(GiB)": 22.58,
"step": 172,
"train_speed(iter/s)": 0.035526
},
{
"epoch": 0.931986531986532,
"grad_norm": 0.4924403822397691,
"learning_rate": 1.1557022617676217e-07,
"loss": 1.400634765625,
"memory(GiB)": 22.58,
"step": 173,
"train_speed(iter/s)": 0.035541
},
{
"epoch": 0.9373737373737374,
"grad_norm": 0.41980069690346106,
"learning_rate": 9.717113862389993e-08,
"loss": 2.12158203125,
"memory(GiB)": 22.58,
"step": 174,
"train_speed(iter/s)": 0.03556
},
{
"epoch": 0.9427609427609428,
"grad_norm": 0.8809220146060189,
"learning_rate": 8.035205700685167e-08,
"loss": 2.621826171875,
"memory(GiB)": 22.58,
"step": 175,
"train_speed(iter/s)": 0.035577
},
{
"epoch": 0.9481481481481482,
"grad_norm": 0.6908254679787823,
"learning_rate": 6.511840151252169e-08,
"loss": 1.813232421875,
"memory(GiB)": 22.58,
"step": 176,
"train_speed(iter/s)": 0.035597
},
{
"epoch": 0.9535353535353536,
"grad_norm": 0.49484208186969647,
"learning_rate": 5.1475081401825553e-08,
"loss": 1.9814453125,
"memory(GiB)": 22.58,
"step": 177,
"train_speed(iter/s)": 0.035578
},
{
"epoch": 0.958922558922559,
"grad_norm": 0.6989450753180266,
"learning_rate": 3.9426493427611177e-08,
"loss": 1.78466796875,
"memory(GiB)": 22.58,
"step": 178,
"train_speed(iter/s)": 0.035563
},
{
"epoch": 0.9643097643097643,
"grad_norm": 0.5543481036485521,
"learning_rate": 2.8976520417742794e-08,
"loss": 1.727783203125,
"memory(GiB)": 22.58,
"step": 179,
"train_speed(iter/s)": 0.035552
},
{
"epoch": 0.9696969696969697,
"grad_norm": 0.5545843045026326,
"learning_rate": 2.012853002380466e-08,
"loss": 1.75634765625,
"memory(GiB)": 22.58,
"step": 180,
"train_speed(iter/s)": 0.035543
},
{
"epoch": 0.9750841750841751,
"grad_norm": 0.3433152184276571,
"learning_rate": 1.2885373635829756e-08,
"loss": 1.64208984375,
"memory(GiB)": 22.58,
"step": 181,
"train_speed(iter/s)": 0.035533
},
{
"epoch": 0.9804713804713805,
"grad_norm": 0.9002229182397717,
"learning_rate": 7.249385463395375e-09,
"loss": 2.177490234375,
"memory(GiB)": 22.58,
"step": 182,
"train_speed(iter/s)": 0.035517
},
{
"epoch": 0.9858585858585859,
"grad_norm": 0.5840020558119475,
"learning_rate": 3.2223817833931803e-09,
"loss": 1.4775390625,
"memory(GiB)": 22.58,
"step": 183,
"train_speed(iter/s)": 0.035499
},
{
"epoch": 0.9912457912457913,
"grad_norm": 0.31651969118225726,
"learning_rate": 8.056603547090813e-10,
"loss": 1.804931640625,
"memory(GiB)": 22.58,
"step": 184,
"train_speed(iter/s)": 0.035513
},
{
"epoch": 0.9966329966329966,
"grad_norm": 0.5699524292753597,
"learning_rate": 0.0,
"loss": 1.653076171875,
"memory(GiB)": 22.58,
"step": 185,
"train_speed(iter/s)": 0.035529
},
{
"epoch": 0.9966329966329966,
"eval_loss": 0.12199707329273224,
"eval_runtime": 16.4404,
"eval_samples_per_second": 1.825,
"eval_steps_per_second": 1.825,
"step": 185
},
{
"epoch": 0.9966329966329966,
"eval_loss": 0.12199707329273224,
"eval_runtime": 18.3596,
"eval_samples_per_second": 1.634,
"eval_steps_per_second": 1.634,
"step": 185
}
],
"logging_steps": 1,
"max_steps": 185,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 664501364736.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
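
For reference, a minimal sketch of how a file like this can be inspected offline. It assumes a local copy named trainer_state.json (for example, downloaded from the checkpoint folder referenced above); only keys that actually appear in the file ("log_history", "step", "loss", "best_metric") are used.

import json

# Path is an assumption: point this at your local copy of the file.
with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step training entries carry a "loss" key; the trailing eval entries
# carry "eval_loss" instead, so filtering on "loss" keeps only training logs.
train_logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in train_logs]
losses = [entry["loss"] for entry in train_logs]

print(f"{len(train_logs)} logged training steps")
print(f"final training loss: {losses[-1]}")
print(f"best eval metric: {state['best_metric']}")

Under these assumptions the script prints the number of logged steps (185 here), the last logged training loss, and the best evaluation metric recorded in the header.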