qwen2.5-vl-7b-cam-motion-preview / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.975144705481783,
"eval_steps": 1000,
"global_step": 3670,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.027238678924072182,
"grad_norm": 94.36308714522585,
"learning_rate": 2.4523160762942784e-07,
"loss": 3.2941,
"step": 10
},
{
"epoch": 0.054477357848144364,
"grad_norm": 72.09113030659383,
"learning_rate": 5.177111716621253e-07,
"loss": 2.9481,
"step": 20
},
{
"epoch": 0.08171603677221655,
"grad_norm": 18.50935923305202,
"learning_rate": 7.90190735694823e-07,
"loss": 1.921,
"step": 30
},
{
"epoch": 0.10895471569628873,
"grad_norm": 8.59280831709028,
"learning_rate": 1.0626702997275206e-06,
"loss": 1.5096,
"step": 40
},
{
"epoch": 0.13619339462036092,
"grad_norm": 5.941916510585534,
"learning_rate": 1.335149863760218e-06,
"loss": 1.1101,
"step": 50
},
{
"epoch": 0.1634320735444331,
"grad_norm": 4.39962525080334,
"learning_rate": 1.6076294277929156e-06,
"loss": 0.9091,
"step": 60
},
{
"epoch": 0.1906707524685053,
"grad_norm": 4.489642566752767,
"learning_rate": 1.8801089918256133e-06,
"loss": 0.7886,
"step": 70
},
{
"epoch": 0.21790943139257746,
"grad_norm": 3.8997170693605914,
"learning_rate": 2.152588555858311e-06,
"loss": 0.7292,
"step": 80
},
{
"epoch": 0.24514811031664965,
"grad_norm": 4.2337949046127115,
"learning_rate": 2.4250681198910083e-06,
"loss": 0.6295,
"step": 90
},
{
"epoch": 0.27238678924072185,
"grad_norm": 3.887748571626415,
"learning_rate": 2.697547683923706e-06,
"loss": 0.5892,
"step": 100
},
{
"epoch": 0.299625468164794,
"grad_norm": 4.071563747798551,
"learning_rate": 2.9700272479564033e-06,
"loss": 0.4879,
"step": 110
},
{
"epoch": 0.3268641470888662,
"grad_norm": 4.396940499759756,
"learning_rate": 3.2425068119891012e-06,
"loss": 0.4542,
"step": 120
},
{
"epoch": 0.35410282601293835,
"grad_norm": 3.784810689454143,
"learning_rate": 3.5149863760217988e-06,
"loss": 0.3211,
"step": 130
},
{
"epoch": 0.3813415049370106,
"grad_norm": 3.7737741184600595,
"learning_rate": 3.7874659400544963e-06,
"loss": 0.2891,
"step": 140
},
{
"epoch": 0.40858018386108275,
"grad_norm": 3.361771664086218,
"learning_rate": 4.059945504087194e-06,
"loss": 0.1996,
"step": 150
},
{
"epoch": 0.4358188627851549,
"grad_norm": 3.2907408057054237,
"learning_rate": 4.332425068119892e-06,
"loss": 0.133,
"step": 160
},
{
"epoch": 0.4630575417092271,
"grad_norm": 3.040778638316543,
"learning_rate": 4.604904632152589e-06,
"loss": 0.1101,
"step": 170
},
{
"epoch": 0.4902962206332993,
"grad_norm": 3.5750949211792156,
"learning_rate": 4.877384196185287e-06,
"loss": 0.0936,
"step": 180
},
{
"epoch": 0.5175348995573714,
"grad_norm": 3.857830728644118,
"learning_rate": 5.149863760217984e-06,
"loss": 0.0757,
"step": 190
},
{
"epoch": 0.5447735784814437,
"grad_norm": 4.5053152903816915,
"learning_rate": 5.422343324250682e-06,
"loss": 0.0677,
"step": 200
},
{
"epoch": 0.5720122574055159,
"grad_norm": 2.0169258478044423,
"learning_rate": 5.694822888283379e-06,
"loss": 0.0607,
"step": 210
},
{
"epoch": 0.599250936329588,
"grad_norm": 2.7354337720763566,
"learning_rate": 5.9673024523160776e-06,
"loss": 0.057,
"step": 220
},
{
"epoch": 0.6264896152536602,
"grad_norm": 2.41187775653569,
"learning_rate": 6.239782016348774e-06,
"loss": 0.056,
"step": 230
},
{
"epoch": 0.6537282941777324,
"grad_norm": 2.7992598675350258,
"learning_rate": 6.512261580381472e-06,
"loss": 0.0515,
"step": 240
},
{
"epoch": 0.6809669731018045,
"grad_norm": 3.885137671280064,
"learning_rate": 6.78474114441417e-06,
"loss": 0.0517,
"step": 250
},
{
"epoch": 0.7082056520258767,
"grad_norm": 2.47654168655763,
"learning_rate": 7.057220708446867e-06,
"loss": 0.0462,
"step": 260
},
{
"epoch": 0.7354443309499489,
"grad_norm": 2.5720242451467454,
"learning_rate": 7.329700272479565e-06,
"loss": 0.0454,
"step": 270
},
{
"epoch": 0.7626830098740212,
"grad_norm": 4.695199938458052,
"learning_rate": 7.602179836512263e-06,
"loss": 0.0464,
"step": 280
},
{
"epoch": 0.7899216887980933,
"grad_norm": 2.080104008186552,
"learning_rate": 7.87465940054496e-06,
"loss": 0.0416,
"step": 290
},
{
"epoch": 0.8171603677221655,
"grad_norm": 2.1981255385062517,
"learning_rate": 8.147138964577658e-06,
"loss": 0.0446,
"step": 300
},
{
"epoch": 0.8443990466462377,
"grad_norm": 1.7239470049599528,
"learning_rate": 8.419618528610354e-06,
"loss": 0.0452,
"step": 310
},
{
"epoch": 0.8716377255703098,
"grad_norm": 1.8811480620279255,
"learning_rate": 8.692098092643052e-06,
"loss": 0.033,
"step": 320
},
{
"epoch": 0.898876404494382,
"grad_norm": 2.4779574516049556,
"learning_rate": 8.964577656675751e-06,
"loss": 0.0438,
"step": 330
},
{
"epoch": 0.9261150834184542,
"grad_norm": 2.42289553821888,
"learning_rate": 9.237057220708447e-06,
"loss": 0.0392,
"step": 340
},
{
"epoch": 0.9533537623425263,
"grad_norm": 2.3174600464823256,
"learning_rate": 9.509536784741146e-06,
"loss": 0.0394,
"step": 350
},
{
"epoch": 0.9805924412665986,
"grad_norm": 2.076316899586314,
"learning_rate": 9.782016348773843e-06,
"loss": 0.0387,
"step": 360
},
{
"epoch": 1.0054477357848144,
"grad_norm": 1.7952293400096322,
"learning_rate": 9.99999095346085e-06,
"loss": 0.0332,
"step": 370
},
{
"epoch": 1.0326864147088866,
"grad_norm": 2.3966331090661925,
"learning_rate": 9.999674328027824e-06,
"loss": 0.0349,
"step": 380
},
{
"epoch": 1.0599250936329587,
"grad_norm": 1.8578054931433101,
"learning_rate": 9.998905408372662e-06,
"loss": 0.0332,
"step": 390
},
{
"epoch": 1.087163772557031,
"grad_norm": 2.067301033604286,
"learning_rate": 9.997684264055478e-06,
"loss": 0.036,
"step": 400
},
{
"epoch": 1.1144024514811033,
"grad_norm": 1.656723292655721,
"learning_rate": 9.99601100554677e-06,
"loss": 0.0315,
"step": 410
},
{
"epoch": 1.1416411304051755,
"grad_norm": 1.4305496511087652,
"learning_rate": 9.99388578421743e-06,
"loss": 0.0307,
"step": 420
},
{
"epoch": 1.1688798093292476,
"grad_norm": 1.4756680048744966,
"learning_rate": 9.991308792325045e-06,
"loss": 0.0279,
"step": 430
},
{
"epoch": 1.1961184882533198,
"grad_norm": 2.488649223393654,
"learning_rate": 9.988280262996507e-06,
"loss": 0.0263,
"step": 440
},
{
"epoch": 1.223357167177392,
"grad_norm": 1.4666750141420732,
"learning_rate": 9.98480047020693e-06,
"loss": 0.0268,
"step": 450
},
{
"epoch": 1.2505958461014641,
"grad_norm": 1.320119597330066,
"learning_rate": 9.980869728754847e-06,
"loss": 0.0298,
"step": 460
},
{
"epoch": 1.2778345250255363,
"grad_norm": 1.390841137051498,
"learning_rate": 9.976488394233752e-06,
"loss": 0.021,
"step": 470
},
{
"epoch": 1.3050732039496085,
"grad_norm": 1.4228820783227014,
"learning_rate": 9.971656862999917e-06,
"loss": 0.0276,
"step": 480
},
{
"epoch": 1.3323118828736806,
"grad_norm": 1.1183882912895564,
"learning_rate": 9.966375572136546e-06,
"loss": 0.0282,
"step": 490
},
{
"epoch": 1.3595505617977528,
"grad_norm": 1.1616366909466755,
"learning_rate": 9.960644999414226e-06,
"loss": 0.0214,
"step": 500
},
{
"epoch": 1.386789240721825,
"grad_norm": 1.409120984041759,
"learning_rate": 9.954465663247708e-06,
"loss": 0.0178,
"step": 510
},
{
"epoch": 1.4140279196458971,
"grad_norm": 0.9803319101168316,
"learning_rate": 9.947838122649014e-06,
"loss": 0.0202,
"step": 520
},
{
"epoch": 1.4412665985699693,
"grad_norm": 1.8131277665703773,
"learning_rate": 9.94076297717686e-06,
"loss": 0.0208,
"step": 530
},
{
"epoch": 1.4685052774940415,
"grad_norm": 1.1662863071032854,
"learning_rate": 9.933240866882418e-06,
"loss": 0.0169,
"step": 540
},
{
"epoch": 1.4957439564181136,
"grad_norm": 1.1465063039077388,
"learning_rate": 9.925272472251415e-06,
"loss": 0.0211,
"step": 550
},
{
"epoch": 1.5229826353421858,
"grad_norm": 1.193468165618273,
"learning_rate": 9.916858514142575e-06,
"loss": 0.0198,
"step": 560
},
{
"epoch": 1.550221314266258,
"grad_norm": 1.109324425932322,
"learning_rate": 9.907999753722407e-06,
"loss": 0.021,
"step": 570
},
{
"epoch": 1.5774599931903301,
"grad_norm": 1.290613695650256,
"learning_rate": 9.898696992396333e-06,
"loss": 0.0171,
"step": 580
},
{
"epoch": 1.6046986721144023,
"grad_norm": 1.6974083987042456,
"learning_rate": 9.888951071736215e-06,
"loss": 0.0192,
"step": 590
},
{
"epoch": 1.6319373510384745,
"grad_norm": 1.2163159155887682,
"learning_rate": 9.878762873404197e-06,
"loss": 0.013,
"step": 600
},
{
"epoch": 1.6591760299625467,
"grad_norm": 1.0193875076300951,
"learning_rate": 9.86813331907296e-06,
"loss": 0.0163,
"step": 610
},
{
"epoch": 1.686414708886619,
"grad_norm": 1.1034023564810655,
"learning_rate": 9.857063370342338e-06,
"loss": 0.0174,
"step": 620
},
{
"epoch": 1.7136533878106912,
"grad_norm": 1.5309168123244021,
"learning_rate": 9.845554028652331e-06,
"loss": 0.0158,
"step": 630
},
{
"epoch": 1.7408920667347634,
"grad_norm": 0.5532697619897224,
"learning_rate": 9.833606335192506e-06,
"loss": 0.0108,
"step": 640
},
{
"epoch": 1.7681307456588355,
"grad_norm": 1.0576521653821709,
"learning_rate": 9.821221370807805e-06,
"loss": 0.0154,
"step": 650
},
{
"epoch": 1.7953694245829077,
"grad_norm": 1.0783675700881836,
"learning_rate": 9.808400255900772e-06,
"loss": 0.0114,
"step": 660
},
{
"epoch": 1.8226081035069799,
"grad_norm": 0.599102289097004,
"learning_rate": 9.795144150330194e-06,
"loss": 0.0126,
"step": 670
},
{
"epoch": 1.849846782431052,
"grad_norm": 1.1075249779167584,
"learning_rate": 9.781454253306169e-06,
"loss": 0.0141,
"step": 680
},
{
"epoch": 1.8770854613551244,
"grad_norm": 0.69017494598675,
"learning_rate": 9.76733180328163e-06,
"loss": 0.0146,
"step": 690
},
{
"epoch": 1.9043241402791966,
"grad_norm": 0.9541591518502084,
"learning_rate": 9.752778077840302e-06,
"loss": 0.0097,
"step": 700
},
{
"epoch": 1.9315628192032688,
"grad_norm": 1.515252524842471,
"learning_rate": 9.737794393581125e-06,
"loss": 0.0102,
"step": 710
},
{
"epoch": 1.958801498127341,
"grad_norm": 0.6520067186874438,
"learning_rate": 9.722382105999156e-06,
"loss": 0.0121,
"step": 720
},
{
"epoch": 1.9860401770514131,
"grad_norm": 1.111673947257393,
"learning_rate": 9.706542609362928e-06,
"loss": 0.0101,
"step": 730
},
{
"epoch": 2.010895471569629,
"grad_norm": 1.2740352672206354,
"learning_rate": 9.690277336588338e-06,
"loss": 0.0093,
"step": 740
},
{
"epoch": 2.038134150493701,
"grad_norm": 1.2314399048766502,
"learning_rate": 9.673587759109007e-06,
"loss": 0.0082,
"step": 750
},
{
"epoch": 2.065372829417773,
"grad_norm": 0.6515664977184386,
"learning_rate": 9.656475386743166e-06,
"loss": 0.0078,
"step": 760
},
{
"epoch": 2.0926115083418453,
"grad_norm": 0.6076512914812847,
"learning_rate": 9.638941767557085e-06,
"loss": 0.0086,
"step": 770
},
{
"epoch": 2.1198501872659175,
"grad_norm": 0.49805773777686746,
"learning_rate": 9.620988487724999e-06,
"loss": 0.0094,
"step": 780
},
{
"epoch": 2.1470888661899896,
"grad_norm": 1.1149194861607004,
"learning_rate": 9.602617171385646e-06,
"loss": 0.0075,
"step": 790
},
{
"epoch": 2.174327545114062,
"grad_norm": 0.33357565062562405,
"learning_rate": 9.583829480495325e-06,
"loss": 0.0062,
"step": 800
},
{
"epoch": 2.201566224038134,
"grad_norm": 1.2725673141069516,
"learning_rate": 9.564627114677546e-06,
"loss": 0.0074,
"step": 810
},
{
"epoch": 2.2288049029622066,
"grad_norm": 0.8173124298927218,
"learning_rate": 9.54501181106928e-06,
"loss": 0.0102,
"step": 820
},
{
"epoch": 2.2560435818862787,
"grad_norm": 0.6950238807140082,
"learning_rate": 9.524985344163801e-06,
"loss": 0.0065,
"step": 830
},
{
"epoch": 2.283282260810351,
"grad_norm": 0.9944262630317349,
"learning_rate": 9.504549525650173e-06,
"loss": 0.0067,
"step": 840
},
{
"epoch": 2.310520939734423,
"grad_norm": 0.7645573101281118,
"learning_rate": 9.483706204249332e-06,
"loss": 0.0072,
"step": 850
},
{
"epoch": 2.3377596186584952,
"grad_norm": 0.5284367702467161,
"learning_rate": 9.462457265546867e-06,
"loss": 0.007,
"step": 860
},
{
"epoch": 2.3649982975825674,
"grad_norm": 0.5487759594771856,
"learning_rate": 9.440804631822421e-06,
"loss": 0.0073,
"step": 870
},
{
"epoch": 2.3922369765066396,
"grad_norm": 0.6876376014240485,
"learning_rate": 9.418750261875811e-06,
"loss": 0.006,
"step": 880
},
{
"epoch": 2.4194756554307117,
"grad_norm": 0.5930505890719068,
"learning_rate": 9.396296150849804e-06,
"loss": 0.0034,
"step": 890
},
{
"epoch": 2.446714334354784,
"grad_norm": 0.2924797737902157,
"learning_rate": 9.373444330049645e-06,
"loss": 0.0048,
"step": 900
},
{
"epoch": 2.473953013278856,
"grad_norm": 0.818504316665287,
"learning_rate": 9.350196866759289e-06,
"loss": 0.0065,
"step": 910
},
{
"epoch": 2.5011916922029283,
"grad_norm": 0.6173390676022147,
"learning_rate": 9.326555864054383e-06,
"loss": 0.0063,
"step": 920
},
{
"epoch": 2.5284303711270004,
"grad_norm": 0.42627927587804193,
"learning_rate": 9.302523460612015e-06,
"loss": 0.0055,
"step": 930
},
{
"epoch": 2.5556690500510726,
"grad_norm": 0.5539596964886677,
"learning_rate": 9.278101830517234e-06,
"loss": 0.0047,
"step": 940
},
{
"epoch": 2.5829077289751448,
"grad_norm": 0.3896212375163213,
"learning_rate": 9.253293183066382e-06,
"loss": 0.0056,
"step": 950
},
{
"epoch": 2.610146407899217,
"grad_norm": 0.658832029703366,
"learning_rate": 9.228099762567221e-06,
"loss": 0.0053,
"step": 960
},
{
"epoch": 2.637385086823289,
"grad_norm": 0.2940172656833637,
"learning_rate": 9.202523848135903e-06,
"loss": 0.005,
"step": 970
},
{
"epoch": 2.6646237657473613,
"grad_norm": 0.8264261097315277,
"learning_rate": 9.176567753490795e-06,
"loss": 0.0088,
"step": 980
},
{
"epoch": 2.6918624446714334,
"grad_norm": 0.47332662322314495,
"learning_rate": 9.15023382674317e-06,
"loss": 0.0049,
"step": 990
},
{
"epoch": 2.7191011235955056,
"grad_norm": 0.6043816603050725,
"learning_rate": 9.12352445018478e-06,
"loss": 0.0054,
"step": 1000
},
{
"epoch": 2.7191011235955056,
"eval_loss": 0.009971115738153458,
"eval_runtime": 149.6934,
"eval_samples_per_second": 1.336,
"eval_steps_per_second": 0.167,
"step": 1000
},
{
"epoch": 2.7463398025195778,
"grad_norm": 0.18838425652727245,
"learning_rate": 9.096442040072342e-06,
"loss": 0.0036,
"step": 1010
},
{
"epoch": 2.77357848144365,
"grad_norm": 0.33857822247434793,
"learning_rate": 9.06898904640896e-06,
"loss": 0.0038,
"step": 1020
},
{
"epoch": 2.800817160367722,
"grad_norm": 0.56645910713706,
"learning_rate": 9.04116795272248e-06,
"loss": 0.0055,
"step": 1030
},
{
"epoch": 2.8280558392917943,
"grad_norm": 0.28109810510949723,
"learning_rate": 9.01298127584082e-06,
"loss": 0.0036,
"step": 1040
},
{
"epoch": 2.8552945182158664,
"grad_norm": 0.542296858675033,
"learning_rate": 8.984431565664287e-06,
"loss": 0.0048,
"step": 1050
},
{
"epoch": 2.8825331971399386,
"grad_norm": 0.5373500607763515,
"learning_rate": 8.955521404934895e-06,
"loss": 0.0043,
"step": 1060
},
{
"epoch": 2.9097718760640108,
"grad_norm": 0.5880158628582186,
"learning_rate": 8.926253409002724e-06,
"loss": 0.0045,
"step": 1070
},
{
"epoch": 2.937010554988083,
"grad_norm": 1.0360372633008765,
"learning_rate": 8.896630225589325e-06,
"loss": 0.003,
"step": 1080
},
{
"epoch": 2.964249233912155,
"grad_norm": 0.6664430992854442,
"learning_rate": 8.866654534548188e-06,
"loss": 0.0035,
"step": 1090
},
{
"epoch": 2.9914879128362273,
"grad_norm": 0.3828974814299205,
"learning_rate": 8.836329047622315e-06,
"loss": 0.0051,
"step": 1100
},
{
"epoch": 3.0163432073544434,
"grad_norm": 0.5237696754640224,
"learning_rate": 8.805656508198893e-06,
"loss": 0.0025,
"step": 1110
},
{
"epoch": 3.0435818862785156,
"grad_norm": 0.31976718993311765,
"learning_rate": 8.774639691061133e-06,
"loss": 0.0027,
"step": 1120
},
{
"epoch": 3.0708205652025877,
"grad_norm": 0.8311789315891146,
"learning_rate": 8.743281402137234e-06,
"loss": 0.0043,
"step": 1130
},
{
"epoch": 3.09805924412666,
"grad_norm": 0.5395564063408896,
"learning_rate": 8.711584478246545e-06,
"loss": 0.0037,
"step": 1140
},
{
"epoch": 3.125297923050732,
"grad_norm": 0.4868560419941669,
"learning_rate": 8.679551786842947e-06,
"loss": 0.003,
"step": 1150
},
{
"epoch": 3.1525366019748042,
"grad_norm": 0.09982986237608656,
"learning_rate": 8.647186225755435e-06,
"loss": 0.004,
"step": 1160
},
{
"epoch": 3.1797752808988764,
"grad_norm": 0.2992508205991956,
"learning_rate": 8.614490722925976e-06,
"loss": 0.0025,
"step": 1170
},
{
"epoch": 3.2070139598229486,
"grad_norm": 0.7674760077175568,
"learning_rate": 8.581468236144624e-06,
"loss": 0.0029,
"step": 1180
},
{
"epoch": 3.2342526387470207,
"grad_norm": 0.688415374822658,
"learning_rate": 8.548121752781958e-06,
"loss": 0.0024,
"step": 1190
},
{
"epoch": 3.261491317671093,
"grad_norm": 0.9052988126796538,
"learning_rate": 8.514454289518815e-06,
"loss": 0.0034,
"step": 1200
},
{
"epoch": 3.288729996595165,
"grad_norm": 0.40120185903679884,
"learning_rate": 8.480468892073396e-06,
"loss": 0.002,
"step": 1210
},
{
"epoch": 3.3159686755192372,
"grad_norm": 0.5944020494018145,
"learning_rate": 8.446168634925744e-06,
"loss": 0.0027,
"step": 1220
},
{
"epoch": 3.3432073544433094,
"grad_norm": 0.4472237168192407,
"learning_rate": 8.411556621039587e-06,
"loss": 0.0076,
"step": 1230
},
{
"epoch": 3.3704460333673816,
"grad_norm": 0.28045654193358444,
"learning_rate": 8.376635981581652e-06,
"loss": 0.0035,
"step": 1240
},
{
"epoch": 3.3976847122914537,
"grad_norm": 0.17536506030522428,
"learning_rate": 8.341409875638396e-06,
"loss": 0.0038,
"step": 1250
},
{
"epoch": 3.424923391215526,
"grad_norm": 5.902141227200851,
"learning_rate": 8.305881489930224e-06,
"loss": 0.0075,
"step": 1260
},
{
"epoch": 3.452162070139598,
"grad_norm": 1.1226970845212438,
"learning_rate": 8.270054038523194e-06,
"loss": 0.0039,
"step": 1270
},
{
"epoch": 3.4794007490636703,
"grad_norm": 0.2305609441856671,
"learning_rate": 8.233930762538271e-06,
"loss": 0.0038,
"step": 1280
},
{
"epoch": 3.506639427987743,
"grad_norm": 0.17972249159415393,
"learning_rate": 8.197514929858108e-06,
"loss": 0.0033,
"step": 1290
},
{
"epoch": 3.533878106911815,
"grad_norm": 0.22572705207090887,
"learning_rate": 8.160809834831422e-06,
"loss": 0.0018,
"step": 1300
},
{
"epoch": 3.561116785835887,
"grad_norm": 0.48294663787369996,
"learning_rate": 8.123818797974973e-06,
"loss": 0.001,
"step": 1310
},
{
"epoch": 3.5883554647599594,
"grad_norm": 0.0324163008144768,
"learning_rate": 8.08654516567318e-06,
"loss": 0.0017,
"step": 1320
},
{
"epoch": 3.6155941436840315,
"grad_norm": 0.2702928606649647,
"learning_rate": 8.04899230987537e-06,
"loss": 0.0007,
"step": 1330
},
{
"epoch": 3.6428328226081037,
"grad_norm": 0.02733106163122347,
"learning_rate": 8.011163627790765e-06,
"loss": 0.0015,
"step": 1340
},
{
"epoch": 3.670071501532176,
"grad_norm": 0.17653493830017525,
"learning_rate": 7.97306254158113e-06,
"loss": 0.0033,
"step": 1350
},
{
"epoch": 3.697310180456248,
"grad_norm": 0.06779788652309977,
"learning_rate": 7.934692498051202e-06,
"loss": 0.0015,
"step": 1360
},
{
"epoch": 3.72454885938032,
"grad_norm": 0.04283617369672373,
"learning_rate": 7.896056968336868e-06,
"loss": 0.0016,
"step": 1370
},
{
"epoch": 3.7517875383043924,
"grad_norm": 0.4528535210316289,
"learning_rate": 7.857159447591153e-06,
"loss": 0.0015,
"step": 1380
},
{
"epoch": 3.7790262172284645,
"grad_norm": 0.03703542904667366,
"learning_rate": 7.81800345466804e-06,
"loss": 0.0007,
"step": 1390
},
{
"epoch": 3.8062648961525367,
"grad_norm": 0.22254648728704393,
"learning_rate": 7.778592531804115e-06,
"loss": 0.0018,
"step": 1400
},
{
"epoch": 3.833503575076609,
"grad_norm": 0.1551875795468263,
"learning_rate": 7.738930244298146e-06,
"loss": 0.003,
"step": 1410
},
{
"epoch": 3.860742254000681,
"grad_norm": 0.4498866311661179,
"learning_rate": 7.699020180188533e-06,
"loss": 0.0017,
"step": 1420
},
{
"epoch": 3.887980932924753,
"grad_norm": 1.4505936210625598,
"learning_rate": 7.658865949928717e-06,
"loss": 0.0021,
"step": 1430
},
{
"epoch": 3.9152196118488254,
"grad_norm": 0.2597247781008665,
"learning_rate": 7.618471186060574e-06,
"loss": 0.0009,
"step": 1440
},
{
"epoch": 3.9424582907728976,
"grad_norm": 0.05461306630022223,
"learning_rate": 7.577839542885783e-06,
"loss": 0.0012,
"step": 1450
},
{
"epoch": 3.9696969696969697,
"grad_norm": 0.2510446146562708,
"learning_rate": 7.5369746961352505e-06,
"loss": 0.0012,
"step": 1460
},
{
"epoch": 3.996935648621042,
"grad_norm": 0.027247865962538042,
"learning_rate": 7.495880342636581e-06,
"loss": 0.0016,
"step": 1470
},
{
"epoch": 4.021790943139258,
"grad_norm": 0.3181735974801834,
"learning_rate": 7.454560199979647e-06,
"loss": 0.0005,
"step": 1480
},
{
"epoch": 4.04902962206333,
"grad_norm": 0.03550322906594092,
"learning_rate": 7.413018006180278e-06,
"loss": 0.0006,
"step": 1490
},
{
"epoch": 4.076268300987402,
"grad_norm": 0.7839865968633526,
"learning_rate": 7.371257519342103e-06,
"loss": 0.0023,
"step": 1500
},
{
"epoch": 4.103506979911474,
"grad_norm": 0.27056209178067114,
"learning_rate": 7.329282517316574e-06,
"loss": 0.0013,
"step": 1510
},
{
"epoch": 4.130745658835546,
"grad_norm": 0.025007951464925165,
"learning_rate": 7.287096797361197e-06,
"loss": 0.0014,
"step": 1520
},
{
"epoch": 4.157984337759618,
"grad_norm": 0.1639293410219479,
"learning_rate": 7.244704175796028e-06,
"loss": 0.002,
"step": 1530
},
{
"epoch": 4.185223016683691,
"grad_norm": 0.09539274967398963,
"learning_rate": 7.202108487658416e-06,
"loss": 0.0017,
"step": 1540
},
{
"epoch": 4.212461695607763,
"grad_norm": 0.18901065014774657,
"learning_rate": 7.159313586356077e-06,
"loss": 0.002,
"step": 1550
},
{
"epoch": 4.239700374531835,
"grad_norm": 0.027388119425054664,
"learning_rate": 7.116323343318495e-06,
"loss": 0.0017,
"step": 1560
},
{
"epoch": 4.266939053455907,
"grad_norm": 0.13933358678308516,
"learning_rate": 7.073141647646691e-06,
"loss": 0.0009,
"step": 1570
},
{
"epoch": 4.294177732379979,
"grad_norm": 0.011964487372353175,
"learning_rate": 7.029772405761397e-06,
"loss": 0.0005,
"step": 1580
},
{
"epoch": 4.321416411304051,
"grad_norm": 0.044579473352322464,
"learning_rate": 6.9862195410496655e-06,
"loss": 0.0007,
"step": 1590
},
{
"epoch": 4.348655090228124,
"grad_norm": 0.9421209582985217,
"learning_rate": 6.942486993509941e-06,
"loss": 0.0011,
"step": 1600
},
{
"epoch": 4.375893769152196,
"grad_norm": 0.165168485428461,
"learning_rate": 6.898578719395622e-06,
"loss": 0.0007,
"step": 1610
},
{
"epoch": 4.403132448076268,
"grad_norm": 0.012552169908338339,
"learning_rate": 6.854498690857173e-06,
"loss": 0.0024,
"step": 1620
},
{
"epoch": 4.43037112700034,
"grad_norm": 0.33669756307886667,
"learning_rate": 6.810250895582773e-06,
"loss": 0.0013,
"step": 1630
},
{
"epoch": 4.457609805924413,
"grad_norm": 0.03430019485251983,
"learning_rate": 6.765839336437574e-06,
"loss": 0.001,
"step": 1640
},
{
"epoch": 4.484848484848484,
"grad_norm": 0.7568315535654873,
"learning_rate": 6.721268031101586e-06,
"loss": 0.0018,
"step": 1650
},
{
"epoch": 4.5120871637725575,
"grad_norm": 0.36926846488952053,
"learning_rate": 6.676541011706212e-06,
"loss": 0.0032,
"step": 1660
},
{
"epoch": 4.539325842696629,
"grad_norm": 0.30700403135022064,
"learning_rate": 6.631662324469492e-06,
"loss": 0.0021,
"step": 1670
},
{
"epoch": 4.566564521620702,
"grad_norm": 0.07283695365864443,
"learning_rate": 6.586636029330054e-06,
"loss": 0.0015,
"step": 1680
},
{
"epoch": 4.593803200544773,
"grad_norm": 0.5098936742183463,
"learning_rate": 6.5414661995798346e-06,
"loss": 0.0026,
"step": 1690
},
{
"epoch": 4.621041879468846,
"grad_norm": 0.15016221369549917,
"learning_rate": 6.496156921495594e-06,
"loss": 0.0023,
"step": 1700
},
{
"epoch": 4.648280558392918,
"grad_norm": 0.7027337621531187,
"learning_rate": 6.450712293969251e-06,
"loss": 0.0036,
"step": 1710
},
{
"epoch": 4.6755192373169905,
"grad_norm": 0.48920213901161386,
"learning_rate": 6.405136428137072e-06,
"loss": 0.0024,
"step": 1720
},
{
"epoch": 4.702757916241063,
"grad_norm": 0.526663059492545,
"learning_rate": 6.359433447007761e-06,
"loss": 0.0037,
"step": 1730
},
{
"epoch": 4.729996595165135,
"grad_norm": 0.43485515633750277,
"learning_rate": 6.313607485089479e-06,
"loss": 0.002,
"step": 1740
},
{
"epoch": 4.757235274089207,
"grad_norm": 0.1608557350260687,
"learning_rate": 6.267662688015811e-06,
"loss": 0.0011,
"step": 1750
},
{
"epoch": 4.784473953013279,
"grad_norm": 0.016233665978459856,
"learning_rate": 6.221603212170727e-06,
"loss": 0.0016,
"step": 1760
},
{
"epoch": 4.811712631937351,
"grad_norm": 0.3060301856403388,
"learning_rate": 6.175433224312588e-06,
"loss": 0.0008,
"step": 1770
},
{
"epoch": 4.8389513108614235,
"grad_norm": 0.0535023008279656,
"learning_rate": 6.129156901197195e-06,
"loss": 0.0007,
"step": 1780
},
{
"epoch": 4.866189989785496,
"grad_norm": 0.22805192659166784,
"learning_rate": 6.082778429199937e-06,
"loss": 0.0011,
"step": 1790
},
{
"epoch": 4.893428668709568,
"grad_norm": 0.06801175552041476,
"learning_rate": 6.036302003937076e-06,
"loss": 0.0004,
"step": 1800
},
{
"epoch": 4.92066734763364,
"grad_norm": 0.3046742023784698,
"learning_rate": 5.9897318298861885e-06,
"loss": 0.0007,
"step": 1810
},
{
"epoch": 4.947906026557712,
"grad_norm": 0.033986290353038136,
"learning_rate": 5.943072120005816e-06,
"loss": 0.0007,
"step": 1820
},
{
"epoch": 4.975144705481784,
"grad_norm": 0.09040671159275827,
"learning_rate": 5.89632709535433e-06,
"loss": 0.0019,
"step": 1830
},
{
"epoch": 5.0,
"grad_norm": 0.2530736784412786,
"learning_rate": 5.849500984708082e-06,
"loss": 0.0016,
"step": 1840
},
{
"epoch": 5.027238678924072,
"grad_norm": 0.08687153636471827,
"learning_rate": 5.802598024178848e-06,
"loss": 0.0004,
"step": 1850
},
{
"epoch": 5.054477357848144,
"grad_norm": 0.45420511928877233,
"learning_rate": 5.755622456830605e-06,
"loss": 0.0008,
"step": 1860
},
{
"epoch": 5.0817160367722165,
"grad_norm": 0.1542430485091151,
"learning_rate": 5.708578532295691e-06,
"loss": 0.0016,
"step": 1870
},
{
"epoch": 5.108954715696289,
"grad_norm": 1.3515082865895989,
"learning_rate": 5.661470506390354e-06,
"loss": 0.0011,
"step": 1880
},
{
"epoch": 5.136193394620361,
"grad_norm": 0.1596903735504535,
"learning_rate": 5.61430264072976e-06,
"loss": 0.0014,
"step": 1890
},
{
"epoch": 5.163432073544433,
"grad_norm": 0.19923117752022435,
"learning_rate": 5.5670792023424615e-06,
"loss": 0.0015,
"step": 1900
},
{
"epoch": 5.190670752468505,
"grad_norm": 0.302828135997675,
"learning_rate": 5.519804463284382e-06,
"loss": 0.0009,
"step": 1910
},
{
"epoch": 5.217909431392577,
"grad_norm": 0.04653233725601432,
"learning_rate": 5.472482700252347e-06,
"loss": 0.0012,
"step": 1920
},
{
"epoch": 5.2451481103166495,
"grad_norm": 0.5190292480319276,
"learning_rate": 5.425118194197196e-06,
"loss": 0.0023,
"step": 1930
},
{
"epoch": 5.272386789240722,
"grad_norm": 0.007225303260017864,
"learning_rate": 5.3777152299365e-06,
"loss": 0.0005,
"step": 1940
},
{
"epoch": 5.299625468164794,
"grad_norm": 0.08842755160666287,
"learning_rate": 5.3302780957669454e-06,
"loss": 0.0006,
"step": 1950
},
{
"epoch": 5.326864147088866,
"grad_norm": 0.024822557291839333,
"learning_rate": 5.282811083076388e-06,
"loss": 0.0004,
"step": 1960
},
{
"epoch": 5.354102826012938,
"grad_norm": 1.2627706166913806,
"learning_rate": 5.235318485955638e-06,
"loss": 0.0007,
"step": 1970
},
{
"epoch": 5.38134150493701,
"grad_norm": 0.03392392848083845,
"learning_rate": 5.187804600809995e-06,
"loss": 0.0014,
"step": 1980
},
{
"epoch": 5.4085801838610825,
"grad_norm": 0.009487721281030682,
"learning_rate": 5.140273725970569e-06,
"loss": 0.0019,
"step": 1990
},
{
"epoch": 5.435818862785155,
"grad_norm": 0.03169860186889457,
"learning_rate": 5.092730161305444e-06,
"loss": 0.0005,
"step": 2000
},
{
"epoch": 5.435818862785155,
"eval_loss": 0.003610835410654545,
"eval_runtime": 149.9472,
"eval_samples_per_second": 1.334,
"eval_steps_per_second": 0.167,
"step": 2000
},
{
"epoch": 5.463057541709227,
"grad_norm": 0.26681987017023606,
"learning_rate": 5.045178207830687e-06,
"loss": 0.0005,
"step": 2010
},
{
"epoch": 5.490296220633299,
"grad_norm": 0.020589913051757485,
"learning_rate": 4.997622167321246e-06,
"loss": 0.0004,
"step": 2020
},
{
"epoch": 5.517534899557371,
"grad_norm": 0.005508716128027595,
"learning_rate": 4.950066341921813e-06,
"loss": 0.0002,
"step": 2030
},
{
"epoch": 5.544773578481443,
"grad_norm": 0.12421491718000478,
"learning_rate": 4.902515033757617e-06,
"loss": 0.0002,
"step": 2040
},
{
"epoch": 5.5720122574055155,
"grad_norm": 0.6904314051957775,
"learning_rate": 4.854972544545231e-06,
"loss": 0.0011,
"step": 2050
},
{
"epoch": 5.599250936329588,
"grad_norm": 0.020582980767102473,
"learning_rate": 4.807443175203432e-06,
"loss": 0.0008,
"step": 2060
},
{
"epoch": 5.62648961525366,
"grad_norm": 0.26044403454004866,
"learning_rate": 4.759931225464107e-06,
"loss": 0.0011,
"step": 2070
},
{
"epoch": 5.653728294177732,
"grad_norm": 0.13198469815528743,
"learning_rate": 4.712440993483281e-06,
"loss": 0.0005,
"step": 2080
},
{
"epoch": 5.680966973101804,
"grad_norm": 0.24787019272348249,
"learning_rate": 4.664976775452293e-06,
"loss": 0.0016,
"step": 2090
},
{
"epoch": 5.708205652025876,
"grad_norm": 0.06673597677280482,
"learning_rate": 4.617542865209133e-06,
"loss": 0.0001,
"step": 2100
},
{
"epoch": 5.7354443309499485,
"grad_norm": 0.01989819994546987,
"learning_rate": 4.5701435538500065e-06,
"loss": 0.0002,
"step": 2110
},
{
"epoch": 5.762683009874021,
"grad_norm": 0.06711337766264915,
"learning_rate": 4.522783129341141e-06,
"loss": 0.0011,
"step": 2120
},
{
"epoch": 5.789921688798094,
"grad_norm": 0.0258406372816757,
"learning_rate": 4.475465876130872e-06,
"loss": 0.0002,
"step": 2130
},
{
"epoch": 5.817160367722165,
"grad_norm": 0.0069848118261474626,
"learning_rate": 4.428196074762057e-06,
"loss": 0.0001,
"step": 2140
},
{
"epoch": 5.844399046646238,
"grad_norm": 0.00465294693967376,
"learning_rate": 4.380978001484836e-06,
"loss": 0.0008,
"step": 2150
},
{
"epoch": 5.871637725570309,
"grad_norm": 0.062291913740785465,
"learning_rate": 4.33381592786978e-06,
"loss": 0.0001,
"step": 2160
},
{
"epoch": 5.898876404494382,
"grad_norm": 0.00324106094253182,
"learning_rate": 4.286714120421465e-06,
"loss": 0.0001,
"step": 2170
},
{
"epoch": 5.926115083418454,
"grad_norm": 0.010405479601857403,
"learning_rate": 4.2396768401925044e-06,
"loss": 0.0001,
"step": 2180
},
{
"epoch": 5.953353762342527,
"grad_norm": 0.003621476287705226,
"learning_rate": 4.1927083423980755e-06,
"loss": 0.0001,
"step": 2190
},
{
"epoch": 5.980592441266599,
"grad_norm": 0.15175481500791388,
"learning_rate": 4.145812876030965e-06,
"loss": 0.0001,
"step": 2200
},
{
"epoch": 6.005447735784815,
"grad_norm": 0.009474114226140813,
"learning_rate": 4.098994683477197e-06,
"loss": 0.0,
"step": 2210
},
{
"epoch": 6.032686414708887,
"grad_norm": 0.010773928421478833,
"learning_rate": 4.0522580001322365e-06,
"loss": 0.0,
"step": 2220
},
{
"epoch": 6.059925093632959,
"grad_norm": 0.002749306413145174,
"learning_rate": 4.0056070540178425e-06,
"loss": 0.0,
"step": 2230
},
{
"epoch": 6.087163772557031,
"grad_norm": 0.001948904462067843,
"learning_rate": 3.959046065399575e-06,
"loss": 0.0,
"step": 2240
},
{
"epoch": 6.114402451481103,
"grad_norm": 0.001621993315411648,
"learning_rate": 3.912579246405016e-06,
"loss": 0.0,
"step": 2250
},
{
"epoch": 6.1416411304051755,
"grad_norm": 0.001255150538557096,
"learning_rate": 3.8662108006427165e-06,
"loss": 0.0,
"step": 2260
},
{
"epoch": 6.168879809329248,
"grad_norm": 0.004272455750195676,
"learning_rate": 3.819944922821914e-06,
"loss": 0.0,
"step": 2270
},
{
"epoch": 6.19611848825332,
"grad_norm": 0.00187243023068361,
"learning_rate": 3.773785798373069e-06,
"loss": 0.0,
"step": 2280
},
{
"epoch": 6.223357167177392,
"grad_norm": 0.0009374589516518995,
"learning_rate": 3.7277376030692263e-06,
"loss": 0.0,
"step": 2290
},
{
"epoch": 6.250595846101464,
"grad_norm": 0.001253252761018019,
"learning_rate": 3.681804502648254e-06,
"loss": 0.0,
"step": 2300
},
{
"epoch": 6.277834525025536,
"grad_norm": 0.002173317313832616,
"learning_rate": 3.6359906524359932e-06,
"loss": 0.0,
"step": 2310
},
{
"epoch": 6.3050732039496085,
"grad_norm": 0.00122096897637743,
"learning_rate": 3.590300196970341e-06,
"loss": 0.0,
"step": 2320
},
{
"epoch": 6.332311882873681,
"grad_norm": 0.0008158321760346172,
"learning_rate": 3.544737269626328e-06,
"loss": 0.0,
"step": 2330
},
{
"epoch": 6.359550561797753,
"grad_norm": 0.0011021236484427205,
"learning_rate": 3.4993059922421835e-06,
"loss": 0.0,
"step": 2340
},
{
"epoch": 6.386789240721825,
"grad_norm": 0.0007113597066715369,
"learning_rate": 3.4540104747464575e-06,
"loss": 0.0,
"step": 2350
},
{
"epoch": 6.414027919645897,
"grad_norm": 0.0007186713752736677,
"learning_rate": 3.408854814786219e-06,
"loss": 0.0,
"step": 2360
},
{
"epoch": 6.441266598569969,
"grad_norm": 0.0005956363546399899,
"learning_rate": 3.3638430973563597e-06,
"loss": 0.0,
"step": 2370
},
{
"epoch": 6.4685052774940415,
"grad_norm": 0.0013985565483587347,
"learning_rate": 3.318979394430051e-06,
"loss": 0.0,
"step": 2380
},
{
"epoch": 6.495743956418114,
"grad_norm": 0.0008815209213564751,
"learning_rate": 3.27426776459037e-06,
"loss": 0.0003,
"step": 2390
},
{
"epoch": 6.522982635342186,
"grad_norm": 0.0024178562381092818,
"learning_rate": 3.22971225266314e-06,
"loss": 0.0,
"step": 2400
},
{
"epoch": 6.550221314266258,
"grad_norm": 0.0010728069443376714,
"learning_rate": 3.1853168893510223e-06,
"loss": 0.0,
"step": 2410
},
{
"epoch": 6.57745999319033,
"grad_norm": 0.0010548514445552371,
"learning_rate": 3.141085690868871e-06,
"loss": 0.0,
"step": 2420
},
{
"epoch": 6.604698672114402,
"grad_norm": 0.0021998928664552233,
"learning_rate": 3.0970226585804175e-06,
"loss": 0.0001,
"step": 2430
},
{
"epoch": 6.6319373510384745,
"grad_norm": 0.0011062246966726207,
"learning_rate": 3.053131778636278e-06,
"loss": 0.0,
"step": 2440
},
{
"epoch": 6.659176029962547,
"grad_norm": 0.0011797120379395778,
"learning_rate": 3.0094170216133545e-06,
"loss": 0.0,
"step": 2450
},
{
"epoch": 6.686414708886619,
"grad_norm": 0.007225871987921568,
"learning_rate": 2.965882342155637e-06,
"loss": 0.0,
"step": 2460
},
{
"epoch": 6.713653387810691,
"grad_norm": 0.0010544539477212198,
"learning_rate": 2.9225316786164417e-06,
"loss": 0.0,
"step": 2470
},
{
"epoch": 6.740892066734763,
"grad_norm": 0.0006976448579587069,
"learning_rate": 2.8793689527021377e-06,
"loss": 0.0,
"step": 2480
},
{
"epoch": 6.768130745658835,
"grad_norm": 0.0027697202568828954,
"learning_rate": 2.836398069117362e-06,
"loss": 0.0,
"step": 2490
},
{
"epoch": 6.7953694245829075,
"grad_norm": 0.0005279000452700497,
"learning_rate": 2.7936229152117896e-06,
"loss": 0.0,
"step": 2500
},
{
"epoch": 6.82260810350698,
"grad_norm": 0.000826146277516193,
"learning_rate": 2.751047360628458e-06,
"loss": 0.0,
"step": 2510
},
{
"epoch": 6.849846782431052,
"grad_norm": 0.0007622689637331964,
"learning_rate": 2.708675256953708e-06,
"loss": 0.0,
"step": 2520
},
{
"epoch": 6.877085461355124,
"grad_norm": 0.0011470272220122446,
"learning_rate": 2.6665104373687455e-06,
"loss": 0.0006,
"step": 2530
},
{
"epoch": 6.904324140279196,
"grad_norm": 0.0009952201118956113,
"learning_rate": 2.624556716302876e-06,
"loss": 0.0,
"step": 2540
},
{
"epoch": 6.931562819203268,
"grad_norm": 0.0011435928941426047,
"learning_rate": 2.582817889088435e-06,
"loss": 0.0,
"step": 2550
},
{
"epoch": 6.9588014981273405,
"grad_norm": 0.01166505316182784,
"learning_rate": 2.541297731617437e-06,
"loss": 0.0,
"step": 2560
},
{
"epoch": 6.986040177051413,
"grad_norm": 0.0012091660156564501,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.0,
"step": 2570
},
{
"epoch": 7.010895471569629,
"grad_norm": 0.0015980505913388763,
"learning_rate": 2.458928430224548e-06,
"loss": 0.0,
"step": 2580
},
{
"epoch": 7.038134150493701,
"grad_norm": 0.0009393729565236982,
"learning_rate": 2.4180867378198274e-06,
"loss": 0.0,
"step": 2590
},
{
"epoch": 7.065372829417774,
"grad_norm": 0.0008257129118958041,
"learning_rate": 2.3774786175187932e-06,
"loss": 0.0,
"step": 2600
},
{
"epoch": 7.092611508341846,
"grad_norm": 0.0018052707906574057,
"learning_rate": 2.337107742924359e-06,
"loss": 0.0,
"step": 2610
},
{
"epoch": 7.119850187265918,
"grad_norm": 0.1645764072539005,
"learning_rate": 2.29697776617707e-06,
"loss": 0.0,
"step": 2620
},
{
"epoch": 7.14708886618999,
"grad_norm": 0.0007519843719597862,
"learning_rate": 2.25709231762471e-06,
"loss": 0.0,
"step": 2630
},
{
"epoch": 7.174327545114062,
"grad_norm": 0.0008477596842611903,
"learning_rate": 2.217455005493884e-06,
"loss": 0.0,
"step": 2640
},
{
"epoch": 7.201566224038134,
"grad_norm": 0.004509417330392989,
"learning_rate": 2.1780694155636014e-06,
"loss": 0.0,
"step": 2650
},
{
"epoch": 7.228804902962207,
"grad_norm": 0.0005766378225871001,
"learning_rate": 2.138939110840888e-06,
"loss": 0.0,
"step": 2660
},
{
"epoch": 7.256043581886279,
"grad_norm": 0.004100067804604992,
"learning_rate": 2.100067631238464e-06,
"loss": 0.0,
"step": 2670
},
{
"epoch": 7.283282260810351,
"grad_norm": 0.003930226882351156,
"learning_rate": 2.0614584932544955e-06,
"loss": 0.0,
"step": 2680
},
{
"epoch": 7.310520939734423,
"grad_norm": 0.0004427737628504764,
"learning_rate": 2.023115189654491e-06,
"loss": 0.0,
"step": 2690
},
{
"epoch": 7.337759618658495,
"grad_norm": 0.0008888326172641632,
"learning_rate": 1.9850411891553186e-06,
"loss": 0.0,
"step": 2700
},
{
"epoch": 7.364998297582567,
"grad_norm": 0.0006039128437105985,
"learning_rate": 1.9472399361114126e-06,
"loss": 0.0,
"step": 2710
},
{
"epoch": 7.39223697650664,
"grad_norm": 0.0004893946381054215,
"learning_rate": 1.909714850203177e-06,
"loss": 0.0,
"step": 2720
},
{
"epoch": 7.419475655430712,
"grad_norm": 0.0004317709133183237,
"learning_rate": 1.8724693261276345e-06,
"loss": 0.0,
"step": 2730
},
{
"epoch": 7.446714334354784,
"grad_norm": 0.0006212197750363989,
"learning_rate": 1.8355067332913156e-06,
"loss": 0.0,
"step": 2740
},
{
"epoch": 7.473953013278856,
"grad_norm": 0.0007456921468394779,
"learning_rate": 1.7988304155054541e-06,
"loss": 0.0,
"step": 2750
},
{
"epoch": 7.501191692202928,
"grad_norm": 0.0008149478511071161,
"learning_rate": 1.7624436906834842e-06,
"loss": 0.0,
"step": 2760
},
{
"epoch": 7.528430371127,
"grad_norm": 0.0007225352461383864,
"learning_rate": 1.7263498505408893e-06,
"loss": 0.0,
"step": 2770
},
{
"epoch": 7.555669050051073,
"grad_norm": 0.0005959357394697677,
"learning_rate": 1.6905521602974183e-06,
"loss": 0.0,
"step": 2780
},
{
"epoch": 7.582907728975145,
"grad_norm": 0.000635015561952419,
"learning_rate": 1.6550538583816967e-06,
"loss": 0.0,
"step": 2790
},
{
"epoch": 7.610146407899217,
"grad_norm": 0.0005433753417863713,
"learning_rate": 1.6198581561382643e-06,
"loss": 0.0,
"step": 2800
},
{
"epoch": 7.637385086823289,
"grad_norm": 0.00046276580358275586,
"learning_rate": 1.5849682375370601e-06,
"loss": 0.0,
"step": 2810
},
{
"epoch": 7.664623765747361,
"grad_norm": 0.0008516896607091807,
"learning_rate": 1.550387258885388e-06,
"loss": 0.0,
"step": 2820
},
{
"epoch": 7.691862444671433,
"grad_norm": 0.0005163692295676414,
"learning_rate": 1.5161183485423785e-06,
"loss": 0.0,
"step": 2830
},
{
"epoch": 7.719101123595506,
"grad_norm": 0.0011709181420231178,
"learning_rate": 1.482164606635989e-06,
"loss": 0.0,
"step": 2840
},
{
"epoch": 7.746339802519578,
"grad_norm": 0.0013056291673384528,
"learning_rate": 1.44852910478254e-06,
"loss": 0.0,
"step": 2850
},
{
"epoch": 7.77357848144365,
"grad_norm": 0.0004465783868552864,
"learning_rate": 1.4152148858088554e-06,
"loss": 0.0,
"step": 2860
},
{
"epoch": 7.800817160367722,
"grad_norm": 0.0005635784726540899,
"learning_rate": 1.3822249634769864e-06,
"loss": 0.0,
"step": 2870
},
{
"epoch": 7.828055839291794,
"grad_norm": 0.0003537589230160337,
"learning_rate": 1.3495623222115735e-06,
"loss": 0.0,
"step": 2880
},
{
"epoch": 7.855294518215866,
"grad_norm": 0.0004597321862407293,
"learning_rate": 1.3172299168298614e-06,
"loss": 0.0,
"step": 2890
},
{
"epoch": 7.882533197139939,
"grad_norm": 0.00040170519106549137,
"learning_rate": 1.2852306722743934e-06,
"loss": 0.0,
"step": 2900
},
{
"epoch": 7.909771876064011,
"grad_norm": 0.0016780869556999771,
"learning_rate": 1.253567483348407e-06,
"loss": 0.0,
"step": 2910
},
{
"epoch": 7.937010554988083,
"grad_norm": 0.000593202963073429,
"learning_rate": 1.222243214453951e-06,
"loss": 0.0,
"step": 2920
},
{
"epoch": 7.964249233912155,
"grad_norm": 0.0003643083864401578,
"learning_rate": 1.1912606993327685e-06,
"loss": 0.0,
"step": 2930
},
{
"epoch": 7.991487912836227,
"grad_norm": 0.0007434796810202931,
"learning_rate": 1.1606227408099347e-06,
"loss": 0.0,
"step": 2940
},
{
"epoch": 8.016343207354444,
"grad_norm": 0.00047702794678360533,
"learning_rate": 1.1303321105403026e-06,
"loss": 0.0,
"step": 2950
},
{
"epoch": 8.043581886278515,
"grad_norm": 0.0005298663692543725,
"learning_rate": 1.1003915487577683e-06,
"loss": 0.0,
"step": 2960
},
{
"epoch": 8.070820565202588,
"grad_norm": 0.004074165794804811,
"learning_rate": 1.0708037640273715e-06,
"loss": 0.0,
"step": 2970
},
{
"epoch": 8.09805924412666,
"grad_norm": 0.00040511964399428683,
"learning_rate": 1.0415714330002729e-06,
"loss": 0.0,
"step": 2980
},
{
"epoch": 8.125297923050733,
"grad_norm": 0.0012113673680468461,
"learning_rate": 1.0126972001716007e-06,
"loss": 0.0,
"step": 2990
},
{
"epoch": 8.152536601974804,
"grad_norm": 0.00041889827396330973,
"learning_rate": 9.841836776412294e-07,
"loss": 0.0,
"step": 3000
},
{
"epoch": 8.152536601974804,
"eval_loss": 1.2345339200692251e-05,
"eval_runtime": 149.4558,
"eval_samples_per_second": 1.338,
"eval_steps_per_second": 0.167,
"step": 3000
},
{
"epoch": 8.179775280898877,
"grad_norm": 0.00048789814951889245,
"learning_rate": 9.560334448774705e-07,
"loss": 0.0,
"step": 3010
},
{
"epoch": 8.207013959822948,
"grad_norm": 0.00047638289528187925,
"learning_rate": 9.282490484837215e-07,
"loss": 0.0,
"step": 3020
},
{
"epoch": 8.234252638747021,
"grad_norm": 0.000455674971676968,
"learning_rate": 9.008330019680883e-07,
"loss": 0.0,
"step": 3030
},
{
"epoch": 8.261491317671092,
"grad_norm": 0.00041234844635284676,
"learning_rate": 8.737877855160032e-07,
"loss": 0.0,
"step": 3040
},
{
"epoch": 8.288729996595166,
"grad_norm": 0.0004512145872248025,
"learning_rate": 8.471158457658546e-07,
"loss": 0.0,
"step": 3050
},
{
"epoch": 8.315968675519237,
"grad_norm": 0.0003955500503942679,
"learning_rate": 8.208195955876513e-07,
"loss": 0.0,
"step": 3060
},
{
"epoch": 8.34320735444331,
"grad_norm": 0.00037757417965770856,
"learning_rate": 7.949014138647442e-07,
"loss": 0.0,
"step": 3070
},
{
"epoch": 8.370446033367381,
"grad_norm": 0.0005928754851042547,
"learning_rate": 7.693636452786213e-07,
"loss": 0.0,
"step": 3080
},
{
"epoch": 8.397684712291454,
"grad_norm": 0.0011757489143008526,
"learning_rate": 7.442086000967962e-07,
"loss": 0.0,
"step": 3090
},
{
"epoch": 8.424923391215525,
"grad_norm": 0.001091795641988052,
"learning_rate": 7.194385539638099e-07,
"loss": 0.0,
"step": 3100
},
{
"epoch": 8.452162070139599,
"grad_norm": 0.0004490144524655851,
"learning_rate": 6.950557476953674e-07,
"loss": 0.0,
"step": 3110
},
{
"epoch": 8.47940074906367,
"grad_norm": 0.0005170136229882478,
"learning_rate": 6.710623870756178e-07,
"loss": 0.0,
"step": 3120
},
{
"epoch": 8.506639427987743,
"grad_norm": 0.0005606121390368469,
"learning_rate": 6.474606426576157e-07,
"loss": 0.0,
"step": 3130
},
{
"epoch": 8.533878106911814,
"grad_norm": 0.0006494691276362171,
"learning_rate": 6.242526495669587e-07,
"loss": 0.0,
"step": 3140
},
{
"epoch": 8.561116785835887,
"grad_norm": 0.000705761138730125,
"learning_rate": 6.01440507308631e-07,
"loss": 0.0,
"step": 3150
},
{
"epoch": 8.588355464759958,
"grad_norm": 0.00037580712390274833,
"learning_rate": 5.790262795770785e-07,
"loss": 0.0,
"step": 3160
},
{
"epoch": 8.615594143684032,
"grad_norm": 0.00037712299260292257,
"learning_rate": 5.570119940695135e-07,
"loss": 0.0,
"step": 3170
},
{
"epoch": 8.642832822608103,
"grad_norm": 0.00038488025576517655,
"learning_rate": 5.353996423024804e-07,
"loss": 0.0,
"step": 3180
},
{
"epoch": 8.670071501532176,
"grad_norm": 0.0004034621134836744,
"learning_rate": 5.141911794316934e-07,
"loss": 0.0,
"step": 3190
},
{
"epoch": 8.697310180456247,
"grad_norm": 0.00038266845732824815,
"learning_rate": 4.93388524075164e-07,
"loss": 0.0,
"step": 3200
},
{
"epoch": 8.72454885938032,
"grad_norm": 0.00038362011868780437,
"learning_rate": 4.729935581396328e-07,
"loss": 0.0,
"step": 3210
},
{
"epoch": 8.751787538304391,
"grad_norm": 0.00035607543651581207,
"learning_rate": 4.5300812665032557e-07,
"loss": 0.0,
"step": 3220
},
{
"epoch": 8.779026217228465,
"grad_norm": 0.0006068778689349405,
"learning_rate": 4.334340375840418e-07,
"loss": 0.0,
"step": 3230
},
{
"epoch": 8.806264896152536,
"grad_norm": 0.00033490740677774155,
"learning_rate": 4.1427306170559624e-07,
"loss": 0.0,
"step": 3240
},
{
"epoch": 8.833503575076609,
"grad_norm": 0.00046859576305887497,
"learning_rate": 3.955269324076294e-07,
"loss": 0.0,
"step": 3250
},
{
"epoch": 8.86074225400068,
"grad_norm": 0.0008093853787619229,
"learning_rate": 3.771973455537936e-07,
"loss": 0.0,
"step": 3260
},
{
"epoch": 8.887980932924753,
"grad_norm": 0.0010513228996015518,
"learning_rate": 3.5928595932534005e-07,
"loss": 0.0,
"step": 3270
},
{
"epoch": 8.915219611848826,
"grad_norm": 0.0003623450487854711,
"learning_rate": 3.4179439407110714e-07,
"loss": 0.0,
"step": 3280
},
{
"epoch": 8.942458290772898,
"grad_norm": 0.000339362832377934,
"learning_rate": 3.247242321609434e-07,
"loss": 0.0,
"step": 3290
},
{
"epoch": 8.969696969696969,
"grad_norm": 0.0005334885527562191,
"learning_rate": 3.0807701784255296e-07,
"loss": 0.0,
"step": 3300
},
{
"epoch": 8.996935648621042,
"grad_norm": 0.00044685136023717045,
"learning_rate": 2.9185425710179737e-07,
"loss": 0.0,
"step": 3310
},
{
"epoch": 9.021790943139258,
"grad_norm": 0.0008018093014633273,
"learning_rate": 2.7605741752645686e-07,
"loss": 0.0,
"step": 3320
},
{
"epoch": 9.04902962206333,
"grad_norm": 0.0003794485076320959,
"learning_rate": 2.606879281734659e-07,
"loss": 0.0,
"step": 3330
},
{
"epoch": 9.076268300987403,
"grad_norm": 0.000514433360537291,
"learning_rate": 2.457471794396338e-07,
"loss": 0.0,
"step": 3340
},
{
"epoch": 9.103506979911474,
"grad_norm": 0.0006110392205499266,
"learning_rate": 2.3123652293586207e-07,
"loss": 0.0,
"step": 3350
},
{
"epoch": 9.130745658835547,
"grad_norm": 0.0004051883420581768,
"learning_rate": 2.1715727136487174e-07,
"loss": 0.0,
"step": 3360
},
{
"epoch": 9.157984337759618,
"grad_norm": 0.0017022735873803575,
"learning_rate": 2.0351069840244986e-07,
"loss": 0.0,
"step": 3370
},
{
"epoch": 9.185223016683691,
"grad_norm": 0.000833252754224706,
"learning_rate": 1.9029803858222896e-07,
"loss": 0.0,
"step": 3380
},
{
"epoch": 9.212461695607763,
"grad_norm": 0.0038641354937789443,
"learning_rate": 1.775204871840014e-07,
"loss": 0.0,
"step": 3390
},
{
"epoch": 9.239700374531836,
"grad_norm": 0.0003339132774971038,
"learning_rate": 1.6517920012559086e-07,
"loss": 0.0,
"step": 3400
},
{
"epoch": 9.266939053455907,
"grad_norm": 0.0003633204131676538,
"learning_rate": 1.5327529385828377e-07,
"loss": 0.0,
"step": 3410
},
{
"epoch": 9.29417773237998,
"grad_norm": 0.000322045602551698,
"learning_rate": 1.4180984526582675e-07,
"loss": 0.0,
"step": 3420
},
{
"epoch": 9.321416411304051,
"grad_norm": 0.0010744986200642887,
"learning_rate": 1.3078389156700842e-07,
"loss": 0.0,
"step": 3430
},
{
"epoch": 9.348655090228124,
"grad_norm": 0.0005973674276566756,
"learning_rate": 1.2019843022182898e-07,
"loss": 0.0,
"step": 3440
},
{
"epoch": 9.375893769152196,
"grad_norm": 0.00035574829244011577,
"learning_rate": 1.1005441884126278e-07,
"loss": 0.0,
"step": 3450
},
{
"epoch": 9.403132448076269,
"grad_norm": 0.0005334084677271788,
"learning_rate": 1.0035277510062835e-07,
"loss": 0.0,
"step": 3460
},
{
"epoch": 9.43037112700034,
"grad_norm": 0.00032947386696114886,
"learning_rate": 9.109437665657473e-08,
"loss": 0.0,
"step": 3470
},
{
"epoch": 9.457609805924413,
"grad_norm": 0.0003827344363811748,
"learning_rate": 8.228006106767883e-08,
"loss": 0.0,
"step": 3480
},
{
"epoch": 9.484848484848484,
"grad_norm": 0.00030067997503277467,
"learning_rate": 7.391062571868113e-08,
"loss": 0.0,
"step": 3490
},
{
"epoch": 9.512087163772557,
"grad_norm": 0.0003243561088926095,
"learning_rate": 6.598682774834775e-08,
"loss": 0.0,
"step": 3500
},
{
"epoch": 9.539325842696629,
"grad_norm": 0.0003467443287273098,
"learning_rate": 5.850938398097583e-08,
"loss": 0.0,
"step": 3510
},
{
"epoch": 9.566564521620702,
"grad_norm": 0.00035286356515664503,
"learning_rate": 5.1478970861548185e-08,
"loss": 0.0,
"step": 3520
},
{
"epoch": 9.593803200544773,
"grad_norm": 0.0009776284074226168,
"learning_rate": 4.4896224394537226e-08,
"loss": 0.0,
"step": 3530
},
{
"epoch": 9.621041879468846,
"grad_norm": 0.00039195791750442195,
"learning_rate": 3.8761740086369345e-08,
"loss": 0.0,
"step": 3540
},
{
"epoch": 9.648280558392917,
"grad_norm": 0.0005306094646929393,
"learning_rate": 3.307607289155301e-08,
"loss": 0.0,
"step": 3550
},
{
"epoch": 9.67551923731699,
"grad_norm": 0.0004368446632988839,
"learning_rate": 2.78397371624739e-08,
"loss": 0.0,
"step": 3560
},
{
"epoch": 9.702757916241062,
"grad_norm": 0.0006477180396400463,
"learning_rate": 2.305320660286603e-08,
"loss": 0.0,
"step": 3570
},
{
"epoch": 9.729996595165135,
"grad_norm": 0.0005051573397374285,
"learning_rate": 1.8716914224957138e-08,
"loss": 0.0,
"step": 3580
},
{
"epoch": 9.757235274089206,
"grad_norm": 0.00035959712658455894,
"learning_rate": 1.4831252310294474e-08,
"loss": 0.0,
"step": 3590
},
{
"epoch": 9.78447395301328,
"grad_norm": 0.00047611287827840345,
"learning_rate": 1.1396572374261505e-08,
"loss": 0.0,
"step": 3600
},
{
"epoch": 9.81171263193735,
"grad_norm": 0.0009275444049437501,
"learning_rate": 8.413185134273916e-09,
"loss": 0.0,
"step": 3610
},
{
"epoch": 9.838951310861423,
"grad_norm": 0.0010032462120567445,
"learning_rate": 5.881360481673759e-09,
"loss": 0.0,
"step": 3620
},
{
"epoch": 9.866189989785495,
"grad_norm": 0.0008774671042869975,
"learning_rate": 3.801327457311765e-09,
"loss": 0.0,
"step": 3630
},
{
"epoch": 9.893428668709568,
"grad_norm": 0.0007470443454518441,
"learning_rate": 2.173274230827249e-09,
"loss": 0.0,
"step": 3640
},
{
"epoch": 9.920667347633639,
"grad_norm": 0.0006590459160652287,
"learning_rate": 9.97348083627836e-10,
"loss": 0.0,
"step": 3650
},
{
"epoch": 9.947906026557712,
"grad_norm": 0.0008391834531967672,
"learning_rate": 2.7365539556234444e-10,
"loss": 0.0,
"step": 3660
},
{
"epoch": 9.975144705481783,
"grad_norm": 0.00034623849620746836,
"learning_rate": 2.261635299039e-12,
"loss": 0.0,
"step": 3670
},
{
"epoch": 9.975144705481783,
"step": 3670,
"total_flos": 3659279013773312.0,
"train_loss": 0.050615286758120444,
"train_runtime": 386241.8476,
"train_samples_per_second": 2.433,
"train_steps_per_second": 0.01
}
],
"logging_steps": 10,
"max_steps": 3670,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3659279013773312.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
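
The JSON above is the trainer state written out at the end of the run (as produced by the Hugging Face Trainer): "log_history" holds one record per logging step with "loss", "learning_rate", and "grad_norm", plus periodic evaluation records carrying "eval_loss" and runtime stats. Below is a minimal sketch for inspecting those curves; it assumes the file is saved locally as trainer_state.json and that matplotlib is installed, both of which are illustrative choices rather than anything prescribed by the repository.

# Minimal sketch: parse trainer_state.json and plot train/eval loss vs. global step.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss records carry a "loss" key; evaluation records carry "eval_loss".
log = state["log_history"]
train_steps = [e["step"] for e in log if "loss" in e]
train_loss = [e["loss"] for e in log if "loss" in e]
eval_steps = [e["step"] for e in log if "eval_loss" in e]
eval_loss = [e["eval_loss"] for e in log if "eval_loss" in e]

plt.plot(train_steps, train_loss, label="train loss")
plt.plot(eval_steps, eval_loss, marker="o", label="eval loss")
plt.xlabel("global step")
plt.ylabel("loss")
plt.yscale("log")  # losses span roughly 3.3 down to ~1e-5, so a log scale reads better
plt.legend()
plt.show()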