Vigogne_Qwen2.5-1.5B / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500.0,
"global_step": 6237,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02405002405002405,
"grad_norm": 0.4139963388442993,
"learning_rate": 0.00019996828714700116,
"loss": 1.5971,
"step": 50
},
{
"epoch": 0.0481000481000481,
"grad_norm": 0.3423018157482147,
"learning_rate": 0.00019987316870210547,
"loss": 1.274,
"step": 100
},
{
"epoch": 0.07215007215007214,
"grad_norm": 0.3551710247993469,
"learning_rate": 0.0001997147049948582,
"loss": 1.2519,
"step": 150
},
{
"epoch": 0.0962000962000962,
"grad_norm": 0.32329073548316956,
"learning_rate": 0.0001994929965319844,
"loss": 1.2382,
"step": 200
},
{
"epoch": 0.12025012025012025,
"grad_norm": 0.48585018515586853,
"learning_rate": 0.0001992081839336419,
"loss": 1.2293,
"step": 250
},
{
"epoch": 0.1443001443001443,
"grad_norm": 0.40136224031448364,
"learning_rate": 0.00019886044784423197,
"loss": 1.2214,
"step": 300
},
{
"epoch": 0.16835016835016836,
"grad_norm": 0.574002206325531,
"learning_rate": 0.00019845000881782432,
"loss": 1.2184,
"step": 350
},
{
"epoch": 0.1924001924001924,
"grad_norm": 0.4179827570915222,
"learning_rate": 0.00019797712717826914,
"loss": 1.2064,
"step": 400
},
{
"epoch": 0.21645021645021645,
"grad_norm": 0.33033809065818787,
"learning_rate": 0.00019744210285408488,
"loss": 1.2055,
"step": 450
},
{
"epoch": 0.2405002405002405,
"grad_norm": 0.2719138562679291,
"learning_rate": 0.0001968452751882264,
"loss": 1.2077,
"step": 500
},
{
"epoch": 0.26455026455026454,
"grad_norm": 0.29797521233558655,
"learning_rate": 0.00019618702272285434,
"loss": 1.2096,
"step": 550
},
{
"epoch": 0.2886002886002886,
"grad_norm": 0.3336372673511505,
"learning_rate": 0.00019546776295924212,
"loss": 1.2072,
"step": 600
},
{
"epoch": 0.3126503126503126,
"grad_norm": 0.26755037903785706,
"learning_rate": 0.0001946879520929728,
"loss": 1.1974,
"step": 650
},
{
"epoch": 0.3367003367003367,
"grad_norm": 0.36268576979637146,
"learning_rate": 0.00019384808472459368,
"loss": 1.2045,
"step": 700
},
{
"epoch": 0.36075036075036077,
"grad_norm": 0.3121575713157654,
"learning_rate": 0.0001929486935459127,
"loss": 1.1889,
"step": 750
},
{
"epoch": 0.3848003848003848,
"grad_norm": 0.3159404993057251,
"learning_rate": 0.00019199034900213452,
"loss": 1.1921,
"step": 800
},
{
"epoch": 0.40885040885040885,
"grad_norm": 0.7236579060554504,
"learning_rate": 0.000190973658930052,
"loss": 1.194,
"step": 850
},
{
"epoch": 0.4329004329004329,
"grad_norm": 0.24907168745994568,
"learning_rate": 0.00018989926817252113,
"loss": 1.191,
"step": 900
},
{
"epoch": 0.45695045695045694,
"grad_norm": 0.24481187760829926,
"learning_rate": 0.00018876785816946505,
"loss": 1.1857,
"step": 950
},
{
"epoch": 0.481000481000481,
"grad_norm": 0.2668200731277466,
"learning_rate": 0.00018758014652566597,
"loss": 1.1957,
"step": 1000
},
{
"epoch": 0.5050505050505051,
"grad_norm": 0.2687171399593353,
"learning_rate": 0.0001863368865556191,
"loss": 1.1864,
"step": 1050
},
{
"epoch": 0.5291005291005291,
"grad_norm": 0.23915782570838928,
"learning_rate": 0.0001850388668057379,
"loss": 1.184,
"step": 1100
},
{
"epoch": 0.5531505531505532,
"grad_norm": 0.37159469723701477,
"learning_rate": 0.0001836869105542127,
"loss": 1.1849,
"step": 1150
},
{
"epoch": 0.5772005772005772,
"grad_norm": 0.2752649784088135,
"learning_rate": 0.0001822818752888408,
"loss": 1.1843,
"step": 1200
},
{
"epoch": 0.6012506012506013,
"grad_norm": 0.19733025133609772,
"learning_rate": 0.00018082465216315882,
"loss": 1.1766,
"step": 1250
},
{
"epoch": 0.6253006253006252,
"grad_norm": 0.2180165797472,
"learning_rate": 0.00017931616543122214,
"loss": 1.1865,
"step": 1300
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.25025510787963867,
"learning_rate": 0.00017775737186139038,
"loss": 1.1723,
"step": 1350
},
{
"epoch": 0.6734006734006734,
"grad_norm": 0.2865007817745209,
"learning_rate": 0.00017614926012949028,
"loss": 1.172,
"step": 1400
},
{
"epoch": 0.6974506974506974,
"grad_norm": 0.3406023681163788,
"learning_rate": 0.00017449285019174098,
"loss": 1.1795,
"step": 1450
},
{
"epoch": 0.7215007215007215,
"grad_norm": 0.19766800105571747,
"learning_rate": 0.00017278919263783978,
"loss": 1.1784,
"step": 1500
},
{
"epoch": 0.7455507455507455,
"grad_norm": 0.1965962052345276,
"learning_rate": 0.00017103936802461797,
"loss": 1.1754,
"step": 1550
},
{
"epoch": 0.7696007696007696,
"grad_norm": 0.2381555736064911,
"learning_rate": 0.00016924448619069023,
"loss": 1.1671,
"step": 1600
},
{
"epoch": 0.7936507936507936,
"grad_norm": 0.20156389474868774,
"learning_rate": 0.00016740568555253155,
"loss": 1.1738,
"step": 1650
},
{
"epoch": 0.8177008177008177,
"grad_norm": 0.18294361233711243,
"learning_rate": 0.00016552413238242857,
"loss": 1.1727,
"step": 1700
},
{
"epoch": 0.8417508417508418,
"grad_norm": 0.2975623309612274,
"learning_rate": 0.00016360102006876317,
"loss": 1.1677,
"step": 1750
},
{
"epoch": 0.8658008658008658,
"grad_norm": 0.1871371865272522,
"learning_rate": 0.0001616375683590974,
"loss": 1.1689,
"step": 1800
},
{
"epoch": 0.8898508898508899,
"grad_norm": 0.21457934379577637,
"learning_rate": 0.00015963502258654005,
"loss": 1.1605,
"step": 1850
},
{
"epoch": 0.9139009139009139,
"grad_norm": 0.20261706411838531,
"learning_rate": 0.0001575946528798853,
"loss": 1.1627,
"step": 1900
},
{
"epoch": 0.937950937950938,
"grad_norm": 0.17685186862945557,
"learning_rate": 0.0001555177533580245,
"loss": 1.1627,
"step": 1950
},
{
"epoch": 0.962000962000962,
"grad_norm": 0.212468221783638,
"learning_rate": 0.00015340564130914233,
"loss": 1.161,
"step": 2000
},
{
"epoch": 0.9860509860509861,
"grad_norm": 0.175174742937088,
"learning_rate": 0.00015125965635521724,
"loss": 1.1688,
"step": 2050
},
{
"epoch": 1.0101010101010102,
"grad_norm": 0.19970253109931946,
"learning_rate": 0.00014908115960235682,
"loss": 1.142,
"step": 2100
},
{
"epoch": 1.034151034151034,
"grad_norm": 0.21254608035087585,
"learning_rate": 0.00014687153277750676,
"loss": 1.1271,
"step": 2150
},
{
"epoch": 1.0582010582010581,
"grad_norm": 0.1651500016450882,
"learning_rate": 0.00014463217735208062,
"loss": 1.121,
"step": 2200
},
{
"epoch": 1.0822510822510822,
"grad_norm": 0.2405405044555664,
"learning_rate": 0.00014236451365306674,
"loss": 1.1313,
"step": 2250
},
{
"epoch": 1.1063011063011063,
"grad_norm": 0.17223596572875977,
"learning_rate": 0.00014006997996217593,
"loss": 1.1344,
"step": 2300
},
{
"epoch": 1.1303511303511304,
"grad_norm": 0.1969347894191742,
"learning_rate": 0.00013775003160360096,
"loss": 1.1176,
"step": 2350
},
{
"epoch": 1.1544011544011543,
"grad_norm": 0.187143936753273,
"learning_rate": 0.00013540614002096701,
"loss": 1.1322,
"step": 2400
},
{
"epoch": 1.1784511784511784,
"grad_norm": 0.1838238537311554,
"learning_rate": 0.00013303979184405826,
"loss": 1.1293,
"step": 2450
},
{
"epoch": 1.2025012025012025,
"grad_norm": 0.17928341031074524,
"learning_rate": 0.00013065248794591223,
"loss": 1.1268,
"step": 2500
},
{
"epoch": 1.2265512265512266,
"grad_norm": 0.2683047950267792,
"learning_rate": 0.00012824574249088063,
"loss": 1.1234,
"step": 2550
},
{
"epoch": 1.2506012506012505,
"grad_norm": 0.18034860491752625,
"learning_rate": 0.0001258210819742599,
"loss": 1.125,
"step": 2600
},
{
"epoch": 1.2746512746512746,
"grad_norm": 0.26357391476631165,
"learning_rate": 0.00012338004425410074,
"loss": 1.1217,
"step": 2650
},
{
"epoch": 1.2987012987012987,
"grad_norm": 0.17828579246997833,
"learning_rate": 0.00012092417757581085,
"loss": 1.1262,
"step": 2700
},
{
"epoch": 1.3227513227513228,
"grad_norm": 0.20247310400009155,
"learning_rate": 0.00011845503959016928,
"loss": 1.1246,
"step": 2750
},
{
"epoch": 1.3468013468013469,
"grad_norm": 0.17381271719932556,
"learning_rate": 0.0001159741963653755,
"loss": 1.1181,
"step": 2800
},
{
"epoch": 1.370851370851371,
"grad_norm": 0.19958114624023438,
"learning_rate": 0.00011348322139375948,
"loss": 1.1307,
"step": 2850
},
{
"epoch": 1.3949013949013949,
"grad_norm": 0.21912401914596558,
"learning_rate": 0.00011098369459378328,
"loss": 1.1264,
"step": 2900
},
{
"epoch": 1.418951418951419,
"grad_norm": 0.1694297194480896,
"learning_rate": 0.00010847720130796631,
"loss": 1.1256,
"step": 2950
},
{
"epoch": 1.443001443001443,
"grad_norm": 0.13446395099163055,
"learning_rate": 0.00010596533129737092,
"loss": 1.1258,
"step": 3000
},
{
"epoch": 1.467051467051467,
"grad_norm": 0.140371173620224,
"learning_rate": 0.00010344967773328507,
"loss": 1.1191,
"step": 3050
},
{
"epoch": 1.491101491101491,
"grad_norm": 0.18016813695430756,
"learning_rate": 0.00010093183618674224,
"loss": 1.114,
"step": 3100
},
{
"epoch": 1.5151515151515151,
"grad_norm": 0.17306862771511078,
"learning_rate": 9.84134036165192e-05,
"loss": 1.1149,
"step": 3150
},
{
"epoch": 1.5392015392015392,
"grad_norm": 0.14116255939006805,
"learning_rate": 9.589597735625377e-05,
"loss": 1.123,
"step": 3200
},
{
"epoch": 1.5632515632515633,
"grad_norm": 0.16819800436496735,
"learning_rate": 9.338115410132441e-05,
"loss": 1.1203,
"step": 3250
},
{
"epoch": 1.5873015873015874,
"grad_norm": 0.21958529949188232,
"learning_rate": 9.087052889613518e-05,
"loss": 1.1226,
"step": 3300
},
{
"epoch": 1.6113516113516113,
"grad_norm": 0.15786272287368774,
"learning_rate": 8.836569412244745e-05,
"loss": 1.1212,
"step": 3350
},
{
"epoch": 1.6354016354016354,
"grad_norm": 0.17366796731948853,
"learning_rate": 8.586823848940047e-05,
"loss": 1.1129,
"step": 3400
},
{
"epoch": 1.6594516594516593,
"grad_norm": 0.21448016166687012,
"learning_rate": 8.337974602586152e-05,
"loss": 1.1216,
"step": 3450
},
{
"epoch": 1.6835016835016834,
"grad_norm": 0.17243099212646484,
"learning_rate": 8.090179507574427e-05,
"loss": 1.1096,
"step": 3500
},
{
"epoch": 1.7075517075517075,
"grad_norm": 0.1429734081029892,
"learning_rate": 7.843595729693316e-05,
"loss": 1.1071,
"step": 3550
},
{
"epoch": 1.7316017316017316,
"grad_norm": 0.15200386941432953,
"learning_rate": 7.598379666444808e-05,
"loss": 1.1158,
"step": 3600
},
{
"epoch": 1.7556517556517557,
"grad_norm": 0.1442406326532364,
"learning_rate": 7.354686847848242e-05,
"loss": 1.112,
"step": 3650
},
{
"epoch": 1.7797017797017798,
"grad_norm": 0.17678239941596985,
"learning_rate": 7.11267183779428e-05,
"loss": 1.1118,
"step": 3700
},
{
"epoch": 1.8037518037518039,
"grad_norm": 0.147593155503273,
"learning_rate": 6.872488136011667e-05,
"loss": 1.1165,
"step": 3750
},
{
"epoch": 1.8278018278018278,
"grad_norm": 0.1334652155637741,
"learning_rate": 6.634288080708952e-05,
"loss": 1.1135,
"step": 3800
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.14890378713607788,
"learning_rate": 6.398222751952899e-05,
"loss": 1.1086,
"step": 3850
},
{
"epoch": 1.8759018759018757,
"grad_norm": 0.1334807574748993,
"learning_rate": 6.164441875844882e-05,
"loss": 1.1144,
"step": 3900
},
{
"epoch": 1.8999518999518998,
"grad_norm": 0.12897680699825287,
"learning_rate": 5.933093729556062e-05,
"loss": 1.1116,
"step": 3950
},
{
"epoch": 1.924001924001924,
"grad_norm": 0.17530564963817596,
"learning_rate": 5.7043250472815356e-05,
"loss": 1.1039,
"step": 4000
},
{
"epoch": 1.948051948051948,
"grad_norm": 0.15966495871543884,
"learning_rate": 5.478280927173145e-05,
"loss": 1.101,
"step": 4050
},
{
"epoch": 1.9721019721019721,
"grad_norm": 0.18890446424484253,
"learning_rate": 5.255104739309924e-05,
"loss": 1.1077,
"step": 4100
},
{
"epoch": 1.9961519961519962,
"grad_norm": 0.1547369807958603,
"learning_rate": 5.0349380347646494e-05,
"loss": 1.103,
"step": 4150
},
{
"epoch": 2.0202020202020203,
"grad_norm": 0.13888758420944214,
"learning_rate": 4.8179204558240444e-05,
"loss": 1.0826,
"step": 4200
},
{
"epoch": 2.0442520442520444,
"grad_norm": 0.11266086250543594,
"learning_rate": 4.6041896474197e-05,
"loss": 1.071,
"step": 4250
},
{
"epoch": 2.068302068302068,
"grad_norm": 0.14245671033859253,
"learning_rate": 4.393881169825779e-05,
"loss": 1.0759,
"step": 4300
},
{
"epoch": 2.092352092352092,
"grad_norm": 0.1226249411702156,
"learning_rate": 4.187128412678969e-05,
"loss": 1.0742,
"step": 4350
},
{
"epoch": 2.1164021164021163,
"grad_norm": 0.12307476997375488,
"learning_rate": 3.984062510375155e-05,
"loss": 1.0721,
"step": 4400
},
{
"epoch": 2.1404521404521404,
"grad_norm": 0.12813834846019745,
"learning_rate": 3.7848122588965144e-05,
"loss": 1.0726,
"step": 4450
},
{
"epoch": 2.1645021645021645,
"grad_norm": 0.13432885706424713,
"learning_rate": 3.5895040341217543e-05,
"loss": 1.0745,
"step": 4500
},
{
"epoch": 2.1885521885521886,
"grad_norm": 0.11649097502231598,
"learning_rate": 3.398261711671309e-05,
"loss": 1.079,
"step": 4550
},
{
"epoch": 2.2126022126022127,
"grad_norm": 0.11140163242816925,
"learning_rate": 3.211206588338358e-05,
"loss": 1.0748,
"step": 4600
},
{
"epoch": 2.236652236652237,
"grad_norm": 0.10978424549102783,
"learning_rate": 3.028457305155483e-05,
"loss": 1.0726,
"step": 4650
},
{
"epoch": 2.260702260702261,
"grad_norm": 0.11395589262247086,
"learning_rate": 2.8501297721457422e-05,
"loss": 1.0656,
"step": 4700
},
{
"epoch": 2.284752284752285,
"grad_norm": 0.10599405318498611,
"learning_rate": 2.6763370948059353e-05,
"loss": 1.0765,
"step": 4750
},
{
"epoch": 2.3088023088023086,
"grad_norm": 0.11157254874706268,
"learning_rate": 2.5071895023686442e-05,
"loss": 1.0726,
"step": 4800
},
{
"epoch": 2.3328523328523327,
"grad_norm": 0.1390163153409958,
"learning_rate": 2.342794277888547e-05,
"loss": 1.0731,
"step": 4850
},
{
"epoch": 2.356902356902357,
"grad_norm": 0.1519329994916916,
"learning_rate": 2.1832556901973965e-05,
"loss": 1.0704,
"step": 4900
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.1278182566165924,
"learning_rate": 2.0286749277707782e-05,
"loss": 1.0661,
"step": 4950
},
{
"epoch": 2.405002405002405,
"grad_norm": 0.10508263111114502,
"learning_rate": 1.879150034548588e-05,
"loss": 1.0758,
"step": 5000
},
{
"epoch": 2.429052429052429,
"grad_norm": 0.09690719097852707,
"learning_rate": 1.7347758477500044e-05,
"loss": 1.0644,
"step": 5050
},
{
"epoch": 2.4531024531024532,
"grad_norm": 0.10174595564603806,
"learning_rate": 1.5956439377222798e-05,
"loss": 1.0726,
"step": 5100
},
{
"epoch": 2.4771524771524773,
"grad_norm": 0.10294167697429657,
"learning_rate": 1.4618425498616162e-05,
"loss": 1.0655,
"step": 5150
},
{
"epoch": 2.501202501202501,
"grad_norm": 0.11103129386901855,
"learning_rate": 1.3334565486428996e-05,
"loss": 1.0651,
"step": 5200
},
{
"epoch": 2.525252525252525,
"grad_norm": 0.10614852607250214,
"learning_rate": 1.2105673637938053e-05,
"loss": 1.0701,
"step": 5250
},
{
"epoch": 2.549302549302549,
"grad_norm": 0.09437720477581024,
"learning_rate": 1.0932529386474188e-05,
"loss": 1.0673,
"step": 5300
},
{
"epoch": 2.5733525733525733,
"grad_norm": 0.0965106412768364,
"learning_rate": 9.815876807061264e-06,
"loss": 1.0769,
"step": 5350
},
{
"epoch": 2.5974025974025974,
"grad_norm": 0.09335634112358093,
"learning_rate": 8.756424144481312e-06,
"loss": 1.0646,
"step": 5400
},
{
"epoch": 2.6214526214526215,
"grad_norm": 0.09890544414520264,
"learning_rate": 7.75484336406529e-06,
"loss": 1.0757,
"step": 5450
},
{
"epoch": 2.6455026455026456,
"grad_norm": 0.09670912474393845,
"learning_rate": 6.8117697254943106e-06,
"loss": 1.0668,
"step": 5500
},
{
"epoch": 2.6695526695526697,
"grad_norm": 0.09898468106985092,
"learning_rate": 5.927801379881714e-06,
"loss": 1.0745,
"step": 5550
},
{
"epoch": 2.6936026936026938,
"grad_norm": 0.08697386831045151,
"learning_rate": 5.103498990391509e-06,
"loss": 1.0653,
"step": 5600
},
{
"epoch": 2.717652717652718,
"grad_norm": 0.09457134455442429,
"learning_rate": 4.339385376633775e-06,
"loss": 1.0678,
"step": 5650
},
{
"epoch": 2.741702741702742,
"grad_norm": 0.09092475473880768,
"learning_rate": 3.6359451830626723e-06,
"loss": 1.0635,
"step": 5700
},
{
"epoch": 2.7657527657527656,
"grad_norm": 0.08736653625965118,
"learning_rate": 2.993624571587239e-06,
"loss": 1.0639,
"step": 5750
},
{
"epoch": 2.7898027898027897,
"grad_norm": 0.09138292819261551,
"learning_rate": 2.4128309385900717e-06,
"loss": 1.065,
"step": 5800
},
{
"epoch": 2.813852813852814,
"grad_norm": 0.08842656016349792,
"learning_rate": 1.8939326565333037e-06,
"loss": 1.0636,
"step": 5850
},
{
"epoch": 2.837902837902838,
"grad_norm": 0.08870802819728851,
"learning_rate": 1.437258840315714e-06,
"loss": 1.0706,
"step": 5900
},
{
"epoch": 2.861952861952862,
"grad_norm": 0.08659425377845764,
"learning_rate": 1.0430991385293575e-06,
"loss": 1.0673,
"step": 5950
},
{
"epoch": 2.886002886002886,
"grad_norm": 0.08142086863517761,
"learning_rate": 7.117035497478553e-07,
"loss": 1.0697,
"step": 6000
},
{
"epoch": 2.91005291005291,
"grad_norm": 0.080448217689991,
"learning_rate": 4.432822639630407e-07,
"loss": 1.0655,
"step": 6050
},
{
"epoch": 2.934102934102934,
"grad_norm": 0.08980288356542587,
"learning_rate": 2.380055292704575e-07,
"loss": 1.0701,
"step": 6100
},
{
"epoch": 2.958152958152958,
"grad_norm": 0.08309097588062286,
"learning_rate": 9.600354388833443e-08,
"loss": 1.0684,
"step": 6150
},
{
"epoch": 2.982202982202982,
"grad_norm": 0.08456841111183167,
"learning_rate": 1.7366373578442397e-08,
"loss": 1.0684,
"step": 6200
}
],
"logging_steps": 50,
"max_steps": 6237,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.056700790948663e+20,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
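
A minimal sketch of how this state file can be consumed, assuming it has been saved locally as "trainer_state.json" and that matplotlib is available (both are assumptions, not part of the file itself): it reads the JSON, pulls the per-step training loss out of "log_history", and plots it against "step".

import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each entry in log_history holds the metrics logged every `logging_steps` (50) steps.
entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title(f"{state['num_train_epochs']} epochs, {state['global_step']} steps")
plt.show()

The same pattern extracts "learning_rate" or "grad_norm" from the log entries if those curves are of interest instead.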