|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500.0,
  "global_step": 6237,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.02405002405002405, "grad_norm": 0.4139963388442993, "learning_rate": 0.00019996828714700116, "loss": 1.5971, "step": 50 },
    { "epoch": 0.0481000481000481, "grad_norm": 0.3423018157482147, "learning_rate": 0.00019987316870210547, "loss": 1.274, "step": 100 },
    { "epoch": 0.07215007215007214, "grad_norm": 0.3551710247993469, "learning_rate": 0.0001997147049948582, "loss": 1.2519, "step": 150 },
    { "epoch": 0.0962000962000962, "grad_norm": 0.32329073548316956, "learning_rate": 0.0001994929965319844, "loss": 1.2382, "step": 200 },
    { "epoch": 0.12025012025012025, "grad_norm": 0.48585018515586853, "learning_rate": 0.0001992081839336419, "loss": 1.2293, "step": 250 },
    { "epoch": 0.1443001443001443, "grad_norm": 0.40136224031448364, "learning_rate": 0.00019886044784423197, "loss": 1.2214, "step": 300 },
    { "epoch": 0.16835016835016836, "grad_norm": 0.574002206325531, "learning_rate": 0.00019845000881782432, "loss": 1.2184, "step": 350 },
    { "epoch": 0.1924001924001924, "grad_norm": 0.4179827570915222, "learning_rate": 0.00019797712717826914, "loss": 1.2064, "step": 400 },
    { "epoch": 0.21645021645021645, "grad_norm": 0.33033809065818787, "learning_rate": 0.00019744210285408488, "loss": 1.2055, "step": 450 },
    { "epoch": 0.2405002405002405, "grad_norm": 0.2719138562679291, "learning_rate": 0.0001968452751882264, "loss": 1.2077, "step": 500 },
    { "epoch": 0.26455026455026454, "grad_norm": 0.29797521233558655, "learning_rate": 0.00019618702272285434, "loss": 1.2096, "step": 550 },
    { "epoch": 0.2886002886002886, "grad_norm": 0.3336372673511505, "learning_rate": 0.00019546776295924212, "loss": 1.2072, "step": 600 },
    { "epoch": 0.3126503126503126, "grad_norm": 0.26755037903785706, "learning_rate": 0.0001946879520929728, "loss": 1.1974, "step": 650 },
    { "epoch": 0.3367003367003367, "grad_norm": 0.36268576979637146, "learning_rate": 0.00019384808472459368, "loss": 1.2045, "step": 700 },
    { "epoch": 0.36075036075036077, "grad_norm": 0.3121575713157654, "learning_rate": 0.0001929486935459127, "loss": 1.1889, "step": 750 },
    { "epoch": 0.3848003848003848, "grad_norm": 0.3159404993057251, "learning_rate": 0.00019199034900213452, "loss": 1.1921, "step": 800 },
    { "epoch": 0.40885040885040885, "grad_norm": 0.7236579060554504, "learning_rate": 0.000190973658930052, "loss": 1.194, "step": 850 },
    { "epoch": 0.4329004329004329, "grad_norm": 0.24907168745994568, "learning_rate": 0.00018989926817252113, "loss": 1.191, "step": 900 },
    { "epoch": 0.45695045695045694, "grad_norm": 0.24481187760829926, "learning_rate": 0.00018876785816946505, "loss": 1.1857, "step": 950 },
    { "epoch": 0.481000481000481, "grad_norm": 0.2668200731277466, "learning_rate": 0.00018758014652566597, "loss": 1.1957, "step": 1000 },
    { "epoch": 0.5050505050505051, "grad_norm": 0.2687171399593353, "learning_rate": 0.0001863368865556191, "loss": 1.1864, "step": 1050 },
    { "epoch": 0.5291005291005291, "grad_norm": 0.23915782570838928, "learning_rate": 0.0001850388668057379, "loss": 1.184, "step": 1100 },
    { "epoch": 0.5531505531505532, "grad_norm": 0.37159469723701477, "learning_rate": 0.0001836869105542127, "loss": 1.1849, "step": 1150 },
    { "epoch": 0.5772005772005772, "grad_norm": 0.2752649784088135, "learning_rate": 0.0001822818752888408, "loss": 1.1843, "step": 1200 },
    { "epoch": 0.6012506012506013, "grad_norm": 0.19733025133609772, "learning_rate": 0.00018082465216315882, "loss": 1.1766, "step": 1250 },
    { "epoch": 0.6253006253006252, "grad_norm": 0.2180165797472, "learning_rate": 0.00017931616543122214, "loss": 1.1865, "step": 1300 },
    { "epoch": 0.6493506493506493, "grad_norm": 0.25025510787963867, "learning_rate": 0.00017775737186139038, "loss": 1.1723, "step": 1350 },
    { "epoch": 0.6734006734006734, "grad_norm": 0.2865007817745209, "learning_rate": 0.00017614926012949028, "loss": 1.172, "step": 1400 },
    { "epoch": 0.6974506974506974, "grad_norm": 0.3406023681163788, "learning_rate": 0.00017449285019174098, "loss": 1.1795, "step": 1450 },
    { "epoch": 0.7215007215007215, "grad_norm": 0.19766800105571747, "learning_rate": 0.00017278919263783978, "loss": 1.1784, "step": 1500 },
    { "epoch": 0.7455507455507455, "grad_norm": 0.1965962052345276, "learning_rate": 0.00017103936802461797, "loss": 1.1754, "step": 1550 },
    { "epoch": 0.7696007696007696, "grad_norm": 0.2381555736064911, "learning_rate": 0.00016924448619069023, "loss": 1.1671, "step": 1600 },
    { "epoch": 0.7936507936507936, "grad_norm": 0.20156389474868774, "learning_rate": 0.00016740568555253155, "loss": 1.1738, "step": 1650 },
    { "epoch": 0.8177008177008177, "grad_norm": 0.18294361233711243, "learning_rate": 0.00016552413238242857, "loss": 1.1727, "step": 1700 },
    { "epoch": 0.8417508417508418, "grad_norm": 0.2975623309612274, "learning_rate": 0.00016360102006876317, "loss": 1.1677, "step": 1750 },
    { "epoch": 0.8658008658008658, "grad_norm": 0.1871371865272522, "learning_rate": 0.0001616375683590974, "loss": 1.1689, "step": 1800 },
    { "epoch": 0.8898508898508899, "grad_norm": 0.21457934379577637, "learning_rate": 0.00015963502258654005, "loss": 1.1605, "step": 1850 },
    { "epoch": 0.9139009139009139, "grad_norm": 0.20261706411838531, "learning_rate": 0.0001575946528798853, "loss": 1.1627, "step": 1900 },
    { "epoch": 0.937950937950938, "grad_norm": 0.17685186862945557, "learning_rate": 0.0001555177533580245, "loss": 1.1627, "step": 1950 },
    { "epoch": 0.962000962000962, "grad_norm": 0.212468221783638, "learning_rate": 0.00015340564130914233, "loss": 1.161, "step": 2000 },
    { "epoch": 0.9860509860509861, "grad_norm": 0.175174742937088, "learning_rate": 0.00015125965635521724, "loss": 1.1688, "step": 2050 },
    { "epoch": 1.0101010101010102, "grad_norm": 0.19970253109931946, "learning_rate": 0.00014908115960235682, "loss": 1.142, "step": 2100 },
    { "epoch": 1.034151034151034, "grad_norm": 0.21254608035087585, "learning_rate": 0.00014687153277750676, "loss": 1.1271, "step": 2150 },
    { "epoch": 1.0582010582010581, "grad_norm": 0.1651500016450882, "learning_rate": 0.00014463217735208062, "loss": 1.121, "step": 2200 },
    { "epoch": 1.0822510822510822, "grad_norm": 0.2405405044555664, "learning_rate": 0.00014236451365306674, "loss": 1.1313, "step": 2250 },
    { "epoch": 1.1063011063011063, "grad_norm": 0.17223596572875977, "learning_rate": 0.00014006997996217593, "loss": 1.1344, "step": 2300 },
    { "epoch": 1.1303511303511304, "grad_norm": 0.1969347894191742, "learning_rate": 0.00013775003160360096, "loss": 1.1176, "step": 2350 },
    { "epoch": 1.1544011544011543, "grad_norm": 0.187143936753273, "learning_rate": 0.00013540614002096701, "loss": 1.1322, "step": 2400 },
    { "epoch": 1.1784511784511784, "grad_norm": 0.1838238537311554, "learning_rate": 0.00013303979184405826, "loss": 1.1293, "step": 2450 },
    { "epoch": 1.2025012025012025, "grad_norm": 0.17928341031074524, "learning_rate": 0.00013065248794591223, "loss": 1.1268, "step": 2500 },
    { "epoch": 1.2265512265512266, "grad_norm": 0.2683047950267792, "learning_rate": 0.00012824574249088063, "loss": 1.1234, "step": 2550 },
    { "epoch": 1.2506012506012505, "grad_norm": 0.18034860491752625, "learning_rate": 0.0001258210819742599, "loss": 1.125, "step": 2600 },
    { "epoch": 1.2746512746512746, "grad_norm": 0.26357391476631165, "learning_rate": 0.00012338004425410074, "loss": 1.1217, "step": 2650 },
    { "epoch": 1.2987012987012987, "grad_norm": 0.17828579246997833, "learning_rate": 0.00012092417757581085, "loss": 1.1262, "step": 2700 },
    { "epoch": 1.3227513227513228, "grad_norm": 0.20247310400009155, "learning_rate": 0.00011845503959016928, "loss": 1.1246, "step": 2750 },
    { "epoch": 1.3468013468013469, "grad_norm": 0.17381271719932556, "learning_rate": 0.0001159741963653755, "loss": 1.1181, "step": 2800 },
    { "epoch": 1.370851370851371, "grad_norm": 0.19958114624023438, "learning_rate": 0.00011348322139375948, "loss": 1.1307, "step": 2850 },
    { "epoch": 1.3949013949013949, "grad_norm": 0.21912401914596558, "learning_rate": 0.00011098369459378328, "loss": 1.1264, "step": 2900 },
    { "epoch": 1.418951418951419, "grad_norm": 0.1694297194480896, "learning_rate": 0.00010847720130796631, "loss": 1.1256, "step": 2950 },
    { "epoch": 1.443001443001443, "grad_norm": 0.13446395099163055, "learning_rate": 0.00010596533129737092, "loss": 1.1258, "step": 3000 },
    { "epoch": 1.467051467051467, "grad_norm": 0.140371173620224, "learning_rate": 0.00010344967773328507, "loss": 1.1191, "step": 3050 },
    { "epoch": 1.491101491101491, "grad_norm": 0.18016813695430756, "learning_rate": 0.00010093183618674224, "loss": 1.114, "step": 3100 },
    { "epoch": 1.5151515151515151, "grad_norm": 0.17306862771511078, "learning_rate": 9.84134036165192e-05, "loss": 1.1149, "step": 3150 },
    { "epoch": 1.5392015392015392, "grad_norm": 0.14116255939006805, "learning_rate": 9.589597735625377e-05, "loss": 1.123, "step": 3200 },
    { "epoch": 1.5632515632515633, "grad_norm": 0.16819800436496735, "learning_rate": 9.338115410132441e-05, "loss": 1.1203, "step": 3250 },
    { "epoch": 1.5873015873015874, "grad_norm": 0.21958529949188232, "learning_rate": 9.087052889613518e-05, "loss": 1.1226, "step": 3300 },
    { "epoch": 1.6113516113516113, "grad_norm": 0.15786272287368774, "learning_rate": 8.836569412244745e-05, "loss": 1.1212, "step": 3350 },
    { "epoch": 1.6354016354016354, "grad_norm": 0.17366796731948853, "learning_rate": 8.586823848940047e-05, "loss": 1.1129, "step": 3400 },
    { "epoch": 1.6594516594516593, "grad_norm": 0.21448016166687012, "learning_rate": 8.337974602586152e-05, "loss": 1.1216, "step": 3450 },
    { "epoch": 1.6835016835016834, "grad_norm": 0.17243099212646484, "learning_rate": 8.090179507574427e-05, "loss": 1.1096, "step": 3500 },
    { "epoch": 1.7075517075517075, "grad_norm": 0.1429734081029892, "learning_rate": 7.843595729693316e-05, "loss": 1.1071, "step": 3550 },
    { "epoch": 1.7316017316017316, "grad_norm": 0.15200386941432953, "learning_rate": 7.598379666444808e-05, "loss": 1.1158, "step": 3600 },
    { "epoch": 1.7556517556517557, "grad_norm": 0.1442406326532364, "learning_rate": 7.354686847848242e-05, "loss": 1.112, "step": 3650 },
    { "epoch": 1.7797017797017798, "grad_norm": 0.17678239941596985, "learning_rate": 7.11267183779428e-05, "loss": 1.1118, "step": 3700 },
    { "epoch": 1.8037518037518039, "grad_norm": 0.147593155503273, "learning_rate": 6.872488136011667e-05, "loss": 1.1165, "step": 3750 },
    { "epoch": 1.8278018278018278, "grad_norm": 0.1334652155637741, "learning_rate": 6.634288080708952e-05, "loss": 1.1135, "step": 3800 },
    { "epoch": 1.8518518518518519, "grad_norm": 0.14890378713607788, "learning_rate": 6.398222751952899e-05, "loss": 1.1086, "step": 3850 },
    { "epoch": 1.8759018759018757, "grad_norm": 0.1334807574748993, "learning_rate": 6.164441875844882e-05, "loss": 1.1144, "step": 3900 },
    { "epoch": 1.8999518999518998, "grad_norm": 0.12897680699825287, "learning_rate": 5.933093729556062e-05, "loss": 1.1116, "step": 3950 },
    { "epoch": 1.924001924001924, "grad_norm": 0.17530564963817596, "learning_rate": 5.7043250472815356e-05, "loss": 1.1039, "step": 4000 },
    { "epoch": 1.948051948051948, "grad_norm": 0.15966495871543884, "learning_rate": 5.478280927173145e-05, "loss": 1.101, "step": 4050 },
    { "epoch": 1.9721019721019721, "grad_norm": 0.18890446424484253, "learning_rate": 5.255104739309924e-05, "loss": 1.1077, "step": 4100 },
    { "epoch": 1.9961519961519962, "grad_norm": 0.1547369807958603, "learning_rate": 5.0349380347646494e-05, "loss": 1.103, "step": 4150 },
    { "epoch": 2.0202020202020203, "grad_norm": 0.13888758420944214, "learning_rate": 4.8179204558240444e-05, "loss": 1.0826, "step": 4200 },
    { "epoch": 2.0442520442520444, "grad_norm": 0.11266086250543594, "learning_rate": 4.6041896474197e-05, "loss": 1.071, "step": 4250 },
    { "epoch": 2.068302068302068, "grad_norm": 0.14245671033859253, "learning_rate": 4.393881169825779e-05, "loss": 1.0759, "step": 4300 },
    { "epoch": 2.092352092352092, "grad_norm": 0.1226249411702156, "learning_rate": 4.187128412678969e-05, "loss": 1.0742, "step": 4350 },
    { "epoch": 2.1164021164021163, "grad_norm": 0.12307476997375488, "learning_rate": 3.984062510375155e-05, "loss": 1.0721, "step": 4400 },
    { "epoch": 2.1404521404521404, "grad_norm": 0.12813834846019745, "learning_rate": 3.7848122588965144e-05, "loss": 1.0726, "step": 4450 },
    { "epoch": 2.1645021645021645, "grad_norm": 0.13432885706424713, "learning_rate": 3.5895040341217543e-05, "loss": 1.0745, "step": 4500 },
    { "epoch": 2.1885521885521886, "grad_norm": 0.11649097502231598, "learning_rate": 3.398261711671309e-05, "loss": 1.079, "step": 4550 },
    { "epoch": 2.2126022126022127, "grad_norm": 0.11140163242816925, "learning_rate": 3.211206588338358e-05, "loss": 1.0748, "step": 4600 },
    { "epoch": 2.236652236652237, "grad_norm": 0.10978424549102783, "learning_rate": 3.028457305155483e-05, "loss": 1.0726, "step": 4650 },
    { "epoch": 2.260702260702261, "grad_norm": 0.11395589262247086, "learning_rate": 2.8501297721457422e-05, "loss": 1.0656, "step": 4700 },
    { "epoch": 2.284752284752285, "grad_norm": 0.10599405318498611, "learning_rate": 2.6763370948059353e-05, "loss": 1.0765, "step": 4750 },
    { "epoch": 2.3088023088023086, "grad_norm": 0.11157254874706268, "learning_rate": 2.5071895023686442e-05, "loss": 1.0726, "step": 4800 },
    { "epoch": 2.3328523328523327, "grad_norm": 0.1390163153409958, "learning_rate": 2.342794277888547e-05, "loss": 1.0731, "step": 4850 },
    { "epoch": 2.356902356902357, "grad_norm": 0.1519329994916916, "learning_rate": 2.1832556901973965e-05, "loss": 1.0704, "step": 4900 },
    { "epoch": 2.380952380952381, "grad_norm": 0.1278182566165924, "learning_rate": 2.0286749277707782e-05, "loss": 1.0661, "step": 4950 },
    { "epoch": 2.405002405002405, "grad_norm": 0.10508263111114502, "learning_rate": 1.879150034548588e-05, "loss": 1.0758, "step": 5000 },
    { "epoch": 2.429052429052429, "grad_norm": 0.09690719097852707, "learning_rate": 1.7347758477500044e-05, "loss": 1.0644, "step": 5050 },
    { "epoch": 2.4531024531024532, "grad_norm": 0.10174595564603806, "learning_rate": 1.5956439377222798e-05, "loss": 1.0726, "step": 5100 },
    { "epoch": 2.4771524771524773, "grad_norm": 0.10294167697429657, "learning_rate": 1.4618425498616162e-05, "loss": 1.0655, "step": 5150 },
    { "epoch": 2.501202501202501, "grad_norm": 0.11103129386901855, "learning_rate": 1.3334565486428996e-05, "loss": 1.0651, "step": 5200 },
    { "epoch": 2.525252525252525, "grad_norm": 0.10614852607250214, "learning_rate": 1.2105673637938053e-05, "loss": 1.0701, "step": 5250 },
    { "epoch": 2.549302549302549, "grad_norm": 0.09437720477581024, "learning_rate": 1.0932529386474188e-05, "loss": 1.0673, "step": 5300 },
    { "epoch": 2.5733525733525733, "grad_norm": 0.0965106412768364, "learning_rate": 9.815876807061264e-06, "loss": 1.0769, "step": 5350 },
    { "epoch": 2.5974025974025974, "grad_norm": 0.09335634112358093, "learning_rate": 8.756424144481312e-06, "loss": 1.0646, "step": 5400 },
    { "epoch": 2.6214526214526215, "grad_norm": 0.09890544414520264, "learning_rate": 7.75484336406529e-06, "loss": 1.0757, "step": 5450 },
    { "epoch": 2.6455026455026456, "grad_norm": 0.09670912474393845, "learning_rate": 6.8117697254943106e-06, "loss": 1.0668, "step": 5500 },
    { "epoch": 2.6695526695526697, "grad_norm": 0.09898468106985092, "learning_rate": 5.927801379881714e-06, "loss": 1.0745, "step": 5550 },
    { "epoch": 2.6936026936026938, "grad_norm": 0.08697386831045151, "learning_rate": 5.103498990391509e-06, "loss": 1.0653, "step": 5600 },
    { "epoch": 2.717652717652718, "grad_norm": 0.09457134455442429, "learning_rate": 4.339385376633775e-06, "loss": 1.0678, "step": 5650 },
    { "epoch": 2.741702741702742, "grad_norm": 0.09092475473880768, "learning_rate": 3.6359451830626723e-06, "loss": 1.0635, "step": 5700 },
    { "epoch": 2.7657527657527656, "grad_norm": 0.08736653625965118, "learning_rate": 2.993624571587239e-06, "loss": 1.0639, "step": 5750 },
    { "epoch": 2.7898027898027897, "grad_norm": 0.09138292819261551, "learning_rate": 2.4128309385900717e-06, "loss": 1.065, "step": 5800 },
    { "epoch": 2.813852813852814, "grad_norm": 0.08842656016349792, "learning_rate": 1.8939326565333037e-06, "loss": 1.0636, "step": 5850 },
    { "epoch": 2.837902837902838, "grad_norm": 0.08870802819728851, "learning_rate": 1.437258840315714e-06, "loss": 1.0706, "step": 5900 },
    { "epoch": 2.861952861952862, "grad_norm": 0.08659425377845764, "learning_rate": 1.0430991385293575e-06, "loss": 1.0673, "step": 5950 },
    { "epoch": 2.886002886002886, "grad_norm": 0.08142086863517761, "learning_rate": 7.117035497478553e-07, "loss": 1.0697, "step": 6000 },
    { "epoch": 2.91005291005291, "grad_norm": 0.080448217689991, "learning_rate": 4.432822639630407e-07, "loss": 1.0655, "step": 6050 },
    { "epoch": 2.934102934102934, "grad_norm": 0.08980288356542587, "learning_rate": 2.380055292704575e-07, "loss": 1.0701, "step": 6100 },
    { "epoch": 2.958152958152958, "grad_norm": 0.08309097588062286, "learning_rate": 9.600354388833443e-08, "loss": 1.0684, "step": 6150 },
    { "epoch": 2.982202982202982, "grad_norm": 0.08456841111183167, "learning_rate": 1.7366373578442397e-08, "loss": 1.0684, "step": 6200 }
  ],
  "logging_steps": 50,
  "max_steps": 6237,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.056700790948663e+20,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}