{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.99949083503055,
  "eval_steps": 100,
  "global_step": 5891,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.025458248472505093,
      "grad_norm": 0.7198461890220642,
      "learning_rate": 1e-05,
      "loss": 0.2893,
      "step": 50
    },
    {
      "epoch": 0.05091649694501019,
      "grad_norm": 1.0035477876663208,
      "learning_rate": 1e-05,
      "loss": 0.2938,
      "step": 100
    },
    {
      "epoch": 0.05091649694501019,
      "eval_loss": 0.30817005038261414,
      "eval_runtime": 22.8318,
      "eval_samples_per_second": 4.38,
      "eval_steps_per_second": 0.569,
      "step": 100
    },
    {
      "epoch": 0.07637474541751528,
      "grad_norm": 1.1024292707443237,
      "learning_rate": 1e-05,
      "loss": 0.3017,
      "step": 150
    },
    {
      "epoch": 0.10183299389002037,
      "grad_norm": 0.8950141668319702,
      "learning_rate": 1e-05,
      "loss": 0.2912,
      "step": 200
    },
    {
      "epoch": 0.10183299389002037,
      "eval_loss": 0.30671748518943787,
      "eval_runtime": 21.6577,
      "eval_samples_per_second": 4.617,
      "eval_steps_per_second": 0.6,
      "step": 200
    },
    {
      "epoch": 0.12729124236252545,
      "grad_norm": 0.9789568781852722,
      "learning_rate": 1e-05,
      "loss": 0.2928,
      "step": 250
    },
    {
      "epoch": 0.15274949083503056,
      "grad_norm": 1.0154632329940796,
      "learning_rate": 1e-05,
      "loss": 0.2839,
      "step": 300
    },
    {
      "epoch": 0.15274949083503056,
      "eval_loss": 0.30376389622688293,
      "eval_runtime": 21.5085,
      "eval_samples_per_second": 4.649,
      "eval_steps_per_second": 0.604,
      "step": 300
    },
    {
      "epoch": 0.17820773930753564,
      "grad_norm": 0.977684736251831,
      "learning_rate": 1e-05,
      "loss": 0.299,
      "step": 350
    },
    {
      "epoch": 0.20366598778004075,
      "grad_norm": 1.02386474609375,
      "learning_rate": 1e-05,
      "loss": 0.2922,
      "step": 400
    },
    {
      "epoch": 0.20366598778004075,
      "eval_loss": 0.3035086393356323,
      "eval_runtime": 21.9907,
      "eval_samples_per_second": 4.547,
      "eval_steps_per_second": 0.591,
      "step": 400
    },
    {
      "epoch": 0.22912423625254583,
      "grad_norm": 1.0174798965454102,
      "learning_rate": 1e-05,
      "loss": 0.2955,
      "step": 450
    },
    {
      "epoch": 0.2545824847250509,
      "grad_norm": 1.0312519073486328,
      "learning_rate": 1e-05,
      "loss": 0.3013,
      "step": 500
    },
    {
      "epoch": 0.2545824847250509,
      "eval_loss": 0.29991400241851807,
      "eval_runtime": 21.9988,
      "eval_samples_per_second": 4.546,
      "eval_steps_per_second": 0.591,
      "step": 500
    },
    {
      "epoch": 0.280040733197556,
      "grad_norm": 0.8153128623962402,
      "learning_rate": 1e-05,
      "loss": 0.2902,
      "step": 550
    },
    {
      "epoch": 0.3054989816700611,
      "grad_norm": 0.9280871748924255,
      "learning_rate": 1e-05,
      "loss": 0.2933,
      "step": 600
    },
    {
      "epoch": 0.3054989816700611,
      "eval_loss": 0.29873424768447876,
      "eval_runtime": 21.424,
      "eval_samples_per_second": 4.668,
      "eval_steps_per_second": 0.607,
      "step": 600
    },
    {
      "epoch": 0.33095723014256617,
      "grad_norm": 1.0311402082443237,
      "learning_rate": 1e-05,
      "loss": 0.2871,
      "step": 650
    },
    {
      "epoch": 0.3564154786150713,
      "grad_norm": 1.1811566352844238,
      "learning_rate": 1e-05,
      "loss": 0.2968,
      "step": 700
    },
    {
      "epoch": 0.3564154786150713,
      "eval_loss": 0.29955142736434937,
      "eval_runtime": 23.1012,
      "eval_samples_per_second": 4.329,
      "eval_steps_per_second": 0.563,
      "step": 700
    },
    {
      "epoch": 0.3818737270875764,
      "grad_norm": 1.2003265619277954,
      "learning_rate": 1e-05,
      "loss": 0.2887,
      "step": 750
    },
    {
      "epoch": 0.4073319755600815,
      "grad_norm": 1.488318920135498,
      "learning_rate": 1e-05,
      "loss": 0.2891,
      "step": 800
    },
    {
      "epoch": 0.4073319755600815,
      "eval_loss": 0.29836124181747437,
      "eval_runtime": 22.3121,
      "eval_samples_per_second": 4.482,
      "eval_steps_per_second": 0.583,
      "step": 800
    },
    {
      "epoch": 0.43279022403258655,
      "grad_norm": 0.858514130115509,
      "learning_rate": 1e-05,
      "loss": 0.2985,
      "step": 850
    },
    {
      "epoch": 0.45824847250509165,
      "grad_norm": 0.9991205930709839,
      "learning_rate": 1e-05,
      "loss": 0.2877,
      "step": 900
    },
    {
      "epoch": 0.45824847250509165,
      "eval_loss": 0.2956816554069519,
      "eval_runtime": 21.6536,
      "eval_samples_per_second": 4.618,
      "eval_steps_per_second": 0.6,
      "step": 900
    },
    {
      "epoch": 0.48370672097759676,
      "grad_norm": 1.130011796951294,
      "learning_rate": 1e-05,
      "loss": 0.2831,
      "step": 950
    },
    {
      "epoch": 0.5091649694501018,
      "grad_norm": 0.97832852602005,
      "learning_rate": 1e-05,
      "loss": 0.2844,
      "step": 1000
    },
    {
      "epoch": 0.5091649694501018,
      "eval_loss": 0.29429855942726135,
      "eval_runtime": 21.7902,
      "eval_samples_per_second": 4.589,
      "eval_steps_per_second": 0.597,
      "step": 1000
    },
    {
      "epoch": 0.5346232179226069,
      "grad_norm": 1.1608392000198364,
      "learning_rate": 1e-05,
      "loss": 0.2802,
      "step": 1050
    },
    {
      "epoch": 0.560081466395112,
      "grad_norm": 0.9106999635696411,
      "learning_rate": 1e-05,
      "loss": 0.2736,
      "step": 1100
    },
    {
      "epoch": 0.560081466395112,
      "eval_loss": 0.29138484597206116,
      "eval_runtime": 21.847,
      "eval_samples_per_second": 4.577,
      "eval_steps_per_second": 0.595,
      "step": 1100
    },
    {
      "epoch": 0.5855397148676171,
      "grad_norm": 1.077606201171875,
      "learning_rate": 1e-05,
      "loss": 0.2916,
      "step": 1150
    },
    {
      "epoch": 0.6109979633401222,
      "grad_norm": 1.078594446182251,
      "learning_rate": 1e-05,
      "loss": 0.2877,
      "step": 1200
    },
    {
      "epoch": 0.6109979633401222,
      "eval_loss": 0.2930351495742798,
      "eval_runtime": 21.8584,
      "eval_samples_per_second": 4.575,
      "eval_steps_per_second": 0.595,
      "step": 1200
    },
    {
      "epoch": 0.6364562118126272,
      "grad_norm": 1.044995665550232,
      "learning_rate": 1e-05,
      "loss": 0.2852,
      "step": 1250
    },
    {
      "epoch": 0.6619144602851323,
      "grad_norm": 1.0691392421722412,
      "learning_rate": 1e-05,
      "loss": 0.2914,
      "step": 1300
    },
    {
      "epoch": 0.6619144602851323,
      "eval_loss": 0.29031333327293396,
      "eval_runtime": 21.8883,
      "eval_samples_per_second": 4.569,
      "eval_steps_per_second": 0.594,
      "step": 1300
    },
    {
      "epoch": 0.6873727087576375,
      "grad_norm": 1.165562629699707,
      "learning_rate": 1e-05,
      "loss": 0.2854,
      "step": 1350
    },
    {
      "epoch": 0.7128309572301426,
      "grad_norm": 1.1224968433380127,
      "learning_rate": 1e-05,
      "loss": 0.2846,
      "step": 1400
    },
    {
      "epoch": 0.7128309572301426,
      "eval_loss": 0.2897338271141052,
      "eval_runtime": 22.173,
      "eval_samples_per_second": 4.51,
      "eval_steps_per_second": 0.586,
      "step": 1400
    },
    {
      "epoch": 0.7382892057026477,
      "grad_norm": 0.9951677918434143,
      "learning_rate": 1e-05,
      "loss": 0.2754,
      "step": 1450
    },
    {
      "epoch": 0.7637474541751528,
      "grad_norm": 1.116921305656433,
      "learning_rate": 1e-05,
      "loss": 0.2754,
      "step": 1500
    },
    {
      "epoch": 0.7637474541751528,
      "eval_loss": 0.2867298424243927,
      "eval_runtime": 21.7838,
      "eval_samples_per_second": 4.591,
      "eval_steps_per_second": 0.597,
      "step": 1500
    },
    {
      "epoch": 0.7892057026476579,
      "grad_norm": 1.104265570640564,
      "learning_rate": 1e-05,
      "loss": 0.2824,
      "step": 1550
    },
    {
      "epoch": 0.814663951120163,
      "grad_norm": 0.8793336749076843,
      "learning_rate": 1e-05,
      "loss": 0.2901,
      "step": 1600
    },
    {
      "epoch": 0.814663951120163,
      "eval_loss": 0.28652000427246094,
      "eval_runtime": 21.8736,
      "eval_samples_per_second": 4.572,
      "eval_steps_per_second": 0.594,
      "step": 1600
    },
    {
      "epoch": 0.840122199592668,
      "grad_norm": 1.2304877042770386,
      "learning_rate": 1e-05,
      "loss": 0.2718,
      "step": 1650
    },
    {
      "epoch": 0.8655804480651731,
      "grad_norm": 0.9079441428184509,
      "learning_rate": 1e-05,
      "loss": 0.2787,
      "step": 1700
    },
    {
      "epoch": 0.8655804480651731,
      "eval_loss": 0.2839984893798828,
      "eval_runtime": 21.7844,
      "eval_samples_per_second": 4.59,
      "eval_steps_per_second": 0.597,
      "step": 1700
    },
    {
      "epoch": 0.8910386965376782,
      "grad_norm": 1.359052300453186,
      "learning_rate": 1e-05,
      "loss": 0.2703,
      "step": 1750
    },
    {
      "epoch": 0.9164969450101833,
      "grad_norm": 1.0245873928070068,
      "learning_rate": 1e-05,
      "loss": 0.2678,
      "step": 1800
    },
    {
      "epoch": 0.9164969450101833,
      "eval_loss": 0.2825533151626587,
      "eval_runtime": 22.022,
      "eval_samples_per_second": 4.541,
      "eval_steps_per_second": 0.59,
      "step": 1800
    },
    {
      "epoch": 0.9419551934826884,
      "grad_norm": 0.9011121988296509,
      "learning_rate": 1e-05,
      "loss": 0.2747,
      "step": 1850
    },
    {
      "epoch": 0.9674134419551935,
      "grad_norm": 1.006032943725586,
      "learning_rate": 1e-05,
      "loss": 0.2721,
      "step": 1900
    },
    {
      "epoch": 0.9674134419551935,
      "eval_loss": 0.2824758291244507,
      "eval_runtime": 22.1289,
      "eval_samples_per_second": 4.519,
      "eval_steps_per_second": 0.587,
      "step": 1900
    },
    {
      "epoch": 0.9928716904276986,
      "grad_norm": 0.9993594288825989,
      "learning_rate": 1e-05,
      "loss": 0.2784,
      "step": 1950
    },
    {
      "epoch": 1.0183299389002036,
      "grad_norm": 0.9710284471511841,
      "learning_rate": 1e-05,
      "loss": 0.2713,
      "step": 2000
    },
    {
      "epoch": 1.0183299389002036,
      "eval_loss": 0.2820639908313751,
      "eval_runtime": 21.7507,
      "eval_samples_per_second": 4.598,
      "eval_steps_per_second": 0.598,
      "step": 2000
    },
    {
      "epoch": 1.0437881873727088,
      "grad_norm": 1.2046750783920288,
      "learning_rate": 1e-05,
      "loss": 0.2799,
      "step": 2050
    },
    {
      "epoch": 1.0692464358452138,
      "grad_norm": 0.9969730377197266,
      "learning_rate": 1e-05,
      "loss": 0.2842,
      "step": 2100
    },
    {
      "epoch": 1.0692464358452138,
      "eval_loss": 0.28064805269241333,
      "eval_runtime": 21.7423,
      "eval_samples_per_second": 4.599,
      "eval_steps_per_second": 0.598,
      "step": 2100
    },
    {
      "epoch": 1.094704684317719,
      "grad_norm": 0.9368526339530945,
      "learning_rate": 1e-05,
      "loss": 0.2783,
      "step": 2150
    },
    {
      "epoch": 1.120162932790224,
      "grad_norm": 1.2995036840438843,
      "learning_rate": 1e-05,
      "loss": 0.2781,
      "step": 2200
    },
    {
      "epoch": 1.120162932790224,
      "eval_loss": 0.2789928913116455,
      "eval_runtime": 21.6436,
      "eval_samples_per_second": 4.62,
      "eval_steps_per_second": 0.601,
      "step": 2200
    },
    {
      "epoch": 1.145621181262729,
      "grad_norm": 1.2737852334976196,
      "learning_rate": 1e-05,
      "loss": 0.2731,
      "step": 2250
    },
    {
      "epoch": 1.1710794297352343,
      "grad_norm": 1.0202410221099854,
      "learning_rate": 1e-05,
      "loss": 0.273,
      "step": 2300
    },
    {
      "epoch": 1.1710794297352343,
      "eval_loss": 0.27778831124305725,
      "eval_runtime": 21.9156,
      "eval_samples_per_second": 4.563,
      "eval_steps_per_second": 0.593,
      "step": 2300
    },
    {
      "epoch": 1.1965376782077393,
      "grad_norm": 0.9710997939109802,
      "learning_rate": 1e-05,
      "loss": 0.2578,
      "step": 2350
    },
    {
      "epoch": 1.2219959266802445,
      "grad_norm": 0.86209636926651,
      "learning_rate": 1e-05,
      "loss": 0.2714,
      "step": 2400
    },
    {
      "epoch": 1.2219959266802445,
      "eval_loss": 0.27559801936149597,
      "eval_runtime": 21.9259,
      "eval_samples_per_second": 4.561,
      "eval_steps_per_second": 0.593,
      "step": 2400
    },
    {
      "epoch": 1.2474541751527495,
      "grad_norm": 1.0652376413345337,
      "learning_rate": 1e-05,
      "loss": 0.265,
      "step": 2450
    },
    {
      "epoch": 1.2729124236252547,
      "grad_norm": 1.002944827079773,
      "learning_rate": 1e-05,
      "loss": 0.2536,
      "step": 2500
    },
    {
      "epoch": 1.2729124236252547,
      "eval_loss": 0.2747356593608856,
      "eval_runtime": 21.9091,
      "eval_samples_per_second": 4.564,
      "eval_steps_per_second": 0.593,
      "step": 2500
    },
    {
      "epoch": 1.2983706720977597,
      "grad_norm": 1.0743255615234375,
      "learning_rate": 1e-05,
      "loss": 0.2742,
      "step": 2550
    },
    {
      "epoch": 1.3238289205702647,
      "grad_norm": 1.2174959182739258,
      "learning_rate": 1e-05,
      "loss": 0.268,
      "step": 2600
    },
    {
      "epoch": 1.3238289205702647,
      "eval_loss": 0.27502280473709106,
      "eval_runtime": 21.9371,
      "eval_samples_per_second": 4.558,
      "eval_steps_per_second": 0.593,
      "step": 2600
    },
    {
      "epoch": 1.34928716904277,
      "grad_norm": 1.1314553022384644,
      "learning_rate": 1e-05,
      "loss": 0.26,
      "step": 2650
    },
    {
      "epoch": 1.374745417515275,
      "grad_norm": 1.007804036140442,
      "learning_rate": 1e-05,
      "loss": 0.2776,
      "step": 2700
    },
    {
      "epoch": 1.374745417515275,
      "eval_loss": 0.27191075682640076,
      "eval_runtime": 21.9972,
      "eval_samples_per_second": 4.546,
      "eval_steps_per_second": 0.591,
      "step": 2700
    },
    {
      "epoch": 1.4002036659877801,
      "grad_norm": 1.1400426626205444,
      "learning_rate": 1e-05,
      "loss": 0.2577,
      "step": 2750
    },
    {
      "epoch": 1.4256619144602851,
      "grad_norm": 0.9301505088806152,
      "learning_rate": 1e-05,
      "loss": 0.2726,
      "step": 2800
    },
    {
      "epoch": 1.4256619144602851,
      "eval_loss": 0.2725023925304413,
      "eval_runtime": 21.722,
      "eval_samples_per_second": 4.604,
      "eval_steps_per_second": 0.598,
      "step": 2800
    },
    {
      "epoch": 1.4511201629327903,
      "grad_norm": 1.142259120941162,
      "learning_rate": 1e-05,
      "loss": 0.2635,
      "step": 2850
    },
    {
      "epoch": 1.4765784114052953,
      "grad_norm": 1.1009142398834229,
      "learning_rate": 1e-05,
      "loss": 0.2572,
      "step": 2900
    },
    {
      "epoch": 1.4765784114052953,
      "eval_loss": 0.2723616063594818,
      "eval_runtime": 21.8662,
      "eval_samples_per_second": 4.573,
      "eval_steps_per_second": 0.595,
      "step": 2900
    },
    {
      "epoch": 1.5020366598778003,
      "grad_norm": 1.2985098361968994,
      "learning_rate": 1e-05,
      "loss": 0.2477,
      "step": 2950
    },
    {
      "epoch": 1.5274949083503055,
      "grad_norm": 1.2853788137435913,
      "learning_rate": 1e-05,
      "loss": 0.2518,
      "step": 3000
    },
    {
      "epoch": 1.5274949083503055,
      "eval_loss": 0.2711648643016815,
      "eval_runtime": 22.1089,
      "eval_samples_per_second": 4.523,
      "eval_steps_per_second": 0.588,
      "step": 3000
    },
    {
      "epoch": 1.5529531568228105,
      "grad_norm": 1.3646196126937866,
      "learning_rate": 1e-05,
      "loss": 0.2545,
      "step": 3050
    },
    {
      "epoch": 1.5784114052953155,
      "grad_norm": 1.0580254793167114,
      "learning_rate": 1e-05,
      "loss": 0.2665,
      "step": 3100
    },
    {
      "epoch": 1.5784114052953155,
      "eval_loss": 0.27152860164642334,
      "eval_runtime": 21.563,
      "eval_samples_per_second": 4.638,
      "eval_steps_per_second": 0.603,
      "step": 3100
    },
    {
      "epoch": 1.6038696537678208,
      "grad_norm": 1.529466152191162,
      "learning_rate": 1e-05,
      "loss": 0.2589,
      "step": 3150
    },
    {
      "epoch": 1.629327902240326,
      "grad_norm": 1.0657099485397339,
      "learning_rate": 1e-05,
      "loss": 0.2559,
      "step": 3200
    },
    {
      "epoch": 1.629327902240326,
      "eval_loss": 0.2696399390697479,
      "eval_runtime": 21.6487,
      "eval_samples_per_second": 4.619,
      "eval_steps_per_second": 0.6,
      "step": 3200
    },
    {
      "epoch": 1.654786150712831,
      "grad_norm": 1.0233310461044312,
      "learning_rate": 1e-05,
      "loss": 0.2654,
      "step": 3250
    },
    {
      "epoch": 1.680244399185336,
      "grad_norm": 1.151859998703003,
      "learning_rate": 1e-05,
      "loss": 0.2609,
      "step": 3300
    },
    {
      "epoch": 1.680244399185336,
      "eval_loss": 0.26971447467803955,
      "eval_runtime": 21.8279,
      "eval_samples_per_second": 4.581,
      "eval_steps_per_second": 0.596,
      "step": 3300
    },
    {
      "epoch": 1.7057026476578412,
      "grad_norm": 1.1169921159744263,
      "learning_rate": 1e-05,
      "loss": 0.266,
      "step": 3350
    },
    {
      "epoch": 1.7311608961303462,
      "grad_norm": 0.9375786185264587,
      "learning_rate": 1e-05,
      "loss": 0.2603,
      "step": 3400
    },
    {
      "epoch": 1.7311608961303462,
      "eval_loss": 0.26760581135749817,
      "eval_runtime": 21.6655,
      "eval_samples_per_second": 4.616,
      "eval_steps_per_second": 0.6,
      "step": 3400
    },
    {
      "epoch": 1.7566191446028512,
      "grad_norm": 1.0559574365615845,
      "learning_rate": 1e-05,
      "loss": 0.2644,
      "step": 3450
    },
    {
      "epoch": 1.7820773930753564,
      "grad_norm": 1.1326260566711426,
      "learning_rate": 1e-05,
      "loss": 0.2448,
      "step": 3500
    },
    {
      "epoch": 1.7820773930753564,
      "eval_loss": 0.2665054500102997,
      "eval_runtime": 21.6484,
      "eval_samples_per_second": 4.619,
      "eval_steps_per_second": 0.601,
      "step": 3500
    },
    {
      "epoch": 1.8075356415478616,
      "grad_norm": 1.396811842918396,
      "learning_rate": 1e-05,
      "loss": 0.252,
      "step": 3550
    },
    {
      "epoch": 1.8329938900203666,
      "grad_norm": 1.6235796213150024,
      "learning_rate": 1e-05,
      "loss": 0.2587,
      "step": 3600
    },
    {
      "epoch": 1.8329938900203666,
      "eval_loss": 0.2665034234523773,
      "eval_runtime": 21.9855,
      "eval_samples_per_second": 4.548,
      "eval_steps_per_second": 0.591,
      "step": 3600
    },
    {
      "epoch": 1.8584521384928716,
      "grad_norm": 1.2640048265457153,
      "learning_rate": 1e-05,
      "loss": 0.2525,
      "step": 3650
    },
    {
      "epoch": 1.8839103869653768,
      "grad_norm": 1.1669272184371948,
      "learning_rate": 1e-05,
      "loss": 0.2534,
      "step": 3700
    },
    {
      "epoch": 1.8839103869653768,
      "eval_loss": 0.26496145129203796,
      "eval_runtime": 22.0324,
      "eval_samples_per_second": 4.539,
      "eval_steps_per_second": 0.59,
      "step": 3700
    },
    {
      "epoch": 1.9093686354378818,
      "grad_norm": 1.2015262842178345,
      "learning_rate": 1e-05,
      "loss": 0.2532,
      "step": 3750
    },
    {
      "epoch": 1.9348268839103868,
      "grad_norm": 1.0683043003082275,
      "learning_rate": 1e-05,
      "loss": 0.2496,
      "step": 3800
    },
    {
      "epoch": 1.9348268839103868,
      "eval_loss": 0.2648490369319916,
      "eval_runtime": 21.9752,
      "eval_samples_per_second": 4.551,
      "eval_steps_per_second": 0.592,
      "step": 3800
    },
    {
      "epoch": 1.960285132382892,
      "grad_norm": 1.4562475681304932,
      "learning_rate": 1e-05,
      "loss": 0.242,
      "step": 3850
    },
    {
      "epoch": 1.9857433808553973,
      "grad_norm": 0.9929770231246948,
      "learning_rate": 1e-05,
      "loss": 0.2528,
      "step": 3900
    },
    {
      "epoch": 1.9857433808553973,
      "eval_loss": 0.263571560382843,
      "eval_runtime": 21.7216,
      "eval_samples_per_second": 4.604,
      "eval_steps_per_second": 0.598,
      "step": 3900
    },
    {
      "epoch": 2.011201629327902,
      "grad_norm": 1.0417041778564453,
      "learning_rate": 1e-05,
      "loss": 0.2452,
      "step": 3950
    },
    {
      "epoch": 2.0366598778004072,
      "grad_norm": 1.0510022640228271,
      "learning_rate": 1e-05,
      "loss": 0.2652,
      "step": 4000
    },
    {
      "epoch": 2.0366598778004072,
      "eval_loss": 0.262724906206131,
      "eval_runtime": 22.0633,
      "eval_samples_per_second": 4.532,
      "eval_steps_per_second": 0.589,
      "step": 4000
    },
    {
      "epoch": 2.0621181262729125,
      "grad_norm": 1.383092999458313,
      "learning_rate": 1e-05,
      "loss": 0.2369,
      "step": 4050
    },
    {
      "epoch": 2.0875763747454177,
      "grad_norm": 1.3613831996917725,
      "learning_rate": 1e-05,
      "loss": 0.2493,
      "step": 4100
    },
    {
      "epoch": 2.0875763747454177,
      "eval_loss": 0.2625581622123718,
      "eval_runtime": 21.7251,
      "eval_samples_per_second": 4.603,
      "eval_steps_per_second": 0.598,
      "step": 4100
    },
    {
      "epoch": 2.1130346232179225,
      "grad_norm": 1.0267040729522705,
      "learning_rate": 1e-05,
      "loss": 0.2511,
      "step": 4150
    },
    {
      "epoch": 2.1384928716904277,
      "grad_norm": 0.9748584032058716,
      "learning_rate": 1e-05,
      "loss": 0.2383,
      "step": 4200
    },
    {
      "epoch": 2.1384928716904277,
      "eval_loss": 0.26004984974861145,
      "eval_runtime": 21.6797,
      "eval_samples_per_second": 4.613,
      "eval_steps_per_second": 0.6,
      "step": 4200
    },
    {
      "epoch": 2.163951120162933,
      "grad_norm": 1.3859432935714722,
      "learning_rate": 1e-05,
      "loss": 0.2544,
      "step": 4250
    },
    {
      "epoch": 2.189409368635438,
      "grad_norm": 1.598718285560608,
      "learning_rate": 1e-05,
      "loss": 0.2527,
      "step": 4300
    },
    {
      "epoch": 2.189409368635438,
      "eval_loss": 0.25936272740364075,
      "eval_runtime": 21.803,
      "eval_samples_per_second": 4.587,
      "eval_steps_per_second": 0.596,
      "step": 4300
    },
    {
      "epoch": 2.214867617107943,
      "grad_norm": 0.9896050691604614,
      "learning_rate": 1e-05,
      "loss": 0.2423,
      "step": 4350
    },
    {
      "epoch": 2.240325865580448,
      "grad_norm": 1.4134578704833984,
      "learning_rate": 1e-05,
      "loss": 0.2446,
      "step": 4400
    },
    {
      "epoch": 2.240325865580448,
      "eval_loss": 0.2597595751285553,
      "eval_runtime": 21.9078,
      "eval_samples_per_second": 4.565,
      "eval_steps_per_second": 0.593,
      "step": 4400
    },
    {
      "epoch": 2.2657841140529533,
      "grad_norm": 1.1840572357177734,
      "learning_rate": 1e-05,
      "loss": 0.251,
      "step": 4450
    },
    {
      "epoch": 2.291242362525458,
      "grad_norm": 1.3326150178909302,
      "learning_rate": 1e-05,
      "loss": 0.2504,
      "step": 4500
    },
    {
      "epoch": 2.291242362525458,
      "eval_loss": 0.2583908438682556,
      "eval_runtime": 21.7836,
      "eval_samples_per_second": 4.591,
      "eval_steps_per_second": 0.597,
      "step": 4500
    },
    {
      "epoch": 2.3167006109979633,
      "grad_norm": 1.4150619506835938,
      "learning_rate": 1e-05,
      "loss": 0.2481,
      "step": 4550
    },
    {
      "epoch": 2.3421588594704685,
      "grad_norm": 1.4056681394577026,
      "learning_rate": 1e-05,
      "loss": 0.2474,
      "step": 4600
    },
    {
      "epoch": 2.3421588594704685,
      "eval_loss": 0.25896069407463074,
      "eval_runtime": 21.9165,
      "eval_samples_per_second": 4.563,
      "eval_steps_per_second": 0.593,
      "step": 4600
    },
    {
      "epoch": 2.3676171079429738,
      "grad_norm": 1.1109027862548828,
      "learning_rate": 1e-05,
      "loss": 0.2447,
      "step": 4650
    },
    {
      "epoch": 2.3930753564154785,
      "grad_norm": 1.1159225702285767,
      "learning_rate": 1e-05,
      "loss": 0.2482,
      "step": 4700
    },
    {
      "epoch": 2.3930753564154785,
      "eval_loss": 0.2578243911266327,
      "eval_runtime": 21.5943,
      "eval_samples_per_second": 4.631,
      "eval_steps_per_second": 0.602,
      "step": 4700
    },
    {
      "epoch": 2.4185336048879837,
      "grad_norm": 1.1924070119857788,
      "learning_rate": 1e-05,
      "loss": 0.2506,
      "step": 4750
    },
    {
      "epoch": 2.443991853360489,
      "grad_norm": 1.5512938499450684,
      "learning_rate": 1e-05,
      "loss": 0.2405,
      "step": 4800
    },
    {
      "epoch": 2.443991853360489,
      "eval_loss": 0.2575813829898834,
      "eval_runtime": 22.0278,
      "eval_samples_per_second": 4.54,
      "eval_steps_per_second": 0.59,
      "step": 4800
    },
    {
      "epoch": 2.4694501018329937,
      "grad_norm": 1.0490576028823853,
      "learning_rate": 1e-05,
      "loss": 0.2476,
      "step": 4850
    },
    {
      "epoch": 2.494908350305499,
      "grad_norm": 1.0198191404342651,
      "learning_rate": 1e-05,
      "loss": 0.2399,
      "step": 4900
    },
    {
      "epoch": 2.494908350305499,
      "eval_loss": 0.25637081265449524,
      "eval_runtime": 21.9152,
      "eval_samples_per_second": 4.563,
      "eval_steps_per_second": 0.593,
      "step": 4900
    },
    {
      "epoch": 2.520366598778004,
      "grad_norm": 1.122515320777893,
      "learning_rate": 1e-05,
      "loss": 0.2501,
      "step": 4950
    },
    {
      "epoch": 2.5458248472505094,
      "grad_norm": 1.082686424255371,
      "learning_rate": 1e-05,
      "loss": 0.2443,
      "step": 5000
    },
    {
      "epoch": 2.5458248472505094,
      "eval_loss": 0.25523728132247925,
      "eval_runtime": 21.0107,
      "eval_samples_per_second": 4.759,
      "eval_steps_per_second": 0.619,
      "step": 5000
    },
    {
      "epoch": 2.571283095723014,
      "grad_norm": 1.0135226249694824,
      "learning_rate": 1e-05,
      "loss": 0.2414,
      "step": 5050
    },
    {
      "epoch": 2.5967413441955194,
      "grad_norm": 0.9929371476173401,
      "learning_rate": 1e-05,
      "loss": 0.248,
      "step": 5100
    },
    {
      "epoch": 2.5967413441955194,
      "eval_loss": 0.2532651722431183,
      "eval_runtime": 21.3518,
      "eval_samples_per_second": 4.683,
      "eval_steps_per_second": 0.609,
      "step": 5100
    },
    {
      "epoch": 2.6221995926680246,
      "grad_norm": 1.1128815412521362,
      "learning_rate": 1e-05,
      "loss": 0.2375,
      "step": 5150
    },
    {
      "epoch": 2.6476578411405294,
      "grad_norm": 1.5581951141357422,
      "learning_rate": 1e-05,
      "loss": 0.2432,
      "step": 5200
    },
    {
      "epoch": 2.6476578411405294,
      "eval_loss": 0.2521425485610962,
      "eval_runtime": 21.0088,
      "eval_samples_per_second": 4.76,
      "eval_steps_per_second": 0.619,
      "step": 5200
    },
    {
      "epoch": 2.6731160896130346,
      "grad_norm": 1.1291751861572266,
      "learning_rate": 1e-05,
      "loss": 0.2415,
      "step": 5250
    },
    {
      "epoch": 2.69857433808554,
      "grad_norm": 1.139137625694275,
      "learning_rate": 1e-05,
      "loss": 0.2398,
      "step": 5300
    },
    {
      "epoch": 2.69857433808554,
      "eval_loss": 0.25268828868865967,
      "eval_runtime": 21.7715,
      "eval_samples_per_second": 4.593,
      "eval_steps_per_second": 0.597,
      "step": 5300
    },
    {
      "epoch": 2.724032586558045,
      "grad_norm": 1.0960917472839355,
      "learning_rate": 1e-05,
      "loss": 0.2397,
      "step": 5350
    },
    {
      "epoch": 2.74949083503055,
      "grad_norm": 1.4622306823730469,
      "learning_rate": 1e-05,
      "loss": 0.2371,
      "step": 5400
    },
    {
      "epoch": 2.74949083503055,
      "eval_loss": 0.25145551562309265,
      "eval_runtime": 21.3534,
      "eval_samples_per_second": 4.683,
      "eval_steps_per_second": 0.609,
      "step": 5400
    },
    {
      "epoch": 2.774949083503055,
      "grad_norm": 1.0508596897125244,
      "learning_rate": 1e-05,
      "loss": 0.2389,
      "step": 5450
    },
    {
      "epoch": 2.8004073319755602,
      "grad_norm": 0.8241857886314392,
      "learning_rate": 1e-05,
      "loss": 0.2403,
      "step": 5500
    },
    {
      "epoch": 2.8004073319755602,
      "eval_loss": 0.25168588757514954,
      "eval_runtime": 21.2395,
      "eval_samples_per_second": 4.708,
      "eval_steps_per_second": 0.612,
      "step": 5500
    },
    {
      "epoch": 2.825865580448065,
      "grad_norm": 1.214141845703125,
      "learning_rate": 1e-05,
      "loss": 0.2406,
      "step": 5550
    },
    {
      "epoch": 2.8513238289205702,
      "grad_norm": 1.3861531019210815,
      "learning_rate": 1e-05,
      "loss": 0.2479,
      "step": 5600
    },
    {
      "epoch": 2.8513238289205702,
      "eval_loss": 0.24935197830200195,
      "eval_runtime": 21.4872,
      "eval_samples_per_second": 4.654,
      "eval_steps_per_second": 0.605,
      "step": 5600
    },
    {
      "epoch": 2.8767820773930755,
      "grad_norm": 1.1861828565597534,
      "learning_rate": 1e-05,
      "loss": 0.2318,
      "step": 5650
    },
    {
      "epoch": 2.9022403258655807,
      "grad_norm": 1.1281813383102417,
      "learning_rate": 1e-05,
      "loss": 0.2413,
      "step": 5700
    },
    {
      "epoch": 2.9022403258655807,
      "eval_loss": 0.25039103627204895,
      "eval_runtime": 20.9675,
      "eval_samples_per_second": 4.769,
      "eval_steps_per_second": 0.62,
      "step": 5700
    },
    {
      "epoch": 2.9276985743380854,
      "grad_norm": 1.1463491916656494,
      "learning_rate": 1e-05,
      "loss": 0.2364,
      "step": 5750
    },
    {
      "epoch": 2.9531568228105907,
      "grad_norm": 1.3118984699249268,
      "learning_rate": 1e-05,
      "loss": 0.2388,
      "step": 5800
    },
    {
      "epoch": 2.9531568228105907,
      "eval_loss": 0.24982962012290955,
      "eval_runtime": 21.5908,
      "eval_samples_per_second": 4.632,
      "eval_steps_per_second": 0.602,
      "step": 5800
    },
    {
      "epoch": 2.978615071283096,
      "grad_norm": 1.0851026773452759,
      "learning_rate": 1e-05,
      "loss": 0.2348,
      "step": 5850
    }
  ],
  "logging_steps": 50,
  "max_steps": 5891,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.421089311391744e+20,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}