{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99949083503055,
"eval_steps": 100,
"global_step": 5891,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025458248472505093,
"grad_norm": 0.7198461890220642,
"learning_rate": 1e-05,
"loss": 0.2893,
"step": 50
},
{
"epoch": 0.05091649694501019,
"grad_norm": 1.0035477876663208,
"learning_rate": 1e-05,
"loss": 0.2938,
"step": 100
},
{
"epoch": 0.05091649694501019,
"eval_loss": 0.30817005038261414,
"eval_runtime": 22.8318,
"eval_samples_per_second": 4.38,
"eval_steps_per_second": 0.569,
"step": 100
},
{
"epoch": 0.07637474541751528,
"grad_norm": 1.1024292707443237,
"learning_rate": 1e-05,
"loss": 0.3017,
"step": 150
},
{
"epoch": 0.10183299389002037,
"grad_norm": 0.8950141668319702,
"learning_rate": 1e-05,
"loss": 0.2912,
"step": 200
},
{
"epoch": 0.10183299389002037,
"eval_loss": 0.30671748518943787,
"eval_runtime": 21.6577,
"eval_samples_per_second": 4.617,
"eval_steps_per_second": 0.6,
"step": 200
},
{
"epoch": 0.12729124236252545,
"grad_norm": 0.9789568781852722,
"learning_rate": 1e-05,
"loss": 0.2928,
"step": 250
},
{
"epoch": 0.15274949083503056,
"grad_norm": 1.0154632329940796,
"learning_rate": 1e-05,
"loss": 0.2839,
"step": 300
},
{
"epoch": 0.15274949083503056,
"eval_loss": 0.30376389622688293,
"eval_runtime": 21.5085,
"eval_samples_per_second": 4.649,
"eval_steps_per_second": 0.604,
"step": 300
},
{
"epoch": 0.17820773930753564,
"grad_norm": 0.977684736251831,
"learning_rate": 1e-05,
"loss": 0.299,
"step": 350
},
{
"epoch": 0.20366598778004075,
"grad_norm": 1.02386474609375,
"learning_rate": 1e-05,
"loss": 0.2922,
"step": 400
},
{
"epoch": 0.20366598778004075,
"eval_loss": 0.3035086393356323,
"eval_runtime": 21.9907,
"eval_samples_per_second": 4.547,
"eval_steps_per_second": 0.591,
"step": 400
},
{
"epoch": 0.22912423625254583,
"grad_norm": 1.0174798965454102,
"learning_rate": 1e-05,
"loss": 0.2955,
"step": 450
},
{
"epoch": 0.2545824847250509,
"grad_norm": 1.0312519073486328,
"learning_rate": 1e-05,
"loss": 0.3013,
"step": 500
},
{
"epoch": 0.2545824847250509,
"eval_loss": 0.29991400241851807,
"eval_runtime": 21.9988,
"eval_samples_per_second": 4.546,
"eval_steps_per_second": 0.591,
"step": 500
},
{
"epoch": 0.280040733197556,
"grad_norm": 0.8153128623962402,
"learning_rate": 1e-05,
"loss": 0.2902,
"step": 550
},
{
"epoch": 0.3054989816700611,
"grad_norm": 0.9280871748924255,
"learning_rate": 1e-05,
"loss": 0.2933,
"step": 600
},
{
"epoch": 0.3054989816700611,
"eval_loss": 0.29873424768447876,
"eval_runtime": 21.424,
"eval_samples_per_second": 4.668,
"eval_steps_per_second": 0.607,
"step": 600
},
{
"epoch": 0.33095723014256617,
"grad_norm": 1.0311402082443237,
"learning_rate": 1e-05,
"loss": 0.2871,
"step": 650
},
{
"epoch": 0.3564154786150713,
"grad_norm": 1.1811566352844238,
"learning_rate": 1e-05,
"loss": 0.2968,
"step": 700
},
{
"epoch": 0.3564154786150713,
"eval_loss": 0.29955142736434937,
"eval_runtime": 23.1012,
"eval_samples_per_second": 4.329,
"eval_steps_per_second": 0.563,
"step": 700
},
{
"epoch": 0.3818737270875764,
"grad_norm": 1.2003265619277954,
"learning_rate": 1e-05,
"loss": 0.2887,
"step": 750
},
{
"epoch": 0.4073319755600815,
"grad_norm": 1.488318920135498,
"learning_rate": 1e-05,
"loss": 0.2891,
"step": 800
},
{
"epoch": 0.4073319755600815,
"eval_loss": 0.29836124181747437,
"eval_runtime": 22.3121,
"eval_samples_per_second": 4.482,
"eval_steps_per_second": 0.583,
"step": 800
},
{
"epoch": 0.43279022403258655,
"grad_norm": 0.858514130115509,
"learning_rate": 1e-05,
"loss": 0.2985,
"step": 850
},
{
"epoch": 0.45824847250509165,
"grad_norm": 0.9991205930709839,
"learning_rate": 1e-05,
"loss": 0.2877,
"step": 900
},
{
"epoch": 0.45824847250509165,
"eval_loss": 0.2956816554069519,
"eval_runtime": 21.6536,
"eval_samples_per_second": 4.618,
"eval_steps_per_second": 0.6,
"step": 900
},
{
"epoch": 0.48370672097759676,
"grad_norm": 1.130011796951294,
"learning_rate": 1e-05,
"loss": 0.2831,
"step": 950
},
{
"epoch": 0.5091649694501018,
"grad_norm": 0.97832852602005,
"learning_rate": 1e-05,
"loss": 0.2844,
"step": 1000
},
{
"epoch": 0.5091649694501018,
"eval_loss": 0.29429855942726135,
"eval_runtime": 21.7902,
"eval_samples_per_second": 4.589,
"eval_steps_per_second": 0.597,
"step": 1000
},
{
"epoch": 0.5346232179226069,
"grad_norm": 1.1608392000198364,
"learning_rate": 1e-05,
"loss": 0.2802,
"step": 1050
},
{
"epoch": 0.560081466395112,
"grad_norm": 0.9106999635696411,
"learning_rate": 1e-05,
"loss": 0.2736,
"step": 1100
},
{
"epoch": 0.560081466395112,
"eval_loss": 0.29138484597206116,
"eval_runtime": 21.847,
"eval_samples_per_second": 4.577,
"eval_steps_per_second": 0.595,
"step": 1100
},
{
"epoch": 0.5855397148676171,
"grad_norm": 1.077606201171875,
"learning_rate": 1e-05,
"loss": 0.2916,
"step": 1150
},
{
"epoch": 0.6109979633401222,
"grad_norm": 1.078594446182251,
"learning_rate": 1e-05,
"loss": 0.2877,
"step": 1200
},
{
"epoch": 0.6109979633401222,
"eval_loss": 0.2930351495742798,
"eval_runtime": 21.8584,
"eval_samples_per_second": 4.575,
"eval_steps_per_second": 0.595,
"step": 1200
},
{
"epoch": 0.6364562118126272,
"grad_norm": 1.044995665550232,
"learning_rate": 1e-05,
"loss": 0.2852,
"step": 1250
},
{
"epoch": 0.6619144602851323,
"grad_norm": 1.0691392421722412,
"learning_rate": 1e-05,
"loss": 0.2914,
"step": 1300
},
{
"epoch": 0.6619144602851323,
"eval_loss": 0.29031333327293396,
"eval_runtime": 21.8883,
"eval_samples_per_second": 4.569,
"eval_steps_per_second": 0.594,
"step": 1300
},
{
"epoch": 0.6873727087576375,
"grad_norm": 1.165562629699707,
"learning_rate": 1e-05,
"loss": 0.2854,
"step": 1350
},
{
"epoch": 0.7128309572301426,
"grad_norm": 1.1224968433380127,
"learning_rate": 1e-05,
"loss": 0.2846,
"step": 1400
},
{
"epoch": 0.7128309572301426,
"eval_loss": 0.2897338271141052,
"eval_runtime": 22.173,
"eval_samples_per_second": 4.51,
"eval_steps_per_second": 0.586,
"step": 1400
},
{
"epoch": 0.7382892057026477,
"grad_norm": 0.9951677918434143,
"learning_rate": 1e-05,
"loss": 0.2754,
"step": 1450
},
{
"epoch": 0.7637474541751528,
"grad_norm": 1.116921305656433,
"learning_rate": 1e-05,
"loss": 0.2754,
"step": 1500
},
{
"epoch": 0.7637474541751528,
"eval_loss": 0.2867298424243927,
"eval_runtime": 21.7838,
"eval_samples_per_second": 4.591,
"eval_steps_per_second": 0.597,
"step": 1500
},
{
"epoch": 0.7892057026476579,
"grad_norm": 1.104265570640564,
"learning_rate": 1e-05,
"loss": 0.2824,
"step": 1550
},
{
"epoch": 0.814663951120163,
"grad_norm": 0.8793336749076843,
"learning_rate": 1e-05,
"loss": 0.2901,
"step": 1600
},
{
"epoch": 0.814663951120163,
"eval_loss": 0.28652000427246094,
"eval_runtime": 21.8736,
"eval_samples_per_second": 4.572,
"eval_steps_per_second": 0.594,
"step": 1600
},
{
"epoch": 0.840122199592668,
"grad_norm": 1.2304877042770386,
"learning_rate": 1e-05,
"loss": 0.2718,
"step": 1650
},
{
"epoch": 0.8655804480651731,
"grad_norm": 0.9079441428184509,
"learning_rate": 1e-05,
"loss": 0.2787,
"step": 1700
},
{
"epoch": 0.8655804480651731,
"eval_loss": 0.2839984893798828,
"eval_runtime": 21.7844,
"eval_samples_per_second": 4.59,
"eval_steps_per_second": 0.597,
"step": 1700
},
{
"epoch": 0.8910386965376782,
"grad_norm": 1.359052300453186,
"learning_rate": 1e-05,
"loss": 0.2703,
"step": 1750
},
{
"epoch": 0.9164969450101833,
"grad_norm": 1.0245873928070068,
"learning_rate": 1e-05,
"loss": 0.2678,
"step": 1800
},
{
"epoch": 0.9164969450101833,
"eval_loss": 0.2825533151626587,
"eval_runtime": 22.022,
"eval_samples_per_second": 4.541,
"eval_steps_per_second": 0.59,
"step": 1800
},
{
"epoch": 0.9419551934826884,
"grad_norm": 0.9011121988296509,
"learning_rate": 1e-05,
"loss": 0.2747,
"step": 1850
},
{
"epoch": 0.9674134419551935,
"grad_norm": 1.006032943725586,
"learning_rate": 1e-05,
"loss": 0.2721,
"step": 1900
},
{
"epoch": 0.9674134419551935,
"eval_loss": 0.2824758291244507,
"eval_runtime": 22.1289,
"eval_samples_per_second": 4.519,
"eval_steps_per_second": 0.587,
"step": 1900
},
{
"epoch": 0.9928716904276986,
"grad_norm": 0.9993594288825989,
"learning_rate": 1e-05,
"loss": 0.2784,
"step": 1950
},
{
"epoch": 1.0183299389002036,
"grad_norm": 0.9710284471511841,
"learning_rate": 1e-05,
"loss": 0.2713,
"step": 2000
},
{
"epoch": 1.0183299389002036,
"eval_loss": 0.2820639908313751,
"eval_runtime": 21.7507,
"eval_samples_per_second": 4.598,
"eval_steps_per_second": 0.598,
"step": 2000
},
{
"epoch": 1.0437881873727088,
"grad_norm": 1.2046750783920288,
"learning_rate": 1e-05,
"loss": 0.2799,
"step": 2050
},
{
"epoch": 1.0692464358452138,
"grad_norm": 0.9969730377197266,
"learning_rate": 1e-05,
"loss": 0.2842,
"step": 2100
},
{
"epoch": 1.0692464358452138,
"eval_loss": 0.28064805269241333,
"eval_runtime": 21.7423,
"eval_samples_per_second": 4.599,
"eval_steps_per_second": 0.598,
"step": 2100
},
{
"epoch": 1.094704684317719,
"grad_norm": 0.9368526339530945,
"learning_rate": 1e-05,
"loss": 0.2783,
"step": 2150
},
{
"epoch": 1.120162932790224,
"grad_norm": 1.2995036840438843,
"learning_rate": 1e-05,
"loss": 0.2781,
"step": 2200
},
{
"epoch": 1.120162932790224,
"eval_loss": 0.2789928913116455,
"eval_runtime": 21.6436,
"eval_samples_per_second": 4.62,
"eval_steps_per_second": 0.601,
"step": 2200
},
{
"epoch": 1.145621181262729,
"grad_norm": 1.2737852334976196,
"learning_rate": 1e-05,
"loss": 0.2731,
"step": 2250
},
{
"epoch": 1.1710794297352343,
"grad_norm": 1.0202410221099854,
"learning_rate": 1e-05,
"loss": 0.273,
"step": 2300
},
{
"epoch": 1.1710794297352343,
"eval_loss": 0.27778831124305725,
"eval_runtime": 21.9156,
"eval_samples_per_second": 4.563,
"eval_steps_per_second": 0.593,
"step": 2300
},
{
"epoch": 1.1965376782077393,
"grad_norm": 0.9710997939109802,
"learning_rate": 1e-05,
"loss": 0.2578,
"step": 2350
},
{
"epoch": 1.2219959266802445,
"grad_norm": 0.86209636926651,
"learning_rate": 1e-05,
"loss": 0.2714,
"step": 2400
},
{
"epoch": 1.2219959266802445,
"eval_loss": 0.27559801936149597,
"eval_runtime": 21.9259,
"eval_samples_per_second": 4.561,
"eval_steps_per_second": 0.593,
"step": 2400
},
{
"epoch": 1.2474541751527495,
"grad_norm": 1.0652376413345337,
"learning_rate": 1e-05,
"loss": 0.265,
"step": 2450
},
{
"epoch": 1.2729124236252547,
"grad_norm": 1.002944827079773,
"learning_rate": 1e-05,
"loss": 0.2536,
"step": 2500
},
{
"epoch": 1.2729124236252547,
"eval_loss": 0.2747356593608856,
"eval_runtime": 21.9091,
"eval_samples_per_second": 4.564,
"eval_steps_per_second": 0.593,
"step": 2500
},
{
"epoch": 1.2983706720977597,
"grad_norm": 1.0743255615234375,
"learning_rate": 1e-05,
"loss": 0.2742,
"step": 2550
},
{
"epoch": 1.3238289205702647,
"grad_norm": 1.2174959182739258,
"learning_rate": 1e-05,
"loss": 0.268,
"step": 2600
},
{
"epoch": 1.3238289205702647,
"eval_loss": 0.27502280473709106,
"eval_runtime": 21.9371,
"eval_samples_per_second": 4.558,
"eval_steps_per_second": 0.593,
"step": 2600
},
{
"epoch": 1.34928716904277,
"grad_norm": 1.1314553022384644,
"learning_rate": 1e-05,
"loss": 0.26,
"step": 2650
},
{
"epoch": 1.374745417515275,
"grad_norm": 1.007804036140442,
"learning_rate": 1e-05,
"loss": 0.2776,
"step": 2700
},
{
"epoch": 1.374745417515275,
"eval_loss": 0.27191075682640076,
"eval_runtime": 21.9972,
"eval_samples_per_second": 4.546,
"eval_steps_per_second": 0.591,
"step": 2700
},
{
"epoch": 1.4002036659877801,
"grad_norm": 1.1400426626205444,
"learning_rate": 1e-05,
"loss": 0.2577,
"step": 2750
},
{
"epoch": 1.4256619144602851,
"grad_norm": 0.9301505088806152,
"learning_rate": 1e-05,
"loss": 0.2726,
"step": 2800
},
{
"epoch": 1.4256619144602851,
"eval_loss": 0.2725023925304413,
"eval_runtime": 21.722,
"eval_samples_per_second": 4.604,
"eval_steps_per_second": 0.598,
"step": 2800
},
{
"epoch": 1.4511201629327903,
"grad_norm": 1.142259120941162,
"learning_rate": 1e-05,
"loss": 0.2635,
"step": 2850
},
{
"epoch": 1.4765784114052953,
"grad_norm": 1.1009142398834229,
"learning_rate": 1e-05,
"loss": 0.2572,
"step": 2900
},
{
"epoch": 1.4765784114052953,
"eval_loss": 0.2723616063594818,
"eval_runtime": 21.8662,
"eval_samples_per_second": 4.573,
"eval_steps_per_second": 0.595,
"step": 2900
},
{
"epoch": 1.5020366598778003,
"grad_norm": 1.2985098361968994,
"learning_rate": 1e-05,
"loss": 0.2477,
"step": 2950
},
{
"epoch": 1.5274949083503055,
"grad_norm": 1.2853788137435913,
"learning_rate": 1e-05,
"loss": 0.2518,
"step": 3000
},
{
"epoch": 1.5274949083503055,
"eval_loss": 0.2711648643016815,
"eval_runtime": 22.1089,
"eval_samples_per_second": 4.523,
"eval_steps_per_second": 0.588,
"step": 3000
},
{
"epoch": 1.5529531568228105,
"grad_norm": 1.3646196126937866,
"learning_rate": 1e-05,
"loss": 0.2545,
"step": 3050
},
{
"epoch": 1.5784114052953155,
"grad_norm": 1.0580254793167114,
"learning_rate": 1e-05,
"loss": 0.2665,
"step": 3100
},
{
"epoch": 1.5784114052953155,
"eval_loss": 0.27152860164642334,
"eval_runtime": 21.563,
"eval_samples_per_second": 4.638,
"eval_steps_per_second": 0.603,
"step": 3100
},
{
"epoch": 1.6038696537678208,
"grad_norm": 1.529466152191162,
"learning_rate": 1e-05,
"loss": 0.2589,
"step": 3150
},
{
"epoch": 1.629327902240326,
"grad_norm": 1.0657099485397339,
"learning_rate": 1e-05,
"loss": 0.2559,
"step": 3200
},
{
"epoch": 1.629327902240326,
"eval_loss": 0.2696399390697479,
"eval_runtime": 21.6487,
"eval_samples_per_second": 4.619,
"eval_steps_per_second": 0.6,
"step": 3200
},
{
"epoch": 1.654786150712831,
"grad_norm": 1.0233310461044312,
"learning_rate": 1e-05,
"loss": 0.2654,
"step": 3250
},
{
"epoch": 1.680244399185336,
"grad_norm": 1.151859998703003,
"learning_rate": 1e-05,
"loss": 0.2609,
"step": 3300
},
{
"epoch": 1.680244399185336,
"eval_loss": 0.26971447467803955,
"eval_runtime": 21.8279,
"eval_samples_per_second": 4.581,
"eval_steps_per_second": 0.596,
"step": 3300
},
{
"epoch": 1.7057026476578412,
"grad_norm": 1.1169921159744263,
"learning_rate": 1e-05,
"loss": 0.266,
"step": 3350
},
{
"epoch": 1.7311608961303462,
"grad_norm": 0.9375786185264587,
"learning_rate": 1e-05,
"loss": 0.2603,
"step": 3400
},
{
"epoch": 1.7311608961303462,
"eval_loss": 0.26760581135749817,
"eval_runtime": 21.6655,
"eval_samples_per_second": 4.616,
"eval_steps_per_second": 0.6,
"step": 3400
},
{
"epoch": 1.7566191446028512,
"grad_norm": 1.0559574365615845,
"learning_rate": 1e-05,
"loss": 0.2644,
"step": 3450
},
{
"epoch": 1.7820773930753564,
"grad_norm": 1.1326260566711426,
"learning_rate": 1e-05,
"loss": 0.2448,
"step": 3500
},
{
"epoch": 1.7820773930753564,
"eval_loss": 0.2665054500102997,
"eval_runtime": 21.6484,
"eval_samples_per_second": 4.619,
"eval_steps_per_second": 0.601,
"step": 3500
},
{
"epoch": 1.8075356415478616,
"grad_norm": 1.396811842918396,
"learning_rate": 1e-05,
"loss": 0.252,
"step": 3550
},
{
"epoch": 1.8329938900203666,
"grad_norm": 1.6235796213150024,
"learning_rate": 1e-05,
"loss": 0.2587,
"step": 3600
},
{
"epoch": 1.8329938900203666,
"eval_loss": 0.2665034234523773,
"eval_runtime": 21.9855,
"eval_samples_per_second": 4.548,
"eval_steps_per_second": 0.591,
"step": 3600
},
{
"epoch": 1.8584521384928716,
"grad_norm": 1.2640048265457153,
"learning_rate": 1e-05,
"loss": 0.2525,
"step": 3650
},
{
"epoch": 1.8839103869653768,
"grad_norm": 1.1669272184371948,
"learning_rate": 1e-05,
"loss": 0.2534,
"step": 3700
},
{
"epoch": 1.8839103869653768,
"eval_loss": 0.26496145129203796,
"eval_runtime": 22.0324,
"eval_samples_per_second": 4.539,
"eval_steps_per_second": 0.59,
"step": 3700
},
{
"epoch": 1.9093686354378818,
"grad_norm": 1.2015262842178345,
"learning_rate": 1e-05,
"loss": 0.2532,
"step": 3750
},
{
"epoch": 1.9348268839103868,
"grad_norm": 1.0683043003082275,
"learning_rate": 1e-05,
"loss": 0.2496,
"step": 3800
},
{
"epoch": 1.9348268839103868,
"eval_loss": 0.2648490369319916,
"eval_runtime": 21.9752,
"eval_samples_per_second": 4.551,
"eval_steps_per_second": 0.592,
"step": 3800
},
{
"epoch": 1.960285132382892,
"grad_norm": 1.4562475681304932,
"learning_rate": 1e-05,
"loss": 0.242,
"step": 3850
},
{
"epoch": 1.9857433808553973,
"grad_norm": 0.9929770231246948,
"learning_rate": 1e-05,
"loss": 0.2528,
"step": 3900
},
{
"epoch": 1.9857433808553973,
"eval_loss": 0.263571560382843,
"eval_runtime": 21.7216,
"eval_samples_per_second": 4.604,
"eval_steps_per_second": 0.598,
"step": 3900
},
{
"epoch": 2.011201629327902,
"grad_norm": 1.0417041778564453,
"learning_rate": 1e-05,
"loss": 0.2452,
"step": 3950
},
{
"epoch": 2.0366598778004072,
"grad_norm": 1.0510022640228271,
"learning_rate": 1e-05,
"loss": 0.2652,
"step": 4000
},
{
"epoch": 2.0366598778004072,
"eval_loss": 0.262724906206131,
"eval_runtime": 22.0633,
"eval_samples_per_second": 4.532,
"eval_steps_per_second": 0.589,
"step": 4000
},
{
"epoch": 2.0621181262729125,
"grad_norm": 1.383092999458313,
"learning_rate": 1e-05,
"loss": 0.2369,
"step": 4050
},
{
"epoch": 2.0875763747454177,
"grad_norm": 1.3613831996917725,
"learning_rate": 1e-05,
"loss": 0.2493,
"step": 4100
},
{
"epoch": 2.0875763747454177,
"eval_loss": 0.2625581622123718,
"eval_runtime": 21.7251,
"eval_samples_per_second": 4.603,
"eval_steps_per_second": 0.598,
"step": 4100
},
{
"epoch": 2.1130346232179225,
"grad_norm": 1.0267040729522705,
"learning_rate": 1e-05,
"loss": 0.2511,
"step": 4150
},
{
"epoch": 2.1384928716904277,
"grad_norm": 0.9748584032058716,
"learning_rate": 1e-05,
"loss": 0.2383,
"step": 4200
},
{
"epoch": 2.1384928716904277,
"eval_loss": 0.26004984974861145,
"eval_runtime": 21.6797,
"eval_samples_per_second": 4.613,
"eval_steps_per_second": 0.6,
"step": 4200
},
{
"epoch": 2.163951120162933,
"grad_norm": 1.3859432935714722,
"learning_rate": 1e-05,
"loss": 0.2544,
"step": 4250
},
{
"epoch": 2.189409368635438,
"grad_norm": 1.598718285560608,
"learning_rate": 1e-05,
"loss": 0.2527,
"step": 4300
},
{
"epoch": 2.189409368635438,
"eval_loss": 0.25936272740364075,
"eval_runtime": 21.803,
"eval_samples_per_second": 4.587,
"eval_steps_per_second": 0.596,
"step": 4300
},
{
"epoch": 2.214867617107943,
"grad_norm": 0.9896050691604614,
"learning_rate": 1e-05,
"loss": 0.2423,
"step": 4350
},
{
"epoch": 2.240325865580448,
"grad_norm": 1.4134578704833984,
"learning_rate": 1e-05,
"loss": 0.2446,
"step": 4400
},
{
"epoch": 2.240325865580448,
"eval_loss": 0.2597595751285553,
"eval_runtime": 21.9078,
"eval_samples_per_second": 4.565,
"eval_steps_per_second": 0.593,
"step": 4400
},
{
"epoch": 2.2657841140529533,
"grad_norm": 1.1840572357177734,
"learning_rate": 1e-05,
"loss": 0.251,
"step": 4450
},
{
"epoch": 2.291242362525458,
"grad_norm": 1.3326150178909302,
"learning_rate": 1e-05,
"loss": 0.2504,
"step": 4500
},
{
"epoch": 2.291242362525458,
"eval_loss": 0.2583908438682556,
"eval_runtime": 21.7836,
"eval_samples_per_second": 4.591,
"eval_steps_per_second": 0.597,
"step": 4500
},
{
"epoch": 2.3167006109979633,
"grad_norm": 1.4150619506835938,
"learning_rate": 1e-05,
"loss": 0.2481,
"step": 4550
},
{
"epoch": 2.3421588594704685,
"grad_norm": 1.4056681394577026,
"learning_rate": 1e-05,
"loss": 0.2474,
"step": 4600
},
{
"epoch": 2.3421588594704685,
"eval_loss": 0.25896069407463074,
"eval_runtime": 21.9165,
"eval_samples_per_second": 4.563,
"eval_steps_per_second": 0.593,
"step": 4600
},
{
"epoch": 2.3676171079429738,
"grad_norm": 1.1109027862548828,
"learning_rate": 1e-05,
"loss": 0.2447,
"step": 4650
},
{
"epoch": 2.3930753564154785,
"grad_norm": 1.1159225702285767,
"learning_rate": 1e-05,
"loss": 0.2482,
"step": 4700
},
{
"epoch": 2.3930753564154785,
"eval_loss": 0.2578243911266327,
"eval_runtime": 21.5943,
"eval_samples_per_second": 4.631,
"eval_steps_per_second": 0.602,
"step": 4700
},
{
"epoch": 2.4185336048879837,
"grad_norm": 1.1924070119857788,
"learning_rate": 1e-05,
"loss": 0.2506,
"step": 4750
},
{
"epoch": 2.443991853360489,
"grad_norm": 1.5512938499450684,
"learning_rate": 1e-05,
"loss": 0.2405,
"step": 4800
},
{
"epoch": 2.443991853360489,
"eval_loss": 0.2575813829898834,
"eval_runtime": 22.0278,
"eval_samples_per_second": 4.54,
"eval_steps_per_second": 0.59,
"step": 4800
},
{
"epoch": 2.4694501018329937,
"grad_norm": 1.0490576028823853,
"learning_rate": 1e-05,
"loss": 0.2476,
"step": 4850
},
{
"epoch": 2.494908350305499,
"grad_norm": 1.0198191404342651,
"learning_rate": 1e-05,
"loss": 0.2399,
"step": 4900
},
{
"epoch": 2.494908350305499,
"eval_loss": 0.25637081265449524,
"eval_runtime": 21.9152,
"eval_samples_per_second": 4.563,
"eval_steps_per_second": 0.593,
"step": 4900
},
{
"epoch": 2.520366598778004,
"grad_norm": 1.122515320777893,
"learning_rate": 1e-05,
"loss": 0.2501,
"step": 4950
},
{
"epoch": 2.5458248472505094,
"grad_norm": 1.082686424255371,
"learning_rate": 1e-05,
"loss": 0.2443,
"step": 5000
},
{
"epoch": 2.5458248472505094,
"eval_loss": 0.25523728132247925,
"eval_runtime": 21.0107,
"eval_samples_per_second": 4.759,
"eval_steps_per_second": 0.619,
"step": 5000
},
{
"epoch": 2.571283095723014,
"grad_norm": 1.0135226249694824,
"learning_rate": 1e-05,
"loss": 0.2414,
"step": 5050
},
{
"epoch": 2.5967413441955194,
"grad_norm": 0.9929371476173401,
"learning_rate": 1e-05,
"loss": 0.248,
"step": 5100
},
{
"epoch": 2.5967413441955194,
"eval_loss": 0.2532651722431183,
"eval_runtime": 21.3518,
"eval_samples_per_second": 4.683,
"eval_steps_per_second": 0.609,
"step": 5100
},
{
"epoch": 2.6221995926680246,
"grad_norm": 1.1128815412521362,
"learning_rate": 1e-05,
"loss": 0.2375,
"step": 5150
},
{
"epoch": 2.6476578411405294,
"grad_norm": 1.5581951141357422,
"learning_rate": 1e-05,
"loss": 0.2432,
"step": 5200
},
{
"epoch": 2.6476578411405294,
"eval_loss": 0.2521425485610962,
"eval_runtime": 21.0088,
"eval_samples_per_second": 4.76,
"eval_steps_per_second": 0.619,
"step": 5200
},
{
"epoch": 2.6731160896130346,
"grad_norm": 1.1291751861572266,
"learning_rate": 1e-05,
"loss": 0.2415,
"step": 5250
},
{
"epoch": 2.69857433808554,
"grad_norm": 1.139137625694275,
"learning_rate": 1e-05,
"loss": 0.2398,
"step": 5300
},
{
"epoch": 2.69857433808554,
"eval_loss": 0.25268828868865967,
"eval_runtime": 21.7715,
"eval_samples_per_second": 4.593,
"eval_steps_per_second": 0.597,
"step": 5300
},
{
"epoch": 2.724032586558045,
"grad_norm": 1.0960917472839355,
"learning_rate": 1e-05,
"loss": 0.2397,
"step": 5350
},
{
"epoch": 2.74949083503055,
"grad_norm": 1.4622306823730469,
"learning_rate": 1e-05,
"loss": 0.2371,
"step": 5400
},
{
"epoch": 2.74949083503055,
"eval_loss": 0.25145551562309265,
"eval_runtime": 21.3534,
"eval_samples_per_second": 4.683,
"eval_steps_per_second": 0.609,
"step": 5400
},
{
"epoch": 2.774949083503055,
"grad_norm": 1.0508596897125244,
"learning_rate": 1e-05,
"loss": 0.2389,
"step": 5450
},
{
"epoch": 2.8004073319755602,
"grad_norm": 0.8241857886314392,
"learning_rate": 1e-05,
"loss": 0.2403,
"step": 5500
},
{
"epoch": 2.8004073319755602,
"eval_loss": 0.25168588757514954,
"eval_runtime": 21.2395,
"eval_samples_per_second": 4.708,
"eval_steps_per_second": 0.612,
"step": 5500
},
{
"epoch": 2.825865580448065,
"grad_norm": 1.214141845703125,
"learning_rate": 1e-05,
"loss": 0.2406,
"step": 5550
},
{
"epoch": 2.8513238289205702,
"grad_norm": 1.3861531019210815,
"learning_rate": 1e-05,
"loss": 0.2479,
"step": 5600
},
{
"epoch": 2.8513238289205702,
"eval_loss": 0.24935197830200195,
"eval_runtime": 21.4872,
"eval_samples_per_second": 4.654,
"eval_steps_per_second": 0.605,
"step": 5600
},
{
"epoch": 2.8767820773930755,
"grad_norm": 1.1861828565597534,
"learning_rate": 1e-05,
"loss": 0.2318,
"step": 5650
},
{
"epoch": 2.9022403258655807,
"grad_norm": 1.1281813383102417,
"learning_rate": 1e-05,
"loss": 0.2413,
"step": 5700
},
{
"epoch": 2.9022403258655807,
"eval_loss": 0.25039103627204895,
"eval_runtime": 20.9675,
"eval_samples_per_second": 4.769,
"eval_steps_per_second": 0.62,
"step": 5700
},
{
"epoch": 2.9276985743380854,
"grad_norm": 1.1463491916656494,
"learning_rate": 1e-05,
"loss": 0.2364,
"step": 5750
},
{
"epoch": 2.9531568228105907,
"grad_norm": 1.3118984699249268,
"learning_rate": 1e-05,
"loss": 0.2388,
"step": 5800
},
{
"epoch": 2.9531568228105907,
"eval_loss": 0.24982962012290955,
"eval_runtime": 21.5908,
"eval_samples_per_second": 4.632,
"eval_steps_per_second": 0.602,
"step": 5800
},
{
"epoch": 2.978615071283096,
"grad_norm": 1.0851026773452759,
"learning_rate": 1e-05,
"loss": 0.2348,
"step": 5850
}
],
"logging_steps": 50,
"max_steps": 5891,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.421089311391744e+20,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}