{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99949083503055, "eval_steps": 100, "global_step": 5891, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025458248472505093, "grad_norm": 0.7198461890220642, "learning_rate": 1e-05, "loss": 0.2893, "step": 50 }, { "epoch": 0.05091649694501019, "grad_norm": 1.0035477876663208, "learning_rate": 1e-05, "loss": 0.2938, "step": 100 }, { "epoch": 0.05091649694501019, "eval_loss": 0.30817005038261414, "eval_runtime": 22.8318, "eval_samples_per_second": 4.38, "eval_steps_per_second": 0.569, "step": 100 }, { "epoch": 0.07637474541751528, "grad_norm": 1.1024292707443237, "learning_rate": 1e-05, "loss": 0.3017, "step": 150 }, { "epoch": 0.10183299389002037, "grad_norm": 0.8950141668319702, "learning_rate": 1e-05, "loss": 0.2912, "step": 200 }, { "epoch": 0.10183299389002037, "eval_loss": 0.30671748518943787, "eval_runtime": 21.6577, "eval_samples_per_second": 4.617, "eval_steps_per_second": 0.6, "step": 200 }, { "epoch": 0.12729124236252545, "grad_norm": 0.9789568781852722, "learning_rate": 1e-05, "loss": 0.2928, "step": 250 }, { "epoch": 0.15274949083503056, "grad_norm": 1.0154632329940796, "learning_rate": 1e-05, "loss": 0.2839, "step": 300 }, { "epoch": 0.15274949083503056, "eval_loss": 0.30376389622688293, "eval_runtime": 21.5085, "eval_samples_per_second": 4.649, "eval_steps_per_second": 0.604, "step": 300 }, { "epoch": 0.17820773930753564, "grad_norm": 0.977684736251831, "learning_rate": 1e-05, "loss": 0.299, "step": 350 }, { "epoch": 0.20366598778004075, "grad_norm": 1.02386474609375, "learning_rate": 1e-05, "loss": 0.2922, "step": 400 }, { "epoch": 0.20366598778004075, "eval_loss": 0.3035086393356323, "eval_runtime": 21.9907, "eval_samples_per_second": 4.547, "eval_steps_per_second": 0.591, "step": 400 }, { "epoch": 0.22912423625254583, "grad_norm": 1.0174798965454102, "learning_rate": 1e-05, "loss": 0.2955, "step": 450 }, { "epoch": 0.2545824847250509, "grad_norm": 1.0312519073486328, "learning_rate": 1e-05, "loss": 0.3013, "step": 500 }, { "epoch": 0.2545824847250509, "eval_loss": 0.29991400241851807, "eval_runtime": 21.9988, "eval_samples_per_second": 4.546, "eval_steps_per_second": 0.591, "step": 500 }, { "epoch": 0.280040733197556, "grad_norm": 0.8153128623962402, "learning_rate": 1e-05, "loss": 0.2902, "step": 550 }, { "epoch": 0.3054989816700611, "grad_norm": 0.9280871748924255, "learning_rate": 1e-05, "loss": 0.2933, "step": 600 }, { "epoch": 0.3054989816700611, "eval_loss": 0.29873424768447876, "eval_runtime": 21.424, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.607, "step": 600 }, { "epoch": 0.33095723014256617, "grad_norm": 1.0311402082443237, "learning_rate": 1e-05, "loss": 0.2871, "step": 650 }, { "epoch": 0.3564154786150713, "grad_norm": 1.1811566352844238, "learning_rate": 1e-05, "loss": 0.2968, "step": 700 }, { "epoch": 0.3564154786150713, "eval_loss": 0.29955142736434937, "eval_runtime": 23.1012, "eval_samples_per_second": 4.329, "eval_steps_per_second": 0.563, "step": 700 }, { "epoch": 0.3818737270875764, "grad_norm": 1.2003265619277954, "learning_rate": 1e-05, "loss": 0.2887, "step": 750 }, { "epoch": 0.4073319755600815, "grad_norm": 1.488318920135498, "learning_rate": 1e-05, "loss": 0.2891, "step": 800 }, { "epoch": 0.4073319755600815, "eval_loss": 0.29836124181747437, "eval_runtime": 22.3121, "eval_samples_per_second": 4.482, "eval_steps_per_second": 0.583, "step": 800 }, { "epoch": 0.43279022403258655, "grad_norm": 0.858514130115509, "learning_rate": 1e-05, "loss": 0.2985, "step": 850 }, { "epoch": 0.45824847250509165, "grad_norm": 0.9991205930709839, "learning_rate": 1e-05, "loss": 0.2877, "step": 900 }, { "epoch": 0.45824847250509165, "eval_loss": 0.2956816554069519, "eval_runtime": 21.6536, "eval_samples_per_second": 4.618, "eval_steps_per_second": 0.6, "step": 900 }, { "epoch": 0.48370672097759676, "grad_norm": 1.130011796951294, "learning_rate": 1e-05, "loss": 0.2831, "step": 950 }, { "epoch": 0.5091649694501018, "grad_norm": 0.97832852602005, "learning_rate": 1e-05, "loss": 0.2844, "step": 1000 }, { "epoch": 0.5091649694501018, "eval_loss": 0.29429855942726135, "eval_runtime": 21.7902, "eval_samples_per_second": 4.589, "eval_steps_per_second": 0.597, "step": 1000 }, { "epoch": 0.5346232179226069, "grad_norm": 1.1608392000198364, "learning_rate": 1e-05, "loss": 0.2802, "step": 1050 }, { "epoch": 0.560081466395112, "grad_norm": 0.9106999635696411, "learning_rate": 1e-05, "loss": 0.2736, "step": 1100 }, { "epoch": 0.560081466395112, "eval_loss": 0.29138484597206116, "eval_runtime": 21.847, "eval_samples_per_second": 4.577, "eval_steps_per_second": 0.595, "step": 1100 }, { "epoch": 0.5855397148676171, "grad_norm": 1.077606201171875, "learning_rate": 1e-05, "loss": 0.2916, "step": 1150 }, { "epoch": 0.6109979633401222, "grad_norm": 1.078594446182251, "learning_rate": 1e-05, "loss": 0.2877, "step": 1200 }, { "epoch": 0.6109979633401222, "eval_loss": 0.2930351495742798, "eval_runtime": 21.8584, "eval_samples_per_second": 4.575, "eval_steps_per_second": 0.595, "step": 1200 }, { "epoch": 0.6364562118126272, "grad_norm": 1.044995665550232, "learning_rate": 1e-05, "loss": 0.2852, "step": 1250 }, { "epoch": 0.6619144602851323, "grad_norm": 1.0691392421722412, "learning_rate": 1e-05, "loss": 0.2914, "step": 1300 }, { "epoch": 0.6619144602851323, "eval_loss": 0.29031333327293396, "eval_runtime": 21.8883, "eval_samples_per_second": 4.569, "eval_steps_per_second": 0.594, "step": 1300 }, { "epoch": 0.6873727087576375, "grad_norm": 1.165562629699707, "learning_rate": 1e-05, "loss": 0.2854, "step": 1350 }, { "epoch": 0.7128309572301426, "grad_norm": 1.1224968433380127, "learning_rate": 1e-05, "loss": 0.2846, "step": 1400 }, { "epoch": 0.7128309572301426, "eval_loss": 0.2897338271141052, "eval_runtime": 22.173, "eval_samples_per_second": 4.51, "eval_steps_per_second": 0.586, "step": 1400 }, { "epoch": 0.7382892057026477, "grad_norm": 0.9951677918434143, "learning_rate": 1e-05, "loss": 0.2754, "step": 1450 }, { "epoch": 0.7637474541751528, "grad_norm": 1.116921305656433, "learning_rate": 1e-05, "loss": 0.2754, "step": 1500 }, { "epoch": 0.7637474541751528, "eval_loss": 0.2867298424243927, "eval_runtime": 21.7838, "eval_samples_per_second": 4.591, "eval_steps_per_second": 0.597, "step": 1500 }, { "epoch": 0.7892057026476579, "grad_norm": 1.104265570640564, "learning_rate": 1e-05, "loss": 0.2824, "step": 1550 }, { "epoch": 0.814663951120163, "grad_norm": 0.8793336749076843, "learning_rate": 1e-05, "loss": 0.2901, "step": 1600 }, { "epoch": 0.814663951120163, "eval_loss": 0.28652000427246094, "eval_runtime": 21.8736, "eval_samples_per_second": 4.572, "eval_steps_per_second": 0.594, "step": 1600 }, { "epoch": 0.840122199592668, "grad_norm": 1.2304877042770386, "learning_rate": 1e-05, "loss": 0.2718, "step": 1650 }, { "epoch": 0.8655804480651731, "grad_norm": 0.9079441428184509, "learning_rate": 1e-05, "loss": 0.2787, "step": 1700 }, { "epoch": 0.8655804480651731, "eval_loss": 0.2839984893798828, "eval_runtime": 21.7844, "eval_samples_per_second": 4.59, "eval_steps_per_second": 0.597, "step": 1700 }, { "epoch": 0.8910386965376782, "grad_norm": 1.359052300453186, "learning_rate": 1e-05, "loss": 0.2703, "step": 1750 }, { "epoch": 0.9164969450101833, "grad_norm": 1.0245873928070068, "learning_rate": 1e-05, "loss": 0.2678, "step": 1800 }, { "epoch": 0.9164969450101833, "eval_loss": 0.2825533151626587, "eval_runtime": 22.022, "eval_samples_per_second": 4.541, "eval_steps_per_second": 0.59, "step": 1800 }, { "epoch": 0.9419551934826884, "grad_norm": 0.9011121988296509, "learning_rate": 1e-05, "loss": 0.2747, "step": 1850 }, { "epoch": 0.9674134419551935, "grad_norm": 1.006032943725586, "learning_rate": 1e-05, "loss": 0.2721, "step": 1900 }, { "epoch": 0.9674134419551935, "eval_loss": 0.2824758291244507, "eval_runtime": 22.1289, "eval_samples_per_second": 4.519, "eval_steps_per_second": 0.587, "step": 1900 }, { "epoch": 0.9928716904276986, "grad_norm": 0.9993594288825989, "learning_rate": 1e-05, "loss": 0.2784, "step": 1950 }, { "epoch": 1.0183299389002036, "grad_norm": 0.9710284471511841, "learning_rate": 1e-05, "loss": 0.2713, "step": 2000 }, { "epoch": 1.0183299389002036, "eval_loss": 0.2820639908313751, "eval_runtime": 21.7507, "eval_samples_per_second": 4.598, "eval_steps_per_second": 0.598, "step": 2000 }, { "epoch": 1.0437881873727088, "grad_norm": 1.2046750783920288, "learning_rate": 1e-05, "loss": 0.2799, "step": 2050 }, { "epoch": 1.0692464358452138, "grad_norm": 0.9969730377197266, "learning_rate": 1e-05, "loss": 0.2842, "step": 2100 }, { "epoch": 1.0692464358452138, "eval_loss": 0.28064805269241333, "eval_runtime": 21.7423, "eval_samples_per_second": 4.599, "eval_steps_per_second": 0.598, "step": 2100 }, { "epoch": 1.094704684317719, "grad_norm": 0.9368526339530945, "learning_rate": 1e-05, "loss": 0.2783, "step": 2150 }, { "epoch": 1.120162932790224, "grad_norm": 1.2995036840438843, "learning_rate": 1e-05, "loss": 0.2781, "step": 2200 }, { "epoch": 1.120162932790224, "eval_loss": 0.2789928913116455, "eval_runtime": 21.6436, "eval_samples_per_second": 4.62, "eval_steps_per_second": 0.601, "step": 2200 }, { "epoch": 1.145621181262729, "grad_norm": 1.2737852334976196, "learning_rate": 1e-05, "loss": 0.2731, "step": 2250 }, { "epoch": 1.1710794297352343, "grad_norm": 1.0202410221099854, "learning_rate": 1e-05, "loss": 0.273, "step": 2300 }, { "epoch": 1.1710794297352343, "eval_loss": 0.27778831124305725, "eval_runtime": 21.9156, "eval_samples_per_second": 4.563, "eval_steps_per_second": 0.593, "step": 2300 }, { "epoch": 1.1965376782077393, "grad_norm": 0.9710997939109802, "learning_rate": 1e-05, "loss": 0.2578, "step": 2350 }, { "epoch": 1.2219959266802445, "grad_norm": 0.86209636926651, "learning_rate": 1e-05, "loss": 0.2714, "step": 2400 }, { "epoch": 1.2219959266802445, "eval_loss": 0.27559801936149597, "eval_runtime": 21.9259, "eval_samples_per_second": 4.561, "eval_steps_per_second": 0.593, "step": 2400 }, { "epoch": 1.2474541751527495, "grad_norm": 1.0652376413345337, "learning_rate": 1e-05, "loss": 0.265, "step": 2450 }, { "epoch": 1.2729124236252547, "grad_norm": 1.002944827079773, "learning_rate": 1e-05, "loss": 0.2536, "step": 2500 }, { "epoch": 1.2729124236252547, "eval_loss": 0.2747356593608856, "eval_runtime": 21.9091, "eval_samples_per_second": 4.564, "eval_steps_per_second": 0.593, "step": 2500 }, { "epoch": 1.2983706720977597, "grad_norm": 1.0743255615234375, "learning_rate": 1e-05, "loss": 0.2742, "step": 2550 }, { "epoch": 1.3238289205702647, "grad_norm": 1.2174959182739258, "learning_rate": 1e-05, "loss": 0.268, "step": 2600 }, { "epoch": 1.3238289205702647, "eval_loss": 0.27502280473709106, "eval_runtime": 21.9371, "eval_samples_per_second": 4.558, "eval_steps_per_second": 0.593, "step": 2600 }, { "epoch": 1.34928716904277, "grad_norm": 1.1314553022384644, "learning_rate": 1e-05, "loss": 0.26, "step": 2650 }, { "epoch": 1.374745417515275, "grad_norm": 1.007804036140442, "learning_rate": 1e-05, "loss": 0.2776, "step": 2700 }, { "epoch": 1.374745417515275, "eval_loss": 0.27191075682640076, "eval_runtime": 21.9972, "eval_samples_per_second": 4.546, "eval_steps_per_second": 0.591, "step": 2700 }, { "epoch": 1.4002036659877801, "grad_norm": 1.1400426626205444, "learning_rate": 1e-05, "loss": 0.2577, "step": 2750 }, { "epoch": 1.4256619144602851, "grad_norm": 0.9301505088806152, "learning_rate": 1e-05, "loss": 0.2726, "step": 2800 }, { "epoch": 1.4256619144602851, "eval_loss": 0.2725023925304413, "eval_runtime": 21.722, "eval_samples_per_second": 4.604, "eval_steps_per_second": 0.598, "step": 2800 }, { "epoch": 1.4511201629327903, "grad_norm": 1.142259120941162, "learning_rate": 1e-05, "loss": 0.2635, "step": 2850 }, { "epoch": 1.4765784114052953, "grad_norm": 1.1009142398834229, "learning_rate": 1e-05, "loss": 0.2572, "step": 2900 }, { "epoch": 1.4765784114052953, "eval_loss": 0.2723616063594818, "eval_runtime": 21.8662, "eval_samples_per_second": 4.573, "eval_steps_per_second": 0.595, "step": 2900 }, { "epoch": 1.5020366598778003, "grad_norm": 1.2985098361968994, "learning_rate": 1e-05, "loss": 0.2477, "step": 2950 }, { "epoch": 1.5274949083503055, "grad_norm": 1.2853788137435913, "learning_rate": 1e-05, "loss": 0.2518, "step": 3000 }, { "epoch": 1.5274949083503055, "eval_loss": 0.2711648643016815, "eval_runtime": 22.1089, "eval_samples_per_second": 4.523, "eval_steps_per_second": 0.588, "step": 3000 }, { "epoch": 1.5529531568228105, "grad_norm": 1.3646196126937866, "learning_rate": 1e-05, "loss": 0.2545, "step": 3050 }, { "epoch": 1.5784114052953155, "grad_norm": 1.0580254793167114, "learning_rate": 1e-05, "loss": 0.2665, "step": 3100 }, { "epoch": 1.5784114052953155, "eval_loss": 0.27152860164642334, "eval_runtime": 21.563, "eval_samples_per_second": 4.638, "eval_steps_per_second": 0.603, "step": 3100 }, { "epoch": 1.6038696537678208, "grad_norm": 1.529466152191162, "learning_rate": 1e-05, "loss": 0.2589, "step": 3150 }, { "epoch": 1.629327902240326, "grad_norm": 1.0657099485397339, "learning_rate": 1e-05, "loss": 0.2559, "step": 3200 }, { "epoch": 1.629327902240326, "eval_loss": 0.2696399390697479, "eval_runtime": 21.6487, "eval_samples_per_second": 4.619, "eval_steps_per_second": 0.6, "step": 3200 }, { "epoch": 1.654786150712831, "grad_norm": 1.0233310461044312, "learning_rate": 1e-05, "loss": 0.2654, "step": 3250 }, { "epoch": 1.680244399185336, "grad_norm": 1.151859998703003, "learning_rate": 1e-05, "loss": 0.2609, "step": 3300 }, { "epoch": 1.680244399185336, "eval_loss": 0.26971447467803955, "eval_runtime": 21.8279, "eval_samples_per_second": 4.581, "eval_steps_per_second": 0.596, "step": 3300 }, { "epoch": 1.7057026476578412, "grad_norm": 1.1169921159744263, "learning_rate": 1e-05, "loss": 0.266, "step": 3350 }, { "epoch": 1.7311608961303462, "grad_norm": 0.9375786185264587, "learning_rate": 1e-05, "loss": 0.2603, "step": 3400 }, { "epoch": 1.7311608961303462, "eval_loss": 0.26760581135749817, "eval_runtime": 21.6655, "eval_samples_per_second": 4.616, "eval_steps_per_second": 0.6, "step": 3400 }, { "epoch": 1.7566191446028512, "grad_norm": 1.0559574365615845, "learning_rate": 1e-05, "loss": 0.2644, "step": 3450 }, { "epoch": 1.7820773930753564, "grad_norm": 1.1326260566711426, "learning_rate": 1e-05, "loss": 0.2448, "step": 3500 }, { "epoch": 1.7820773930753564, "eval_loss": 0.2665054500102997, "eval_runtime": 21.6484, "eval_samples_per_second": 4.619, "eval_steps_per_second": 0.601, "step": 3500 }, { "epoch": 1.8075356415478616, "grad_norm": 1.396811842918396, "learning_rate": 1e-05, "loss": 0.252, "step": 3550 }, { "epoch": 1.8329938900203666, "grad_norm": 1.6235796213150024, "learning_rate": 1e-05, "loss": 0.2587, "step": 3600 }, { "epoch": 1.8329938900203666, "eval_loss": 0.2665034234523773, "eval_runtime": 21.9855, "eval_samples_per_second": 4.548, "eval_steps_per_second": 0.591, "step": 3600 }, { "epoch": 1.8584521384928716, "grad_norm": 1.2640048265457153, "learning_rate": 1e-05, "loss": 0.2525, "step": 3650 }, { "epoch": 1.8839103869653768, "grad_norm": 1.1669272184371948, "learning_rate": 1e-05, "loss": 0.2534, "step": 3700 }, { "epoch": 1.8839103869653768, "eval_loss": 0.26496145129203796, "eval_runtime": 22.0324, "eval_samples_per_second": 4.539, "eval_steps_per_second": 0.59, "step": 3700 }, { "epoch": 1.9093686354378818, "grad_norm": 1.2015262842178345, "learning_rate": 1e-05, "loss": 0.2532, "step": 3750 }, { "epoch": 1.9348268839103868, "grad_norm": 1.0683043003082275, "learning_rate": 1e-05, "loss": 0.2496, "step": 3800 }, { "epoch": 1.9348268839103868, "eval_loss": 0.2648490369319916, "eval_runtime": 21.9752, "eval_samples_per_second": 4.551, "eval_steps_per_second": 0.592, "step": 3800 }, { "epoch": 1.960285132382892, "grad_norm": 1.4562475681304932, "learning_rate": 1e-05, "loss": 0.242, "step": 3850 }, { "epoch": 1.9857433808553973, "grad_norm": 0.9929770231246948, "learning_rate": 1e-05, "loss": 0.2528, "step": 3900 }, { "epoch": 1.9857433808553973, "eval_loss": 0.263571560382843, "eval_runtime": 21.7216, "eval_samples_per_second": 4.604, "eval_steps_per_second": 0.598, "step": 3900 }, { "epoch": 2.011201629327902, "grad_norm": 1.0417041778564453, "learning_rate": 1e-05, "loss": 0.2452, "step": 3950 }, { "epoch": 2.0366598778004072, "grad_norm": 1.0510022640228271, "learning_rate": 1e-05, "loss": 0.2652, "step": 4000 }, { "epoch": 2.0366598778004072, "eval_loss": 0.262724906206131, "eval_runtime": 22.0633, "eval_samples_per_second": 4.532, "eval_steps_per_second": 0.589, "step": 4000 }, { "epoch": 2.0621181262729125, "grad_norm": 1.383092999458313, "learning_rate": 1e-05, "loss": 0.2369, "step": 4050 }, { "epoch": 2.0875763747454177, "grad_norm": 1.3613831996917725, "learning_rate": 1e-05, "loss": 0.2493, "step": 4100 }, { "epoch": 2.0875763747454177, "eval_loss": 0.2625581622123718, "eval_runtime": 21.7251, "eval_samples_per_second": 4.603, "eval_steps_per_second": 0.598, "step": 4100 }, { "epoch": 2.1130346232179225, "grad_norm": 1.0267040729522705, "learning_rate": 1e-05, "loss": 0.2511, "step": 4150 }, { "epoch": 2.1384928716904277, "grad_norm": 0.9748584032058716, "learning_rate": 1e-05, "loss": 0.2383, "step": 4200 }, { "epoch": 2.1384928716904277, "eval_loss": 0.26004984974861145, "eval_runtime": 21.6797, "eval_samples_per_second": 4.613, "eval_steps_per_second": 0.6, "step": 4200 }, { "epoch": 2.163951120162933, "grad_norm": 1.3859432935714722, "learning_rate": 1e-05, "loss": 0.2544, "step": 4250 }, { "epoch": 2.189409368635438, "grad_norm": 1.598718285560608, "learning_rate": 1e-05, "loss": 0.2527, "step": 4300 }, { "epoch": 2.189409368635438, "eval_loss": 0.25936272740364075, "eval_runtime": 21.803, "eval_samples_per_second": 4.587, "eval_steps_per_second": 0.596, "step": 4300 }, { "epoch": 2.214867617107943, "grad_norm": 0.9896050691604614, "learning_rate": 1e-05, "loss": 0.2423, "step": 4350 }, { "epoch": 2.240325865580448, "grad_norm": 1.4134578704833984, "learning_rate": 1e-05, "loss": 0.2446, "step": 4400 }, { "epoch": 2.240325865580448, "eval_loss": 0.2597595751285553, "eval_runtime": 21.9078, "eval_samples_per_second": 4.565, "eval_steps_per_second": 0.593, "step": 4400 }, { "epoch": 2.2657841140529533, "grad_norm": 1.1840572357177734, "learning_rate": 1e-05, "loss": 0.251, "step": 4450 }, { "epoch": 2.291242362525458, "grad_norm": 1.3326150178909302, "learning_rate": 1e-05, "loss": 0.2504, "step": 4500 }, { "epoch": 2.291242362525458, "eval_loss": 0.2583908438682556, "eval_runtime": 21.7836, "eval_samples_per_second": 4.591, "eval_steps_per_second": 0.597, "step": 4500 }, { "epoch": 2.3167006109979633, "grad_norm": 1.4150619506835938, "learning_rate": 1e-05, "loss": 0.2481, "step": 4550 }, { "epoch": 2.3421588594704685, "grad_norm": 1.4056681394577026, "learning_rate": 1e-05, "loss": 0.2474, "step": 4600 }, { "epoch": 2.3421588594704685, "eval_loss": 0.25896069407463074, "eval_runtime": 21.9165, "eval_samples_per_second": 4.563, "eval_steps_per_second": 0.593, "step": 4600 }, { "epoch": 2.3676171079429738, "grad_norm": 1.1109027862548828, "learning_rate": 1e-05, "loss": 0.2447, "step": 4650 }, { "epoch": 2.3930753564154785, "grad_norm": 1.1159225702285767, "learning_rate": 1e-05, "loss": 0.2482, "step": 4700 }, { "epoch": 2.3930753564154785, "eval_loss": 0.2578243911266327, "eval_runtime": 21.5943, "eval_samples_per_second": 4.631, "eval_steps_per_second": 0.602, "step": 4700 }, { "epoch": 2.4185336048879837, "grad_norm": 1.1924070119857788, "learning_rate": 1e-05, "loss": 0.2506, "step": 4750 }, { "epoch": 2.443991853360489, "grad_norm": 1.5512938499450684, "learning_rate": 1e-05, "loss": 0.2405, "step": 4800 }, { "epoch": 2.443991853360489, "eval_loss": 0.2575813829898834, "eval_runtime": 22.0278, "eval_samples_per_second": 4.54, "eval_steps_per_second": 0.59, "step": 4800 }, { "epoch": 2.4694501018329937, "grad_norm": 1.0490576028823853, "learning_rate": 1e-05, "loss": 0.2476, "step": 4850 }, { "epoch": 2.494908350305499, "grad_norm": 1.0198191404342651, "learning_rate": 1e-05, "loss": 0.2399, "step": 4900 }, { "epoch": 2.494908350305499, "eval_loss": 0.25637081265449524, "eval_runtime": 21.9152, "eval_samples_per_second": 4.563, "eval_steps_per_second": 0.593, "step": 4900 }, { "epoch": 2.520366598778004, "grad_norm": 1.122515320777893, "learning_rate": 1e-05, "loss": 0.2501, "step": 4950 }, { "epoch": 2.5458248472505094, "grad_norm": 1.082686424255371, "learning_rate": 1e-05, "loss": 0.2443, "step": 5000 }, { "epoch": 2.5458248472505094, "eval_loss": 0.25523728132247925, "eval_runtime": 21.0107, "eval_samples_per_second": 4.759, "eval_steps_per_second": 0.619, "step": 5000 }, { "epoch": 2.571283095723014, "grad_norm": 1.0135226249694824, "learning_rate": 1e-05, "loss": 0.2414, "step": 5050 }, { "epoch": 2.5967413441955194, "grad_norm": 0.9929371476173401, "learning_rate": 1e-05, "loss": 0.248, "step": 5100 }, { "epoch": 2.5967413441955194, "eval_loss": 0.2532651722431183, "eval_runtime": 21.3518, "eval_samples_per_second": 4.683, "eval_steps_per_second": 0.609, "step": 5100 }, { "epoch": 2.6221995926680246, "grad_norm": 1.1128815412521362, "learning_rate": 1e-05, "loss": 0.2375, "step": 5150 }, { "epoch": 2.6476578411405294, "grad_norm": 1.5581951141357422, "learning_rate": 1e-05, "loss": 0.2432, "step": 5200 }, { "epoch": 2.6476578411405294, "eval_loss": 0.2521425485610962, "eval_runtime": 21.0088, "eval_samples_per_second": 4.76, "eval_steps_per_second": 0.619, "step": 5200 }, { "epoch": 2.6731160896130346, "grad_norm": 1.1291751861572266, "learning_rate": 1e-05, "loss": 0.2415, "step": 5250 }, { "epoch": 2.69857433808554, "grad_norm": 1.139137625694275, "learning_rate": 1e-05, "loss": 0.2398, "step": 5300 }, { "epoch": 2.69857433808554, "eval_loss": 0.25268828868865967, "eval_runtime": 21.7715, "eval_samples_per_second": 4.593, "eval_steps_per_second": 0.597, "step": 5300 }, { "epoch": 2.724032586558045, "grad_norm": 1.0960917472839355, "learning_rate": 1e-05, "loss": 0.2397, "step": 5350 }, { "epoch": 2.74949083503055, "grad_norm": 1.4622306823730469, "learning_rate": 1e-05, "loss": 0.2371, "step": 5400 }, { "epoch": 2.74949083503055, "eval_loss": 0.25145551562309265, "eval_runtime": 21.3534, "eval_samples_per_second": 4.683, "eval_steps_per_second": 0.609, "step": 5400 }, { "epoch": 2.774949083503055, "grad_norm": 1.0508596897125244, "learning_rate": 1e-05, "loss": 0.2389, "step": 5450 }, { "epoch": 2.8004073319755602, "grad_norm": 0.8241857886314392, "learning_rate": 1e-05, "loss": 0.2403, "step": 5500 }, { "epoch": 2.8004073319755602, "eval_loss": 0.25168588757514954, "eval_runtime": 21.2395, "eval_samples_per_second": 4.708, "eval_steps_per_second": 0.612, "step": 5500 }, { "epoch": 2.825865580448065, "grad_norm": 1.214141845703125, "learning_rate": 1e-05, "loss": 0.2406, "step": 5550 }, { "epoch": 2.8513238289205702, "grad_norm": 1.3861531019210815, "learning_rate": 1e-05, "loss": 0.2479, "step": 5600 }, { "epoch": 2.8513238289205702, "eval_loss": 0.24935197830200195, "eval_runtime": 21.4872, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.605, "step": 5600 }, { "epoch": 2.8767820773930755, "grad_norm": 1.1861828565597534, "learning_rate": 1e-05, "loss": 0.2318, "step": 5650 }, { "epoch": 2.9022403258655807, "grad_norm": 1.1281813383102417, "learning_rate": 1e-05, "loss": 0.2413, "step": 5700 }, { "epoch": 2.9022403258655807, "eval_loss": 0.25039103627204895, "eval_runtime": 20.9675, "eval_samples_per_second": 4.769, "eval_steps_per_second": 0.62, "step": 5700 }, { "epoch": 2.9276985743380854, "grad_norm": 1.1463491916656494, "learning_rate": 1e-05, "loss": 0.2364, "step": 5750 }, { "epoch": 2.9531568228105907, "grad_norm": 1.3118984699249268, "learning_rate": 1e-05, "loss": 0.2388, "step": 5800 }, { "epoch": 2.9531568228105907, "eval_loss": 0.24982962012290955, "eval_runtime": 21.5908, "eval_samples_per_second": 4.632, "eval_steps_per_second": 0.602, "step": 5800 }, { "epoch": 2.978615071283096, "grad_norm": 1.0851026773452759, "learning_rate": 1e-05, "loss": 0.2348, "step": 5850 } ], "logging_steps": 50, "max_steps": 5891, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.421089311391744e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }