|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.7811457624923088, |
|
"eval_steps": 1000, |
|
"global_step": 55000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016192234204475534, |
|
"grad_norm": 2.899235963821411, |
|
"learning_rate": 4.9919038828977625e-05, |
|
"loss": 7.6546, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03238446840895107, |
|
"grad_norm": 3.1383473873138428, |
|
"learning_rate": 4.983807765795525e-05, |
|
"loss": 7.0771, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03238446840895107, |
|
"eval_loss": 10.114262580871582, |
|
"eval_runtime": 20.3013, |
|
"eval_samples_per_second": 2704.02, |
|
"eval_steps_per_second": 169.004, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0485767026134266, |
|
"grad_norm": 3.265001058578491, |
|
"learning_rate": 4.975711648693287e-05, |
|
"loss": 6.9745, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.06476893681790213, |
|
"grad_norm": 4.482234477996826, |
|
"learning_rate": 4.967615531591049e-05, |
|
"loss": 6.8578, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.06476893681790213, |
|
"eval_loss": 10.50239086151123, |
|
"eval_runtime": 20.2747, |
|
"eval_samples_per_second": 2707.566, |
|
"eval_steps_per_second": 169.226, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.08096117102237767, |
|
"grad_norm": 3.455843925476074, |
|
"learning_rate": 4.9595194144888116e-05, |
|
"loss": 6.8529, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.0971534052268532, |
|
"grad_norm": 4.888940811157227, |
|
"learning_rate": 4.951423297386574e-05, |
|
"loss": 6.7865, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0971534052268532, |
|
"eval_loss": 11.206120491027832, |
|
"eval_runtime": 20.2787, |
|
"eval_samples_per_second": 2707.023, |
|
"eval_steps_per_second": 169.192, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.11334563943132873, |
|
"grad_norm": 2.9272000789642334, |
|
"learning_rate": 4.9433271802843354e-05, |
|
"loss": 6.7582, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.12953787363580427, |
|
"grad_norm": 3.68345046043396, |
|
"learning_rate": 4.9352310631820984e-05, |
|
"loss": 6.6886, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.12953787363580427, |
|
"eval_loss": 11.919740676879883, |
|
"eval_runtime": 20.2985, |
|
"eval_samples_per_second": 2704.382, |
|
"eval_steps_per_second": 169.027, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.14573010784027982, |
|
"grad_norm": 3.9165866374969482, |
|
"learning_rate": 4.927151138314065e-05, |
|
"loss": 6.657, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.16192234204475534, |
|
"grad_norm": 4.2097015380859375, |
|
"learning_rate": 4.9190550212118266e-05, |
|
"loss": 6.6567, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.16192234204475534, |
|
"eval_loss": 11.381248474121094, |
|
"eval_runtime": 20.3121, |
|
"eval_samples_per_second": 2702.582, |
|
"eval_steps_per_second": 168.914, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.17811457624923088, |
|
"grad_norm": 3.525871753692627, |
|
"learning_rate": 4.9109589041095895e-05, |
|
"loss": 6.6297, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.1943068104537064, |
|
"grad_norm": 5.512066841125488, |
|
"learning_rate": 4.902862787007351e-05, |
|
"loss": 6.6049, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.1943068104537064, |
|
"eval_loss": 11.815380096435547, |
|
"eval_runtime": 20.3269, |
|
"eval_samples_per_second": 2700.605, |
|
"eval_steps_per_second": 168.791, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.21049904465818195, |
|
"grad_norm": 6.878340244293213, |
|
"learning_rate": 4.8947828621393185e-05, |
|
"loss": 6.5587, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.22669127886265747, |
|
"grad_norm": 3.512578248977661, |
|
"learning_rate": 4.886686745037081e-05, |
|
"loss": 6.5451, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.22669127886265747, |
|
"eval_loss": 11.993084907531738, |
|
"eval_runtime": 20.3217, |
|
"eval_samples_per_second": 2701.3, |
|
"eval_steps_per_second": 168.834, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.24288351306713302, |
|
"grad_norm": 4.394603729248047, |
|
"learning_rate": 4.8786068201690474e-05, |
|
"loss": 6.5283, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.25907574727160854, |
|
"grad_norm": 3.8106327056884766, |
|
"learning_rate": 4.8705107030668096e-05, |
|
"loss": 6.5246, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.25907574727160854, |
|
"eval_loss": 12.192233085632324, |
|
"eval_runtime": 20.3069, |
|
"eval_samples_per_second": 2703.263, |
|
"eval_steps_per_second": 168.957, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.27526798147608406, |
|
"grad_norm": 4.158311367034912, |
|
"learning_rate": 4.862414585964571e-05, |
|
"loss": 6.4727, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.29146021568055963, |
|
"grad_norm": 3.797197103500366, |
|
"learning_rate": 4.854318468862334e-05, |
|
"loss": 6.4594, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.29146021568055963, |
|
"eval_loss": 12.250760078430176, |
|
"eval_runtime": 20.2943, |
|
"eval_samples_per_second": 2704.945, |
|
"eval_steps_per_second": 169.062, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.30765244988503515, |
|
"grad_norm": 4.4851908683776855, |
|
"learning_rate": 4.8462223517600964e-05, |
|
"loss": 6.4441, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.32384468408951067, |
|
"grad_norm": 5.158078670501709, |
|
"learning_rate": 4.838142426892063e-05, |
|
"loss": 6.4537, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.32384468408951067, |
|
"eval_loss": 12.235699653625488, |
|
"eval_runtime": 20.3426, |
|
"eval_samples_per_second": 2698.522, |
|
"eval_steps_per_second": 168.661, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.3400369182939862, |
|
"grad_norm": 4.838212966918945, |
|
"learning_rate": 4.8300463097898254e-05, |
|
"loss": 6.4313, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.35622915249846177, |
|
"grad_norm": 4.873743534088135, |
|
"learning_rate": 4.821950192687587e-05, |
|
"loss": 6.4375, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.35622915249846177, |
|
"eval_loss": 13.167688369750977, |
|
"eval_runtime": 20.3858, |
|
"eval_samples_per_second": 2692.805, |
|
"eval_steps_per_second": 168.303, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.3724213867029373, |
|
"grad_norm": 5.022457122802734, |
|
"learning_rate": 4.813854075585349e-05, |
|
"loss": 6.4249, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.3886136209074128, |
|
"grad_norm": 4.3410964012146, |
|
"learning_rate": 4.805774150717316e-05, |
|
"loss": 6.3944, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.3886136209074128, |
|
"eval_loss": 12.753792762756348, |
|
"eval_runtime": 20.3234, |
|
"eval_samples_per_second": 2701.08, |
|
"eval_steps_per_second": 168.821, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.4048058551118883, |
|
"grad_norm": 4.282736301422119, |
|
"learning_rate": 4.797678033615078e-05, |
|
"loss": 6.3343, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.4209980893163639, |
|
"grad_norm": 4.2941484451293945, |
|
"learning_rate": 4.789581916512841e-05, |
|
"loss": 6.3576, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.4209980893163639, |
|
"eval_loss": 14.182137489318848, |
|
"eval_runtime": 20.2915, |
|
"eval_samples_per_second": 2705.322, |
|
"eval_steps_per_second": 169.086, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.4371903235208394, |
|
"grad_norm": 4.584780693054199, |
|
"learning_rate": 4.7814857994106027e-05, |
|
"loss": 6.3133, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.45338255772531494, |
|
"grad_norm": 6.5496134757995605, |
|
"learning_rate": 4.77340587454257e-05, |
|
"loss": 6.3042, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.45338255772531494, |
|
"eval_loss": 13.6593656539917, |
|
"eval_runtime": 20.2751, |
|
"eval_samples_per_second": 2707.503, |
|
"eval_steps_per_second": 169.222, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.46957479192979046, |
|
"grad_norm": 4.188933849334717, |
|
"learning_rate": 4.7653097574403316e-05, |
|
"loss": 6.2764, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.48576702613426603, |
|
"grad_norm": 4.540693283081055, |
|
"learning_rate": 4.757213640338094e-05, |
|
"loss": 6.261, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.48576702613426603, |
|
"eval_loss": 13.340327262878418, |
|
"eval_runtime": 20.2957, |
|
"eval_samples_per_second": 2704.755, |
|
"eval_steps_per_second": 169.05, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5019592603387415, |
|
"grad_norm": 4.090181827545166, |
|
"learning_rate": 4.749117523235857e-05, |
|
"loss": 6.2382, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.5181514945432171, |
|
"grad_norm": 5.604825973510742, |
|
"learning_rate": 4.7410214061336184e-05, |
|
"loss": 6.2225, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.5181514945432171, |
|
"eval_loss": 14.243009567260742, |
|
"eval_runtime": 20.3274, |
|
"eval_samples_per_second": 2700.537, |
|
"eval_steps_per_second": 168.787, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.5343437287476926, |
|
"grad_norm": 5.334819316864014, |
|
"learning_rate": 4.732941481265585e-05, |
|
"loss": 6.2051, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.5505359629521681, |
|
"grad_norm": 5.907509803771973, |
|
"learning_rate": 4.724845364163347e-05, |
|
"loss": 6.1475, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.5505359629521681, |
|
"eval_loss": 14.208978652954102, |
|
"eval_runtime": 20.3636, |
|
"eval_samples_per_second": 2695.742, |
|
"eval_steps_per_second": 168.487, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.5667281971566437, |
|
"grad_norm": 5.113221645355225, |
|
"learning_rate": 4.7167492470611096e-05, |
|
"loss": 6.1765, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.5829204313611193, |
|
"grad_norm": 5.8240461349487305, |
|
"learning_rate": 4.708653129958872e-05, |
|
"loss": 6.1567, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.5829204313611193, |
|
"eval_loss": 14.480341911315918, |
|
"eval_runtime": 20.3098, |
|
"eval_samples_per_second": 2702.888, |
|
"eval_steps_per_second": 168.934, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.5991126655655947, |
|
"grad_norm": 4.948818683624268, |
|
"learning_rate": 4.7005732050908385e-05, |
|
"loss": 6.1198, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.6153048997700703, |
|
"grad_norm": 5.95670223236084, |
|
"learning_rate": 4.692477087988601e-05, |
|
"loss": 6.1038, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.6153048997700703, |
|
"eval_loss": 14.150490760803223, |
|
"eval_runtime": 20.2957, |
|
"eval_samples_per_second": 2704.759, |
|
"eval_steps_per_second": 169.051, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.6314971339745458, |
|
"grad_norm": 7.957947731018066, |
|
"learning_rate": 4.684380970886363e-05, |
|
"loss": 6.0658, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.6476893681790213, |
|
"grad_norm": 6.398547649383545, |
|
"learning_rate": 4.676284853784125e-05, |
|
"loss": 6.007, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.6476893681790213, |
|
"eval_loss": 13.682233810424805, |
|
"eval_runtime": 20.2919, |
|
"eval_samples_per_second": 2705.271, |
|
"eval_steps_per_second": 169.083, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.6638816023834969, |
|
"grad_norm": 7.564326763153076, |
|
"learning_rate": 4.6681887366818875e-05, |
|
"loss": 6.0063, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.6800738365879724, |
|
"grad_norm": 6.847624778747559, |
|
"learning_rate": 4.66009261957965e-05, |
|
"loss": 5.9528, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.6800738365879724, |
|
"eval_loss": 13.830253601074219, |
|
"eval_runtime": 20.3183, |
|
"eval_samples_per_second": 2701.748, |
|
"eval_steps_per_second": 168.862, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.696266070792448, |
|
"grad_norm": 6.5545125007629395, |
|
"learning_rate": 4.651996502477412e-05, |
|
"loss": 5.9257, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.7124583049969235, |
|
"grad_norm": 8.006608009338379, |
|
"learning_rate": 4.643900385375174e-05, |
|
"loss": 5.9066, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.7124583049969235, |
|
"eval_loss": 14.178547859191895, |
|
"eval_runtime": 20.3334, |
|
"eval_samples_per_second": 2699.748, |
|
"eval_steps_per_second": 168.737, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.728650539201399, |
|
"grad_norm": 6.9318766593933105, |
|
"learning_rate": 4.635820460507141e-05, |
|
"loss": 5.8562, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.7448427734058746, |
|
"grad_norm": 6.815021991729736, |
|
"learning_rate": 4.627724343404903e-05, |
|
"loss": 5.7984, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.7448427734058746, |
|
"eval_loss": 14.005717277526855, |
|
"eval_runtime": 20.3208, |
|
"eval_samples_per_second": 2701.413, |
|
"eval_steps_per_second": 168.841, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.76103500761035, |
|
"grad_norm": 7.4206976890563965, |
|
"learning_rate": 4.6196282263026655e-05, |
|
"loss": 5.7574, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.7772272418148256, |
|
"grad_norm": 7.273144721984863, |
|
"learning_rate": 4.611532109200428e-05, |
|
"loss": 5.7247, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.7772272418148256, |
|
"eval_loss": 13.36209774017334, |
|
"eval_runtime": 20.3187, |
|
"eval_samples_per_second": 2701.693, |
|
"eval_steps_per_second": 168.859, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.7934194760193012, |
|
"grad_norm": 8.667037963867188, |
|
"learning_rate": 4.6034521843323944e-05, |
|
"loss": 5.682, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.8096117102237766, |
|
"grad_norm": 9.467761039733887, |
|
"learning_rate": 4.595356067230157e-05, |
|
"loss": 5.6262, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.8096117102237766, |
|
"eval_loss": 14.30241584777832, |
|
"eval_runtime": 20.3265, |
|
"eval_samples_per_second": 2700.664, |
|
"eval_steps_per_second": 168.795, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.8258039444282522, |
|
"grad_norm": 7.378128528594971, |
|
"learning_rate": 4.587259950127919e-05, |
|
"loss": 5.5776, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.8419961786327278, |
|
"grad_norm": 8.246053695678711, |
|
"learning_rate": 4.5791638330256806e-05, |
|
"loss": 5.5251, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.8419961786327278, |
|
"eval_loss": 14.098834037780762, |
|
"eval_runtime": 20.3189, |
|
"eval_samples_per_second": 2701.67, |
|
"eval_steps_per_second": 168.857, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.8581884128372033, |
|
"grad_norm": 6.545131206512451, |
|
"learning_rate": 4.571083908157648e-05, |
|
"loss": 5.4848, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.8743806470416788, |
|
"grad_norm": 7.735929489135742, |
|
"learning_rate": 4.56298779105541e-05, |
|
"loss": 5.4504, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.8743806470416788, |
|
"eval_loss": 13.383395195007324, |
|
"eval_runtime": 20.3058, |
|
"eval_samples_per_second": 2703.417, |
|
"eval_steps_per_second": 168.967, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.8905728812461543, |
|
"grad_norm": 7.53142786026001, |
|
"learning_rate": 4.5548916739531724e-05, |
|
"loss": 5.4094, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.9067651154506299, |
|
"grad_norm": 6.753902912139893, |
|
"learning_rate": 4.546795556850935e-05, |
|
"loss": 5.382, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.9067651154506299, |
|
"eval_loss": 13.9662446975708, |
|
"eval_runtime": 20.3511, |
|
"eval_samples_per_second": 2697.397, |
|
"eval_steps_per_second": 168.59, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.9229573496551055, |
|
"grad_norm": 7.170286655426025, |
|
"learning_rate": 4.538715631982901e-05, |
|
"loss": 5.3786, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.9391495838595809, |
|
"grad_norm": 7.388674736022949, |
|
"learning_rate": 4.5306195148806636e-05, |
|
"loss": 5.3468, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.9391495838595809, |
|
"eval_loss": 14.00700855255127, |
|
"eval_runtime": 20.2945, |
|
"eval_samples_per_second": 2704.924, |
|
"eval_steps_per_second": 169.061, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.9553418180640565, |
|
"grad_norm": 8.680002212524414, |
|
"learning_rate": 4.522523397778426e-05, |
|
"loss": 5.3159, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.9715340522685321, |
|
"grad_norm": 6.966347694396973, |
|
"learning_rate": 4.514427280676188e-05, |
|
"loss": 5.3321, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.9715340522685321, |
|
"eval_loss": 13.2510986328125, |
|
"eval_runtime": 20.2734, |
|
"eval_samples_per_second": 2707.742, |
|
"eval_steps_per_second": 169.237, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.9877262864730075, |
|
"grad_norm": 7.675726413726807, |
|
"learning_rate": 4.506347355808155e-05, |
|
"loss": 5.2547, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.003918520677483, |
|
"grad_norm": 9.16435432434082, |
|
"learning_rate": 4.4982512387059164e-05, |
|
"loss": 5.2597, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.003918520677483, |
|
"eval_loss": 13.722345352172852, |
|
"eval_runtime": 20.3123, |
|
"eval_samples_per_second": 2702.549, |
|
"eval_steps_per_second": 168.912, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.0201107548819586, |
|
"grad_norm": 8.053180694580078, |
|
"learning_rate": 4.490155121603679e-05, |
|
"loss": 5.2168, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.0363029890864341, |
|
"grad_norm": 7.8324432373046875, |
|
"learning_rate": 4.4820590045014416e-05, |
|
"loss": 5.2229, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.0363029890864341, |
|
"eval_loss": 13.410867691040039, |
|
"eval_runtime": 20.3235, |
|
"eval_samples_per_second": 2701.057, |
|
"eval_steps_per_second": 168.819, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.0524952232909097, |
|
"grad_norm": 6.322490692138672, |
|
"learning_rate": 4.473979079633408e-05, |
|
"loss": 5.1849, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 1.0686874574953853, |
|
"grad_norm": 7.511089324951172, |
|
"learning_rate": 4.4658829625311705e-05, |
|
"loss": 5.1838, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.0686874574953853, |
|
"eval_loss": 13.21849250793457, |
|
"eval_runtime": 20.3207, |
|
"eval_samples_per_second": 2701.433, |
|
"eval_steps_per_second": 168.843, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.0848796916998606, |
|
"grad_norm": 7.725784778594971, |
|
"learning_rate": 4.457786845428932e-05, |
|
"loss": 5.1687, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.1010719259043362, |
|
"grad_norm": 7.167893886566162, |
|
"learning_rate": 4.449690728326695e-05, |
|
"loss": 5.157, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.1010719259043362, |
|
"eval_loss": 13.764806747436523, |
|
"eval_runtime": 20.3095, |
|
"eval_samples_per_second": 2702.924, |
|
"eval_steps_per_second": 168.936, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.1172641601088118, |
|
"grad_norm": 9.298649787902832, |
|
"learning_rate": 4.441610803458661e-05, |
|
"loss": 5.1152, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.1334563943132874, |
|
"grad_norm": 7.694969654083252, |
|
"learning_rate": 4.433514686356424e-05, |
|
"loss": 5.0736, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.1334563943132874, |
|
"eval_loss": 13.989611625671387, |
|
"eval_runtime": 20.324, |
|
"eval_samples_per_second": 2700.988, |
|
"eval_steps_per_second": 168.815, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.149648628517763, |
|
"grad_norm": 7.617096424102783, |
|
"learning_rate": 4.425418569254186e-05, |
|
"loss": 5.0596, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 1.1658408627222383, |
|
"grad_norm": 6.822023391723633, |
|
"learning_rate": 4.417322452151948e-05, |
|
"loss": 5.0544, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.1658408627222383, |
|
"eval_loss": 14.167996406555176, |
|
"eval_runtime": 20.2953, |
|
"eval_samples_per_second": 2704.815, |
|
"eval_steps_per_second": 169.054, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.1820330969267139, |
|
"grad_norm": 6.709229946136475, |
|
"learning_rate": 4.40922633504971e-05, |
|
"loss": 5.0675, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 1.1982253311311895, |
|
"grad_norm": 8.224237442016602, |
|
"learning_rate": 4.401162602415881e-05, |
|
"loss": 4.9965, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.1982253311311895, |
|
"eval_loss": 14.570647239685059, |
|
"eval_runtime": 20.3304, |
|
"eval_samples_per_second": 2700.144, |
|
"eval_steps_per_second": 168.762, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.214417565335665, |
|
"grad_norm": 8.49647045135498, |
|
"learning_rate": 4.393066485313644e-05, |
|
"loss": 4.9891, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 1.2306097995401406, |
|
"grad_norm": 6.622528553009033, |
|
"learning_rate": 4.384970368211406e-05, |
|
"loss": 4.9896, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.2306097995401406, |
|
"eval_loss": 14.330415725708008, |
|
"eval_runtime": 20.3174, |
|
"eval_samples_per_second": 2701.865, |
|
"eval_steps_per_second": 168.87, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.2468020337446162, |
|
"grad_norm": 7.867303371429443, |
|
"learning_rate": 4.376874251109168e-05, |
|
"loss": 4.9851, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 1.2629942679490918, |
|
"grad_norm": 7.329415321350098, |
|
"learning_rate": 4.368778134006931e-05, |
|
"loss": 4.969, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.2629942679490918, |
|
"eval_loss": 14.989124298095703, |
|
"eval_runtime": 20.3265, |
|
"eval_samples_per_second": 2700.663, |
|
"eval_steps_per_second": 168.795, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.279186502153567, |
|
"grad_norm": 8.647744178771973, |
|
"learning_rate": 4.3606820169046924e-05, |
|
"loss": 4.9547, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 1.2953787363580427, |
|
"grad_norm": 8.578520774841309, |
|
"learning_rate": 4.352585899802455e-05, |
|
"loss": 4.9569, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.2953787363580427, |
|
"eval_loss": 14.283174514770508, |
|
"eval_runtime": 20.3168, |
|
"eval_samples_per_second": 2701.954, |
|
"eval_steps_per_second": 168.875, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.3115709705625183, |
|
"grad_norm": 8.312915802001953, |
|
"learning_rate": 4.3444897827002176e-05, |
|
"loss": 4.9647, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 1.3277632047669938, |
|
"grad_norm": 9.59231185913086, |
|
"learning_rate": 4.3364098578321836e-05, |
|
"loss": 4.9462, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.3277632047669938, |
|
"eval_loss": 14.299793243408203, |
|
"eval_runtime": 20.2951, |
|
"eval_samples_per_second": 2704.839, |
|
"eval_steps_per_second": 169.055, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.3439554389714692, |
|
"grad_norm": 7.844625949859619, |
|
"learning_rate": 4.328313740729946e-05, |
|
"loss": 4.9147, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 1.3601476731759448, |
|
"grad_norm": 6.845381736755371, |
|
"learning_rate": 4.320217623627708e-05, |
|
"loss": 4.914, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.3601476731759448, |
|
"eval_loss": 14.049765586853027, |
|
"eval_runtime": 20.2836, |
|
"eval_samples_per_second": 2706.371, |
|
"eval_steps_per_second": 169.151, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.3763399073804203, |
|
"grad_norm": 7.644216537475586, |
|
"learning_rate": 4.3121215065254704e-05, |
|
"loss": 4.9066, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 1.392532141584896, |
|
"grad_norm": 7.950870513916016, |
|
"learning_rate": 4.304041581657438e-05, |
|
"loss": 4.8716, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.392532141584896, |
|
"eval_loss": 14.33547306060791, |
|
"eval_runtime": 20.3142, |
|
"eval_samples_per_second": 2702.295, |
|
"eval_steps_per_second": 168.897, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.4087243757893715, |
|
"grad_norm": 7.638858318328857, |
|
"learning_rate": 4.295945464555199e-05, |
|
"loss": 4.8557, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 1.424916609993847, |
|
"grad_norm": 9.173840522766113, |
|
"learning_rate": 4.2878493474529616e-05, |
|
"loss": 4.8394, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.424916609993847, |
|
"eval_loss": 13.498611450195312, |
|
"eval_runtime": 20.3548, |
|
"eval_samples_per_second": 2696.904, |
|
"eval_steps_per_second": 168.56, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.4411088441983224, |
|
"grad_norm": 8.224470138549805, |
|
"learning_rate": 4.279753230350724e-05, |
|
"loss": 4.8216, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.457301078402798, |
|
"grad_norm": 7.497710704803467, |
|
"learning_rate": 4.2716733054826905e-05, |
|
"loss": 4.8237, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.457301078402798, |
|
"eval_loss": 13.823565483093262, |
|
"eval_runtime": 20.2695, |
|
"eval_samples_per_second": 2708.257, |
|
"eval_steps_per_second": 169.269, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.4734933126072736, |
|
"grad_norm": 6.961141586303711, |
|
"learning_rate": 4.2635771883804535e-05, |
|
"loss": 4.8119, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.4896855468117491, |
|
"grad_norm": 6.934360504150391, |
|
"learning_rate": 4.255481071278215e-05, |
|
"loss": 4.7972, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.4896855468117491, |
|
"eval_loss": 14.0670804977417, |
|
"eval_runtime": 20.2732, |
|
"eval_samples_per_second": 2707.759, |
|
"eval_steps_per_second": 169.238, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.5058777810162245, |
|
"grad_norm": 7.600714206695557, |
|
"learning_rate": 4.247384954175977e-05, |
|
"loss": 4.8131, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.5220700152207, |
|
"grad_norm": 8.766816139221191, |
|
"learning_rate": 4.239305029307944e-05, |
|
"loss": 4.778, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.5220700152207, |
|
"eval_loss": 13.546905517578125, |
|
"eval_runtime": 20.3029, |
|
"eval_samples_per_second": 2703.795, |
|
"eval_steps_per_second": 168.99, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.5382622494251756, |
|
"grad_norm": 8.299079895019531, |
|
"learning_rate": 4.231208912205706e-05, |
|
"loss": 4.7336, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.5544544836296512, |
|
"grad_norm": 8.333426475524902, |
|
"learning_rate": 4.2231127951034685e-05, |
|
"loss": 4.7541, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.5544544836296512, |
|
"eval_loss": 13.711235046386719, |
|
"eval_runtime": 20.3064, |
|
"eval_samples_per_second": 2703.338, |
|
"eval_steps_per_second": 168.962, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.5706467178341268, |
|
"grad_norm": 7.705515384674072, |
|
"learning_rate": 4.215016678001231e-05, |
|
"loss": 4.7901, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.5868389520386024, |
|
"grad_norm": 6.669991493225098, |
|
"learning_rate": 4.206920560898993e-05, |
|
"loss": 4.7544, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.5868389520386024, |
|
"eval_loss": 13.069415092468262, |
|
"eval_runtime": 20.3123, |
|
"eval_samples_per_second": 2702.548, |
|
"eval_steps_per_second": 168.912, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.603031186243078, |
|
"grad_norm": 6.958492279052734, |
|
"learning_rate": 4.19884063603096e-05, |
|
"loss": 4.7175, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.6192234204475535, |
|
"grad_norm": 9.244942665100098, |
|
"learning_rate": 4.190744518928722e-05, |
|
"loss": 4.7092, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.6192234204475535, |
|
"eval_loss": 13.0497465133667, |
|
"eval_runtime": 20.3211, |
|
"eval_samples_per_second": 2701.382, |
|
"eval_steps_per_second": 168.839, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.6354156546520289, |
|
"grad_norm": 8.372320175170898, |
|
"learning_rate": 4.182648401826484e-05, |
|
"loss": 4.7401, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.6516078888565044, |
|
"grad_norm": 7.373213768005371, |
|
"learning_rate": 4.1745522847242465e-05, |
|
"loss": 4.7252, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.6516078888565044, |
|
"eval_loss": 13.589052200317383, |
|
"eval_runtime": 20.3142, |
|
"eval_samples_per_second": 2702.302, |
|
"eval_steps_per_second": 168.897, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.6678001230609798, |
|
"grad_norm": 9.22149658203125, |
|
"learning_rate": 4.166472359856213e-05, |
|
"loss": 4.6795, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.6839923572654554, |
|
"grad_norm": 5.994777202606201, |
|
"learning_rate": 4.1583762427539754e-05, |
|
"loss": 4.7021, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.6839923572654554, |
|
"eval_loss": 13.31000804901123, |
|
"eval_runtime": 20.2902, |
|
"eval_samples_per_second": 2705.498, |
|
"eval_steps_per_second": 169.097, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.700184591469931, |
|
"grad_norm": 8.134748458862305, |
|
"learning_rate": 4.150280125651738e-05, |
|
"loss": 4.7013, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.7163768256744065, |
|
"grad_norm": 8.395650863647461, |
|
"learning_rate": 4.1421840085495e-05, |
|
"loss": 4.6976, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.7163768256744065, |
|
"eval_loss": 13.707457542419434, |
|
"eval_runtime": 20.3052, |
|
"eval_samples_per_second": 2703.49, |
|
"eval_steps_per_second": 168.971, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.732569059878882, |
|
"grad_norm": 9.668176651000977, |
|
"learning_rate": 4.1341040836814666e-05, |
|
"loss": 4.6889, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.7487612940833577, |
|
"grad_norm": 7.348442554473877, |
|
"learning_rate": 4.126007966579229e-05, |
|
"loss": 4.6465, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.7487612940833577, |
|
"eval_loss": 13.285728454589844, |
|
"eval_runtime": 20.3053, |
|
"eval_samples_per_second": 2703.483, |
|
"eval_steps_per_second": 168.971, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.7649535282878333, |
|
"grad_norm": 7.7653889656066895, |
|
"learning_rate": 4.117911849476991e-05, |
|
"loss": 4.6537, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.7811457624923088, |
|
"grad_norm": 8.290285110473633, |
|
"learning_rate": 4.1098157323747534e-05, |
|
"loss": 4.6468, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.7811457624923088, |
|
"eval_loss": 13.134732246398926, |
|
"eval_runtime": 20.3013, |
|
"eval_samples_per_second": 2704.018, |
|
"eval_steps_per_second": 169.004, |
|
"step": 55000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 308790, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4320645675220992.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|