{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9991532599491956,
  "eval_steps": 500,
  "global_step": 295,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01693480101608806,
      "grad_norm": 11.07681941986084,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.8357,
      "step": 5
    },
    {
      "epoch": 0.03386960203217612,
      "grad_norm": 4.368485450744629,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.6208,
      "step": 10
    },
    {
      "epoch": 0.05080440304826418,
      "grad_norm": 2.597416877746582,
      "learning_rate": 1e-05,
      "loss": 1.2437,
      "step": 15
    },
    {
      "epoch": 0.06773920406435224,
      "grad_norm": 2.159755229949951,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.9804,
      "step": 20
    },
    {
      "epoch": 0.0846740050804403,
      "grad_norm": 1.7200169563293457,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.8607,
      "step": 25
    },
    {
      "epoch": 0.10160880609652836,
      "grad_norm": 1.2486546039581299,
      "learning_rate": 2e-05,
      "loss": 0.7869,
      "step": 30
    },
    {
      "epoch": 0.11854360711261643,
      "grad_norm": 1.2447178363800049,
      "learning_rate": 1.9982437317643218e-05,
      "loss": 0.7324,
      "step": 35
    },
    {
      "epoch": 0.1354784081287045,
      "grad_norm": 1.129758358001709,
      "learning_rate": 1.992981096013517e-05,
      "loss": 0.7058,
      "step": 40
    },
    {
      "epoch": 0.15241320914479256,
      "grad_norm": 0.9256328344345093,
      "learning_rate": 1.984230577947597e-05,
      "loss": 0.6845,
      "step": 45
    },
    {
      "epoch": 0.1693480101608806,
      "grad_norm": 0.9811558723449707,
      "learning_rate": 1.972022914080411e-05,
      "loss": 0.6698,
      "step": 50
    },
    {
      "epoch": 0.18628281117696868,
      "grad_norm": 1.0069135427474976,
      "learning_rate": 1.9564009842765225e-05,
      "loss": 0.6393,
      "step": 55
    },
    {
      "epoch": 0.20321761219305673,
      "grad_norm": 0.9143001437187195,
      "learning_rate": 1.9374196611341212e-05,
      "loss": 0.6321,
      "step": 60
    },
    {
      "epoch": 0.2201524132091448,
      "grad_norm": 0.9945001006126404,
      "learning_rate": 1.9151456172430186e-05,
      "loss": 0.6286,
      "step": 65
    },
    {
      "epoch": 0.23708721422523285,
      "grad_norm": 0.9755650758743286,
      "learning_rate": 1.8896570909947477e-05,
      "loss": 0.6178,
      "step": 70
    },
    {
      "epoch": 0.2540220152413209,
      "grad_norm": 0.9144276976585388,
      "learning_rate": 1.8610436117673557e-05,
      "loss": 0.6204,
      "step": 75
    },
    {
      "epoch": 0.270956816257409,
      "grad_norm": 0.9252317547798157,
      "learning_rate": 1.829405685450202e-05,
      "loss": 0.6124,
      "step": 80
    },
    {
      "epoch": 0.28789161727349705,
      "grad_norm": 0.7833272218704224,
      "learning_rate": 1.7948544414133534e-05,
      "loss": 0.5958,
      "step": 85
    },
    {
      "epoch": 0.3048264182895851,
      "grad_norm": 0.8833147883415222,
      "learning_rate": 1.7575112421616203e-05,
      "loss": 0.6049,
      "step": 90
    },
    {
      "epoch": 0.32176121930567314,
      "grad_norm": 0.8253676295280457,
      "learning_rate": 1.717507257044331e-05,
      "loss": 0.5905,
      "step": 95
    },
    {
      "epoch": 0.3386960203217612,
      "grad_norm": 0.800396740436554,
      "learning_rate": 1.6749830015182106e-05,
      "loss": 0.5698,
      "step": 100
    },
    {
      "epoch": 0.3556308213378493,
      "grad_norm": 0.9177567362785339,
      "learning_rate": 1.6300878435817115e-05,
      "loss": 0.5787,
      "step": 105
    },
    {
      "epoch": 0.37256562235393736,
      "grad_norm": 0.8931817412376404,
      "learning_rate": 1.5829794791144723e-05,
      "loss": 0.5748,
      "step": 110
    },
    {
      "epoch": 0.3895004233700254,
      "grad_norm": 1.0776097774505615,
      "learning_rate": 1.533823377964791e-05,
      "loss": 0.5693,
      "step": 115
    },
    {
      "epoch": 0.40643522438611346,
      "grad_norm": 0.9571148157119751,
      "learning_rate": 1.482792202730745e-05,
      "loss": 0.5795,
      "step": 120
    },
    {
      "epoch": 0.42337002540220153,
      "grad_norm": 0.9276224970817566,
      "learning_rate": 1.4300652022765207e-05,
      "loss": 0.5744,
      "step": 125
    },
    {
      "epoch": 0.4403048264182896,
      "grad_norm": 0.8562670350074768,
      "learning_rate": 1.3758275821142382e-05,
      "loss": 0.574,
      "step": 130
    },
    {
      "epoch": 0.4572396274343776,
      "grad_norm": 0.987328827381134,
      "learning_rate": 1.3202698538628376e-05,
      "loss": 0.5612,
      "step": 135
    },
    {
      "epoch": 0.4741744284504657,
      "grad_norm": 0.7609542012214661,
      "learning_rate": 1.2635871660690677e-05,
      "loss": 0.5608,
      "step": 140
    },
    {
      "epoch": 0.4911092294665538,
      "grad_norm": 0.752984881401062,
      "learning_rate": 1.2059786187410984e-05,
      "loss": 0.5462,
      "step": 145
    },
    {
      "epoch": 0.5080440304826418,
      "grad_norm": 0.7577420473098755,
      "learning_rate": 1.1476465640024814e-05,
      "loss": 0.5565,
      "step": 150
    },
    {
      "epoch": 0.5249788314987299,
      "grad_norm": 0.7515721321105957,
      "learning_rate": 1.0887958953229349e-05,
      "loss": 0.5475,
      "step": 155
    },
    {
      "epoch": 0.541913632514818,
      "grad_norm": 0.7427639961242676,
      "learning_rate": 1.0296333278225599e-05,
      "loss": 0.5561,
      "step": 160
    },
    {
      "epoch": 0.558848433530906,
      "grad_norm": 0.8810573816299438,
      "learning_rate": 9.703666721774403e-06,
      "loss": 0.5391,
      "step": 165
    },
    {
      "epoch": 0.5757832345469941,
      "grad_norm": 0.7623007297515869,
      "learning_rate": 9.112041046770653e-06,
      "loss": 0.5504,
      "step": 170
    },
    {
      "epoch": 0.5927180355630821,
      "grad_norm": 0.7749871015548706,
      "learning_rate": 8.52353435997519e-06,
      "loss": 0.5442,
      "step": 175
    },
    {
      "epoch": 0.6096528365791702,
      "grad_norm": 0.731037974357605,
      "learning_rate": 7.940213812589018e-06,
      "loss": 0.5504,
      "step": 180
    },
    {
      "epoch": 0.6265876375952583,
      "grad_norm": 0.6718469262123108,
      "learning_rate": 7.364128339309326e-06,
      "loss": 0.5461,
      "step": 185
    },
    {
      "epoch": 0.6435224386113463,
      "grad_norm": 0.7169979214668274,
      "learning_rate": 6.797301461371626e-06,
      "loss": 0.5387,
      "step": 190
    },
    {
      "epoch": 0.6604572396274344,
      "grad_norm": 0.6952066421508789,
      "learning_rate": 6.241724178857621e-06,
      "loss": 0.5391,
      "step": 195
    },
    {
      "epoch": 0.6773920406435224,
      "grad_norm": 0.7324413657188416,
      "learning_rate": 5.699347977234799e-06,
      "loss": 0.5351,
      "step": 200
    },
    {
      "epoch": 0.6943268416596104,
      "grad_norm": 0.7053231596946716,
      "learning_rate": 5.172077972692553e-06,
      "loss": 0.5393,
      "step": 205
    },
    {
      "epoch": 0.7112616426756986,
      "grad_norm": 0.6486250758171082,
      "learning_rate": 4.661766220352098e-06,
      "loss": 0.5311,
      "step": 210
    },
    {
      "epoch": 0.7281964436917866,
      "grad_norm": 0.6176902055740356,
      "learning_rate": 4.170205208855281e-06,
      "loss": 0.5342,
      "step": 215
    },
    {
      "epoch": 0.7451312447078747,
      "grad_norm": 0.6902855634689331,
      "learning_rate": 3.6991215641828903e-06,
      "loss": 0.526,
      "step": 220
    },
    {
      "epoch": 0.7620660457239627,
      "grad_norm": 0.6924082636833191,
      "learning_rate": 3.250169984817897e-06,
      "loss": 0.5393,
      "step": 225
    },
    {
      "epoch": 0.7790008467400508,
      "grad_norm": 0.6927950978279114,
      "learning_rate": 2.8249274295566863e-06,
      "loss": 0.5372,
      "step": 230
    },
    {
      "epoch": 0.7959356477561389,
      "grad_norm": 0.6544324159622192,
      "learning_rate": 2.424887578383799e-06,
      "loss": 0.53,
      "step": 235
    },
    {
      "epoch": 0.8128704487722269,
      "grad_norm": 0.6218433976173401,
      "learning_rate": 2.0514555858664663e-06,
      "loss": 0.5383,
      "step": 240
    },
    {
      "epoch": 0.8298052497883149,
      "grad_norm": 0.6293333172798157,
      "learning_rate": 1.7059431454979825e-06,
      "loss": 0.5242,
      "step": 245
    },
    {
      "epoch": 0.8467400508044031,
      "grad_norm": 0.6629937291145325,
      "learning_rate": 1.3895638823264447e-06,
      "loss": 0.5283,
      "step": 250
    },
    {
      "epoch": 0.8636748518204911,
      "grad_norm": 0.6638931632041931,
      "learning_rate": 1.1034290900525279e-06,
      "loss": 0.5346,
      "step": 255
    },
    {
      "epoch": 0.8806096528365792,
      "grad_norm": 0.6154753565788269,
      "learning_rate": 8.485438275698154e-07,
      "loss": 0.5286,
      "step": 260
    },
    {
      "epoch": 0.8975444538526672,
      "grad_norm": 0.5576867461204529,
      "learning_rate": 6.258033886587911e-07,
      "loss": 0.5197,
      "step": 265
    },
    {
      "epoch": 0.9144792548687553,
      "grad_norm": 0.5925081372261047,
      "learning_rate": 4.359901572347758e-07,
      "loss": 0.5272,
      "step": 270
    },
    {
      "epoch": 0.9314140558848434,
      "grad_norm": 0.8894816040992737,
      "learning_rate": 2.7977085919589253e-07,
      "loss": 0.5229,
      "step": 275
    },
    {
      "epoch": 0.9483488569009314,
      "grad_norm": 0.613675057888031,
      "learning_rate": 1.5769422052403172e-07,
      "loss": 0.5336,
      "step": 280
    },
    {
      "epoch": 0.9652836579170194,
      "grad_norm": 0.5702030658721924,
      "learning_rate": 7.018903986483083e-08,
      "loss": 0.5295,
      "step": 285
    },
    {
      "epoch": 0.9822184589331076,
      "grad_norm": 0.6007622480392456,
      "learning_rate": 1.7562682356786488e-08,
      "loss": 0.5357,
      "step": 290
    },
    {
      "epoch": 0.9991532599491956,
      "grad_norm": 0.5643032193183899,
      "learning_rate": 0.0,
      "loss": 0.5246,
      "step": 295
    },
    {
      "epoch": 0.9991532599491956,
      "step": 295,
      "total_flos": 8.380405739893555e+17,
      "train_loss": 0.6345516301817813,
      "train_runtime": 3216.2267,
      "train_samples_per_second": 11.745,
      "train_steps_per_second": 0.092
    }
  ],
  "logging_steps": 5,
  "max_steps": 295,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.380405739893555e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}