|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.979683972911964, |
|
"eval_steps": 50, |
|
"global_step": 330, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.045146726862302484, |
|
"grad_norm": 18.43708432360805, |
|
"learning_rate": 5e-07, |
|
"loss": 1.7371, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09029345372460497, |
|
"grad_norm": 12.96518208503692, |
|
"learning_rate": 1e-06, |
|
"loss": 1.6227, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13544018058690746, |
|
"grad_norm": 7.284644208168829, |
|
"learning_rate": 9.993977281025862e-07, |
|
"loss": 1.2966, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.18058690744920994, |
|
"grad_norm": 4.047553649149097, |
|
"learning_rate": 9.975923633360984e-07, |
|
"loss": 1.1552, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.22573363431151242, |
|
"grad_norm": 3.7692359566921234, |
|
"learning_rate": 9.945882549823904e-07, |
|
"loss": 1.0619, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2708803611738149, |
|
"grad_norm": 3.6288204791918646, |
|
"learning_rate": 9.90392640201615e-07, |
|
"loss": 1.0028, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3160270880361174, |
|
"grad_norm": 3.475318929765548, |
|
"learning_rate": 9.85015626597272e-07, |
|
"loss": 1.0079, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.3611738148984199, |
|
"grad_norm": 3.5252734455616874, |
|
"learning_rate": 9.784701678661044e-07, |
|
"loss": 0.971, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.40632054176072235, |
|
"grad_norm": 3.515836010245366, |
|
"learning_rate": 9.707720325915103e-07, |
|
"loss": 0.9516, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.45146726862302483, |
|
"grad_norm": 3.6385086224106713, |
|
"learning_rate": 9.619397662556433e-07, |
|
"loss": 0.9534, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.45146726862302483, |
|
"eval_loss": 0.9350618124008179, |
|
"eval_runtime": 55.811, |
|
"eval_samples_per_second": 56.441, |
|
"eval_steps_per_second": 0.896, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4966139954853273, |
|
"grad_norm": 3.433110767032438, |
|
"learning_rate": 9.519946465617217e-07, |
|
"loss": 0.9279, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5417607223476298, |
|
"grad_norm": 3.370215115653193, |
|
"learning_rate": 9.409606321741774e-07, |
|
"loss": 0.9373, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5869074492099323, |
|
"grad_norm": 3.4752063755478906, |
|
"learning_rate": 9.28864305000136e-07, |
|
"loss": 0.9134, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6320541760722348, |
|
"grad_norm": 3.444843183525781, |
|
"learning_rate": 9.157348061512726e-07, |
|
"loss": 0.8902, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6772009029345373, |
|
"grad_norm": 3.3753005171135877, |
|
"learning_rate": 9.016037657403223e-07, |
|
"loss": 0.8907, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7223476297968398, |
|
"grad_norm": 3.3797166413654636, |
|
"learning_rate": 8.865052266813685e-07, |
|
"loss": 0.8798, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7674943566591422, |
|
"grad_norm": 3.399852464899469, |
|
"learning_rate": 8.704755626774795e-07, |
|
"loss": 0.8873, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8126410835214447, |
|
"grad_norm": 3.3736345000901644, |
|
"learning_rate": 8.535533905932737e-07, |
|
"loss": 0.8951, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8577878103837472, |
|
"grad_norm": 3.3317984949333104, |
|
"learning_rate": 8.357794774235092e-07, |
|
"loss": 0.8813, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9029345372460497, |
|
"grad_norm": 3.7790547931864826, |
|
"learning_rate": 8.171966420818227e-07, |
|
"loss": 0.8729, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9029345372460497, |
|
"eval_loss": 0.8780717253684998, |
|
"eval_runtime": 55.3629, |
|
"eval_samples_per_second": 56.897, |
|
"eval_steps_per_second": 0.903, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9480812641083521, |
|
"grad_norm": 3.843510457952551, |
|
"learning_rate": 7.978496522462167e-07, |
|
"loss": 0.8738, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9932279909706546, |
|
"grad_norm": 3.8556187579575734, |
|
"learning_rate": 7.777851165098011e-07, |
|
"loss": 0.8693, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0383747178329572, |
|
"grad_norm": 3.5905132811137124, |
|
"learning_rate": 7.570513720966107e-07, |
|
"loss": 0.8148, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0835214446952597, |
|
"grad_norm": 3.621093760889929, |
|
"learning_rate": 7.356983684129989e-07, |
|
"loss": 0.8324, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1286681715575622, |
|
"grad_norm": 3.483178959852461, |
|
"learning_rate": 7.13777546715141e-07, |
|
"loss": 0.7914, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1738148984198646, |
|
"grad_norm": 3.3812503372602465, |
|
"learning_rate": 6.913417161825449e-07, |
|
"loss": 0.7891, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2189616252821671, |
|
"grad_norm": 3.42376548092314, |
|
"learning_rate": 6.684449266961099e-07, |
|
"loss": 0.7905, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2641083521444696, |
|
"grad_norm": 3.670874737486496, |
|
"learning_rate": 6.451423386272311e-07, |
|
"loss": 0.7919, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.309255079006772, |
|
"grad_norm": 3.466865973590349, |
|
"learning_rate": 6.21490089951632e-07, |
|
"loss": 0.8129, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3544018058690745, |
|
"grad_norm": 3.8317546120945494, |
|
"learning_rate": 5.975451610080642e-07, |
|
"loss": 0.8045, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3544018058690745, |
|
"eval_loss": 0.859570324420929, |
|
"eval_runtime": 55.3516, |
|
"eval_samples_per_second": 56.909, |
|
"eval_steps_per_second": 0.903, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.399548532731377, |
|
"grad_norm": 3.6970944236498684, |
|
"learning_rate": 5.733652372276809e-07, |
|
"loss": 0.8012, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4446952595936795, |
|
"grad_norm": 3.320805141993761, |
|
"learning_rate": 5.490085701647804e-07, |
|
"loss": 0.7977, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.489841986455982, |
|
"grad_norm": 3.4859328024402685, |
|
"learning_rate": 5.245338371637091e-07, |
|
"loss": 0.7953, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5349887133182845, |
|
"grad_norm": 3.64297775104021, |
|
"learning_rate": 5e-07, |
|
"loss": 0.7917, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.580135440180587, |
|
"grad_norm": 3.470131936511277, |
|
"learning_rate": 4.75466162836291e-07, |
|
"loss": 0.7905, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6252821670428894, |
|
"grad_norm": 3.559410134845727, |
|
"learning_rate": 4.5099142983521963e-07, |
|
"loss": 0.7948, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.670428893905192, |
|
"grad_norm": 3.4417325502766594, |
|
"learning_rate": 4.2663476277231915e-07, |
|
"loss": 0.7983, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7155756207674944, |
|
"grad_norm": 3.669007752905804, |
|
"learning_rate": 4.0245483899193586e-07, |
|
"loss": 0.8006, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7607223476297968, |
|
"grad_norm": 3.4191546038129483, |
|
"learning_rate": 3.785099100483681e-07, |
|
"loss": 0.7879, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8058690744920993, |
|
"grad_norm": 3.697454786550471, |
|
"learning_rate": 3.548576613727689e-07, |
|
"loss": 0.7875, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8058690744920993, |
|
"eval_loss": 0.847703754901886, |
|
"eval_runtime": 55.4234, |
|
"eval_samples_per_second": 56.835, |
|
"eval_steps_per_second": 0.902, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8510158013544018, |
|
"grad_norm": 3.691082664320291, |
|
"learning_rate": 3.3155507330388996e-07, |
|
"loss": 0.7951, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.8961625282167043, |
|
"grad_norm": 3.5717461634355288, |
|
"learning_rate": 3.086582838174551e-07, |
|
"loss": 0.7895, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9413092550790068, |
|
"grad_norm": 3.579734970143079, |
|
"learning_rate": 2.8622245328485907e-07, |
|
"loss": 0.7825, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.9864559819413092, |
|
"grad_norm": 3.444703074876304, |
|
"learning_rate": 2.6430163158700113e-07, |
|
"loss": 0.7836, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.0316027088036117, |
|
"grad_norm": 3.6858962070265617, |
|
"learning_rate": 2.4294862790338916e-07, |
|
"loss": 0.7546, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0767494356659144, |
|
"grad_norm": 3.5888237456746976, |
|
"learning_rate": 2.2221488349019902e-07, |
|
"loss": 0.7482, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.1218961625282167, |
|
"grad_norm": 3.681045633726427, |
|
"learning_rate": 2.021503477537833e-07, |
|
"loss": 0.7402, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.1670428893905194, |
|
"grad_norm": 3.5167220269101747, |
|
"learning_rate": 1.828033579181773e-07, |
|
"loss": 0.7451, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.2121896162528216, |
|
"grad_norm": 3.883082217563066, |
|
"learning_rate": 1.6422052257649077e-07, |
|
"loss": 0.7465, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.2573363431151243, |
|
"grad_norm": 3.923342604410762, |
|
"learning_rate": 1.4644660940672627e-07, |
|
"loss": 0.7391, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.2573363431151243, |
|
"eval_loss": 0.8480744957923889, |
|
"eval_runtime": 55.3896, |
|
"eval_samples_per_second": 56.87, |
|
"eval_steps_per_second": 0.903, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.3024830699774266, |
|
"grad_norm": 3.4937657027708307, |
|
"learning_rate": 1.2952443732252054e-07, |
|
"loss": 0.7314, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.3476297968397293, |
|
"grad_norm": 3.7212940895257525, |
|
"learning_rate": 1.134947733186315e-07, |
|
"loss": 0.7484, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.3927765237020315, |
|
"grad_norm": 3.4800342638669424, |
|
"learning_rate": 9.839623425967758e-08, |
|
"loss": 0.7429, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.4379232505643342, |
|
"grad_norm": 3.6208493287123598, |
|
"learning_rate": 8.426519384872732e-08, |
|
"loss": 0.7335, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.4830699774266365, |
|
"grad_norm": 3.6587749488735075, |
|
"learning_rate": 7.1135694999864e-08, |
|
"loss": 0.757, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.528216704288939, |
|
"grad_norm": 3.651673581185134, |
|
"learning_rate": 5.9039367825822526e-08, |
|
"loss": 0.7237, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.5733634311512414, |
|
"grad_norm": 3.6732031228054653, |
|
"learning_rate": 4.800535343827833e-08, |
|
"loss": 0.7379, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.618510158013544, |
|
"grad_norm": 3.7552824088763455, |
|
"learning_rate": 3.806023374435663e-08, |
|
"loss": 0.7406, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.6636568848758464, |
|
"grad_norm": 3.9856453979787916, |
|
"learning_rate": 2.922796740848965e-08, |
|
"loss": 0.7231, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.708803611738149, |
|
"grad_norm": 3.61930072668026, |
|
"learning_rate": 2.1529832133895588e-08, |
|
"loss": 0.7339, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.708803611738149, |
|
"eval_loss": 0.8461548089981079, |
|
"eval_runtime": 55.4056, |
|
"eval_samples_per_second": 56.853, |
|
"eval_steps_per_second": 0.902, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.7539503386004514, |
|
"grad_norm": 3.739949022395361, |
|
"learning_rate": 1.4984373402728012e-08, |
|
"loss": 0.7323, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.799097065462754, |
|
"grad_norm": 3.5216583130079537, |
|
"learning_rate": 9.607359798384784e-09, |
|
"loss": 0.7267, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.8442437923250563, |
|
"grad_norm": 3.78870498477702, |
|
"learning_rate": 5.411745017609493e-09, |
|
"loss": 0.7242, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.889390519187359, |
|
"grad_norm": 3.7009903709257244, |
|
"learning_rate": 2.407636663901591e-09, |
|
"loss": 0.7201, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.9345372460496613, |
|
"grad_norm": 3.677700055727154, |
|
"learning_rate": 6.022718974137975e-10, |
|
"loss": 0.7308, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.979683972911964, |
|
"grad_norm": 3.5525511423180545, |
|
"learning_rate": 0.0, |
|
"loss": 0.7362, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.979683972911964, |
|
"step": 330, |
|
"total_flos": 1945595623243776.0, |
|
"train_loss": 0.8518027869137851, |
|
"train_runtime": 4686.6831, |
|
"train_samples_per_second": 18.147, |
|
"train_steps_per_second": 0.07 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 330, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1945595623243776.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|