|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 62, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03225806451612903, |
|
"grad_norm": 31.199046559439136, |
|
"learning_rate": 9.993582535855265e-06, |
|
"loss": 1.6497, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.06451612903225806, |
|
"grad_norm": 17.46234723437062, |
|
"learning_rate": 9.974346616959476e-06, |
|
"loss": 1.1688, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0967741935483871, |
|
"grad_norm": 17.2132760171181, |
|
"learning_rate": 9.942341621640558e-06, |
|
"loss": 1.2213, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 8.814975880215384, |
|
"learning_rate": 9.897649706262474e-06, |
|
"loss": 0.852, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 9.032603485694555, |
|
"learning_rate": 9.840385594331022e-06, |
|
"loss": 0.8313, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.1935483870967742, |
|
"grad_norm": 5.078284135758567, |
|
"learning_rate": 9.770696282000245e-06, |
|
"loss": 0.6852, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.22580645161290322, |
|
"grad_norm": 3.427300813881487, |
|
"learning_rate": 9.688760660735403e-06, |
|
"loss": 0.5022, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 4.2152309516284845, |
|
"learning_rate": 9.594789058101154e-06, |
|
"loss": 0.5572, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.2903225806451613, |
|
"grad_norm": 3.085786333052321, |
|
"learning_rate": 9.48902269785371e-06, |
|
"loss": 0.4309, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 3.125642693243869, |
|
"learning_rate": 9.371733080722911e-06, |
|
"loss": 0.5173, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.3548387096774194, |
|
"grad_norm": 3.277088815175521, |
|
"learning_rate": 9.243221287473755e-06, |
|
"loss": 0.4889, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 2.357479595943573, |
|
"learning_rate": 9.103817206036383e-06, |
|
"loss": 0.3478, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.41935483870967744, |
|
"grad_norm": 2.0539004083223156, |
|
"learning_rate": 8.953878684688492e-06, |
|
"loss": 0.3496, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.45161290322580644, |
|
"grad_norm": 2.52448299705888, |
|
"learning_rate": 8.793790613463956e-06, |
|
"loss": 0.398, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 2.355140249988772, |
|
"learning_rate": 8.6239639361456e-06, |
|
"loss": 0.3568, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 2.4469993078971326, |
|
"learning_rate": 8.444834595378434e-06, |
|
"loss": 0.3884, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.5483870967741935, |
|
"grad_norm": 2.52296370250433, |
|
"learning_rate": 8.256862413611113e-06, |
|
"loss": 0.3657, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.5806451612903226, |
|
"grad_norm": 3.3355903081537135, |
|
"learning_rate": 8.060529912738316e-06, |
|
"loss": 0.5187, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.6129032258064516, |
|
"grad_norm": 2.484683322570902, |
|
"learning_rate": 7.856341075473963e-06, |
|
"loss": 0.4143, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 1.7487341996348138, |
|
"learning_rate": 7.644820051634813e-06, |
|
"loss": 0.2972, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6774193548387096, |
|
"grad_norm": 2.0583109478127204, |
|
"learning_rate": 7.4265098126554065e-06, |
|
"loss": 0.3197, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.7096774193548387, |
|
"grad_norm": 2.0344563885911575, |
|
"learning_rate": 7.201970757788172e-06, |
|
"loss": 0.3395, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.7419354838709677, |
|
"grad_norm": 2.058990191861192, |
|
"learning_rate": 6.971779275566593e-06, |
|
"loss": 0.3394, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.7741935483870968, |
|
"grad_norm": 1.9074433438418341, |
|
"learning_rate": 6.736526264224101e-06, |
|
"loss": 0.3442, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 2.0162577599888323, |
|
"learning_rate": 6.496815614866792e-06, |
|
"loss": 0.2795, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.8387096774193549, |
|
"grad_norm": 2.4742505970882163, |
|
"learning_rate": 6.2532626612936035e-06, |
|
"loss": 0.3968, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.8709677419354839, |
|
"grad_norm": 2.207808258328297, |
|
"learning_rate": 6.006492600443301e-06, |
|
"loss": 0.3594, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.9032258064516129, |
|
"grad_norm": 1.7696328864440694, |
|
"learning_rate": 5.757138887522884e-06, |
|
"loss": 0.3114, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.9354838709677419, |
|
"grad_norm": 2.0275212616065645, |
|
"learning_rate": 5.505841609937162e-06, |
|
"loss": 0.3478, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 1.6298425948210777, |
|
"learning_rate": 5.253245844193564e-06, |
|
"loss": 0.2626, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.5467460791719727, |
|
"learning_rate": 5e-06, |
|
"loss": 0.2845, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.032258064516129, |
|
"grad_norm": 1.5586457967216936, |
|
"learning_rate": 4.746754155806437e-06, |
|
"loss": 0.2419, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.064516129032258, |
|
"grad_norm": 1.888001852915147, |
|
"learning_rate": 4.49415839006284e-06, |
|
"loss": 0.2208, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.096774193548387, |
|
"grad_norm": 1.4011698191222255, |
|
"learning_rate": 4.2428611124771184e-06, |
|
"loss": 0.2144, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.129032258064516, |
|
"grad_norm": 1.7487392529140704, |
|
"learning_rate": 3.993507399556699e-06, |
|
"loss": 0.2356, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.1612903225806452, |
|
"grad_norm": 2.111614480418175, |
|
"learning_rate": 3.7467373387063973e-06, |
|
"loss": 0.2874, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.1935483870967742, |
|
"grad_norm": 1.317740002801753, |
|
"learning_rate": 3.5031843851332105e-06, |
|
"loss": 0.1728, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.2258064516129032, |
|
"grad_norm": 1.7399051773302943, |
|
"learning_rate": 3.2634737357758994e-06, |
|
"loss": 0.2604, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.2580645161290323, |
|
"grad_norm": 1.8124616181419573, |
|
"learning_rate": 3.0282207244334084e-06, |
|
"loss": 0.2461, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 2.2245741657359934, |
|
"learning_rate": 2.7980292422118282e-06, |
|
"loss": 0.2703, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.3225806451612903, |
|
"grad_norm": 1.6789960223565612, |
|
"learning_rate": 2.573490187344596e-06, |
|
"loss": 0.2038, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.3548387096774195, |
|
"grad_norm": 1.79239256078375, |
|
"learning_rate": 2.3551799483651894e-06, |
|
"loss": 0.2239, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.3870967741935485, |
|
"grad_norm": 3.076290808429015, |
|
"learning_rate": 2.1436589245260375e-06, |
|
"loss": 0.2957, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.4193548387096775, |
|
"grad_norm": 1.5602198674820165, |
|
"learning_rate": 1.9394700872616856e-06, |
|
"loss": 0.2107, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.4516129032258065, |
|
"grad_norm": 1.577639906284918, |
|
"learning_rate": 1.74313758638889e-06, |
|
"loss": 0.2238, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.4838709677419355, |
|
"grad_norm": 1.5054440694141868, |
|
"learning_rate": 1.555165404621567e-06, |
|
"loss": 0.2035, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.5161290322580645, |
|
"grad_norm": 1.92724369253792, |
|
"learning_rate": 1.3760360638544012e-06, |
|
"loss": 0.256, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.5483870967741935, |
|
"grad_norm": 1.52533329195425, |
|
"learning_rate": 1.2062093865360458e-06, |
|
"loss": 0.2078, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.5806451612903225, |
|
"grad_norm": 2.3222181174482044, |
|
"learning_rate": 1.046121315311508e-06, |
|
"loss": 0.2611, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 1.4315191687571915, |
|
"learning_rate": 8.961827939636198e-07, |
|
"loss": 0.1965, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.6451612903225805, |
|
"grad_norm": 1.5414636545885392, |
|
"learning_rate": 7.567787125262449e-07, |
|
"loss": 0.2, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.6774193548387095, |
|
"grad_norm": 1.4835478438790177, |
|
"learning_rate": 6.282669192770896e-07, |
|
"loss": 0.2509, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 1.7096774193548387, |
|
"grad_norm": 1.4946263792553225, |
|
"learning_rate": 5.109773021462921e-07, |
|
"loss": 0.2085, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 1.7419354838709677, |
|
"grad_norm": 1.7678787704558188, |
|
"learning_rate": 4.05210941898847e-07, |
|
"loss": 0.2467, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 1.7741935483870968, |
|
"grad_norm": 1.4867440873899518, |
|
"learning_rate": 3.112393392645985e-07, |
|
"loss": 0.2363, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.8064516129032258, |
|
"grad_norm": 1.4803775636293333, |
|
"learning_rate": 2.2930371799975593e-07, |
|
"loss": 0.1855, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 1.838709677419355, |
|
"grad_norm": 1.4490266945389694, |
|
"learning_rate": 1.5961440566897913e-07, |
|
"loss": 0.1821, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 1.870967741935484, |
|
"grad_norm": 1.7145630167827153, |
|
"learning_rate": 1.0235029373752758e-07, |
|
"loss": 0.2157, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 1.903225806451613, |
|
"grad_norm": 1.5925676389651224, |
|
"learning_rate": 5.7658378359443104e-08, |
|
"loss": 0.2158, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 1.4959223286590795, |
|
"learning_rate": 2.5653383040524228e-08, |
|
"loss": 0.2202, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.967741935483871, |
|
"grad_norm": 2.013681318495715, |
|
"learning_rate": 6.417464144736208e-09, |
|
"loss": 0.2533, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.4127405994063447, |
|
"learning_rate": 0.0, |
|
"loss": 0.2159, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"step": 62, |
|
"total_flos": 2138633404416.0, |
|
"train_loss": 0.370798627936071, |
|
"train_runtime": 218.5623, |
|
"train_samples_per_second": 2.251, |
|
"train_steps_per_second": 0.284 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 62, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 70000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2138633404416.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|