|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.9955555555555557, |
|
"eval_steps": 30, |
|
"global_step": 112, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1992.2813301086426, |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 0.11222778239149554, |
|
"kl": 0.0, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0509, |
|
"reward": -8.534029252827168, |
|
"reward_std": 3.1286671087145805, |
|
"rewards/cot_length_penalty_reward": -8.792958237230778, |
|
"rewards/math_latex_accuracy_reward": 0.2589285857975483, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 0.11224900964736327, |
|
"kl": 0.0, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0509, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.002639908329001628, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.11190265883625335, |
|
"kl": 0.0004132986068725586, |
|
"learning_rate": 5e-06, |
|
"loss": 0.051, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0026859724457608536, |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 0.10842287053917446, |
|
"kl": 0.00042808055877685547, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.0506, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2354.6050567626953, |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.11660083282365401, |
|
"kl": 0.0005452632904052734, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.0532, |
|
"reward": -9.576663568615913, |
|
"reward_std": 3.4961936213076115, |
|
"rewards/cot_length_penalty_reward": -9.817734986543655, |
|
"rewards/math_latex_accuracy_reward": 0.24107144074514508, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.004271271725883707, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.15124528039054966, |
|
"kl": 0.0023946762084960938, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0521, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.00593576196115464, |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 0.22845939154064746, |
|
"kl": 0.0017180442810058594, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 0.0519, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.00681446076487191, |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 0.25815482225017694, |
|
"kl": 0.002631664276123047, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.0484, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2251.4844703674316, |
|
"epoch": 0.32, |
|
"grad_norm": 0.11515864264586664, |
|
"kl": 0.0024547576904296875, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.0261, |
|
"reward": -8.878705874085426, |
|
"reward_std": 3.5140193179249763, |
|
"rewards/cot_length_penalty_reward": -9.128705888986588, |
|
"rewards/math_latex_accuracy_reward": 0.2500000149011612, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0070763813419034705, |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.2974695944541913, |
|
"kl": 0.005417823791503906, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.0255, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.010105093329912052, |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 85.04432926044146, |
|
"kl": 0.007180213928222656, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 19.5224, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.017627036664634943, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 2.432235493998052, |
|
"kl": 0.0702056884765625, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0247, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1870.0737648010254, |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 0.32693917550649865, |
|
"kl": 0.022480010986328125, |
|
"learning_rate": 1.9995065603657317e-05, |
|
"loss": 0.0103, |
|
"reward": -9.444286078214645, |
|
"reward_std": 3.207320176064968, |
|
"rewards/cot_length_penalty_reward": -9.848303943872452, |
|
"rewards/math_latex_accuracy_reward": 0.4040178805589676, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.004404508654261008, |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 1.6311442510668763, |
|
"kl": 0.010528564453125, |
|
"learning_rate": 1.9980267284282718e-05, |
|
"loss": 0.0093, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0062026621017139405, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.5615009008596469, |
|
"kl": 0.051082611083984375, |
|
"learning_rate": 1.99556196460308e-05, |
|
"loss": 0.0076, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.006701507809339091, |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 0.15171374489080983, |
|
"kl": 0.018802642822265625, |
|
"learning_rate": 1.9921147013144782e-05, |
|
"loss": 0.0041, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2222.4309005737305, |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 0.11283429637288535, |
|
"kl": 0.01442718505859375, |
|
"learning_rate": 1.9876883405951378e-05, |
|
"loss": 0.081, |
|
"reward": -9.554554164409637, |
|
"reward_std": 4.235630825161934, |
|
"rewards/cot_length_penalty_reward": -9.844732716679573, |
|
"rewards/math_latex_accuracy_reward": 0.29017858393490314, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.004120954225072637, |
|
"epoch": 0.64, |
|
"grad_norm": 0.10899171004441378, |
|
"kl": 0.015628814697265625, |
|
"learning_rate": 1.982287250728689e-05, |
|
"loss": 0.2482, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.005419444481958635, |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 0.1157331857796372, |
|
"kl": 0.01764678955078125, |
|
"learning_rate": 1.9759167619387474e-05, |
|
"loss": 0.2459, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.005958295805612579, |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.10652991525129403, |
|
"kl": 0.01905059814453125, |
|
"learning_rate": 1.9685831611286312e-05, |
|
"loss": 0.2434, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2501.2255668640137, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.12065925342674215, |
|
"kl": 0.020366668701171875, |
|
"learning_rate": 1.9602936856769432e-05, |
|
"loss": 0.033, |
|
"reward": -11.581663489341736, |
|
"reward_std": 4.305310405790806, |
|
"rewards/cot_length_penalty_reward": -11.86737784743309, |
|
"rewards/math_latex_accuracy_reward": 0.2857142973225564, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0039043642027536407, |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 0.36463686705700576, |
|
"kl": 0.019252777099609375, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 0.0325, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.005750590149546042, |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 20283.4909421177, |
|
"kl": 1147.058982849121, |
|
"learning_rate": 1.9408807689542257e-05, |
|
"loss": 46.027, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.008487990504363552, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.17175661916148474, |
|
"kl": 0.026885986328125, |
|
"learning_rate": 1.9297764858882516e-05, |
|
"loss": 0.0289, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2333.062614440918, |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.10364225115067384, |
|
"kl": 0.0201416015625, |
|
"learning_rate": 1.9177546256839814e-05, |
|
"loss": 0.0113, |
|
"reward": -10.619498401880264, |
|
"reward_std": 3.768970273435116, |
|
"rewards/cot_length_penalty_reward": -10.869498312473297, |
|
"rewards/math_latex_accuracy_reward": 0.2500000123400241, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0032545153953833506, |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 0.10471903488067007, |
|
"kl": 0.02140045166015625, |
|
"learning_rate": 1.9048270524660197e-05, |
|
"loss": 0.0103, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.004354664255515672, |
|
"epoch": 0.96, |
|
"grad_norm": 0.0965546219660842, |
|
"kl": 0.0223541259765625, |
|
"learning_rate": 1.891006524188368e-05, |
|
"loss": 0.0085, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.005275880845147185, |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 0.10544368470129743, |
|
"kl": 0.023712158203125, |
|
"learning_rate": 1.8763066800438638e-05, |
|
"loss": 0.0065, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2427.296974182129, |
|
"epoch": 1.0355555555555556, |
|
"grad_norm": 0.42983301720943096, |
|
"kl": 0.03394317626953125, |
|
"learning_rate": 1.860742027003944e-05, |
|
"loss": 0.0039, |
|
"reward": -11.214900106191635, |
|
"reward_std": 3.760936316102743, |
|
"rewards/cot_length_penalty_reward": -11.5006143450737, |
|
"rewards/math_latex_accuracy_reward": 0.285714297555387, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.0711111111111111, |
|
"grad_norm": 0.1067773530257121, |
|
"learning_rate": 1.8443279255020153e-05, |
|
"loss": 0.0071, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.0711111111111111, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 2303.7637939453125, |
|
"eval_kl": 0.025606595552884616, |
|
"eval_loss": 0.04365207254886627, |
|
"eval_reward": -8.784464891140278, |
|
"eval_reward_std": 3.7600448498359094, |
|
"eval_rewards/cot_length_penalty_reward": -9.116882379238422, |
|
"eval_rewards/math_latex_accuracy_reward": 0.3324175958450024, |
|
"eval_runtime": 448.0952, |
|
"eval_samples_per_second": 0.112, |
|
"eval_steps_per_second": 0.004, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0037656883359886706, |
|
"epoch": 1.1066666666666667, |
|
"grad_norm": 0.7346983824268155, |
|
"kl": 0.027835845947265625, |
|
"learning_rate": 1.827080574274562e-05, |
|
"loss": 0.0033, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.006145871157059446, |
|
"epoch": 1.1422222222222222, |
|
"grad_norm": 11.473259751864658, |
|
"kl": 1.4422760009765625, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 0.0549, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2531.1674995422363, |
|
"epoch": 1.1777777777777778, |
|
"grad_norm": 0.11984100416609303, |
|
"kl": 0.03124237060546875, |
|
"learning_rate": 1.7901550123756906e-05, |
|
"loss": 0.0239, |
|
"reward": -8.361417755484581, |
|
"reward_std": 4.016169548034668, |
|
"rewards/cot_length_penalty_reward": -8.689542889595032, |
|
"rewards/math_latex_accuracy_reward": 0.3281250139698386, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.004058451057062484, |
|
"epoch": 1.2133333333333334, |
|
"grad_norm": 0.16684935043495766, |
|
"kl": 0.03450775146484375, |
|
"learning_rate": 1.7705132427757895e-05, |
|
"loss": 0.0232, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.006084064312744886, |
|
"epoch": 1.248888888888889, |
|
"grad_norm": 0.11163561617870978, |
|
"kl": 0.0318145751953125, |
|
"learning_rate": 1.7501110696304598e-05, |
|
"loss": 0.0214, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.007263028150191531, |
|
"epoch": 1.2844444444444445, |
|
"grad_norm": 0.11128954549532989, |
|
"kl": 0.03281402587890625, |
|
"learning_rate": 1.7289686274214116e-05, |
|
"loss": 0.0197, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1888.4487342834473, |
|
"epoch": 1.32, |
|
"grad_norm": 0.6837402463902799, |
|
"kl": 0.05413055419921875, |
|
"learning_rate": 1.7071067811865477e-05, |
|
"loss": 0.1151, |
|
"reward": -7.783694684505463, |
|
"reward_std": 3.098730646073818, |
|
"rewards/cot_length_penalty_reward": -8.16985534131527, |
|
"rewards/math_latex_accuracy_reward": 0.3861607341095805, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0028788788622478023, |
|
"epoch": 1.3555555555555556, |
|
"grad_norm": 2.464864347264584, |
|
"kl": 0.04084014892578125, |
|
"learning_rate": 1.684547105928689e-05, |
|
"loss": 0.3644, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.004996606716304086, |
|
"epoch": 1.3911111111111112, |
|
"grad_norm": 0.32236476835294475, |
|
"kl": 0.04229736328125, |
|
"learning_rate": 1.661311865323652e-05, |
|
"loss": 0.1132, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.005848184140631929, |
|
"epoch": 1.4266666666666667, |
|
"grad_norm": 2.236113570550632, |
|
"kl": 0.180694580078125, |
|
"learning_rate": 1.63742398974869e-05, |
|
"loss": 0.116, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1681.7277793884277, |
|
"epoch": 1.462222222222222, |
|
"grad_norm": 55.91487468875501, |
|
"kl": 1.1835174560546875, |
|
"learning_rate": 1.6129070536529767e-05, |
|
"loss": 0.0918, |
|
"reward": -7.822701282799244, |
|
"reward_std": 2.5297958850860596, |
|
"rewards/cot_length_penalty_reward": -8.191005058586597, |
|
"rewards/math_latex_accuracy_reward": 0.3683035862632096, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.003095990905421786, |
|
"epoch": 1.4977777777777779, |
|
"grad_norm": 3454.9772869499748, |
|
"kl": 0.0470123291015625, |
|
"learning_rate": 1.5877852522924733e-05, |
|
"loss": 6.8427, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.005094703097711317, |
|
"epoch": 1.5333333333333332, |
|
"grad_norm": 15.584761637863307, |
|
"kl": 1.0414886474609375, |
|
"learning_rate": 1.5620833778521306e-05, |
|
"loss": 0.0866, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.008272722363471985, |
|
"epoch": 1.568888888888889, |
|
"grad_norm": 1.2550063449844295, |
|
"kl": 0.04656219482421875, |
|
"learning_rate": 1.5358267949789968e-05, |
|
"loss": 0.0502, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2313.542510986328, |
|
"epoch": 1.6044444444444443, |
|
"grad_norm": 0.1122939508940078, |
|
"kl": 0.038604736328125, |
|
"learning_rate": 1.5090414157503715e-05, |
|
"loss": 0.0762, |
|
"reward": -9.223605461418629, |
|
"reward_std": 3.827972359955311, |
|
"rewards/cot_length_penalty_reward": -9.58521255850792, |
|
"rewards/math_latex_accuracy_reward": 0.36160715692676604, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0038211173960007727, |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 0.12378389214572502, |
|
"kl": 0.04041290283203125, |
|
"learning_rate": 1.4817536741017153e-05, |
|
"loss": 0.0756, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.005922177180764265, |
|
"epoch": 1.6755555555555555, |
|
"grad_norm": 0.12771635689812005, |
|
"kl": 0.04157257080078125, |
|
"learning_rate": 1.4539904997395468e-05, |
|
"loss": 0.0745, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.006733638554578647, |
|
"epoch": 1.7111111111111112, |
|
"grad_norm": 0.10941185512569215, |
|
"kl": 0.04193115234375, |
|
"learning_rate": 1.4257792915650728e-05, |
|
"loss": 0.0731, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1813.1072387695312, |
|
"epoch": 1.7466666666666666, |
|
"grad_norm": 3.0104818633298525, |
|
"kl": 0.2118988037109375, |
|
"learning_rate": 1.3971478906347806e-05, |
|
"loss": -0.0205, |
|
"reward": -10.289654642343521, |
|
"reward_std": 3.131831008940935, |
|
"rewards/cot_length_penalty_reward": -10.762868821620941, |
|
"rewards/math_latex_accuracy_reward": 0.47321430779993534, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.002227201643108856, |
|
"epoch": 1.7822222222222224, |
|
"grad_norm": 0.11902180384589063, |
|
"kl": 0.04238128662109375, |
|
"learning_rate": 1.3681245526846782e-05, |
|
"loss": -0.0276, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0032200364221353084, |
|
"epoch": 1.8177777777777777, |
|
"grad_norm": 0.1256236704087802, |
|
"kl": 0.04290008544921875, |
|
"learning_rate": 1.3387379202452917e-05, |
|
"loss": -0.0286, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.003942100578569807, |
|
"epoch": 1.8533333333333335, |
|
"grad_norm": 0.10119215538963353, |
|
"kl": 0.0430450439453125, |
|
"learning_rate": 1.3090169943749475e-05, |
|
"loss": -0.03, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2273.6608276367188, |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.17450385200895388, |
|
"kl": 0.05017852783203125, |
|
"learning_rate": 1.2789911060392295e-05, |
|
"loss": 0.005, |
|
"reward": -7.135257016867399, |
|
"reward_std": 3.697649233043194, |
|
"rewards/cot_length_penalty_reward": -7.5548999309539795, |
|
"rewards/math_latex_accuracy_reward": 0.4196428684517741, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.002888819137297105, |
|
"epoch": 1.9244444444444444, |
|
"grad_norm": 0.10967403835477697, |
|
"kl": 0.04810333251953125, |
|
"learning_rate": 1.2486898871648552e-05, |
|
"loss": 0.0038, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.005481840795255266, |
|
"epoch": 1.96, |
|
"grad_norm": 0.14987018246678602, |
|
"kl": 0.05097198486328125, |
|
"learning_rate": 1.2181432413965428e-05, |
|
"loss": 0.0028, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.007852705341065302, |
|
"epoch": 1.9955555555555555, |
|
"grad_norm": 0.12818843141531608, |
|
"kl": 0.0562286376953125, |
|
"learning_rate": 1.187381314585725e-05, |
|
"loss": 0.0013, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2008.5916328430176, |
|
"epoch": 2.0355555555555553, |
|
"grad_norm": 0.1643995846810127, |
|
"kl": 0.0548858642578125, |
|
"learning_rate": 1.156434465040231e-05, |
|
"loss": 0.0118, |
|
"reward": -7.901306234300137, |
|
"reward_std": 2.6794423200190067, |
|
"rewards/cot_length_penalty_reward": -8.209341906011105, |
|
"rewards/math_latex_accuracy_reward": 0.3080357303842902, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0029279392474563792, |
|
"epoch": 2.071111111111111, |
|
"grad_norm": 0.12045324524183162, |
|
"kl": 0.0587005615234375, |
|
"learning_rate": 1.1253332335643043e-05, |
|
"loss": 0.0108, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.005807226421893574, |
|
"epoch": 2.1066666666666665, |
|
"grad_norm": 0.14841036250602688, |
|
"kl": 0.0640106201171875, |
|
"learning_rate": 1.0941083133185146e-05, |
|
"loss": 0.0097, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.1422222222222222, |
|
"grad_norm": 0.11089468253649072, |
|
"learning_rate": 1.0627905195293135e-05, |
|
"loss": 0.0084, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.1422222222222222, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 2107.960148737981, |
|
"eval_kl": 0.05983323317307692, |
|
"eval_loss": -0.00015631201677024364, |
|
"eval_reward": -8.494354761563814, |
|
"eval_reward_std": 3.4025442325151882, |
|
"eval_rewards/cot_length_penalty_reward": -8.876223013951229, |
|
"eval_rewards/math_latex_accuracy_reward": 0.3818681509448932, |
|
"eval_runtime": 422.1329, |
|
"eval_samples_per_second": 0.118, |
|
"eval_steps_per_second": 0.005, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.003193242686393205, |
|
"completion_length": 1800.6674766540527, |
|
"epoch": 2.1777777777777776, |
|
"grad_norm": 0.1697521453747013, |
|
"kl": 0.065582275390625, |
|
"learning_rate": 1.0314107590781284e-05, |
|
"loss": 0.0174, |
|
"reward": -8.092556223273277, |
|
"reward_std": 3.146493151783943, |
|
"rewards/cot_length_penalty_reward": -8.458627462387085, |
|
"rewards/math_latex_accuracy_reward": 0.3660714477300644, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.003192656353348866, |
|
"epoch": 2.2133333333333334, |
|
"grad_norm": 0.12330763778596482, |
|
"kl": 0.0718231201171875, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0163, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0062453514110529795, |
|
"epoch": 2.2488888888888887, |
|
"grad_norm": 0.16098784282181033, |
|
"kl": 0.078704833984375, |
|
"learning_rate": 9.685892409218718e-06, |
|
"loss": 0.0151, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.006978008910664357, |
|
"epoch": 2.2844444444444445, |
|
"grad_norm": 0.1406450810476633, |
|
"kl": 0.0782470703125, |
|
"learning_rate": 9.372094804706867e-06, |
|
"loss": 0.0137, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2595.997859954834, |
|
"epoch": 2.32, |
|
"grad_norm": 0.18250322704566987, |
|
"kl": 0.0649871826171875, |
|
"learning_rate": 9.058916866814857e-06, |
|
"loss": 0.0147, |
|
"reward": -9.348549716174603, |
|
"reward_std": 3.3840084299445152, |
|
"rewards/cot_length_penalty_reward": -9.70346000418067, |
|
"rewards/math_latex_accuracy_reward": 0.35491072852164507, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0031114893354242668, |
|
"epoch": 2.3555555555555556, |
|
"grad_norm": 0.13005553219968966, |
|
"kl": 0.0695648193359375, |
|
"learning_rate": 8.746667664356957e-06, |
|
"loss": 0.014, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0075038159266114235, |
|
"epoch": 2.391111111111111, |
|
"grad_norm": 0.19621512659848725, |
|
"kl": 0.0780181884765625, |
|
"learning_rate": 8.43565534959769e-06, |
|
"loss": 0.0133, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.006932365708053112, |
|
"epoch": 2.4266666666666667, |
|
"grad_norm": 0.13215694629988284, |
|
"kl": 0.07647705078125, |
|
"learning_rate": 8.126186854142752e-06, |
|
"loss": 0.0122, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2154.4286880493164, |
|
"epoch": 2.462222222222222, |
|
"grad_norm": 0.27197173025048144, |
|
"kl": 0.086151123046875, |
|
"learning_rate": 7.818567586034578e-06, |
|
"loss": 0.0247, |
|
"reward": -8.337913118302822, |
|
"reward_std": 3.0677984952926636, |
|
"rewards/cot_length_penalty_reward": -8.806663155555725, |
|
"rewards/math_latex_accuracy_reward": 0.4687500186264515, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.005053140237578191, |
|
"epoch": 2.497777777777778, |
|
"grad_norm": 0.20964263197545416, |
|
"kl": 0.0977630615234375, |
|
"learning_rate": 7.513101128351454e-06, |
|
"loss": 0.0237, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.005771905358415097, |
|
"epoch": 2.533333333333333, |
|
"grad_norm": 0.15787820635605407, |
|
"kl": 0.0987091064453125, |
|
"learning_rate": 7.210088939607709e-06, |
|
"loss": 0.0226, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0062158564978744835, |
|
"epoch": 2.568888888888889, |
|
"grad_norm": 0.4007267449310534, |
|
"kl": 0.0895538330078125, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 0.022, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1935.1675262451172, |
|
"epoch": 2.6044444444444443, |
|
"grad_norm": 0.266781012315019, |
|
"kl": 0.10394287109375, |
|
"learning_rate": 6.612620797547087e-06, |
|
"loss": 0.0125, |
|
"reward": -7.354442303534597, |
|
"reward_std": 2.94980551302433, |
|
"rewards/cot_length_penalty_reward": -7.771853107959032, |
|
"rewards/math_latex_accuracy_reward": 0.41741072852164507, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.01473489188356325, |
|
"epoch": 2.64, |
|
"grad_norm": 0.542093788713072, |
|
"kl": 0.1417083740234375, |
|
"learning_rate": 6.318754473153221e-06, |
|
"loss": 0.0132, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.009351018321467564, |
|
"epoch": 2.6755555555555555, |
|
"grad_norm": 0.32832820257493534, |
|
"kl": 0.1302490234375, |
|
"learning_rate": 6.028521093652195e-06, |
|
"loss": 0.0111, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.008401441504247487, |
|
"epoch": 2.7111111111111112, |
|
"grad_norm": 0.5313671762370776, |
|
"kl": 0.106719970703125, |
|
"learning_rate": 5.742207084349274e-06, |
|
"loss": 0.0105, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1853.8215065002441, |
|
"epoch": 2.7466666666666666, |
|
"grad_norm": 0.25601743476635025, |
|
"kl": 0.128814697265625, |
|
"learning_rate": 5.460095002604533e-06, |
|
"loss": -0.018, |
|
"reward": -7.114141087979078, |
|
"reward_std": 2.6430138647556305, |
|
"rewards/cot_length_penalty_reward": -7.493605274707079, |
|
"rewards/math_latex_accuracy_reward": 0.37946430314332247, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.004884305511950515, |
|
"epoch": 2.7822222222222224, |
|
"grad_norm": 0.18667971276259676, |
|
"kl": 0.1357421875, |
|
"learning_rate": 5.1824632589828465e-06, |
|
"loss": -0.019, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.008678867772687227, |
|
"epoch": 2.8177777777777777, |
|
"grad_norm": 0.2515967343714355, |
|
"kl": 0.1392822265625, |
|
"learning_rate": 4.909585842496287e-06, |
|
"loss": -0.0199, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.008155457631801255, |
|
"epoch": 2.8533333333333335, |
|
"grad_norm": 0.18942366870295294, |
|
"kl": 0.131805419921875, |
|
"learning_rate": 4.641732050210032e-06, |
|
"loss": -0.0211, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2211.9219856262207, |
|
"epoch": 2.888888888888889, |
|
"grad_norm": 0.22043905749174672, |
|
"kl": 0.1049957275390625, |
|
"learning_rate": 4.379166221478697e-06, |
|
"loss": -0.0247, |
|
"reward": -9.63335988484323, |
|
"reward_std": 2.9292308390140533, |
|
"rewards/cot_length_penalty_reward": -10.111038556322455, |
|
"rewards/math_latex_accuracy_reward": 0.4776785969734192, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.002452510700095445, |
|
"epoch": 2.924444444444444, |
|
"grad_norm": 0.2264032851218282, |
|
"kl": 0.1047821044921875, |
|
"learning_rate": 4.12214747707527e-06, |
|
"loss": -0.0248, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0035177832323824987, |
|
"epoch": 2.96, |
|
"grad_norm": 0.14675046732213165, |
|
"kl": 0.1127166748046875, |
|
"learning_rate": 3.8709294634702374e-06, |
|
"loss": -0.0259, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0036856129445368424, |
|
"epoch": 2.9955555555555557, |
|
"grad_norm": 0.1630420640767936, |
|
"kl": 0.09552001953125, |
|
"learning_rate": 3.625760102513103e-06, |
|
"loss": -0.0267, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1545.3081169128418, |
|
"epoch": 3.0355555555555553, |
|
"grad_norm": 14.400966954596354, |
|
"kl": 0.286773681640625, |
|
"learning_rate": 3.3868813467634833e-06, |
|
"loss": -0.0265, |
|
"reward": -7.1288284212350845, |
|
"reward_std": 1.927463386207819, |
|
"rewards/cot_length_penalty_reward": -7.606507122516632, |
|
"rewards/math_latex_accuracy_reward": 0.47767859511077404, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0029736382130067796, |
|
"epoch": 3.071111111111111, |
|
"grad_norm": 0.46033460120838376, |
|
"kl": 0.130645751953125, |
|
"learning_rate": 3.1545289407131128e-06, |
|
"loss": -0.0322, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0043519225146155804, |
|
"epoch": 3.1066666666666665, |
|
"grad_norm": 0.2864629379691617, |
|
"kl": 0.13775634765625, |
|
"learning_rate": 2.9289321881345257e-06, |
|
"loss": -0.0338, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.009186911847791635, |
|
"epoch": 3.1422222222222222, |
|
"grad_norm": 0.24547967049213823, |
|
"kl": 0.155426025390625, |
|
"learning_rate": 2.7103137257858867e-06, |
|
"loss": -0.0347, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1969.4442825317383, |
|
"epoch": 3.1777777777777776, |
|
"grad_norm": 0.433485726828516, |
|
"kl": 0.1619415283203125, |
|
"learning_rate": 2.4988893036954045e-06, |
|
"loss": -0.0013, |
|
"reward": -8.979866623878479, |
|
"reward_std": 2.524892296642065, |
|
"rewards/cot_length_penalty_reward": -9.319152384996414, |
|
"rewards/math_latex_accuracy_reward": 0.339285729220137, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 3.2133333333333334, |
|
"grad_norm": 0.2766265099056435, |
|
"learning_rate": 2.2948675722421086e-06, |
|
"loss": -0.003, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.2133333333333334, |
|
"eval_clip_ratio": 0.0, |
|
"eval_completion_length": 2205.780292217548, |
|
"eval_kl": 0.18556565504807693, |
|
"eval_loss": 0.01654699072241783, |
|
"eval_reward": -7.743022455332371, |
|
"eval_reward_std": 2.8156597109941335, |
|
"eval_rewards/cot_length_penalty_reward": -8.072692573070526, |
|
"eval_rewards/math_latex_accuracy_reward": 0.32967034670022816, |
|
"eval_runtime": 490.0675, |
|
"eval_samples_per_second": 0.102, |
|
"eval_steps_per_second": 0.004, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.006224101292900741, |
|
"epoch": 3.2488888888888887, |
|
"grad_norm": 0.4063813985520313, |
|
"kl": 0.197174072265625, |
|
"learning_rate": 2.098449876243096e-06, |
|
"loss": -0.0037, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.009523139509838074, |
|
"epoch": 3.2844444444444445, |
|
"grad_norm": 0.26707887580226697, |
|
"kl": 0.196014404296875, |
|
"learning_rate": 1.9098300562505266e-06, |
|
"loss": -0.0047, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2093.5982818603516, |
|
"epoch": 3.32, |
|
"grad_norm": 0.3581002181636425, |
|
"kl": 0.199798583984375, |
|
"learning_rate": 1.7291942572543806e-06, |
|
"loss": 0.0178, |
|
"reward": -7.027399688959122, |
|
"reward_std": 2.7810670882463455, |
|
"rewards/cot_length_penalty_reward": -7.435881897807121, |
|
"rewards/math_latex_accuracy_reward": 0.40848216507583857, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.002175023495510686, |
|
"epoch": 3.3555555555555556, |
|
"grad_norm": 0.35335279380104717, |
|
"kl": 0.19476318359375, |
|
"learning_rate": 1.5567207449798517e-06, |
|
"loss": 0.017, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0038238488195929676, |
|
"epoch": 3.391111111111111, |
|
"grad_norm": 0.26816647206060557, |
|
"kl": 0.21075439453125, |
|
"learning_rate": 1.3925797299605649e-06, |
|
"loss": 0.0159, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.006063876280677505, |
|
"epoch": 3.4266666666666667, |
|
"grad_norm": 0.3302248399941887, |
|
"kl": 0.22137451171875, |
|
"learning_rate": 1.2369331995613664e-06, |
|
"loss": 0.0151, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2630.805938720703, |
|
"epoch": 3.462222222222222, |
|
"grad_norm": 1.7166272647509115, |
|
"kl": 0.281219482421875, |
|
"learning_rate": 1.0899347581163222e-06, |
|
"loss": 0.0839, |
|
"reward": -7.589185383694712, |
|
"reward_std": 3.134066376835108, |
|
"rewards/cot_length_penalty_reward": -7.917310604825616, |
|
"rewards/math_latex_accuracy_reward": 0.32812501839362085, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.007515597055316903, |
|
"epoch": 3.497777777777778, |
|
"grad_norm": 4.303565090937016, |
|
"kl": 0.204925537109375, |
|
"learning_rate": 9.517294753398066e-07, |
|
"loss": 0.0869, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0075942349576507695, |
|
"epoch": 3.533333333333333, |
|
"grad_norm": 2.9889664346961444, |
|
"kl": 0.2061920166015625, |
|
"learning_rate": 8.224537431601886e-07, |
|
"loss": 0.0841, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0051619461009977385, |
|
"epoch": 3.568888888888889, |
|
"grad_norm": 0.4431396626395719, |
|
"kl": 0.22930908203125, |
|
"learning_rate": 7.022351411174866e-07, |
|
"loss": 0.0814, |
|
"step": 100 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2545.8371925354004, |
|
"epoch": 3.6044444444444443, |
|
"grad_norm": 0.4301809872674547, |
|
"kl": 0.171051025390625, |
|
"learning_rate": 5.911923104577455e-07, |
|
"loss": 0.0196, |
|
"reward": -10.28242233581841, |
|
"reward_std": 3.0546065159142017, |
|
"rewards/cot_length_penalty_reward": -10.666351079940796, |
|
"rewards/math_latex_accuracy_reward": 0.3839285857975483, |
|
"step": 101 |
|
}, |
|
{ |
|
"clip_ratio": 0.0024334693371201865, |
|
"epoch": 3.64, |
|
"grad_norm": 0.40563497309844976, |
|
"kl": 0.19537353515625, |
|
"learning_rate": 4.894348370484648e-07, |
|
"loss": 0.0191, |
|
"step": 102 |
|
}, |
|
{ |
|
"clip_ratio": 0.00391879488597624, |
|
"epoch": 3.6755555555555555, |
|
"grad_norm": 0.5720892424115457, |
|
"kl": 0.21160888671875, |
|
"learning_rate": 3.9706314323056936e-07, |
|
"loss": 0.0191, |
|
"step": 103 |
|
}, |
|
{ |
|
"clip_ratio": 0.005467013252200559, |
|
"epoch": 3.7111111111111112, |
|
"grad_norm": 0.5349890131537904, |
|
"kl": 0.21337890625, |
|
"learning_rate": 3.1416838871368925e-07, |
|
"loss": 0.0189, |
|
"step": 104 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 2160.7389335632324, |
|
"epoch": 3.7466666666666666, |
|
"grad_norm": 13.207091106199918, |
|
"kl": 0.7435302734375, |
|
"learning_rate": 2.4083238061252565e-07, |
|
"loss": 0.0564, |
|
"reward": -7.942288625985384, |
|
"reward_std": 2.594830472022295, |
|
"rewards/cot_length_penalty_reward": -8.37309193611145, |
|
"rewards/math_latex_accuracy_reward": 0.43080358672887087, |
|
"step": 105 |
|
}, |
|
{ |
|
"clip_ratio": 0.0028747237083734944, |
|
"epoch": 3.7822222222222224, |
|
"grad_norm": 3.5352628792853067, |
|
"kl": 0.472412109375, |
|
"learning_rate": 1.7712749271311392e-07, |
|
"loss": 0.0465, |
|
"step": 106 |
|
}, |
|
{ |
|
"clip_ratio": 0.004760361814987846, |
|
"epoch": 3.8177777777777777, |
|
"grad_norm": 0.94419108473764, |
|
"kl": 0.3760833740234375, |
|
"learning_rate": 1.231165940486234e-07, |
|
"loss": 0.0439, |
|
"step": 107 |
|
}, |
|
{ |
|
"clip_ratio": 0.0061231208674144, |
|
"epoch": 3.8533333333333335, |
|
"grad_norm": 1.7315584230873846, |
|
"kl": 0.3495025634765625, |
|
"learning_rate": 7.885298685522235e-08, |
|
"loss": 0.044, |
|
"step": 108 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 1982.8750839233398, |
|
"epoch": 3.888888888888889, |
|
"grad_norm": 0.7007233776562277, |
|
"kl": 0.2796783447265625, |
|
"learning_rate": 4.438035396920004e-08, |
|
"loss": 0.0207, |
|
"reward": -9.675179054960608, |
|
"reward_std": 2.574063938111067, |
|
"rewards/cot_length_penalty_reward": -9.989911276847124, |
|
"rewards/math_latex_accuracy_reward": 0.314732160884887, |
|
"step": 109 |
|
}, |
|
{ |
|
"clip_ratio": 0.001981915433134418, |
|
"epoch": 3.924444444444444, |
|
"grad_norm": 0.6787732749959883, |
|
"kl": 0.2747039794921875, |
|
"learning_rate": 1.973271571728441e-08, |
|
"loss": 0.0208, |
|
"step": 110 |
|
}, |
|
{ |
|
"clip_ratio": 0.0019334297030582093, |
|
"epoch": 3.96, |
|
"grad_norm": 0.6415798052337234, |
|
"kl": 0.27728271484375, |
|
"learning_rate": 4.9343963426840006e-09, |
|
"loss": 0.0206, |
|
"step": 111 |
|
}, |
|
{ |
|
"clip_ratio": 0.0017770093054423342, |
|
"epoch": 3.9955555555555557, |
|
"grad_norm": 0.6316769640873855, |
|
"kl": 0.30633544921875, |
|
"learning_rate": 0.0, |
|
"loss": 0.0207, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 3.9955555555555557, |
|
"step": 112, |
|
"total_flos": 0.0, |
|
"train_loss": 0.6775813137989773, |
|
"train_runtime": 20674.7257, |
|
"train_samples_per_second": 0.087, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 112, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|