Qwen2.5-1.5B-Open-R1-GRPO-cot-v3 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9955555555555557,
"eval_steps": 30,
"global_step": 112,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 1992.2813301086426,
"epoch": 0.035555555555555556,
"grad_norm": 0.11222778239149554,
"kl": 0.0,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0509,
"reward": -8.534029252827168,
"reward_std": 3.1286671087145805,
"rewards/cot_length_penalty_reward": -8.792958237230778,
"rewards/math_latex_accuracy_reward": 0.2589285857975483,
"step": 1
},
{
"clip_ratio": 0.0,
"epoch": 0.07111111111111111,
"grad_norm": 0.11224900964736327,
"kl": 0.0,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.0509,
"step": 2
},
{
"clip_ratio": 0.002639908329001628,
"epoch": 0.10666666666666667,
"grad_norm": 0.11190265883625335,
"kl": 0.0004132986068725586,
"learning_rate": 5e-06,
"loss": 0.051,
"step": 3
},
{
"clip_ratio": 0.0026859724457608536,
"epoch": 0.14222222222222222,
"grad_norm": 0.10842287053917446,
"kl": 0.00042808055877685547,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0506,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2354.6050567626953,
"epoch": 0.17777777777777778,
"grad_norm": 0.11660083282365401,
"kl": 0.0005452632904052734,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0532,
"reward": -9.576663568615913,
"reward_std": 3.4961936213076115,
"rewards/cot_length_penalty_reward": -9.817734986543655,
"rewards/math_latex_accuracy_reward": 0.24107144074514508,
"step": 5
},
{
"clip_ratio": 0.004271271725883707,
"epoch": 0.21333333333333335,
"grad_norm": 0.15124528039054966,
"kl": 0.0023946762084960938,
"learning_rate": 1e-05,
"loss": 0.0521,
"step": 6
},
{
"clip_ratio": 0.00593576196115464,
"epoch": 0.24888888888888888,
"grad_norm": 0.22845939154064746,
"kl": 0.0017180442810058594,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.0519,
"step": 7
},
{
"clip_ratio": 0.00681446076487191,
"epoch": 0.28444444444444444,
"grad_norm": 0.25815482225017694,
"kl": 0.002631664276123047,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0484,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2251.4844703674316,
"epoch": 0.32,
"grad_norm": 0.11515864264586664,
"kl": 0.0024547576904296875,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0261,
"reward": -8.878705874085426,
"reward_std": 3.5140193179249763,
"rewards/cot_length_penalty_reward": -9.128705888986588,
"rewards/math_latex_accuracy_reward": 0.2500000149011612,
"step": 9
},
{
"clip_ratio": 0.0070763813419034705,
"epoch": 0.35555555555555557,
"grad_norm": 0.2974695944541913,
"kl": 0.005417823791503906,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0255,
"step": 10
},
{
"clip_ratio": 0.010105093329912052,
"epoch": 0.39111111111111113,
"grad_norm": 85.04432926044146,
"kl": 0.007180213928222656,
"learning_rate": 1.8333333333333333e-05,
"loss": 19.5224,
"step": 11
},
{
"clip_ratio": 0.017627036664634943,
"epoch": 0.4266666666666667,
"grad_norm": 2.432235493998052,
"kl": 0.0702056884765625,
"learning_rate": 2e-05,
"loss": 0.0247,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1870.0737648010254,
"epoch": 0.4622222222222222,
"grad_norm": 0.32693917550649865,
"kl": 0.022480010986328125,
"learning_rate": 1.9995065603657317e-05,
"loss": 0.0103,
"reward": -9.444286078214645,
"reward_std": 3.207320176064968,
"rewards/cot_length_penalty_reward": -9.848303943872452,
"rewards/math_latex_accuracy_reward": 0.4040178805589676,
"step": 13
},
{
"clip_ratio": 0.004404508654261008,
"epoch": 0.49777777777777776,
"grad_norm": 1.6311442510668763,
"kl": 0.010528564453125,
"learning_rate": 1.9980267284282718e-05,
"loss": 0.0093,
"step": 14
},
{
"clip_ratio": 0.0062026621017139405,
"epoch": 0.5333333333333333,
"grad_norm": 0.5615009008596469,
"kl": 0.051082611083984375,
"learning_rate": 1.99556196460308e-05,
"loss": 0.0076,
"step": 15
},
{
"clip_ratio": 0.006701507809339091,
"epoch": 0.5688888888888889,
"grad_norm": 0.15171374489080983,
"kl": 0.018802642822265625,
"learning_rate": 1.9921147013144782e-05,
"loss": 0.0041,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 2222.4309005737305,
"epoch": 0.6044444444444445,
"grad_norm": 0.11283429637288535,
"kl": 0.01442718505859375,
"learning_rate": 1.9876883405951378e-05,
"loss": 0.081,
"reward": -9.554554164409637,
"reward_std": 4.235630825161934,
"rewards/cot_length_penalty_reward": -9.844732716679573,
"rewards/math_latex_accuracy_reward": 0.29017858393490314,
"step": 17
},
{
"clip_ratio": 0.004120954225072637,
"epoch": 0.64,
"grad_norm": 0.10899171004441378,
"kl": 0.015628814697265625,
"learning_rate": 1.982287250728689e-05,
"loss": 0.2482,
"step": 18
},
{
"clip_ratio": 0.005419444481958635,
"epoch": 0.6755555555555556,
"grad_norm": 0.1157331857796372,
"kl": 0.01764678955078125,
"learning_rate": 1.9759167619387474e-05,
"loss": 0.2459,
"step": 19
},
{
"clip_ratio": 0.005958295805612579,
"epoch": 0.7111111111111111,
"grad_norm": 0.10652991525129403,
"kl": 0.01905059814453125,
"learning_rate": 1.9685831611286312e-05,
"loss": 0.2434,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2501.2255668640137,
"epoch": 0.7466666666666667,
"grad_norm": 0.12065925342674215,
"kl": 0.020366668701171875,
"learning_rate": 1.9602936856769432e-05,
"loss": 0.033,
"reward": -11.581663489341736,
"reward_std": 4.305310405790806,
"rewards/cot_length_penalty_reward": -11.86737784743309,
"rewards/math_latex_accuracy_reward": 0.2857142973225564,
"step": 21
},
{
"clip_ratio": 0.0039043642027536407,
"epoch": 0.7822222222222223,
"grad_norm": 0.36463686705700576,
"kl": 0.019252777099609375,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.0325,
"step": 22
},
{
"clip_ratio": 0.005750590149546042,
"epoch": 0.8177777777777778,
"grad_norm": 20283.4909421177,
"kl": 1147.058982849121,
"learning_rate": 1.9408807689542257e-05,
"loss": 46.027,
"step": 23
},
{
"clip_ratio": 0.008487990504363552,
"epoch": 0.8533333333333334,
"grad_norm": 0.17175661916148474,
"kl": 0.026885986328125,
"learning_rate": 1.9297764858882516e-05,
"loss": 0.0289,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 2333.062614440918,
"epoch": 0.8888888888888888,
"grad_norm": 0.10364225115067384,
"kl": 0.0201416015625,
"learning_rate": 1.9177546256839814e-05,
"loss": 0.0113,
"reward": -10.619498401880264,
"reward_std": 3.768970273435116,
"rewards/cot_length_penalty_reward": -10.869498312473297,
"rewards/math_latex_accuracy_reward": 0.2500000123400241,
"step": 25
},
{
"clip_ratio": 0.0032545153953833506,
"epoch": 0.9244444444444444,
"grad_norm": 0.10471903488067007,
"kl": 0.02140045166015625,
"learning_rate": 1.9048270524660197e-05,
"loss": 0.0103,
"step": 26
},
{
"clip_ratio": 0.004354664255515672,
"epoch": 0.96,
"grad_norm": 0.0965546219660842,
"kl": 0.0223541259765625,
"learning_rate": 1.891006524188368e-05,
"loss": 0.0085,
"step": 27
},
{
"clip_ratio": 0.005275880845147185,
"epoch": 0.9955555555555555,
"grad_norm": 0.10544368470129743,
"kl": 0.023712158203125,
"learning_rate": 1.8763066800438638e-05,
"loss": 0.0065,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 2427.296974182129,
"epoch": 1.0355555555555556,
"grad_norm": 0.42983301720943096,
"kl": 0.03394317626953125,
"learning_rate": 1.860742027003944e-05,
"loss": 0.0039,
"reward": -11.214900106191635,
"reward_std": 3.760936316102743,
"rewards/cot_length_penalty_reward": -11.5006143450737,
"rewards/math_latex_accuracy_reward": 0.285714297555387,
"step": 29
},
{
"epoch": 1.0711111111111111,
"grad_norm": 0.1067773530257121,
"learning_rate": 1.8443279255020153e-05,
"loss": 0.0071,
"step": 30
},
{
"epoch": 1.0711111111111111,
"eval_clip_ratio": 0.0,
"eval_completion_length": 2303.7637939453125,
"eval_kl": 0.025606595552884616,
"eval_loss": 0.04365207254886627,
"eval_reward": -8.784464891140278,
"eval_reward_std": 3.7600448498359094,
"eval_rewards/cot_length_penalty_reward": -9.116882379238422,
"eval_rewards/math_latex_accuracy_reward": 0.3324175958450024,
"eval_runtime": 448.0952,
"eval_samples_per_second": 0.112,
"eval_steps_per_second": 0.004,
"step": 30
},
{
"clip_ratio": 0.0037656883359886706,
"epoch": 1.1066666666666667,
"grad_norm": 0.7346983824268155,
"kl": 0.027835845947265625,
"learning_rate": 1.827080574274562e-05,
"loss": 0.0033,
"step": 31
},
{
"clip_ratio": 0.006145871157059446,
"epoch": 1.1422222222222222,
"grad_norm": 11.473259751864658,
"kl": 1.4422760009765625,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.0549,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 2531.1674995422363,
"epoch": 1.1777777777777778,
"grad_norm": 0.11984100416609303,
"kl": 0.03124237060546875,
"learning_rate": 1.7901550123756906e-05,
"loss": 0.0239,
"reward": -8.361417755484581,
"reward_std": 4.016169548034668,
"rewards/cot_length_penalty_reward": -8.689542889595032,
"rewards/math_latex_accuracy_reward": 0.3281250139698386,
"step": 33
},
{
"clip_ratio": 0.004058451057062484,
"epoch": 1.2133333333333334,
"grad_norm": 0.16684935043495766,
"kl": 0.03450775146484375,
"learning_rate": 1.7705132427757895e-05,
"loss": 0.0232,
"step": 34
},
{
"clip_ratio": 0.006084064312744886,
"epoch": 1.248888888888889,
"grad_norm": 0.11163561617870978,
"kl": 0.0318145751953125,
"learning_rate": 1.7501110696304598e-05,
"loss": 0.0214,
"step": 35
},
{
"clip_ratio": 0.007263028150191531,
"epoch": 1.2844444444444445,
"grad_norm": 0.11128954549532989,
"kl": 0.03281402587890625,
"learning_rate": 1.7289686274214116e-05,
"loss": 0.0197,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 1888.4487342834473,
"epoch": 1.32,
"grad_norm": 0.6837402463902799,
"kl": 0.05413055419921875,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.1151,
"reward": -7.783694684505463,
"reward_std": 3.098730646073818,
"rewards/cot_length_penalty_reward": -8.16985534131527,
"rewards/math_latex_accuracy_reward": 0.3861607341095805,
"step": 37
},
{
"clip_ratio": 0.0028788788622478023,
"epoch": 1.3555555555555556,
"grad_norm": 2.464864347264584,
"kl": 0.04084014892578125,
"learning_rate": 1.684547105928689e-05,
"loss": 0.3644,
"step": 38
},
{
"clip_ratio": 0.004996606716304086,
"epoch": 1.3911111111111112,
"grad_norm": 0.32236476835294475,
"kl": 0.04229736328125,
"learning_rate": 1.661311865323652e-05,
"loss": 0.1132,
"step": 39
},
{
"clip_ratio": 0.005848184140631929,
"epoch": 1.4266666666666667,
"grad_norm": 2.236113570550632,
"kl": 0.180694580078125,
"learning_rate": 1.63742398974869e-05,
"loss": 0.116,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1681.7277793884277,
"epoch": 1.462222222222222,
"grad_norm": 55.91487468875501,
"kl": 1.1835174560546875,
"learning_rate": 1.6129070536529767e-05,
"loss": 0.0918,
"reward": -7.822701282799244,
"reward_std": 2.5297958850860596,
"rewards/cot_length_penalty_reward": -8.191005058586597,
"rewards/math_latex_accuracy_reward": 0.3683035862632096,
"step": 41
},
{
"clip_ratio": 0.003095990905421786,
"epoch": 1.4977777777777779,
"grad_norm": 3454.9772869499748,
"kl": 0.0470123291015625,
"learning_rate": 1.5877852522924733e-05,
"loss": 6.8427,
"step": 42
},
{
"clip_ratio": 0.005094703097711317,
"epoch": 1.5333333333333332,
"grad_norm": 15.584761637863307,
"kl": 1.0414886474609375,
"learning_rate": 1.5620833778521306e-05,
"loss": 0.0866,
"step": 43
},
{
"clip_ratio": 0.008272722363471985,
"epoch": 1.568888888888889,
"grad_norm": 1.2550063449844295,
"kl": 0.04656219482421875,
"learning_rate": 1.5358267949789968e-05,
"loss": 0.0502,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 2313.542510986328,
"epoch": 1.6044444444444443,
"grad_norm": 0.1122939508940078,
"kl": 0.038604736328125,
"learning_rate": 1.5090414157503715e-05,
"loss": 0.0762,
"reward": -9.223605461418629,
"reward_std": 3.827972359955311,
"rewards/cot_length_penalty_reward": -9.58521255850792,
"rewards/math_latex_accuracy_reward": 0.36160715692676604,
"step": 45
},
{
"clip_ratio": 0.0038211173960007727,
"epoch": 1.6400000000000001,
"grad_norm": 0.12378389214572502,
"kl": 0.04041290283203125,
"learning_rate": 1.4817536741017153e-05,
"loss": 0.0756,
"step": 46
},
{
"clip_ratio": 0.005922177180764265,
"epoch": 1.6755555555555555,
"grad_norm": 0.12771635689812005,
"kl": 0.04157257080078125,
"learning_rate": 1.4539904997395468e-05,
"loss": 0.0745,
"step": 47
},
{
"clip_ratio": 0.006733638554578647,
"epoch": 1.7111111111111112,
"grad_norm": 0.10941185512569215,
"kl": 0.04193115234375,
"learning_rate": 1.4257792915650728e-05,
"loss": 0.0731,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 1813.1072387695312,
"epoch": 1.7466666666666666,
"grad_norm": 3.0104818633298525,
"kl": 0.2118988037109375,
"learning_rate": 1.3971478906347806e-05,
"loss": -0.0205,
"reward": -10.289654642343521,
"reward_std": 3.131831008940935,
"rewards/cot_length_penalty_reward": -10.762868821620941,
"rewards/math_latex_accuracy_reward": 0.47321430779993534,
"step": 49
},
{
"clip_ratio": 0.002227201643108856,
"epoch": 1.7822222222222224,
"grad_norm": 0.11902180384589063,
"kl": 0.04238128662109375,
"learning_rate": 1.3681245526846782e-05,
"loss": -0.0276,
"step": 50
},
{
"clip_ratio": 0.0032200364221353084,
"epoch": 1.8177777777777777,
"grad_norm": 0.1256236704087802,
"kl": 0.04290008544921875,
"learning_rate": 1.3387379202452917e-05,
"loss": -0.0286,
"step": 51
},
{
"clip_ratio": 0.003942100578569807,
"epoch": 1.8533333333333335,
"grad_norm": 0.10119215538963353,
"kl": 0.0430450439453125,
"learning_rate": 1.3090169943749475e-05,
"loss": -0.03,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2273.6608276367188,
"epoch": 1.8888888888888888,
"grad_norm": 0.17450385200895388,
"kl": 0.05017852783203125,
"learning_rate": 1.2789911060392295e-05,
"loss": 0.005,
"reward": -7.135257016867399,
"reward_std": 3.697649233043194,
"rewards/cot_length_penalty_reward": -7.5548999309539795,
"rewards/math_latex_accuracy_reward": 0.4196428684517741,
"step": 53
},
{
"clip_ratio": 0.002888819137297105,
"epoch": 1.9244444444444444,
"grad_norm": 0.10967403835477697,
"kl": 0.04810333251953125,
"learning_rate": 1.2486898871648552e-05,
"loss": 0.0038,
"step": 54
},
{
"clip_ratio": 0.005481840795255266,
"epoch": 1.96,
"grad_norm": 0.14987018246678602,
"kl": 0.05097198486328125,
"learning_rate": 1.2181432413965428e-05,
"loss": 0.0028,
"step": 55
},
{
"clip_ratio": 0.007852705341065302,
"epoch": 1.9955555555555555,
"grad_norm": 0.12818843141531608,
"kl": 0.0562286376953125,
"learning_rate": 1.187381314585725e-05,
"loss": 0.0013,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2008.5916328430176,
"epoch": 2.0355555555555553,
"grad_norm": 0.1643995846810127,
"kl": 0.0548858642578125,
"learning_rate": 1.156434465040231e-05,
"loss": 0.0118,
"reward": -7.901306234300137,
"reward_std": 2.6794423200190067,
"rewards/cot_length_penalty_reward": -8.209341906011105,
"rewards/math_latex_accuracy_reward": 0.3080357303842902,
"step": 57
},
{
"clip_ratio": 0.0029279392474563792,
"epoch": 2.071111111111111,
"grad_norm": 0.12045324524183162,
"kl": 0.0587005615234375,
"learning_rate": 1.1253332335643043e-05,
"loss": 0.0108,
"step": 58
},
{
"clip_ratio": 0.005807226421893574,
"epoch": 2.1066666666666665,
"grad_norm": 0.14841036250602688,
"kl": 0.0640106201171875,
"learning_rate": 1.0941083133185146e-05,
"loss": 0.0097,
"step": 59
},
{
"epoch": 2.1422222222222222,
"grad_norm": 0.11089468253649072,
"learning_rate": 1.0627905195293135e-05,
"loss": 0.0084,
"step": 60
},
{
"epoch": 2.1422222222222222,
"eval_clip_ratio": 0.0,
"eval_completion_length": 2107.960148737981,
"eval_kl": 0.05983323317307692,
"eval_loss": -0.00015631201677024364,
"eval_reward": -8.494354761563814,
"eval_reward_std": 3.4025442325151882,
"eval_rewards/cot_length_penalty_reward": -8.876223013951229,
"eval_rewards/math_latex_accuracy_reward": 0.3818681509448932,
"eval_runtime": 422.1329,
"eval_samples_per_second": 0.118,
"eval_steps_per_second": 0.005,
"step": 60
},
{
"clip_ratio": 0.003193242686393205,
"completion_length": 1800.6674766540527,
"epoch": 2.1777777777777776,
"grad_norm": 0.1697521453747013,
"kl": 0.065582275390625,
"learning_rate": 1.0314107590781284e-05,
"loss": 0.0174,
"reward": -8.092556223273277,
"reward_std": 3.146493151783943,
"rewards/cot_length_penalty_reward": -8.458627462387085,
"rewards/math_latex_accuracy_reward": 0.3660714477300644,
"step": 61
},
{
"clip_ratio": 0.003192656353348866,
"epoch": 2.2133333333333334,
"grad_norm": 0.12330763778596482,
"kl": 0.0718231201171875,
"learning_rate": 1e-05,
"loss": 0.0163,
"step": 62
},
{
"clip_ratio": 0.0062453514110529795,
"epoch": 2.2488888888888887,
"grad_norm": 0.16098784282181033,
"kl": 0.078704833984375,
"learning_rate": 9.685892409218718e-06,
"loss": 0.0151,
"step": 63
},
{
"clip_ratio": 0.006978008910664357,
"epoch": 2.2844444444444445,
"grad_norm": 0.1406450810476633,
"kl": 0.0782470703125,
"learning_rate": 9.372094804706867e-06,
"loss": 0.0137,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 2595.997859954834,
"epoch": 2.32,
"grad_norm": 0.18250322704566987,
"kl": 0.0649871826171875,
"learning_rate": 9.058916866814857e-06,
"loss": 0.0147,
"reward": -9.348549716174603,
"reward_std": 3.3840084299445152,
"rewards/cot_length_penalty_reward": -9.70346000418067,
"rewards/math_latex_accuracy_reward": 0.35491072852164507,
"step": 65
},
{
"clip_ratio": 0.0031114893354242668,
"epoch": 2.3555555555555556,
"grad_norm": 0.13005553219968966,
"kl": 0.0695648193359375,
"learning_rate": 8.746667664356957e-06,
"loss": 0.014,
"step": 66
},
{
"clip_ratio": 0.0075038159266114235,
"epoch": 2.391111111111111,
"grad_norm": 0.19621512659848725,
"kl": 0.0780181884765625,
"learning_rate": 8.43565534959769e-06,
"loss": 0.0133,
"step": 67
},
{
"clip_ratio": 0.006932365708053112,
"epoch": 2.4266666666666667,
"grad_norm": 0.13215694629988284,
"kl": 0.07647705078125,
"learning_rate": 8.126186854142752e-06,
"loss": 0.0122,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 2154.4286880493164,
"epoch": 2.462222222222222,
"grad_norm": 0.27197173025048144,
"kl": 0.086151123046875,
"learning_rate": 7.818567586034578e-06,
"loss": 0.0247,
"reward": -8.337913118302822,
"reward_std": 3.0677984952926636,
"rewards/cot_length_penalty_reward": -8.806663155555725,
"rewards/math_latex_accuracy_reward": 0.4687500186264515,
"step": 69
},
{
"clip_ratio": 0.005053140237578191,
"epoch": 2.497777777777778,
"grad_norm": 0.20964263197545416,
"kl": 0.0977630615234375,
"learning_rate": 7.513101128351454e-06,
"loss": 0.0237,
"step": 70
},
{
"clip_ratio": 0.005771905358415097,
"epoch": 2.533333333333333,
"grad_norm": 0.15787820635605407,
"kl": 0.0987091064453125,
"learning_rate": 7.210088939607709e-06,
"loss": 0.0226,
"step": 71
},
{
"clip_ratio": 0.0062158564978744835,
"epoch": 2.568888888888889,
"grad_norm": 0.4007267449310534,
"kl": 0.0895538330078125,
"learning_rate": 6.909830056250527e-06,
"loss": 0.022,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 1935.1675262451172,
"epoch": 2.6044444444444443,
"grad_norm": 0.266781012315019,
"kl": 0.10394287109375,
"learning_rate": 6.612620797547087e-06,
"loss": 0.0125,
"reward": -7.354442303534597,
"reward_std": 2.94980551302433,
"rewards/cot_length_penalty_reward": -7.771853107959032,
"rewards/math_latex_accuracy_reward": 0.41741072852164507,
"step": 73
},
{
"clip_ratio": 0.01473489188356325,
"epoch": 2.64,
"grad_norm": 0.542093788713072,
"kl": 0.1417083740234375,
"learning_rate": 6.318754473153221e-06,
"loss": 0.0132,
"step": 74
},
{
"clip_ratio": 0.009351018321467564,
"epoch": 2.6755555555555555,
"grad_norm": 0.32832820257493534,
"kl": 0.1302490234375,
"learning_rate": 6.028521093652195e-06,
"loss": 0.0111,
"step": 75
},
{
"clip_ratio": 0.008401441504247487,
"epoch": 2.7111111111111112,
"grad_norm": 0.5313671762370776,
"kl": 0.106719970703125,
"learning_rate": 5.742207084349274e-06,
"loss": 0.0105,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1853.8215065002441,
"epoch": 2.7466666666666666,
"grad_norm": 0.25601743476635025,
"kl": 0.128814697265625,
"learning_rate": 5.460095002604533e-06,
"loss": -0.018,
"reward": -7.114141087979078,
"reward_std": 2.6430138647556305,
"rewards/cot_length_penalty_reward": -7.493605274707079,
"rewards/math_latex_accuracy_reward": 0.37946430314332247,
"step": 77
},
{
"clip_ratio": 0.004884305511950515,
"epoch": 2.7822222222222224,
"grad_norm": 0.18667971276259676,
"kl": 0.1357421875,
"learning_rate": 5.1824632589828465e-06,
"loss": -0.019,
"step": 78
},
{
"clip_ratio": 0.008678867772687227,
"epoch": 2.8177777777777777,
"grad_norm": 0.2515967343714355,
"kl": 0.1392822265625,
"learning_rate": 4.909585842496287e-06,
"loss": -0.0199,
"step": 79
},
{
"clip_ratio": 0.008155457631801255,
"epoch": 2.8533333333333335,
"grad_norm": 0.18942366870295294,
"kl": 0.131805419921875,
"learning_rate": 4.641732050210032e-06,
"loss": -0.0211,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 2211.9219856262207,
"epoch": 2.888888888888889,
"grad_norm": 0.22043905749174672,
"kl": 0.1049957275390625,
"learning_rate": 4.379166221478697e-06,
"loss": -0.0247,
"reward": -9.63335988484323,
"reward_std": 2.9292308390140533,
"rewards/cot_length_penalty_reward": -10.111038556322455,
"rewards/math_latex_accuracy_reward": 0.4776785969734192,
"step": 81
},
{
"clip_ratio": 0.002452510700095445,
"epoch": 2.924444444444444,
"grad_norm": 0.2264032851218282,
"kl": 0.1047821044921875,
"learning_rate": 4.12214747707527e-06,
"loss": -0.0248,
"step": 82
},
{
"clip_ratio": 0.0035177832323824987,
"epoch": 2.96,
"grad_norm": 0.14675046732213165,
"kl": 0.1127166748046875,
"learning_rate": 3.8709294634702374e-06,
"loss": -0.0259,
"step": 83
},
{
"clip_ratio": 0.0036856129445368424,
"epoch": 2.9955555555555557,
"grad_norm": 0.1630420640767936,
"kl": 0.09552001953125,
"learning_rate": 3.625760102513103e-06,
"loss": -0.0267,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1545.3081169128418,
"epoch": 3.0355555555555553,
"grad_norm": 14.400966954596354,
"kl": 0.286773681640625,
"learning_rate": 3.3868813467634833e-06,
"loss": -0.0265,
"reward": -7.1288284212350845,
"reward_std": 1.927463386207819,
"rewards/cot_length_penalty_reward": -7.606507122516632,
"rewards/math_latex_accuracy_reward": 0.47767859511077404,
"step": 85
},
{
"clip_ratio": 0.0029736382130067796,
"epoch": 3.071111111111111,
"grad_norm": 0.46033460120838376,
"kl": 0.130645751953125,
"learning_rate": 3.1545289407131128e-06,
"loss": -0.0322,
"step": 86
},
{
"clip_ratio": 0.0043519225146155804,
"epoch": 3.1066666666666665,
"grad_norm": 0.2864629379691617,
"kl": 0.13775634765625,
"learning_rate": 2.9289321881345257e-06,
"loss": -0.0338,
"step": 87
},
{
"clip_ratio": 0.009186911847791635,
"epoch": 3.1422222222222222,
"grad_norm": 0.24547967049213823,
"kl": 0.155426025390625,
"learning_rate": 2.7103137257858867e-06,
"loss": -0.0347,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 1969.4442825317383,
"epoch": 3.1777777777777776,
"grad_norm": 0.433485726828516,
"kl": 0.1619415283203125,
"learning_rate": 2.4988893036954045e-06,
"loss": -0.0013,
"reward": -8.979866623878479,
"reward_std": 2.524892296642065,
"rewards/cot_length_penalty_reward": -9.319152384996414,
"rewards/math_latex_accuracy_reward": 0.339285729220137,
"step": 89
},
{
"epoch": 3.2133333333333334,
"grad_norm": 0.2766265099056435,
"learning_rate": 2.2948675722421086e-06,
"loss": -0.003,
"step": 90
},
{
"epoch": 3.2133333333333334,
"eval_clip_ratio": 0.0,
"eval_completion_length": 2205.780292217548,
"eval_kl": 0.18556565504807693,
"eval_loss": 0.01654699072241783,
"eval_reward": -7.743022455332371,
"eval_reward_std": 2.8156597109941335,
"eval_rewards/cot_length_penalty_reward": -8.072692573070526,
"eval_rewards/math_latex_accuracy_reward": 0.32967034670022816,
"eval_runtime": 490.0675,
"eval_samples_per_second": 0.102,
"eval_steps_per_second": 0.004,
"step": 90
},
{
"clip_ratio": 0.006224101292900741,
"epoch": 3.2488888888888887,
"grad_norm": 0.4063813985520313,
"kl": 0.197174072265625,
"learning_rate": 2.098449876243096e-06,
"loss": -0.0037,
"step": 91
},
{
"clip_ratio": 0.009523139509838074,
"epoch": 3.2844444444444445,
"grad_norm": 0.26707887580226697,
"kl": 0.196014404296875,
"learning_rate": 1.9098300562505266e-06,
"loss": -0.0047,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 2093.5982818603516,
"epoch": 3.32,
"grad_norm": 0.3581002181636425,
"kl": 0.199798583984375,
"learning_rate": 1.7291942572543806e-06,
"loss": 0.0178,
"reward": -7.027399688959122,
"reward_std": 2.7810670882463455,
"rewards/cot_length_penalty_reward": -7.435881897807121,
"rewards/math_latex_accuracy_reward": 0.40848216507583857,
"step": 93
},
{
"clip_ratio": 0.002175023495510686,
"epoch": 3.3555555555555556,
"grad_norm": 0.35335279380104717,
"kl": 0.19476318359375,
"learning_rate": 1.5567207449798517e-06,
"loss": 0.017,
"step": 94
},
{
"clip_ratio": 0.0038238488195929676,
"epoch": 3.391111111111111,
"grad_norm": 0.26816647206060557,
"kl": 0.21075439453125,
"learning_rate": 1.3925797299605649e-06,
"loss": 0.0159,
"step": 95
},
{
"clip_ratio": 0.006063876280677505,
"epoch": 3.4266666666666667,
"grad_norm": 0.3302248399941887,
"kl": 0.22137451171875,
"learning_rate": 1.2369331995613664e-06,
"loss": 0.0151,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 2630.805938720703,
"epoch": 3.462222222222222,
"grad_norm": 1.7166272647509115,
"kl": 0.281219482421875,
"learning_rate": 1.0899347581163222e-06,
"loss": 0.0839,
"reward": -7.589185383694712,
"reward_std": 3.134066376835108,
"rewards/cot_length_penalty_reward": -7.917310604825616,
"rewards/math_latex_accuracy_reward": 0.32812501839362085,
"step": 97
},
{
"clip_ratio": 0.007515597055316903,
"epoch": 3.497777777777778,
"grad_norm": 4.303565090937016,
"kl": 0.204925537109375,
"learning_rate": 9.517294753398066e-07,
"loss": 0.0869,
"step": 98
},
{
"clip_ratio": 0.0075942349576507695,
"epoch": 3.533333333333333,
"grad_norm": 2.9889664346961444,
"kl": 0.2061920166015625,
"learning_rate": 8.224537431601886e-07,
"loss": 0.0841,
"step": 99
},
{
"clip_ratio": 0.0051619461009977385,
"epoch": 3.568888888888889,
"grad_norm": 0.4431396626395719,
"kl": 0.22930908203125,
"learning_rate": 7.022351411174866e-07,
"loss": 0.0814,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2545.8371925354004,
"epoch": 3.6044444444444443,
"grad_norm": 0.4301809872674547,
"kl": 0.171051025390625,
"learning_rate": 5.911923104577455e-07,
"loss": 0.0196,
"reward": -10.28242233581841,
"reward_std": 3.0546065159142017,
"rewards/cot_length_penalty_reward": -10.666351079940796,
"rewards/math_latex_accuracy_reward": 0.3839285857975483,
"step": 101
},
{
"clip_ratio": 0.0024334693371201865,
"epoch": 3.64,
"grad_norm": 0.40563497309844976,
"kl": 0.19537353515625,
"learning_rate": 4.894348370484648e-07,
"loss": 0.0191,
"step": 102
},
{
"clip_ratio": 0.00391879488597624,
"epoch": 3.6755555555555555,
"grad_norm": 0.5720892424115457,
"kl": 0.21160888671875,
"learning_rate": 3.9706314323056936e-07,
"loss": 0.0191,
"step": 103
},
{
"clip_ratio": 0.005467013252200559,
"epoch": 3.7111111111111112,
"grad_norm": 0.5349890131537904,
"kl": 0.21337890625,
"learning_rate": 3.1416838871368925e-07,
"loss": 0.0189,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 2160.7389335632324,
"epoch": 3.7466666666666666,
"grad_norm": 13.207091106199918,
"kl": 0.7435302734375,
"learning_rate": 2.4083238061252565e-07,
"loss": 0.0564,
"reward": -7.942288625985384,
"reward_std": 2.594830472022295,
"rewards/cot_length_penalty_reward": -8.37309193611145,
"rewards/math_latex_accuracy_reward": 0.43080358672887087,
"step": 105
},
{
"clip_ratio": 0.0028747237083734944,
"epoch": 3.7822222222222224,
"grad_norm": 3.5352628792853067,
"kl": 0.472412109375,
"learning_rate": 1.7712749271311392e-07,
"loss": 0.0465,
"step": 106
},
{
"clip_ratio": 0.004760361814987846,
"epoch": 3.8177777777777777,
"grad_norm": 0.94419108473764,
"kl": 0.3760833740234375,
"learning_rate": 1.231165940486234e-07,
"loss": 0.0439,
"step": 107
},
{
"clip_ratio": 0.0061231208674144,
"epoch": 3.8533333333333335,
"grad_norm": 1.7315584230873846,
"kl": 0.3495025634765625,
"learning_rate": 7.885298685522235e-08,
"loss": 0.044,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1982.8750839233398,
"epoch": 3.888888888888889,
"grad_norm": 0.7007233776562277,
"kl": 0.2796783447265625,
"learning_rate": 4.438035396920004e-08,
"loss": 0.0207,
"reward": -9.675179054960608,
"reward_std": 2.574063938111067,
"rewards/cot_length_penalty_reward": -9.989911276847124,
"rewards/math_latex_accuracy_reward": 0.314732160884887,
"step": 109
},
{
"clip_ratio": 0.001981915433134418,
"epoch": 3.924444444444444,
"grad_norm": 0.6787732749959883,
"kl": 0.2747039794921875,
"learning_rate": 1.973271571728441e-08,
"loss": 0.0208,
"step": 110
},
{
"clip_ratio": 0.0019334297030582093,
"epoch": 3.96,
"grad_norm": 0.6415798052337234,
"kl": 0.27728271484375,
"learning_rate": 4.9343963426840006e-09,
"loss": 0.0206,
"step": 111
},
{
"clip_ratio": 0.0017770093054423342,
"epoch": 3.9955555555555557,
"grad_norm": 0.6316769640873855,
"kl": 0.30633544921875,
"learning_rate": 0.0,
"loss": 0.0207,
"step": 112
},
{
"epoch": 3.9955555555555557,
"step": 112,
"total_flos": 0.0,
"train_loss": 0.6775813137989773,
"train_runtime": 20674.7257,
"train_samples_per_second": 0.087,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 112,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
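
A minimal sketch (not part of the checkpoint file itself) of one way to read the trainer_state.json above and summarize its logged metrics. The relative path "trainer_state.json" is an assumption; point it at wherever this checkpoint is saved locally.

import json

# Load the Trainer state dumped alongside the checkpoint (path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training-step entries log "loss"; evaluation entries log "eval_loss" instead.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step={state['global_step']}, epochs={state['epoch']:.2f}")

# Summarize each eval pass (steps 30, 60, 90 in this run).
for e in eval_logs:
    print(
        f"step {e['step']:>3}: eval_loss={e['eval_loss']:.4f}, "
        f"eval_reward={e['eval_reward']:.3f}, "
        f"accuracy_reward={e['eval_rewards/math_latex_accuracy_reward']:.3f}"
    )

# Last per-step training loss recorded before the final summary entry.
final = train_logs[-1]
print(f"last logged train loss (step {final['step']}): {final['loss']:.4f}")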