rlpvr-mcqa-only-unverifiable / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998738011105502,
"eval_steps": 500,
"global_step": 2641,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 862.3935139973959,
"epoch": 0.0003785966683493185,
"grad_norm": 0.29023680090904236,
"kl": 0.0,
"learning_rate": 7.547169811320756e-08,
"loss": -0.0,
"reward": -2.4199581146240234,
"reward_std": 0.6020505428314209,
"rewards/Qwen2-0.5B-Reward": -2.4199581146240234,
"step": 1
},
{
"completion_length": 921.5138956705729,
"epoch": 0.003785966683493185,
"grad_norm": 0.5279305577278137,
"kl": 0.00010103649563259548,
"learning_rate": 7.547169811320755e-07,
"loss": 0.0,
"reward": -2.473096079296536,
"reward_std": 0.5592167631343559,
"rewards/Qwen2-0.5B-Reward": -2.473096079296536,
"step": 10
},
{
"completion_length": 910.1000081380208,
"epoch": 0.00757193336698637,
"grad_norm": 0.2346569150686264,
"kl": 0.00012467702229817707,
"learning_rate": 1.509433962264151e-06,
"loss": 0.0,
"reward": -2.422696002324422,
"reward_std": 0.5530827701091766,
"rewards/Qwen2-0.5B-Reward": -2.422696002324422,
"step": 20
},
{
"completion_length": 911.8717631022135,
"epoch": 0.011357900050479555,
"grad_norm": 0.21592262387275696,
"kl": 0.0003096898396809896,
"learning_rate": 2.2641509433962266e-06,
"loss": 0.0,
"reward": -2.41411194006602,
"reward_std": 0.5578982929388682,
"rewards/Qwen2-0.5B-Reward": -2.41411194006602,
"step": 30
},
{
"completion_length": 893.8527893066406,
"epoch": 0.01514386673397274,
"grad_norm": 0.26798614859580994,
"kl": 0.0015757242838541667,
"learning_rate": 3.018867924528302e-06,
"loss": 0.0001,
"reward": -2.297330105304718,
"reward_std": 0.528485847512881,
"rewards/Qwen2-0.5B-Reward": -2.297330105304718,
"step": 40
},
{
"completion_length": 875.3588053385416,
"epoch": 0.018929833417465926,
"grad_norm": 0.27687838673591614,
"kl": 0.0064605712890625,
"learning_rate": 3.7735849056603777e-06,
"loss": 0.0003,
"reward": -2.010285266240438,
"reward_std": 0.5279872556527455,
"rewards/Qwen2-0.5B-Reward": -2.010285266240438,
"step": 50
},
{
"completion_length": 859.5717692057292,
"epoch": 0.02271580010095911,
"grad_norm": 0.2615930736064911,
"kl": 0.018147786458333332,
"learning_rate": 4.528301886792453e-06,
"loss": 0.0007,
"reward": -1.795549988746643,
"reward_std": 0.49105457464853924,
"rewards/Qwen2-0.5B-Reward": -1.795549988746643,
"step": 60
},
{
"completion_length": 765.7564921061198,
"epoch": 0.026501766784452298,
"grad_norm": 0.28387993574142456,
"kl": 0.028316243489583334,
"learning_rate": 5.283018867924529e-06,
"loss": 0.0011,
"reward": -1.4461613575617471,
"reward_std": 0.47599050005277,
"rewards/Qwen2-0.5B-Reward": -1.4461613575617471,
"step": 70
},
{
"completion_length": 783.0041768391927,
"epoch": 0.03028773346794548,
"grad_norm": 0.25285565853118896,
"kl": 0.040238444010416666,
"learning_rate": 6.037735849056604e-06,
"loss": 0.0016,
"reward": -1.193545683224996,
"reward_std": 0.4799055278301239,
"rewards/Qwen2-0.5B-Reward": -1.193545683224996,
"step": 80
},
{
"completion_length": 836.7032450358073,
"epoch": 0.034073700151438666,
"grad_norm": 0.25005629658699036,
"kl": 0.05516764322916667,
"learning_rate": 6.792452830188679e-06,
"loss": 0.0022,
"reward": -1.0326486746470134,
"reward_std": 0.5193435788154602,
"rewards/Qwen2-0.5B-Reward": -1.0326486746470134,
"step": 90
},
{
"completion_length": 853.6324157714844,
"epoch": 0.03785966683493185,
"grad_norm": 0.3665623664855957,
"kl": 0.10475260416666667,
"learning_rate": 7.5471698113207555e-06,
"loss": 0.0042,
"reward": -0.9854023973147075,
"reward_std": 0.6189069559176763,
"rewards/Qwen2-0.5B-Reward": -0.9854023973147075,
"step": 100
},
{
"completion_length": 815.8801005045573,
"epoch": 0.04164563351842504,
"grad_norm": 0.8496055006980896,
"kl": 0.41549479166666664,
"learning_rate": 8.301886792452832e-06,
"loss": 0.0166,
"reward": -1.463372488816579,
"reward_std": 1.0357649803161622,
"rewards/Qwen2-0.5B-Reward": -1.463372488816579,
"step": 110
},
{
"completion_length": 813.9148213704427,
"epoch": 0.04543160020191822,
"grad_norm": 0.3407374918460846,
"kl": 0.40042317708333336,
"learning_rate": 9.056603773584907e-06,
"loss": 0.016,
"reward": -1.8741844495137532,
"reward_std": 1.722016990184784,
"rewards/Qwen2-0.5B-Reward": -1.8741844495137532,
"step": 120
},
{
"completion_length": 646.899545288086,
"epoch": 0.04921756688541141,
"grad_norm": 0.4906499981880188,
"kl": 0.2925618489583333,
"learning_rate": 9.811320754716981e-06,
"loss": 0.0117,
"reward": -1.3685388286908469,
"reward_std": 1.243816477060318,
"rewards/Qwen2-0.5B-Reward": -1.3685388286908469,
"step": 130
},
{
"completion_length": 547.943989054362,
"epoch": 0.053003533568904596,
"grad_norm": 1.3670618534088135,
"kl": 1.1440104166666667,
"learning_rate": 1.0566037735849058e-05,
"loss": 0.0458,
"reward": -2.530128773053487,
"reward_std": 2.079139538606008,
"rewards/Qwen2-0.5B-Reward": -2.530128773053487,
"step": 140
},
{
"completion_length": 482.1189860026042,
"epoch": 0.056789500252397776,
"grad_norm": 2.426543712615967,
"kl": 2.05546875,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.0822,
"reward": -3.6518485943476358,
"reward_std": 2.5313418904940286,
"rewards/Qwen2-0.5B-Reward": -3.6518485943476358,
"step": 150
},
{
"completion_length": 583.3393595377604,
"epoch": 0.06057546693589096,
"grad_norm": 7.087838649749756,
"kl": 1.375,
"learning_rate": 1.2075471698113209e-05,
"loss": 0.055,
"reward": -2.7873202482859294,
"reward_std": 2.297369889418284,
"rewards/Qwen2-0.5B-Reward": -2.7873202482859294,
"step": 160
},
{
"completion_length": 581.331483968099,
"epoch": 0.06436143361938415,
"grad_norm": 0.3393622636795044,
"kl": 0.7234049479166667,
"learning_rate": 1.2830188679245283e-05,
"loss": 0.029,
"reward": -1.6760946492354074,
"reward_std": 1.530751649538676,
"rewards/Qwen2-0.5B-Reward": -1.6760946492354074,
"step": 170
},
{
"completion_length": 551.2680633544921,
"epoch": 0.06814740030287733,
"grad_norm": 0.5040144920349121,
"kl": 0.5955729166666667,
"learning_rate": 1.3584905660377358e-05,
"loss": 0.0238,
"reward": -1.6978328824043274,
"reward_std": 1.5175378421942394,
"rewards/Qwen2-0.5B-Reward": -1.6978328824043274,
"step": 180
},
{
"completion_length": 498.8856536865234,
"epoch": 0.07193336698637053,
"grad_norm": 0.6744162440299988,
"kl": 1.2225260416666666,
"learning_rate": 1.4339622641509435e-05,
"loss": 0.0489,
"reward": -2.619816021124522,
"reward_std": 2.05845144589742,
"rewards/Qwen2-0.5B-Reward": -2.619816021124522,
"step": 190
},
{
"completion_length": 651.0615783691406,
"epoch": 0.0757193336698637,
"grad_norm": 0.47306591272354126,
"kl": 0.9920572916666667,
"learning_rate": 1.5094339622641511e-05,
"loss": 0.0397,
"reward": -2.183918062845866,
"reward_std": 2.0014989256858824,
"rewards/Qwen2-0.5B-Reward": -2.183918062845866,
"step": 200
},
{
"completion_length": 723.8722351074218,
"epoch": 0.07950530035335689,
"grad_norm": 0.8630687594413757,
"kl": 1.132421875,
"learning_rate": 1.5849056603773586e-05,
"loss": 0.0453,
"reward": -2.3433102289835612,
"reward_std": 2.1008309284845987,
"rewards/Qwen2-0.5B-Reward": -2.3433102289835612,
"step": 210
},
{
"completion_length": 633.7060221354167,
"epoch": 0.08329126703685008,
"grad_norm": 0.33421555161476135,
"kl": 1.11015625,
"learning_rate": 1.6603773584905664e-05,
"loss": 0.0444,
"reward": -1.7007139801979065,
"reward_std": 1.8362650871276855,
"rewards/Qwen2-0.5B-Reward": -1.7007139801979065,
"step": 220
},
{
"completion_length": 801.9037150065104,
"epoch": 0.08707723372034326,
"grad_norm": 0.2525235116481781,
"kl": 0.492578125,
"learning_rate": 1.735849056603774e-05,
"loss": 0.0197,
"reward": -0.9283350398143132,
"reward_std": 1.1985284070173898,
"rewards/Qwen2-0.5B-Reward": -0.9283350398143132,
"step": 230
},
{
"completion_length": 757.5694498697917,
"epoch": 0.09086320040383644,
"grad_norm": 0.23876462876796722,
"kl": 0.5396484375,
"learning_rate": 1.8113207547169813e-05,
"loss": 0.0216,
"reward": -1.083450937271118,
"reward_std": 1.4640587449073792,
"rewards/Qwen2-0.5B-Reward": -1.083450937271118,
"step": 240
},
{
"completion_length": 691.9120452880859,
"epoch": 0.09464916708732964,
"grad_norm": 0.5013711452484131,
"kl": 0.6822265625,
"learning_rate": 1.8867924528301888e-05,
"loss": 0.0273,
"reward": -1.4570284724235534,
"reward_std": 1.626520773768425,
"rewards/Qwen2-0.5B-Reward": -1.4570284724235534,
"step": 250
},
{
"completion_length": 573.8166727701823,
"epoch": 0.09843513377082282,
"grad_norm": 0.6115075349807739,
"kl": 1.6170572916666666,
"learning_rate": 1.9622641509433963e-05,
"loss": 0.0647,
"reward": -2.9246065855026244,
"reward_std": 2.250337036450704,
"rewards/Qwen2-0.5B-Reward": -2.9246065855026244,
"step": 260
},
{
"completion_length": 773.3074178059895,
"epoch": 0.102221100454316,
"grad_norm": 0.31634387373924255,
"kl": 0.733984375,
"learning_rate": 1.999980332108064e-05,
"loss": 0.0294,
"reward": -1.5115862051645914,
"reward_std": 1.5288376450538634,
"rewards/Qwen2-0.5B-Reward": -1.5115862051645914,
"step": 270
},
{
"completion_length": 829.3129659016927,
"epoch": 0.10600706713780919,
"grad_norm": 0.25395989418029785,
"kl": 0.4763671875,
"learning_rate": 1.9998229941302175e-05,
"loss": 0.0191,
"reward": -0.9474448690811793,
"reward_std": 1.09269377887249,
"rewards/Qwen2-0.5B-Reward": -0.9474448690811793,
"step": 280
},
{
"completion_length": 999.9018575032552,
"epoch": 0.10979303382130237,
"grad_norm": 0.3452966809272766,
"kl": 0.6498046875,
"learning_rate": 1.9995083456809467e-05,
"loss": 0.026,
"reward": -1.7776759227116903,
"reward_std": 1.8762857417265575,
"rewards/Qwen2-0.5B-Reward": -1.7776759227116903,
"step": 290
},
{
"completion_length": 867.9375081380208,
"epoch": 0.11357900050479555,
"grad_norm": 0.4680746793746948,
"kl": 0.98203125,
"learning_rate": 1.9990364417682882e-05,
"loss": 0.0393,
"reward": -2.6815950234731036,
"reward_std": 2.1339449683825173,
"rewards/Qwen2-0.5B-Reward": -2.6815950234731036,
"step": 300
},
{
"completion_length": 841.0263916015625,
"epoch": 0.11736496718828875,
"grad_norm": 0.4002317786216736,
"kl": 1.4239583333333334,
"learning_rate": 1.9984073648922753e-05,
"loss": 0.057,
"reward": -2.9415343125661213,
"reward_std": 2.4492496887842816,
"rewards/Qwen2-0.5B-Reward": -2.9415343125661213,
"step": 310
},
{
"completion_length": 845.747226969401,
"epoch": 0.12115093387178193,
"grad_norm": 0.7248504757881165,
"kl": 1.8854166666666667,
"learning_rate": 1.997621225030515e-05,
"loss": 0.0754,
"reward": -3.695864470799764,
"reward_std": 2.583825929959615,
"rewards/Qwen2-0.5B-Reward": -3.695864470799764,
"step": 320
},
{
"completion_length": 1439.778253173828,
"epoch": 0.12493690055527511,
"grad_norm": 0.602688193321228,
"kl": 1.6190104166666666,
"learning_rate": 1.9966781596189623e-05,
"loss": 0.0648,
"reward": -3.7271327575047812,
"reward_std": 1.9023333628972372,
"rewards/Qwen2-0.5B-Reward": -3.7271327575047812,
"step": 330
},
{
"completion_length": 1148.3754781087239,
"epoch": 0.1287228672387683,
"grad_norm": 0.6237585544586182,
"kl": 1.2373697916666666,
"learning_rate": 1.9955783335278924e-05,
"loss": 0.0495,
"reward": -3.0485590934753417,
"reward_std": 1.7622671604156495,
"rewards/Qwen2-0.5B-Reward": -3.0485590934753417,
"step": 340
},
{
"completion_length": 871.3398213704427,
"epoch": 0.13250883392226148,
"grad_norm": 0.7727285623550415,
"kl": 0.8674479166666667,
"learning_rate": 1.9943219390330767e-05,
"loss": 0.0347,
"reward": -2.604492497444153,
"reward_std": 1.5047667543093364,
"rewards/Qwen2-0.5B-Reward": -2.604492497444153,
"step": 350
},
{
"completion_length": 731.1259358723959,
"epoch": 0.13629480060575466,
"grad_norm": 0.2828836441040039,
"kl": 0.725,
"learning_rate": 1.9929091957821703e-05,
"loss": 0.029,
"reward": -1.8863240122795104,
"reward_std": 1.1796027421951294,
"rewards/Qwen2-0.5B-Reward": -1.8863240122795104,
"step": 360
},
{
"completion_length": 593.0722300211588,
"epoch": 0.14008076728924784,
"grad_norm": 0.34980472922325134,
"kl": 0.5994791666666667,
"learning_rate": 1.9913403507563104e-05,
"loss": 0.024,
"reward": -1.5030529995759327,
"reward_std": 0.9978658020496368,
"rewards/Qwen2-0.5B-Reward": -1.5030529995759327,
"step": 370
},
{
"completion_length": 574.3398223876953,
"epoch": 0.14386673397274105,
"grad_norm": 0.44906890392303467,
"kl": 0.98515625,
"learning_rate": 1.9896156782269405e-05,
"loss": 0.0394,
"reward": -2.0863842129707337,
"reward_std": 1.7282814304033915,
"rewards/Qwen2-0.5B-Reward": -2.0863842129707337,
"step": 380
},
{
"completion_length": 598.7032470703125,
"epoch": 0.14765270065623423,
"grad_norm": 0.42980390787124634,
"kl": 1.1180989583333334,
"learning_rate": 1.9877354797078577e-05,
"loss": 0.0448,
"reward": -1.7375385125478109,
"reward_std": 1.6146510203679403,
"rewards/Qwen2-0.5B-Reward": -1.7375385125478109,
"step": 390
},
{
"completion_length": 522.9648213704427,
"epoch": 0.1514386673397274,
"grad_norm": 0.4960784614086151,
"kl": 0.8545572916666667,
"learning_rate": 1.9857000839025043e-05,
"loss": 0.0342,
"reward": -1.5845000902811686,
"reward_std": 1.6743145366509755,
"rewards/Qwen2-0.5B-Reward": -1.5845000902811686,
"step": 400
},
{
"completion_length": 650.5805603027344,
"epoch": 0.1552246340232206,
"grad_norm": 3.5647335052490234,
"kl": 0.7625,
"learning_rate": 1.983509846646502e-05,
"loss": 0.0305,
"reward": -1.3111775855223338,
"reward_std": 1.5136757413546245,
"rewards/Qwen2-0.5B-Reward": -1.3111775855223338,
"step": 410
},
{
"completion_length": 608.793061319987,
"epoch": 0.15901060070671377,
"grad_norm": 0.7843858599662781,
"kl": 1.0794270833333333,
"learning_rate": 1.9811651508454405e-05,
"loss": 0.0432,
"reward": -1.030318695306778,
"reward_std": 1.3256585756937662,
"rewards/Qwen2-0.5B-Reward": -1.030318695306778,
"step": 420
},
{
"completion_length": 717.2574086507161,
"epoch": 0.16279656739020695,
"grad_norm": 1.8327181339263916,
"kl": 1.7510416666666666,
"learning_rate": 1.97866640640794e-05,
"loss": 0.07,
"reward": -1.3865876078605652,
"reward_std": 1.5873213092486063,
"rewards/Qwen2-0.5B-Reward": -1.3865876078605652,
"step": 430
},
{
"completion_length": 886.4629699707032,
"epoch": 0.16658253407370016,
"grad_norm": 1.820997714996338,
"kl": 2.492317708333333,
"learning_rate": 1.9760140501739885e-05,
"loss": 0.0997,
"reward": -2.041487044095993,
"reward_std": 1.9687125941117605,
"rewards/Qwen2-0.5B-Reward": -2.041487044095993,
"step": 440
},
{
"completion_length": 761.4412068684895,
"epoch": 0.17036850075719334,
"grad_norm": 2.2856929302215576,
"kl": 2.5984375,
"learning_rate": 1.9732085458385706e-05,
"loss": 0.104,
"reward": -1.7194086611270905,
"reward_std": 1.6787285923957824,
"rewards/Qwen2-0.5B-Reward": -1.7194086611270905,
"step": 450
},
{
"completion_length": 800.9458414713541,
"epoch": 0.17415446744068652,
"grad_norm": 1.5514496564865112,
"kl": 3.582291666666667,
"learning_rate": 1.9702503838706032e-05,
"loss": 0.1433,
"reward": -1.918267943461736,
"reward_std": 1.798914521932602,
"rewards/Qwen2-0.5B-Reward": -1.918267943461736,
"step": 460
},
{
"completion_length": 766.8731547037761,
"epoch": 0.1779404341241797,
"grad_norm": 2.541083812713623,
"kl": 2.43359375,
"learning_rate": 1.9671400814271904e-05,
"loss": 0.0973,
"reward": -1.5191373944282531,
"reward_std": 1.7267815709114074,
"rewards/Qwen2-0.5B-Reward": -1.5191373944282531,
"step": 470
},
{
"completion_length": 730.6893575032552,
"epoch": 0.18172640080767288,
"grad_norm": 2.0177841186523438,
"kl": 1.9341145833333333,
"learning_rate": 1.9638781822632117e-05,
"loss": 0.0774,
"reward": -0.9346473336219787,
"reward_std": 1.284997742374738,
"rewards/Qwen2-0.5B-Reward": -0.9346473336219787,
"step": 480
},
{
"completion_length": 714.4213033040364,
"epoch": 0.1855123674911661,
"grad_norm": 1.7401411533355713,
"kl": 2.3013020833333333,
"learning_rate": 1.9604652566362604e-05,
"loss": 0.092,
"reward": -0.9350511769453684,
"reward_std": 1.2963847279548646,
"rewards/Qwen2-0.5B-Reward": -0.9350511769453684,
"step": 490
},
{
"completion_length": 718.9740814208984,
"epoch": 0.18929833417465927,
"grad_norm": 5.835248947143555,
"kl": 2.794661458333333,
"learning_rate": 1.95690190120695e-05,
"loss": 0.1118,
"reward": -1.0198218444983165,
"reward_std": 1.4708388864994049,
"rewards/Qwen2-0.5B-Reward": -1.0198218444983165,
"step": 500
},
{
"completion_length": 626.8421396891276,
"epoch": 0.19308430085815245,
"grad_norm": 2.837407112121582,
"kl": 1.6803385416666667,
"learning_rate": 1.9531887389346016e-05,
"loss": 0.0672,
"reward": -0.6805184543132782,
"reward_std": 1.061421944697698,
"rewards/Qwen2-0.5B-Reward": -0.6805184543132782,
"step": 510
},
{
"completion_length": 709.4504659016927,
"epoch": 0.19687026754164563,
"grad_norm": 1.3921163082122803,
"kl": 2.5,
"learning_rate": 1.9493264189683393e-05,
"loss": 0.1,
"reward": -1.0162009666363399,
"reward_std": 1.3732348203659057,
"rewards/Qwen2-0.5B-Reward": -1.0162009666363399,
"step": 520
},
{
"completion_length": 734.6662150065105,
"epoch": 0.2006562342251388,
"grad_norm": 0.9199353456497192,
"kl": 2.3385416666666665,
"learning_rate": 1.9453156165336e-05,
"loss": 0.0936,
"reward": -0.9758850524822871,
"reward_std": 1.4469304541746775,
"rewards/Qwen2-0.5B-Reward": -0.9758850524822871,
"step": 530
},
{
"completion_length": 685.4245402018229,
"epoch": 0.204442200908632,
"grad_norm": 1.9519050121307373,
"kl": 1.9833333333333334,
"learning_rate": 1.94115703281409e-05,
"loss": 0.0793,
"reward": -0.7073126316070557,
"reward_std": 1.1466901183128357,
"rewards/Qwen2-0.5B-Reward": -0.7073126316070557,
"step": 540
},
{
"completion_length": 755.8652852376302,
"epoch": 0.2082281675921252,
"grad_norm": 1.136602520942688,
"kl": 2.7690104166666667,
"learning_rate": 1.9368513948291997e-05,
"loss": 0.1108,
"reward": -1.0485609819491704,
"reward_std": 1.5164429823557535,
"rewards/Qwen2-0.5B-Reward": -1.0485609819491704,
"step": 550
},
{
"completion_length": 691.9601928710938,
"epoch": 0.21201413427561838,
"grad_norm": 1.1344228982925415,
"kl": 1.9572916666666667,
"learning_rate": 1.932399455306906e-05,
"loss": 0.0783,
"reward": -0.7905913976331552,
"reward_std": 1.2394062995910644,
"rewards/Qwen2-0.5B-Reward": -0.7905913976331552,
"step": 560
},
{
"completion_length": 774.0166727701823,
"epoch": 0.21580010095911156,
"grad_norm": 4.68572998046875,
"kl": 2.8236979166666667,
"learning_rate": 1.9278019925521744e-05,
"loss": 0.113,
"reward": -0.9427557557821273,
"reward_std": 1.4873551627000172,
"rewards/Qwen2-0.5B-Reward": -0.9427557557821273,
"step": 570
},
{
"completion_length": 782.1125061035157,
"epoch": 0.21958606764260474,
"grad_norm": 3.2951087951660156,
"kl": 2.81875,
"learning_rate": 1.9230598103108958e-05,
"loss": 0.1127,
"reward": -0.9920766482750575,
"reward_std": 1.5032208581765494,
"rewards/Qwen2-0.5B-Reward": -0.9920766482750575,
"step": 580
},
{
"completion_length": 764.4046447753906,
"epoch": 0.22337203432609792,
"grad_norm": 0.7878244519233704,
"kl": 2.4518229166666665,
"learning_rate": 1.9181737376293693e-05,
"loss": 0.0981,
"reward": -0.8713747123877208,
"reward_std": 1.4777807037035624,
"rewards/Qwen2-0.5B-Reward": -0.8713747123877208,
"step": 590
},
{
"completion_length": 787.2912129720052,
"epoch": 0.2271580010095911,
"grad_norm": 1.002245306968689,
"kl": 2.3055989583333334,
"learning_rate": 1.9131446287093683e-05,
"loss": 0.0922,
"reward": -0.914855935672919,
"reward_std": 1.4097402195135753,
"rewards/Qwen2-0.5B-Reward": -0.914855935672919,
"step": 600
},
{
"completion_length": 853.8175984700521,
"epoch": 0.2309439676930843,
"grad_norm": 1.2771328687667847,
"kl": 3.388802083333333,
"learning_rate": 1.9079733627588042e-05,
"loss": 0.1356,
"reward": -1.4452542603015899,
"reward_std": 1.7754655241966248,
"rewards/Qwen2-0.5B-Reward": -1.4452542603015899,
"step": 610
},
{
"completion_length": 742.8106587727865,
"epoch": 0.2347299343765775,
"grad_norm": 1.4351956844329834,
"kl": 2.539322916666667,
"learning_rate": 1.9026608438380195e-05,
"loss": 0.1016,
"reward": -1.0814687182505927,
"reward_std": 1.450120480855306,
"rewards/Qwen2-0.5B-Reward": -1.0814687182505927,
"step": 620
},
{
"completion_length": 683.3101959228516,
"epoch": 0.23851590106007067,
"grad_norm": 1.352857232093811,
"kl": 1.84296875,
"learning_rate": 1.897208000701737e-05,
"loss": 0.0737,
"reward": -0.6815965756773948,
"reward_std": 1.0737029949824015,
"rewards/Qwen2-0.5B-Reward": -0.6815965756773948,
"step": 630
},
{
"completion_length": 792.8847290039063,
"epoch": 0.24230186774356385,
"grad_norm": 0.9555492997169495,
"kl": 2.507552083333333,
"learning_rate": 1.8916157866366928e-05,
"loss": 0.1003,
"reward": -0.9711129138867061,
"reward_std": 1.5443729062875111,
"rewards/Qwen2-0.5B-Reward": -0.9711129138867061,
"step": 640
},
{
"completion_length": 771.8388916015625,
"epoch": 0.24608783442705703,
"grad_norm": 1.100447177886963,
"kl": 2.43984375,
"learning_rate": 1.8858851792949764e-05,
"loss": 0.0976,
"reward": -1.0383977095286052,
"reward_std": 1.4934775571028391,
"rewards/Qwen2-0.5B-Reward": -1.0383977095286052,
"step": 650
},
{
"completion_length": 719.6476888020833,
"epoch": 0.24987380111055021,
"grad_norm": 1.0007727146148682,
"kl": 1.8555989583333334,
"learning_rate": 1.880017180523116e-05,
"loss": 0.0742,
"reward": -0.844773971537749,
"reward_std": 1.3539518495400746,
"rewards/Qwen2-0.5B-Reward": -0.844773971537749,
"step": 660
},
{
"completion_length": 751.7523234049479,
"epoch": 0.2536597677940434,
"grad_norm": 0.8904104232788086,
"kl": 1.8032552083333333,
"learning_rate": 1.8740128161869308e-05,
"loss": 0.0721,
"reward": -0.6786784966786702,
"reward_std": 1.2231530169645946,
"rewards/Qwen2-0.5B-Reward": -0.6786784966786702,
"step": 670
},
{
"completion_length": 732.9828735351563,
"epoch": 0.2574457344775366,
"grad_norm": 0.9944930672645569,
"kl": 2.1743489583333333,
"learning_rate": 1.8678731359921856e-05,
"loss": 0.087,
"reward": -0.6016703399519125,
"reward_std": 1.2204503317674,
"rewards/Qwen2-0.5B-Reward": -0.6016703399519125,
"step": 680
},
{
"completion_length": 769.8699157714843,
"epoch": 0.2612317011610298,
"grad_norm": 1.7161378860473633,
"kl": 2.68125,
"learning_rate": 1.8615992133010777e-05,
"loss": 0.1073,
"reward": -0.9773722817500432,
"reward_std": 1.5413507958253225,
"rewards/Qwen2-0.5B-Reward": -0.9773722817500432,
"step": 690
},
{
"completion_length": 723.2902893066406,
"epoch": 0.26501766784452296,
"grad_norm": 1.2049915790557861,
"kl": 2.252083333333333,
"learning_rate": 1.855192144944586e-05,
"loss": 0.0901,
"reward": -0.6862340954442819,
"reward_std": 1.3176872313022614,
"rewards/Qwen2-0.5B-Reward": -0.6862340954442819,
"step": 700
},
{
"completion_length": 730.842598470052,
"epoch": 0.26880363452801614,
"grad_norm": 1.1353825330734253,
"kl": 2.6614583333333335,
"learning_rate": 1.8486530510307222e-05,
"loss": 0.1064,
"reward": -0.8512504202624162,
"reward_std": 1.4152730743090312,
"rewards/Qwen2-0.5B-Reward": -0.8512504202624162,
"step": 710
},
{
"completion_length": 784.4833374023438,
"epoch": 0.2725896012115093,
"grad_norm": 1.1364926099777222,
"kl": 2.8877604166666666,
"learning_rate": 1.8419830747487045e-05,
"loss": 0.1155,
"reward": -1.4028477271397908,
"reward_std": 1.6338281035423279,
"rewards/Qwen2-0.5B-Reward": -1.4028477271397908,
"step": 720
},
{
"completion_length": 786.0379679361979,
"epoch": 0.2763755678950025,
"grad_norm": 1.4071515798568726,
"kl": 3.0088541666666666,
"learning_rate": 1.8351833821691053e-05,
"loss": 0.1204,
"reward": -1.2512944350639978,
"reward_std": 1.6677428344885508,
"rewards/Qwen2-0.5B-Reward": -1.2512944350639978,
"step": 730
},
{
"completion_length": 807.7277893066406,
"epoch": 0.2801615345784957,
"grad_norm": 1.4424173831939697,
"kl": 3.021354166666667,
"learning_rate": 1.8282551620399917e-05,
"loss": 0.1208,
"reward": -1.225243662794431,
"reward_std": 1.7895207107067108,
"rewards/Qwen2-0.5B-Reward": -1.225243662794431,
"step": 740
},
{
"completion_length": 728.170839436849,
"epoch": 0.2839475012619889,
"grad_norm": 0.6950631141662598,
"kl": 2.519661458333333,
"learning_rate": 1.821199625579105e-05,
"loss": 0.1008,
"reward": -0.8639134142082184,
"reward_std": 1.4788370271523794,
"rewards/Qwen2-0.5B-Reward": -0.8639134142082184,
"step": 750
},
{
"completion_length": 679.6050984700521,
"epoch": 0.2877334679454821,
"grad_norm": 1.6717815399169922,
"kl": 1.7360677083333333,
"learning_rate": 1.8140180062621117e-05,
"loss": 0.0695,
"reward": -0.46732902062746384,
"reward_std": 0.9378261427084605,
"rewards/Qwen2-0.5B-Reward": -0.46732902062746384,
"step": 760
},
{
"completion_length": 783.3986165364583,
"epoch": 0.2915194346289753,
"grad_norm": 1.3388867378234863,
"kl": 2.79609375,
"learning_rate": 1.8067115596069607e-05,
"loss": 0.1118,
"reward": -0.9435359309117,
"reward_std": 1.6089221199353536,
"rewards/Qwen2-0.5B-Reward": -0.9435359309117,
"step": 770
},
{
"completion_length": 713.3592651367187,
"epoch": 0.29530540131246846,
"grad_norm": 1.2017817497253418,
"kl": 2.4661458333333335,
"learning_rate": 1.79928156295439e-05,
"loss": 0.0986,
"reward": -0.7846424505114555,
"reward_std": 1.4175224483013154,
"rewards/Qwen2-0.5B-Reward": -0.7846424505114555,
"step": 780
},
{
"completion_length": 813.8467631022136,
"epoch": 0.29909136799596164,
"grad_norm": 2.2606418132781982,
"kl": 3.955208333333333,
"learning_rate": 1.7917293152446184e-05,
"loss": 0.1583,
"reward": -1.4304717580477397,
"reward_std": 2.023730218410492,
"rewards/Qwen2-0.5B-Reward": -1.4304717580477397,
"step": 790
},
{
"completion_length": 701.6078796386719,
"epoch": 0.3028773346794548,
"grad_norm": 1.5273058414459229,
"kl": 2.3984375,
"learning_rate": 1.784056136790257e-05,
"loss": 0.096,
"reward": -0.7075912684202195,
"reward_std": 1.3393534004688263,
"rewards/Qwen2-0.5B-Reward": -0.7075912684202195,
"step": 800
},
{
"completion_length": 709.4273193359375,
"epoch": 0.306663301362948,
"grad_norm": 1.1304354667663574,
"kl": 2.4188802083333334,
"learning_rate": 1.7762633690454897e-05,
"loss": 0.0968,
"reward": -0.6373326261838277,
"reward_std": 1.289098753531774,
"rewards/Qwen2-0.5B-Reward": -0.6373326261838277,
"step": 810
},
{
"completion_length": 757.3930643717448,
"epoch": 0.3104492680464412,
"grad_norm": 1.3200254440307617,
"kl": 2.48046875,
"learning_rate": 1.7683523743715538e-05,
"loss": 0.0993,
"reward": -0.8247589614242316,
"reward_std": 1.4155633012453714,
"rewards/Qwen2-0.5B-Reward": -0.8247589614242316,
"step": 820
},
{
"completion_length": 697.1213033040365,
"epoch": 0.31423523472993437,
"grad_norm": 0.8467837572097778,
"kl": 2.003515625,
"learning_rate": 1.760324535798567e-05,
"loss": 0.0802,
"reward": -0.4532388661056757,
"reward_std": 1.0981567233800889,
"rewards/Qwen2-0.5B-Reward": -0.4532388661056757,
"step": 830
},
{
"completion_length": 780.1504659016927,
"epoch": 0.31802120141342755,
"grad_norm": 596676.75,
"kl": 3364.5799479166667,
"learning_rate": 1.752181256783741e-05,
"loss": 134.4873,
"reward": -0.9652832999825478,
"reward_std": 1.6040133237838745,
"rewards/Qwen2-0.5B-Reward": -0.9652832999825478,
"step": 840
},
{
"completion_length": 695.2185282389323,
"epoch": 0.3218071680969207,
"grad_norm": 1.3249794244766235,
"kl": 2.296744791666667,
"learning_rate": 1.7439239609660238e-05,
"loss": 0.0919,
"reward": -0.49953351405759655,
"reward_std": 1.100526017944018,
"rewards/Qwen2-0.5B-Reward": -0.49953351405759655,
"step": 850
},
{
"completion_length": 705.1893636067708,
"epoch": 0.3255931347804139,
"grad_norm": 2.2733891010284424,
"kl": 2.476302083333333,
"learning_rate": 1.735554091917214e-05,
"loss": 0.0991,
"reward": -0.7803226565321286,
"reward_std": 1.427873319387436,
"rewards/Qwen2-0.5B-Reward": -0.7803226565321286,
"step": 860
},
{
"completion_length": 724.2041748046875,
"epoch": 0.32937910146390714,
"grad_norm": 1.2122727632522583,
"kl": 2.6158854166666665,
"learning_rate": 1.7270731128895896e-05,
"loss": 0.1046,
"reward": -0.9140092690785726,
"reward_std": 1.5725321372350056,
"rewards/Qwen2-0.5B-Reward": -0.9140092690785726,
"step": 870
},
{
"completion_length": 736.9453796386719,
"epoch": 0.3331650681474003,
"grad_norm": 0.9501739740371704,
"kl": 2.3080729166666667,
"learning_rate": 1.7184825065600964e-05,
"loss": 0.0923,
"reward": -0.7457656829307476,
"reward_std": 1.343357914686203,
"rewards/Qwen2-0.5B-Reward": -0.7457656829307476,
"step": 880
},
{
"completion_length": 785.8842651367188,
"epoch": 0.3369510348308935,
"grad_norm": 0.9159669280052185,
"kl": 2.5815104166666667,
"learning_rate": 1.709783774771141e-05,
"loss": 0.1033,
"reward": -0.7225840290387472,
"reward_std": 1.4536415020624796,
"rewards/Qwen2-0.5B-Reward": -0.7225840290387472,
"step": 890
},
{
"completion_length": 824.7926045735677,
"epoch": 0.3407370015143867,
"grad_norm": 3.830165147781372,
"kl": 2.72265625,
"learning_rate": 1.7009784382680345e-05,
"loss": 0.1089,
"reward": -0.9060644646485646,
"reward_std": 1.5053735852241517,
"rewards/Qwen2-0.5B-Reward": -0.9060644646485646,
"step": 900
},
{
"completion_length": 788.6736124674479,
"epoch": 0.34452296819787986,
"grad_norm": 1.977720022201538,
"kl": 2.8015625,
"learning_rate": 1.692068036433128e-05,
"loss": 0.1121,
"reward": -0.7987352999548117,
"reward_std": 1.52867697874705,
"rewards/Qwen2-0.5B-Reward": -0.7987352999548117,
"step": 910
},
{
"completion_length": 740.3379699707032,
"epoch": 0.34830893488137304,
"grad_norm": 170.5952911376953,
"kl": 2.7221354166666667,
"learning_rate": 1.6830541270166928e-05,
"loss": 0.1088,
"reward": -0.9519633074601491,
"reward_std": 1.5265244921048482,
"rewards/Qwen2-0.5B-Reward": -0.9519633074601491,
"step": 920
},
{
"completion_length": 715.4027811686198,
"epoch": 0.3520949015648662,
"grad_norm": 1.736777663230896,
"kl": 2.2143229166666667,
"learning_rate": 1.673938285864588e-05,
"loss": 0.0886,
"reward": -0.5707902121047179,
"reward_std": 1.127177753051122,
"rewards/Qwen2-0.5B-Reward": -0.5707902121047179,
"step": 930
},
{
"completion_length": 812.7449137369791,
"epoch": 0.3558808682483594,
"grad_norm": 2.1414971351623535,
"kl": 2.667708333333333,
"learning_rate": 1.664722106642767e-05,
"loss": 0.1066,
"reward": -0.9589705864588419,
"reward_std": 1.527525293827057,
"rewards/Qwen2-0.5B-Reward": -0.9589705864588419,
"step": 940
},
{
"completion_length": 769.1416849772136,
"epoch": 0.3596668349318526,
"grad_norm": 4.496264457702637,
"kl": 2.38359375,
"learning_rate": 1.6554072005586638e-05,
"loss": 0.0953,
"reward": -0.5887288892020782,
"reward_std": 1.23007483681043,
"rewards/Qwen2-0.5B-Reward": -0.5887288892020782,
"step": 950
},
{
"completion_length": 766.1638936360677,
"epoch": 0.36345280161534577,
"grad_norm": 1.2970997095108032,
"kl": 2.5834635416666667,
"learning_rate": 1.6459951960795185e-05,
"loss": 0.1033,
"reward": -0.7721572608997425,
"reward_std": 1.4835912009080252,
"rewards/Qwen2-0.5B-Reward": -0.7721572608997425,
"step": 960
},
{
"completion_length": 751.2708374023438,
"epoch": 0.36723876829883895,
"grad_norm": 2.702152967453003,
"kl": 2.40390625,
"learning_rate": 1.6364877386476804e-05,
"loss": 0.0961,
"reward": -0.7570990284283956,
"reward_std": 1.4351972460746765,
"rewards/Qwen2-0.5B-Reward": -0.7570990284283956,
"step": 970
},
{
"completion_length": 730.1676005045573,
"epoch": 0.3710247349823322,
"grad_norm": 1.0240263938903809,
"kl": 2.5716145833333335,
"learning_rate": 1.6268864903929466e-05,
"loss": 0.1029,
"reward": -0.6520452598730723,
"reward_std": 1.3328065713246664,
"rewards/Qwen2-0.5B-Reward": -0.6520452598730723,
"step": 980
},
{
"completion_length": 738.0222351074219,
"epoch": 0.37481070166582536,
"grad_norm": 0.9893134832382202,
"kl": 2.990104166666667,
"learning_rate": 1.617193129841982e-05,
"loss": 0.1196,
"reward": -0.973382901151975,
"reward_std": 1.5284679671128591,
"rewards/Qwen2-0.5B-Reward": -0.973382901151975,
"step": 990
},
{
"completion_length": 767.7051005045573,
"epoch": 0.37859666834931854,
"grad_norm": 1.4028962850570679,
"kl": 3.0208333333333335,
"learning_rate": 1.6074093516248726e-05,
"loss": 0.1208,
"reward": -0.8820533196131388,
"reward_std": 1.5515558183193208,
"rewards/Qwen2-0.5B-Reward": -0.8820533196131388,
"step": 1000
},
{
"completion_length": 729.3245463053386,
"epoch": 0.3823826350328117,
"grad_norm": 1.1494252681732178,
"kl": 2.1536458333333335,
"learning_rate": 1.5975368661788636e-05,
"loss": 0.0861,
"reward": -0.617452886607498,
"reward_std": 1.2075418949127197,
"rewards/Qwen2-0.5B-Reward": -0.617452886607498,
"step": 1010
},
{
"completion_length": 711.6662150065105,
"epoch": 0.3861686017163049,
"grad_norm": 0.9261192083358765,
"kl": 2.349739583333333,
"learning_rate": 1.587577399449336e-05,
"loss": 0.094,
"reward": -0.6707314955691497,
"reward_std": 1.2855535586675009,
"rewards/Qwen2-0.5B-Reward": -0.6707314955691497,
"step": 1020
},
{
"completion_length": 751.3074096679687,
"epoch": 0.3899545683997981,
"grad_norm": 2.042595148086548,
"kl": 2.3372395833333335,
"learning_rate": 1.5775326925880675e-05,
"loss": 0.0935,
"reward": -0.6637267053127289,
"reward_std": 1.3381904661655426,
"rewards/Qwen2-0.5B-Reward": -0.6637267053127289,
"step": 1030
},
{
"completion_length": 776.1870402018229,
"epoch": 0.39374053508329127,
"grad_norm": 1.2383322715759277,
"kl": 5.3609375,
"learning_rate": 1.5674045016488397e-05,
"loss": 0.2142,
"reward": -0.6239150881767273,
"reward_std": 1.3248741805553437,
"rewards/Qwen2-0.5B-Reward": -0.6239150881767273,
"step": 1040
},
{
"completion_length": 718.260194905599,
"epoch": 0.39752650176678445,
"grad_norm": 1.5840164422988892,
"kl": 2.3580729166666665,
"learning_rate": 1.5571945972804376e-05,
"loss": 0.0943,
"reward": -0.5199564640720685,
"reward_std": 1.2036932865778605,
"rewards/Qwen2-0.5B-Reward": -0.5199564640720685,
"step": 1050
},
{
"completion_length": 809.6148213704427,
"epoch": 0.4013124684502776,
"grad_norm": 1.5066214799880981,
"kl": 3.0403645833333335,
"learning_rate": 1.546904764417098e-05,
"loss": 0.1216,
"reward": -0.9776304622491201,
"reward_std": 1.6650471250216166,
"rewards/Qwen2-0.5B-Reward": -0.9776304622491201,
"step": 1060
},
{
"completion_length": 766.8726867675781,
"epoch": 0.4050984351337708,
"grad_norm": 1.4285918474197388,
"kl": 2.3622395833333334,
"learning_rate": 1.5365368019664618e-05,
"loss": 0.0945,
"reward": -0.650248110294342,
"reward_std": 1.3134302516778311,
"rewards/Qwen2-0.5B-Reward": -0.650248110294342,
"step": 1070
},
{
"completion_length": 778.1921325683594,
"epoch": 0.408884401817264,
"grad_norm": 1.9540224075317383,
"kl": 2.269270833333333,
"learning_rate": 1.5260925224950785e-05,
"loss": 0.0908,
"reward": -0.5108215274910132,
"reward_std": 1.1806359807650249,
"rewards/Qwen2-0.5B-Reward": -0.5108215274910132,
"step": 1080
},
{
"completion_length": 806.8787089029948,
"epoch": 0.41267036850075717,
"grad_norm": 0.9543392062187195,
"kl": 2.60859375,
"learning_rate": 1.5155737519115308e-05,
"loss": 0.1043,
"reward": -0.8536549975474675,
"reward_std": 1.4883501867453257,
"rewards/Qwen2-0.5B-Reward": -0.8536549975474675,
"step": 1090
},
{
"completion_length": 790.8611124674479,
"epoch": 0.4164563351842504,
"grad_norm": 1.6240158081054688,
"kl": 2.213671875,
"learning_rate": 1.5049823291472195e-05,
"loss": 0.0885,
"reward": -0.5210499677807092,
"reward_std": 1.2201600551605225,
"rewards/Qwen2-0.5B-Reward": -0.5210499677807092,
"step": 1100
},
{
"completion_length": 844.4666748046875,
"epoch": 0.4202423018677436,
"grad_norm": 0.7703062891960144,
"kl": 3.1419270833333335,
"learning_rate": 1.494320105834876e-05,
"loss": 0.1257,
"reward": -1.1577677488327027,
"reward_std": 1.7909785747528075,
"rewards/Qwen2-0.5B-Reward": -1.1577677488327027,
"step": 1110
},
{
"completion_length": 873.2398213704427,
"epoch": 0.42402826855123676,
"grad_norm": 1.8059611320495605,
"kl": 3.25859375,
"learning_rate": 1.4835889459848517e-05,
"loss": 0.1304,
"reward": -0.9918207342425982,
"reward_std": 1.6435052702824275,
"rewards/Qwen2-0.5B-Reward": -0.9918207342425982,
"step": 1120
},
{
"completion_length": 883.4926025390625,
"epoch": 0.42781423523472994,
"grad_norm": 1.4837961196899414,
"kl": 2.7075520833333333,
"learning_rate": 1.472790725659245e-05,
"loss": 0.1083,
"reward": -0.7034151526788871,
"reward_std": 1.3653341392676035,
"rewards/Qwen2-0.5B-Reward": -0.7034151526788871,
"step": 1130
},
{
"completion_length": 779.6824117024739,
"epoch": 0.4316002019182231,
"grad_norm": 1.1727573871612549,
"kl": 2.1869791666666667,
"learning_rate": 1.4619273326439229e-05,
"loss": 0.0875,
"reward": -0.6506599500775337,
"reward_std": 1.3229804019133249,
"rewards/Qwen2-0.5B-Reward": -0.6506599500775337,
"step": 1140
},
{
"completion_length": 829.1185241699219,
"epoch": 0.4353861686017163,
"grad_norm": 0.974542498588562,
"kl": 2.659375,
"learning_rate": 1.4510006661184867e-05,
"loss": 0.1064,
"reward": -0.7578525463740031,
"reward_std": 1.531895116964976,
"rewards/Qwen2-0.5B-Reward": -0.7578525463740031,
"step": 1150
},
{
"completion_length": 796.5884318033854,
"epoch": 0.4391721352852095,
"grad_norm": 1.2544572353363037,
"kl": 2.2998697916666666,
"learning_rate": 1.440012636324255e-05,
"loss": 0.092,
"reward": -0.6453255646862089,
"reward_std": 1.2682056347529094,
"rewards/Qwen2-0.5B-Reward": -0.6453255646862089,
"step": 1160
},
{
"completion_length": 656.2717681884766,
"epoch": 0.44295810196870267,
"grad_norm": 1.7041164636611938,
"kl": 1.9328125,
"learning_rate": 1.4289651642303055e-05,
"loss": 0.0773,
"reward": -0.3406788529828191,
"reward_std": 1.0103827198346456,
"rewards/Qwen2-0.5B-Reward": -0.3406788529828191,
"step": 1170
},
{
"completion_length": 733.9171315511068,
"epoch": 0.44674406865219585,
"grad_norm": 0.846507728099823,
"kl": 2.328125,
"learning_rate": 1.4178601811976435e-05,
"loss": 0.0931,
"reward": -0.4902394848565261,
"reward_std": 1.2676184395949046,
"rewards/Qwen2-0.5B-Reward": -0.4902394848565261,
"step": 1180
},
{
"completion_length": 768.9398234049479,
"epoch": 0.450530035335689,
"grad_norm": 0.7115055322647095,
"kl": 2.668229166666667,
"learning_rate": 1.4066996286415562e-05,
"loss": 0.1068,
"reward": -0.7519384076197942,
"reward_std": 1.4289092858632406,
"rewards/Qwen2-0.5B-Reward": -0.7519384076197942,
"step": 1190
},
{
"completion_length": 825.0884297688802,
"epoch": 0.4543160020191822,
"grad_norm": 1.200706958770752,
"kl": 3.144791666666667,
"learning_rate": 1.3954854576922052e-05,
"loss": 0.1258,
"reward": -1.0908042828241984,
"reward_std": 1.6707689007123312,
"rewards/Qwen2-0.5B-Reward": -1.0908042828241984,
"step": 1200
},
{
"completion_length": 705.2310302734375,
"epoch": 0.45810196870267544,
"grad_norm": 1.3045536279678345,
"kl": 2.23046875,
"learning_rate": 1.3842196288535226e-05,
"loss": 0.0893,
"reward": -0.5541289503375689,
"reward_std": 1.264378293355306,
"rewards/Qwen2-0.5B-Reward": -0.5541289503375689,
"step": 1210
},
{
"completion_length": 662.7287109375,
"epoch": 0.4618879353861686,
"grad_norm": 1.1240729093551636,
"kl": 1.7548177083333334,
"learning_rate": 1.3729041116604697e-05,
"loss": 0.0702,
"reward": -0.33847450762987136,
"reward_std": 1.030816239118576,
"rewards/Qwen2-0.5B-Reward": -0.33847450762987136,
"step": 1220
},
{
"completion_length": 723.0296376546224,
"epoch": 0.4656739020696618,
"grad_norm": 2.3360471725463867,
"kl": 2.3111979166666665,
"learning_rate": 1.3615408843347141e-05,
"loss": 0.0924,
"reward": -0.5807175462444624,
"reward_std": 1.3384559114774068,
"rewards/Qwen2-0.5B-Reward": -0.5807175462444624,
"step": 1230
},
{
"completion_length": 751.2430704752604,
"epoch": 0.469459868753155,
"grad_norm": 2.823309898376465,
"kl": 2.7513020833333335,
"learning_rate": 1.3501319334387902e-05,
"loss": 0.1101,
"reward": -0.8531121673683325,
"reward_std": 1.5220951795578004,
"rewards/Qwen2-0.5B-Reward": -0.8531121673683325,
"step": 1240
},
{
"completion_length": 790.4842651367187,
"epoch": 0.47324583543664817,
"grad_norm": 1.8123273849487305,
"kl": 3.0010416666666666,
"learning_rate": 1.3386792535287997e-05,
"loss": 0.1201,
"reward": -0.9698835199077924,
"reward_std": 1.6139462788899739,
"rewards/Qwen2-0.5B-Reward": -0.9698835199077924,
"step": 1250
},
{
"completion_length": 734.1004720052083,
"epoch": 0.47703180212014135,
"grad_norm": 0.6924867033958435,
"kl": 2.5669270833333333,
"learning_rate": 1.3271848468057176e-05,
"loss": 0.1027,
"reward": -0.6089021896322568,
"reward_std": 1.2572330633799236,
"rewards/Qwen2-0.5B-Reward": -0.6089021896322568,
"step": 1260
},
{
"completion_length": 740.8490763346355,
"epoch": 0.4808177688036345,
"grad_norm": 1.0355186462402344,
"kl": 2.7315104166666666,
"learning_rate": 1.3156507227653582e-05,
"loss": 0.1093,
"reward": -0.5665054028232892,
"reward_std": 1.3232530683279038,
"rewards/Qwen2-0.5B-Reward": -0.5665054028232892,
"step": 1270
},
{
"completion_length": 721.3925944010417,
"epoch": 0.4846037354871277,
"grad_norm": 1.0751088857650757,
"kl": 2.77890625,
"learning_rate": 1.3040788978470678e-05,
"loss": 0.1111,
"reward": -0.617917682370171,
"reward_std": 1.3952182014783223,
"rewards/Qwen2-0.5B-Reward": -0.617917682370171,
"step": 1280
},
{
"completion_length": 743.2884348551432,
"epoch": 0.4883897021706209,
"grad_norm": 1.7289220094680786,
"kl": 2.8721354166666666,
"learning_rate": 1.2924713950812033e-05,
"loss": 0.1148,
"reward": -0.6107141558701793,
"reward_std": 1.3133805135885874,
"rewards/Qwen2-0.5B-Reward": -0.6107141558701793,
"step": 1290
},
{
"completion_length": 744.837967936198,
"epoch": 0.49217566885411407,
"grad_norm": 0.9980621337890625,
"kl": 2.6927083333333335,
"learning_rate": 1.280830243735459e-05,
"loss": 0.1077,
"reward": -0.6816005217532317,
"reward_std": 1.3647177835305533,
"rewards/Qwen2-0.5B-Reward": -0.6816005217532317,
"step": 1300
},
{
"completion_length": 765.5287109375,
"epoch": 0.49596163553760725,
"grad_norm": 1.5100042819976807,
"kl": 3.23359375,
"learning_rate": 1.2691574789601006e-05,
"loss": 0.1293,
"reward": -0.7456285426393151,
"reward_std": 1.504830890893936,
"rewards/Qwen2-0.5B-Reward": -0.7456285426393151,
"step": 1310
},
{
"completion_length": 776.5162089029948,
"epoch": 0.49974760222110043,
"grad_norm": 3.0420119762420654,
"kl": 2.664322916666667,
"learning_rate": 1.2574551414321749e-05,
"loss": 0.1066,
"reward": -0.6133380237966776,
"reward_std": 1.4030099928379058,
"rewards/Qwen2-0.5B-Reward": -0.6133380237966776,
"step": 1320
},
{
"completion_length": 756.2375101725261,
"epoch": 0.5035335689045937,
"grad_norm": 1.2776826620101929,
"kl": 2.5111979166666667,
"learning_rate": 1.2457252769987485e-05,
"loss": 0.1005,
"reward": -0.4735676831565797,
"reward_std": 1.2207833151022593,
"rewards/Qwen2-0.5B-Reward": -0.4735676831565797,
"step": 1330
},
{
"completion_length": 780.6055643717448,
"epoch": 0.5073195355880868,
"grad_norm": 1.277037262916565,
"kl": 2.29453125,
"learning_rate": 1.2339699363192461e-05,
"loss": 0.0918,
"reward": -0.41186855093886454,
"reward_std": 1.1698833445707957,
"rewards/Qwen2-0.5B-Reward": -0.41186855093886454,
"step": 1340
},
{
"completion_length": 814.8995402018229,
"epoch": 0.51110550227158,
"grad_norm": 1.1098392009735107,
"kl": 2.9515625,
"learning_rate": 1.2221911745069473e-05,
"loss": 0.118,
"reward": -0.7255906278888384,
"reward_std": 1.5052427490552267,
"rewards/Qwen2-0.5B-Reward": -0.7255906278888384,
"step": 1350
},
{
"completion_length": 800.6180623372396,
"epoch": 0.5148914689550732,
"grad_norm": 1.5379681587219238,
"kl": 3.078385416666667,
"learning_rate": 1.210391050769702e-05,
"loss": 0.1231,
"reward": -0.9011206914981206,
"reward_std": 1.5988249023755392,
"rewards/Qwen2-0.5B-Reward": -0.9011206914981206,
"step": 1360
},
{
"completion_length": 787.152783203125,
"epoch": 0.5186774356385664,
"grad_norm": 1.421747088432312,
"kl": 2.74453125,
"learning_rate": 1.1985716280499338e-05,
"loss": 0.1098,
"reward": -0.7614536421994368,
"reward_std": 1.4081373771031698,
"rewards/Qwen2-0.5B-Reward": -0.7614536421994368,
"step": 1370
},
{
"completion_length": 842.5861185709635,
"epoch": 0.5224634023220596,
"grad_norm": 2.403327226638794,
"kl": 3.16171875,
"learning_rate": 1.1867349726639868e-05,
"loss": 0.1266,
"reward": -0.8059929932157198,
"reward_std": 1.487107406059901,
"rewards/Qwen2-0.5B-Reward": -0.8059929932157198,
"step": 1380
},
{
"completion_length": 793.2569539388021,
"epoch": 0.5262493690055527,
"grad_norm": 1.0243574380874634,
"kl": 3.30625,
"learning_rate": 1.1748831539408863e-05,
"loss": 0.1323,
"reward": -0.9990609556436538,
"reward_std": 1.641613002618154,
"rewards/Qwen2-0.5B-Reward": -0.9990609556436538,
"step": 1390
},
{
"completion_length": 781.2513997395833,
"epoch": 0.5300353356890459,
"grad_norm": 1.4023561477661133,
"kl": 2.6401041666666667,
"learning_rate": 1.1630182438605688e-05,
"loss": 0.1056,
"reward": -0.73541273077329,
"reward_std": 1.391848737001419,
"rewards/Qwen2-0.5B-Reward": -0.73541273077329,
"step": 1400
},
{
"completion_length": 777.1314880371094,
"epoch": 0.5338213023725391,
"grad_norm": 1.4984385967254639,
"kl": 3.1786458333333334,
"learning_rate": 1.151142316691652e-05,
"loss": 0.1273,
"reward": -0.9620630964636803,
"reward_std": 1.6180862605571746,
"rewards/Qwen2-0.5B-Reward": -0.9620630964636803,
"step": 1410
},
{
"completion_length": 753.6296305338542,
"epoch": 0.5376072690560323,
"grad_norm": 0.6080305576324463,
"kl": 2.93046875,
"learning_rate": 1.1392574486288026e-05,
"loss": 0.1172,
"reward": -0.6871781093068421,
"reward_std": 1.4368105371793112,
"rewards/Qwen2-0.5B-Reward": -0.6871781093068421,
"step": 1420
},
{
"completion_length": 755.0680684407552,
"epoch": 0.5413932357395255,
"grad_norm": 0.9181307554244995,
"kl": 2.5361979166666666,
"learning_rate": 1.1273657174297687e-05,
"loss": 0.1016,
"reward": -0.41866928230350214,
"reward_std": 1.193355711301168,
"rewards/Qwen2-0.5B-Reward": -0.41866928230350214,
"step": 1430
},
{
"completion_length": 739.3643595377604,
"epoch": 0.5451792024230186,
"grad_norm": 1.3852412700653076,
"kl": 2.4328125,
"learning_rate": 1.1154692020521379e-05,
"loss": 0.0973,
"reward": -0.42044620849192144,
"reward_std": 1.1699665983517964,
"rewards/Qwen2-0.5B-Reward": -0.42044620849192144,
"step": 1440
},
{
"completion_length": 787.2263997395834,
"epoch": 0.5489651691065118,
"grad_norm": 1.2610223293304443,
"kl": 2.7135416666666665,
"learning_rate": 1.1035699822898852e-05,
"loss": 0.1085,
"reward": -0.5719452144578099,
"reward_std": 1.3674102127552032,
"rewards/Qwen2-0.5B-Reward": -0.5719452144578099,
"step": 1450
},
{
"completion_length": 794.1407450358073,
"epoch": 0.552751135790005,
"grad_norm": 3.987548351287842,
"kl": 3.580208333333333,
"learning_rate": 1.091670138409778e-05,
"loss": 0.1432,
"reward": -0.913334188858668,
"reward_std": 1.654043678442637,
"rewards/Qwen2-0.5B-Reward": -0.913334188858668,
"step": 1460
},
{
"completion_length": 755.6018575032552,
"epoch": 0.5565371024734982,
"grad_norm": 1.312009334564209,
"kl": 2.115364583333333,
"learning_rate": 1.0797717507876926e-05,
"loss": 0.0846,
"reward": -0.605161217538019,
"reward_std": 1.230643669764201,
"rewards/Qwen2-0.5B-Reward": -0.605161217538019,
"step": 1470
},
{
"completion_length": 745.5069519042969,
"epoch": 0.5603230691569914,
"grad_norm": 1.5958776473999023,
"kl": 2.8216145833333335,
"learning_rate": 1.0678768995449179e-05,
"loss": 0.1129,
"reward": -0.5114948400606711,
"reward_std": 1.213375515739123,
"rewards/Qwen2-0.5B-Reward": -0.5114948400606711,
"step": 1480
},
{
"completion_length": 790.2676025390625,
"epoch": 0.5641090358404846,
"grad_norm": 1.0892456769943237,
"kl": 2.9091145833333334,
"learning_rate": 1.055987664184499e-05,
"loss": 0.1164,
"reward": -0.6985714793205261,
"reward_std": 1.441979839404424,
"rewards/Qwen2-0.5B-Reward": -0.6985714793205261,
"step": 1490
},
{
"completion_length": 768.1861206054688,
"epoch": 0.5678950025239778,
"grad_norm": 1.5841772556304932,
"kl": 2.4014322916666666,
"learning_rate": 1.0441061232276914e-05,
"loss": 0.096,
"reward": -0.5361925270253172,
"reward_std": 1.2279207597176234,
"rewards/Qwen2-0.5B-Reward": -0.5361925270253172,
"step": 1500
},
{
"completion_length": 790.4763977050782,
"epoch": 0.571680969207471,
"grad_norm": 1.4919512271881104,
"kl": 2.945052083333333,
"learning_rate": 1.0322343538505859e-05,
"loss": 0.1178,
"reward": -0.6917820642391841,
"reward_std": 1.41629096865654,
"rewards/Qwen2-0.5B-Reward": -0.6917820642391841,
"step": 1510
},
{
"completion_length": 732.1388956705729,
"epoch": 0.5754669358909642,
"grad_norm": 1.3066332340240479,
"kl": 2.5669270833333333,
"learning_rate": 1.0203744315209683e-05,
"loss": 0.1026,
"reward": -0.4832228126314779,
"reward_std": 1.21365185379982,
"rewards/Qwen2-0.5B-Reward": -0.4832228126314779,
"step": 1520
},
{
"completion_length": 777.1152811686198,
"epoch": 0.5792529025744574,
"grad_norm": 2.0675883293151855,
"kl": 3.0052083333333335,
"learning_rate": 1.0085284296354784e-05,
"loss": 0.1202,
"reward": -0.7202197993795078,
"reward_std": 1.480885813633601,
"rewards/Qwen2-0.5B-Reward": -0.7202197993795078,
"step": 1530
},
{
"completion_length": 810.0231577555338,
"epoch": 0.5830388692579506,
"grad_norm": 1.1669964790344238,
"kl": 3.351822916666667,
"learning_rate": 9.966984191571318e-06,
"loss": 0.1341,
"reward": -0.9308211114102354,
"reward_std": 1.5289963026841482,
"rewards/Qwen2-0.5B-Reward": -0.9308211114102354,
"step": 1540
},
{
"completion_length": 803.5157470703125,
"epoch": 0.5868248359414437,
"grad_norm": 1.2970937490463257,
"kl": 2.9263020833333333,
"learning_rate": 9.848864682532654e-06,
"loss": 0.1171,
"reward": -0.897743321955204,
"reward_std": 1.4250325242678323,
"rewards/Qwen2-0.5B-Reward": -0.897743321955204,
"step": 1550
},
{
"completion_length": 774.900467936198,
"epoch": 0.5906108026249369,
"grad_norm": 1.5224976539611816,
"kl": 3.23203125,
"learning_rate": 9.730946419339721e-06,
"loss": 0.1293,
"reward": -0.8313487897316615,
"reward_std": 1.4089517414569854,
"rewards/Qwen2-0.5B-Reward": -0.8313487897316615,
"step": 1560
},
{
"completion_length": 814.4111185709636,
"epoch": 0.5943967693084301,
"grad_norm": 1.5672080516815186,
"kl": 2.9359375,
"learning_rate": 9.613250016910894e-06,
"loss": 0.1174,
"reward": -0.7221511860688528,
"reward_std": 1.3432387212912242,
"rewards/Qwen2-0.5B-Reward": -0.7221511860688528,
"step": 1570
},
{
"completion_length": 776.6129638671875,
"epoch": 0.5981827359919233,
"grad_norm": 1.8100062608718872,
"kl": 2.7890625,
"learning_rate": 9.495796051377997e-06,
"loss": 0.1115,
"reward": -0.8584653136630853,
"reward_std": 1.3234432935714722,
"rewards/Qwen2-0.5B-Reward": -0.8584653136630853,
"step": 1580
},
{
"completion_length": 825.8180603027344,
"epoch": 0.6019687026754165,
"grad_norm": 1.6404787302017212,
"kl": 3.863541666666667,
"learning_rate": 9.378605056489128e-06,
"loss": 0.1545,
"reward": -1.263607233762741,
"reward_std": 1.8019790093104044,
"rewards/Qwen2-0.5B-Reward": -1.263607233762741,
"step": 1590
},
{
"completion_length": 728.1912089029948,
"epoch": 0.6057546693589096,
"grad_norm": 0.8878143429756165,
"kl": 2.6088541666666667,
"learning_rate": 9.261697520018849e-06,
"loss": 0.1044,
"reward": -0.42785762051741283,
"reward_std": 1.0820347189903259,
"rewards/Qwen2-0.5B-Reward": -0.42785762051741283,
"step": 1600
},
{
"completion_length": 747.9509358723958,
"epoch": 0.6095406360424028,
"grad_norm": 1.613976240158081,
"kl": 2.468489583333333,
"learning_rate": 9.145093880186451e-06,
"loss": 0.0988,
"reward": -0.41555683029194673,
"reward_std": 1.179705987373988,
"rewards/Qwen2-0.5B-Reward": -0.41555683029194673,
"step": 1610
},
{
"completion_length": 787.43056640625,
"epoch": 0.613326602725896,
"grad_norm": 0.5864226818084717,
"kl": 2.894270833333333,
"learning_rate": 9.028814522082857e-06,
"loss": 0.1157,
"reward": -0.6661467840274174,
"reward_std": 1.412223219871521,
"rewards/Qwen2-0.5B-Reward": -0.6661467840274174,
"step": 1620
},
{
"completion_length": 742.3319498697916,
"epoch": 0.6171125694093892,
"grad_norm": 1.7149267196655273,
"kl": 2.7528645833333334,
"learning_rate": 8.912879774106832e-06,
"loss": 0.1101,
"reward": -0.560060964524746,
"reward_std": 1.2752733170986175,
"rewards/Qwen2-0.5B-Reward": -0.560060964524746,
"step": 1630
},
{
"completion_length": 750.7245422363281,
"epoch": 0.6208985360928824,
"grad_norm": 2.106180191040039,
"kl": 2.40546875,
"learning_rate": 8.797309904411087e-06,
"loss": 0.0962,
"reward": -0.416633996165668,
"reward_std": 1.1659721612930298,
"rewards/Qwen2-0.5B-Reward": -0.416633996165668,
"step": 1640
},
{
"completion_length": 778.6088033040364,
"epoch": 0.6246845027763755,
"grad_norm": 1.4638694524765015,
"kl": 2.676041666666667,
"learning_rate": 8.682125117358927e-06,
"loss": 0.1071,
"reward": -0.6446437170108159,
"reward_std": 1.3279209415117899,
"rewards/Qwen2-0.5B-Reward": -0.6446437170108159,
"step": 1650
},
{
"completion_length": 808.040283203125,
"epoch": 0.6284704694598687,
"grad_norm": 1.1022939682006836,
"kl": 3.4580729166666666,
"learning_rate": 8.567345549992045e-06,
"loss": 0.1383,
"reward": -0.7954719786842664,
"reward_std": 1.4967798054218293,
"rewards/Qwen2-0.5B-Reward": -0.7954719786842664,
"step": 1660
},
{
"completion_length": 757.4060282389323,
"epoch": 0.6322564361433619,
"grad_norm": 2.4723708629608154,
"kl": 2.792708333333333,
"learning_rate": 8.4529912685101e-06,
"loss": 0.1117,
"reward": -0.5523949672778448,
"reward_std": 1.3249893307685852,
"rewards/Qwen2-0.5B-Reward": -0.5523949672778448,
"step": 1670
},
{
"completion_length": 762.1398213704427,
"epoch": 0.6360424028268551,
"grad_norm": 0.8709607720375061,
"kl": 2.8286458333333333,
"learning_rate": 8.33908226476265e-06,
"loss": 0.1132,
"reward": -0.5545504409819841,
"reward_std": 1.3114221652348836,
"rewards/Qwen2-0.5B-Reward": -0.5545504409819841,
"step": 1680
},
{
"completion_length": 823.0356526692708,
"epoch": 0.6398283695103483,
"grad_norm": 0.969098687171936,
"kl": 2.855729166666667,
"learning_rate": 8.22563845275411e-06,
"loss": 0.1142,
"reward": -0.7070573056737582,
"reward_std": 1.3873663266499838,
"rewards/Qwen2-0.5B-Reward": -0.7070573056737582,
"step": 1690
},
{
"completion_length": 810.1981506347656,
"epoch": 0.6436143361938415,
"grad_norm": 1.2305635213851929,
"kl": 3.793229166666667,
"learning_rate": 8.11267966516231e-06,
"loss": 0.1518,
"reward": -1.061463608344396,
"reward_std": 1.7348846475283304,
"rewards/Qwen2-0.5B-Reward": -1.061463608344396,
"step": 1700
},
{
"completion_length": 776.243983968099,
"epoch": 0.6474003028773346,
"grad_norm": 1.6688897609710693,
"kl": 2.94375,
"learning_rate": 8.000225649871272e-06,
"loss": 0.1177,
"reward": -0.7328139250477155,
"reward_std": 1.4019733607769012,
"rewards/Qwen2-0.5B-Reward": -0.7328139250477155,
"step": 1710
},
{
"completion_length": 782.6092692057292,
"epoch": 0.6511862695608278,
"grad_norm": 2.184279680252075,
"kl": 3.275260416666667,
"learning_rate": 7.888296066518806e-06,
"loss": 0.131,
"reward": -0.826190093656381,
"reward_std": 1.539618053038915,
"rewards/Qwen2-0.5B-Reward": -0.826190093656381,
"step": 1720
},
{
"completion_length": 707.3263997395833,
"epoch": 0.6549722362443211,
"grad_norm": 2.3973989486694336,
"kl": 2.400260416666667,
"learning_rate": 7.776910483059543e-06,
"loss": 0.096,
"reward": -0.5184978457788626,
"reward_std": 1.1560731967290243,
"rewards/Qwen2-0.5B-Reward": -0.5184978457788626,
"step": 1730
},
{
"completion_length": 734.9152872721354,
"epoch": 0.6587582029278143,
"grad_norm": 1.8029112815856934,
"kl": 2.9859375,
"learning_rate": 7.666088372343984e-06,
"loss": 0.1194,
"reward": -0.5925529218278826,
"reward_std": 1.267720968524615,
"rewards/Qwen2-0.5B-Reward": -0.5925529218278826,
"step": 1740
},
{
"completion_length": 807.6726928710938,
"epoch": 0.6625441696113075,
"grad_norm": 1.5247033834457397,
"kl": 3.3872395833333333,
"learning_rate": 7.555849108714192e-06,
"loss": 0.1355,
"reward": -0.7715960969527562,
"reward_std": 1.4897764484087626,
"rewards/Qwen2-0.5B-Reward": -0.7715960969527562,
"step": 1750
},
{
"completion_length": 776.8838073730469,
"epoch": 0.6663301362948006,
"grad_norm": 1.9940361976623535,
"kl": 2.837760416666667,
"learning_rate": 7.4462119646166855e-06,
"loss": 0.1136,
"reward": -0.7241511250535647,
"reward_std": 1.4011840164661407,
"rewards/Qwen2-0.5B-Reward": -0.7241511250535647,
"step": 1760
},
{
"completion_length": 767.8162129720052,
"epoch": 0.6701161029782938,
"grad_norm": 1.5367672443389893,
"kl": 3.5140625,
"learning_rate": 7.337196107233155e-06,
"loss": 0.1407,
"reward": -0.7663616319497426,
"reward_std": 1.5210982898871104,
"rewards/Qwen2-0.5B-Reward": -0.7663616319497426,
"step": 1770
},
{
"completion_length": 721.7675944010417,
"epoch": 0.673902069661787,
"grad_norm": 1.302241563796997,
"kl": 2.931510416666667,
"learning_rate": 7.228820595129604e-06,
"loss": 0.1172,
"reward": -0.725257391979297,
"reward_std": 1.334197594722112,
"rewards/Qwen2-0.5B-Reward": -0.725257391979297,
"step": 1780
},
{
"completion_length": 720.3171468098958,
"epoch": 0.6776880363452802,
"grad_norm": 0.8652080297470093,
"kl": 3.028125,
"learning_rate": 7.12110437492443e-06,
"loss": 0.1211,
"reward": -0.753487682590882,
"reward_std": 1.4118338882923127,
"rewards/Qwen2-0.5B-Reward": -0.753487682590882,
"step": 1790
},
{
"completion_length": 744.602783203125,
"epoch": 0.6814740030287734,
"grad_norm": 0.6850081086158752,
"kl": 3.18046875,
"learning_rate": 7.014066277976128e-06,
"loss": 0.1272,
"reward": -0.6332276176661253,
"reward_std": 1.3656011939048767,
"rewards/Qwen2-0.5B-Reward": -0.6332276176661253,
"step": 1800
},
{
"completion_length": 759.4481526692708,
"epoch": 0.6852599697122665,
"grad_norm": 2.0515530109405518,
"kl": 3.3453125,
"learning_rate": 6.9077250170911005e-06,
"loss": 0.1338,
"reward": -0.8095526337623596,
"reward_std": 1.5075600425402322,
"rewards/Qwen2-0.5B-Reward": -0.8095526337623596,
"step": 1810
},
{
"completion_length": 723.5777811686198,
"epoch": 0.6890459363957597,
"grad_norm": 0.7833884358406067,
"kl": 2.9953125,
"learning_rate": 6.802099183252235e-06,
"loss": 0.1198,
"reward": -0.7537414369483789,
"reward_std": 1.383406792084376,
"rewards/Qwen2-0.5B-Reward": -0.7537414369483789,
"step": 1820
},
{
"completion_length": 724.8837972005208,
"epoch": 0.6928319030792529,
"grad_norm": 0.9831650853157043,
"kl": 2.5338541666666665,
"learning_rate": 6.697207242368742e-06,
"loss": 0.1013,
"reward": -0.43006037194281815,
"reward_std": 1.1635287086168924,
"rewards/Qwen2-0.5B-Reward": -0.43006037194281815,
"step": 1830
},
{
"completion_length": 760.3333374023438,
"epoch": 0.6966178697627461,
"grad_norm": 1.1536668539047241,
"kl": 2.6203125,
"learning_rate": 6.593067532047882e-06,
"loss": 0.1049,
"reward": -0.4441113060961167,
"reward_std": 1.1987637420495352,
"rewards/Qwen2-0.5B-Reward": -0.4441113060961167,
"step": 1840
},
{
"completion_length": 749.903251139323,
"epoch": 0.7004038364462393,
"grad_norm": 0.8368715643882751,
"kl": 2.5341145833333334,
"learning_rate": 6.489698258389107e-06,
"loss": 0.1013,
"reward": -0.5944258317351341,
"reward_std": 1.3474121958017349,
"rewards/Qwen2-0.5B-Reward": -0.5944258317351341,
"step": 1850
},
{
"completion_length": 745.5365783691407,
"epoch": 0.7041898031297325,
"grad_norm": 1.029958724975586,
"kl": 2.90078125,
"learning_rate": 6.387117492801213e-06,
"loss": 0.1161,
"reward": -0.6068828483422597,
"reward_std": 1.321648943424225,
"rewards/Qwen2-0.5B-Reward": -0.6068828483422597,
"step": 1860
},
{
"completion_length": 755.6328796386719,
"epoch": 0.7079757698132256,
"grad_norm": 5.108635425567627,
"kl": 2.9171875,
"learning_rate": 6.285343168843028e-06,
"loss": 0.1167,
"reward": -0.6523237491647402,
"reward_std": 1.3444733719031017,
"rewards/Qwen2-0.5B-Reward": -0.6523237491647402,
"step": 1870
},
{
"completion_length": 787.0935241699219,
"epoch": 0.7117617364967188,
"grad_norm": 1.3548846244812012,
"kl": 3.0869791666666666,
"learning_rate": 6.1843930790881766e-06,
"loss": 0.1235,
"reward": -0.6537054566045603,
"reward_std": 1.4838234384854634,
"rewards/Qwen2-0.5B-Reward": -0.6537054566045603,
"step": 1880
},
{
"completion_length": 773.563895670573,
"epoch": 0.715547703180212,
"grad_norm": 0.8410789966583252,
"kl": 2.837760416666667,
"learning_rate": 6.084284872014545e-06,
"loss": 0.1136,
"reward": -0.5507580937196811,
"reward_std": 1.2756544808546701,
"rewards/Qwen2-0.5B-Reward": -0.5507580937196811,
"step": 1890
},
{
"completion_length": 760.8699096679687,
"epoch": 0.7193336698637052,
"grad_norm": 1.5116900205612183,
"kl": 2.6723958333333333,
"learning_rate": 5.985036048918894e-06,
"loss": 0.1069,
"reward": -0.46427804150929053,
"reward_std": 1.1952710588773092,
"rewards/Qwen2-0.5B-Reward": -0.46427804150929053,
"step": 1900
},
{
"completion_length": 763.8004699707031,
"epoch": 0.7231196365471984,
"grad_norm": 1.1645935773849487,
"kl": 3.13828125,
"learning_rate": 5.886663960857202e-06,
"loss": 0.1255,
"reward": -0.7973003094395001,
"reward_std": 1.4403738955656686,
"rewards/Qwen2-0.5B-Reward": -0.7973003094395001,
"step": 1910
},
{
"completion_length": 746.5444559733073,
"epoch": 0.7269056032306915,
"grad_norm": 1.8314180374145508,
"kl": 3.378125,
"learning_rate": 5.789185805611313e-06,
"loss": 0.1351,
"reward": -0.6777333706617356,
"reward_std": 1.452496987581253,
"rewards/Qwen2-0.5B-Reward": -0.6777333706617356,
"step": 1920
},
{
"completion_length": 743.2513977050781,
"epoch": 0.7306915699141847,
"grad_norm": 1.8599276542663574,
"kl": 2.6572916666666666,
"learning_rate": 5.692618624682342e-06,
"loss": 0.1063,
"reward": -0.5468713939189911,
"reward_std": 1.203757886091868,
"rewards/Qwen2-0.5B-Reward": -0.5468713939189911,
"step": 1930
},
{
"completion_length": 715.9157470703125,
"epoch": 0.7344775365976779,
"grad_norm": 3.749554395675659,
"kl": 3.373177083333333,
"learning_rate": 5.596979300311408e-06,
"loss": 0.1351,
"reward": -0.42453126634160676,
"reward_std": 1.129069878657659,
"rewards/Qwen2-0.5B-Reward": -0.42453126634160676,
"step": 1940
},
{
"completion_length": 707.4583414713542,
"epoch": 0.7382635032811711,
"grad_norm": 1.2406065464019775,
"kl": 2.40546875,
"learning_rate": 5.502284552528236e-06,
"loss": 0.0962,
"reward": -0.3166978692635894,
"reward_std": 1.0220210254192352,
"rewards/Qwen2-0.5B-Reward": -0.3166978692635894,
"step": 1950
},
{
"completion_length": 730.9064880371094,
"epoch": 0.7420494699646644,
"grad_norm": 0.894660472869873,
"kl": 3.0755208333333335,
"learning_rate": 5.408550936228072e-06,
"loss": 0.1231,
"reward": -0.6020015890399615,
"reward_std": 1.3233680129051208,
"rewards/Qwen2-0.5B-Reward": -0.6020015890399615,
"step": 1960
},
{
"completion_length": 784.6120402018229,
"epoch": 0.7458354366481575,
"grad_norm": 0.9947274923324585,
"kl": 3.3036458333333334,
"learning_rate": 5.315794838277524e-06,
"loss": 0.1321,
"reward": -0.8605576127767562,
"reward_std": 1.5929324706395467,
"rewards/Qwen2-0.5B-Reward": -0.8605576127767562,
"step": 1970
},
{
"completion_length": 761.7782409667968,
"epoch": 0.7496214033316507,
"grad_norm": 0.8357589244842529,
"kl": 3.126822916666667,
"learning_rate": 5.2240324746497185e-06,
"loss": 0.1251,
"reward": -0.6573333943883578,
"reward_std": 1.3803256154060364,
"rewards/Qwen2-0.5B-Reward": -0.6573333943883578,
"step": 1980
},
{
"completion_length": 751.271309407552,
"epoch": 0.7534073700151439,
"grad_norm": 0.9635012149810791,
"kl": 2.846875,
"learning_rate": 5.133279887589381e-06,
"loss": 0.114,
"reward": -0.5246660086015861,
"reward_std": 1.2728915989398957,
"rewards/Qwen2-0.5B-Reward": -0.5246660086015861,
"step": 1990
},
{
"completion_length": 721.8902760823568,
"epoch": 0.7571933366986371,
"grad_norm": 1.915734887123108,
"kl": 2.886588541666667,
"learning_rate": 5.043552942808269e-06,
"loss": 0.1155,
"reward": -0.4225703233232101,
"reward_std": 1.1504804422458013,
"rewards/Qwen2-0.5B-Reward": -0.4225703233232101,
"step": 2000
},
{
"completion_length": 747.6074157714844,
"epoch": 0.7609793033821303,
"grad_norm": 1.7324910163879395,
"kl": 2.849739583333333,
"learning_rate": 4.9548673267114535e-06,
"loss": 0.114,
"reward": -0.4868051894629995,
"reward_std": 1.2382884542147319,
"rewards/Qwen2-0.5B-Reward": -0.4868051894629995,
"step": 2010
},
{
"completion_length": 723.2888916015625,
"epoch": 0.7647652700656234,
"grad_norm": 1.870195984840393,
"kl": 3.38125,
"learning_rate": 4.86723854365498e-06,
"loss": 0.1353,
"reward": -0.6813056563337644,
"reward_std": 1.4171151260534922,
"rewards/Qwen2-0.5B-Reward": -0.6813056563337644,
"step": 2020
},
{
"completion_length": 739.2546325683594,
"epoch": 0.7685512367491166,
"grad_norm": 0.6563529968261719,
"kl": 2.7765625,
"learning_rate": 4.78068191323533e-06,
"loss": 0.111,
"reward": -0.6810662182668845,
"reward_std": 1.3699560364087422,
"rewards/Qwen2-0.5B-Reward": -0.6810662182668845,
"step": 2030
},
{
"completion_length": 723.6301025390625,
"epoch": 0.7723372034326098,
"grad_norm": 0.845397412776947,
"kl": 3.3549479166666667,
"learning_rate": 4.695212567611183e-06,
"loss": 0.1343,
"reward": -0.6839562758803368,
"reward_std": 1.3764802972475687,
"rewards/Qwen2-0.5B-Reward": -0.6839562758803368,
"step": 2040
},
{
"completion_length": 707.3944529215495,
"epoch": 0.776123170116103,
"grad_norm": 0.8297199606895447,
"kl": 2.2606770833333334,
"learning_rate": 4.6108454488579754e-06,
"loss": 0.0904,
"reward": -0.32430495528969916,
"reward_std": 1.0496096114317577,
"rewards/Qwen2-0.5B-Reward": -0.32430495528969916,
"step": 2050
},
{
"completion_length": 728.9092631022136,
"epoch": 0.7799091367995962,
"grad_norm": 0.8965924382209778,
"kl": 2.7317708333333335,
"learning_rate": 4.5275953063556515e-06,
"loss": 0.1092,
"reward": -0.49890854886422553,
"reward_std": 1.1908490220705668,
"rewards/Qwen2-0.5B-Reward": -0.49890854886422553,
"step": 2060
},
{
"completion_length": 787.4157389322917,
"epoch": 0.7836951034830894,
"grad_norm": 1.6908742189407349,
"kl": 3.14453125,
"learning_rate": 4.445476694210125e-06,
"loss": 0.1258,
"reward": -0.6872879594564438,
"reward_std": 1.5059267342090608,
"rewards/Qwen2-0.5B-Reward": -0.6872879594564438,
"step": 2070
},
{
"completion_length": 724.3907409667969,
"epoch": 0.7874810701665825,
"grad_norm": 0.5646480917930603,
"kl": 2.5322916666666666,
"learning_rate": 4.364503968708885e-06,
"loss": 0.1013,
"reward": -0.4010113532965382,
"reward_std": 1.1661198248465856,
"rewards/Qwen2-0.5B-Reward": -0.4010113532965382,
"step": 2080
},
{
"completion_length": 762.3759338378907,
"epoch": 0.7912670368500757,
"grad_norm": 0.7707305550575256,
"kl": 3.08828125,
"learning_rate": 4.284691285811162e-06,
"loss": 0.1235,
"reward": -0.6063117478042841,
"reward_std": 1.4541340112686156,
"rewards/Qwen2-0.5B-Reward": -0.6063117478042841,
"step": 2090
},
{
"completion_length": 757.8597249348958,
"epoch": 0.7950530035335689,
"grad_norm": 0.609060525894165,
"kl": 2.7552083333333335,
"learning_rate": 4.206052598673134e-06,
"loss": 0.1102,
"reward": -0.5107901314894359,
"reward_std": 1.2742640137672425,
"rewards/Qwen2-0.5B-Reward": -0.5107901314894359,
"step": 2100
},
{
"completion_length": 714.1713012695312,
"epoch": 0.7988389702170621,
"grad_norm": 1.5023508071899414,
"kl": 2.7880208333333334,
"learning_rate": 4.128601655208588e-06,
"loss": 0.1115,
"reward": -0.4477219473881026,
"reward_std": 1.2109043717384338,
"rewards/Qwen2-0.5B-Reward": -0.4477219473881026,
"step": 2110
},
{
"completion_length": 742.9495381673177,
"epoch": 0.8026249369005553,
"grad_norm": 1.4843252897262573,
"kl": 2.490104166666667,
"learning_rate": 4.052351995685459e-06,
"loss": 0.0996,
"reward": -0.40210790758331616,
"reward_std": 1.1073905199766159,
"rewards/Qwen2-0.5B-Reward": -0.40210790758331616,
"step": 2120
},
{
"completion_length": 758.4166687011718,
"epoch": 0.8064109035840484,
"grad_norm": 0.8346318006515503,
"kl": 3.2510416666666666,
"learning_rate": 3.977316950358647e-06,
"loss": 0.1301,
"reward": -0.744351115822792,
"reward_std": 1.4400279184182485,
"rewards/Qwen2-0.5B-Reward": -0.744351115822792,
"step": 2130
},
{
"completion_length": 711.5217651367187,
"epoch": 0.8101968702675416,
"grad_norm": 3.075549840927124,
"kl": 2.4575520833333333,
"learning_rate": 3.903509637139604e-06,
"loss": 0.0983,
"reward": -0.4195836258431276,
"reward_std": 1.1368374347686767,
"rewards/Qwen2-0.5B-Reward": -0.4195836258431276,
"step": 2140
},
{
"completion_length": 667.8574137369792,
"epoch": 0.8139828369510348,
"grad_norm": 1.288053035736084,
"kl": 2.64140625,
"learning_rate": 3.830942959302988e-06,
"loss": 0.1056,
"reward": -0.25947842622796696,
"reward_std": 1.0453672617673875,
"rewards/Qwen2-0.5B-Reward": -0.25947842622796696,
"step": 2150
},
{
"completion_length": 713.2092692057291,
"epoch": 0.817768803634528,
"grad_norm": 1.47870934009552,
"kl": 3.060677083333333,
"learning_rate": 3.7596296032308655e-06,
"loss": 0.1224,
"reward": -0.5742474019527435,
"reward_std": 1.2993368287881215,
"rewards/Qwen2-0.5B-Reward": -0.5742474019527435,
"step": 2160
},
{
"completion_length": 756.1185282389323,
"epoch": 0.8215547703180212,
"grad_norm": 1.0809710025787354,
"kl": 3.0234375,
"learning_rate": 3.689582036194844e-06,
"loss": 0.121,
"reward": -0.6388996203740438,
"reward_std": 1.3941177546977996,
"rewards/Qwen2-0.5B-Reward": -0.6388996203740438,
"step": 2170
},
{
"completion_length": 689.1287068684895,
"epoch": 0.8253407370015143,
"grad_norm": 0.8256644606590271,
"kl": 2.6302083333333335,
"learning_rate": 3.620812504176483e-06,
"loss": 0.1052,
"reward": -0.3896134149283171,
"reward_std": 1.1061949849128723,
"rewards/Qwen2-0.5B-Reward": -0.3896134149283171,
"step": 2180
},
{
"completion_length": 747.3708435058594,
"epoch": 0.8291267036850076,
"grad_norm": 1.2586473226547241,
"kl": 2.8255208333333335,
"learning_rate": 3.5533330297264055e-06,
"loss": 0.113,
"reward": -0.47125562417010464,
"reward_std": 1.3159513572851818,
"rewards/Qwen2-0.5B-Reward": -0.47125562417010464,
"step": 2190
},
{
"completion_length": 718.9842681884766,
"epoch": 0.8329126703685008,
"grad_norm": 0.7325953841209412,
"kl": 2.89453125,
"learning_rate": 3.4871554098624783e-06,
"loss": 0.1159,
"reward": -0.515640505651633,
"reward_std": 1.2894119222958882,
"rewards/Qwen2-0.5B-Reward": -0.515640505651633,
"step": 2200
},
{
"completion_length": 730.6486206054688,
"epoch": 0.836698637051994,
"grad_norm": 1.3458070755004883,
"kl": 2.746354166666667,
"learning_rate": 3.4222912140074072e-06,
"loss": 0.1099,
"reward": -0.43878471093873184,
"reward_std": 1.1841597487529119,
"rewards/Qwen2-0.5B-Reward": -0.43878471093873184,
"step": 2210
},
{
"completion_length": 728.4597218831381,
"epoch": 0.8404846037354872,
"grad_norm": 2.082460880279541,
"kl": 3.025520833333333,
"learning_rate": 3.358751781966125e-06,
"loss": 0.121,
"reward": -0.5120975616077582,
"reward_std": 1.399947702884674,
"rewards/Qwen2-0.5B-Reward": -0.5120975616077582,
"step": 2220
},
{
"completion_length": 702.8384338378906,
"epoch": 0.8442705704189803,
"grad_norm": 0.7987167239189148,
"kl": 2.9817708333333335,
"learning_rate": 3.2965482219433266e-06,
"loss": 0.1193,
"reward": -0.5346707743903001,
"reward_std": 1.298090636730194,
"rewards/Qwen2-0.5B-Reward": -0.5346707743903001,
"step": 2230
},
{
"completion_length": 743.4412068684895,
"epoch": 0.8480565371024735,
"grad_norm": 1.0572713613510132,
"kl": 2.8296875,
"learning_rate": 3.2356914086014895e-06,
"loss": 0.1132,
"reward": -0.45420979845027126,
"reward_std": 1.2626650591691335,
"rewards/Qwen2-0.5B-Reward": -0.45420979845027126,
"step": 2240
},
{
"completion_length": 751.9037150065104,
"epoch": 0.8518425037859667,
"grad_norm": 1.2263774871826172,
"kl": 2.789322916666667,
"learning_rate": 3.1761919811597286e-06,
"loss": 0.1116,
"reward": -0.41814162402103344,
"reward_std": 1.254759935537974,
"rewards/Qwen2-0.5B-Reward": -0.41814162402103344,
"step": 2250
},
{
"completion_length": 735.0213012695312,
"epoch": 0.8556284704694599,
"grad_norm": 1.536089539527893,
"kl": 2.711197916666667,
"learning_rate": 3.118060341533795e-06,
"loss": 0.1084,
"reward": -0.3957721870703002,
"reward_std": 1.215382601817449,
"rewards/Qwen2-0.5B-Reward": -0.3957721870703002,
"step": 2260
},
{
"completion_length": 739.1541676839192,
"epoch": 0.8594144371529531,
"grad_norm": 2.2628087997436523,
"kl": 3.322135416666667,
"learning_rate": 3.0613066525175916e-06,
"loss": 0.1328,
"reward": -0.5474292345655462,
"reward_std": 1.3296300649642945,
"rewards/Qwen2-0.5B-Reward": -0.5474292345655462,
"step": 2270
},
{
"completion_length": 753.1319498697917,
"epoch": 0.8632004038364463,
"grad_norm": 1.759981393814087,
"kl": 2.53984375,
"learning_rate": 3.00594083600646e-06,
"loss": 0.1016,
"reward": -0.4004799094672004,
"reward_std": 1.2508702536424001,
"rewards/Qwen2-0.5B-Reward": -0.4004799094672004,
"step": 2280
},
{
"completion_length": 765.2296366373698,
"epoch": 0.8669863705199394,
"grad_norm": 1.7521519660949707,
"kl": 3.2877604166666665,
"learning_rate": 2.9519725712625993e-06,
"loss": 0.1315,
"reward": -0.5632258212814728,
"reward_std": 1.3489103774229685,
"rewards/Qwen2-0.5B-Reward": -0.5632258212814728,
"step": 2290
},
{
"completion_length": 728.2092671712239,
"epoch": 0.8707723372034326,
"grad_norm": 1.1282004117965698,
"kl": 2.808333333333333,
"learning_rate": 2.89941129322291e-06,
"loss": 0.1123,
"reward": -0.4623491804425915,
"reward_std": 1.2616208771864572,
"rewards/Qwen2-0.5B-Reward": -0.4623491804425915,
"step": 2300
},
{
"completion_length": 763.8801025390625,
"epoch": 0.8745583038869258,
"grad_norm": 1.6411226987838745,
"kl": 2.96328125,
"learning_rate": 2.848266190849534e-06,
"loss": 0.1186,
"reward": -0.47133560677369435,
"reward_std": 1.3187556425730387,
"rewards/Qwen2-0.5B-Reward": -0.47133560677369435,
"step": 2310
},
{
"completion_length": 767.462967936198,
"epoch": 0.878344270570419,
"grad_norm": 1.238519310951233,
"kl": 2.96015625,
"learning_rate": 2.798546205523405e-06,
"loss": 0.1184,
"reward": -0.553766346598665,
"reward_std": 1.3190133293469748,
"rewards/Qwen2-0.5B-Reward": -0.553766346598665,
"step": 2320
},
{
"completion_length": 738.1370381673177,
"epoch": 0.8821302372539122,
"grad_norm": 1.9779850244522095,
"kl": 2.7606770833333334,
"learning_rate": 2.7502600294810888e-06,
"loss": 0.1104,
"reward": -0.48763653316224614,
"reward_std": 1.276737904548645,
"rewards/Qwen2-0.5B-Reward": -0.48763653316224614,
"step": 2330
},
{
"completion_length": 772.7652852376302,
"epoch": 0.8859162039374053,
"grad_norm": 0.9569075107574463,
"kl": 3.640625,
"learning_rate": 2.7034161042951696e-06,
"loss": 0.1457,
"reward": -0.752403491238753,
"reward_std": 1.5029548863569895,
"rewards/Qwen2-0.5B-Reward": -0.752403491238753,
"step": 2340
},
{
"completion_length": 748.1027770996094,
"epoch": 0.8897021706208985,
"grad_norm": 1.2532896995544434,
"kl": 2.788802083333333,
"learning_rate": 2.658022619398459e-06,
"loss": 0.1115,
"reward": -0.5759354960018148,
"reward_std": 1.252836243311564,
"rewards/Qwen2-0.5B-Reward": -0.5759354960018148,
"step": 2350
},
{
"completion_length": 756.765283203125,
"epoch": 0.8934881373043917,
"grad_norm": 1.243710994720459,
"kl": 3.470572916666667,
"learning_rate": 2.6140875106522906e-06,
"loss": 0.1388,
"reward": -0.7527099266648293,
"reward_std": 1.5181720991929373,
"rewards/Qwen2-0.5B-Reward": -0.7527099266648293,
"step": 2360
},
{
"completion_length": 731.5490844726562,
"epoch": 0.8972741039878849,
"grad_norm": 0.8256412744522095,
"kl": 2.8911458333333333,
"learning_rate": 2.5716184589591504e-06,
"loss": 0.1156,
"reward": -0.4917602331067125,
"reward_std": 1.3739383776982625,
"rewards/Qwen2-0.5B-Reward": -0.4917602331067125,
"step": 2370
},
{
"completion_length": 762.2222249348958,
"epoch": 0.901060070671378,
"grad_norm": 0.976091206073761,
"kl": 3.2059895833333334,
"learning_rate": 2.5306228889198595e-06,
"loss": 0.1282,
"reward": -0.492262601479888,
"reward_std": 1.3222837885220846,
"rewards/Qwen2-0.5B-Reward": -0.492262601479888,
"step": 2380
},
{
"completion_length": 752.6963033040364,
"epoch": 0.9048460373548712,
"grad_norm": 0.8627796769142151,
"kl": 3.144270833333333,
"learning_rate": 2.4911079675355852e-06,
"loss": 0.1258,
"reward": -0.5920792824278275,
"reward_std": 1.4338179051876068,
"rewards/Qwen2-0.5B-Reward": -0.5920792824278275,
"step": 2390
},
{
"completion_length": 729.3250172932943,
"epoch": 0.9086320040383644,
"grad_norm": 2.569244384765625,
"kl": 3.0598958333333335,
"learning_rate": 2.453080602954878e-06,
"loss": 0.1224,
"reward": -0.5552944198250771,
"reward_std": 1.259453280766805,
"rewards/Qwen2-0.5B-Reward": -0.5552944198250771,
"step": 2400
},
{
"completion_length": 769.4680562337239,
"epoch": 0.9124179707218576,
"grad_norm": 1.9891189336776733,
"kl": 3.347395833333333,
"learning_rate": 2.416547443265959e-06,
"loss": 0.134,
"reward": -0.7994883202016354,
"reward_std": 1.5337923685709636,
"rewards/Qwen2-0.5B-Reward": -0.7994883202016354,
"step": 2410
},
{
"completion_length": 711.1333435058593,
"epoch": 0.9162039374053509,
"grad_norm": 1.2348560094833374,
"kl": 2.6640625,
"learning_rate": 2.381514875334478e-06,
"loss": 0.1066,
"reward": -0.4012350387871265,
"reward_std": 1.1682847638924916,
"rewards/Qwen2-0.5B-Reward": -0.4012350387871265,
"step": 2420
},
{
"completion_length": 728.6402770996094,
"epoch": 0.9199899040888441,
"grad_norm": 1.0510834455490112,
"kl": 2.4625,
"learning_rate": 2.34798902368694e-06,
"loss": 0.0985,
"reward": -0.255227384219567,
"reward_std": 1.0641139527161916,
"rewards/Qwen2-0.5B-Reward": -0.255227384219567,
"step": 2430
},
{
"completion_length": 742.7620483398438,
"epoch": 0.9237758707723372,
"grad_norm": 0.6936110854148865,
"kl": 2.7760416666666665,
"learning_rate": 2.31597574943999e-06,
"loss": 0.1111,
"reward": -0.32442100283806213,
"reward_std": 1.1662549694379172,
"rewards/Qwen2-0.5B-Reward": -0.32442100283806213,
"step": 2440
},
{
"completion_length": 709.3500081380208,
"epoch": 0.9275618374558304,
"grad_norm": 0.6553380489349365,
"kl": 2.91484375,
"learning_rate": 2.2854806492757473e-06,
"loss": 0.1166,
"reward": -0.4610091609259446,
"reward_std": 1.2611193935076395,
"rewards/Qwen2-0.5B-Reward": -0.4610091609259446,
"step": 2450
},
{
"completion_length": 751.5171427408854,
"epoch": 0.9313478041393236,
"grad_norm": 1.1703935861587524,
"kl": 3.030989583333333,
"learning_rate": 2.256509054463379e-06,
"loss": 0.1212,
"reward": -0.47760866036017735,
"reward_std": 1.3580244441827138,
"rewards/Qwen2-0.5B-Reward": -0.47760866036017735,
"step": 2460
},
{
"completion_length": 734.3888997395833,
"epoch": 0.9351337708228168,
"grad_norm": 1.4841110706329346,
"kl": 2.941666666666667,
"learning_rate": 2.2290660299270626e-06,
"loss": 0.1176,
"reward": -0.5363880881418784,
"reward_std": 1.320775838692983,
"rewards/Qwen2-0.5B-Reward": -0.5363880881418784,
"step": 2470
},
{
"completion_length": 792.8597361246744,
"epoch": 0.93891973750631,
"grad_norm": 0.9216225743293762,
"kl": 3.2005208333333335,
"learning_rate": 2.2031563733605154e-06,
"loss": 0.128,
"reward": -0.6734383806586266,
"reward_std": 1.5115692138671875,
"rewards/Qwen2-0.5B-Reward": -0.6734383806586266,
"step": 2480
},
{
"completion_length": 742.3555603027344,
"epoch": 0.9427057041898032,
"grad_norm": 0.8652907013893127,
"kl": 2.9796875,
"learning_rate": 2.178784614388247e-06,
"loss": 0.1192,
"reward": -0.5235640426476796,
"reward_std": 1.2792722801367442,
"rewards/Qwen2-0.5B-Reward": -0.5235640426476796,
"step": 2490
},
{
"completion_length": 731.887510172526,
"epoch": 0.9464916708732963,
"grad_norm": 0.900198221206665,
"kl": 2.758072916666667,
"learning_rate": 2.155955013773674e-06,
"loss": 0.1102,
"reward": -0.427229492738843,
"reward_std": 1.2093970189491907,
"rewards/Qwen2-0.5B-Reward": -0.427229492738843,
"step": 2500
},
{
"completion_length": 742.8615783691406,
"epoch": 0.9502776375567895,
"grad_norm": 1.4608945846557617,
"kl": 2.884635416666667,
"learning_rate": 2.134671562674233e-06,
"loss": 0.1154,
"reward": -0.40613490512090117,
"reward_std": 1.2598043183485668,
"rewards/Qwen2-0.5B-Reward": -0.40613490512090117,
"step": 2510
},
{
"completion_length": 732.2143575032552,
"epoch": 0.9540636042402827,
"grad_norm": 0.861190676689148,
"kl": 3.107291666666667,
"learning_rate": 2.114937981943634e-06,
"loss": 0.1243,
"reward": -0.4464622031897306,
"reward_std": 1.2578558444976806,
"rewards/Qwen2-0.5B-Reward": -0.4464622031897306,
"step": 2520
},
{
"completion_length": 796.321767171224,
"epoch": 0.9578495709237759,
"grad_norm": 2.202199697494507,
"kl": 3.068489583333333,
"learning_rate": 2.096757721481365e-06,
"loss": 0.1228,
"reward": -0.6399494647979737,
"reward_std": 1.4180189092954,
"rewards/Qwen2-0.5B-Reward": -0.6399494647979737,
"step": 2530
},
{
"completion_length": 744.4689880371094,
"epoch": 0.961635537607269,
"grad_norm": 0.9193338751792908,
"kl": 2.9799479166666667,
"learning_rate": 2.0801339596295706e-06,
"loss": 0.1192,
"reward": -0.5712469642050564,
"reward_std": 1.3389502465724945,
"rewards/Qwen2-0.5B-Reward": -0.5712469642050564,
"step": 2540
},
{
"completion_length": 779.9819458007812,
"epoch": 0.9654215042907622,
"grad_norm": 1.811191439628601,
"kl": 3.6411458333333333,
"learning_rate": 2.0650696026173993e-06,
"loss": 0.1456,
"reward": -0.7589993777374426,
"reward_std": 1.5557840009530386,
"rewards/Qwen2-0.5B-Reward": -0.7589993777374426,
"step": 2550
},
{
"completion_length": 782.5782531738281,
"epoch": 0.9692074709742554,
"grad_norm": 0.9566059112548828,
"kl": 3.095572916666667,
"learning_rate": 2.051567284052924e-06,
"loss": 0.1238,
"reward": -0.6302419572137297,
"reward_std": 1.4944741606712342,
"rewards/Qwen2-0.5B-Reward": -0.6302419572137297,
"step": 2560
},
{
"completion_length": 718.9152872721354,
"epoch": 0.9729934376577486,
"grad_norm": 0.9425510168075562,
"kl": 2.82890625,
"learning_rate": 2.0396293644627313e-06,
"loss": 0.1132,
"reward": -0.32908876914686214,
"reward_std": 1.2080858111381532,
"rewards/Qwen2-0.5B-Reward": -0.32908876914686214,
"step": 2570
},
{
"completion_length": 732.7824157714844,
"epoch": 0.9767794043412418,
"grad_norm": 0.9575442671775818,
"kl": 3.121875,
"learning_rate": 2.0292579308792374e-06,
"loss": 0.125,
"reward": -0.47131281966964406,
"reward_std": 1.3826497634251913,
"rewards/Qwen2-0.5B-Reward": -0.47131281966964406,
"step": 2580
},
{
"completion_length": 761.6578694661458,
"epoch": 0.980565371024735,
"grad_norm": 1.0160202980041504,
"kl": 2.9203125,
"learning_rate": 2.020454796475829e-06,
"loss": 0.1168,
"reward": -0.47771473427613576,
"reward_std": 1.2897698918978373,
"rewards/Qwen2-0.5B-Reward": -0.47771473427613576,
"step": 2590
},
{
"completion_length": 781.389815266927,
"epoch": 0.9843513377082281,
"grad_norm": 2.1385881900787354,
"kl": 2.9213541666666667,
"learning_rate": 2.013221500249879e-06,
"loss": 0.1168,
"reward": -0.4969511273006598,
"reward_std": 1.3705980678399403,
"rewards/Qwen2-0.5B-Reward": -0.4969511273006598,
"step": 2600
},
{
"completion_length": 748.4037109375,
"epoch": 0.9881373043917213,
"grad_norm": 1.3061258792877197,
"kl": 3.1223958333333335,
"learning_rate": 2.0075593067536895e-06,
"loss": 0.1249,
"reward": -0.511777646218737,
"reward_std": 1.338163250684738,
"rewards/Qwen2-0.5B-Reward": -0.511777646218737,
"step": 2610
},
{
"completion_length": 730.8513916015625,
"epoch": 0.9919232710752145,
"grad_norm": 1.0508885383605957,
"kl": 2.6411458333333333,
"learning_rate": 2.0034692058734197e-06,
"loss": 0.1056,
"reward": -0.3765604312221209,
"reward_std": 1.2147092600663503,
"rewards/Qwen2-0.5B-Reward": -0.3765604312221209,
"step": 2620
},
{
"completion_length": 783.2115763346354,
"epoch": 0.9957092377587077,
"grad_norm": 1.1735745668411255,
"kl": 3.4817708333333335,
"learning_rate": 2.000951912656033e-06,
"loss": 0.1392,
"reward": -0.6186425998806954,
"reward_std": 1.4974812746047974,
"rewards/Qwen2-0.5B-Reward": -0.6186425998806954,
"step": 2630
},
{
"completion_length": 786.5088033040364,
"epoch": 0.9994952044422009,
"grad_norm": 1.1970211267471313,
"kl": 3.134375,
"learning_rate": 2.0000078671842824e-06,
"loss": 0.1254,
"reward": -0.662852063588798,
"reward_std": 1.5238366266091665,
"rewards/Qwen2-0.5B-Reward": -0.662852063588798,
"step": 2640
},
{
"completion_length": 728.1759236653646,
"epoch": 0.9998738011105502,
"kl": 3.125,
"reward": -0.8051454126834869,
"reward_std": 1.3035079042116802,
"rewards/Qwen2-0.5B-Reward": -0.8051454126834869,
"step": 2641,
"total_flos": 0.0,
"train_loss": 0.6071465962344739,
"train_runtime": 159997.8149,
"train_samples_per_second": 1.189,
"train_steps_per_second": 0.017
}
],
"logging_steps": 10,
"max_steps": 2641,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 24,
"trial_name": null,
"trial_params": null
}