|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9998738011105502, |
|
"eval_steps": 500, |
|
"global_step": 2641, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 862.3935139973959, |
|
"epoch": 0.0003785966683493185, |
|
"grad_norm": 0.29023680090904236, |
|
"kl": 0.0, |
|
"learning_rate": 7.547169811320756e-08, |
|
"loss": -0.0, |
|
"reward": -2.4199581146240234, |
|
"reward_std": 0.6020505428314209, |
|
"rewards/Qwen2-0.5B-Reward": -2.4199581146240234, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 921.5138956705729, |
|
"epoch": 0.003785966683493185, |
|
"grad_norm": 0.5279305577278137, |
|
"kl": 0.00010103649563259548, |
|
"learning_rate": 7.547169811320755e-07, |
|
"loss": 0.0, |
|
"reward": -2.473096079296536, |
|
"reward_std": 0.5592167631343559, |
|
"rewards/Qwen2-0.5B-Reward": -2.473096079296536, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 910.1000081380208, |
|
"epoch": 0.00757193336698637, |
|
"grad_norm": 0.2346569150686264, |
|
"kl": 0.00012467702229817707, |
|
"learning_rate": 1.509433962264151e-06, |
|
"loss": 0.0, |
|
"reward": -2.422696002324422, |
|
"reward_std": 0.5530827701091766, |
|
"rewards/Qwen2-0.5B-Reward": -2.422696002324422, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 911.8717631022135, |
|
"epoch": 0.011357900050479555, |
|
"grad_norm": 0.21592262387275696, |
|
"kl": 0.0003096898396809896, |
|
"learning_rate": 2.2641509433962266e-06, |
|
"loss": 0.0, |
|
"reward": -2.41411194006602, |
|
"reward_std": 0.5578982929388682, |
|
"rewards/Qwen2-0.5B-Reward": -2.41411194006602, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 893.8527893066406, |
|
"epoch": 0.01514386673397274, |
|
"grad_norm": 0.26798614859580994, |
|
"kl": 0.0015757242838541667, |
|
"learning_rate": 3.018867924528302e-06, |
|
"loss": 0.0001, |
|
"reward": -2.297330105304718, |
|
"reward_std": 0.528485847512881, |
|
"rewards/Qwen2-0.5B-Reward": -2.297330105304718, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 875.3588053385416, |
|
"epoch": 0.018929833417465926, |
|
"grad_norm": 0.27687838673591614, |
|
"kl": 0.0064605712890625, |
|
"learning_rate": 3.7735849056603777e-06, |
|
"loss": 0.0003, |
|
"reward": -2.010285266240438, |
|
"reward_std": 0.5279872556527455, |
|
"rewards/Qwen2-0.5B-Reward": -2.010285266240438, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 859.5717692057292, |
|
"epoch": 0.02271580010095911, |
|
"grad_norm": 0.2615930736064911, |
|
"kl": 0.018147786458333332, |
|
"learning_rate": 4.528301886792453e-06, |
|
"loss": 0.0007, |
|
"reward": -1.795549988746643, |
|
"reward_std": 0.49105457464853924, |
|
"rewards/Qwen2-0.5B-Reward": -1.795549988746643, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 765.7564921061198, |
|
"epoch": 0.026501766784452298, |
|
"grad_norm": 0.28387993574142456, |
|
"kl": 0.028316243489583334, |
|
"learning_rate": 5.283018867924529e-06, |
|
"loss": 0.0011, |
|
"reward": -1.4461613575617471, |
|
"reward_std": 0.47599050005277, |
|
"rewards/Qwen2-0.5B-Reward": -1.4461613575617471, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 783.0041768391927, |
|
"epoch": 0.03028773346794548, |
|
"grad_norm": 0.25285565853118896, |
|
"kl": 0.040238444010416666, |
|
"learning_rate": 6.037735849056604e-06, |
|
"loss": 0.0016, |
|
"reward": -1.193545683224996, |
|
"reward_std": 0.4799055278301239, |
|
"rewards/Qwen2-0.5B-Reward": -1.193545683224996, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 836.7032450358073, |
|
"epoch": 0.034073700151438666, |
|
"grad_norm": 0.25005629658699036, |
|
"kl": 0.05516764322916667, |
|
"learning_rate": 6.792452830188679e-06, |
|
"loss": 0.0022, |
|
"reward": -1.0326486746470134, |
|
"reward_std": 0.5193435788154602, |
|
"rewards/Qwen2-0.5B-Reward": -1.0326486746470134, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 853.6324157714844, |
|
"epoch": 0.03785966683493185, |
|
"grad_norm": 0.3665623664855957, |
|
"kl": 0.10475260416666667, |
|
"learning_rate": 7.5471698113207555e-06, |
|
"loss": 0.0042, |
|
"reward": -0.9854023973147075, |
|
"reward_std": 0.6189069559176763, |
|
"rewards/Qwen2-0.5B-Reward": -0.9854023973147075, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 815.8801005045573, |
|
"epoch": 0.04164563351842504, |
|
"grad_norm": 0.8496055006980896, |
|
"kl": 0.41549479166666664, |
|
"learning_rate": 8.301886792452832e-06, |
|
"loss": 0.0166, |
|
"reward": -1.463372488816579, |
|
"reward_std": 1.0357649803161622, |
|
"rewards/Qwen2-0.5B-Reward": -1.463372488816579, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 813.9148213704427, |
|
"epoch": 0.04543160020191822, |
|
"grad_norm": 0.3407374918460846, |
|
"kl": 0.40042317708333336, |
|
"learning_rate": 9.056603773584907e-06, |
|
"loss": 0.016, |
|
"reward": -1.8741844495137532, |
|
"reward_std": 1.722016990184784, |
|
"rewards/Qwen2-0.5B-Reward": -1.8741844495137532, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 646.899545288086, |
|
"epoch": 0.04921756688541141, |
|
"grad_norm": 0.4906499981880188, |
|
"kl": 0.2925618489583333, |
|
"learning_rate": 9.811320754716981e-06, |
|
"loss": 0.0117, |
|
"reward": -1.3685388286908469, |
|
"reward_std": 1.243816477060318, |
|
"rewards/Qwen2-0.5B-Reward": -1.3685388286908469, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 547.943989054362, |
|
"epoch": 0.053003533568904596, |
|
"grad_norm": 1.3670618534088135, |
|
"kl": 1.1440104166666667, |
|
"learning_rate": 1.0566037735849058e-05, |
|
"loss": 0.0458, |
|
"reward": -2.530128773053487, |
|
"reward_std": 2.079139538606008, |
|
"rewards/Qwen2-0.5B-Reward": -2.530128773053487, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 482.1189860026042, |
|
"epoch": 0.056789500252397776, |
|
"grad_norm": 2.426543712615967, |
|
"kl": 2.05546875, |
|
"learning_rate": 1.1320754716981132e-05, |
|
"loss": 0.0822, |
|
"reward": -3.6518485943476358, |
|
"reward_std": 2.5313418904940286, |
|
"rewards/Qwen2-0.5B-Reward": -3.6518485943476358, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 583.3393595377604, |
|
"epoch": 0.06057546693589096, |
|
"grad_norm": 7.087838649749756, |
|
"kl": 1.375, |
|
"learning_rate": 1.2075471698113209e-05, |
|
"loss": 0.055, |
|
"reward": -2.7873202482859294, |
|
"reward_std": 2.297369889418284, |
|
"rewards/Qwen2-0.5B-Reward": -2.7873202482859294, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 581.331483968099, |
|
"epoch": 0.06436143361938415, |
|
"grad_norm": 0.3393622636795044, |
|
"kl": 0.7234049479166667, |
|
"learning_rate": 1.2830188679245283e-05, |
|
"loss": 0.029, |
|
"reward": -1.6760946492354074, |
|
"reward_std": 1.530751649538676, |
|
"rewards/Qwen2-0.5B-Reward": -1.6760946492354074, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 551.2680633544921, |
|
"epoch": 0.06814740030287733, |
|
"grad_norm": 0.5040144920349121, |
|
"kl": 0.5955729166666667, |
|
"learning_rate": 1.3584905660377358e-05, |
|
"loss": 0.0238, |
|
"reward": -1.6978328824043274, |
|
"reward_std": 1.5175378421942394, |
|
"rewards/Qwen2-0.5B-Reward": -1.6978328824043274, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 498.8856536865234, |
|
"epoch": 0.07193336698637053, |
|
"grad_norm": 0.6744162440299988, |
|
"kl": 1.2225260416666666, |
|
"learning_rate": 1.4339622641509435e-05, |
|
"loss": 0.0489, |
|
"reward": -2.619816021124522, |
|
"reward_std": 2.05845144589742, |
|
"rewards/Qwen2-0.5B-Reward": -2.619816021124522, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 651.0615783691406, |
|
"epoch": 0.0757193336698637, |
|
"grad_norm": 0.47306591272354126, |
|
"kl": 0.9920572916666667, |
|
"learning_rate": 1.5094339622641511e-05, |
|
"loss": 0.0397, |
|
"reward": -2.183918062845866, |
|
"reward_std": 2.0014989256858824, |
|
"rewards/Qwen2-0.5B-Reward": -2.183918062845866, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 723.8722351074218, |
|
"epoch": 0.07950530035335689, |
|
"grad_norm": 0.8630687594413757, |
|
"kl": 1.132421875, |
|
"learning_rate": 1.5849056603773586e-05, |
|
"loss": 0.0453, |
|
"reward": -2.3433102289835612, |
|
"reward_std": 2.1008309284845987, |
|
"rewards/Qwen2-0.5B-Reward": -2.3433102289835612, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 633.7060221354167, |
|
"epoch": 0.08329126703685008, |
|
"grad_norm": 0.33421555161476135, |
|
"kl": 1.11015625, |
|
"learning_rate": 1.6603773584905664e-05, |
|
"loss": 0.0444, |
|
"reward": -1.7007139801979065, |
|
"reward_std": 1.8362650871276855, |
|
"rewards/Qwen2-0.5B-Reward": -1.7007139801979065, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 801.9037150065104, |
|
"epoch": 0.08707723372034326, |
|
"grad_norm": 0.2525235116481781, |
|
"kl": 0.492578125, |
|
"learning_rate": 1.735849056603774e-05, |
|
"loss": 0.0197, |
|
"reward": -0.9283350398143132, |
|
"reward_std": 1.1985284070173898, |
|
"rewards/Qwen2-0.5B-Reward": -0.9283350398143132, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 757.5694498697917, |
|
"epoch": 0.09086320040383644, |
|
"grad_norm": 0.23876462876796722, |
|
"kl": 0.5396484375, |
|
"learning_rate": 1.8113207547169813e-05, |
|
"loss": 0.0216, |
|
"reward": -1.083450937271118, |
|
"reward_std": 1.4640587449073792, |
|
"rewards/Qwen2-0.5B-Reward": -1.083450937271118, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 691.9120452880859, |
|
"epoch": 0.09464916708732964, |
|
"grad_norm": 0.5013711452484131, |
|
"kl": 0.6822265625, |
|
"learning_rate": 1.8867924528301888e-05, |
|
"loss": 0.0273, |
|
"reward": -1.4570284724235534, |
|
"reward_std": 1.626520773768425, |
|
"rewards/Qwen2-0.5B-Reward": -1.4570284724235534, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 573.8166727701823, |
|
"epoch": 0.09843513377082282, |
|
"grad_norm": 0.6115075349807739, |
|
"kl": 1.6170572916666666, |
|
"learning_rate": 1.9622641509433963e-05, |
|
"loss": 0.0647, |
|
"reward": -2.9246065855026244, |
|
"reward_std": 2.250337036450704, |
|
"rewards/Qwen2-0.5B-Reward": -2.9246065855026244, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 773.3074178059895, |
|
"epoch": 0.102221100454316, |
|
"grad_norm": 0.31634387373924255, |
|
"kl": 0.733984375, |
|
"learning_rate": 1.999980332108064e-05, |
|
"loss": 0.0294, |
|
"reward": -1.5115862051645914, |
|
"reward_std": 1.5288376450538634, |
|
"rewards/Qwen2-0.5B-Reward": -1.5115862051645914, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 829.3129659016927, |
|
"epoch": 0.10600706713780919, |
|
"grad_norm": 0.25395989418029785, |
|
"kl": 0.4763671875, |
|
"learning_rate": 1.9998229941302175e-05, |
|
"loss": 0.0191, |
|
"reward": -0.9474448690811793, |
|
"reward_std": 1.09269377887249, |
|
"rewards/Qwen2-0.5B-Reward": -0.9474448690811793, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 999.9018575032552, |
|
"epoch": 0.10979303382130237, |
|
"grad_norm": 0.3452966809272766, |
|
"kl": 0.6498046875, |
|
"learning_rate": 1.9995083456809467e-05, |
|
"loss": 0.026, |
|
"reward": -1.7776759227116903, |
|
"reward_std": 1.8762857417265575, |
|
"rewards/Qwen2-0.5B-Reward": -1.7776759227116903, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 867.9375081380208, |
|
"epoch": 0.11357900050479555, |
|
"grad_norm": 0.4680746793746948, |
|
"kl": 0.98203125, |
|
"learning_rate": 1.9990364417682882e-05, |
|
"loss": 0.0393, |
|
"reward": -2.6815950234731036, |
|
"reward_std": 2.1339449683825173, |
|
"rewards/Qwen2-0.5B-Reward": -2.6815950234731036, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 841.0263916015625, |
|
"epoch": 0.11736496718828875, |
|
"grad_norm": 0.4002317786216736, |
|
"kl": 1.4239583333333334, |
|
"learning_rate": 1.9984073648922753e-05, |
|
"loss": 0.057, |
|
"reward": -2.9415343125661213, |
|
"reward_std": 2.4492496887842816, |
|
"rewards/Qwen2-0.5B-Reward": -2.9415343125661213, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 845.747226969401, |
|
"epoch": 0.12115093387178193, |
|
"grad_norm": 0.7248504757881165, |
|
"kl": 1.8854166666666667, |
|
"learning_rate": 1.997621225030515e-05, |
|
"loss": 0.0754, |
|
"reward": -3.695864470799764, |
|
"reward_std": 2.583825929959615, |
|
"rewards/Qwen2-0.5B-Reward": -3.695864470799764, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 1439.778253173828, |
|
"epoch": 0.12493690055527511, |
|
"grad_norm": 0.602688193321228, |
|
"kl": 1.6190104166666666, |
|
"learning_rate": 1.9966781596189623e-05, |
|
"loss": 0.0648, |
|
"reward": -3.7271327575047812, |
|
"reward_std": 1.9023333628972372, |
|
"rewards/Qwen2-0.5B-Reward": -3.7271327575047812, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 1148.3754781087239, |
|
"epoch": 0.1287228672387683, |
|
"grad_norm": 0.6237585544586182, |
|
"kl": 1.2373697916666666, |
|
"learning_rate": 1.9955783335278924e-05, |
|
"loss": 0.0495, |
|
"reward": -3.0485590934753417, |
|
"reward_std": 1.7622671604156495, |
|
"rewards/Qwen2-0.5B-Reward": -3.0485590934753417, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 871.3398213704427, |
|
"epoch": 0.13250883392226148, |
|
"grad_norm": 0.7727285623550415, |
|
"kl": 0.8674479166666667, |
|
"learning_rate": 1.9943219390330767e-05, |
|
"loss": 0.0347, |
|
"reward": -2.604492497444153, |
|
"reward_std": 1.5047667543093364, |
|
"rewards/Qwen2-0.5B-Reward": -2.604492497444153, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 731.1259358723959, |
|
"epoch": 0.13629480060575466, |
|
"grad_norm": 0.2828836441040039, |
|
"kl": 0.725, |
|
"learning_rate": 1.9929091957821703e-05, |
|
"loss": 0.029, |
|
"reward": -1.8863240122795104, |
|
"reward_std": 1.1796027421951294, |
|
"rewards/Qwen2-0.5B-Reward": -1.8863240122795104, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 593.0722300211588, |
|
"epoch": 0.14008076728924784, |
|
"grad_norm": 0.34980472922325134, |
|
"kl": 0.5994791666666667, |
|
"learning_rate": 1.9913403507563104e-05, |
|
"loss": 0.024, |
|
"reward": -1.5030529995759327, |
|
"reward_std": 0.9978658020496368, |
|
"rewards/Qwen2-0.5B-Reward": -1.5030529995759327, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 574.3398223876953, |
|
"epoch": 0.14386673397274105, |
|
"grad_norm": 0.44906890392303467, |
|
"kl": 0.98515625, |
|
"learning_rate": 1.9896156782269405e-05, |
|
"loss": 0.0394, |
|
"reward": -2.0863842129707337, |
|
"reward_std": 1.7282814304033915, |
|
"rewards/Qwen2-0.5B-Reward": -2.0863842129707337, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 598.7032470703125, |
|
"epoch": 0.14765270065623423, |
|
"grad_norm": 0.42980390787124634, |
|
"kl": 1.1180989583333334, |
|
"learning_rate": 1.9877354797078577e-05, |
|
"loss": 0.0448, |
|
"reward": -1.7375385125478109, |
|
"reward_std": 1.6146510203679403, |
|
"rewards/Qwen2-0.5B-Reward": -1.7375385125478109, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 522.9648213704427, |
|
"epoch": 0.1514386673397274, |
|
"grad_norm": 0.4960784614086151, |
|
"kl": 0.8545572916666667, |
|
"learning_rate": 1.9857000839025043e-05, |
|
"loss": 0.0342, |
|
"reward": -1.5845000902811686, |
|
"reward_std": 1.6743145366509755, |
|
"rewards/Qwen2-0.5B-Reward": -1.5845000902811686, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 650.5805603027344, |
|
"epoch": 0.1552246340232206, |
|
"grad_norm": 3.5647335052490234, |
|
"kl": 0.7625, |
|
"learning_rate": 1.983509846646502e-05, |
|
"loss": 0.0305, |
|
"reward": -1.3111775855223338, |
|
"reward_std": 1.5136757413546245, |
|
"rewards/Qwen2-0.5B-Reward": -1.3111775855223338, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 608.793061319987, |
|
"epoch": 0.15901060070671377, |
|
"grad_norm": 0.7843858599662781, |
|
"kl": 1.0794270833333333, |
|
"learning_rate": 1.9811651508454405e-05, |
|
"loss": 0.0432, |
|
"reward": -1.030318695306778, |
|
"reward_std": 1.3256585756937662, |
|
"rewards/Qwen2-0.5B-Reward": -1.030318695306778, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 717.2574086507161, |
|
"epoch": 0.16279656739020695, |
|
"grad_norm": 1.8327181339263916, |
|
"kl": 1.7510416666666666, |
|
"learning_rate": 1.97866640640794e-05, |
|
"loss": 0.07, |
|
"reward": -1.3865876078605652, |
|
"reward_std": 1.5873213092486063, |
|
"rewards/Qwen2-0.5B-Reward": -1.3865876078605652, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 886.4629699707032, |
|
"epoch": 0.16658253407370016, |
|
"grad_norm": 1.820997714996338, |
|
"kl": 2.492317708333333, |
|
"learning_rate": 1.9760140501739885e-05, |
|
"loss": 0.0997, |
|
"reward": -2.041487044095993, |
|
"reward_std": 1.9687125941117605, |
|
"rewards/Qwen2-0.5B-Reward": -2.041487044095993, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 761.4412068684895, |
|
"epoch": 0.17036850075719334, |
|
"grad_norm": 2.2856929302215576, |
|
"kl": 2.5984375, |
|
"learning_rate": 1.9732085458385706e-05, |
|
"loss": 0.104, |
|
"reward": -1.7194086611270905, |
|
"reward_std": 1.6787285923957824, |
|
"rewards/Qwen2-0.5B-Reward": -1.7194086611270905, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 800.9458414713541, |
|
"epoch": 0.17415446744068652, |
|
"grad_norm": 1.5514496564865112, |
|
"kl": 3.582291666666667, |
|
"learning_rate": 1.9702503838706032e-05, |
|
"loss": 0.1433, |
|
"reward": -1.918267943461736, |
|
"reward_std": 1.798914521932602, |
|
"rewards/Qwen2-0.5B-Reward": -1.918267943461736, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 766.8731547037761, |
|
"epoch": 0.1779404341241797, |
|
"grad_norm": 2.541083812713623, |
|
"kl": 2.43359375, |
|
"learning_rate": 1.9671400814271904e-05, |
|
"loss": 0.0973, |
|
"reward": -1.5191373944282531, |
|
"reward_std": 1.7267815709114074, |
|
"rewards/Qwen2-0.5B-Reward": -1.5191373944282531, |
|
"step": 470 |
|
}, |
|
{ |
|
"completion_length": 730.6893575032552, |
|
"epoch": 0.18172640080767288, |
|
"grad_norm": 2.0177841186523438, |
|
"kl": 1.9341145833333333, |
|
"learning_rate": 1.9638781822632117e-05, |
|
"loss": 0.0774, |
|
"reward": -0.9346473336219787, |
|
"reward_std": 1.284997742374738, |
|
"rewards/Qwen2-0.5B-Reward": -0.9346473336219787, |
|
"step": 480 |
|
}, |
|
{ |
|
"completion_length": 714.4213033040364, |
|
"epoch": 0.1855123674911661, |
|
"grad_norm": 1.7401411533355713, |
|
"kl": 2.3013020833333333, |
|
"learning_rate": 1.9604652566362604e-05, |
|
"loss": 0.092, |
|
"reward": -0.9350511769453684, |
|
"reward_std": 1.2963847279548646, |
|
"rewards/Qwen2-0.5B-Reward": -0.9350511769453684, |
|
"step": 490 |
|
}, |
|
{ |
|
"completion_length": 718.9740814208984, |
|
"epoch": 0.18929833417465927, |
|
"grad_norm": 5.835248947143555, |
|
"kl": 2.794661458333333, |
|
"learning_rate": 1.95690190120695e-05, |
|
"loss": 0.1118, |
|
"reward": -1.0198218444983165, |
|
"reward_std": 1.4708388864994049, |
|
"rewards/Qwen2-0.5B-Reward": -1.0198218444983165, |
|
"step": 500 |
|
}, |
|
{ |
|
"completion_length": 626.8421396891276, |
|
"epoch": 0.19308430085815245, |
|
"grad_norm": 2.837407112121582, |
|
"kl": 1.6803385416666667, |
|
"learning_rate": 1.9531887389346016e-05, |
|
"loss": 0.0672, |
|
"reward": -0.6805184543132782, |
|
"reward_std": 1.061421944697698, |
|
"rewards/Qwen2-0.5B-Reward": -0.6805184543132782, |
|
"step": 510 |
|
}, |
|
{ |
|
"completion_length": 709.4504659016927, |
|
"epoch": 0.19687026754164563, |
|
"grad_norm": 1.3921163082122803, |
|
"kl": 2.5, |
|
"learning_rate": 1.9493264189683393e-05, |
|
"loss": 0.1, |
|
"reward": -1.0162009666363399, |
|
"reward_std": 1.3732348203659057, |
|
"rewards/Qwen2-0.5B-Reward": -1.0162009666363399, |
|
"step": 520 |
|
}, |
|
{ |
|
"completion_length": 734.6662150065105, |
|
"epoch": 0.2006562342251388, |
|
"grad_norm": 0.9199353456497192, |
|
"kl": 2.3385416666666665, |
|
"learning_rate": 1.9453156165336e-05, |
|
"loss": 0.0936, |
|
"reward": -0.9758850524822871, |
|
"reward_std": 1.4469304541746775, |
|
"rewards/Qwen2-0.5B-Reward": -0.9758850524822871, |
|
"step": 530 |
|
}, |
|
{ |
|
"completion_length": 685.4245402018229, |
|
"epoch": 0.204442200908632, |
|
"grad_norm": 1.9519050121307373, |
|
"kl": 1.9833333333333334, |
|
"learning_rate": 1.94115703281409e-05, |
|
"loss": 0.0793, |
|
"reward": -0.7073126316070557, |
|
"reward_std": 1.1466901183128357, |
|
"rewards/Qwen2-0.5B-Reward": -0.7073126316070557, |
|
"step": 540 |
|
}, |
|
{ |
|
"completion_length": 755.8652852376302, |
|
"epoch": 0.2082281675921252, |
|
"grad_norm": 1.136602520942688, |
|
"kl": 2.7690104166666667, |
|
"learning_rate": 1.9368513948291997e-05, |
|
"loss": 0.1108, |
|
"reward": -1.0485609819491704, |
|
"reward_std": 1.5164429823557535, |
|
"rewards/Qwen2-0.5B-Reward": -1.0485609819491704, |
|
"step": 550 |
|
}, |
|
{ |
|
"completion_length": 691.9601928710938, |
|
"epoch": 0.21201413427561838, |
|
"grad_norm": 1.1344228982925415, |
|
"kl": 1.9572916666666667, |
|
"learning_rate": 1.932399455306906e-05, |
|
"loss": 0.0783, |
|
"reward": -0.7905913976331552, |
|
"reward_std": 1.2394062995910644, |
|
"rewards/Qwen2-0.5B-Reward": -0.7905913976331552, |
|
"step": 560 |
|
}, |
|
{ |
|
"completion_length": 774.0166727701823, |
|
"epoch": 0.21580010095911156, |
|
"grad_norm": 4.68572998046875, |
|
"kl": 2.8236979166666667, |
|
"learning_rate": 1.9278019925521744e-05, |
|
"loss": 0.113, |
|
"reward": -0.9427557557821273, |
|
"reward_std": 1.4873551627000172, |
|
"rewards/Qwen2-0.5B-Reward": -0.9427557557821273, |
|
"step": 570 |
|
}, |
|
{ |
|
"completion_length": 782.1125061035157, |
|
"epoch": 0.21958606764260474, |
|
"grad_norm": 3.2951087951660156, |
|
"kl": 2.81875, |
|
"learning_rate": 1.9230598103108958e-05, |
|
"loss": 0.1127, |
|
"reward": -0.9920766482750575, |
|
"reward_std": 1.5032208581765494, |
|
"rewards/Qwen2-0.5B-Reward": -0.9920766482750575, |
|
"step": 580 |
|
}, |
|
{ |
|
"completion_length": 764.4046447753906, |
|
"epoch": 0.22337203432609792, |
|
"grad_norm": 0.7878244519233704, |
|
"kl": 2.4518229166666665, |
|
"learning_rate": 1.9181737376293693e-05, |
|
"loss": 0.0981, |
|
"reward": -0.8713747123877208, |
|
"reward_std": 1.4777807037035624, |
|
"rewards/Qwen2-0.5B-Reward": -0.8713747123877208, |
|
"step": 590 |
|
}, |
|
{ |
|
"completion_length": 787.2912129720052, |
|
"epoch": 0.2271580010095911, |
|
"grad_norm": 1.002245306968689, |
|
"kl": 2.3055989583333334, |
|
"learning_rate": 1.9131446287093683e-05, |
|
"loss": 0.0922, |
|
"reward": -0.914855935672919, |
|
"reward_std": 1.4097402195135753, |
|
"rewards/Qwen2-0.5B-Reward": -0.914855935672919, |
|
"step": 600 |
|
}, |
|
{ |
|
"completion_length": 853.8175984700521, |
|
"epoch": 0.2309439676930843, |
|
"grad_norm": 1.2771328687667847, |
|
"kl": 3.388802083333333, |
|
"learning_rate": 1.9079733627588042e-05, |
|
"loss": 0.1356, |
|
"reward": -1.4452542603015899, |
|
"reward_std": 1.7754655241966248, |
|
"rewards/Qwen2-0.5B-Reward": -1.4452542603015899, |
|
"step": 610 |
|
}, |
|
{ |
|
"completion_length": 742.8106587727865, |
|
"epoch": 0.2347299343765775, |
|
"grad_norm": 1.4351956844329834, |
|
"kl": 2.539322916666667, |
|
"learning_rate": 1.9026608438380195e-05, |
|
"loss": 0.1016, |
|
"reward": -1.0814687182505927, |
|
"reward_std": 1.450120480855306, |
|
"rewards/Qwen2-0.5B-Reward": -1.0814687182505927, |
|
"step": 620 |
|
}, |
|
{ |
|
"completion_length": 683.3101959228516, |
|
"epoch": 0.23851590106007067, |
|
"grad_norm": 1.352857232093811, |
|
"kl": 1.84296875, |
|
"learning_rate": 1.897208000701737e-05, |
|
"loss": 0.0737, |
|
"reward": -0.6815965756773948, |
|
"reward_std": 1.0737029949824015, |
|
"rewards/Qwen2-0.5B-Reward": -0.6815965756773948, |
|
"step": 630 |
|
}, |
|
{ |
|
"completion_length": 792.8847290039063, |
|
"epoch": 0.24230186774356385, |
|
"grad_norm": 0.9555492997169495, |
|
"kl": 2.507552083333333, |
|
"learning_rate": 1.8916157866366928e-05, |
|
"loss": 0.1003, |
|
"reward": -0.9711129138867061, |
|
"reward_std": 1.5443729062875111, |
|
"rewards/Qwen2-0.5B-Reward": -0.9711129138867061, |
|
"step": 640 |
|
}, |
|
{ |
|
"completion_length": 771.8388916015625, |
|
"epoch": 0.24608783442705703, |
|
"grad_norm": 1.100447177886963, |
|
"kl": 2.43984375, |
|
"learning_rate": 1.8858851792949764e-05, |
|
"loss": 0.0976, |
|
"reward": -1.0383977095286052, |
|
"reward_std": 1.4934775571028391, |
|
"rewards/Qwen2-0.5B-Reward": -1.0383977095286052, |
|
"step": 650 |
|
}, |
|
{ |
|
"completion_length": 719.6476888020833, |
|
"epoch": 0.24987380111055021, |
|
"grad_norm": 1.0007727146148682, |
|
"kl": 1.8555989583333334, |
|
"learning_rate": 1.880017180523116e-05, |
|
"loss": 0.0742, |
|
"reward": -0.844773971537749, |
|
"reward_std": 1.3539518495400746, |
|
"rewards/Qwen2-0.5B-Reward": -0.844773971537749, |
|
"step": 660 |
|
}, |
|
{ |
|
"completion_length": 751.7523234049479, |
|
"epoch": 0.2536597677940434, |
|
"grad_norm": 0.8904104232788086, |
|
"kl": 1.8032552083333333, |
|
"learning_rate": 1.8740128161869308e-05, |
|
"loss": 0.0721, |
|
"reward": -0.6786784966786702, |
|
"reward_std": 1.2231530169645946, |
|
"rewards/Qwen2-0.5B-Reward": -0.6786784966786702, |
|
"step": 670 |
|
}, |
|
{ |
|
"completion_length": 732.9828735351563, |
|
"epoch": 0.2574457344775366, |
|
"grad_norm": 0.9944930672645569, |
|
"kl": 2.1743489583333333, |
|
"learning_rate": 1.8678731359921856e-05, |
|
"loss": 0.087, |
|
"reward": -0.6016703399519125, |
|
"reward_std": 1.2204503317674, |
|
"rewards/Qwen2-0.5B-Reward": -0.6016703399519125, |
|
"step": 680 |
|
}, |
|
{ |
|
"completion_length": 769.8699157714843, |
|
"epoch": 0.2612317011610298, |
|
"grad_norm": 1.7161378860473633, |
|
"kl": 2.68125, |
|
"learning_rate": 1.8615992133010777e-05, |
|
"loss": 0.1073, |
|
"reward": -0.9773722817500432, |
|
"reward_std": 1.5413507958253225, |
|
"rewards/Qwen2-0.5B-Reward": -0.9773722817500432, |
|
"step": 690 |
|
}, |
|
{ |
|
"completion_length": 723.2902893066406, |
|
"epoch": 0.26501766784452296, |
|
"grad_norm": 1.2049915790557861, |
|
"kl": 2.252083333333333, |
|
"learning_rate": 1.855192144944586e-05, |
|
"loss": 0.0901, |
|
"reward": -0.6862340954442819, |
|
"reward_std": 1.3176872313022614, |
|
"rewards/Qwen2-0.5B-Reward": -0.6862340954442819, |
|
"step": 700 |
|
}, |
|
{ |
|
"completion_length": 730.842598470052, |
|
"epoch": 0.26880363452801614, |
|
"grad_norm": 1.1353825330734253, |
|
"kl": 2.6614583333333335, |
|
"learning_rate": 1.8486530510307222e-05, |
|
"loss": 0.1064, |
|
"reward": -0.8512504202624162, |
|
"reward_std": 1.4152730743090312, |
|
"rewards/Qwen2-0.5B-Reward": -0.8512504202624162, |
|
"step": 710 |
|
}, |
|
{ |
|
"completion_length": 784.4833374023438, |
|
"epoch": 0.2725896012115093, |
|
"grad_norm": 1.1364926099777222, |
|
"kl": 2.8877604166666666, |
|
"learning_rate": 1.8419830747487045e-05, |
|
"loss": 0.1155, |
|
"reward": -1.4028477271397908, |
|
"reward_std": 1.6338281035423279, |
|
"rewards/Qwen2-0.5B-Reward": -1.4028477271397908, |
|
"step": 720 |
|
}, |
|
{ |
|
"completion_length": 786.0379679361979, |
|
"epoch": 0.2763755678950025, |
|
"grad_norm": 1.4071515798568726, |
|
"kl": 3.0088541666666666, |
|
"learning_rate": 1.8351833821691053e-05, |
|
"loss": 0.1204, |
|
"reward": -1.2512944350639978, |
|
"reward_std": 1.6677428344885508, |
|
"rewards/Qwen2-0.5B-Reward": -1.2512944350639978, |
|
"step": 730 |
|
}, |
|
{ |
|
"completion_length": 807.7277893066406, |
|
"epoch": 0.2801615345784957, |
|
"grad_norm": 1.4424173831939697, |
|
"kl": 3.021354166666667, |
|
"learning_rate": 1.8282551620399917e-05, |
|
"loss": 0.1208, |
|
"reward": -1.225243662794431, |
|
"reward_std": 1.7895207107067108, |
|
"rewards/Qwen2-0.5B-Reward": -1.225243662794431, |
|
"step": 740 |
|
}, |
|
{ |
|
"completion_length": 728.170839436849, |
|
"epoch": 0.2839475012619889, |
|
"grad_norm": 0.6950631141662598, |
|
"kl": 2.519661458333333, |
|
"learning_rate": 1.821199625579105e-05, |
|
"loss": 0.1008, |
|
"reward": -0.8639134142082184, |
|
"reward_std": 1.4788370271523794, |
|
"rewards/Qwen2-0.5B-Reward": -0.8639134142082184, |
|
"step": 750 |
|
}, |
|
{ |
|
"completion_length": 679.6050984700521, |
|
"epoch": 0.2877334679454821, |
|
"grad_norm": 1.6717815399169922, |
|
"kl": 1.7360677083333333, |
|
"learning_rate": 1.8140180062621117e-05, |
|
"loss": 0.0695, |
|
"reward": -0.46732902062746384, |
|
"reward_std": 0.9378261427084605, |
|
"rewards/Qwen2-0.5B-Reward": -0.46732902062746384, |
|
"step": 760 |
|
}, |
|
{ |
|
"completion_length": 783.3986165364583, |
|
"epoch": 0.2915194346289753, |
|
"grad_norm": 1.3388867378234863, |
|
"kl": 2.79609375, |
|
"learning_rate": 1.8067115596069607e-05, |
|
"loss": 0.1118, |
|
"reward": -0.9435359309117, |
|
"reward_std": 1.6089221199353536, |
|
"rewards/Qwen2-0.5B-Reward": -0.9435359309117, |
|
"step": 770 |
|
}, |
|
{ |
|
"completion_length": 713.3592651367187, |
|
"epoch": 0.29530540131246846, |
|
"grad_norm": 1.2017817497253418, |
|
"kl": 2.4661458333333335, |
|
"learning_rate": 1.79928156295439e-05, |
|
"loss": 0.0986, |
|
"reward": -0.7846424505114555, |
|
"reward_std": 1.4175224483013154, |
|
"rewards/Qwen2-0.5B-Reward": -0.7846424505114555, |
|
"step": 780 |
|
}, |
|
{ |
|
"completion_length": 813.8467631022136, |
|
"epoch": 0.29909136799596164, |
|
"grad_norm": 2.2606418132781982, |
|
"kl": 3.955208333333333, |
|
"learning_rate": 1.7917293152446184e-05, |
|
"loss": 0.1583, |
|
"reward": -1.4304717580477397, |
|
"reward_std": 2.023730218410492, |
|
"rewards/Qwen2-0.5B-Reward": -1.4304717580477397, |
|
"step": 790 |
|
}, |
|
{ |
|
"completion_length": 701.6078796386719, |
|
"epoch": 0.3028773346794548, |
|
"grad_norm": 1.5273058414459229, |
|
"kl": 2.3984375, |
|
"learning_rate": 1.784056136790257e-05, |
|
"loss": 0.096, |
|
"reward": -0.7075912684202195, |
|
"reward_std": 1.3393534004688263, |
|
"rewards/Qwen2-0.5B-Reward": -0.7075912684202195, |
|
"step": 800 |
|
}, |
|
{ |
|
"completion_length": 709.4273193359375, |
|
"epoch": 0.306663301362948, |
|
"grad_norm": 1.1304354667663574, |
|
"kl": 2.4188802083333334, |
|
"learning_rate": 1.7762633690454897e-05, |
|
"loss": 0.0968, |
|
"reward": -0.6373326261838277, |
|
"reward_std": 1.289098753531774, |
|
"rewards/Qwen2-0.5B-Reward": -0.6373326261838277, |
|
"step": 810 |
|
}, |
|
{ |
|
"completion_length": 757.3930643717448, |
|
"epoch": 0.3104492680464412, |
|
"grad_norm": 1.3200254440307617, |
|
"kl": 2.48046875, |
|
"learning_rate": 1.7683523743715538e-05, |
|
"loss": 0.0993, |
|
"reward": -0.8247589614242316, |
|
"reward_std": 1.4155633012453714, |
|
"rewards/Qwen2-0.5B-Reward": -0.8247589614242316, |
|
"step": 820 |
|
}, |
|
{ |
|
"completion_length": 697.1213033040365, |
|
"epoch": 0.31423523472993437, |
|
"grad_norm": 0.8467837572097778, |
|
"kl": 2.003515625, |
|
"learning_rate": 1.760324535798567e-05, |
|
"loss": 0.0802, |
|
"reward": -0.4532388661056757, |
|
"reward_std": 1.0981567233800889, |
|
"rewards/Qwen2-0.5B-Reward": -0.4532388661056757, |
|
"step": 830 |
|
}, |
|
{ |
|
"completion_length": 780.1504659016927, |
|
"epoch": 0.31802120141342755, |
|
"grad_norm": 596676.75, |
|
"kl": 3364.5799479166667, |
|
"learning_rate": 1.752181256783741e-05, |
|
"loss": 134.4873, |
|
"reward": -0.9652832999825478, |
|
"reward_std": 1.6040133237838745, |
|
"rewards/Qwen2-0.5B-Reward": -0.9652832999825478, |
|
"step": 840 |
|
}, |
|
{ |
|
"completion_length": 695.2185282389323, |
|
"epoch": 0.3218071680969207, |
|
"grad_norm": 1.3249794244766235, |
|
"kl": 2.296744791666667, |
|
"learning_rate": 1.7439239609660238e-05, |
|
"loss": 0.0919, |
|
"reward": -0.49953351405759655, |
|
"reward_std": 1.100526017944018, |
|
"rewards/Qwen2-0.5B-Reward": -0.49953351405759655, |
|
"step": 850 |
|
}, |
|
{ |
|
"completion_length": 705.1893636067708, |
|
"epoch": 0.3255931347804139, |
|
"grad_norm": 2.2733891010284424, |
|
"kl": 2.476302083333333, |
|
"learning_rate": 1.735554091917214e-05, |
|
"loss": 0.0991, |
|
"reward": -0.7803226565321286, |
|
"reward_std": 1.427873319387436, |
|
"rewards/Qwen2-0.5B-Reward": -0.7803226565321286, |
|
"step": 860 |
|
}, |
|
{ |
|
"completion_length": 724.2041748046875, |
|
"epoch": 0.32937910146390714, |
|
"grad_norm": 1.2122727632522583, |
|
"kl": 2.6158854166666665, |
|
"learning_rate": 1.7270731128895896e-05, |
|
"loss": 0.1046, |
|
"reward": -0.9140092690785726, |
|
"reward_std": 1.5725321372350056, |
|
"rewards/Qwen2-0.5B-Reward": -0.9140092690785726, |
|
"step": 870 |
|
}, |
|
{ |
|
"completion_length": 736.9453796386719, |
|
"epoch": 0.3331650681474003, |
|
"grad_norm": 0.9501739740371704, |
|
"kl": 2.3080729166666667, |
|
"learning_rate": 1.7184825065600964e-05, |
|
"loss": 0.0923, |
|
"reward": -0.7457656829307476, |
|
"reward_std": 1.343357914686203, |
|
"rewards/Qwen2-0.5B-Reward": -0.7457656829307476, |
|
"step": 880 |
|
}, |
|
{ |
|
"completion_length": 785.8842651367188, |
|
"epoch": 0.3369510348308935, |
|
"grad_norm": 0.9159669280052185, |
|
"kl": 2.5815104166666667, |
|
"learning_rate": 1.709783774771141e-05, |
|
"loss": 0.1033, |
|
"reward": -0.7225840290387472, |
|
"reward_std": 1.4536415020624796, |
|
"rewards/Qwen2-0.5B-Reward": -0.7225840290387472, |
|
"step": 890 |
|
}, |
|
{ |
|
"completion_length": 824.7926045735677, |
|
"epoch": 0.3407370015143867, |
|
"grad_norm": 3.830165147781372, |
|
"kl": 2.72265625, |
|
"learning_rate": 1.7009784382680345e-05, |
|
"loss": 0.1089, |
|
"reward": -0.9060644646485646, |
|
"reward_std": 1.5053735852241517, |
|
"rewards/Qwen2-0.5B-Reward": -0.9060644646485646, |
|
"step": 900 |
|
}, |
|
{ |
|
"completion_length": 788.6736124674479, |
|
"epoch": 0.34452296819787986, |
|
"grad_norm": 1.977720022201538, |
|
"kl": 2.8015625, |
|
"learning_rate": 1.692068036433128e-05, |
|
"loss": 0.1121, |
|
"reward": -0.7987352999548117, |
|
"reward_std": 1.52867697874705, |
|
"rewards/Qwen2-0.5B-Reward": -0.7987352999548117, |
|
"step": 910 |
|
}, |
|
{ |
|
"completion_length": 740.3379699707032, |
|
"epoch": 0.34830893488137304, |
|
"grad_norm": 170.5952911376953, |
|
"kl": 2.7221354166666667, |
|
"learning_rate": 1.6830541270166928e-05, |
|
"loss": 0.1088, |
|
"reward": -0.9519633074601491, |
|
"reward_std": 1.5265244921048482, |
|
"rewards/Qwen2-0.5B-Reward": -0.9519633074601491, |
|
"step": 920 |
|
}, |
|
{ |
|
"completion_length": 715.4027811686198, |
|
"epoch": 0.3520949015648662, |
|
"grad_norm": 1.736777663230896, |
|
"kl": 2.2143229166666667, |
|
"learning_rate": 1.673938285864588e-05, |
|
"loss": 0.0886, |
|
"reward": -0.5707902121047179, |
|
"reward_std": 1.127177753051122, |
|
"rewards/Qwen2-0.5B-Reward": -0.5707902121047179, |
|
"step": 930 |
|
}, |
|
{ |
|
"completion_length": 812.7449137369791, |
|
"epoch": 0.3558808682483594, |
|
"grad_norm": 2.1414971351623535, |
|
"kl": 2.667708333333333, |
|
"learning_rate": 1.664722106642767e-05, |
|
"loss": 0.1066, |
|
"reward": -0.9589705864588419, |
|
"reward_std": 1.527525293827057, |
|
"rewards/Qwen2-0.5B-Reward": -0.9589705864588419, |
|
"step": 940 |
|
}, |
|
{ |
|
"completion_length": 769.1416849772136, |
|
"epoch": 0.3596668349318526, |
|
"grad_norm": 4.496264457702637, |
|
"kl": 2.38359375, |
|
"learning_rate": 1.6554072005586638e-05, |
|
"loss": 0.0953, |
|
"reward": -0.5887288892020782, |
|
"reward_std": 1.23007483681043, |
|
"rewards/Qwen2-0.5B-Reward": -0.5887288892020782, |
|
"step": 950 |
|
}, |
|
{ |
|
"completion_length": 766.1638936360677, |
|
"epoch": 0.36345280161534577, |
|
"grad_norm": 1.2970997095108032, |
|
"kl": 2.5834635416666667, |
|
"learning_rate": 1.6459951960795185e-05, |
|
"loss": 0.1033, |
|
"reward": -0.7721572608997425, |
|
"reward_std": 1.4835912009080252, |
|
"rewards/Qwen2-0.5B-Reward": -0.7721572608997425, |
|
"step": 960 |
|
}, |
|
{ |
|
"completion_length": 751.2708374023438, |
|
"epoch": 0.36723876829883895, |
|
"grad_norm": 2.702152967453003, |
|
"kl": 2.40390625, |
|
"learning_rate": 1.6364877386476804e-05, |
|
"loss": 0.0961, |
|
"reward": -0.7570990284283956, |
|
"reward_std": 1.4351972460746765, |
|
"rewards/Qwen2-0.5B-Reward": -0.7570990284283956, |
|
"step": 970 |
|
}, |
|
{ |
|
"completion_length": 730.1676005045573, |
|
"epoch": 0.3710247349823322, |
|
"grad_norm": 1.0240263938903809, |
|
"kl": 2.5716145833333335, |
|
"learning_rate": 1.6268864903929466e-05, |
|
"loss": 0.1029, |
|
"reward": -0.6520452598730723, |
|
"reward_std": 1.3328065713246664, |
|
"rewards/Qwen2-0.5B-Reward": -0.6520452598730723, |
|
"step": 980 |
|
}, |
|
{ |
|
"completion_length": 738.0222351074219, |
|
"epoch": 0.37481070166582536, |
|
"grad_norm": 0.9893134832382202, |
|
"kl": 2.990104166666667, |
|
"learning_rate": 1.617193129841982e-05, |
|
"loss": 0.1196, |
|
"reward": -0.973382901151975, |
|
"reward_std": 1.5284679671128591, |
|
"rewards/Qwen2-0.5B-Reward": -0.973382901151975, |
|
"step": 990 |
|
}, |
|
{ |
|
"completion_length": 767.7051005045573, |
|
"epoch": 0.37859666834931854, |
|
"grad_norm": 1.4028962850570679, |
|
"kl": 3.0208333333333335, |
|
"learning_rate": 1.6074093516248726e-05, |
|
"loss": 0.1208, |
|
"reward": -0.8820533196131388, |
|
"reward_std": 1.5515558183193208, |
|
"rewards/Qwen2-0.5B-Reward": -0.8820533196131388, |
|
"step": 1000 |
|
}, |
|
{ |
|
"completion_length": 729.3245463053386, |
|
"epoch": 0.3823826350328117, |
|
"grad_norm": 1.1494252681732178, |
|
"kl": 2.1536458333333335, |
|
"learning_rate": 1.5975368661788636e-05, |
|
"loss": 0.0861, |
|
"reward": -0.617452886607498, |
|
"reward_std": 1.2075418949127197, |
|
"rewards/Qwen2-0.5B-Reward": -0.617452886607498, |
|
"step": 1010 |
|
}, |
|
{ |
|
"completion_length": 711.6662150065105, |
|
"epoch": 0.3861686017163049, |
|
"grad_norm": 0.9261192083358765, |
|
"kl": 2.349739583333333, |
|
"learning_rate": 1.587577399449336e-05, |
|
"loss": 0.094, |
|
"reward": -0.6707314955691497, |
|
"reward_std": 1.2855535586675009, |
|
"rewards/Qwen2-0.5B-Reward": -0.6707314955691497, |
|
"step": 1020 |
|
}, |
|
{ |
|
"completion_length": 751.3074096679687, |
|
"epoch": 0.3899545683997981, |
|
"grad_norm": 2.042595148086548, |
|
"kl": 2.3372395833333335, |
|
"learning_rate": 1.5775326925880675e-05, |
|
"loss": 0.0935, |
|
"reward": -0.6637267053127289, |
|
"reward_std": 1.3381904661655426, |
|
"rewards/Qwen2-0.5B-Reward": -0.6637267053127289, |
|
"step": 1030 |
|
}, |
|
{ |
|
"completion_length": 776.1870402018229, |
|
"epoch": 0.39374053508329127, |
|
"grad_norm": 1.2383322715759277, |
|
"kl": 5.3609375, |
|
"learning_rate": 1.5674045016488397e-05, |
|
"loss": 0.2142, |
|
"reward": -0.6239150881767273, |
|
"reward_std": 1.3248741805553437, |
|
"rewards/Qwen2-0.5B-Reward": -0.6239150881767273, |
|
"step": 1040 |
|
}, |
|
{ |
|
"completion_length": 718.260194905599, |
|
"epoch": 0.39752650176678445, |
|
"grad_norm": 1.5840164422988892, |
|
"kl": 2.3580729166666665, |
|
"learning_rate": 1.5571945972804376e-05, |
|
"loss": 0.0943, |
|
"reward": -0.5199564640720685, |
|
"reward_std": 1.2036932865778605, |
|
"rewards/Qwen2-0.5B-Reward": -0.5199564640720685, |
|
"step": 1050 |
|
}, |
|
{ |
|
"completion_length": 809.6148213704427, |
|
"epoch": 0.4013124684502776, |
|
"grad_norm": 1.5066214799880981, |
|
"kl": 3.0403645833333335, |
|
"learning_rate": 1.546904764417098e-05, |
|
"loss": 0.1216, |
|
"reward": -0.9776304622491201, |
|
"reward_std": 1.6650471250216166, |
|
"rewards/Qwen2-0.5B-Reward": -0.9776304622491201, |
|
"step": 1060 |
|
}, |
|
{ |
|
"completion_length": 766.8726867675781, |
|
"epoch": 0.4050984351337708, |
|
"grad_norm": 1.4285918474197388, |
|
"kl": 2.3622395833333334, |
|
"learning_rate": 1.5365368019664618e-05, |
|
"loss": 0.0945, |
|
"reward": -0.650248110294342, |
|
"reward_std": 1.3134302516778311, |
|
"rewards/Qwen2-0.5B-Reward": -0.650248110294342, |
|
"step": 1070 |
|
}, |
|
{ |
|
"completion_length": 778.1921325683594, |
|
"epoch": 0.408884401817264, |
|
"grad_norm": 1.9540224075317383, |
|
"kl": 2.269270833333333, |
|
"learning_rate": 1.5260925224950785e-05, |
|
"loss": 0.0908, |
|
"reward": -0.5108215274910132, |
|
"reward_std": 1.1806359807650249, |
|
"rewards/Qwen2-0.5B-Reward": -0.5108215274910132, |
|
"step": 1080 |
|
}, |
|
{ |
|
"completion_length": 806.8787089029948, |
|
"epoch": 0.41267036850075717, |
|
"grad_norm": 0.9543392062187195, |
|
"kl": 2.60859375, |
|
"learning_rate": 1.5155737519115308e-05, |
|
"loss": 0.1043, |
|
"reward": -0.8536549975474675, |
|
"reward_std": 1.4883501867453257, |
|
"rewards/Qwen2-0.5B-Reward": -0.8536549975474675, |
|
"step": 1090 |
|
}, |
|
{ |
|
"completion_length": 790.8611124674479, |
|
"epoch": 0.4164563351842504, |
|
"grad_norm": 1.6240158081054688, |
|
"kl": 2.213671875, |
|
"learning_rate": 1.5049823291472195e-05, |
|
"loss": 0.0885, |
|
"reward": -0.5210499677807092, |
|
"reward_std": 1.2201600551605225, |
|
"rewards/Qwen2-0.5B-Reward": -0.5210499677807092, |
|
"step": 1100 |
|
}, |
|
{ |
|
"completion_length": 844.4666748046875, |
|
"epoch": 0.4202423018677436, |
|
"grad_norm": 0.7703062891960144, |
|
"kl": 3.1419270833333335, |
|
"learning_rate": 1.494320105834876e-05, |
|
"loss": 0.1257, |
|
"reward": -1.1577677488327027, |
|
"reward_std": 1.7909785747528075, |
|
"rewards/Qwen2-0.5B-Reward": -1.1577677488327027, |
|
"step": 1110 |
|
}, |
|
{ |
|
"completion_length": 873.2398213704427, |
|
"epoch": 0.42402826855123676, |
|
"grad_norm": 1.8059611320495605, |
|
"kl": 3.25859375, |
|
"learning_rate": 1.4835889459848517e-05, |
|
"loss": 0.1304, |
|
"reward": -0.9918207342425982, |
|
"reward_std": 1.6435052702824275, |
|
"rewards/Qwen2-0.5B-Reward": -0.9918207342425982, |
|
"step": 1120 |
|
}, |
|
{ |
|
"completion_length": 883.4926025390625, |
|
"epoch": 0.42781423523472994, |
|
"grad_norm": 1.4837961196899414, |
|
"kl": 2.7075520833333333, |
|
"learning_rate": 1.472790725659245e-05, |
|
"loss": 0.1083, |
|
"reward": -0.7034151526788871, |
|
"reward_std": 1.3653341392676035, |
|
"rewards/Qwen2-0.5B-Reward": -0.7034151526788871, |
|
"step": 1130 |
|
}, |
|
{ |
|
"completion_length": 779.6824117024739, |
|
"epoch": 0.4316002019182231, |
|
"grad_norm": 1.1727573871612549, |
|
"kl": 2.1869791666666667, |
|
"learning_rate": 1.4619273326439229e-05, |
|
"loss": 0.0875, |
|
"reward": -0.6506599500775337, |
|
"reward_std": 1.3229804019133249, |
|
"rewards/Qwen2-0.5B-Reward": -0.6506599500775337, |
|
"step": 1140 |
|
}, |
|
{ |
|
"completion_length": 829.1185241699219, |
|
"epoch": 0.4353861686017163, |
|
"grad_norm": 0.974542498588562, |
|
"kl": 2.659375, |
|
"learning_rate": 1.4510006661184867e-05, |
|
"loss": 0.1064, |
|
"reward": -0.7578525463740031, |
|
"reward_std": 1.531895116964976, |
|
"rewards/Qwen2-0.5B-Reward": -0.7578525463740031, |
|
"step": 1150 |
|
}, |
|
{ |
|
"completion_length": 796.5884318033854, |
|
"epoch": 0.4391721352852095, |
|
"grad_norm": 1.2544572353363037, |
|
"kl": 2.2998697916666666, |
|
"learning_rate": 1.440012636324255e-05, |
|
"loss": 0.092, |
|
"reward": -0.6453255646862089, |
|
"reward_std": 1.2682056347529094, |
|
"rewards/Qwen2-0.5B-Reward": -0.6453255646862089, |
|
"step": 1160 |
|
}, |
|
{ |
|
"completion_length": 656.2717681884766, |
|
"epoch": 0.44295810196870267, |
|
"grad_norm": 1.7041164636611938, |
|
"kl": 1.9328125, |
|
"learning_rate": 1.4289651642303055e-05, |
|
"loss": 0.0773, |
|
"reward": -0.3406788529828191, |
|
"reward_std": 1.0103827198346456, |
|
"rewards/Qwen2-0.5B-Reward": -0.3406788529828191, |
|
"step": 1170 |
|
}, |
|
{ |
|
"completion_length": 733.9171315511068, |
|
"epoch": 0.44674406865219585, |
|
"grad_norm": 0.846507728099823, |
|
"kl": 2.328125, |
|
"learning_rate": 1.4178601811976435e-05, |
|
"loss": 0.0931, |
|
"reward": -0.4902394848565261, |
|
"reward_std": 1.2676184395949046, |
|
"rewards/Qwen2-0.5B-Reward": -0.4902394848565261, |
|
"step": 1180 |
|
}, |
|
{ |
|
"completion_length": 768.9398234049479, |
|
"epoch": 0.450530035335689, |
|
"grad_norm": 0.7115055322647095, |
|
"kl": 2.668229166666667, |
|
"learning_rate": 1.4066996286415562e-05, |
|
"loss": 0.1068, |
|
"reward": -0.7519384076197942, |
|
"reward_std": 1.4289092858632406, |
|
"rewards/Qwen2-0.5B-Reward": -0.7519384076197942, |
|
"step": 1190 |
|
}, |
|
{ |
|
"completion_length": 825.0884297688802, |
|
"epoch": 0.4543160020191822, |
|
"grad_norm": 1.200706958770752, |
|
"kl": 3.144791666666667, |
|
"learning_rate": 1.3954854576922052e-05, |
|
"loss": 0.1258, |
|
"reward": -1.0908042828241984, |
|
"reward_std": 1.6707689007123312, |
|
"rewards/Qwen2-0.5B-Reward": -1.0908042828241984, |
|
"step": 1200 |
|
}, |
|
{ |
|
"completion_length": 705.2310302734375, |
|
"epoch": 0.45810196870267544, |
|
"grad_norm": 1.3045536279678345, |
|
"kl": 2.23046875, |
|
"learning_rate": 1.3842196288535226e-05, |
|
"loss": 0.0893, |
|
"reward": -0.5541289503375689, |
|
"reward_std": 1.264378293355306, |
|
"rewards/Qwen2-0.5B-Reward": -0.5541289503375689, |
|
"step": 1210 |
|
}, |
|
{ |
|
"completion_length": 662.7287109375, |
|
"epoch": 0.4618879353861686, |
|
"grad_norm": 1.1240729093551636, |
|
"kl": 1.7548177083333334, |
|
"learning_rate": 1.3729041116604697e-05, |
|
"loss": 0.0702, |
|
"reward": -0.33847450762987136, |
|
"reward_std": 1.030816239118576, |
|
"rewards/Qwen2-0.5B-Reward": -0.33847450762987136, |
|
"step": 1220 |
|
}, |
|
{ |
|
"completion_length": 723.0296376546224, |
|
"epoch": 0.4656739020696618, |
|
"grad_norm": 2.3360471725463867, |
|
"kl": 2.3111979166666665, |
|
"learning_rate": 1.3615408843347141e-05, |
|
"loss": 0.0924, |
|
"reward": -0.5807175462444624, |
|
"reward_std": 1.3384559114774068, |
|
"rewards/Qwen2-0.5B-Reward": -0.5807175462444624, |
|
"step": 1230 |
|
}, |
|
{ |
|
"completion_length": 751.2430704752604, |
|
"epoch": 0.469459868753155, |
|
"grad_norm": 2.823309898376465, |
|
"kl": 2.7513020833333335, |
|
"learning_rate": 1.3501319334387902e-05, |
|
"loss": 0.1101, |
|
"reward": -0.8531121673683325, |
|
"reward_std": 1.5220951795578004, |
|
"rewards/Qwen2-0.5B-Reward": -0.8531121673683325, |
|
"step": 1240 |
|
}, |
|
{ |
|
"completion_length": 790.4842651367187, |
|
"epoch": 0.47324583543664817, |
|
"grad_norm": 1.8123273849487305, |
|
"kl": 3.0010416666666666, |
|
"learning_rate": 1.3386792535287997e-05, |
|
"loss": 0.1201, |
|
"reward": -0.9698835199077924, |
|
"reward_std": 1.6139462788899739, |
|
"rewards/Qwen2-0.5B-Reward": -0.9698835199077924, |
|
"step": 1250 |
|
}, |
|
{ |
|
"completion_length": 734.1004720052083, |
|
"epoch": 0.47703180212014135, |
|
"grad_norm": 0.6924867033958435, |
|
"kl": 2.5669270833333333, |
|
"learning_rate": 1.3271848468057176e-05, |
|
"loss": 0.1027, |
|
"reward": -0.6089021896322568, |
|
"reward_std": 1.2572330633799236, |
|
"rewards/Qwen2-0.5B-Reward": -0.6089021896322568, |
|
"step": 1260 |
|
}, |
|
{ |
|
"completion_length": 740.8490763346355, |
|
"epoch": 0.4808177688036345, |
|
"grad_norm": 1.0355186462402344, |
|
"kl": 2.7315104166666666, |
|
"learning_rate": 1.3156507227653582e-05, |
|
"loss": 0.1093, |
|
"reward": -0.5665054028232892, |
|
"reward_std": 1.3232530683279038, |
|
"rewards/Qwen2-0.5B-Reward": -0.5665054028232892, |
|
"step": 1270 |
|
}, |
|
{ |
|
"completion_length": 721.3925944010417, |
|
"epoch": 0.4846037354871277, |
|
"grad_norm": 1.0751088857650757, |
|
"kl": 2.77890625, |
|
"learning_rate": 1.3040788978470678e-05, |
|
"loss": 0.1111, |
|
"reward": -0.617917682370171, |
|
"reward_std": 1.3952182014783223, |
|
"rewards/Qwen2-0.5B-Reward": -0.617917682370171, |
|
"step": 1280 |
|
}, |
|
{ |
|
"completion_length": 743.2884348551432, |
|
"epoch": 0.4883897021706209, |
|
"grad_norm": 1.7289220094680786, |
|
"kl": 2.8721354166666666, |
|
"learning_rate": 1.2924713950812033e-05, |
|
"loss": 0.1148, |
|
"reward": -0.6107141558701793, |
|
"reward_std": 1.3133805135885874, |
|
"rewards/Qwen2-0.5B-Reward": -0.6107141558701793, |
|
"step": 1290 |
|
}, |
|
{ |
|
"completion_length": 744.837967936198, |
|
"epoch": 0.49217566885411407, |
|
"grad_norm": 0.9980621337890625, |
|
"kl": 2.6927083333333335, |
|
"learning_rate": 1.280830243735459e-05, |
|
"loss": 0.1077, |
|
"reward": -0.6816005217532317, |
|
"reward_std": 1.3647177835305533, |
|
"rewards/Qwen2-0.5B-Reward": -0.6816005217532317, |
|
"step": 1300 |
|
}, |
|
{ |
|
"completion_length": 765.5287109375, |
|
"epoch": 0.49596163553760725, |
|
"grad_norm": 1.5100042819976807, |
|
"kl": 3.23359375, |
|
"learning_rate": 1.2691574789601006e-05, |
|
"loss": 0.1293, |
|
"reward": -0.7456285426393151, |
|
"reward_std": 1.504830890893936, |
|
"rewards/Qwen2-0.5B-Reward": -0.7456285426393151, |
|
"step": 1310 |
|
}, |
|
{ |
|
"completion_length": 776.5162089029948, |
|
"epoch": 0.49974760222110043, |
|
"grad_norm": 3.0420119762420654, |
|
"kl": 2.664322916666667, |
|
"learning_rate": 1.2574551414321749e-05, |
|
"loss": 0.1066, |
|
"reward": -0.6133380237966776, |
|
"reward_std": 1.4030099928379058, |
|
"rewards/Qwen2-0.5B-Reward": -0.6133380237966776, |
|
"step": 1320 |
|
}, |
|
{ |
|
"completion_length": 756.2375101725261, |
|
"epoch": 0.5035335689045937, |
|
"grad_norm": 1.2776826620101929, |
|
"kl": 2.5111979166666667, |
|
"learning_rate": 1.2457252769987485e-05, |
|
"loss": 0.1005, |
|
"reward": -0.4735676831565797, |
|
"reward_std": 1.2207833151022593, |
|
"rewards/Qwen2-0.5B-Reward": -0.4735676831565797, |
|
"step": 1330 |
|
}, |
|
{ |
|
"completion_length": 780.6055643717448, |
|
"epoch": 0.5073195355880868, |
|
"grad_norm": 1.277037262916565, |
|
"kl": 2.29453125, |
|
"learning_rate": 1.2339699363192461e-05, |
|
"loss": 0.0918, |
|
"reward": -0.41186855093886454, |
|
"reward_std": 1.1698833445707957, |
|
"rewards/Qwen2-0.5B-Reward": -0.41186855093886454, |
|
"step": 1340 |
|
}, |
|
{ |
|
"completion_length": 814.8995402018229, |
|
"epoch": 0.51110550227158, |
|
"grad_norm": 1.1098392009735107, |
|
"kl": 2.9515625, |
|
"learning_rate": 1.2221911745069473e-05, |
|
"loss": 0.118, |
|
"reward": -0.7255906278888384, |
|
"reward_std": 1.5052427490552267, |
|
"rewards/Qwen2-0.5B-Reward": -0.7255906278888384, |
|
"step": 1350 |
|
}, |
|
{ |
|
"completion_length": 800.6180623372396, |
|
"epoch": 0.5148914689550732, |
|
"grad_norm": 1.5379681587219238, |
|
"kl": 3.078385416666667, |
|
"learning_rate": 1.210391050769702e-05, |
|
"loss": 0.1231, |
|
"reward": -0.9011206914981206, |
|
"reward_std": 1.5988249023755392, |
|
"rewards/Qwen2-0.5B-Reward": -0.9011206914981206, |
|
"step": 1360 |
|
}, |
|
{ |
|
"completion_length": 787.152783203125, |
|
"epoch": 0.5186774356385664, |
|
"grad_norm": 1.421747088432312, |
|
"kl": 2.74453125, |
|
"learning_rate": 1.1985716280499338e-05, |
|
"loss": 0.1098, |
|
"reward": -0.7614536421994368, |
|
"reward_std": 1.4081373771031698, |
|
"rewards/Qwen2-0.5B-Reward": -0.7614536421994368, |
|
"step": 1370 |
|
}, |
|
{ |
|
"completion_length": 842.5861185709635, |
|
"epoch": 0.5224634023220596, |
|
"grad_norm": 2.403327226638794, |
|
"kl": 3.16171875, |
|
"learning_rate": 1.1867349726639868e-05, |
|
"loss": 0.1266, |
|
"reward": -0.8059929932157198, |
|
"reward_std": 1.487107406059901, |
|
"rewards/Qwen2-0.5B-Reward": -0.8059929932157198, |
|
"step": 1380 |
|
}, |
|
{ |
|
"completion_length": 793.2569539388021, |
|
"epoch": 0.5262493690055527, |
|
"grad_norm": 1.0243574380874634, |
|
"kl": 3.30625, |
|
"learning_rate": 1.1748831539408863e-05, |
|
"loss": 0.1323, |
|
"reward": -0.9990609556436538, |
|
"reward_std": 1.641613002618154, |
|
"rewards/Qwen2-0.5B-Reward": -0.9990609556436538, |
|
"step": 1390 |
|
}, |
|
{ |
|
"completion_length": 781.2513997395833, |
|
"epoch": 0.5300353356890459, |
|
"grad_norm": 1.4023561477661133, |
|
"kl": 2.6401041666666667, |
|
"learning_rate": 1.1630182438605688e-05, |
|
"loss": 0.1056, |
|
"reward": -0.73541273077329, |
|
"reward_std": 1.391848737001419, |
|
"rewards/Qwen2-0.5B-Reward": -0.73541273077329, |
|
"step": 1400 |
|
}, |
|
{ |
|
"completion_length": 777.1314880371094, |
|
"epoch": 0.5338213023725391, |
|
"grad_norm": 1.4984385967254639, |
|
"kl": 3.1786458333333334, |
|
"learning_rate": 1.151142316691652e-05, |
|
"loss": 0.1273, |
|
"reward": -0.9620630964636803, |
|
"reward_std": 1.6180862605571746, |
|
"rewards/Qwen2-0.5B-Reward": -0.9620630964636803, |
|
"step": 1410 |
|
}, |
|
{ |
|
"completion_length": 753.6296305338542, |
|
"epoch": 0.5376072690560323, |
|
"grad_norm": 0.6080305576324463, |
|
"kl": 2.93046875, |
|
"learning_rate": 1.1392574486288026e-05, |
|
"loss": 0.1172, |
|
"reward": -0.6871781093068421, |
|
"reward_std": 1.4368105371793112, |
|
"rewards/Qwen2-0.5B-Reward": -0.6871781093068421, |
|
"step": 1420 |
|
}, |
|
{ |
|
"completion_length": 755.0680684407552, |
|
"epoch": 0.5413932357395255, |
|
"grad_norm": 0.9181307554244995, |
|
"kl": 2.5361979166666666, |
|
"learning_rate": 1.1273657174297687e-05, |
|
"loss": 0.1016, |
|
"reward": -0.41866928230350214, |
|
"reward_std": 1.193355711301168, |
|
"rewards/Qwen2-0.5B-Reward": -0.41866928230350214, |
|
"step": 1430 |
|
}, |
|
{ |
|
"completion_length": 739.3643595377604, |
|
"epoch": 0.5451792024230186, |
|
"grad_norm": 1.3852412700653076, |
|
"kl": 2.4328125, |
|
"learning_rate": 1.1154692020521379e-05, |
|
"loss": 0.0973, |
|
"reward": -0.42044620849192144, |
|
"reward_std": 1.1699665983517964, |
|
"rewards/Qwen2-0.5B-Reward": -0.42044620849192144, |
|
"step": 1440 |
|
}, |
|
{ |
|
"completion_length": 787.2263997395834, |
|
"epoch": 0.5489651691065118, |
|
"grad_norm": 1.2610223293304443, |
|
"kl": 2.7135416666666665, |
|
"learning_rate": 1.1035699822898852e-05, |
|
"loss": 0.1085, |
|
"reward": -0.5719452144578099, |
|
"reward_std": 1.3674102127552032, |
|
"rewards/Qwen2-0.5B-Reward": -0.5719452144578099, |
|
"step": 1450 |
|
}, |
|
{ |
|
"completion_length": 794.1407450358073, |
|
"epoch": 0.552751135790005, |
|
"grad_norm": 3.987548351287842, |
|
"kl": 3.580208333333333, |
|
"learning_rate": 1.091670138409778e-05, |
|
"loss": 0.1432, |
|
"reward": -0.913334188858668, |
|
"reward_std": 1.654043678442637, |
|
"rewards/Qwen2-0.5B-Reward": -0.913334188858668, |
|
"step": 1460 |
|
}, |
|
{ |
|
"completion_length": 755.6018575032552, |
|
"epoch": 0.5565371024734982, |
|
"grad_norm": 1.312009334564209, |
|
"kl": 2.115364583333333, |
|
"learning_rate": 1.0797717507876926e-05, |
|
"loss": 0.0846, |
|
"reward": -0.605161217538019, |
|
"reward_std": 1.230643669764201, |
|
"rewards/Qwen2-0.5B-Reward": -0.605161217538019, |
|
"step": 1470 |
|
}, |
|
{ |
|
"completion_length": 745.5069519042969, |
|
"epoch": 0.5603230691569914, |
|
"grad_norm": 1.5958776473999023, |
|
"kl": 2.8216145833333335, |
|
"learning_rate": 1.0678768995449179e-05, |
|
"loss": 0.1129, |
|
"reward": -0.5114948400606711, |
|
"reward_std": 1.213375515739123, |
|
"rewards/Qwen2-0.5B-Reward": -0.5114948400606711, |
|
"step": 1480 |
|
}, |
|
{ |
|
"completion_length": 790.2676025390625, |
|
"epoch": 0.5641090358404846, |
|
"grad_norm": 1.0892456769943237, |
|
"kl": 2.9091145833333334, |
|
"learning_rate": 1.055987664184499e-05, |
|
"loss": 0.1164, |
|
"reward": -0.6985714793205261, |
|
"reward_std": 1.441979839404424, |
|
"rewards/Qwen2-0.5B-Reward": -0.6985714793205261, |
|
"step": 1490 |
|
}, |
|
{ |
|
"completion_length": 768.1861206054688, |
|
"epoch": 0.5678950025239778, |
|
"grad_norm": 1.5841772556304932, |
|
"kl": 2.4014322916666666, |
|
"learning_rate": 1.0441061232276914e-05, |
|
"loss": 0.096, |
|
"reward": -0.5361925270253172, |
|
"reward_std": 1.2279207597176234, |
|
"rewards/Qwen2-0.5B-Reward": -0.5361925270253172, |
|
"step": 1500 |
|
}, |
|
{ |
|
"completion_length": 790.4763977050782, |
|
"epoch": 0.571680969207471, |
|
"grad_norm": 1.4919512271881104, |
|
"kl": 2.945052083333333, |
|
"learning_rate": 1.0322343538505859e-05, |
|
"loss": 0.1178, |
|
"reward": -0.6917820642391841, |
|
"reward_std": 1.41629096865654, |
|
"rewards/Qwen2-0.5B-Reward": -0.6917820642391841, |
|
"step": 1510 |
|
}, |
|
{ |
|
"completion_length": 732.1388956705729, |
|
"epoch": 0.5754669358909642, |
|
"grad_norm": 1.3066332340240479, |
|
"kl": 2.5669270833333333, |
|
"learning_rate": 1.0203744315209683e-05, |
|
"loss": 0.1026, |
|
"reward": -0.4832228126314779, |
|
"reward_std": 1.21365185379982, |
|
"rewards/Qwen2-0.5B-Reward": -0.4832228126314779, |
|
"step": 1520 |
|
}, |
|
{ |
|
"completion_length": 777.1152811686198, |
|
"epoch": 0.5792529025744574, |
|
"grad_norm": 2.0675883293151855, |
|
"kl": 3.0052083333333335, |
|
"learning_rate": 1.0085284296354784e-05, |
|
"loss": 0.1202, |
|
"reward": -0.7202197993795078, |
|
"reward_std": 1.480885813633601, |
|
"rewards/Qwen2-0.5B-Reward": -0.7202197993795078, |
|
"step": 1530 |
|
}, |
|
{ |
|
"completion_length": 810.0231577555338, |
|
"epoch": 0.5830388692579506, |
|
"grad_norm": 1.1669964790344238, |
|
"kl": 3.351822916666667, |
|
"learning_rate": 9.966984191571318e-06, |
|
"loss": 0.1341, |
|
"reward": -0.9308211114102354, |
|
"reward_std": 1.5289963026841482, |
|
"rewards/Qwen2-0.5B-Reward": -0.9308211114102354, |
|
"step": 1540 |
|
}, |
|
{ |
|
"completion_length": 803.5157470703125, |
|
"epoch": 0.5868248359414437, |
|
"grad_norm": 1.2970937490463257, |
|
"kl": 2.9263020833333333, |
|
"learning_rate": 9.848864682532654e-06, |
|
"loss": 0.1171, |
|
"reward": -0.897743321955204, |
|
"reward_std": 1.4250325242678323, |
|
"rewards/Qwen2-0.5B-Reward": -0.897743321955204, |
|
"step": 1550 |
|
}, |
|
{ |
|
"completion_length": 774.900467936198, |
|
"epoch": 0.5906108026249369, |
|
"grad_norm": 1.5224976539611816, |
|
"kl": 3.23203125, |
|
"learning_rate": 9.730946419339721e-06, |
|
"loss": 0.1293, |
|
"reward": -0.8313487897316615, |
|
"reward_std": 1.4089517414569854, |
|
"rewards/Qwen2-0.5B-Reward": -0.8313487897316615, |
|
"step": 1560 |
|
}, |
|
{ |
|
"completion_length": 814.4111185709636, |
|
"epoch": 0.5943967693084301, |
|
"grad_norm": 1.5672080516815186, |
|
"kl": 2.9359375, |
|
"learning_rate": 9.613250016910894e-06, |
|
"loss": 0.1174, |
|
"reward": -0.7221511860688528, |
|
"reward_std": 1.3432387212912242, |
|
"rewards/Qwen2-0.5B-Reward": -0.7221511860688528, |
|
"step": 1570 |
|
}, |
|
{ |
|
"completion_length": 776.6129638671875, |
|
"epoch": 0.5981827359919233, |
|
"grad_norm": 1.8100062608718872, |
|
"kl": 2.7890625, |
|
"learning_rate": 9.495796051377997e-06, |
|
"loss": 0.1115, |
|
"reward": -0.8584653136630853, |
|
"reward_std": 1.3234432935714722, |
|
"rewards/Qwen2-0.5B-Reward": -0.8584653136630853, |
|
"step": 1580 |
|
}, |
|
{ |
|
"completion_length": 825.8180603027344, |
|
"epoch": 0.6019687026754165, |
|
"grad_norm": 1.6404787302017212, |
|
"kl": 3.863541666666667, |
|
"learning_rate": 9.378605056489128e-06, |
|
"loss": 0.1545, |
|
"reward": -1.263607233762741, |
|
"reward_std": 1.8019790093104044, |
|
"rewards/Qwen2-0.5B-Reward": -1.263607233762741, |
|
"step": 1590 |
|
}, |
|
{ |
|
"completion_length": 728.1912089029948, |
|
"epoch": 0.6057546693589096, |
|
"grad_norm": 0.8878143429756165, |
|
"kl": 2.6088541666666667, |
|
"learning_rate": 9.261697520018849e-06, |
|
"loss": 0.1044, |
|
"reward": -0.42785762051741283, |
|
"reward_std": 1.0820347189903259, |
|
"rewards/Qwen2-0.5B-Reward": -0.42785762051741283, |
|
"step": 1600 |
|
}, |
|
{ |
|
"completion_length": 747.9509358723958, |
|
"epoch": 0.6095406360424028, |
|
"grad_norm": 1.613976240158081, |
|
"kl": 2.468489583333333, |
|
"learning_rate": 9.145093880186451e-06, |
|
"loss": 0.0988, |
|
"reward": -0.41555683029194673, |
|
"reward_std": 1.179705987373988, |
|
"rewards/Qwen2-0.5B-Reward": -0.41555683029194673, |
|
"step": 1610 |
|
}, |
|
{ |
|
"completion_length": 787.43056640625, |
|
"epoch": 0.613326602725896, |
|
"grad_norm": 0.5864226818084717, |
|
"kl": 2.894270833333333, |
|
"learning_rate": 9.028814522082857e-06, |
|
"loss": 0.1157, |
|
"reward": -0.6661467840274174, |
|
"reward_std": 1.412223219871521, |
|
"rewards/Qwen2-0.5B-Reward": -0.6661467840274174, |
|
"step": 1620 |
|
}, |
|
{ |
|
"completion_length": 742.3319498697916, |
|
"epoch": 0.6171125694093892, |
|
"grad_norm": 1.7149267196655273, |
|
"kl": 2.7528645833333334, |
|
"learning_rate": 8.912879774106832e-06, |
|
"loss": 0.1101, |
|
"reward": -0.560060964524746, |
|
"reward_std": 1.2752733170986175, |
|
"rewards/Qwen2-0.5B-Reward": -0.560060964524746, |
|
"step": 1630 |
|
}, |
|
{ |
|
"completion_length": 750.7245422363281, |
|
"epoch": 0.6208985360928824, |
|
"grad_norm": 2.106180191040039, |
|
"kl": 2.40546875, |
|
"learning_rate": 8.797309904411087e-06, |
|
"loss": 0.0962, |
|
"reward": -0.416633996165668, |
|
"reward_std": 1.1659721612930298, |
|
"rewards/Qwen2-0.5B-Reward": -0.416633996165668, |
|
"step": 1640 |
|
}, |
|
{ |
|
"completion_length": 778.6088033040364, |
|
"epoch": 0.6246845027763755, |
|
"grad_norm": 1.4638694524765015, |
|
"kl": 2.676041666666667, |
|
"learning_rate": 8.682125117358927e-06, |
|
"loss": 0.1071, |
|
"reward": -0.6446437170108159, |
|
"reward_std": 1.3279209415117899, |
|
"rewards/Qwen2-0.5B-Reward": -0.6446437170108159, |
|
"step": 1650 |
|
}, |
|
{ |
|
"completion_length": 808.040283203125, |
|
"epoch": 0.6284704694598687, |
|
"grad_norm": 1.1022939682006836, |
|
"kl": 3.4580729166666666, |
|
"learning_rate": 8.567345549992045e-06, |
|
"loss": 0.1383, |
|
"reward": -0.7954719786842664, |
|
"reward_std": 1.4967798054218293, |
|
"rewards/Qwen2-0.5B-Reward": -0.7954719786842664, |
|
"step": 1660 |
|
}, |
|
{ |
|
"completion_length": 757.4060282389323, |
|
"epoch": 0.6322564361433619, |
|
"grad_norm": 2.4723708629608154, |
|
"kl": 2.792708333333333, |
|
"learning_rate": 8.4529912685101e-06, |
|
"loss": 0.1117, |
|
"reward": -0.5523949672778448, |
|
"reward_std": 1.3249893307685852, |
|
"rewards/Qwen2-0.5B-Reward": -0.5523949672778448, |
|
"step": 1670 |
|
}, |
|
{ |
|
"completion_length": 762.1398213704427, |
|
"epoch": 0.6360424028268551, |
|
"grad_norm": 0.8709607720375061, |
|
"kl": 2.8286458333333333, |
|
"learning_rate": 8.33908226476265e-06, |
|
"loss": 0.1132, |
|
"reward": -0.5545504409819841, |
|
"reward_std": 1.3114221652348836, |
|
"rewards/Qwen2-0.5B-Reward": -0.5545504409819841, |
|
"step": 1680 |
|
}, |
|
{ |
|
"completion_length": 823.0356526692708, |
|
"epoch": 0.6398283695103483, |
|
"grad_norm": 0.969098687171936, |
|
"kl": 2.855729166666667, |
|
"learning_rate": 8.22563845275411e-06, |
|
"loss": 0.1142, |
|
"reward": -0.7070573056737582, |
|
"reward_std": 1.3873663266499838, |
|
"rewards/Qwen2-0.5B-Reward": -0.7070573056737582, |
|
"step": 1690 |
|
}, |
|
{ |
|
"completion_length": 810.1981506347656, |
|
"epoch": 0.6436143361938415, |
|
"grad_norm": 1.2305635213851929, |
|
"kl": 3.793229166666667, |
|
"learning_rate": 8.11267966516231e-06, |
|
"loss": 0.1518, |
|
"reward": -1.061463608344396, |
|
"reward_std": 1.7348846475283304, |
|
"rewards/Qwen2-0.5B-Reward": -1.061463608344396, |
|
"step": 1700 |
|
}, |
|
{ |
|
"completion_length": 776.243983968099, |
|
"epoch": 0.6474003028773346, |
|
"grad_norm": 1.6688897609710693, |
|
"kl": 2.94375, |
|
"learning_rate": 8.000225649871272e-06, |
|
"loss": 0.1177, |
|
"reward": -0.7328139250477155, |
|
"reward_std": 1.4019733607769012, |
|
"rewards/Qwen2-0.5B-Reward": -0.7328139250477155, |
|
"step": 1710 |
|
}, |
|
{ |
|
"completion_length": 782.6092692057292, |
|
"epoch": 0.6511862695608278, |
|
"grad_norm": 2.184279680252075, |
|
"kl": 3.275260416666667, |
|
"learning_rate": 7.888296066518806e-06, |
|
"loss": 0.131, |
|
"reward": -0.826190093656381, |
|
"reward_std": 1.539618053038915, |
|
"rewards/Qwen2-0.5B-Reward": -0.826190093656381, |
|
"step": 1720 |
|
}, |
|
{ |
|
"completion_length": 707.3263997395833, |
|
"epoch": 0.6549722362443211, |
|
"grad_norm": 2.3973989486694336, |
|
"kl": 2.400260416666667, |
|
"learning_rate": 7.776910483059543e-06, |
|
"loss": 0.096, |
|
"reward": -0.5184978457788626, |
|
"reward_std": 1.1560731967290243, |
|
"rewards/Qwen2-0.5B-Reward": -0.5184978457788626, |
|
"step": 1730 |
|
}, |
|
{ |
|
"completion_length": 734.9152872721354, |
|
"epoch": 0.6587582029278143, |
|
"grad_norm": 1.8029112815856934, |
|
"kl": 2.9859375, |
|
"learning_rate": 7.666088372343984e-06, |
|
"loss": 0.1194, |
|
"reward": -0.5925529218278826, |
|
"reward_std": 1.267720968524615, |
|
"rewards/Qwen2-0.5B-Reward": -0.5925529218278826, |
|
"step": 1740 |
|
}, |
|
{ |
|
"completion_length": 807.6726928710938, |
|
"epoch": 0.6625441696113075, |
|
"grad_norm": 1.5247033834457397, |
|
"kl": 3.3872395833333333, |
|
"learning_rate": 7.555849108714192e-06, |
|
"loss": 0.1355, |
|
"reward": -0.7715960969527562, |
|
"reward_std": 1.4897764484087626, |
|
"rewards/Qwen2-0.5B-Reward": -0.7715960969527562, |
|
"step": 1750 |
|
}, |
|
{ |
|
"completion_length": 776.8838073730469, |
|
"epoch": 0.6663301362948006, |
|
"grad_norm": 1.9940361976623535, |
|
"kl": 2.837760416666667, |
|
"learning_rate": 7.4462119646166855e-06, |
|
"loss": 0.1136, |
|
"reward": -0.7241511250535647, |
|
"reward_std": 1.4011840164661407, |
|
"rewards/Qwen2-0.5B-Reward": -0.7241511250535647, |
|
"step": 1760 |
|
}, |
|
{ |
|
"completion_length": 767.8162129720052, |
|
"epoch": 0.6701161029782938, |
|
"grad_norm": 1.5367672443389893, |
|
"kl": 3.5140625, |
|
"learning_rate": 7.337196107233155e-06, |
|
"loss": 0.1407, |
|
"reward": -0.7663616319497426, |
|
"reward_std": 1.5210982898871104, |
|
"rewards/Qwen2-0.5B-Reward": -0.7663616319497426, |
|
"step": 1770 |
|
}, |
|
{ |
|
"completion_length": 721.7675944010417, |
|
"epoch": 0.673902069661787, |
|
"grad_norm": 1.302241563796997, |
|
"kl": 2.931510416666667, |
|
"learning_rate": 7.228820595129604e-06, |
|
"loss": 0.1172, |
|
"reward": -0.725257391979297, |
|
"reward_std": 1.334197594722112, |
|
"rewards/Qwen2-0.5B-Reward": -0.725257391979297, |
|
"step": 1780 |
|
}, |
|
{ |
|
"completion_length": 720.3171468098958, |
|
"epoch": 0.6776880363452802, |
|
"grad_norm": 0.8652080297470093, |
|
"kl": 3.028125, |
|
"learning_rate": 7.12110437492443e-06, |
|
"loss": 0.1211, |
|
"reward": -0.753487682590882, |
|
"reward_std": 1.4118338882923127, |
|
"rewards/Qwen2-0.5B-Reward": -0.753487682590882, |
|
"step": 1790 |
|
}, |
|
{ |
|
"completion_length": 744.602783203125, |
|
"epoch": 0.6814740030287734, |
|
"grad_norm": 0.6850081086158752, |
|
"kl": 3.18046875, |
|
"learning_rate": 7.014066277976128e-06, |
|
"loss": 0.1272, |
|
"reward": -0.6332276176661253, |
|
"reward_std": 1.3656011939048767, |
|
"rewards/Qwen2-0.5B-Reward": -0.6332276176661253, |
|
"step": 1800 |
|
}, |
|
{ |
|
"completion_length": 759.4481526692708, |
|
"epoch": 0.6852599697122665, |
|
"grad_norm": 2.0515530109405518, |
|
"kl": 3.3453125, |
|
"learning_rate": 6.9077250170911005e-06, |
|
"loss": 0.1338, |
|
"reward": -0.8095526337623596, |
|
"reward_std": 1.5075600425402322, |
|
"rewards/Qwen2-0.5B-Reward": -0.8095526337623596, |
|
"step": 1810 |
|
}, |
|
{ |
|
"completion_length": 723.5777811686198, |
|
"epoch": 0.6890459363957597, |
|
"grad_norm": 0.7833884358406067, |
|
"kl": 2.9953125, |
|
"learning_rate": 6.802099183252235e-06, |
|
"loss": 0.1198, |
|
"reward": -0.7537414369483789, |
|
"reward_std": 1.383406792084376, |
|
"rewards/Qwen2-0.5B-Reward": -0.7537414369483789, |
|
"step": 1820 |
|
}, |
|
{ |
|
"completion_length": 724.8837972005208, |
|
"epoch": 0.6928319030792529, |
|
"grad_norm": 0.9831650853157043, |
|
"kl": 2.5338541666666665, |
|
"learning_rate": 6.697207242368742e-06, |
|
"loss": 0.1013, |
|
"reward": -0.43006037194281815, |
|
"reward_std": 1.1635287086168924, |
|
"rewards/Qwen2-0.5B-Reward": -0.43006037194281815, |
|
"step": 1830 |
|
}, |
|
{ |
|
"completion_length": 760.3333374023438, |
|
"epoch": 0.6966178697627461, |
|
"grad_norm": 1.1536668539047241, |
|
"kl": 2.6203125, |
|
"learning_rate": 6.593067532047882e-06, |
|
"loss": 0.1049, |
|
"reward": -0.4441113060961167, |
|
"reward_std": 1.1987637420495352, |
|
"rewards/Qwen2-0.5B-Reward": -0.4441113060961167, |
|
"step": 1840 |
|
}, |
|
{ |
|
"completion_length": 749.903251139323, |
|
"epoch": 0.7004038364462393, |
|
"grad_norm": 0.8368715643882751, |
|
"kl": 2.5341145833333334, |
|
"learning_rate": 6.489698258389107e-06, |
|
"loss": 0.1013, |
|
"reward": -0.5944258317351341, |
|
"reward_std": 1.3474121958017349, |
|
"rewards/Qwen2-0.5B-Reward": -0.5944258317351341, |
|
"step": 1850 |
|
}, |
|
{ |
|
"completion_length": 745.5365783691407, |
|
"epoch": 0.7041898031297325, |
|
"grad_norm": 1.029958724975586, |
|
"kl": 2.90078125, |
|
"learning_rate": 6.387117492801213e-06, |
|
"loss": 0.1161, |
|
"reward": -0.6068828483422597, |
|
"reward_std": 1.321648943424225, |
|
"rewards/Qwen2-0.5B-Reward": -0.6068828483422597, |
|
"step": 1860 |
|
}, |
|
{ |
|
"completion_length": 755.6328796386719, |
|
"epoch": 0.7079757698132256, |
|
"grad_norm": 5.108635425567627, |
|
"kl": 2.9171875, |
|
"learning_rate": 6.285343168843028e-06, |
|
"loss": 0.1167, |
|
"reward": -0.6523237491647402, |
|
"reward_std": 1.3444733719031017, |
|
"rewards/Qwen2-0.5B-Reward": -0.6523237491647402, |
|
"step": 1870 |
|
}, |
|
{ |
|
"completion_length": 787.0935241699219, |
|
"epoch": 0.7117617364967188, |
|
"grad_norm": 1.3548846244812012, |
|
"kl": 3.0869791666666666, |
|
"learning_rate": 6.1843930790881766e-06, |
|
"loss": 0.1235, |
|
"reward": -0.6537054566045603, |
|
"reward_std": 1.4838234384854634, |
|
"rewards/Qwen2-0.5B-Reward": -0.6537054566045603, |
|
"step": 1880 |
|
}, |
|
{ |
|
"completion_length": 773.563895670573, |
|
"epoch": 0.715547703180212, |
|
"grad_norm": 0.8410789966583252, |
|
"kl": 2.837760416666667, |
|
"learning_rate": 6.084284872014545e-06, |
|
"loss": 0.1136, |
|
"reward": -0.5507580937196811, |
|
"reward_std": 1.2756544808546701, |
|
"rewards/Qwen2-0.5B-Reward": -0.5507580937196811, |
|
"step": 1890 |
|
}, |
|
{ |
|
"completion_length": 760.8699096679687, |
|
"epoch": 0.7193336698637052, |
|
"grad_norm": 1.5116900205612183, |
|
"kl": 2.6723958333333333, |
|
"learning_rate": 5.985036048918894e-06, |
|
"loss": 0.1069, |
|
"reward": -0.46427804150929053, |
|
"reward_std": 1.1952710588773092, |
|
"rewards/Qwen2-0.5B-Reward": -0.46427804150929053, |
|
"step": 1900 |
|
}, |
|
{ |
|
"completion_length": 763.8004699707031, |
|
"epoch": 0.7231196365471984, |
|
"grad_norm": 1.1645935773849487, |
|
"kl": 3.13828125, |
|
"learning_rate": 5.886663960857202e-06, |
|
"loss": 0.1255, |
|
"reward": -0.7973003094395001, |
|
"reward_std": 1.4403738955656686, |
|
"rewards/Qwen2-0.5B-Reward": -0.7973003094395001, |
|
"step": 1910 |
|
}, |
|
{ |
|
"completion_length": 746.5444559733073, |
|
"epoch": 0.7269056032306915, |
|
"grad_norm": 1.8314180374145508, |
|
"kl": 3.378125, |
|
"learning_rate": 5.789185805611313e-06, |
|
"loss": 0.1351, |
|
"reward": -0.6777333706617356, |
|
"reward_std": 1.452496987581253, |
|
"rewards/Qwen2-0.5B-Reward": -0.6777333706617356, |
|
"step": 1920 |
|
}, |
|
{ |
|
"completion_length": 743.2513977050781, |
|
"epoch": 0.7306915699141847, |
|
"grad_norm": 1.8599276542663574, |
|
"kl": 2.6572916666666666, |
|
"learning_rate": 5.692618624682342e-06, |
|
"loss": 0.1063, |
|
"reward": -0.5468713939189911, |
|
"reward_std": 1.203757886091868, |
|
"rewards/Qwen2-0.5B-Reward": -0.5468713939189911, |
|
"step": 1930 |
|
}, |
|
{ |
|
"completion_length": 715.9157470703125, |
|
"epoch": 0.7344775365976779, |
|
"grad_norm": 3.749554395675659, |
|
"kl": 3.373177083333333, |
|
"learning_rate": 5.596979300311408e-06, |
|
"loss": 0.1351, |
|
"reward": -0.42453126634160676, |
|
"reward_std": 1.129069878657659, |
|
"rewards/Qwen2-0.5B-Reward": -0.42453126634160676, |
|
"step": 1940 |
|
}, |
|
{ |
|
"completion_length": 707.4583414713542, |
|
"epoch": 0.7382635032811711, |
|
"grad_norm": 1.2406065464019775, |
|
"kl": 2.40546875, |
|
"learning_rate": 5.502284552528236e-06, |
|
"loss": 0.0962, |
|
"reward": -0.3166978692635894, |
|
"reward_std": 1.0220210254192352, |
|
"rewards/Qwen2-0.5B-Reward": -0.3166978692635894, |
|
"step": 1950 |
|
}, |
|
{ |
|
"completion_length": 730.9064880371094, |
|
"epoch": 0.7420494699646644, |
|
"grad_norm": 0.894660472869873, |
|
"kl": 3.0755208333333335, |
|
"learning_rate": 5.408550936228072e-06, |
|
"loss": 0.1231, |
|
"reward": -0.6020015890399615, |
|
"reward_std": 1.3233680129051208, |
|
"rewards/Qwen2-0.5B-Reward": -0.6020015890399615, |
|
"step": 1960 |
|
}, |
|
{ |
|
"completion_length": 784.6120402018229, |
|
"epoch": 0.7458354366481575, |
|
"grad_norm": 0.9947274923324585, |
|
"kl": 3.3036458333333334, |
|
"learning_rate": 5.315794838277524e-06, |
|
"loss": 0.1321, |
|
"reward": -0.8605576127767562, |
|
"reward_std": 1.5929324706395467, |
|
"rewards/Qwen2-0.5B-Reward": -0.8605576127767562, |
|
"step": 1970 |
|
}, |
|
{ |
|
"completion_length": 761.7782409667968, |
|
"epoch": 0.7496214033316507, |
|
"grad_norm": 0.8357589244842529, |
|
"kl": 3.126822916666667, |
|
"learning_rate": 5.2240324746497185e-06, |
|
"loss": 0.1251, |
|
"reward": -0.6573333943883578, |
|
"reward_std": 1.3803256154060364, |
|
"rewards/Qwen2-0.5B-Reward": -0.6573333943883578, |
|
"step": 1980 |
|
}, |
|
{ |
|
"completion_length": 751.271309407552, |
|
"epoch": 0.7534073700151439, |
|
"grad_norm": 0.9635012149810791, |
|
"kl": 2.846875, |
|
"learning_rate": 5.133279887589381e-06, |
|
"loss": 0.114, |
|
"reward": -0.5246660086015861, |
|
"reward_std": 1.2728915989398957, |
|
"rewards/Qwen2-0.5B-Reward": -0.5246660086015861, |
|
"step": 1990 |
|
}, |
|
{ |
|
"completion_length": 721.8902760823568, |
|
"epoch": 0.7571933366986371, |
|
"grad_norm": 1.915734887123108, |
|
"kl": 2.886588541666667, |
|
"learning_rate": 5.043552942808269e-06, |
|
"loss": 0.1155, |
|
"reward": -0.4225703233232101, |
|
"reward_std": 1.1504804422458013, |
|
"rewards/Qwen2-0.5B-Reward": -0.4225703233232101, |
|
"step": 2000 |
|
}, |
|
{ |
|
"completion_length": 747.6074157714844, |
|
"epoch": 0.7609793033821303, |
|
"grad_norm": 1.7324910163879395, |
|
"kl": 2.849739583333333, |
|
"learning_rate": 4.9548673267114535e-06, |
|
"loss": 0.114, |
|
"reward": -0.4868051894629995, |
|
"reward_std": 1.2382884542147319, |
|
"rewards/Qwen2-0.5B-Reward": -0.4868051894629995, |
|
"step": 2010 |
|
}, |
|
{ |
|
"completion_length": 723.2888916015625, |
|
"epoch": 0.7647652700656234, |
|
"grad_norm": 1.870195984840393, |
|
"kl": 3.38125, |
|
"learning_rate": 4.86723854365498e-06, |
|
"loss": 0.1353, |
|
"reward": -0.6813056563337644, |
|
"reward_std": 1.4171151260534922, |
|
"rewards/Qwen2-0.5B-Reward": -0.6813056563337644, |
|
"step": 2020 |
|
}, |
|
{ |
|
"completion_length": 739.2546325683594, |
|
"epoch": 0.7685512367491166, |
|
"grad_norm": 0.6563529968261719, |
|
"kl": 2.7765625, |
|
"learning_rate": 4.78068191323533e-06, |
|
"loss": 0.111, |
|
"reward": -0.6810662182668845, |
|
"reward_std": 1.3699560364087422, |
|
"rewards/Qwen2-0.5B-Reward": -0.6810662182668845, |
|
"step": 2030 |
|
}, |
|
{ |
|
"completion_length": 723.6301025390625, |
|
"epoch": 0.7723372034326098, |
|
"grad_norm": 0.845397412776947, |
|
"kl": 3.3549479166666667, |
|
"learning_rate": 4.695212567611183e-06, |
|
"loss": 0.1343, |
|
"reward": -0.6839562758803368, |
|
"reward_std": 1.3764802972475687, |
|
"rewards/Qwen2-0.5B-Reward": -0.6839562758803368, |
|
"step": 2040 |
|
}, |
|
{ |
|
"completion_length": 707.3944529215495, |
|
"epoch": 0.776123170116103, |
|
"grad_norm": 0.8297199606895447, |
|
"kl": 2.2606770833333334, |
|
"learning_rate": 4.6108454488579754e-06, |
|
"loss": 0.0904, |
|
"reward": -0.32430495528969916, |
|
"reward_std": 1.0496096114317577, |
|
"rewards/Qwen2-0.5B-Reward": -0.32430495528969916, |
|
"step": 2050 |
|
}, |
|
{ |
|
"completion_length": 728.9092631022136, |
|
"epoch": 0.7799091367995962, |
|
"grad_norm": 0.8965924382209778, |
|
"kl": 2.7317708333333335, |
|
"learning_rate": 4.5275953063556515e-06, |
|
"loss": 0.1092, |
|
"reward": -0.49890854886422553, |
|
"reward_std": 1.1908490220705668, |
|
"rewards/Qwen2-0.5B-Reward": -0.49890854886422553, |
|
"step": 2060 |
|
}, |
|
{ |
|
"completion_length": 787.4157389322917, |
|
"epoch": 0.7836951034830894, |
|
"grad_norm": 1.6908742189407349, |
|
"kl": 3.14453125, |
|
"learning_rate": 4.445476694210125e-06, |
|
"loss": 0.1258, |
|
"reward": -0.6872879594564438, |
|
"reward_std": 1.5059267342090608, |
|
"rewards/Qwen2-0.5B-Reward": -0.6872879594564438, |
|
"step": 2070 |
|
}, |
|
{ |
|
"completion_length": 724.3907409667969, |
|
"epoch": 0.7874810701665825, |
|
"grad_norm": 0.5646480917930603, |
|
"kl": 2.5322916666666666, |
|
"learning_rate": 4.364503968708885e-06, |
|
"loss": 0.1013, |
|
"reward": -0.4010113532965382, |
|
"reward_std": 1.1661198248465856, |
|
"rewards/Qwen2-0.5B-Reward": -0.4010113532965382, |
|
"step": 2080 |
|
}, |
|
{ |
|
"completion_length": 762.3759338378907, |
|
"epoch": 0.7912670368500757, |
|
"grad_norm": 0.7707305550575256, |
|
"kl": 3.08828125, |
|
"learning_rate": 4.284691285811162e-06, |
|
"loss": 0.1235, |
|
"reward": -0.6063117478042841, |
|
"reward_std": 1.4541340112686156, |
|
"rewards/Qwen2-0.5B-Reward": -0.6063117478042841, |
|
"step": 2090 |
|
}, |
|
{ |
|
"completion_length": 757.8597249348958, |
|
"epoch": 0.7950530035335689, |
|
"grad_norm": 0.609060525894165, |
|
"kl": 2.7552083333333335, |
|
"learning_rate": 4.206052598673134e-06, |
|
"loss": 0.1102, |
|
"reward": -0.5107901314894359, |
|
"reward_std": 1.2742640137672425, |
|
"rewards/Qwen2-0.5B-Reward": -0.5107901314894359, |
|
"step": 2100 |
|
}, |
|
{ |
|
"completion_length": 714.1713012695312, |
|
"epoch": 0.7988389702170621, |
|
"grad_norm": 1.5023508071899414, |
|
"kl": 2.7880208333333334, |
|
"learning_rate": 4.128601655208588e-06, |
|
"loss": 0.1115, |
|
"reward": -0.4477219473881026, |
|
"reward_std": 1.2109043717384338, |
|
"rewards/Qwen2-0.5B-Reward": -0.4477219473881026, |
|
"step": 2110 |
|
}, |
|
{ |
|
"completion_length": 742.9495381673177, |
|
"epoch": 0.8026249369005553, |
|
"grad_norm": 1.4843252897262573, |
|
"kl": 2.490104166666667, |
|
"learning_rate": 4.052351995685459e-06, |
|
"loss": 0.0996, |
|
"reward": -0.40210790758331616, |
|
"reward_std": 1.1073905199766159, |
|
"rewards/Qwen2-0.5B-Reward": -0.40210790758331616, |
|
"step": 2120 |
|
}, |
|
{ |
|
"completion_length": 758.4166687011718, |
|
"epoch": 0.8064109035840484, |
|
"grad_norm": 0.8346318006515503, |
|
"kl": 3.2510416666666666, |
|
"learning_rate": 3.977316950358647e-06, |
|
"loss": 0.1301, |
|
"reward": -0.744351115822792, |
|
"reward_std": 1.4400279184182485, |
|
"rewards/Qwen2-0.5B-Reward": -0.744351115822792, |
|
"step": 2130 |
|
}, |
|
{ |
|
"completion_length": 711.5217651367187, |
|
"epoch": 0.8101968702675416, |
|
"grad_norm": 3.075549840927124, |
|
"kl": 2.4575520833333333, |
|
"learning_rate": 3.903509637139604e-06, |
|
"loss": 0.0983, |
|
"reward": -0.4195836258431276, |
|
"reward_std": 1.1368374347686767, |
|
"rewards/Qwen2-0.5B-Reward": -0.4195836258431276, |
|
"step": 2140 |
|
}, |
|
{ |
|
"completion_length": 667.8574137369792, |
|
"epoch": 0.8139828369510348, |
|
"grad_norm": 1.288053035736084, |
|
"kl": 2.64140625, |
|
"learning_rate": 3.830942959302988e-06, |
|
"loss": 0.1056, |
|
"reward": -0.25947842622796696, |
|
"reward_std": 1.0453672617673875, |
|
"rewards/Qwen2-0.5B-Reward": -0.25947842622796696, |
|
"step": 2150 |
|
}, |
|
{ |
|
"completion_length": 713.2092692057291, |
|
"epoch": 0.817768803634528, |
|
"grad_norm": 1.47870934009552, |
|
"kl": 3.060677083333333, |
|
"learning_rate": 3.7596296032308655e-06, |
|
"loss": 0.1224, |
|
"reward": -0.5742474019527435, |
|
"reward_std": 1.2993368287881215, |
|
"rewards/Qwen2-0.5B-Reward": -0.5742474019527435, |
|
"step": 2160 |
|
}, |
|
{ |
|
"completion_length": 756.1185282389323, |
|
"epoch": 0.8215547703180212, |
|
"grad_norm": 1.0809710025787354, |
|
"kl": 3.0234375, |
|
"learning_rate": 3.689582036194844e-06, |
|
"loss": 0.121, |
|
"reward": -0.6388996203740438, |
|
"reward_std": 1.3941177546977996, |
|
"rewards/Qwen2-0.5B-Reward": -0.6388996203740438, |
|
"step": 2170 |
|
}, |
|
{ |
|
"completion_length": 689.1287068684895, |
|
"epoch": 0.8253407370015143, |
|
"grad_norm": 0.8256644606590271, |
|
"kl": 2.6302083333333335, |
|
"learning_rate": 3.620812504176483e-06, |
|
"loss": 0.1052, |
|
"reward": -0.3896134149283171, |
|
"reward_std": 1.1061949849128723, |
|
"rewards/Qwen2-0.5B-Reward": -0.3896134149283171, |
|
"step": 2180 |
|
}, |
|
{ |
|
"completion_length": 747.3708435058594, |
|
"epoch": 0.8291267036850076, |
|
"grad_norm": 1.2586473226547241, |
|
"kl": 2.8255208333333335, |
|
"learning_rate": 3.5533330297264055e-06, |
|
"loss": 0.113, |
|
"reward": -0.47125562417010464, |
|
"reward_std": 1.3159513572851818, |
|
"rewards/Qwen2-0.5B-Reward": -0.47125562417010464, |
|
"step": 2190 |
|
}, |
|
{ |
|
"completion_length": 718.9842681884766, |
|
"epoch": 0.8329126703685008, |
|
"grad_norm": 0.7325953841209412, |
|
"kl": 2.89453125, |
|
"learning_rate": 3.4871554098624783e-06, |
|
"loss": 0.1159, |
|
"reward": -0.515640505651633, |
|
"reward_std": 1.2894119222958882, |
|
"rewards/Qwen2-0.5B-Reward": -0.515640505651633, |
|
"step": 2200 |
|
}, |
|
{ |
|
"completion_length": 730.6486206054688, |
|
"epoch": 0.836698637051994, |
|
"grad_norm": 1.3458070755004883, |
|
"kl": 2.746354166666667, |
|
"learning_rate": 3.4222912140074072e-06, |
|
"loss": 0.1099, |
|
"reward": -0.43878471093873184, |
|
"reward_std": 1.1841597487529119, |
|
"rewards/Qwen2-0.5B-Reward": -0.43878471093873184, |
|
"step": 2210 |
|
}, |
|
{ |
|
"completion_length": 728.4597218831381, |
|
"epoch": 0.8404846037354872, |
|
"grad_norm": 2.082460880279541, |
|
"kl": 3.025520833333333, |
|
"learning_rate": 3.358751781966125e-06, |
|
"loss": 0.121, |
|
"reward": -0.5120975616077582, |
|
"reward_std": 1.399947702884674, |
|
"rewards/Qwen2-0.5B-Reward": -0.5120975616077582, |
|
"step": 2220 |
|
}, |
|
{ |
|
"completion_length": 702.8384338378906, |
|
"epoch": 0.8442705704189803, |
|
"grad_norm": 0.7987167239189148, |
|
"kl": 2.9817708333333335, |
|
"learning_rate": 3.2965482219433266e-06, |
|
"loss": 0.1193, |
|
"reward": -0.5346707743903001, |
|
"reward_std": 1.298090636730194, |
|
"rewards/Qwen2-0.5B-Reward": -0.5346707743903001, |
|
"step": 2230 |
|
}, |
|
{ |
|
"completion_length": 743.4412068684895, |
|
"epoch": 0.8480565371024735, |
|
"grad_norm": 1.0572713613510132, |
|
"kl": 2.8296875, |
|
"learning_rate": 3.2356914086014895e-06, |
|
"loss": 0.1132, |
|
"reward": -0.45420979845027126, |
|
"reward_std": 1.2626650591691335, |
|
"rewards/Qwen2-0.5B-Reward": -0.45420979845027126, |
|
"step": 2240 |
|
}, |
|
{ |
|
"completion_length": 751.9037150065104, |
|
"epoch": 0.8518425037859667, |
|
"grad_norm": 1.2263774871826172, |
|
"kl": 2.789322916666667, |
|
"learning_rate": 3.1761919811597286e-06, |
|
"loss": 0.1116, |
|
"reward": -0.41814162402103344, |
|
"reward_std": 1.254759935537974, |
|
"rewards/Qwen2-0.5B-Reward": -0.41814162402103344, |
|
"step": 2250 |
|
}, |
|
{ |
|
"completion_length": 735.0213012695312, |
|
"epoch": 0.8556284704694599, |
|
"grad_norm": 1.536089539527893, |
|
"kl": 2.711197916666667, |
|
"learning_rate": 3.118060341533795e-06, |
|
"loss": 0.1084, |
|
"reward": -0.3957721870703002, |
|
"reward_std": 1.215382601817449, |
|
"rewards/Qwen2-0.5B-Reward": -0.3957721870703002, |
|
"step": 2260 |
|
}, |
|
{ |
|
"completion_length": 739.1541676839192, |
|
"epoch": 0.8594144371529531, |
|
"grad_norm": 2.2628087997436523, |
|
"kl": 3.322135416666667, |
|
"learning_rate": 3.0613066525175916e-06, |
|
"loss": 0.1328, |
|
"reward": -0.5474292345655462, |
|
"reward_std": 1.3296300649642945, |
|
"rewards/Qwen2-0.5B-Reward": -0.5474292345655462, |
|
"step": 2270 |
|
}, |
|
{ |
|
"completion_length": 753.1319498697917, |
|
"epoch": 0.8632004038364463, |
|
"grad_norm": 1.759981393814087, |
|
"kl": 2.53984375, |
|
"learning_rate": 3.00594083600646e-06, |
|
"loss": 0.1016, |
|
"reward": -0.4004799094672004, |
|
"reward_std": 1.2508702536424001, |
|
"rewards/Qwen2-0.5B-Reward": -0.4004799094672004, |
|
"step": 2280 |
|
}, |
|
{ |
|
"completion_length": 765.2296366373698, |
|
"epoch": 0.8669863705199394, |
|
"grad_norm": 1.7521519660949707, |
|
"kl": 3.2877604166666665, |
|
"learning_rate": 2.9519725712625993e-06, |
|
"loss": 0.1315, |
|
"reward": -0.5632258212814728, |
|
"reward_std": 1.3489103774229685, |
|
"rewards/Qwen2-0.5B-Reward": -0.5632258212814728, |
|
"step": 2290 |
|
}, |
|
{ |
|
"completion_length": 728.2092671712239, |
|
"epoch": 0.8707723372034326, |
|
"grad_norm": 1.1282004117965698, |
|
"kl": 2.808333333333333, |
|
"learning_rate": 2.89941129322291e-06, |
|
"loss": 0.1123, |
|
"reward": -0.4623491804425915, |
|
"reward_std": 1.2616208771864572, |
|
"rewards/Qwen2-0.5B-Reward": -0.4623491804425915, |
|
"step": 2300 |
|
}, |
|
{ |
|
"completion_length": 763.8801025390625, |
|
"epoch": 0.8745583038869258, |
|
"grad_norm": 1.6411226987838745, |
|
"kl": 2.96328125, |
|
"learning_rate": 2.848266190849534e-06, |
|
"loss": 0.1186, |
|
"reward": -0.47133560677369435, |
|
"reward_std": 1.3187556425730387, |
|
"rewards/Qwen2-0.5B-Reward": -0.47133560677369435, |
|
"step": 2310 |
|
}, |
|
{ |
|
"completion_length": 767.462967936198, |
|
"epoch": 0.878344270570419, |
|
"grad_norm": 1.238519310951233, |
|
"kl": 2.96015625, |
|
"learning_rate": 2.798546205523405e-06, |
|
"loss": 0.1184, |
|
"reward": -0.553766346598665, |
|
"reward_std": 1.3190133293469748, |
|
"rewards/Qwen2-0.5B-Reward": -0.553766346598665, |
|
"step": 2320 |
|
}, |
|
{ |
|
"completion_length": 738.1370381673177, |
|
"epoch": 0.8821302372539122, |
|
"grad_norm": 1.9779850244522095, |
|
"kl": 2.7606770833333334, |
|
"learning_rate": 2.7502600294810888e-06, |
|
"loss": 0.1104, |
|
"reward": -0.48763653316224614, |
|
"reward_std": 1.276737904548645, |
|
"rewards/Qwen2-0.5B-Reward": -0.48763653316224614, |
|
"step": 2330 |
|
}, |
|
{ |
|
"completion_length": 772.7652852376302, |
|
"epoch": 0.8859162039374053, |
|
"grad_norm": 0.9569075107574463, |
|
"kl": 3.640625, |
|
"learning_rate": 2.7034161042951696e-06, |
|
"loss": 0.1457, |
|
"reward": -0.752403491238753, |
|
"reward_std": 1.5029548863569895, |
|
"rewards/Qwen2-0.5B-Reward": -0.752403491238753, |
|
"step": 2340 |
|
}, |
|
{ |
|
"completion_length": 748.1027770996094, |
|
"epoch": 0.8897021706208985, |
|
"grad_norm": 1.2532896995544434, |
|
"kl": 2.788802083333333, |
|
"learning_rate": 2.658022619398459e-06, |
|
"loss": 0.1115, |
|
"reward": -0.5759354960018148, |
|
"reward_std": 1.252836243311564, |
|
"rewards/Qwen2-0.5B-Reward": -0.5759354960018148, |
|
"step": 2350 |
|
}, |
|
{ |
|
"completion_length": 756.765283203125, |
|
"epoch": 0.8934881373043917, |
|
"grad_norm": 1.243710994720459, |
|
"kl": 3.470572916666667, |
|
"learning_rate": 2.6140875106522906e-06, |
|
"loss": 0.1388, |
|
"reward": -0.7527099266648293, |
|
"reward_std": 1.5181720991929373, |
|
"rewards/Qwen2-0.5B-Reward": -0.7527099266648293, |
|
"step": 2360 |
|
}, |
|
{ |
|
"completion_length": 731.5490844726562, |
|
"epoch": 0.8972741039878849, |
|
"grad_norm": 0.8256412744522095, |
|
"kl": 2.8911458333333333, |
|
"learning_rate": 2.5716184589591504e-06, |
|
"loss": 0.1156, |
|
"reward": -0.4917602331067125, |
|
"reward_std": 1.3739383776982625, |
|
"rewards/Qwen2-0.5B-Reward": -0.4917602331067125, |
|
"step": 2370 |
|
}, |
|
{ |
|
"completion_length": 762.2222249348958, |
|
"epoch": 0.901060070671378, |
|
"grad_norm": 0.976091206073761, |
|
"kl": 3.2059895833333334, |
|
"learning_rate": 2.5306228889198595e-06, |
|
"loss": 0.1282, |
|
"reward": -0.492262601479888, |
|
"reward_std": 1.3222837885220846, |
|
"rewards/Qwen2-0.5B-Reward": -0.492262601479888, |
|
"step": 2380 |
|
}, |
|
{ |
|
"completion_length": 752.6963033040364, |
|
"epoch": 0.9048460373548712, |
|
"grad_norm": 0.8627796769142151, |
|
"kl": 3.144270833333333, |
|
"learning_rate": 2.4911079675355852e-06, |
|
"loss": 0.1258, |
|
"reward": -0.5920792824278275, |
|
"reward_std": 1.4338179051876068, |
|
"rewards/Qwen2-0.5B-Reward": -0.5920792824278275, |
|
"step": 2390 |
|
}, |
|
{ |
|
"completion_length": 729.3250172932943, |
|
"epoch": 0.9086320040383644, |
|
"grad_norm": 2.569244384765625, |
|
"kl": 3.0598958333333335, |
|
"learning_rate": 2.453080602954878e-06, |
|
"loss": 0.1224, |
|
"reward": -0.5552944198250771, |
|
"reward_std": 1.259453280766805, |
|
"rewards/Qwen2-0.5B-Reward": -0.5552944198250771, |
|
"step": 2400 |
|
}, |
|
{ |
|
"completion_length": 769.4680562337239, |
|
"epoch": 0.9124179707218576, |
|
"grad_norm": 1.9891189336776733, |
|
"kl": 3.347395833333333, |
|
"learning_rate": 2.416547443265959e-06, |
|
"loss": 0.134, |
|
"reward": -0.7994883202016354, |
|
"reward_std": 1.5337923685709636, |
|
"rewards/Qwen2-0.5B-Reward": -0.7994883202016354, |
|
"step": 2410 |
|
}, |
|
{ |
|
"completion_length": 711.1333435058593, |
|
"epoch": 0.9162039374053509, |
|
"grad_norm": 1.2348560094833374, |
|
"kl": 2.6640625, |
|
"learning_rate": 2.381514875334478e-06, |
|
"loss": 0.1066, |
|
"reward": -0.4012350387871265, |
|
"reward_std": 1.1682847638924916, |
|
"rewards/Qwen2-0.5B-Reward": -0.4012350387871265, |
|
"step": 2420 |
|
}, |
|
{ |
|
"completion_length": 728.6402770996094, |
|
"epoch": 0.9199899040888441, |
|
"grad_norm": 1.0510834455490112, |
|
"kl": 2.4625, |
|
"learning_rate": 2.34798902368694e-06, |
|
"loss": 0.0985, |
|
"reward": -0.255227384219567, |
|
"reward_std": 1.0641139527161916, |
|
"rewards/Qwen2-0.5B-Reward": -0.255227384219567, |
|
"step": 2430 |
|
}, |
|
{ |
|
"completion_length": 742.7620483398438, |
|
"epoch": 0.9237758707723372, |
|
"grad_norm": 0.6936110854148865, |
|
"kl": 2.7760416666666665, |
|
"learning_rate": 2.31597574943999e-06, |
|
"loss": 0.1111, |
|
"reward": -0.32442100283806213, |
|
"reward_std": 1.1662549694379172, |
|
"rewards/Qwen2-0.5B-Reward": -0.32442100283806213, |
|
"step": 2440 |
|
}, |
|
{ |
|
"completion_length": 709.3500081380208, |
|
"epoch": 0.9275618374558304, |
|
"grad_norm": 0.6553380489349365, |
|
"kl": 2.91484375, |
|
"learning_rate": 2.2854806492757473e-06, |
|
"loss": 0.1166, |
|
"reward": -0.4610091609259446, |
|
"reward_std": 1.2611193935076395, |
|
"rewards/Qwen2-0.5B-Reward": -0.4610091609259446, |
|
"step": 2450 |
|
}, |
|
{ |
|
"completion_length": 751.5171427408854, |
|
"epoch": 0.9313478041393236, |
|
"grad_norm": 1.1703935861587524, |
|
"kl": 3.030989583333333, |
|
"learning_rate": 2.256509054463379e-06, |
|
"loss": 0.1212, |
|
"reward": -0.47760866036017735, |
|
"reward_std": 1.3580244441827138, |
|
"rewards/Qwen2-0.5B-Reward": -0.47760866036017735, |
|
"step": 2460 |
|
}, |
|
{ |
|
"completion_length": 734.3888997395833, |
|
"epoch": 0.9351337708228168, |
|
"grad_norm": 1.4841110706329346, |
|
"kl": 2.941666666666667, |
|
"learning_rate": 2.2290660299270626e-06, |
|
"loss": 0.1176, |
|
"reward": -0.5363880881418784, |
|
"reward_std": 1.320775838692983, |
|
"rewards/Qwen2-0.5B-Reward": -0.5363880881418784, |
|
"step": 2470 |
|
}, |
|
{ |
|
"completion_length": 792.8597361246744, |
|
"epoch": 0.93891973750631, |
|
"grad_norm": 0.9216225743293762, |
|
"kl": 3.2005208333333335, |
|
"learning_rate": 2.2031563733605154e-06, |
|
"loss": 0.128, |
|
"reward": -0.6734383806586266, |
|
"reward_std": 1.5115692138671875, |
|
"rewards/Qwen2-0.5B-Reward": -0.6734383806586266, |
|
"step": 2480 |
|
}, |
|
{ |
|
"completion_length": 742.3555603027344, |
|
"epoch": 0.9427057041898032, |
|
"grad_norm": 0.8652907013893127, |
|
"kl": 2.9796875, |
|
"learning_rate": 2.178784614388247e-06, |
|
"loss": 0.1192, |
|
"reward": -0.5235640426476796, |
|
"reward_std": 1.2792722801367442, |
|
"rewards/Qwen2-0.5B-Reward": -0.5235640426476796, |
|
"step": 2490 |
|
}, |
|
{ |
|
"completion_length": 731.887510172526, |
|
"epoch": 0.9464916708732963, |
|
"grad_norm": 0.900198221206665, |
|
"kl": 2.758072916666667, |
|
"learning_rate": 2.155955013773674e-06, |
|
"loss": 0.1102, |
|
"reward": -0.427229492738843, |
|
"reward_std": 1.2093970189491907, |
|
"rewards/Qwen2-0.5B-Reward": -0.427229492738843, |
|
"step": 2500 |
|
}, |
|
{ |
|
"completion_length": 742.8615783691406, |
|
"epoch": 0.9502776375567895, |
|
"grad_norm": 1.4608945846557617, |
|
"kl": 2.884635416666667, |
|
"learning_rate": 2.134671562674233e-06, |
|
"loss": 0.1154, |
|
"reward": -0.40613490512090117, |
|
"reward_std": 1.2598043183485668, |
|
"rewards/Qwen2-0.5B-Reward": -0.40613490512090117, |
|
"step": 2510 |
|
}, |
|
{ |
|
"completion_length": 732.2143575032552, |
|
"epoch": 0.9540636042402827, |
|
"grad_norm": 0.861190676689148, |
|
"kl": 3.107291666666667, |
|
"learning_rate": 2.114937981943634e-06, |
|
"loss": 0.1243, |
|
"reward": -0.4464622031897306, |
|
"reward_std": 1.2578558444976806, |
|
"rewards/Qwen2-0.5B-Reward": -0.4464622031897306, |
|
"step": 2520 |
|
}, |
|
{ |
|
"completion_length": 796.321767171224, |
|
"epoch": 0.9578495709237759, |
|
"grad_norm": 2.202199697494507, |
|
"kl": 3.068489583333333, |
|
"learning_rate": 2.096757721481365e-06, |
|
"loss": 0.1228, |
|
"reward": -0.6399494647979737, |
|
"reward_std": 1.4180189092954, |
|
"rewards/Qwen2-0.5B-Reward": -0.6399494647979737, |
|
"step": 2530 |
|
}, |
|
{ |
|
"completion_length": 744.4689880371094, |
|
"epoch": 0.961635537607269, |
|
"grad_norm": 0.9193338751792908, |
|
"kl": 2.9799479166666667, |
|
"learning_rate": 2.0801339596295706e-06, |
|
"loss": 0.1192, |
|
"reward": -0.5712469642050564, |
|
"reward_std": 1.3389502465724945, |
|
"rewards/Qwen2-0.5B-Reward": -0.5712469642050564, |
|
"step": 2540 |
|
}, |
|
{ |
|
"completion_length": 779.9819458007812, |
|
"epoch": 0.9654215042907622, |
|
"grad_norm": 1.811191439628601, |
|
"kl": 3.6411458333333333, |
|
"learning_rate": 2.0650696026173993e-06, |
|
"loss": 0.1456, |
|
"reward": -0.7589993777374426, |
|
"reward_std": 1.5557840009530386, |
|
"rewards/Qwen2-0.5B-Reward": -0.7589993777374426, |
|
"step": 2550 |
|
}, |
|
{ |
|
"completion_length": 782.5782531738281, |
|
"epoch": 0.9692074709742554, |
|
"grad_norm": 0.9566059112548828, |
|
"kl": 3.095572916666667, |
|
"learning_rate": 2.051567284052924e-06, |
|
"loss": 0.1238, |
|
"reward": -0.6302419572137297, |
|
"reward_std": 1.4944741606712342, |
|
"rewards/Qwen2-0.5B-Reward": -0.6302419572137297, |
|
"step": 2560 |
|
}, |
|
{ |
|
"completion_length": 718.9152872721354, |
|
"epoch": 0.9729934376577486, |
|
"grad_norm": 0.9425510168075562, |
|
"kl": 2.82890625, |
|
"learning_rate": 2.0396293644627313e-06, |
|
"loss": 0.1132, |
|
"reward": -0.32908876914686214, |
|
"reward_std": 1.2080858111381532, |
|
"rewards/Qwen2-0.5B-Reward": -0.32908876914686214, |
|
"step": 2570 |
|
}, |
|
{ |
|
"completion_length": 732.7824157714844, |
|
"epoch": 0.9767794043412418, |
|
"grad_norm": 0.9575442671775818, |
|
"kl": 3.121875, |
|
"learning_rate": 2.0292579308792374e-06, |
|
"loss": 0.125, |
|
"reward": -0.47131281966964406, |
|
"reward_std": 1.3826497634251913, |
|
"rewards/Qwen2-0.5B-Reward": -0.47131281966964406, |
|
"step": 2580 |
|
}, |
|
{ |
|
"completion_length": 761.6578694661458, |
|
"epoch": 0.980565371024735, |
|
"grad_norm": 1.0160202980041504, |
|
"kl": 2.9203125, |
|
"learning_rate": 2.020454796475829e-06, |
|
"loss": 0.1168, |
|
"reward": -0.47771473427613576, |
|
"reward_std": 1.2897698918978373, |
|
"rewards/Qwen2-0.5B-Reward": -0.47771473427613576, |
|
"step": 2590 |
|
}, |
|
{ |
|
"completion_length": 781.389815266927, |
|
"epoch": 0.9843513377082281, |
|
"grad_norm": 2.1385881900787354, |
|
"kl": 2.9213541666666667, |
|
"learning_rate": 2.013221500249879e-06, |
|
"loss": 0.1168, |
|
"reward": -0.4969511273006598, |
|
"reward_std": 1.3705980678399403, |
|
"rewards/Qwen2-0.5B-Reward": -0.4969511273006598, |
|
"step": 2600 |
|
}, |
|
{ |
|
"completion_length": 748.4037109375, |
|
"epoch": 0.9881373043917213, |
|
"grad_norm": 1.3061258792877197, |
|
"kl": 3.1223958333333335, |
|
"learning_rate": 2.0075593067536895e-06, |
|
"loss": 0.1249, |
|
"reward": -0.511777646218737, |
|
"reward_std": 1.338163250684738, |
|
"rewards/Qwen2-0.5B-Reward": -0.511777646218737, |
|
"step": 2610 |
|
}, |
|
{ |
|
"completion_length": 730.8513916015625, |
|
"epoch": 0.9919232710752145, |
|
"grad_norm": 1.0508885383605957, |
|
"kl": 2.6411458333333333, |
|
"learning_rate": 2.0034692058734197e-06, |
|
"loss": 0.1056, |
|
"reward": -0.3765604312221209, |
|
"reward_std": 1.2147092600663503, |
|
"rewards/Qwen2-0.5B-Reward": -0.3765604312221209, |
|
"step": 2620 |
|
}, |
|
{ |
|
"completion_length": 783.2115763346354, |
|
"epoch": 0.9957092377587077, |
|
"grad_norm": 1.1735745668411255, |
|
"kl": 3.4817708333333335, |
|
"learning_rate": 2.000951912656033e-06, |
|
"loss": 0.1392, |
|
"reward": -0.6186425998806954, |
|
"reward_std": 1.4974812746047974, |
|
"rewards/Qwen2-0.5B-Reward": -0.6186425998806954, |
|
"step": 2630 |
|
}, |
|
{ |
|
"completion_length": 786.5088033040364, |
|
"epoch": 0.9994952044422009, |
|
"grad_norm": 1.1970211267471313, |
|
"kl": 3.134375, |
|
"learning_rate": 2.0000078671842824e-06, |
|
"loss": 0.1254, |
|
"reward": -0.662852063588798, |
|
"reward_std": 1.5238366266091665, |
|
"rewards/Qwen2-0.5B-Reward": -0.662852063588798, |
|
"step": 2640 |
|
}, |
|
{ |
|
"completion_length": 728.1759236653646, |
|
"epoch": 0.9998738011105502, |
|
"kl": 3.125, |
|
"reward": -0.8051454126834869, |
|
"reward_std": 1.3035079042116802, |
|
"rewards/Qwen2-0.5B-Reward": -0.8051454126834869, |
|
"step": 2641, |
|
"total_flos": 0.0, |
|
"train_loss": 0.6071465962344739, |
|
"train_runtime": 159997.8149, |
|
"train_samples_per_second": 1.189, |
|
"train_steps_per_second": 0.017 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2641, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 24, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|