{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998738011105502, "eval_steps": 500, "global_step": 2641, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 862.3935139973959, "epoch": 0.0003785966683493185, "grad_norm": 0.29023680090904236, "kl": 0.0, "learning_rate": 7.547169811320756e-08, "loss": -0.0, "reward": -2.4199581146240234, "reward_std": 0.6020505428314209, "rewards/Qwen2-0.5B-Reward": -2.4199581146240234, "step": 1 }, { "completion_length": 921.5138956705729, "epoch": 0.003785966683493185, "grad_norm": 0.5279305577278137, "kl": 0.00010103649563259548, "learning_rate": 7.547169811320755e-07, "loss": 0.0, "reward": -2.473096079296536, "reward_std": 0.5592167631343559, "rewards/Qwen2-0.5B-Reward": -2.473096079296536, "step": 10 }, { "completion_length": 910.1000081380208, "epoch": 0.00757193336698637, "grad_norm": 0.2346569150686264, "kl": 0.00012467702229817707, "learning_rate": 1.509433962264151e-06, "loss": 0.0, "reward": -2.422696002324422, "reward_std": 0.5530827701091766, "rewards/Qwen2-0.5B-Reward": -2.422696002324422, "step": 20 }, { "completion_length": 911.8717631022135, "epoch": 0.011357900050479555, "grad_norm": 0.21592262387275696, "kl": 0.0003096898396809896, "learning_rate": 2.2641509433962266e-06, "loss": 0.0, "reward": -2.41411194006602, "reward_std": 0.5578982929388682, "rewards/Qwen2-0.5B-Reward": -2.41411194006602, "step": 30 }, { "completion_length": 893.8527893066406, "epoch": 0.01514386673397274, "grad_norm": 0.26798614859580994, "kl": 0.0015757242838541667, "learning_rate": 3.018867924528302e-06, "loss": 0.0001, "reward": -2.297330105304718, "reward_std": 0.528485847512881, "rewards/Qwen2-0.5B-Reward": -2.297330105304718, "step": 40 }, { "completion_length": 875.3588053385416, "epoch": 0.018929833417465926, "grad_norm": 0.27687838673591614, "kl": 0.0064605712890625, "learning_rate": 3.7735849056603777e-06, "loss": 0.0003, "reward": -2.010285266240438, "reward_std": 0.5279872556527455, "rewards/Qwen2-0.5B-Reward": -2.010285266240438, "step": 50 }, { "completion_length": 859.5717692057292, "epoch": 0.02271580010095911, "grad_norm": 0.2615930736064911, "kl": 0.018147786458333332, "learning_rate": 4.528301886792453e-06, "loss": 0.0007, "reward": -1.795549988746643, "reward_std": 0.49105457464853924, "rewards/Qwen2-0.5B-Reward": -1.795549988746643, "step": 60 }, { "completion_length": 765.7564921061198, "epoch": 0.026501766784452298, "grad_norm": 0.28387993574142456, "kl": 0.028316243489583334, "learning_rate": 5.283018867924529e-06, "loss": 0.0011, "reward": -1.4461613575617471, "reward_std": 0.47599050005277, "rewards/Qwen2-0.5B-Reward": -1.4461613575617471, "step": 70 }, { "completion_length": 783.0041768391927, "epoch": 0.03028773346794548, "grad_norm": 0.25285565853118896, "kl": 0.040238444010416666, "learning_rate": 6.037735849056604e-06, "loss": 0.0016, "reward": -1.193545683224996, "reward_std": 0.4799055278301239, "rewards/Qwen2-0.5B-Reward": -1.193545683224996, "step": 80 }, { "completion_length": 836.7032450358073, "epoch": 0.034073700151438666, "grad_norm": 0.25005629658699036, "kl": 0.05516764322916667, "learning_rate": 6.792452830188679e-06, "loss": 0.0022, "reward": -1.0326486746470134, "reward_std": 0.5193435788154602, "rewards/Qwen2-0.5B-Reward": -1.0326486746470134, "step": 90 }, { "completion_length": 853.6324157714844, "epoch": 0.03785966683493185, "grad_norm": 0.3665623664855957, "kl": 0.10475260416666667, "learning_rate": 7.5471698113207555e-06, "loss": 0.0042, "reward": -0.9854023973147075, "reward_std": 0.6189069559176763, "rewards/Qwen2-0.5B-Reward": -0.9854023973147075, "step": 100 }, { "completion_length": 815.8801005045573, "epoch": 0.04164563351842504, "grad_norm": 0.8496055006980896, "kl": 0.41549479166666664, "learning_rate": 8.301886792452832e-06, "loss": 0.0166, "reward": -1.463372488816579, "reward_std": 1.0357649803161622, "rewards/Qwen2-0.5B-Reward": -1.463372488816579, "step": 110 }, { "completion_length": 813.9148213704427, "epoch": 0.04543160020191822, "grad_norm": 0.3407374918460846, "kl": 0.40042317708333336, "learning_rate": 9.056603773584907e-06, "loss": 0.016, "reward": -1.8741844495137532, "reward_std": 1.722016990184784, "rewards/Qwen2-0.5B-Reward": -1.8741844495137532, "step": 120 }, { "completion_length": 646.899545288086, "epoch": 0.04921756688541141, "grad_norm": 0.4906499981880188, "kl": 0.2925618489583333, "learning_rate": 9.811320754716981e-06, "loss": 0.0117, "reward": -1.3685388286908469, "reward_std": 1.243816477060318, "rewards/Qwen2-0.5B-Reward": -1.3685388286908469, "step": 130 }, { "completion_length": 547.943989054362, "epoch": 0.053003533568904596, "grad_norm": 1.3670618534088135, "kl": 1.1440104166666667, "learning_rate": 1.0566037735849058e-05, "loss": 0.0458, "reward": -2.530128773053487, "reward_std": 2.079139538606008, "rewards/Qwen2-0.5B-Reward": -2.530128773053487, "step": 140 }, { "completion_length": 482.1189860026042, "epoch": 0.056789500252397776, "grad_norm": 2.426543712615967, "kl": 2.05546875, "learning_rate": 1.1320754716981132e-05, "loss": 0.0822, "reward": -3.6518485943476358, "reward_std": 2.5313418904940286, "rewards/Qwen2-0.5B-Reward": -3.6518485943476358, "step": 150 }, { "completion_length": 583.3393595377604, "epoch": 0.06057546693589096, "grad_norm": 7.087838649749756, "kl": 1.375, "learning_rate": 1.2075471698113209e-05, "loss": 0.055, "reward": -2.7873202482859294, "reward_std": 2.297369889418284, "rewards/Qwen2-0.5B-Reward": -2.7873202482859294, "step": 160 }, { "completion_length": 581.331483968099, "epoch": 0.06436143361938415, "grad_norm": 0.3393622636795044, "kl": 0.7234049479166667, "learning_rate": 1.2830188679245283e-05, "loss": 0.029, "reward": -1.6760946492354074, "reward_std": 1.530751649538676, "rewards/Qwen2-0.5B-Reward": -1.6760946492354074, "step": 170 }, { "completion_length": 551.2680633544921, "epoch": 0.06814740030287733, "grad_norm": 0.5040144920349121, "kl": 0.5955729166666667, "learning_rate": 1.3584905660377358e-05, "loss": 0.0238, "reward": -1.6978328824043274, "reward_std": 1.5175378421942394, "rewards/Qwen2-0.5B-Reward": -1.6978328824043274, "step": 180 }, { "completion_length": 498.8856536865234, "epoch": 0.07193336698637053, "grad_norm": 0.6744162440299988, "kl": 1.2225260416666666, "learning_rate": 1.4339622641509435e-05, "loss": 0.0489, "reward": -2.619816021124522, "reward_std": 2.05845144589742, "rewards/Qwen2-0.5B-Reward": -2.619816021124522, "step": 190 }, { "completion_length": 651.0615783691406, "epoch": 0.0757193336698637, "grad_norm": 0.47306591272354126, "kl": 0.9920572916666667, "learning_rate": 1.5094339622641511e-05, "loss": 0.0397, "reward": -2.183918062845866, "reward_std": 2.0014989256858824, "rewards/Qwen2-0.5B-Reward": -2.183918062845866, "step": 200 }, { "completion_length": 723.8722351074218, "epoch": 0.07950530035335689, "grad_norm": 0.8630687594413757, "kl": 1.132421875, "learning_rate": 1.5849056603773586e-05, "loss": 0.0453, "reward": -2.3433102289835612, "reward_std": 2.1008309284845987, "rewards/Qwen2-0.5B-Reward": -2.3433102289835612, "step": 210 }, { "completion_length": 633.7060221354167, "epoch": 0.08329126703685008, "grad_norm": 0.33421555161476135, "kl": 1.11015625, "learning_rate": 1.6603773584905664e-05, "loss": 0.0444, "reward": -1.7007139801979065, "reward_std": 1.8362650871276855, "rewards/Qwen2-0.5B-Reward": -1.7007139801979065, "step": 220 }, { "completion_length": 801.9037150065104, "epoch": 0.08707723372034326, "grad_norm": 0.2525235116481781, "kl": 0.492578125, "learning_rate": 1.735849056603774e-05, "loss": 0.0197, "reward": -0.9283350398143132, "reward_std": 1.1985284070173898, "rewards/Qwen2-0.5B-Reward": -0.9283350398143132, "step": 230 }, { "completion_length": 757.5694498697917, "epoch": 0.09086320040383644, "grad_norm": 0.23876462876796722, "kl": 0.5396484375, "learning_rate": 1.8113207547169813e-05, "loss": 0.0216, "reward": -1.083450937271118, "reward_std": 1.4640587449073792, "rewards/Qwen2-0.5B-Reward": -1.083450937271118, "step": 240 }, { "completion_length": 691.9120452880859, "epoch": 0.09464916708732964, "grad_norm": 0.5013711452484131, "kl": 0.6822265625, "learning_rate": 1.8867924528301888e-05, "loss": 0.0273, "reward": -1.4570284724235534, "reward_std": 1.626520773768425, "rewards/Qwen2-0.5B-Reward": -1.4570284724235534, "step": 250 }, { "completion_length": 573.8166727701823, "epoch": 0.09843513377082282, "grad_norm": 0.6115075349807739, "kl": 1.6170572916666666, "learning_rate": 1.9622641509433963e-05, "loss": 0.0647, "reward": -2.9246065855026244, "reward_std": 2.250337036450704, "rewards/Qwen2-0.5B-Reward": -2.9246065855026244, "step": 260 }, { "completion_length": 773.3074178059895, "epoch": 0.102221100454316, "grad_norm": 0.31634387373924255, "kl": 0.733984375, "learning_rate": 1.999980332108064e-05, "loss": 0.0294, "reward": -1.5115862051645914, "reward_std": 1.5288376450538634, "rewards/Qwen2-0.5B-Reward": -1.5115862051645914, "step": 270 }, { "completion_length": 829.3129659016927, "epoch": 0.10600706713780919, "grad_norm": 0.25395989418029785, "kl": 0.4763671875, "learning_rate": 1.9998229941302175e-05, "loss": 0.0191, "reward": -0.9474448690811793, "reward_std": 1.09269377887249, "rewards/Qwen2-0.5B-Reward": -0.9474448690811793, "step": 280 }, { "completion_length": 999.9018575032552, "epoch": 0.10979303382130237, "grad_norm": 0.3452966809272766, "kl": 0.6498046875, "learning_rate": 1.9995083456809467e-05, "loss": 0.026, "reward": -1.7776759227116903, "reward_std": 1.8762857417265575, "rewards/Qwen2-0.5B-Reward": -1.7776759227116903, "step": 290 }, { "completion_length": 867.9375081380208, "epoch": 0.11357900050479555, "grad_norm": 0.4680746793746948, "kl": 0.98203125, "learning_rate": 1.9990364417682882e-05, "loss": 0.0393, "reward": -2.6815950234731036, "reward_std": 2.1339449683825173, "rewards/Qwen2-0.5B-Reward": -2.6815950234731036, "step": 300 }, { "completion_length": 841.0263916015625, "epoch": 0.11736496718828875, "grad_norm": 0.4002317786216736, "kl": 1.4239583333333334, "learning_rate": 1.9984073648922753e-05, "loss": 0.057, "reward": -2.9415343125661213, "reward_std": 2.4492496887842816, "rewards/Qwen2-0.5B-Reward": -2.9415343125661213, "step": 310 }, { "completion_length": 845.747226969401, "epoch": 0.12115093387178193, "grad_norm": 0.7248504757881165, "kl": 1.8854166666666667, "learning_rate": 1.997621225030515e-05, "loss": 0.0754, "reward": -3.695864470799764, "reward_std": 2.583825929959615, "rewards/Qwen2-0.5B-Reward": -3.695864470799764, "step": 320 }, { "completion_length": 1439.778253173828, "epoch": 0.12493690055527511, "grad_norm": 0.602688193321228, "kl": 1.6190104166666666, "learning_rate": 1.9966781596189623e-05, "loss": 0.0648, "reward": -3.7271327575047812, "reward_std": 1.9023333628972372, "rewards/Qwen2-0.5B-Reward": -3.7271327575047812, "step": 330 }, { "completion_length": 1148.3754781087239, "epoch": 0.1287228672387683, "grad_norm": 0.6237585544586182, "kl": 1.2373697916666666, "learning_rate": 1.9955783335278924e-05, "loss": 0.0495, "reward": -3.0485590934753417, "reward_std": 1.7622671604156495, "rewards/Qwen2-0.5B-Reward": -3.0485590934753417, "step": 340 }, { "completion_length": 871.3398213704427, "epoch": 0.13250883392226148, "grad_norm": 0.7727285623550415, "kl": 0.8674479166666667, "learning_rate": 1.9943219390330767e-05, "loss": 0.0347, "reward": -2.604492497444153, "reward_std": 1.5047667543093364, "rewards/Qwen2-0.5B-Reward": -2.604492497444153, "step": 350 }, { "completion_length": 731.1259358723959, "epoch": 0.13629480060575466, "grad_norm": 0.2828836441040039, "kl": 0.725, "learning_rate": 1.9929091957821703e-05, "loss": 0.029, "reward": -1.8863240122795104, "reward_std": 1.1796027421951294, "rewards/Qwen2-0.5B-Reward": -1.8863240122795104, "step": 360 }, { "completion_length": 593.0722300211588, "epoch": 0.14008076728924784, "grad_norm": 0.34980472922325134, "kl": 0.5994791666666667, "learning_rate": 1.9913403507563104e-05, "loss": 0.024, "reward": -1.5030529995759327, "reward_std": 0.9978658020496368, "rewards/Qwen2-0.5B-Reward": -1.5030529995759327, "step": 370 }, { "completion_length": 574.3398223876953, "epoch": 0.14386673397274105, "grad_norm": 0.44906890392303467, "kl": 0.98515625, "learning_rate": 1.9896156782269405e-05, "loss": 0.0394, "reward": -2.0863842129707337, "reward_std": 1.7282814304033915, "rewards/Qwen2-0.5B-Reward": -2.0863842129707337, "step": 380 }, { "completion_length": 598.7032470703125, "epoch": 0.14765270065623423, "grad_norm": 0.42980390787124634, "kl": 1.1180989583333334, "learning_rate": 1.9877354797078577e-05, "loss": 0.0448, "reward": -1.7375385125478109, "reward_std": 1.6146510203679403, "rewards/Qwen2-0.5B-Reward": -1.7375385125478109, "step": 390 }, { "completion_length": 522.9648213704427, "epoch": 0.1514386673397274, "grad_norm": 0.4960784614086151, "kl": 0.8545572916666667, "learning_rate": 1.9857000839025043e-05, "loss": 0.0342, "reward": -1.5845000902811686, "reward_std": 1.6743145366509755, "rewards/Qwen2-0.5B-Reward": -1.5845000902811686, "step": 400 }, { "completion_length": 650.5805603027344, "epoch": 0.1552246340232206, "grad_norm": 3.5647335052490234, "kl": 0.7625, "learning_rate": 1.983509846646502e-05, "loss": 0.0305, "reward": -1.3111775855223338, "reward_std": 1.5136757413546245, "rewards/Qwen2-0.5B-Reward": -1.3111775855223338, "step": 410 }, { "completion_length": 608.793061319987, "epoch": 0.15901060070671377, "grad_norm": 0.7843858599662781, "kl": 1.0794270833333333, "learning_rate": 1.9811651508454405e-05, "loss": 0.0432, "reward": -1.030318695306778, "reward_std": 1.3256585756937662, "rewards/Qwen2-0.5B-Reward": -1.030318695306778, "step": 420 }, { "completion_length": 717.2574086507161, "epoch": 0.16279656739020695, "grad_norm": 1.8327181339263916, "kl": 1.7510416666666666, "learning_rate": 1.97866640640794e-05, "loss": 0.07, "reward": -1.3865876078605652, "reward_std": 1.5873213092486063, "rewards/Qwen2-0.5B-Reward": -1.3865876078605652, "step": 430 }, { "completion_length": 886.4629699707032, "epoch": 0.16658253407370016, "grad_norm": 1.820997714996338, "kl": 2.492317708333333, "learning_rate": 1.9760140501739885e-05, "loss": 0.0997, "reward": -2.041487044095993, "reward_std": 1.9687125941117605, "rewards/Qwen2-0.5B-Reward": -2.041487044095993, "step": 440 }, { "completion_length": 761.4412068684895, "epoch": 0.17036850075719334, "grad_norm": 2.2856929302215576, "kl": 2.5984375, "learning_rate": 1.9732085458385706e-05, "loss": 0.104, "reward": -1.7194086611270905, "reward_std": 1.6787285923957824, "rewards/Qwen2-0.5B-Reward": -1.7194086611270905, "step": 450 }, { "completion_length": 800.9458414713541, "epoch": 0.17415446744068652, "grad_norm": 1.5514496564865112, "kl": 3.582291666666667, "learning_rate": 1.9702503838706032e-05, "loss": 0.1433, "reward": -1.918267943461736, "reward_std": 1.798914521932602, "rewards/Qwen2-0.5B-Reward": -1.918267943461736, "step": 460 }, { "completion_length": 766.8731547037761, "epoch": 0.1779404341241797, "grad_norm": 2.541083812713623, "kl": 2.43359375, "learning_rate": 1.9671400814271904e-05, "loss": 0.0973, "reward": -1.5191373944282531, "reward_std": 1.7267815709114074, "rewards/Qwen2-0.5B-Reward": -1.5191373944282531, "step": 470 }, { "completion_length": 730.6893575032552, "epoch": 0.18172640080767288, "grad_norm": 2.0177841186523438, "kl": 1.9341145833333333, "learning_rate": 1.9638781822632117e-05, "loss": 0.0774, "reward": -0.9346473336219787, "reward_std": 1.284997742374738, "rewards/Qwen2-0.5B-Reward": -0.9346473336219787, "step": 480 }, { "completion_length": 714.4213033040364, "epoch": 0.1855123674911661, "grad_norm": 1.7401411533355713, "kl": 2.3013020833333333, "learning_rate": 1.9604652566362604e-05, "loss": 0.092, "reward": -0.9350511769453684, "reward_std": 1.2963847279548646, "rewards/Qwen2-0.5B-Reward": -0.9350511769453684, "step": 490 }, { "completion_length": 718.9740814208984, "epoch": 0.18929833417465927, "grad_norm": 5.835248947143555, "kl": 2.794661458333333, "learning_rate": 1.95690190120695e-05, "loss": 0.1118, "reward": -1.0198218444983165, "reward_std": 1.4708388864994049, "rewards/Qwen2-0.5B-Reward": -1.0198218444983165, "step": 500 }, { "completion_length": 626.8421396891276, "epoch": 0.19308430085815245, "grad_norm": 2.837407112121582, "kl": 1.6803385416666667, "learning_rate": 1.9531887389346016e-05, "loss": 0.0672, "reward": -0.6805184543132782, "reward_std": 1.061421944697698, "rewards/Qwen2-0.5B-Reward": -0.6805184543132782, "step": 510 }, { "completion_length": 709.4504659016927, "epoch": 0.19687026754164563, "grad_norm": 1.3921163082122803, "kl": 2.5, "learning_rate": 1.9493264189683393e-05, "loss": 0.1, "reward": -1.0162009666363399, "reward_std": 1.3732348203659057, "rewards/Qwen2-0.5B-Reward": -1.0162009666363399, "step": 520 }, { "completion_length": 734.6662150065105, "epoch": 0.2006562342251388, "grad_norm": 0.9199353456497192, "kl": 2.3385416666666665, "learning_rate": 1.9453156165336e-05, "loss": 0.0936, "reward": -0.9758850524822871, "reward_std": 1.4469304541746775, "rewards/Qwen2-0.5B-Reward": -0.9758850524822871, "step": 530 }, { "completion_length": 685.4245402018229, "epoch": 0.204442200908632, "grad_norm": 1.9519050121307373, "kl": 1.9833333333333334, "learning_rate": 1.94115703281409e-05, "loss": 0.0793, "reward": -0.7073126316070557, "reward_std": 1.1466901183128357, "rewards/Qwen2-0.5B-Reward": -0.7073126316070557, "step": 540 }, { "completion_length": 755.8652852376302, "epoch": 0.2082281675921252, "grad_norm": 1.136602520942688, "kl": 2.7690104166666667, "learning_rate": 1.9368513948291997e-05, "loss": 0.1108, "reward": -1.0485609819491704, "reward_std": 1.5164429823557535, "rewards/Qwen2-0.5B-Reward": -1.0485609819491704, "step": 550 }, { "completion_length": 691.9601928710938, "epoch": 0.21201413427561838, "grad_norm": 1.1344228982925415, "kl": 1.9572916666666667, "learning_rate": 1.932399455306906e-05, "loss": 0.0783, "reward": -0.7905913976331552, "reward_std": 1.2394062995910644, "rewards/Qwen2-0.5B-Reward": -0.7905913976331552, "step": 560 }, { "completion_length": 774.0166727701823, "epoch": 0.21580010095911156, "grad_norm": 4.68572998046875, "kl": 2.8236979166666667, "learning_rate": 1.9278019925521744e-05, "loss": 0.113, "reward": -0.9427557557821273, "reward_std": 1.4873551627000172, "rewards/Qwen2-0.5B-Reward": -0.9427557557821273, "step": 570 }, { "completion_length": 782.1125061035157, "epoch": 0.21958606764260474, "grad_norm": 3.2951087951660156, "kl": 2.81875, "learning_rate": 1.9230598103108958e-05, "loss": 0.1127, "reward": -0.9920766482750575, "reward_std": 1.5032208581765494, "rewards/Qwen2-0.5B-Reward": -0.9920766482750575, "step": 580 }, { "completion_length": 764.4046447753906, "epoch": 0.22337203432609792, "grad_norm": 0.7878244519233704, "kl": 2.4518229166666665, "learning_rate": 1.9181737376293693e-05, "loss": 0.0981, "reward": -0.8713747123877208, "reward_std": 1.4777807037035624, "rewards/Qwen2-0.5B-Reward": -0.8713747123877208, "step": 590 }, { "completion_length": 787.2912129720052, "epoch": 0.2271580010095911, "grad_norm": 1.002245306968689, "kl": 2.3055989583333334, "learning_rate": 1.9131446287093683e-05, "loss": 0.0922, "reward": -0.914855935672919, "reward_std": 1.4097402195135753, "rewards/Qwen2-0.5B-Reward": -0.914855935672919, "step": 600 }, { "completion_length": 853.8175984700521, "epoch": 0.2309439676930843, "grad_norm": 1.2771328687667847, "kl": 3.388802083333333, "learning_rate": 1.9079733627588042e-05, "loss": 0.1356, "reward": -1.4452542603015899, "reward_std": 1.7754655241966248, "rewards/Qwen2-0.5B-Reward": -1.4452542603015899, "step": 610 }, { "completion_length": 742.8106587727865, "epoch": 0.2347299343765775, "grad_norm": 1.4351956844329834, "kl": 2.539322916666667, "learning_rate": 1.9026608438380195e-05, "loss": 0.1016, "reward": -1.0814687182505927, "reward_std": 1.450120480855306, "rewards/Qwen2-0.5B-Reward": -1.0814687182505927, "step": 620 }, { "completion_length": 683.3101959228516, "epoch": 0.23851590106007067, "grad_norm": 1.352857232093811, "kl": 1.84296875, "learning_rate": 1.897208000701737e-05, "loss": 0.0737, "reward": -0.6815965756773948, "reward_std": 1.0737029949824015, "rewards/Qwen2-0.5B-Reward": -0.6815965756773948, "step": 630 }, { "completion_length": 792.8847290039063, "epoch": 0.24230186774356385, "grad_norm": 0.9555492997169495, "kl": 2.507552083333333, "learning_rate": 1.8916157866366928e-05, "loss": 0.1003, "reward": -0.9711129138867061, "reward_std": 1.5443729062875111, "rewards/Qwen2-0.5B-Reward": -0.9711129138867061, "step": 640 }, { "completion_length": 771.8388916015625, "epoch": 0.24608783442705703, "grad_norm": 1.100447177886963, "kl": 2.43984375, "learning_rate": 1.8858851792949764e-05, "loss": 0.0976, "reward": -1.0383977095286052, "reward_std": 1.4934775571028391, "rewards/Qwen2-0.5B-Reward": -1.0383977095286052, "step": 650 }, { "completion_length": 719.6476888020833, "epoch": 0.24987380111055021, "grad_norm": 1.0007727146148682, "kl": 1.8555989583333334, "learning_rate": 1.880017180523116e-05, "loss": 0.0742, "reward": -0.844773971537749, "reward_std": 1.3539518495400746, "rewards/Qwen2-0.5B-Reward": -0.844773971537749, "step": 660 }, { "completion_length": 751.7523234049479, "epoch": 0.2536597677940434, "grad_norm": 0.8904104232788086, "kl": 1.8032552083333333, "learning_rate": 1.8740128161869308e-05, "loss": 0.0721, "reward": -0.6786784966786702, "reward_std": 1.2231530169645946, "rewards/Qwen2-0.5B-Reward": -0.6786784966786702, "step": 670 }, { "completion_length": 732.9828735351563, "epoch": 0.2574457344775366, "grad_norm": 0.9944930672645569, "kl": 2.1743489583333333, "learning_rate": 1.8678731359921856e-05, "loss": 0.087, "reward": -0.6016703399519125, "reward_std": 1.2204503317674, "rewards/Qwen2-0.5B-Reward": -0.6016703399519125, "step": 680 }, { "completion_length": 769.8699157714843, "epoch": 0.2612317011610298, "grad_norm": 1.7161378860473633, "kl": 2.68125, "learning_rate": 1.8615992133010777e-05, "loss": 0.1073, "reward": -0.9773722817500432, "reward_std": 1.5413507958253225, "rewards/Qwen2-0.5B-Reward": -0.9773722817500432, "step": 690 }, { "completion_length": 723.2902893066406, "epoch": 0.26501766784452296, "grad_norm": 1.2049915790557861, "kl": 2.252083333333333, "learning_rate": 1.855192144944586e-05, "loss": 0.0901, "reward": -0.6862340954442819, "reward_std": 1.3176872313022614, "rewards/Qwen2-0.5B-Reward": -0.6862340954442819, "step": 700 }, { "completion_length": 730.842598470052, "epoch": 0.26880363452801614, "grad_norm": 1.1353825330734253, "kl": 2.6614583333333335, "learning_rate": 1.8486530510307222e-05, "loss": 0.1064, "reward": -0.8512504202624162, "reward_std": 1.4152730743090312, "rewards/Qwen2-0.5B-Reward": -0.8512504202624162, "step": 710 }, { "completion_length": 784.4833374023438, "epoch": 0.2725896012115093, "grad_norm": 1.1364926099777222, "kl": 2.8877604166666666, "learning_rate": 1.8419830747487045e-05, "loss": 0.1155, "reward": -1.4028477271397908, "reward_std": 1.6338281035423279, "rewards/Qwen2-0.5B-Reward": -1.4028477271397908, "step": 720 }, { "completion_length": 786.0379679361979, "epoch": 0.2763755678950025, "grad_norm": 1.4071515798568726, "kl": 3.0088541666666666, "learning_rate": 1.8351833821691053e-05, "loss": 0.1204, "reward": -1.2512944350639978, "reward_std": 1.6677428344885508, "rewards/Qwen2-0.5B-Reward": -1.2512944350639978, "step": 730 }, { "completion_length": 807.7277893066406, "epoch": 0.2801615345784957, "grad_norm": 1.4424173831939697, "kl": 3.021354166666667, "learning_rate": 1.8282551620399917e-05, "loss": 0.1208, "reward": -1.225243662794431, "reward_std": 1.7895207107067108, "rewards/Qwen2-0.5B-Reward": -1.225243662794431, "step": 740 }, { "completion_length": 728.170839436849, "epoch": 0.2839475012619889, "grad_norm": 0.6950631141662598, "kl": 2.519661458333333, "learning_rate": 1.821199625579105e-05, "loss": 0.1008, "reward": -0.8639134142082184, "reward_std": 1.4788370271523794, "rewards/Qwen2-0.5B-Reward": -0.8639134142082184, "step": 750 }, { "completion_length": 679.6050984700521, "epoch": 0.2877334679454821, "grad_norm": 1.6717815399169922, "kl": 1.7360677083333333, "learning_rate": 1.8140180062621117e-05, "loss": 0.0695, "reward": -0.46732902062746384, "reward_std": 0.9378261427084605, "rewards/Qwen2-0.5B-Reward": -0.46732902062746384, "step": 760 }, { "completion_length": 783.3986165364583, "epoch": 0.2915194346289753, "grad_norm": 1.3388867378234863, "kl": 2.79609375, "learning_rate": 1.8067115596069607e-05, "loss": 0.1118, "reward": -0.9435359309117, "reward_std": 1.6089221199353536, "rewards/Qwen2-0.5B-Reward": -0.9435359309117, "step": 770 }, { "completion_length": 713.3592651367187, "epoch": 0.29530540131246846, "grad_norm": 1.2017817497253418, "kl": 2.4661458333333335, "learning_rate": 1.79928156295439e-05, "loss": 0.0986, "reward": -0.7846424505114555, "reward_std": 1.4175224483013154, "rewards/Qwen2-0.5B-Reward": -0.7846424505114555, "step": 780 }, { "completion_length": 813.8467631022136, "epoch": 0.29909136799596164, "grad_norm": 2.2606418132781982, "kl": 3.955208333333333, "learning_rate": 1.7917293152446184e-05, "loss": 0.1583, "reward": -1.4304717580477397, "reward_std": 2.023730218410492, "rewards/Qwen2-0.5B-Reward": -1.4304717580477397, "step": 790 }, { "completion_length": 701.6078796386719, "epoch": 0.3028773346794548, "grad_norm": 1.5273058414459229, "kl": 2.3984375, "learning_rate": 1.784056136790257e-05, "loss": 0.096, "reward": -0.7075912684202195, "reward_std": 1.3393534004688263, "rewards/Qwen2-0.5B-Reward": -0.7075912684202195, "step": 800 }, { "completion_length": 709.4273193359375, "epoch": 0.306663301362948, "grad_norm": 1.1304354667663574, "kl": 2.4188802083333334, "learning_rate": 1.7762633690454897e-05, "loss": 0.0968, "reward": -0.6373326261838277, "reward_std": 1.289098753531774, "rewards/Qwen2-0.5B-Reward": -0.6373326261838277, "step": 810 }, { "completion_length": 757.3930643717448, "epoch": 0.3104492680464412, "grad_norm": 1.3200254440307617, "kl": 2.48046875, "learning_rate": 1.7683523743715538e-05, "loss": 0.0993, "reward": -0.8247589614242316, "reward_std": 1.4155633012453714, "rewards/Qwen2-0.5B-Reward": -0.8247589614242316, "step": 820 }, { "completion_length": 697.1213033040365, "epoch": 0.31423523472993437, "grad_norm": 0.8467837572097778, "kl": 2.003515625, "learning_rate": 1.760324535798567e-05, "loss": 0.0802, "reward": -0.4532388661056757, "reward_std": 1.0981567233800889, "rewards/Qwen2-0.5B-Reward": -0.4532388661056757, "step": 830 }, { "completion_length": 780.1504659016927, "epoch": 0.31802120141342755, "grad_norm": 596676.75, "kl": 3364.5799479166667, "learning_rate": 1.752181256783741e-05, "loss": 134.4873, "reward": -0.9652832999825478, "reward_std": 1.6040133237838745, "rewards/Qwen2-0.5B-Reward": -0.9652832999825478, "step": 840 }, { "completion_length": 695.2185282389323, "epoch": 0.3218071680969207, "grad_norm": 1.3249794244766235, "kl": 2.296744791666667, "learning_rate": 1.7439239609660238e-05, "loss": 0.0919, "reward": -0.49953351405759655, "reward_std": 1.100526017944018, "rewards/Qwen2-0.5B-Reward": -0.49953351405759655, "step": 850 }, { "completion_length": 705.1893636067708, "epoch": 0.3255931347804139, "grad_norm": 2.2733891010284424, "kl": 2.476302083333333, "learning_rate": 1.735554091917214e-05, "loss": 0.0991, "reward": -0.7803226565321286, "reward_std": 1.427873319387436, "rewards/Qwen2-0.5B-Reward": -0.7803226565321286, "step": 860 }, { "completion_length": 724.2041748046875, "epoch": 0.32937910146390714, "grad_norm": 1.2122727632522583, "kl": 2.6158854166666665, "learning_rate": 1.7270731128895896e-05, "loss": 0.1046, "reward": -0.9140092690785726, "reward_std": 1.5725321372350056, "rewards/Qwen2-0.5B-Reward": -0.9140092690785726, "step": 870 }, { "completion_length": 736.9453796386719, "epoch": 0.3331650681474003, "grad_norm": 0.9501739740371704, "kl": 2.3080729166666667, "learning_rate": 1.7184825065600964e-05, "loss": 0.0923, "reward": -0.7457656829307476, "reward_std": 1.343357914686203, "rewards/Qwen2-0.5B-Reward": -0.7457656829307476, "step": 880 }, { "completion_length": 785.8842651367188, "epoch": 0.3369510348308935, "grad_norm": 0.9159669280052185, "kl": 2.5815104166666667, "learning_rate": 1.709783774771141e-05, "loss": 0.1033, "reward": -0.7225840290387472, "reward_std": 1.4536415020624796, "rewards/Qwen2-0.5B-Reward": -0.7225840290387472, "step": 890 }, { "completion_length": 824.7926045735677, "epoch": 0.3407370015143867, "grad_norm": 3.830165147781372, "kl": 2.72265625, "learning_rate": 1.7009784382680345e-05, "loss": 0.1089, "reward": -0.9060644646485646, "reward_std": 1.5053735852241517, "rewards/Qwen2-0.5B-Reward": -0.9060644646485646, "step": 900 }, { "completion_length": 788.6736124674479, "epoch": 0.34452296819787986, "grad_norm": 1.977720022201538, "kl": 2.8015625, "learning_rate": 1.692068036433128e-05, "loss": 0.1121, "reward": -0.7987352999548117, "reward_std": 1.52867697874705, "rewards/Qwen2-0.5B-Reward": -0.7987352999548117, "step": 910 }, { "completion_length": 740.3379699707032, "epoch": 0.34830893488137304, "grad_norm": 170.5952911376953, "kl": 2.7221354166666667, "learning_rate": 1.6830541270166928e-05, "loss": 0.1088, "reward": -0.9519633074601491, "reward_std": 1.5265244921048482, "rewards/Qwen2-0.5B-Reward": -0.9519633074601491, "step": 920 }, { "completion_length": 715.4027811686198, "epoch": 0.3520949015648662, "grad_norm": 1.736777663230896, "kl": 2.2143229166666667, "learning_rate": 1.673938285864588e-05, "loss": 0.0886, "reward": -0.5707902121047179, "reward_std": 1.127177753051122, "rewards/Qwen2-0.5B-Reward": -0.5707902121047179, "step": 930 }, { "completion_length": 812.7449137369791, "epoch": 0.3558808682483594, "grad_norm": 2.1414971351623535, "kl": 2.667708333333333, "learning_rate": 1.664722106642767e-05, "loss": 0.1066, "reward": -0.9589705864588419, "reward_std": 1.527525293827057, "rewards/Qwen2-0.5B-Reward": -0.9589705864588419, "step": 940 }, { "completion_length": 769.1416849772136, "epoch": 0.3596668349318526, "grad_norm": 4.496264457702637, "kl": 2.38359375, "learning_rate": 1.6554072005586638e-05, "loss": 0.0953, "reward": -0.5887288892020782, "reward_std": 1.23007483681043, "rewards/Qwen2-0.5B-Reward": -0.5887288892020782, "step": 950 }, { "completion_length": 766.1638936360677, "epoch": 0.36345280161534577, "grad_norm": 1.2970997095108032, "kl": 2.5834635416666667, "learning_rate": 1.6459951960795185e-05, "loss": 0.1033, "reward": -0.7721572608997425, "reward_std": 1.4835912009080252, "rewards/Qwen2-0.5B-Reward": -0.7721572608997425, "step": 960 }, { "completion_length": 751.2708374023438, "epoch": 0.36723876829883895, "grad_norm": 2.702152967453003, "kl": 2.40390625, "learning_rate": 1.6364877386476804e-05, "loss": 0.0961, "reward": -0.7570990284283956, "reward_std": 1.4351972460746765, "rewards/Qwen2-0.5B-Reward": -0.7570990284283956, "step": 970 }, { "completion_length": 730.1676005045573, "epoch": 0.3710247349823322, "grad_norm": 1.0240263938903809, "kl": 2.5716145833333335, "learning_rate": 1.6268864903929466e-05, "loss": 0.1029, "reward": -0.6520452598730723, "reward_std": 1.3328065713246664, "rewards/Qwen2-0.5B-Reward": -0.6520452598730723, "step": 980 }, { "completion_length": 738.0222351074219, "epoch": 0.37481070166582536, "grad_norm": 0.9893134832382202, "kl": 2.990104166666667, "learning_rate": 1.617193129841982e-05, "loss": 0.1196, "reward": -0.973382901151975, "reward_std": 1.5284679671128591, "rewards/Qwen2-0.5B-Reward": -0.973382901151975, "step": 990 }, { "completion_length": 767.7051005045573, "epoch": 0.37859666834931854, "grad_norm": 1.4028962850570679, "kl": 3.0208333333333335, "learning_rate": 1.6074093516248726e-05, "loss": 0.1208, "reward": -0.8820533196131388, "reward_std": 1.5515558183193208, "rewards/Qwen2-0.5B-Reward": -0.8820533196131388, "step": 1000 }, { "completion_length": 729.3245463053386, "epoch": 0.3823826350328117, "grad_norm": 1.1494252681732178, "kl": 2.1536458333333335, "learning_rate": 1.5975368661788636e-05, "loss": 0.0861, "reward": -0.617452886607498, "reward_std": 1.2075418949127197, "rewards/Qwen2-0.5B-Reward": -0.617452886607498, "step": 1010 }, { "completion_length": 711.6662150065105, "epoch": 0.3861686017163049, "grad_norm": 0.9261192083358765, "kl": 2.349739583333333, "learning_rate": 1.587577399449336e-05, "loss": 0.094, "reward": -0.6707314955691497, "reward_std": 1.2855535586675009, "rewards/Qwen2-0.5B-Reward": -0.6707314955691497, "step": 1020 }, { "completion_length": 751.3074096679687, "epoch": 0.3899545683997981, "grad_norm": 2.042595148086548, "kl": 2.3372395833333335, "learning_rate": 1.5775326925880675e-05, "loss": 0.0935, "reward": -0.6637267053127289, "reward_std": 1.3381904661655426, "rewards/Qwen2-0.5B-Reward": -0.6637267053127289, "step": 1030 }, { "completion_length": 776.1870402018229, "epoch": 0.39374053508329127, "grad_norm": 1.2383322715759277, "kl": 5.3609375, "learning_rate": 1.5674045016488397e-05, "loss": 0.2142, "reward": -0.6239150881767273, "reward_std": 1.3248741805553437, "rewards/Qwen2-0.5B-Reward": -0.6239150881767273, "step": 1040 }, { "completion_length": 718.260194905599, "epoch": 0.39752650176678445, "grad_norm": 1.5840164422988892, "kl": 2.3580729166666665, "learning_rate": 1.5571945972804376e-05, "loss": 0.0943, "reward": -0.5199564640720685, "reward_std": 1.2036932865778605, "rewards/Qwen2-0.5B-Reward": -0.5199564640720685, "step": 1050 }, { "completion_length": 809.6148213704427, "epoch": 0.4013124684502776, "grad_norm": 1.5066214799880981, "kl": 3.0403645833333335, "learning_rate": 1.546904764417098e-05, "loss": 0.1216, "reward": -0.9776304622491201, "reward_std": 1.6650471250216166, "rewards/Qwen2-0.5B-Reward": -0.9776304622491201, "step": 1060 }, { "completion_length": 766.8726867675781, "epoch": 0.4050984351337708, "grad_norm": 1.4285918474197388, "kl": 2.3622395833333334, "learning_rate": 1.5365368019664618e-05, "loss": 0.0945, "reward": -0.650248110294342, "reward_std": 1.3134302516778311, "rewards/Qwen2-0.5B-Reward": -0.650248110294342, "step": 1070 }, { "completion_length": 778.1921325683594, "epoch": 0.408884401817264, "grad_norm": 1.9540224075317383, "kl": 2.269270833333333, "learning_rate": 1.5260925224950785e-05, "loss": 0.0908, "reward": -0.5108215274910132, "reward_std": 1.1806359807650249, "rewards/Qwen2-0.5B-Reward": -0.5108215274910132, "step": 1080 }, { "completion_length": 806.8787089029948, "epoch": 0.41267036850075717, "grad_norm": 0.9543392062187195, "kl": 2.60859375, "learning_rate": 1.5155737519115308e-05, "loss": 0.1043, "reward": -0.8536549975474675, "reward_std": 1.4883501867453257, "rewards/Qwen2-0.5B-Reward": -0.8536549975474675, "step": 1090 }, { "completion_length": 790.8611124674479, "epoch": 0.4164563351842504, "grad_norm": 1.6240158081054688, "kl": 2.213671875, "learning_rate": 1.5049823291472195e-05, "loss": 0.0885, "reward": -0.5210499677807092, "reward_std": 1.2201600551605225, "rewards/Qwen2-0.5B-Reward": -0.5210499677807092, "step": 1100 }, { "completion_length": 844.4666748046875, "epoch": 0.4202423018677436, "grad_norm": 0.7703062891960144, "kl": 3.1419270833333335, "learning_rate": 1.494320105834876e-05, "loss": 0.1257, "reward": -1.1577677488327027, "reward_std": 1.7909785747528075, "rewards/Qwen2-0.5B-Reward": -1.1577677488327027, "step": 1110 }, { "completion_length": 873.2398213704427, "epoch": 0.42402826855123676, "grad_norm": 1.8059611320495605, "kl": 3.25859375, "learning_rate": 1.4835889459848517e-05, "loss": 0.1304, "reward": -0.9918207342425982, "reward_std": 1.6435052702824275, "rewards/Qwen2-0.5B-Reward": -0.9918207342425982, "step": 1120 }, { "completion_length": 883.4926025390625, "epoch": 0.42781423523472994, "grad_norm": 1.4837961196899414, "kl": 2.7075520833333333, "learning_rate": 1.472790725659245e-05, "loss": 0.1083, "reward": -0.7034151526788871, "reward_std": 1.3653341392676035, "rewards/Qwen2-0.5B-Reward": -0.7034151526788871, "step": 1130 }, { "completion_length": 779.6824117024739, "epoch": 0.4316002019182231, "grad_norm": 1.1727573871612549, "kl": 2.1869791666666667, "learning_rate": 1.4619273326439229e-05, "loss": 0.0875, "reward": -0.6506599500775337, "reward_std": 1.3229804019133249, "rewards/Qwen2-0.5B-Reward": -0.6506599500775337, "step": 1140 }, { "completion_length": 829.1185241699219, "epoch": 0.4353861686017163, "grad_norm": 0.974542498588562, "kl": 2.659375, "learning_rate": 1.4510006661184867e-05, "loss": 0.1064, "reward": -0.7578525463740031, "reward_std": 1.531895116964976, "rewards/Qwen2-0.5B-Reward": -0.7578525463740031, "step": 1150 }, { "completion_length": 796.5884318033854, "epoch": 0.4391721352852095, "grad_norm": 1.2544572353363037, "kl": 2.2998697916666666, "learning_rate": 1.440012636324255e-05, "loss": 0.092, "reward": -0.6453255646862089, "reward_std": 1.2682056347529094, "rewards/Qwen2-0.5B-Reward": -0.6453255646862089, "step": 1160 }, { "completion_length": 656.2717681884766, "epoch": 0.44295810196870267, "grad_norm": 1.7041164636611938, "kl": 1.9328125, "learning_rate": 1.4289651642303055e-05, "loss": 0.0773, "reward": -0.3406788529828191, "reward_std": 1.0103827198346456, "rewards/Qwen2-0.5B-Reward": -0.3406788529828191, "step": 1170 }, { "completion_length": 733.9171315511068, "epoch": 0.44674406865219585, "grad_norm": 0.846507728099823, "kl": 2.328125, "learning_rate": 1.4178601811976435e-05, "loss": 0.0931, "reward": -0.4902394848565261, "reward_std": 1.2676184395949046, "rewards/Qwen2-0.5B-Reward": -0.4902394848565261, "step": 1180 }, { "completion_length": 768.9398234049479, "epoch": 0.450530035335689, "grad_norm": 0.7115055322647095, "kl": 2.668229166666667, "learning_rate": 1.4066996286415562e-05, "loss": 0.1068, "reward": -0.7519384076197942, "reward_std": 1.4289092858632406, "rewards/Qwen2-0.5B-Reward": -0.7519384076197942, "step": 1190 }, { "completion_length": 825.0884297688802, "epoch": 0.4543160020191822, "grad_norm": 1.200706958770752, "kl": 3.144791666666667, "learning_rate": 1.3954854576922052e-05, "loss": 0.1258, "reward": -1.0908042828241984, "reward_std": 1.6707689007123312, "rewards/Qwen2-0.5B-Reward": -1.0908042828241984, "step": 1200 }, { "completion_length": 705.2310302734375, "epoch": 0.45810196870267544, "grad_norm": 1.3045536279678345, "kl": 2.23046875, "learning_rate": 1.3842196288535226e-05, "loss": 0.0893, "reward": -0.5541289503375689, "reward_std": 1.264378293355306, "rewards/Qwen2-0.5B-Reward": -0.5541289503375689, "step": 1210 }, { "completion_length": 662.7287109375, "epoch": 0.4618879353861686, "grad_norm": 1.1240729093551636, "kl": 1.7548177083333334, "learning_rate": 1.3729041116604697e-05, "loss": 0.0702, "reward": -0.33847450762987136, "reward_std": 1.030816239118576, "rewards/Qwen2-0.5B-Reward": -0.33847450762987136, "step": 1220 }, { "completion_length": 723.0296376546224, "epoch": 0.4656739020696618, "grad_norm": 2.3360471725463867, "kl": 2.3111979166666665, "learning_rate": 1.3615408843347141e-05, "loss": 0.0924, "reward": -0.5807175462444624, "reward_std": 1.3384559114774068, "rewards/Qwen2-0.5B-Reward": -0.5807175462444624, "step": 1230 }, { "completion_length": 751.2430704752604, "epoch": 0.469459868753155, "grad_norm": 2.823309898376465, "kl": 2.7513020833333335, "learning_rate": 1.3501319334387902e-05, "loss": 0.1101, "reward": -0.8531121673683325, "reward_std": 1.5220951795578004, "rewards/Qwen2-0.5B-Reward": -0.8531121673683325, "step": 1240 }, { "completion_length": 790.4842651367187, "epoch": 0.47324583543664817, "grad_norm": 1.8123273849487305, "kl": 3.0010416666666666, "learning_rate": 1.3386792535287997e-05, "loss": 0.1201, "reward": -0.9698835199077924, "reward_std": 1.6139462788899739, "rewards/Qwen2-0.5B-Reward": -0.9698835199077924, "step": 1250 }, { "completion_length": 734.1004720052083, "epoch": 0.47703180212014135, "grad_norm": 0.6924867033958435, "kl": 2.5669270833333333, "learning_rate": 1.3271848468057176e-05, "loss": 0.1027, "reward": -0.6089021896322568, "reward_std": 1.2572330633799236, "rewards/Qwen2-0.5B-Reward": -0.6089021896322568, "step": 1260 }, { "completion_length": 740.8490763346355, "epoch": 0.4808177688036345, "grad_norm": 1.0355186462402344, "kl": 2.7315104166666666, "learning_rate": 1.3156507227653582e-05, "loss": 0.1093, "reward": -0.5665054028232892, "reward_std": 1.3232530683279038, "rewards/Qwen2-0.5B-Reward": -0.5665054028232892, "step": 1270 }, { "completion_length": 721.3925944010417, "epoch": 0.4846037354871277, "grad_norm": 1.0751088857650757, "kl": 2.77890625, "learning_rate": 1.3040788978470678e-05, "loss": 0.1111, "reward": -0.617917682370171, "reward_std": 1.3952182014783223, "rewards/Qwen2-0.5B-Reward": -0.617917682370171, "step": 1280 }, { "completion_length": 743.2884348551432, "epoch": 0.4883897021706209, "grad_norm": 1.7289220094680786, "kl": 2.8721354166666666, "learning_rate": 1.2924713950812033e-05, "loss": 0.1148, "reward": -0.6107141558701793, "reward_std": 1.3133805135885874, "rewards/Qwen2-0.5B-Reward": -0.6107141558701793, "step": 1290 }, { "completion_length": 744.837967936198, "epoch": 0.49217566885411407, "grad_norm": 0.9980621337890625, "kl": 2.6927083333333335, "learning_rate": 1.280830243735459e-05, "loss": 0.1077, "reward": -0.6816005217532317, "reward_std": 1.3647177835305533, "rewards/Qwen2-0.5B-Reward": -0.6816005217532317, "step": 1300 }, { "completion_length": 765.5287109375, "epoch": 0.49596163553760725, "grad_norm": 1.5100042819976807, "kl": 3.23359375, "learning_rate": 1.2691574789601006e-05, "loss": 0.1293, "reward": -0.7456285426393151, "reward_std": 1.504830890893936, "rewards/Qwen2-0.5B-Reward": -0.7456285426393151, "step": 1310 }, { "completion_length": 776.5162089029948, "epoch": 0.49974760222110043, "grad_norm": 3.0420119762420654, "kl": 2.664322916666667, "learning_rate": 1.2574551414321749e-05, "loss": 0.1066, "reward": -0.6133380237966776, "reward_std": 1.4030099928379058, "rewards/Qwen2-0.5B-Reward": -0.6133380237966776, "step": 1320 }, { "completion_length": 756.2375101725261, "epoch": 0.5035335689045937, "grad_norm": 1.2776826620101929, "kl": 2.5111979166666667, "learning_rate": 1.2457252769987485e-05, "loss": 0.1005, "reward": -0.4735676831565797, "reward_std": 1.2207833151022593, "rewards/Qwen2-0.5B-Reward": -0.4735676831565797, "step": 1330 }, { "completion_length": 780.6055643717448, "epoch": 0.5073195355880868, "grad_norm": 1.277037262916565, "kl": 2.29453125, "learning_rate": 1.2339699363192461e-05, "loss": 0.0918, "reward": -0.41186855093886454, "reward_std": 1.1698833445707957, "rewards/Qwen2-0.5B-Reward": -0.41186855093886454, "step": 1340 }, { "completion_length": 814.8995402018229, "epoch": 0.51110550227158, "grad_norm": 1.1098392009735107, "kl": 2.9515625, "learning_rate": 1.2221911745069473e-05, "loss": 0.118, "reward": -0.7255906278888384, "reward_std": 1.5052427490552267, "rewards/Qwen2-0.5B-Reward": -0.7255906278888384, "step": 1350 }, { "completion_length": 800.6180623372396, "epoch": 0.5148914689550732, "grad_norm": 1.5379681587219238, "kl": 3.078385416666667, "learning_rate": 1.210391050769702e-05, "loss": 0.1231, "reward": -0.9011206914981206, "reward_std": 1.5988249023755392, "rewards/Qwen2-0.5B-Reward": -0.9011206914981206, "step": 1360 }, { "completion_length": 787.152783203125, "epoch": 0.5186774356385664, "grad_norm": 1.421747088432312, "kl": 2.74453125, "learning_rate": 1.1985716280499338e-05, "loss": 0.1098, "reward": -0.7614536421994368, "reward_std": 1.4081373771031698, "rewards/Qwen2-0.5B-Reward": -0.7614536421994368, "step": 1370 }, { "completion_length": 842.5861185709635, "epoch": 0.5224634023220596, "grad_norm": 2.403327226638794, "kl": 3.16171875, "learning_rate": 1.1867349726639868e-05, "loss": 0.1266, "reward": -0.8059929932157198, "reward_std": 1.487107406059901, "rewards/Qwen2-0.5B-Reward": -0.8059929932157198, "step": 1380 }, { "completion_length": 793.2569539388021, "epoch": 0.5262493690055527, "grad_norm": 1.0243574380874634, "kl": 3.30625, "learning_rate": 1.1748831539408863e-05, "loss": 0.1323, "reward": -0.9990609556436538, "reward_std": 1.641613002618154, "rewards/Qwen2-0.5B-Reward": -0.9990609556436538, "step": 1390 }, { "completion_length": 781.2513997395833, "epoch": 0.5300353356890459, "grad_norm": 1.4023561477661133, "kl": 2.6401041666666667, "learning_rate": 1.1630182438605688e-05, "loss": 0.1056, "reward": -0.73541273077329, "reward_std": 1.391848737001419, "rewards/Qwen2-0.5B-Reward": -0.73541273077329, "step": 1400 }, { "completion_length": 777.1314880371094, "epoch": 0.5338213023725391, "grad_norm": 1.4984385967254639, "kl": 3.1786458333333334, "learning_rate": 1.151142316691652e-05, "loss": 0.1273, "reward": -0.9620630964636803, "reward_std": 1.6180862605571746, "rewards/Qwen2-0.5B-Reward": -0.9620630964636803, "step": 1410 }, { "completion_length": 753.6296305338542, "epoch": 0.5376072690560323, "grad_norm": 0.6080305576324463, "kl": 2.93046875, "learning_rate": 1.1392574486288026e-05, "loss": 0.1172, "reward": -0.6871781093068421, "reward_std": 1.4368105371793112, "rewards/Qwen2-0.5B-Reward": -0.6871781093068421, "step": 1420 }, { "completion_length": 755.0680684407552, "epoch": 0.5413932357395255, "grad_norm": 0.9181307554244995, "kl": 2.5361979166666666, "learning_rate": 1.1273657174297687e-05, "loss": 0.1016, "reward": -0.41866928230350214, "reward_std": 1.193355711301168, "rewards/Qwen2-0.5B-Reward": -0.41866928230350214, "step": 1430 }, { "completion_length": 739.3643595377604, "epoch": 0.5451792024230186, "grad_norm": 1.3852412700653076, "kl": 2.4328125, "learning_rate": 1.1154692020521379e-05, "loss": 0.0973, "reward": -0.42044620849192144, "reward_std": 1.1699665983517964, "rewards/Qwen2-0.5B-Reward": -0.42044620849192144, "step": 1440 }, { "completion_length": 787.2263997395834, "epoch": 0.5489651691065118, "grad_norm": 1.2610223293304443, "kl": 2.7135416666666665, "learning_rate": 1.1035699822898852e-05, "loss": 0.1085, "reward": -0.5719452144578099, "reward_std": 1.3674102127552032, "rewards/Qwen2-0.5B-Reward": -0.5719452144578099, "step": 1450 }, { "completion_length": 794.1407450358073, "epoch": 0.552751135790005, "grad_norm": 3.987548351287842, "kl": 3.580208333333333, "learning_rate": 1.091670138409778e-05, "loss": 0.1432, "reward": -0.913334188858668, "reward_std": 1.654043678442637, "rewards/Qwen2-0.5B-Reward": -0.913334188858668, "step": 1460 }, { "completion_length": 755.6018575032552, "epoch": 0.5565371024734982, "grad_norm": 1.312009334564209, "kl": 2.115364583333333, "learning_rate": 1.0797717507876926e-05, "loss": 0.0846, "reward": -0.605161217538019, "reward_std": 1.230643669764201, "rewards/Qwen2-0.5B-Reward": -0.605161217538019, "step": 1470 }, { "completion_length": 745.5069519042969, "epoch": 0.5603230691569914, "grad_norm": 1.5958776473999023, "kl": 2.8216145833333335, "learning_rate": 1.0678768995449179e-05, "loss": 0.1129, "reward": -0.5114948400606711, "reward_std": 1.213375515739123, "rewards/Qwen2-0.5B-Reward": -0.5114948400606711, "step": 1480 }, { "completion_length": 790.2676025390625, "epoch": 0.5641090358404846, "grad_norm": 1.0892456769943237, "kl": 2.9091145833333334, "learning_rate": 1.055987664184499e-05, "loss": 0.1164, "reward": -0.6985714793205261, "reward_std": 1.441979839404424, "rewards/Qwen2-0.5B-Reward": -0.6985714793205261, "step": 1490 }, { "completion_length": 768.1861206054688, "epoch": 0.5678950025239778, "grad_norm": 1.5841772556304932, "kl": 2.4014322916666666, "learning_rate": 1.0441061232276914e-05, "loss": 0.096, "reward": -0.5361925270253172, "reward_std": 1.2279207597176234, "rewards/Qwen2-0.5B-Reward": -0.5361925270253172, "step": 1500 }, { "completion_length": 790.4763977050782, "epoch": 0.571680969207471, "grad_norm": 1.4919512271881104, "kl": 2.945052083333333, "learning_rate": 1.0322343538505859e-05, "loss": 0.1178, "reward": -0.6917820642391841, "reward_std": 1.41629096865654, "rewards/Qwen2-0.5B-Reward": -0.6917820642391841, "step": 1510 }, { "completion_length": 732.1388956705729, "epoch": 0.5754669358909642, "grad_norm": 1.3066332340240479, "kl": 2.5669270833333333, "learning_rate": 1.0203744315209683e-05, "loss": 0.1026, "reward": -0.4832228126314779, "reward_std": 1.21365185379982, "rewards/Qwen2-0.5B-Reward": -0.4832228126314779, "step": 1520 }, { "completion_length": 777.1152811686198, "epoch": 0.5792529025744574, "grad_norm": 2.0675883293151855, "kl": 3.0052083333333335, "learning_rate": 1.0085284296354784e-05, "loss": 0.1202, "reward": -0.7202197993795078, "reward_std": 1.480885813633601, "rewards/Qwen2-0.5B-Reward": -0.7202197993795078, "step": 1530 }, { "completion_length": 810.0231577555338, "epoch": 0.5830388692579506, "grad_norm": 1.1669964790344238, "kl": 3.351822916666667, "learning_rate": 9.966984191571318e-06, "loss": 0.1341, "reward": -0.9308211114102354, "reward_std": 1.5289963026841482, "rewards/Qwen2-0.5B-Reward": -0.9308211114102354, "step": 1540 }, { "completion_length": 803.5157470703125, "epoch": 0.5868248359414437, "grad_norm": 1.2970937490463257, "kl": 2.9263020833333333, "learning_rate": 9.848864682532654e-06, "loss": 0.1171, "reward": -0.897743321955204, "reward_std": 1.4250325242678323, "rewards/Qwen2-0.5B-Reward": -0.897743321955204, "step": 1550 }, { "completion_length": 774.900467936198, "epoch": 0.5906108026249369, "grad_norm": 1.5224976539611816, "kl": 3.23203125, "learning_rate": 9.730946419339721e-06, "loss": 0.1293, "reward": -0.8313487897316615, "reward_std": 1.4089517414569854, "rewards/Qwen2-0.5B-Reward": -0.8313487897316615, "step": 1560 }, { "completion_length": 814.4111185709636, "epoch": 0.5943967693084301, "grad_norm": 1.5672080516815186, "kl": 2.9359375, "learning_rate": 9.613250016910894e-06, "loss": 0.1174, "reward": -0.7221511860688528, "reward_std": 1.3432387212912242, "rewards/Qwen2-0.5B-Reward": -0.7221511860688528, "step": 1570 }, { "completion_length": 776.6129638671875, "epoch": 0.5981827359919233, "grad_norm": 1.8100062608718872, "kl": 2.7890625, "learning_rate": 9.495796051377997e-06, "loss": 0.1115, "reward": -0.8584653136630853, "reward_std": 1.3234432935714722, "rewards/Qwen2-0.5B-Reward": -0.8584653136630853, "step": 1580 }, { "completion_length": 825.8180603027344, "epoch": 0.6019687026754165, "grad_norm": 1.6404787302017212, "kl": 3.863541666666667, "learning_rate": 9.378605056489128e-06, "loss": 0.1545, "reward": -1.263607233762741, "reward_std": 1.8019790093104044, "rewards/Qwen2-0.5B-Reward": -1.263607233762741, "step": 1590 }, { "completion_length": 728.1912089029948, "epoch": 0.6057546693589096, "grad_norm": 0.8878143429756165, "kl": 2.6088541666666667, "learning_rate": 9.261697520018849e-06, "loss": 0.1044, "reward": -0.42785762051741283, "reward_std": 1.0820347189903259, "rewards/Qwen2-0.5B-Reward": -0.42785762051741283, "step": 1600 }, { "completion_length": 747.9509358723958, "epoch": 0.6095406360424028, "grad_norm": 1.613976240158081, "kl": 2.468489583333333, "learning_rate": 9.145093880186451e-06, "loss": 0.0988, "reward": -0.41555683029194673, "reward_std": 1.179705987373988, "rewards/Qwen2-0.5B-Reward": -0.41555683029194673, "step": 1610 }, { "completion_length": 787.43056640625, "epoch": 0.613326602725896, "grad_norm": 0.5864226818084717, "kl": 2.894270833333333, "learning_rate": 9.028814522082857e-06, "loss": 0.1157, "reward": -0.6661467840274174, "reward_std": 1.412223219871521, "rewards/Qwen2-0.5B-Reward": -0.6661467840274174, "step": 1620 }, { "completion_length": 742.3319498697916, "epoch": 0.6171125694093892, "grad_norm": 1.7149267196655273, "kl": 2.7528645833333334, "learning_rate": 8.912879774106832e-06, "loss": 0.1101, "reward": -0.560060964524746, "reward_std": 1.2752733170986175, "rewards/Qwen2-0.5B-Reward": -0.560060964524746, "step": 1630 }, { "completion_length": 750.7245422363281, "epoch": 0.6208985360928824, "grad_norm": 2.106180191040039, "kl": 2.40546875, "learning_rate": 8.797309904411087e-06, "loss": 0.0962, "reward": -0.416633996165668, "reward_std": 1.1659721612930298, "rewards/Qwen2-0.5B-Reward": -0.416633996165668, "step": 1640 }, { "completion_length": 778.6088033040364, "epoch": 0.6246845027763755, "grad_norm": 1.4638694524765015, "kl": 2.676041666666667, "learning_rate": 8.682125117358927e-06, "loss": 0.1071, "reward": -0.6446437170108159, "reward_std": 1.3279209415117899, "rewards/Qwen2-0.5B-Reward": -0.6446437170108159, "step": 1650 }, { "completion_length": 808.040283203125, "epoch": 0.6284704694598687, "grad_norm": 1.1022939682006836, "kl": 3.4580729166666666, "learning_rate": 8.567345549992045e-06, "loss": 0.1383, "reward": -0.7954719786842664, "reward_std": 1.4967798054218293, "rewards/Qwen2-0.5B-Reward": -0.7954719786842664, "step": 1660 }, { "completion_length": 757.4060282389323, "epoch": 0.6322564361433619, "grad_norm": 2.4723708629608154, "kl": 2.792708333333333, "learning_rate": 8.4529912685101e-06, "loss": 0.1117, "reward": -0.5523949672778448, "reward_std": 1.3249893307685852, "rewards/Qwen2-0.5B-Reward": -0.5523949672778448, "step": 1670 }, { "completion_length": 762.1398213704427, "epoch": 0.6360424028268551, "grad_norm": 0.8709607720375061, "kl": 2.8286458333333333, "learning_rate": 8.33908226476265e-06, "loss": 0.1132, "reward": -0.5545504409819841, "reward_std": 1.3114221652348836, "rewards/Qwen2-0.5B-Reward": -0.5545504409819841, "step": 1680 }, { "completion_length": 823.0356526692708, "epoch": 0.6398283695103483, "grad_norm": 0.969098687171936, "kl": 2.855729166666667, "learning_rate": 8.22563845275411e-06, "loss": 0.1142, "reward": -0.7070573056737582, "reward_std": 1.3873663266499838, "rewards/Qwen2-0.5B-Reward": -0.7070573056737582, "step": 1690 }, { "completion_length": 810.1981506347656, "epoch": 0.6436143361938415, "grad_norm": 1.2305635213851929, "kl": 3.793229166666667, "learning_rate": 8.11267966516231e-06, "loss": 0.1518, "reward": -1.061463608344396, "reward_std": 1.7348846475283304, "rewards/Qwen2-0.5B-Reward": -1.061463608344396, "step": 1700 }, { "completion_length": 776.243983968099, "epoch": 0.6474003028773346, "grad_norm": 1.6688897609710693, "kl": 2.94375, "learning_rate": 8.000225649871272e-06, "loss": 0.1177, "reward": -0.7328139250477155, "reward_std": 1.4019733607769012, "rewards/Qwen2-0.5B-Reward": -0.7328139250477155, "step": 1710 }, { "completion_length": 782.6092692057292, "epoch": 0.6511862695608278, "grad_norm": 2.184279680252075, "kl": 3.275260416666667, "learning_rate": 7.888296066518806e-06, "loss": 0.131, "reward": -0.826190093656381, "reward_std": 1.539618053038915, "rewards/Qwen2-0.5B-Reward": -0.826190093656381, "step": 1720 }, { "completion_length": 707.3263997395833, "epoch": 0.6549722362443211, "grad_norm": 2.3973989486694336, "kl": 2.400260416666667, "learning_rate": 7.776910483059543e-06, "loss": 0.096, "reward": -0.5184978457788626, "reward_std": 1.1560731967290243, "rewards/Qwen2-0.5B-Reward": -0.5184978457788626, "step": 1730 }, { "completion_length": 734.9152872721354, "epoch": 0.6587582029278143, "grad_norm": 1.8029112815856934, "kl": 2.9859375, "learning_rate": 7.666088372343984e-06, "loss": 0.1194, "reward": -0.5925529218278826, "reward_std": 1.267720968524615, "rewards/Qwen2-0.5B-Reward": -0.5925529218278826, "step": 1740 }, { "completion_length": 807.6726928710938, "epoch": 0.6625441696113075, "grad_norm": 1.5247033834457397, "kl": 3.3872395833333333, "learning_rate": 7.555849108714192e-06, "loss": 0.1355, "reward": -0.7715960969527562, "reward_std": 1.4897764484087626, "rewards/Qwen2-0.5B-Reward": -0.7715960969527562, "step": 1750 }, { "completion_length": 776.8838073730469, "epoch": 0.6663301362948006, "grad_norm": 1.9940361976623535, "kl": 2.837760416666667, "learning_rate": 7.4462119646166855e-06, "loss": 0.1136, "reward": -0.7241511250535647, "reward_std": 1.4011840164661407, "rewards/Qwen2-0.5B-Reward": -0.7241511250535647, "step": 1760 }, { "completion_length": 767.8162129720052, "epoch": 0.6701161029782938, "grad_norm": 1.5367672443389893, "kl": 3.5140625, "learning_rate": 7.337196107233155e-06, "loss": 0.1407, "reward": -0.7663616319497426, "reward_std": 1.5210982898871104, "rewards/Qwen2-0.5B-Reward": -0.7663616319497426, "step": 1770 }, { "completion_length": 721.7675944010417, "epoch": 0.673902069661787, "grad_norm": 1.302241563796997, "kl": 2.931510416666667, "learning_rate": 7.228820595129604e-06, "loss": 0.1172, "reward": -0.725257391979297, "reward_std": 1.334197594722112, "rewards/Qwen2-0.5B-Reward": -0.725257391979297, "step": 1780 }, { "completion_length": 720.3171468098958, "epoch": 0.6776880363452802, "grad_norm": 0.8652080297470093, "kl": 3.028125, "learning_rate": 7.12110437492443e-06, "loss": 0.1211, "reward": -0.753487682590882, "reward_std": 1.4118338882923127, "rewards/Qwen2-0.5B-Reward": -0.753487682590882, "step": 1790 }, { "completion_length": 744.602783203125, "epoch": 0.6814740030287734, "grad_norm": 0.6850081086158752, "kl": 3.18046875, "learning_rate": 7.014066277976128e-06, "loss": 0.1272, "reward": -0.6332276176661253, "reward_std": 1.3656011939048767, "rewards/Qwen2-0.5B-Reward": -0.6332276176661253, "step": 1800 }, { "completion_length": 759.4481526692708, "epoch": 0.6852599697122665, "grad_norm": 2.0515530109405518, "kl": 3.3453125, "learning_rate": 6.9077250170911005e-06, "loss": 0.1338, "reward": -0.8095526337623596, "reward_std": 1.5075600425402322, "rewards/Qwen2-0.5B-Reward": -0.8095526337623596, "step": 1810 }, { "completion_length": 723.5777811686198, "epoch": 0.6890459363957597, "grad_norm": 0.7833884358406067, "kl": 2.9953125, "learning_rate": 6.802099183252235e-06, "loss": 0.1198, "reward": -0.7537414369483789, "reward_std": 1.383406792084376, "rewards/Qwen2-0.5B-Reward": -0.7537414369483789, "step": 1820 }, { "completion_length": 724.8837972005208, "epoch": 0.6928319030792529, "grad_norm": 0.9831650853157043, "kl": 2.5338541666666665, "learning_rate": 6.697207242368742e-06, "loss": 0.1013, "reward": -0.43006037194281815, "reward_std": 1.1635287086168924, "rewards/Qwen2-0.5B-Reward": -0.43006037194281815, "step": 1830 }, { "completion_length": 760.3333374023438, "epoch": 0.6966178697627461, "grad_norm": 1.1536668539047241, "kl": 2.6203125, "learning_rate": 6.593067532047882e-06, "loss": 0.1049, "reward": -0.4441113060961167, "reward_std": 1.1987637420495352, "rewards/Qwen2-0.5B-Reward": -0.4441113060961167, "step": 1840 }, { "completion_length": 749.903251139323, "epoch": 0.7004038364462393, "grad_norm": 0.8368715643882751, "kl": 2.5341145833333334, "learning_rate": 6.489698258389107e-06, "loss": 0.1013, "reward": -0.5944258317351341, "reward_std": 1.3474121958017349, "rewards/Qwen2-0.5B-Reward": -0.5944258317351341, "step": 1850 }, { "completion_length": 745.5365783691407, "epoch": 0.7041898031297325, "grad_norm": 1.029958724975586, "kl": 2.90078125, "learning_rate": 6.387117492801213e-06, "loss": 0.1161, "reward": -0.6068828483422597, "reward_std": 1.321648943424225, "rewards/Qwen2-0.5B-Reward": -0.6068828483422597, "step": 1860 }, { "completion_length": 755.6328796386719, "epoch": 0.7079757698132256, "grad_norm": 5.108635425567627, "kl": 2.9171875, "learning_rate": 6.285343168843028e-06, "loss": 0.1167, "reward": -0.6523237491647402, "reward_std": 1.3444733719031017, "rewards/Qwen2-0.5B-Reward": -0.6523237491647402, "step": 1870 }, { "completion_length": 787.0935241699219, "epoch": 0.7117617364967188, "grad_norm": 1.3548846244812012, "kl": 3.0869791666666666, "learning_rate": 6.1843930790881766e-06, "loss": 0.1235, "reward": -0.6537054566045603, "reward_std": 1.4838234384854634, "rewards/Qwen2-0.5B-Reward": -0.6537054566045603, "step": 1880 }, { "completion_length": 773.563895670573, "epoch": 0.715547703180212, "grad_norm": 0.8410789966583252, "kl": 2.837760416666667, "learning_rate": 6.084284872014545e-06, "loss": 0.1136, "reward": -0.5507580937196811, "reward_std": 1.2756544808546701, "rewards/Qwen2-0.5B-Reward": -0.5507580937196811, "step": 1890 }, { "completion_length": 760.8699096679687, "epoch": 0.7193336698637052, "grad_norm": 1.5116900205612183, "kl": 2.6723958333333333, "learning_rate": 5.985036048918894e-06, "loss": 0.1069, "reward": -0.46427804150929053, "reward_std": 1.1952710588773092, "rewards/Qwen2-0.5B-Reward": -0.46427804150929053, "step": 1900 }, { "completion_length": 763.8004699707031, "epoch": 0.7231196365471984, "grad_norm": 1.1645935773849487, "kl": 3.13828125, "learning_rate": 5.886663960857202e-06, "loss": 0.1255, "reward": -0.7973003094395001, "reward_std": 1.4403738955656686, "rewards/Qwen2-0.5B-Reward": -0.7973003094395001, "step": 1910 }, { "completion_length": 746.5444559733073, "epoch": 0.7269056032306915, "grad_norm": 1.8314180374145508, "kl": 3.378125, "learning_rate": 5.789185805611313e-06, "loss": 0.1351, "reward": -0.6777333706617356, "reward_std": 1.452496987581253, "rewards/Qwen2-0.5B-Reward": -0.6777333706617356, "step": 1920 }, { "completion_length": 743.2513977050781, "epoch": 0.7306915699141847, "grad_norm": 1.8599276542663574, "kl": 2.6572916666666666, "learning_rate": 5.692618624682342e-06, "loss": 0.1063, "reward": -0.5468713939189911, "reward_std": 1.203757886091868, "rewards/Qwen2-0.5B-Reward": -0.5468713939189911, "step": 1930 }, { "completion_length": 715.9157470703125, "epoch": 0.7344775365976779, "grad_norm": 3.749554395675659, "kl": 3.373177083333333, "learning_rate": 5.596979300311408e-06, "loss": 0.1351, "reward": -0.42453126634160676, "reward_std": 1.129069878657659, "rewards/Qwen2-0.5B-Reward": -0.42453126634160676, "step": 1940 }, { "completion_length": 707.4583414713542, "epoch": 0.7382635032811711, "grad_norm": 1.2406065464019775, "kl": 2.40546875, "learning_rate": 5.502284552528236e-06, "loss": 0.0962, "reward": -0.3166978692635894, "reward_std": 1.0220210254192352, "rewards/Qwen2-0.5B-Reward": -0.3166978692635894, "step": 1950 }, { "completion_length": 730.9064880371094, "epoch": 0.7420494699646644, "grad_norm": 0.894660472869873, "kl": 3.0755208333333335, "learning_rate": 5.408550936228072e-06, "loss": 0.1231, "reward": -0.6020015890399615, "reward_std": 1.3233680129051208, "rewards/Qwen2-0.5B-Reward": -0.6020015890399615, "step": 1960 }, { "completion_length": 784.6120402018229, "epoch": 0.7458354366481575, "grad_norm": 0.9947274923324585, "kl": 3.3036458333333334, "learning_rate": 5.315794838277524e-06, "loss": 0.1321, "reward": -0.8605576127767562, "reward_std": 1.5929324706395467, "rewards/Qwen2-0.5B-Reward": -0.8605576127767562, "step": 1970 }, { "completion_length": 761.7782409667968, "epoch": 0.7496214033316507, "grad_norm": 0.8357589244842529, "kl": 3.126822916666667, "learning_rate": 5.2240324746497185e-06, "loss": 0.1251, "reward": -0.6573333943883578, "reward_std": 1.3803256154060364, "rewards/Qwen2-0.5B-Reward": -0.6573333943883578, "step": 1980 }, { "completion_length": 751.271309407552, "epoch": 0.7534073700151439, "grad_norm": 0.9635012149810791, "kl": 2.846875, "learning_rate": 5.133279887589381e-06, "loss": 0.114, "reward": -0.5246660086015861, "reward_std": 1.2728915989398957, "rewards/Qwen2-0.5B-Reward": -0.5246660086015861, "step": 1990 }, { "completion_length": 721.8902760823568, "epoch": 0.7571933366986371, "grad_norm": 1.915734887123108, "kl": 2.886588541666667, "learning_rate": 5.043552942808269e-06, "loss": 0.1155, "reward": -0.4225703233232101, "reward_std": 1.1504804422458013, "rewards/Qwen2-0.5B-Reward": -0.4225703233232101, "step": 2000 }, { "completion_length": 747.6074157714844, "epoch": 0.7609793033821303, "grad_norm": 1.7324910163879395, "kl": 2.849739583333333, "learning_rate": 4.9548673267114535e-06, "loss": 0.114, "reward": -0.4868051894629995, "reward_std": 1.2382884542147319, "rewards/Qwen2-0.5B-Reward": -0.4868051894629995, "step": 2010 }, { "completion_length": 723.2888916015625, "epoch": 0.7647652700656234, "grad_norm": 1.870195984840393, "kl": 3.38125, "learning_rate": 4.86723854365498e-06, "loss": 0.1353, "reward": -0.6813056563337644, "reward_std": 1.4171151260534922, "rewards/Qwen2-0.5B-Reward": -0.6813056563337644, "step": 2020 }, { "completion_length": 739.2546325683594, "epoch": 0.7685512367491166, "grad_norm": 0.6563529968261719, "kl": 2.7765625, "learning_rate": 4.78068191323533e-06, "loss": 0.111, "reward": -0.6810662182668845, "reward_std": 1.3699560364087422, "rewards/Qwen2-0.5B-Reward": -0.6810662182668845, "step": 2030 }, { "completion_length": 723.6301025390625, "epoch": 0.7723372034326098, "grad_norm": 0.845397412776947, "kl": 3.3549479166666667, "learning_rate": 4.695212567611183e-06, "loss": 0.1343, "reward": -0.6839562758803368, "reward_std": 1.3764802972475687, "rewards/Qwen2-0.5B-Reward": -0.6839562758803368, "step": 2040 }, { "completion_length": 707.3944529215495, "epoch": 0.776123170116103, "grad_norm": 0.8297199606895447, "kl": 2.2606770833333334, "learning_rate": 4.6108454488579754e-06, "loss": 0.0904, "reward": -0.32430495528969916, "reward_std": 1.0496096114317577, "rewards/Qwen2-0.5B-Reward": -0.32430495528969916, "step": 2050 }, { "completion_length": 728.9092631022136, "epoch": 0.7799091367995962, "grad_norm": 0.8965924382209778, "kl": 2.7317708333333335, "learning_rate": 4.5275953063556515e-06, "loss": 0.1092, "reward": -0.49890854886422553, "reward_std": 1.1908490220705668, "rewards/Qwen2-0.5B-Reward": -0.49890854886422553, "step": 2060 }, { "completion_length": 787.4157389322917, "epoch": 0.7836951034830894, "grad_norm": 1.6908742189407349, "kl": 3.14453125, "learning_rate": 4.445476694210125e-06, "loss": 0.1258, "reward": -0.6872879594564438, "reward_std": 1.5059267342090608, "rewards/Qwen2-0.5B-Reward": -0.6872879594564438, "step": 2070 }, { "completion_length": 724.3907409667969, "epoch": 0.7874810701665825, "grad_norm": 0.5646480917930603, "kl": 2.5322916666666666, "learning_rate": 4.364503968708885e-06, "loss": 0.1013, "reward": -0.4010113532965382, "reward_std": 1.1661198248465856, "rewards/Qwen2-0.5B-Reward": -0.4010113532965382, "step": 2080 }, { "completion_length": 762.3759338378907, "epoch": 0.7912670368500757, "grad_norm": 0.7707305550575256, "kl": 3.08828125, "learning_rate": 4.284691285811162e-06, "loss": 0.1235, "reward": -0.6063117478042841, "reward_std": 1.4541340112686156, "rewards/Qwen2-0.5B-Reward": -0.6063117478042841, "step": 2090 }, { "completion_length": 757.8597249348958, "epoch": 0.7950530035335689, "grad_norm": 0.609060525894165, "kl": 2.7552083333333335, "learning_rate": 4.206052598673134e-06, "loss": 0.1102, "reward": -0.5107901314894359, "reward_std": 1.2742640137672425, "rewards/Qwen2-0.5B-Reward": -0.5107901314894359, "step": 2100 }, { "completion_length": 714.1713012695312, "epoch": 0.7988389702170621, "grad_norm": 1.5023508071899414, "kl": 2.7880208333333334, "learning_rate": 4.128601655208588e-06, "loss": 0.1115, "reward": -0.4477219473881026, "reward_std": 1.2109043717384338, "rewards/Qwen2-0.5B-Reward": -0.4477219473881026, "step": 2110 }, { "completion_length": 742.9495381673177, "epoch": 0.8026249369005553, "grad_norm": 1.4843252897262573, "kl": 2.490104166666667, "learning_rate": 4.052351995685459e-06, "loss": 0.0996, "reward": -0.40210790758331616, "reward_std": 1.1073905199766159, "rewards/Qwen2-0.5B-Reward": -0.40210790758331616, "step": 2120 }, { "completion_length": 758.4166687011718, "epoch": 0.8064109035840484, "grad_norm": 0.8346318006515503, "kl": 3.2510416666666666, "learning_rate": 3.977316950358647e-06, "loss": 0.1301, "reward": -0.744351115822792, "reward_std": 1.4400279184182485, "rewards/Qwen2-0.5B-Reward": -0.744351115822792, "step": 2130 }, { "completion_length": 711.5217651367187, "epoch": 0.8101968702675416, "grad_norm": 3.075549840927124, "kl": 2.4575520833333333, "learning_rate": 3.903509637139604e-06, "loss": 0.0983, "reward": -0.4195836258431276, "reward_std": 1.1368374347686767, "rewards/Qwen2-0.5B-Reward": -0.4195836258431276, "step": 2140 }, { "completion_length": 667.8574137369792, "epoch": 0.8139828369510348, "grad_norm": 1.288053035736084, "kl": 2.64140625, "learning_rate": 3.830942959302988e-06, "loss": 0.1056, "reward": -0.25947842622796696, "reward_std": 1.0453672617673875, "rewards/Qwen2-0.5B-Reward": -0.25947842622796696, "step": 2150 }, { "completion_length": 713.2092692057291, "epoch": 0.817768803634528, "grad_norm": 1.47870934009552, "kl": 3.060677083333333, "learning_rate": 3.7596296032308655e-06, "loss": 0.1224, "reward": -0.5742474019527435, "reward_std": 1.2993368287881215, "rewards/Qwen2-0.5B-Reward": -0.5742474019527435, "step": 2160 }, { "completion_length": 756.1185282389323, "epoch": 0.8215547703180212, "grad_norm": 1.0809710025787354, "kl": 3.0234375, "learning_rate": 3.689582036194844e-06, "loss": 0.121, "reward": -0.6388996203740438, "reward_std": 1.3941177546977996, "rewards/Qwen2-0.5B-Reward": -0.6388996203740438, "step": 2170 }, { "completion_length": 689.1287068684895, "epoch": 0.8253407370015143, "grad_norm": 0.8256644606590271, "kl": 2.6302083333333335, "learning_rate": 3.620812504176483e-06, "loss": 0.1052, "reward": -0.3896134149283171, "reward_std": 1.1061949849128723, "rewards/Qwen2-0.5B-Reward": -0.3896134149283171, "step": 2180 }, { "completion_length": 747.3708435058594, "epoch": 0.8291267036850076, "grad_norm": 1.2586473226547241, "kl": 2.8255208333333335, "learning_rate": 3.5533330297264055e-06, "loss": 0.113, "reward": -0.47125562417010464, "reward_std": 1.3159513572851818, "rewards/Qwen2-0.5B-Reward": -0.47125562417010464, "step": 2190 }, { "completion_length": 718.9842681884766, "epoch": 0.8329126703685008, "grad_norm": 0.7325953841209412, "kl": 2.89453125, "learning_rate": 3.4871554098624783e-06, "loss": 0.1159, "reward": -0.515640505651633, "reward_std": 1.2894119222958882, "rewards/Qwen2-0.5B-Reward": -0.515640505651633, "step": 2200 }, { "completion_length": 730.6486206054688, "epoch": 0.836698637051994, "grad_norm": 1.3458070755004883, "kl": 2.746354166666667, "learning_rate": 3.4222912140074072e-06, "loss": 0.1099, "reward": -0.43878471093873184, "reward_std": 1.1841597487529119, "rewards/Qwen2-0.5B-Reward": -0.43878471093873184, "step": 2210 }, { "completion_length": 728.4597218831381, "epoch": 0.8404846037354872, "grad_norm": 2.082460880279541, "kl": 3.025520833333333, "learning_rate": 3.358751781966125e-06, "loss": 0.121, "reward": -0.5120975616077582, "reward_std": 1.399947702884674, "rewards/Qwen2-0.5B-Reward": -0.5120975616077582, "step": 2220 }, { "completion_length": 702.8384338378906, "epoch": 0.8442705704189803, "grad_norm": 0.7987167239189148, "kl": 2.9817708333333335, "learning_rate": 3.2965482219433266e-06, "loss": 0.1193, "reward": -0.5346707743903001, "reward_std": 1.298090636730194, "rewards/Qwen2-0.5B-Reward": -0.5346707743903001, "step": 2230 }, { "completion_length": 743.4412068684895, "epoch": 0.8480565371024735, "grad_norm": 1.0572713613510132, "kl": 2.8296875, "learning_rate": 3.2356914086014895e-06, "loss": 0.1132, "reward": -0.45420979845027126, "reward_std": 1.2626650591691335, "rewards/Qwen2-0.5B-Reward": -0.45420979845027126, "step": 2240 }, { "completion_length": 751.9037150065104, "epoch": 0.8518425037859667, "grad_norm": 1.2263774871826172, "kl": 2.789322916666667, "learning_rate": 3.1761919811597286e-06, "loss": 0.1116, "reward": -0.41814162402103344, "reward_std": 1.254759935537974, "rewards/Qwen2-0.5B-Reward": -0.41814162402103344, "step": 2250 }, { "completion_length": 735.0213012695312, "epoch": 0.8556284704694599, "grad_norm": 1.536089539527893, "kl": 2.711197916666667, "learning_rate": 3.118060341533795e-06, "loss": 0.1084, "reward": -0.3957721870703002, "reward_std": 1.215382601817449, "rewards/Qwen2-0.5B-Reward": -0.3957721870703002, "step": 2260 }, { "completion_length": 739.1541676839192, "epoch": 0.8594144371529531, "grad_norm": 2.2628087997436523, "kl": 3.322135416666667, "learning_rate": 3.0613066525175916e-06, "loss": 0.1328, "reward": -0.5474292345655462, "reward_std": 1.3296300649642945, "rewards/Qwen2-0.5B-Reward": -0.5474292345655462, "step": 2270 }, { "completion_length": 753.1319498697917, "epoch": 0.8632004038364463, "grad_norm": 1.759981393814087, "kl": 2.53984375, "learning_rate": 3.00594083600646e-06, "loss": 0.1016, "reward": -0.4004799094672004, "reward_std": 1.2508702536424001, "rewards/Qwen2-0.5B-Reward": -0.4004799094672004, "step": 2280 }, { "completion_length": 765.2296366373698, "epoch": 0.8669863705199394, "grad_norm": 1.7521519660949707, "kl": 3.2877604166666665, "learning_rate": 2.9519725712625993e-06, "loss": 0.1315, "reward": -0.5632258212814728, "reward_std": 1.3489103774229685, "rewards/Qwen2-0.5B-Reward": -0.5632258212814728, "step": 2290 }, { "completion_length": 728.2092671712239, "epoch": 0.8707723372034326, "grad_norm": 1.1282004117965698, "kl": 2.808333333333333, "learning_rate": 2.89941129322291e-06, "loss": 0.1123, "reward": -0.4623491804425915, "reward_std": 1.2616208771864572, "rewards/Qwen2-0.5B-Reward": -0.4623491804425915, "step": 2300 }, { "completion_length": 763.8801025390625, "epoch": 0.8745583038869258, "grad_norm": 1.6411226987838745, "kl": 2.96328125, "learning_rate": 2.848266190849534e-06, "loss": 0.1186, "reward": -0.47133560677369435, "reward_std": 1.3187556425730387, "rewards/Qwen2-0.5B-Reward": -0.47133560677369435, "step": 2310 }, { "completion_length": 767.462967936198, "epoch": 0.878344270570419, "grad_norm": 1.238519310951233, "kl": 2.96015625, "learning_rate": 2.798546205523405e-06, "loss": 0.1184, "reward": -0.553766346598665, "reward_std": 1.3190133293469748, "rewards/Qwen2-0.5B-Reward": -0.553766346598665, "step": 2320 }, { "completion_length": 738.1370381673177, "epoch": 0.8821302372539122, "grad_norm": 1.9779850244522095, "kl": 2.7606770833333334, "learning_rate": 2.7502600294810888e-06, "loss": 0.1104, "reward": -0.48763653316224614, "reward_std": 1.276737904548645, "rewards/Qwen2-0.5B-Reward": -0.48763653316224614, "step": 2330 }, { "completion_length": 772.7652852376302, "epoch": 0.8859162039374053, "grad_norm": 0.9569075107574463, "kl": 3.640625, "learning_rate": 2.7034161042951696e-06, "loss": 0.1457, "reward": -0.752403491238753, "reward_std": 1.5029548863569895, "rewards/Qwen2-0.5B-Reward": -0.752403491238753, "step": 2340 }, { "completion_length": 748.1027770996094, "epoch": 0.8897021706208985, "grad_norm": 1.2532896995544434, "kl": 2.788802083333333, "learning_rate": 2.658022619398459e-06, "loss": 0.1115, "reward": -0.5759354960018148, "reward_std": 1.252836243311564, "rewards/Qwen2-0.5B-Reward": -0.5759354960018148, "step": 2350 }, { "completion_length": 756.765283203125, "epoch": 0.8934881373043917, "grad_norm": 1.243710994720459, "kl": 3.470572916666667, "learning_rate": 2.6140875106522906e-06, "loss": 0.1388, "reward": -0.7527099266648293, "reward_std": 1.5181720991929373, "rewards/Qwen2-0.5B-Reward": -0.7527099266648293, "step": 2360 }, { "completion_length": 731.5490844726562, "epoch": 0.8972741039878849, "grad_norm": 0.8256412744522095, "kl": 2.8911458333333333, "learning_rate": 2.5716184589591504e-06, "loss": 0.1156, "reward": -0.4917602331067125, "reward_std": 1.3739383776982625, "rewards/Qwen2-0.5B-Reward": -0.4917602331067125, "step": 2370 }, { "completion_length": 762.2222249348958, "epoch": 0.901060070671378, "grad_norm": 0.976091206073761, "kl": 3.2059895833333334, "learning_rate": 2.5306228889198595e-06, "loss": 0.1282, "reward": -0.492262601479888, "reward_std": 1.3222837885220846, "rewards/Qwen2-0.5B-Reward": -0.492262601479888, "step": 2380 }, { "completion_length": 752.6963033040364, "epoch": 0.9048460373548712, "grad_norm": 0.8627796769142151, "kl": 3.144270833333333, "learning_rate": 2.4911079675355852e-06, "loss": 0.1258, "reward": -0.5920792824278275, "reward_std": 1.4338179051876068, "rewards/Qwen2-0.5B-Reward": -0.5920792824278275, "step": 2390 }, { "completion_length": 729.3250172932943, "epoch": 0.9086320040383644, "grad_norm": 2.569244384765625, "kl": 3.0598958333333335, "learning_rate": 2.453080602954878e-06, "loss": 0.1224, "reward": -0.5552944198250771, "reward_std": 1.259453280766805, "rewards/Qwen2-0.5B-Reward": -0.5552944198250771, "step": 2400 }, { "completion_length": 769.4680562337239, "epoch": 0.9124179707218576, "grad_norm": 1.9891189336776733, "kl": 3.347395833333333, "learning_rate": 2.416547443265959e-06, "loss": 0.134, "reward": -0.7994883202016354, "reward_std": 1.5337923685709636, "rewards/Qwen2-0.5B-Reward": -0.7994883202016354, "step": 2410 }, { "completion_length": 711.1333435058593, "epoch": 0.9162039374053509, "grad_norm": 1.2348560094833374, "kl": 2.6640625, "learning_rate": 2.381514875334478e-06, "loss": 0.1066, "reward": -0.4012350387871265, "reward_std": 1.1682847638924916, "rewards/Qwen2-0.5B-Reward": -0.4012350387871265, "step": 2420 }, { "completion_length": 728.6402770996094, "epoch": 0.9199899040888441, "grad_norm": 1.0510834455490112, "kl": 2.4625, "learning_rate": 2.34798902368694e-06, "loss": 0.0985, "reward": -0.255227384219567, "reward_std": 1.0641139527161916, "rewards/Qwen2-0.5B-Reward": -0.255227384219567, "step": 2430 }, { "completion_length": 742.7620483398438, "epoch": 0.9237758707723372, "grad_norm": 0.6936110854148865, "kl": 2.7760416666666665, "learning_rate": 2.31597574943999e-06, "loss": 0.1111, "reward": -0.32442100283806213, "reward_std": 1.1662549694379172, "rewards/Qwen2-0.5B-Reward": -0.32442100283806213, "step": 2440 }, { "completion_length": 709.3500081380208, "epoch": 0.9275618374558304, "grad_norm": 0.6553380489349365, "kl": 2.91484375, "learning_rate": 2.2854806492757473e-06, "loss": 0.1166, "reward": -0.4610091609259446, "reward_std": 1.2611193935076395, "rewards/Qwen2-0.5B-Reward": -0.4610091609259446, "step": 2450 }, { "completion_length": 751.5171427408854, "epoch": 0.9313478041393236, "grad_norm": 1.1703935861587524, "kl": 3.030989583333333, "learning_rate": 2.256509054463379e-06, "loss": 0.1212, "reward": -0.47760866036017735, "reward_std": 1.3580244441827138, "rewards/Qwen2-0.5B-Reward": -0.47760866036017735, "step": 2460 }, { "completion_length": 734.3888997395833, "epoch": 0.9351337708228168, "grad_norm": 1.4841110706329346, "kl": 2.941666666666667, "learning_rate": 2.2290660299270626e-06, "loss": 0.1176, "reward": -0.5363880881418784, "reward_std": 1.320775838692983, "rewards/Qwen2-0.5B-Reward": -0.5363880881418784, "step": 2470 }, { "completion_length": 792.8597361246744, "epoch": 0.93891973750631, "grad_norm": 0.9216225743293762, "kl": 3.2005208333333335, "learning_rate": 2.2031563733605154e-06, "loss": 0.128, "reward": -0.6734383806586266, "reward_std": 1.5115692138671875, "rewards/Qwen2-0.5B-Reward": -0.6734383806586266, "step": 2480 }, { "completion_length": 742.3555603027344, "epoch": 0.9427057041898032, "grad_norm": 0.8652907013893127, "kl": 2.9796875, "learning_rate": 2.178784614388247e-06, "loss": 0.1192, "reward": -0.5235640426476796, "reward_std": 1.2792722801367442, "rewards/Qwen2-0.5B-Reward": -0.5235640426476796, "step": 2490 }, { "completion_length": 731.887510172526, "epoch": 0.9464916708732963, "grad_norm": 0.900198221206665, "kl": 2.758072916666667, "learning_rate": 2.155955013773674e-06, "loss": 0.1102, "reward": -0.427229492738843, "reward_std": 1.2093970189491907, "rewards/Qwen2-0.5B-Reward": -0.427229492738843, "step": 2500 }, { "completion_length": 742.8615783691406, "epoch": 0.9502776375567895, "grad_norm": 1.4608945846557617, "kl": 2.884635416666667, "learning_rate": 2.134671562674233e-06, "loss": 0.1154, "reward": -0.40613490512090117, "reward_std": 1.2598043183485668, "rewards/Qwen2-0.5B-Reward": -0.40613490512090117, "step": 2510 }, { "completion_length": 732.2143575032552, "epoch": 0.9540636042402827, "grad_norm": 0.861190676689148, "kl": 3.107291666666667, "learning_rate": 2.114937981943634e-06, "loss": 0.1243, "reward": -0.4464622031897306, "reward_std": 1.2578558444976806, "rewards/Qwen2-0.5B-Reward": -0.4464622031897306, "step": 2520 }, { "completion_length": 796.321767171224, "epoch": 0.9578495709237759, "grad_norm": 2.202199697494507, "kl": 3.068489583333333, "learning_rate": 2.096757721481365e-06, "loss": 0.1228, "reward": -0.6399494647979737, "reward_std": 1.4180189092954, "rewards/Qwen2-0.5B-Reward": -0.6399494647979737, "step": 2530 }, { "completion_length": 744.4689880371094, "epoch": 0.961635537607269, "grad_norm": 0.9193338751792908, "kl": 2.9799479166666667, "learning_rate": 2.0801339596295706e-06, "loss": 0.1192, "reward": -0.5712469642050564, "reward_std": 1.3389502465724945, "rewards/Qwen2-0.5B-Reward": -0.5712469642050564, "step": 2540 }, { "completion_length": 779.9819458007812, "epoch": 0.9654215042907622, "grad_norm": 1.811191439628601, "kl": 3.6411458333333333, "learning_rate": 2.0650696026173993e-06, "loss": 0.1456, "reward": -0.7589993777374426, "reward_std": 1.5557840009530386, "rewards/Qwen2-0.5B-Reward": -0.7589993777374426, "step": 2550 }, { "completion_length": 782.5782531738281, "epoch": 0.9692074709742554, "grad_norm": 0.9566059112548828, "kl": 3.095572916666667, "learning_rate": 2.051567284052924e-06, "loss": 0.1238, "reward": -0.6302419572137297, "reward_std": 1.4944741606712342, "rewards/Qwen2-0.5B-Reward": -0.6302419572137297, "step": 2560 }, { "completion_length": 718.9152872721354, "epoch": 0.9729934376577486, "grad_norm": 0.9425510168075562, "kl": 2.82890625, "learning_rate": 2.0396293644627313e-06, "loss": 0.1132, "reward": -0.32908876914686214, "reward_std": 1.2080858111381532, "rewards/Qwen2-0.5B-Reward": -0.32908876914686214, "step": 2570 }, { "completion_length": 732.7824157714844, "epoch": 0.9767794043412418, "grad_norm": 0.9575442671775818, "kl": 3.121875, "learning_rate": 2.0292579308792374e-06, "loss": 0.125, "reward": -0.47131281966964406, "reward_std": 1.3826497634251913, "rewards/Qwen2-0.5B-Reward": -0.47131281966964406, "step": 2580 }, { "completion_length": 761.6578694661458, "epoch": 0.980565371024735, "grad_norm": 1.0160202980041504, "kl": 2.9203125, "learning_rate": 2.020454796475829e-06, "loss": 0.1168, "reward": -0.47771473427613576, "reward_std": 1.2897698918978373, "rewards/Qwen2-0.5B-Reward": -0.47771473427613576, "step": 2590 }, { "completion_length": 781.389815266927, "epoch": 0.9843513377082281, "grad_norm": 2.1385881900787354, "kl": 2.9213541666666667, "learning_rate": 2.013221500249879e-06, "loss": 0.1168, "reward": -0.4969511273006598, "reward_std": 1.3705980678399403, "rewards/Qwen2-0.5B-Reward": -0.4969511273006598, "step": 2600 }, { "completion_length": 748.4037109375, "epoch": 0.9881373043917213, "grad_norm": 1.3061258792877197, "kl": 3.1223958333333335, "learning_rate": 2.0075593067536895e-06, "loss": 0.1249, "reward": -0.511777646218737, "reward_std": 1.338163250684738, "rewards/Qwen2-0.5B-Reward": -0.511777646218737, "step": 2610 }, { "completion_length": 730.8513916015625, "epoch": 0.9919232710752145, "grad_norm": 1.0508885383605957, "kl": 2.6411458333333333, "learning_rate": 2.0034692058734197e-06, "loss": 0.1056, "reward": -0.3765604312221209, "reward_std": 1.2147092600663503, "rewards/Qwen2-0.5B-Reward": -0.3765604312221209, "step": 2620 }, { "completion_length": 783.2115763346354, "epoch": 0.9957092377587077, "grad_norm": 1.1735745668411255, "kl": 3.4817708333333335, "learning_rate": 2.000951912656033e-06, "loss": 0.1392, "reward": -0.6186425998806954, "reward_std": 1.4974812746047974, "rewards/Qwen2-0.5B-Reward": -0.6186425998806954, "step": 2630 }, { "completion_length": 786.5088033040364, "epoch": 0.9994952044422009, "grad_norm": 1.1970211267471313, "kl": 3.134375, "learning_rate": 2.0000078671842824e-06, "loss": 0.1254, "reward": -0.662852063588798, "reward_std": 1.5238366266091665, "rewards/Qwen2-0.5B-Reward": -0.662852063588798, "step": 2640 }, { "completion_length": 728.1759236653646, "epoch": 0.9998738011105502, "kl": 3.125, "reward": -0.8051454126834869, "reward_std": 1.3035079042116802, "rewards/Qwen2-0.5B-Reward": -0.8051454126834869, "step": 2641, "total_flos": 0.0, "train_loss": 0.6071465962344739, "train_runtime": 159997.8149, "train_samples_per_second": 1.189, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 2641, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }