{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 100, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 637.8268123626709, "epoch": 0.010666666666666666, "grad_norm": 0.9318448901176453, "kl": 0.00011245012283325195, "learning_rate": 3.1914893617021275e-07, "loss": 0.0, "reward": 0.6035714592784643, "reward_std": 0.3709465142339468, "rewards/accuracy_reward": 0.6017857443541288, "rewards/format_reward": 0.001785714365541935, "step": 5 }, { "completion_length": 611.45181350708, "epoch": 0.021333333333333333, "grad_norm": 6.491418838500977, "kl": 0.0001920461654663086, "learning_rate": 6.382978723404255e-07, "loss": 0.0, "reward": 0.617857176065445, "reward_std": 0.36645040661096573, "rewards/accuracy_reward": 0.617857176065445, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 604.3607414245605, "epoch": 0.032, "grad_norm": 9.168869018554688, "kl": 0.00031398534774780275, "learning_rate": 9.574468085106384e-07, "loss": 0.0, "reward": 0.6482143150642514, "reward_std": 0.34973760321736336, "rewards/accuracy_reward": 0.6482143150642514, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 617.7803817749024, "epoch": 0.042666666666666665, "grad_norm": 1.5698275566101074, "kl": 0.0008591651916503906, "learning_rate": 1.276595744680851e-06, "loss": 0.0, "reward": 0.6303571715950966, "reward_std": 0.35366948917508123, "rewards/accuracy_reward": 0.6303571715950966, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 639.5375259399414, "epoch": 0.05333333333333334, "grad_norm": 0.858055055141449, "kl": 0.002974271774291992, "learning_rate": 1.5957446808510639e-06, "loss": 0.0001, "reward": 0.6125000298023224, "reward_std": 0.3530873417854309, "rewards/accuracy_reward": 0.6125000298023224, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 664.7107475280761, "epoch": 0.064, "grad_norm": 1.1970467567443848, "kl": 0.005088996887207031, "learning_rate": 1.9148936170212767e-06, "loss": 0.0002, "reward": 0.6553571727126837, "reward_std": 0.33203957192599776, "rewards/accuracy_reward": 0.6553571727126837, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 632.7196670532227, "epoch": 0.07466666666666667, "grad_norm": 1.0202564001083374, "kl": 0.996532678604126, "learning_rate": 2.2340425531914894e-06, "loss": 0.0398, "reward": 0.7000000279396772, "reward_std": 0.3037954304367304, "rewards/accuracy_reward": 0.7000000279396772, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 640.1286018371582, "epoch": 0.08533333333333333, "grad_norm": 0.5117438435554504, "kl": 0.002056884765625, "learning_rate": 2.553191489361702e-06, "loss": 0.0001, "reward": 0.7250000268220902, "reward_std": 0.2653495166450739, "rewards/accuracy_reward": 0.7250000268220902, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 607.9250259399414, "epoch": 0.096, "grad_norm": 0.3817993402481079, "kl": 0.002574253082275391, "learning_rate": 2.872340425531915e-06, "loss": 0.0001, "reward": 0.7392857447266579, "reward_std": 0.24080881737172605, "rewards/accuracy_reward": 0.7392857447266579, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 628.1535995483398, "epoch": 0.10666666666666667, "grad_norm": 0.7749062180519104, "kl": 0.004121017456054687, "learning_rate": 2.9996241442585123e-06, "loss": 0.0002, "reward": 0.6964286014437675, "reward_std": 0.300217243283987, "rewards/accuracy_reward": 0.6964286014437675, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 600.3839553833008, "epoch": 0.11733333333333333, "grad_norm": 0.5632015466690063, "kl": 0.003362083435058594, "learning_rate": 2.9973279301399446e-06, "loss": 0.0001, "reward": 0.7178571719676257, "reward_std": 0.27802310809493064, "rewards/accuracy_reward": 0.7178571719676257, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 585.9303833007813, "epoch": 0.128, "grad_norm": 1.2080248594284058, "kl": 0.004734039306640625, "learning_rate": 2.992947502998804e-06, "loss": 0.0002, "reward": 0.7750000357627869, "reward_std": 0.25223284475505353, "rewards/accuracy_reward": 0.7750000357627869, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 591.1464614868164, "epoch": 0.13866666666666666, "grad_norm": 1.7432464361190796, "kl": 0.013347434997558593, "learning_rate": 2.9864889601923268e-06, "loss": 0.0005, "reward": 0.7267857432365418, "reward_std": 0.2874183960258961, "rewards/accuracy_reward": 0.7267857432365418, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 588.239315032959, "epoch": 0.14933333333333335, "grad_norm": 0.28103670477867126, "kl": 0.004166412353515625, "learning_rate": 2.977961291721137e-06, "loss": 0.0002, "reward": 0.7875000283122062, "reward_std": 0.23745907917618753, "rewards/accuracy_reward": 0.7875000283122062, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 554.2375259399414, "epoch": 0.16, "grad_norm": 0.26267099380493164, "kl": 0.004508209228515625, "learning_rate": 2.9673763677155655e-06, "loss": 0.0002, "reward": 0.7767857410013675, "reward_std": 0.21466484777629374, "rewards/accuracy_reward": 0.7767857410013675, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 553.0071640014648, "epoch": 0.17066666666666666, "grad_norm": 0.34145233035087585, "kl": 0.0057952880859375, "learning_rate": 2.9547489219129666e-06, "loss": 0.0002, "reward": 0.8285714581608772, "reward_std": 0.20639798790216446, "rewards/accuracy_reward": 0.8285714581608772, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 585.5857421875, "epoch": 0.18133333333333335, "grad_norm": 0.240849107503891, "kl": 0.0038000106811523437, "learning_rate": 2.9400965311490175e-06, "loss": 0.0002, "reward": 0.7660714596509933, "reward_std": 0.24059830717742442, "rewards/accuracy_reward": 0.7660714596509933, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 558.014315032959, "epoch": 0.192, "grad_norm": 0.42303234338760376, "kl": 0.004909515380859375, "learning_rate": 2.9234395908915565e-06, "loss": 0.0002, "reward": 0.7142857456579804, "reward_std": 0.2469261337071657, "rewards/accuracy_reward": 0.7142857456579804, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 565.8589538574219, "epoch": 0.20266666666666666, "grad_norm": 0.36556175351142883, "kl": 0.004494476318359375, "learning_rate": 2.904801286851009e-06, "loss": 0.0002, "reward": 0.7517857383936644, "reward_std": 0.2242392159998417, "rewards/accuracy_reward": 0.7517857383936644, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 520.4696662902832, "epoch": 0.21333333333333335, "grad_norm": 0.2530968487262726, "kl": 0.00856475830078125, "learning_rate": 2.884207562706925e-06, "loss": 0.0003, "reward": 0.8125000301748514, "reward_std": 0.18475012369453908, "rewards/accuracy_reward": 0.8125000301748514, "rewards/format_reward": 0.0, "step": 100 }, { "epoch": 0.21333333333333335, "eval_completion_length": 547.995567590332, "eval_kl": 0.00711322021484375, "eval_loss": 0.00028467908850871027, "eval_reward": 0.6861428862035275, "eval_reward_std": 0.270268753862381, "eval_rewards/accuracy_reward": 0.6860857433497906, "eval_rewards/format_reward": 5.7142859697341916e-05, "eval_runtime": 28593.9461, "eval_samples_per_second": 0.175, "eval_steps_per_second": 0.013, "step": 100 }, { "completion_length": 549.5107398986817, "epoch": 0.224, "grad_norm": 0.2235504686832428, "kl": 0.004962539672851563, "learning_rate": 2.8616870839955444e-06, "loss": 0.0002, "reward": 0.7964286103844642, "reward_std": 0.26562733463943006, "rewards/accuracy_reward": 0.7964286103844642, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 581.1750267028808, "epoch": 0.23466666666666666, "grad_norm": 0.4279918968677521, "kl": 0.005224609375, "learning_rate": 2.837271198208662e-06, "loss": 0.0002, "reward": 0.7785714581608772, "reward_std": 0.20086282528936864, "rewards/accuracy_reward": 0.7785714581608772, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 532.0053810119629, "epoch": 0.24533333333333332, "grad_norm": 0.6788883805274963, "kl": 0.0057430267333984375, "learning_rate": 2.8109938911593322e-06, "loss": 0.0002, "reward": 0.7767857426777482, "reward_std": 0.20144498273730277, "rewards/accuracy_reward": 0.7767857426777482, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 544.6714546203614, "epoch": 0.256, "grad_norm": 0.23265813291072845, "kl": 0.006147003173828125, "learning_rate": 2.7828917396751474e-06, "loss": 0.0002, "reward": 0.7696428894996643, "reward_std": 0.20026272870600223, "rewards/accuracy_reward": 0.7696428894996643, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 518.471452331543, "epoch": 0.26666666666666666, "grad_norm": 0.41492336988449097, "kl": 0.00635986328125, "learning_rate": 2.753003860684943e-06, "loss": 0.0003, "reward": 0.8375000298023224, "reward_std": 0.20442306995391846, "rewards/accuracy_reward": 0.8375000298023224, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 570.2303848266602, "epoch": 0.2773333333333333, "grad_norm": 0.4288278818130493, "kl": 0.006764602661132812, "learning_rate": 2.721371856769793e-06, "loss": 0.0003, "reward": 0.7160714600235224, "reward_std": 0.2673064887523651, "rewards/accuracy_reward": 0.7160714600235224, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 494.16431350708007, "epoch": 0.288, "grad_norm": 0.5349715352058411, "kl": 0.008311080932617187, "learning_rate": 2.688039758254093e-06, "loss": 0.0003, "reward": 0.7910714618861675, "reward_std": 0.22742781266570092, "rewards/accuracy_reward": 0.7910714618861675, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 503.5232357025146, "epoch": 0.2986666666666667, "grad_norm": 0.9469221830368042, "kl": 0.011474609375, "learning_rate": 2.65305396191733e-06, "loss": 0.0005, "reward": 0.8053571686148644, "reward_std": 0.22859212197363377, "rewards/accuracy_reward": 0.8053571686148644, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 526.3750251770019, "epoch": 0.30933333333333335, "grad_norm": 0.4655410945415497, "kl": 0.016347885131835938, "learning_rate": 2.61646316641186e-06, "loss": 0.0007, "reward": 0.7767857382073998, "reward_std": 0.21031193807721138, "rewards/accuracy_reward": 0.7767857382073998, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 536.4660957336425, "epoch": 0.32, "grad_norm": 0.36182740330696106, "kl": 0.020062255859375, "learning_rate": 2.5783183044765715e-06, "loss": 0.0008, "reward": 0.7517857436090708, "reward_std": 0.2297923181205988, "rewards/accuracy_reward": 0.7517857436090708, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 514.8803802490235, "epoch": 0.33066666666666666, "grad_norm": 0.7956529855728149, "kl": 0.03543167114257813, "learning_rate": 2.5386724720408135e-06, "loss": 0.0014, "reward": 0.7517857443541288, "reward_std": 0.26203566156327723, "rewards/accuracy_reward": 0.7517857443541288, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 549.3571701049805, "epoch": 0.3413333333333333, "grad_norm": 0.29695257544517517, "kl": 0.062085723876953124, "learning_rate": 2.49758085431725e-06, "loss": 0.0025, "reward": 0.7285714589059353, "reward_std": 0.269120267406106, "rewards/accuracy_reward": 0.7285714589059353, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 589.5339569091797, "epoch": 0.352, "grad_norm": 0.41898104548454285, "kl": 0.13505706787109376, "learning_rate": 2.455100648986533e-06, "loss": 0.0054, "reward": 0.6857143165543675, "reward_std": 0.32202624566853044, "rewards/accuracy_reward": 0.6857143165543675, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 627.4214576721191, "epoch": 0.3626666666666667, "grad_norm": 0.2985088527202606, "kl": 0.1629364013671875, "learning_rate": 2.4112909865807053e-06, "loss": 0.0065, "reward": 0.6410714607685805, "reward_std": 0.2777946576476097, "rewards/accuracy_reward": 0.6410714607685805, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 567.5768112182617, "epoch": 0.37333333333333335, "grad_norm": 0.49856701493263245, "kl": 0.115283203125, "learning_rate": 2.366212848176164e-06, "loss": 0.0046, "reward": 0.7089286031201482, "reward_std": 0.2502220422029495, "rewards/accuracy_reward": 0.7089286031201482, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 528.6750221252441, "epoch": 0.384, "grad_norm": 0.6558699607849121, "kl": 0.18359222412109374, "learning_rate": 2.319928980510752e-06, "loss": 0.0073, "reward": 0.6535714527592063, "reward_std": 0.28206231258809566, "rewards/accuracy_reward": 0.6535714527592063, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 571.7518112182618, "epoch": 0.39466666666666667, "grad_norm": 6.485378742218018, "kl": 0.34942626953125, "learning_rate": 2.272503808643123e-06, "loss": 0.014, "reward": 0.4696428783237934, "reward_std": 0.27855144031345846, "rewards/accuracy_reward": 0.4696428783237934, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 542.8089500427246, "epoch": 0.4053333333333333, "grad_norm": 1.8310225009918213, "kl": 0.4147308349609375, "learning_rate": 2.2240033462759628e-06, "loss": 0.0166, "reward": 0.46071431171149013, "reward_std": 0.36014051400125024, "rewards/accuracy_reward": 0.46071431171149013, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 626.9553817749023, "epoch": 0.416, "grad_norm": 17.148595809936523, "kl": 0.4925048828125, "learning_rate": 2.1744951038678905e-06, "loss": 0.0197, "reward": 0.4000000203028321, "reward_std": 0.3870592150837183, "rewards/accuracy_reward": 0.4000000203028321, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 525.5928825378418, "epoch": 0.4266666666666667, "grad_norm": 38.49786376953125, "kl": 0.23444061279296874, "learning_rate": 2.124047994661941e-06, "loss": 0.0094, "reward": 0.6821428872644901, "reward_std": 0.3179911646991968, "rewards/accuracy_reward": 0.6821428872644901, "rewards/format_reward": 0.0, "step": 200 }, { "epoch": 0.4266666666666667, "eval_completion_length": 528.541595147705, "eval_kl": 0.42865986328125, "eval_loss": 0.017097920179367065, "eval_reward": 0.662742884466052, "eval_reward_std": 0.2557599967300892, "eval_rewards/accuracy_reward": 0.662742884466052, "eval_rewards/format_reward": 0.0, "eval_runtime": 28459.3155, "eval_samples_per_second": 0.176, "eval_steps_per_second": 0.013, "step": 200 }, { "completion_length": 509.92859649658203, "epoch": 0.43733333333333335, "grad_norm": 4.442358493804932, "kl": 0.09366302490234375, "learning_rate": 2.072732238761434e-06, "loss": 0.0037, "reward": 0.7660714574158192, "reward_std": 0.21246148198843, "rewards/accuracy_reward": 0.7660714574158192, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 497.5357364654541, "epoch": 0.448, "grad_norm": 0.5748523473739624, "kl": 0.085107421875, "learning_rate": 2.0206192653867536e-06, "loss": 0.0034, "reward": 0.796428595483303, "reward_std": 0.18161089681088924, "rewards/accuracy_reward": 0.796428595483303, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 614.8643173217773, "epoch": 0.45866666666666667, "grad_norm": 0.3233760893344879, "kl": 0.15386199951171875, "learning_rate": 1.967781613449095e-06, "loss": 0.0062, "reward": 0.6392857398837805, "reward_std": 0.26852017305791376, "rewards/accuracy_reward": 0.6392857398837805, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 575.8321681976319, "epoch": 0.4693333333333333, "grad_norm": 0.6134320497512817, "kl": 0.16066131591796876, "learning_rate": 1.9142928305795637e-06, "loss": 0.0064, "reward": 0.6446428839117289, "reward_std": 0.3076914418488741, "rewards/accuracy_reward": 0.6446428839117289, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 529.201806640625, "epoch": 0.48, "grad_norm": 14.935003280639648, "kl": 0.27866592407226565, "learning_rate": 1.8602273707541886e-06, "loss": 0.0111, "reward": 0.6982143182307482, "reward_std": 0.29295355789363386, "rewards/accuracy_reward": 0.6982143182307482, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 599.1643127441406, "epoch": 0.49066666666666664, "grad_norm": 29.34033203125, "kl": 1.2954010009765624, "learning_rate": 1.8056604906573418e-06, "loss": 0.0518, "reward": 0.605357170291245, "reward_std": 0.30737774074077606, "rewards/accuracy_reward": 0.605357170291245, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 577.0696716308594, "epoch": 0.5013333333333333, "grad_norm": 33.51802062988281, "kl": 0.703057861328125, "learning_rate": 1.7506681449278226e-06, "loss": 0.0281, "reward": 0.571428601257503, "reward_std": 0.32992145605385303, "rewards/accuracy_reward": 0.571428601257503, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 501.30717010498046, "epoch": 0.512, "grad_norm": 7.524632930755615, "kl": 0.3678741455078125, "learning_rate": 1.6953268804334257e-06, "loss": 0.0147, "reward": 0.589285738952458, "reward_std": 0.28248333670198916, "rewards/accuracy_reward": 0.589285738952458, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 462.3607376098633, "epoch": 0.5226666666666666, "grad_norm": 6.015294075012207, "kl": 0.3237091064453125, "learning_rate": 1.6397137297211436e-06, "loss": 0.0129, "reward": 0.5589285986497998, "reward_std": 0.3269613076001406, "rewards/accuracy_reward": 0.5589285986497998, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 638.7928871154785, "epoch": 0.5333333333333333, "grad_norm": 1.8317821025848389, "kl": 0.807421875, "learning_rate": 1.5839061037913395e-06, "loss": 0.0323, "reward": 0.3250000160187483, "reward_std": 0.34638786166906355, "rewards/accuracy_reward": 0.3250000160187483, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 636.7161018371582, "epoch": 0.544, "grad_norm": 0.6774631142616272, "kl": 0.53349609375, "learning_rate": 1.527981684345115e-06, "loss": 0.0213, "reward": 0.35714287366718056, "reward_std": 0.3179732210934162, "rewards/accuracy_reward": 0.35714287366718056, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 558.9875259399414, "epoch": 0.5546666666666666, "grad_norm": 2.738410472869873, "kl": 0.205511474609375, "learning_rate": 1.4720183156548855e-06, "loss": 0.0082, "reward": 0.6000000305473805, "reward_std": 0.3251969013363123, "rewards/accuracy_reward": 0.6000000305473805, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 545.3893096923828, "epoch": 0.5653333333333334, "grad_norm": 17.57588005065918, "kl": 1.5068023681640625, "learning_rate": 1.4160938962086612e-06, "loss": 0.0603, "reward": 0.6089285997673869, "reward_std": 0.28422979824244976, "rewards/accuracy_reward": 0.6089285997673869, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 526.2607391357421, "epoch": 0.576, "grad_norm": 38.554412841796875, "kl": 1.0892745971679687, "learning_rate": 1.3602862702788567e-06, "loss": 0.0436, "reward": 0.6267857417464257, "reward_std": 0.32045440524816515, "rewards/accuracy_reward": 0.6267857417464257, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 517.2678787231446, "epoch": 0.5866666666666667, "grad_norm": 4.718568325042725, "kl": 1.1258697509765625, "learning_rate": 1.3046731195665748e-06, "loss": 0.045, "reward": 0.6464285977184773, "reward_std": 0.320261836796999, "rewards/accuracy_reward": 0.6464285977184773, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 514.8375244140625, "epoch": 0.5973333333333334, "grad_norm": 4.270685195922852, "kl": 1.4042236328125, "learning_rate": 1.2493318550721775e-06, "loss": 0.0562, "reward": 0.6982143145054579, "reward_std": 0.2838581532239914, "rewards/accuracy_reward": 0.6982143145054579, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 534.7946655273438, "epoch": 0.608, "grad_norm": 9.498967170715332, "kl": 0.8575332641601563, "learning_rate": 1.1943395093426585e-06, "loss": 0.0343, "reward": 0.6910714596509934, "reward_std": 0.2867689304053783, "rewards/accuracy_reward": 0.6910714596509934, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 529.5875297546387, "epoch": 0.6186666666666667, "grad_norm": 8.131922721862793, "kl": 0.6070526123046875, "learning_rate": 1.1397726292458115e-06, "loss": 0.0243, "reward": 0.675000030733645, "reward_std": 0.2807727467268705, "rewards/accuracy_reward": 0.675000030733645, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 557.6500236511231, "epoch": 0.6293333333333333, "grad_norm": 2.224785089492798, "kl": 1.1581512451171876, "learning_rate": 1.085707169420437e-06, "loss": 0.0463, "reward": 0.6142857430502773, "reward_std": 0.2811264578253031, "rewards/accuracy_reward": 0.6142857430502773, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 529.6071685791015, "epoch": 0.64, "grad_norm": 8.0582857131958, "kl": 1.2655914306640625, "learning_rate": 1.0322183865509054e-06, "loss": 0.0506, "reward": 0.6750000296160579, "reward_std": 0.28525091484189036, "rewards/accuracy_reward": 0.6750000296160579, "rewards/format_reward": 0.0, "step": 300 }, { "epoch": 0.64, "eval_completion_length": 562.8381398986817, "eval_kl": 0.95684150390625, "eval_loss": 0.03832858428359032, "eval_reward": 0.5700857400953769, "eval_reward_std": 0.30191256090998647, "eval_rewards/accuracy_reward": 0.5700857400953769, "eval_rewards/format_reward": 0.0, "eval_runtime": 30482.0361, "eval_samples_per_second": 0.164, "eval_steps_per_second": 0.012, "step": 300 }, { "completion_length": 557.5018104553222, "epoch": 0.6506666666666666, "grad_norm": 2.934140682220459, "kl": 0.6237777709960938, "learning_rate": 9.793807346132464e-07, "loss": 0.025, "reward": 0.6339285995811224, "reward_std": 0.3295004416257143, "rewards/accuracy_reward": 0.6339285995811224, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 507.2857383728027, "epoch": 0.6613333333333333, "grad_norm": 1.5333462953567505, "kl": 0.18207855224609376, "learning_rate": 9.272677612385667e-07, "loss": 0.0073, "reward": 0.6017857391387225, "reward_std": 0.2927430454641581, "rewards/accuracy_reward": 0.6017857391387225, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 535.4660949707031, "epoch": 0.672, "grad_norm": 0.8268552422523499, "kl": 0.2217437744140625, "learning_rate": 8.759520053380591e-07, "loss": 0.0089, "reward": 0.5303571715950965, "reward_std": 0.30413119196891786, "rewards/accuracy_reward": 0.5303571715950965, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 587.7893112182617, "epoch": 0.6826666666666666, "grad_norm": 0.8802245259284973, "kl": 0.28748779296875, "learning_rate": 8.255048961321088e-07, "loss": 0.0115, "reward": 0.5250000260770321, "reward_std": 0.32775397710502147, "rewards/accuracy_reward": 0.5250000260770321, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 568.7143135070801, "epoch": 0.6933333333333334, "grad_norm": 0.42495808005332947, "kl": 0.20731658935546876, "learning_rate": 7.759966537240373e-07, "loss": 0.0083, "reward": 0.6357143180444836, "reward_std": 0.32221881337463854, "rewards/accuracy_reward": 0.6357143180444836, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 572.9053848266601, "epoch": 0.704, "grad_norm": 0.8112100958824158, "kl": 0.2259490966796875, "learning_rate": 7.274961913568773e-07, "loss": 0.009, "reward": 0.6071428848430515, "reward_std": 0.28955445289611814, "rewards/accuracy_reward": 0.6071428848430515, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 585.521450805664, "epoch": 0.7146666666666667, "grad_norm": 2.5413522720336914, "kl": 0.1725799560546875, "learning_rate": 6.800710194892484e-07, "loss": 0.0069, "reward": 0.6946428865194321, "reward_std": 0.27458811886608603, "rewards/accuracy_reward": 0.6946428865194321, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 578.700025177002, "epoch": 0.7253333333333334, "grad_norm": 2.077190637588501, "kl": 0.23962783813476562, "learning_rate": 6.33787151823836e-07, "loss": 0.0096, "reward": 0.6517857488244772, "reward_std": 0.29764223508536813, "rewards/accuracy_reward": 0.6517857488244772, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 517.0357376098633, "epoch": 0.736, "grad_norm": 1.8005852699279785, "kl": 0.1731414794921875, "learning_rate": 5.887090134192947e-07, "loss": 0.0069, "reward": 0.7303571674972773, "reward_std": 0.20812651440501212, "rewards/accuracy_reward": 0.7303571674972773, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 537.6071670532226, "epoch": 0.7466666666666667, "grad_norm": 5.837470531463623, "kl": 0.22538909912109376, "learning_rate": 5.448993510134669e-07, "loss": 0.009, "reward": 0.7285714605823159, "reward_std": 0.23610220104455948, "rewards/accuracy_reward": 0.7285714605823159, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 511.03573455810545, "epoch": 0.7573333333333333, "grad_norm": 10.607796669006348, "kl": 0.2825325012207031, "learning_rate": 5.024191456827498e-07, "loss": 0.0113, "reward": 0.7357143178582192, "reward_std": 0.24888310953974724, "rewards/accuracy_reward": 0.7357143178582192, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 528.4375274658203, "epoch": 0.768, "grad_norm": 17.805084228515625, "kl": 0.409503173828125, "learning_rate": 4.6132752795918667e-07, "loss": 0.0164, "reward": 0.6857143182307481, "reward_std": 0.28699738159775734, "rewards/accuracy_reward": 0.6857143182307481, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 519.4303817749023, "epoch": 0.7786666666666666, "grad_norm": 7.785521507263184, "kl": 0.34308319091796874, "learning_rate": 4.2168169552342905e-07, "loss": 0.0137, "reward": 0.6928571717813611, "reward_std": 0.29954983331263063, "rewards/accuracy_reward": 0.6928571717813611, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 504.0107368469238, "epoch": 0.7893333333333333, "grad_norm": 4.727066516876221, "kl": 0.7092597961425782, "learning_rate": 3.8353683358814046e-07, "loss": 0.0285, "reward": 0.7000000275671482, "reward_std": 0.23943399637937546, "rewards/accuracy_reward": 0.7000000275671482, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 517.7428833007813, "epoch": 0.8, "grad_norm": 7.284985542297363, "kl": 0.2571014404296875, "learning_rate": 3.469460380826697e-07, "loss": 0.0103, "reward": 0.673214310593903, "reward_std": 0.2639926388859749, "rewards/accuracy_reward": 0.673214310593903, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 543.2714515686035, "epoch": 0.8106666666666666, "grad_norm": 14.041029930114746, "kl": 0.28242645263671873, "learning_rate": 3.119602417459075e-07, "loss": 0.0113, "reward": 0.6732143096625804, "reward_std": 0.25163274370133876, "rewards/accuracy_reward": 0.6732143096625804, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 546.032169342041, "epoch": 0.8213333333333334, "grad_norm": 15.783819198608398, "kl": 0.20561065673828124, "learning_rate": 2.786281432302071e-07, "loss": 0.0082, "reward": 0.7339286010712385, "reward_std": 0.234727381169796, "rewards/accuracy_reward": 0.7339286010712385, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 546.478592300415, "epoch": 0.832, "grad_norm": 6.679286003112793, "kl": 0.18747406005859374, "learning_rate": 2.46996139315057e-07, "loss": 0.0075, "reward": 0.7375000283122063, "reward_std": 0.2588605497032404, "rewards/accuracy_reward": 0.7375000283122063, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 531.3750255584716, "epoch": 0.8426666666666667, "grad_norm": 4.5977044105529785, "kl": 0.1937103271484375, "learning_rate": 2.1710826032485286e-07, "loss": 0.0077, "reward": 0.7357143165543676, "reward_std": 0.2378486678004265, "rewards/accuracy_reward": 0.7357143165543676, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 595.6964569091797, "epoch": 0.8533333333333334, "grad_norm": 4.166606903076172, "kl": 0.2564910888671875, "learning_rate": 1.8900610884066817e-07, "loss": 0.0103, "reward": 0.6285714594647288, "reward_std": 0.2736343163996935, "rewards/accuracy_reward": 0.6285714594647288, "rewards/format_reward": 0.0, "step": 400 }, { "epoch": 0.8533333333333334, "eval_completion_length": 569.8527975585938, "eval_kl": 0.20817554931640625, "eval_loss": 0.008296786807477474, "eval_reward": 0.6110285988628864, "eval_reward_std": 0.28674686477184297, "eval_rewards/accuracy_reward": 0.6110285988628864, "eval_rewards/format_reward": 0.0, "eval_runtime": 29873.0033, "eval_samples_per_second": 0.167, "eval_steps_per_second": 0.012, "step": 400 }, { "completion_length": 605.528596496582, "epoch": 0.864, "grad_norm": 49.79132843017578, "kl": 0.341680908203125, "learning_rate": 1.627288017913383e-07, "loss": 0.0137, "reward": 0.5571428839117288, "reward_std": 0.3719855587929487, "rewards/accuracy_reward": 0.5571428839117288, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 637.5250267028808, "epoch": 0.8746666666666667, "grad_norm": 12.56246566772461, "kl": 0.39511566162109374, "learning_rate": 1.3831291600445573e-07, "loss": 0.0158, "reward": 0.5392857391387225, "reward_std": 0.3052775662392378, "rewards/accuracy_reward": 0.5392857391387225, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 631.8607391357422, "epoch": 0.8853333333333333, "grad_norm": 6.395442962646484, "kl": 0.45283203125, "learning_rate": 1.1579243729307487e-07, "loss": 0.0181, "reward": 0.49107145331799984, "reward_std": 0.36503970213234427, "rewards/accuracy_reward": 0.49107145331799984, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 611.7053848266602, "epoch": 0.896, "grad_norm": 1.6087334156036377, "kl": 0.3666015625, "learning_rate": 9.519871314899092e-08, "loss": 0.0147, "reward": 0.6000000327825546, "reward_std": 0.3550443138927221, "rewards/accuracy_reward": 0.6000000327825546, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 613.4946708679199, "epoch": 0.9066666666666666, "grad_norm": 1.6784651279449463, "kl": 0.3438507080078125, "learning_rate": 7.656040910844358e-08, "loss": 0.0138, "reward": 0.5946428859606385, "reward_std": 0.3528768301010132, "rewards/accuracy_reward": 0.5946428859606385, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 603.3143119812012, "epoch": 0.9173333333333333, "grad_norm": 4.043834209442139, "kl": 0.3444915771484375, "learning_rate": 5.990346885098235e-08, "loss": 0.0138, "reward": 0.5964285979047418, "reward_std": 0.3912374936044216, "rewards/accuracy_reward": 0.5964285979047418, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 575.2339546203614, "epoch": 0.928, "grad_norm": 4.404404163360596, "kl": 0.30106658935546876, "learning_rate": 4.5251078087033493e-08, "loss": 0.012, "reward": 0.6696428902447223, "reward_std": 0.3243862982839346, "rewards/accuracy_reward": 0.6696428902447223, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 582.6393119812012, "epoch": 0.9386666666666666, "grad_norm": 2.593308448791504, "kl": 0.383941650390625, "learning_rate": 3.262363228443427e-08, "loss": 0.0154, "reward": 0.6035714616999031, "reward_std": 0.29665699824690817, "rewards/accuracy_reward": 0.6035714616999031, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 627.2125282287598, "epoch": 0.9493333333333334, "grad_norm": 5.177957057952881, "kl": 0.3490997314453125, "learning_rate": 2.2038708278862952e-08, "loss": 0.014, "reward": 0.5839285971596837, "reward_std": 0.30867667235434054, "rewards/accuracy_reward": 0.5839285971596837, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 576.0964553833007, "epoch": 0.96, "grad_norm": 3.4815688133239746, "kl": 0.2956207275390625, "learning_rate": 1.3511039807673209e-08, "loss": 0.0118, "reward": 0.6589286010712385, "reward_std": 0.3096484154462814, "rewards/accuracy_reward": 0.6589286010712385, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 531.355379486084, "epoch": 0.9706666666666667, "grad_norm": 2.63545298576355, "kl": 0.261767578125, "learning_rate": 7.0524970011963675e-09, "loss": 0.0105, "reward": 0.7125000340864063, "reward_std": 0.28212962336838243, "rewards/accuracy_reward": 0.7125000340864063, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 569.5285957336425, "epoch": 0.9813333333333333, "grad_norm": 1.8794143199920654, "kl": 0.2968017578125, "learning_rate": 2.6720698600553595e-09, "loss": 0.0119, "reward": 0.6303571775555611, "reward_std": 0.3074988707900047, "rewards/accuracy_reward": 0.6303571775555611, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 612.3214569091797, "epoch": 0.992, "grad_norm": 3.266709327697754, "kl": 0.39468994140625, "learning_rate": 3.7585574148779613e-10, "loss": 0.0158, "reward": 0.5517857383936644, "reward_std": 0.3217977944761515, "rewards/accuracy_reward": 0.5517857383936644, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 626.7321701049805, "epoch": 0.9984, "kl": 0.32488250732421875, "reward": 0.604166692122817, "reward_std": 0.3102082473536332, "rewards/accuracy_reward": 0.604166692122817, "rewards/format_reward": 0.0, "step": 468, "total_flos": 0.0, "train_loss": 0.00286464851636153, "train_runtime": 7716.3929, "train_samples_per_second": 0.972, "train_steps_per_second": 0.061 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }