|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 100, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 637.8268123626709, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.9318448901176453, |
|
"kl": 0.00011245012283325195, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0, |
|
"reward": 0.6035714592784643, |
|
"reward_std": 0.3709465142339468, |
|
"rewards/accuracy_reward": 0.6017857443541288, |
|
"rewards/format_reward": 0.001785714365541935, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 611.45181350708, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 6.491418838500977, |
|
"kl": 0.0001920461654663086, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0, |
|
"reward": 0.617857176065445, |
|
"reward_std": 0.36645040661096573, |
|
"rewards/accuracy_reward": 0.617857176065445, |
|
"rewards/format_reward": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 604.3607414245605, |
|
"epoch": 0.032, |
|
"grad_norm": 9.168869018554688, |
|
"kl": 0.00031398534774780275, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0, |
|
"reward": 0.6482143150642514, |
|
"reward_std": 0.34973760321736336, |
|
"rewards/accuracy_reward": 0.6482143150642514, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 617.7803817749024, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 1.5698275566101074, |
|
"kl": 0.0008591651916503906, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0, |
|
"reward": 0.6303571715950966, |
|
"reward_std": 0.35366948917508123, |
|
"rewards/accuracy_reward": 0.6303571715950966, |
|
"rewards/format_reward": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 639.5375259399414, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.858055055141449, |
|
"kl": 0.002974271774291992, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0001, |
|
"reward": 0.6125000298023224, |
|
"reward_std": 0.3530873417854309, |
|
"rewards/accuracy_reward": 0.6125000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 664.7107475280761, |
|
"epoch": 0.064, |
|
"grad_norm": 1.1970467567443848, |
|
"kl": 0.005088996887207031, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6553571727126837, |
|
"reward_std": 0.33203957192599776, |
|
"rewards/accuracy_reward": 0.6553571727126837, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 632.7196670532227, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 1.0202564001083374, |
|
"kl": 0.996532678604126, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 0.0398, |
|
"reward": 0.7000000279396772, |
|
"reward_std": 0.3037954304367304, |
|
"rewards/accuracy_reward": 0.7000000279396772, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 640.1286018371582, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.5117438435554504, |
|
"kl": 0.002056884765625, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7250000268220902, |
|
"reward_std": 0.2653495166450739, |
|
"rewards/accuracy_reward": 0.7250000268220902, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 607.9250259399414, |
|
"epoch": 0.096, |
|
"grad_norm": 0.3817993402481079, |
|
"kl": 0.002574253082275391, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7392857447266579, |
|
"reward_std": 0.24080881737172605, |
|
"rewards/accuracy_reward": 0.7392857447266579, |
|
"rewards/format_reward": 0.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 628.1535995483398, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.7749062180519104, |
|
"kl": 0.004121017456054687, |
|
"learning_rate": 2.9996241442585123e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6964286014437675, |
|
"reward_std": 0.300217243283987, |
|
"rewards/accuracy_reward": 0.6964286014437675, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 600.3839553833008, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.5632015466690063, |
|
"kl": 0.003362083435058594, |
|
"learning_rate": 2.9973279301399446e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7178571719676257, |
|
"reward_std": 0.27802310809493064, |
|
"rewards/accuracy_reward": 0.7178571719676257, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 585.9303833007813, |
|
"epoch": 0.128, |
|
"grad_norm": 1.2080248594284058, |
|
"kl": 0.004734039306640625, |
|
"learning_rate": 2.992947502998804e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7750000357627869, |
|
"reward_std": 0.25223284475505353, |
|
"rewards/accuracy_reward": 0.7750000357627869, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 591.1464614868164, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 1.7432464361190796, |
|
"kl": 0.013347434997558593, |
|
"learning_rate": 2.9864889601923268e-06, |
|
"loss": 0.0005, |
|
"reward": 0.7267857432365418, |
|
"reward_std": 0.2874183960258961, |
|
"rewards/accuracy_reward": 0.7267857432365418, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 588.239315032959, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.28103670477867126, |
|
"kl": 0.004166412353515625, |
|
"learning_rate": 2.977961291721137e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7875000283122062, |
|
"reward_std": 0.23745907917618753, |
|
"rewards/accuracy_reward": 0.7875000283122062, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 554.2375259399414, |
|
"epoch": 0.16, |
|
"grad_norm": 0.26267099380493164, |
|
"kl": 0.004508209228515625, |
|
"learning_rate": 2.9673763677155655e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7767857410013675, |
|
"reward_std": 0.21466484777629374, |
|
"rewards/accuracy_reward": 0.7767857410013675, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 553.0071640014648, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.34145233035087585, |
|
"kl": 0.0057952880859375, |
|
"learning_rate": 2.9547489219129666e-06, |
|
"loss": 0.0002, |
|
"reward": 0.8285714581608772, |
|
"reward_std": 0.20639798790216446, |
|
"rewards/accuracy_reward": 0.8285714581608772, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 585.5857421875, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.240849107503891, |
|
"kl": 0.0038000106811523437, |
|
"learning_rate": 2.9400965311490175e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7660714596509933, |
|
"reward_std": 0.24059830717742442, |
|
"rewards/accuracy_reward": 0.7660714596509933, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 558.014315032959, |
|
"epoch": 0.192, |
|
"grad_norm": 0.42303234338760376, |
|
"kl": 0.004909515380859375, |
|
"learning_rate": 2.9234395908915565e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7142857456579804, |
|
"reward_std": 0.2469261337071657, |
|
"rewards/accuracy_reward": 0.7142857456579804, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 565.8589538574219, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.36556175351142883, |
|
"kl": 0.004494476318359375, |
|
"learning_rate": 2.904801286851009e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7517857383936644, |
|
"reward_std": 0.2242392159998417, |
|
"rewards/accuracy_reward": 0.7517857383936644, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 520.4696662902832, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.2530968487262726, |
|
"kl": 0.00856475830078125, |
|
"learning_rate": 2.884207562706925e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8125000301748514, |
|
"reward_std": 0.18475012369453908, |
|
"rewards/accuracy_reward": 0.8125000301748514, |
|
"rewards/format_reward": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_completion_length": 547.995567590332, |
|
"eval_kl": 0.00711322021484375, |
|
"eval_loss": 0.00028467908850871027, |
|
"eval_reward": 0.6861428862035275, |
|
"eval_reward_std": 0.270268753862381, |
|
"eval_rewards/accuracy_reward": 0.6860857433497906, |
|
"eval_rewards/format_reward": 5.7142859697341916e-05, |
|
"eval_runtime": 28593.9461, |
|
"eval_samples_per_second": 0.175, |
|
"eval_steps_per_second": 0.013, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 549.5107398986817, |
|
"epoch": 0.224, |
|
"grad_norm": 0.2235504686832428, |
|
"kl": 0.004962539672851563, |
|
"learning_rate": 2.8616870839955444e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7964286103844642, |
|
"reward_std": 0.26562733463943006, |
|
"rewards/accuracy_reward": 0.7964286103844642, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 581.1750267028808, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.4279918968677521, |
|
"kl": 0.005224609375, |
|
"learning_rate": 2.837271198208662e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7785714581608772, |
|
"reward_std": 0.20086282528936864, |
|
"rewards/accuracy_reward": 0.7785714581608772, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 532.0053810119629, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.6788883805274963, |
|
"kl": 0.0057430267333984375, |
|
"learning_rate": 2.8109938911593322e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7767857426777482, |
|
"reward_std": 0.20144498273730277, |
|
"rewards/accuracy_reward": 0.7767857426777482, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 544.6714546203614, |
|
"epoch": 0.256, |
|
"grad_norm": 0.23265813291072845, |
|
"kl": 0.006147003173828125, |
|
"learning_rate": 2.7828917396751474e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7696428894996643, |
|
"reward_std": 0.20026272870600223, |
|
"rewards/accuracy_reward": 0.7696428894996643, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 518.471452331543, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.41492336988449097, |
|
"kl": 0.00635986328125, |
|
"learning_rate": 2.753003860684943e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8375000298023224, |
|
"reward_std": 0.20442306995391846, |
|
"rewards/accuracy_reward": 0.8375000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 570.2303848266602, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.4288278818130493, |
|
"kl": 0.006764602661132812, |
|
"learning_rate": 2.721371856769793e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7160714600235224, |
|
"reward_std": 0.2673064887523651, |
|
"rewards/accuracy_reward": 0.7160714600235224, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 494.16431350708007, |
|
"epoch": 0.288, |
|
"grad_norm": 0.5349715352058411, |
|
"kl": 0.008311080932617187, |
|
"learning_rate": 2.688039758254093e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7910714618861675, |
|
"reward_std": 0.22742781266570092, |
|
"rewards/accuracy_reward": 0.7910714618861675, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 503.5232357025146, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 0.9469221830368042, |
|
"kl": 0.011474609375, |
|
"learning_rate": 2.65305396191733e-06, |
|
"loss": 0.0005, |
|
"reward": 0.8053571686148644, |
|
"reward_std": 0.22859212197363377, |
|
"rewards/accuracy_reward": 0.8053571686148644, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 526.3750251770019, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.4655410945415497, |
|
"kl": 0.016347885131835938, |
|
"learning_rate": 2.61646316641186e-06, |
|
"loss": 0.0007, |
|
"reward": 0.7767857382073998, |
|
"reward_std": 0.21031193807721138, |
|
"rewards/accuracy_reward": 0.7767857382073998, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 536.4660957336425, |
|
"epoch": 0.32, |
|
"grad_norm": 0.36182740330696106, |
|
"kl": 0.020062255859375, |
|
"learning_rate": 2.5783183044765715e-06, |
|
"loss": 0.0008, |
|
"reward": 0.7517857436090708, |
|
"reward_std": 0.2297923181205988, |
|
"rewards/accuracy_reward": 0.7517857436090708, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 514.8803802490235, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.7956529855728149, |
|
"kl": 0.03543167114257813, |
|
"learning_rate": 2.5386724720408135e-06, |
|
"loss": 0.0014, |
|
"reward": 0.7517857443541288, |
|
"reward_std": 0.26203566156327723, |
|
"rewards/accuracy_reward": 0.7517857443541288, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 549.3571701049805, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.29695257544517517, |
|
"kl": 0.062085723876953124, |
|
"learning_rate": 2.49758085431725e-06, |
|
"loss": 0.0025, |
|
"reward": 0.7285714589059353, |
|
"reward_std": 0.269120267406106, |
|
"rewards/accuracy_reward": 0.7285714589059353, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 589.5339569091797, |
|
"epoch": 0.352, |
|
"grad_norm": 0.41898104548454285, |
|
"kl": 0.13505706787109376, |
|
"learning_rate": 2.455100648986533e-06, |
|
"loss": 0.0054, |
|
"reward": 0.6857143165543675, |
|
"reward_std": 0.32202624566853044, |
|
"rewards/accuracy_reward": 0.6857143165543675, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 627.4214576721191, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 0.2985088527202606, |
|
"kl": 0.1629364013671875, |
|
"learning_rate": 2.4112909865807053e-06, |
|
"loss": 0.0065, |
|
"reward": 0.6410714607685805, |
|
"reward_std": 0.2777946576476097, |
|
"rewards/accuracy_reward": 0.6410714607685805, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 567.5768112182617, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.49856701493263245, |
|
"kl": 0.115283203125, |
|
"learning_rate": 2.366212848176164e-06, |
|
"loss": 0.0046, |
|
"reward": 0.7089286031201482, |
|
"reward_std": 0.2502220422029495, |
|
"rewards/accuracy_reward": 0.7089286031201482, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 528.6750221252441, |
|
"epoch": 0.384, |
|
"grad_norm": 0.6558699607849121, |
|
"kl": 0.18359222412109374, |
|
"learning_rate": 2.319928980510752e-06, |
|
"loss": 0.0073, |
|
"reward": 0.6535714527592063, |
|
"reward_std": 0.28206231258809566, |
|
"rewards/accuracy_reward": 0.6535714527592063, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 571.7518112182618, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 6.485378742218018, |
|
"kl": 0.34942626953125, |
|
"learning_rate": 2.272503808643123e-06, |
|
"loss": 0.014, |
|
"reward": 0.4696428783237934, |
|
"reward_std": 0.27855144031345846, |
|
"rewards/accuracy_reward": 0.4696428783237934, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 542.8089500427246, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 1.8310225009918213, |
|
"kl": 0.4147308349609375, |
|
"learning_rate": 2.2240033462759628e-06, |
|
"loss": 0.0166, |
|
"reward": 0.46071431171149013, |
|
"reward_std": 0.36014051400125024, |
|
"rewards/accuracy_reward": 0.46071431171149013, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 626.9553817749023, |
|
"epoch": 0.416, |
|
"grad_norm": 17.148595809936523, |
|
"kl": 0.4925048828125, |
|
"learning_rate": 2.1744951038678905e-06, |
|
"loss": 0.0197, |
|
"reward": 0.4000000203028321, |
|
"reward_std": 0.3870592150837183, |
|
"rewards/accuracy_reward": 0.4000000203028321, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 525.5928825378418, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 38.49786376953125, |
|
"kl": 0.23444061279296874, |
|
"learning_rate": 2.124047994661941e-06, |
|
"loss": 0.0094, |
|
"reward": 0.6821428872644901, |
|
"reward_std": 0.3179911646991968, |
|
"rewards/accuracy_reward": 0.6821428872644901, |
|
"rewards/format_reward": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_completion_length": 528.541595147705, |
|
"eval_kl": 0.42865986328125, |
|
"eval_loss": 0.017097920179367065, |
|
"eval_reward": 0.662742884466052, |
|
"eval_reward_std": 0.2557599967300892, |
|
"eval_rewards/accuracy_reward": 0.662742884466052, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 28459.3155, |
|
"eval_samples_per_second": 0.176, |
|
"eval_steps_per_second": 0.013, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 509.92859649658203, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 4.442358493804932, |
|
"kl": 0.09366302490234375, |
|
"learning_rate": 2.072732238761434e-06, |
|
"loss": 0.0037, |
|
"reward": 0.7660714574158192, |
|
"reward_std": 0.21246148198843, |
|
"rewards/accuracy_reward": 0.7660714574158192, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 497.5357364654541, |
|
"epoch": 0.448, |
|
"grad_norm": 0.5748523473739624, |
|
"kl": 0.085107421875, |
|
"learning_rate": 2.0206192653867536e-06, |
|
"loss": 0.0034, |
|
"reward": 0.796428595483303, |
|
"reward_std": 0.18161089681088924, |
|
"rewards/accuracy_reward": 0.796428595483303, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 614.8643173217773, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 0.3233760893344879, |
|
"kl": 0.15386199951171875, |
|
"learning_rate": 1.967781613449095e-06, |
|
"loss": 0.0062, |
|
"reward": 0.6392857398837805, |
|
"reward_std": 0.26852017305791376, |
|
"rewards/accuracy_reward": 0.6392857398837805, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 575.8321681976319, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 0.6134320497512817, |
|
"kl": 0.16066131591796876, |
|
"learning_rate": 1.9142928305795637e-06, |
|
"loss": 0.0064, |
|
"reward": 0.6446428839117289, |
|
"reward_std": 0.3076914418488741, |
|
"rewards/accuracy_reward": 0.6446428839117289, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 529.201806640625, |
|
"epoch": 0.48, |
|
"grad_norm": 14.935003280639648, |
|
"kl": 0.27866592407226565, |
|
"learning_rate": 1.8602273707541886e-06, |
|
"loss": 0.0111, |
|
"reward": 0.6982143182307482, |
|
"reward_std": 0.29295355789363386, |
|
"rewards/accuracy_reward": 0.6982143182307482, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 599.1643127441406, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 29.34033203125, |
|
"kl": 1.2954010009765624, |
|
"learning_rate": 1.8056604906573418e-06, |
|
"loss": 0.0518, |
|
"reward": 0.605357170291245, |
|
"reward_std": 0.30737774074077606, |
|
"rewards/accuracy_reward": 0.605357170291245, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 577.0696716308594, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 33.51802062988281, |
|
"kl": 0.703057861328125, |
|
"learning_rate": 1.7506681449278226e-06, |
|
"loss": 0.0281, |
|
"reward": 0.571428601257503, |
|
"reward_std": 0.32992145605385303, |
|
"rewards/accuracy_reward": 0.571428601257503, |
|
"rewards/format_reward": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 501.30717010498046, |
|
"epoch": 0.512, |
|
"grad_norm": 7.524632930755615, |
|
"kl": 0.3678741455078125, |
|
"learning_rate": 1.6953268804334257e-06, |
|
"loss": 0.0147, |
|
"reward": 0.589285738952458, |
|
"reward_std": 0.28248333670198916, |
|
"rewards/accuracy_reward": 0.589285738952458, |
|
"rewards/format_reward": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 462.3607376098633, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 6.015294075012207, |
|
"kl": 0.3237091064453125, |
|
"learning_rate": 1.6397137297211436e-06, |
|
"loss": 0.0129, |
|
"reward": 0.5589285986497998, |
|
"reward_std": 0.3269613076001406, |
|
"rewards/accuracy_reward": 0.5589285986497998, |
|
"rewards/format_reward": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 638.7928871154785, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 1.8317821025848389, |
|
"kl": 0.807421875, |
|
"learning_rate": 1.5839061037913395e-06, |
|
"loss": 0.0323, |
|
"reward": 0.3250000160187483, |
|
"reward_std": 0.34638786166906355, |
|
"rewards/accuracy_reward": 0.3250000160187483, |
|
"rewards/format_reward": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 636.7161018371582, |
|
"epoch": 0.544, |
|
"grad_norm": 0.6774631142616272, |
|
"kl": 0.53349609375, |
|
"learning_rate": 1.527981684345115e-06, |
|
"loss": 0.0213, |
|
"reward": 0.35714287366718056, |
|
"reward_std": 0.3179732210934162, |
|
"rewards/accuracy_reward": 0.35714287366718056, |
|
"rewards/format_reward": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 558.9875259399414, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 2.738410472869873, |
|
"kl": 0.205511474609375, |
|
"learning_rate": 1.4720183156548855e-06, |
|
"loss": 0.0082, |
|
"reward": 0.6000000305473805, |
|
"reward_std": 0.3251969013363123, |
|
"rewards/accuracy_reward": 0.6000000305473805, |
|
"rewards/format_reward": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 545.3893096923828, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 17.57588005065918, |
|
"kl": 1.5068023681640625, |
|
"learning_rate": 1.4160938962086612e-06, |
|
"loss": 0.0603, |
|
"reward": 0.6089285997673869, |
|
"reward_std": 0.28422979824244976, |
|
"rewards/accuracy_reward": 0.6089285997673869, |
|
"rewards/format_reward": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 526.2607391357421, |
|
"epoch": 0.576, |
|
"grad_norm": 38.554412841796875, |
|
"kl": 1.0892745971679687, |
|
"learning_rate": 1.3602862702788567e-06, |
|
"loss": 0.0436, |
|
"reward": 0.6267857417464257, |
|
"reward_std": 0.32045440524816515, |
|
"rewards/accuracy_reward": 0.6267857417464257, |
|
"rewards/format_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 517.2678787231446, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 4.718568325042725, |
|
"kl": 1.1258697509765625, |
|
"learning_rate": 1.3046731195665748e-06, |
|
"loss": 0.045, |
|
"reward": 0.6464285977184773, |
|
"reward_std": 0.320261836796999, |
|
"rewards/accuracy_reward": 0.6464285977184773, |
|
"rewards/format_reward": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 514.8375244140625, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 4.270685195922852, |
|
"kl": 1.4042236328125, |
|
"learning_rate": 1.2493318550721775e-06, |
|
"loss": 0.0562, |
|
"reward": 0.6982143145054579, |
|
"reward_std": 0.2838581532239914, |
|
"rewards/accuracy_reward": 0.6982143145054579, |
|
"rewards/format_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 534.7946655273438, |
|
"epoch": 0.608, |
|
"grad_norm": 9.498967170715332, |
|
"kl": 0.8575332641601563, |
|
"learning_rate": 1.1943395093426585e-06, |
|
"loss": 0.0343, |
|
"reward": 0.6910714596509934, |
|
"reward_std": 0.2867689304053783, |
|
"rewards/accuracy_reward": 0.6910714596509934, |
|
"rewards/format_reward": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 529.5875297546387, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 8.131922721862793, |
|
"kl": 0.6070526123046875, |
|
"learning_rate": 1.1397726292458115e-06, |
|
"loss": 0.0243, |
|
"reward": 0.675000030733645, |
|
"reward_std": 0.2807727467268705, |
|
"rewards/accuracy_reward": 0.675000030733645, |
|
"rewards/format_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 557.6500236511231, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 2.224785089492798, |
|
"kl": 1.1581512451171876, |
|
"learning_rate": 1.085707169420437e-06, |
|
"loss": 0.0463, |
|
"reward": 0.6142857430502773, |
|
"reward_std": 0.2811264578253031, |
|
"rewards/accuracy_reward": 0.6142857430502773, |
|
"rewards/format_reward": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 529.6071685791015, |
|
"epoch": 0.64, |
|
"grad_norm": 8.0582857131958, |
|
"kl": 1.2655914306640625, |
|
"learning_rate": 1.0322183865509054e-06, |
|
"loss": 0.0506, |
|
"reward": 0.6750000296160579, |
|
"reward_std": 0.28525091484189036, |
|
"rewards/accuracy_reward": 0.6750000296160579, |
|
"rewards/format_reward": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_completion_length": 562.8381398986817, |
|
"eval_kl": 0.95684150390625, |
|
"eval_loss": 0.03832858428359032, |
|
"eval_reward": 0.5700857400953769, |
|
"eval_reward_std": 0.30191256090998647, |
|
"eval_rewards/accuracy_reward": 0.5700857400953769, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 30482.0361, |
|
"eval_samples_per_second": 0.164, |
|
"eval_steps_per_second": 0.012, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 557.5018104553222, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 2.934140682220459, |
|
"kl": 0.6237777709960938, |
|
"learning_rate": 9.793807346132464e-07, |
|
"loss": 0.025, |
|
"reward": 0.6339285995811224, |
|
"reward_std": 0.3295004416257143, |
|
"rewards/accuracy_reward": 0.6339285995811224, |
|
"rewards/format_reward": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 507.2857383728027, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 1.5333462953567505, |
|
"kl": 0.18207855224609376, |
|
"learning_rate": 9.272677612385667e-07, |
|
"loss": 0.0073, |
|
"reward": 0.6017857391387225, |
|
"reward_std": 0.2927430454641581, |
|
"rewards/accuracy_reward": 0.6017857391387225, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 535.4660949707031, |
|
"epoch": 0.672, |
|
"grad_norm": 0.8268552422523499, |
|
"kl": 0.2217437744140625, |
|
"learning_rate": 8.759520053380591e-07, |
|
"loss": 0.0089, |
|
"reward": 0.5303571715950965, |
|
"reward_std": 0.30413119196891786, |
|
"rewards/accuracy_reward": 0.5303571715950965, |
|
"rewards/format_reward": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 587.7893112182617, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 0.8802245259284973, |
|
"kl": 0.28748779296875, |
|
"learning_rate": 8.255048961321088e-07, |
|
"loss": 0.0115, |
|
"reward": 0.5250000260770321, |
|
"reward_std": 0.32775397710502147, |
|
"rewards/accuracy_reward": 0.5250000260770321, |
|
"rewards/format_reward": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 568.7143135070801, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.42495808005332947, |
|
"kl": 0.20731658935546876, |
|
"learning_rate": 7.759966537240373e-07, |
|
"loss": 0.0083, |
|
"reward": 0.6357143180444836, |
|
"reward_std": 0.32221881337463854, |
|
"rewards/accuracy_reward": 0.6357143180444836, |
|
"rewards/format_reward": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 572.9053848266601, |
|
"epoch": 0.704, |
|
"grad_norm": 0.8112100958824158, |
|
"kl": 0.2259490966796875, |
|
"learning_rate": 7.274961913568773e-07, |
|
"loss": 0.009, |
|
"reward": 0.6071428848430515, |
|
"reward_std": 0.28955445289611814, |
|
"rewards/accuracy_reward": 0.6071428848430515, |
|
"rewards/format_reward": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 585.521450805664, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 2.5413522720336914, |
|
"kl": 0.1725799560546875, |
|
"learning_rate": 6.800710194892484e-07, |
|
"loss": 0.0069, |
|
"reward": 0.6946428865194321, |
|
"reward_std": 0.27458811886608603, |
|
"rewards/accuracy_reward": 0.6946428865194321, |
|
"rewards/format_reward": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 578.700025177002, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 2.077190637588501, |
|
"kl": 0.23962783813476562, |
|
"learning_rate": 6.33787151823836e-07, |
|
"loss": 0.0096, |
|
"reward": 0.6517857488244772, |
|
"reward_std": 0.29764223508536813, |
|
"rewards/accuracy_reward": 0.6517857488244772, |
|
"rewards/format_reward": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 517.0357376098633, |
|
"epoch": 0.736, |
|
"grad_norm": 1.8005852699279785, |
|
"kl": 0.1731414794921875, |
|
"learning_rate": 5.887090134192947e-07, |
|
"loss": 0.0069, |
|
"reward": 0.7303571674972773, |
|
"reward_std": 0.20812651440501212, |
|
"rewards/accuracy_reward": 0.7303571674972773, |
|
"rewards/format_reward": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 537.6071670532226, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 5.837470531463623, |
|
"kl": 0.22538909912109376, |
|
"learning_rate": 5.448993510134669e-07, |
|
"loss": 0.009, |
|
"reward": 0.7285714605823159, |
|
"reward_std": 0.23610220104455948, |
|
"rewards/accuracy_reward": 0.7285714605823159, |
|
"rewards/format_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 511.03573455810545, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 10.607796669006348, |
|
"kl": 0.2825325012207031, |
|
"learning_rate": 5.024191456827498e-07, |
|
"loss": 0.0113, |
|
"reward": 0.7357143178582192, |
|
"reward_std": 0.24888310953974724, |
|
"rewards/accuracy_reward": 0.7357143178582192, |
|
"rewards/format_reward": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 528.4375274658203, |
|
"epoch": 0.768, |
|
"grad_norm": 17.805084228515625, |
|
"kl": 0.409503173828125, |
|
"learning_rate": 4.6132752795918667e-07, |
|
"loss": 0.0164, |
|
"reward": 0.6857143182307481, |
|
"reward_std": 0.28699738159775734, |
|
"rewards/accuracy_reward": 0.6857143182307481, |
|
"rewards/format_reward": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 519.4303817749023, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 7.785521507263184, |
|
"kl": 0.34308319091796874, |
|
"learning_rate": 4.2168169552342905e-07, |
|
"loss": 0.0137, |
|
"reward": 0.6928571717813611, |
|
"reward_std": 0.29954983331263063, |
|
"rewards/accuracy_reward": 0.6928571717813611, |
|
"rewards/format_reward": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 504.0107368469238, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 4.727066516876221, |
|
"kl": 0.7092597961425782, |
|
"learning_rate": 3.8353683358814046e-07, |
|
"loss": 0.0285, |
|
"reward": 0.7000000275671482, |
|
"reward_std": 0.23943399637937546, |
|
"rewards/accuracy_reward": 0.7000000275671482, |
|
"rewards/format_reward": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 517.7428833007813, |
|
"epoch": 0.8, |
|
"grad_norm": 7.284985542297363, |
|
"kl": 0.2571014404296875, |
|
"learning_rate": 3.469460380826697e-07, |
|
"loss": 0.0103, |
|
"reward": 0.673214310593903, |
|
"reward_std": 0.2639926388859749, |
|
"rewards/accuracy_reward": 0.673214310593903, |
|
"rewards/format_reward": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 543.2714515686035, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 14.041029930114746, |
|
"kl": 0.28242645263671873, |
|
"learning_rate": 3.119602417459075e-07, |
|
"loss": 0.0113, |
|
"reward": 0.6732143096625804, |
|
"reward_std": 0.25163274370133876, |
|
"rewards/accuracy_reward": 0.6732143096625804, |
|
"rewards/format_reward": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 546.032169342041, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 15.783819198608398, |
|
"kl": 0.20561065673828124, |
|
"learning_rate": 2.786281432302071e-07, |
|
"loss": 0.0082, |
|
"reward": 0.7339286010712385, |
|
"reward_std": 0.234727381169796, |
|
"rewards/accuracy_reward": 0.7339286010712385, |
|
"rewards/format_reward": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 546.478592300415, |
|
"epoch": 0.832, |
|
"grad_norm": 6.679286003112793, |
|
"kl": 0.18747406005859374, |
|
"learning_rate": 2.46996139315057e-07, |
|
"loss": 0.0075, |
|
"reward": 0.7375000283122063, |
|
"reward_std": 0.2588605497032404, |
|
"rewards/accuracy_reward": 0.7375000283122063, |
|
"rewards/format_reward": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 531.3750255584716, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 4.5977044105529785, |
|
"kl": 0.1937103271484375, |
|
"learning_rate": 2.1710826032485286e-07, |
|
"loss": 0.0077, |
|
"reward": 0.7357143165543676, |
|
"reward_std": 0.2378486678004265, |
|
"rewards/accuracy_reward": 0.7357143165543676, |
|
"rewards/format_reward": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 595.6964569091797, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 4.166606903076172, |
|
"kl": 0.2564910888671875, |
|
"learning_rate": 1.8900610884066817e-07, |
|
"loss": 0.0103, |
|
"reward": 0.6285714594647288, |
|
"reward_std": 0.2736343163996935, |
|
"rewards/accuracy_reward": 0.6285714594647288, |
|
"rewards/format_reward": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_completion_length": 569.8527975585938, |
|
"eval_kl": 0.20817554931640625, |
|
"eval_loss": 0.008296786807477474, |
|
"eval_reward": 0.6110285988628864, |
|
"eval_reward_std": 0.28674686477184297, |
|
"eval_rewards/accuracy_reward": 0.6110285988628864, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 29873.0033, |
|
"eval_samples_per_second": 0.167, |
|
"eval_steps_per_second": 0.012, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 605.528596496582, |
|
"epoch": 0.864, |
|
"grad_norm": 49.79132843017578, |
|
"kl": 0.341680908203125, |
|
"learning_rate": 1.627288017913383e-07, |
|
"loss": 0.0137, |
|
"reward": 0.5571428839117288, |
|
"reward_std": 0.3719855587929487, |
|
"rewards/accuracy_reward": 0.5571428839117288, |
|
"rewards/format_reward": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 637.5250267028808, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 12.56246566772461, |
|
"kl": 0.39511566162109374, |
|
"learning_rate": 1.3831291600445573e-07, |
|
"loss": 0.0158, |
|
"reward": 0.5392857391387225, |
|
"reward_std": 0.3052775662392378, |
|
"rewards/accuracy_reward": 0.5392857391387225, |
|
"rewards/format_reward": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 631.8607391357422, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 6.395442962646484, |
|
"kl": 0.45283203125, |
|
"learning_rate": 1.1579243729307487e-07, |
|
"loss": 0.0181, |
|
"reward": 0.49107145331799984, |
|
"reward_std": 0.36503970213234427, |
|
"rewards/accuracy_reward": 0.49107145331799984, |
|
"rewards/format_reward": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 611.7053848266602, |
|
"epoch": 0.896, |
|
"grad_norm": 1.6087334156036377, |
|
"kl": 0.3666015625, |
|
"learning_rate": 9.519871314899092e-08, |
|
"loss": 0.0147, |
|
"reward": 0.6000000327825546, |
|
"reward_std": 0.3550443138927221, |
|
"rewards/accuracy_reward": 0.6000000327825546, |
|
"rewards/format_reward": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 613.4946708679199, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 1.6784651279449463, |
|
"kl": 0.3438507080078125, |
|
"learning_rate": 7.656040910844358e-08, |
|
"loss": 0.0138, |
|
"reward": 0.5946428859606385, |
|
"reward_std": 0.3528768301010132, |
|
"rewards/accuracy_reward": 0.5946428859606385, |
|
"rewards/format_reward": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 603.3143119812012, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 4.043834209442139, |
|
"kl": 0.3444915771484375, |
|
"learning_rate": 5.990346885098235e-08, |
|
"loss": 0.0138, |
|
"reward": 0.5964285979047418, |
|
"reward_std": 0.3912374936044216, |
|
"rewards/accuracy_reward": 0.5964285979047418, |
|
"rewards/format_reward": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 575.2339546203614, |
|
"epoch": 0.928, |
|
"grad_norm": 4.404404163360596, |
|
"kl": 0.30106658935546876, |
|
"learning_rate": 4.5251078087033493e-08, |
|
"loss": 0.012, |
|
"reward": 0.6696428902447223, |
|
"reward_std": 0.3243862982839346, |
|
"rewards/accuracy_reward": 0.6696428902447223, |
|
"rewards/format_reward": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 582.6393119812012, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 2.593308448791504, |
|
"kl": 0.383941650390625, |
|
"learning_rate": 3.262363228443427e-08, |
|
"loss": 0.0154, |
|
"reward": 0.6035714616999031, |
|
"reward_std": 0.29665699824690817, |
|
"rewards/accuracy_reward": 0.6035714616999031, |
|
"rewards/format_reward": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 627.2125282287598, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 5.177957057952881, |
|
"kl": 0.3490997314453125, |
|
"learning_rate": 2.2038708278862952e-08, |
|
"loss": 0.014, |
|
"reward": 0.5839285971596837, |
|
"reward_std": 0.30867667235434054, |
|
"rewards/accuracy_reward": 0.5839285971596837, |
|
"rewards/format_reward": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 576.0964553833007, |
|
"epoch": 0.96, |
|
"grad_norm": 3.4815688133239746, |
|
"kl": 0.2956207275390625, |
|
"learning_rate": 1.3511039807673209e-08, |
|
"loss": 0.0118, |
|
"reward": 0.6589286010712385, |
|
"reward_std": 0.3096484154462814, |
|
"rewards/accuracy_reward": 0.6589286010712385, |
|
"rewards/format_reward": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 531.355379486084, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 2.63545298576355, |
|
"kl": 0.261767578125, |
|
"learning_rate": 7.0524970011963675e-09, |
|
"loss": 0.0105, |
|
"reward": 0.7125000340864063, |
|
"reward_std": 0.28212962336838243, |
|
"rewards/accuracy_reward": 0.7125000340864063, |
|
"rewards/format_reward": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 569.5285957336425, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 1.8794143199920654, |
|
"kl": 0.2968017578125, |
|
"learning_rate": 2.6720698600553595e-09, |
|
"loss": 0.0119, |
|
"reward": 0.6303571775555611, |
|
"reward_std": 0.3074988707900047, |
|
"rewards/accuracy_reward": 0.6303571775555611, |
|
"rewards/format_reward": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 612.3214569091797, |
|
"epoch": 0.992, |
|
"grad_norm": 3.266709327697754, |
|
"kl": 0.39468994140625, |
|
"learning_rate": 3.7585574148779613e-10, |
|
"loss": 0.0158, |
|
"reward": 0.5517857383936644, |
|
"reward_std": 0.3217977944761515, |
|
"rewards/accuracy_reward": 0.5517857383936644, |
|
"rewards/format_reward": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 626.7321701049805, |
|
"epoch": 0.9984, |
|
"kl": 0.32488250732421875, |
|
"reward": 0.604166692122817, |
|
"reward_std": 0.3102082473536332, |
|
"rewards/accuracy_reward": 0.604166692122817, |
|
"rewards/format_reward": 0.0, |
|
"step": 468, |
|
"total_flos": 0.0, |
|
"train_loss": 0.00286464851636153, |
|
"train_runtime": 7716.3929, |
|
"train_samples_per_second": 0.972, |
|
"train_steps_per_second": 0.061 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|