{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.991869918699187, "eval_steps": 100, "global_step": 61, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 326.9895935058594, "epoch": 0.016260162601626018, "grad_norm": 0.6227036118507385, "kl": 0.0, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.4496528208255768, "reward_std": 0.08717759698629379, "rewards/semantic_prob_reward": 0.4496528208255768, "step": 1 }, { "completion_length": 405.0833435058594, "epoch": 0.032520325203252036, "grad_norm": 0.6315253973007202, "kl": 0.0005207061767578125, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.3298611119389534, "reward_std": 0.1270247846841812, "rewards/semantic_prob_reward": 0.3298611119389534, "step": 2 }, { "completion_length": 322.78126525878906, "epoch": 0.04878048780487805, "grad_norm": 0.7605056762695312, "kl": 0.001079559326171875, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.378472238779068, "reward_std": 0.18147467076778412, "rewards/semantic_prob_reward": 0.378472238779068, "step": 3 }, { "completion_length": 297.8229217529297, "epoch": 0.06504065040650407, "grad_norm": 0.7152072191238403, "kl": 0.00182342529296875, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5555555522441864, "reward_std": 0.17427652329206467, "rewards/semantic_prob_reward": 0.5555555522441864, "step": 4 }, { "completion_length": 343.7708435058594, "epoch": 0.08130081300813008, "grad_norm": 0.5844411849975586, "kl": 0.00408935546875, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.5868056118488312, "reward_std": 0.1467457190155983, "rewards/semantic_prob_reward": 0.5868056118488312, "step": 5 }, { "completion_length": 299.96876525878906, "epoch": 0.0975609756097561, "grad_norm": 0.7371426224708557, "kl": 0.0071258544921875, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.5052083432674408, "reward_std": 0.13677234202623367, "rewards/semantic_prob_reward": 0.5052083432674408, "step": 6 }, { "completion_length": 251.04167938232422, "epoch": 0.11382113821138211, "grad_norm": 0.8003924489021301, "kl": 0.013641357421875, "learning_rate": 2e-06, "loss": 0.0003, "reward": 0.404513917863369, "reward_std": 0.13188457489013672, "rewards/semantic_prob_reward": 0.404513917863369, "step": 7 }, { "completion_length": 281.15625, "epoch": 0.13008130081300814, "grad_norm": 0.8694770932197571, "kl": 0.011932373046875, "learning_rate": 2e-06, "loss": 0.0002, "reward": 0.6684028208255768, "reward_std": 0.1733601987361908, "rewards/semantic_prob_reward": 0.6684028208255768, "step": 8 }, { "completion_length": 240.6041717529297, "epoch": 0.14634146341463414, "grad_norm": 0.7374126315116882, "kl": 0.0234375, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.7013888955116272, "reward_std": 0.18010085076093674, "rewards/semantic_prob_reward": 0.7013888955116272, "step": 9 }, { "completion_length": 232.90625762939453, "epoch": 0.16260162601626016, "grad_norm": 0.7839574813842773, "kl": 0.0263671875, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.493055522441864, "reward_std": 0.09839068725705147, "rewards/semantic_prob_reward": 0.493055522441864, "step": 10 }, { "completion_length": 264.21875762939453, "epoch": 0.17886178861788618, "grad_norm": 0.9004881381988525, "kl": 0.03497314453125, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.460069477558136, "reward_std": 0.1534503921866417, "rewards/semantic_prob_reward": 0.460069477558136, "step": 11 }, { "completion_length": 297.22918701171875, "epoch": 0.1951219512195122, "grad_norm": 0.7715654969215393, "kl": 0.03662109375, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.4461805522441864, "reward_std": 0.11340713500976562, "rewards/semantic_prob_reward": 0.4461805522441864, "step": 12 }, { "completion_length": 255.6666717529297, "epoch": 0.21138211382113822, "grad_norm": 0.6141469478607178, "kl": 0.0439453125, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.6354166269302368, "reward_std": 0.11250432580709457, "rewards/semantic_prob_reward": 0.6354166269302368, "step": 13 }, { "completion_length": 254.4791717529297, "epoch": 0.22764227642276422, "grad_norm": 0.8626372814178467, "kl": 0.0457763671875, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.3888888955116272, "reward_std": 0.14496835693717003, "rewards/semantic_prob_reward": 0.3888888955116272, "step": 14 }, { "completion_length": 262.7604217529297, "epoch": 0.24390243902439024, "grad_norm": 0.8154762387275696, "kl": 0.062744140625, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.5642361044883728, "reward_std": 0.1623847484588623, "rewards/semantic_prob_reward": 0.5642361044883728, "step": 15 }, { "completion_length": 250.53125762939453, "epoch": 0.2601626016260163, "grad_norm": 0.8684131503105164, "kl": 0.0484619140625, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.546875, "reward_std": 0.10251581482589245, "rewards/semantic_prob_reward": 0.546875, "step": 16 }, { "completion_length": 268.1979293823242, "epoch": 0.2764227642276423, "grad_norm": 0.8222023844718933, "kl": 0.0479736328125, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.6875, "reward_std": 0.15590529516339302, "rewards/semantic_prob_reward": 0.6875, "step": 17 }, { "completion_length": 238.45833587646484, "epoch": 0.2926829268292683, "grad_norm": 0.830650806427002, "kl": 0.0574951171875, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.4236111044883728, "reward_std": 0.07365942373871803, "rewards/semantic_prob_reward": 0.4236111044883728, "step": 18 }, { "completion_length": 272.58333587646484, "epoch": 0.3089430894308943, "grad_norm": 0.7622032165527344, "kl": 0.054931640625, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.5555555522441864, "reward_std": 0.1605427861213684, "rewards/semantic_prob_reward": 0.5555555522441864, "step": 19 }, { "completion_length": 259.7291793823242, "epoch": 0.3252032520325203, "grad_norm": 0.6568298935890198, "kl": 0.072021484375, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.7465277910232544, "reward_std": 0.08705444633960724, "rewards/semantic_prob_reward": 0.7465277910232544, "step": 20 }, { "completion_length": 253.46875762939453, "epoch": 0.34146341463414637, "grad_norm": 0.8788058757781982, "kl": 0.0638427734375, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.470486119389534, "reward_std": 0.16892866045236588, "rewards/semantic_prob_reward": 0.470486119389534, "step": 21 }, { "completion_length": 246.26042938232422, "epoch": 0.35772357723577236, "grad_norm": 0.8425477147102356, "kl": 0.080322265625, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.578125, "reward_std": 0.14923276007175446, "rewards/semantic_prob_reward": 0.578125, "step": 22 }, { "completion_length": 232.7291717529297, "epoch": 0.37398373983739835, "grad_norm": 0.826454222202301, "kl": 0.09228515625, "learning_rate": 2e-06, "loss": 0.0018, "reward": 0.6232638955116272, "reward_std": 0.12366497330367565, "rewards/semantic_prob_reward": 0.6232638955116272, "step": 23 }, { "completion_length": 262.6458435058594, "epoch": 0.3902439024390244, "grad_norm": 0.6750925183296204, "kl": 0.0687255859375, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.7256944477558136, "reward_std": 0.08347194455564022, "rewards/semantic_prob_reward": 0.7256944477558136, "step": 24 }, { "completion_length": 248.8854217529297, "epoch": 0.4065040650406504, "grad_norm": 0.6638458371162415, "kl": 0.080078125, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.5208333730697632, "reward_std": 0.08115789666771889, "rewards/semantic_prob_reward": 0.5208333730697632, "step": 25 }, { "completion_length": 240.40625762939453, "epoch": 0.42276422764227645, "grad_norm": 0.7445152401924133, "kl": 0.07421875, "learning_rate": 2e-06, "loss": 0.0015, "reward": 0.612847238779068, "reward_std": 0.10508114472031593, "rewards/semantic_prob_reward": 0.612847238779068, "step": 26 }, { "completion_length": 274.06250762939453, "epoch": 0.43902439024390244, "grad_norm": 0.6577355861663818, "kl": 0.068603515625, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.7534722685813904, "reward_std": 0.14198515191674232, "rewards/semantic_prob_reward": 0.7534722685813904, "step": 27 }, { "completion_length": 288.3541717529297, "epoch": 0.45528455284552843, "grad_norm": 0.74998939037323, "kl": 0.0869140625, "learning_rate": 2e-06, "loss": 0.0017, "reward": 0.6666666865348816, "reward_std": 0.16053049266338348, "rewards/semantic_prob_reward": 0.6666666865348816, "step": 28 }, { "completion_length": 301.43751525878906, "epoch": 0.4715447154471545, "grad_norm": 0.8003519773483276, "kl": 0.0673828125, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.5381944477558136, "reward_std": 0.09778347611427307, "rewards/semantic_prob_reward": 0.5381944477558136, "step": 29 }, { "completion_length": 300.12501525878906, "epoch": 0.4878048780487805, "grad_norm": 0.7790213227272034, "kl": 0.08154296875, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.4739583432674408, "reward_std": 0.1577911302447319, "rewards/semantic_prob_reward": 0.4739583432674408, "step": 30 }, { "completion_length": 340.37501525878906, "epoch": 0.5040650406504065, "grad_norm": 0.5246845483779907, "kl": 0.060791015625, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.5312499850988388, "reward_std": 0.0729602500796318, "rewards/semantic_prob_reward": 0.5312499850988388, "step": 31 }, { "completion_length": 260.40625, "epoch": 0.5203252032520326, "grad_norm": 0.8522101044654846, "kl": 0.0693359375, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.744791716337204, "reward_std": 0.16898231208324432, "rewards/semantic_prob_reward": 0.744791716337204, "step": 32 }, { "completion_length": 304.59376525878906, "epoch": 0.5365853658536586, "grad_norm": 0.6503528952598572, "kl": 0.061279296875, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.5277778208255768, "reward_std": 0.06914992816746235, "rewards/semantic_prob_reward": 0.5277778208255768, "step": 33 }, { "completion_length": 312.9895935058594, "epoch": 0.5528455284552846, "grad_norm": 0.5762602686882019, "kl": 0.059326171875, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.6545139402151108, "reward_std": 0.0959564745426178, "rewards/semantic_prob_reward": 0.6545139402151108, "step": 34 }, { "completion_length": 323.19793701171875, "epoch": 0.5691056910569106, "grad_norm": 0.686737060546875, "kl": 0.0577392578125, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.5468750596046448, "reward_std": 0.10773483663797379, "rewards/semantic_prob_reward": 0.5468750596046448, "step": 35 }, { "completion_length": 301.5729217529297, "epoch": 0.5853658536585366, "grad_norm": 0.6504166722297668, "kl": 0.056640625, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.621527761220932, "reward_std": 0.1079883947968483, "rewards/semantic_prob_reward": 0.621527761220932, "step": 36 }, { "completion_length": 271.3854217529297, "epoch": 0.6016260162601627, "grad_norm": 0.609277069568634, "kl": 0.06982421875, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.7204861044883728, "reward_std": 0.0827992595732212, "rewards/semantic_prob_reward": 0.7204861044883728, "step": 37 }, { "completion_length": 229.20833587646484, "epoch": 0.6178861788617886, "grad_norm": 0.7599443197250366, "kl": 0.0771484375, "learning_rate": 2e-06, "loss": 0.0015, "reward": 0.6406250596046448, "reward_std": 0.09435661882162094, "rewards/semantic_prob_reward": 0.6406250596046448, "step": 38 }, { "completion_length": 270.21876525878906, "epoch": 0.6341463414634146, "grad_norm": 0.6375428438186646, "kl": 0.077880859375, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.6145833432674408, "reward_std": 0.03708049841225147, "rewards/semantic_prob_reward": 0.6145833432674408, "step": 39 }, { "completion_length": 270.2916717529297, "epoch": 0.6504065040650406, "grad_norm": 0.7068930864334106, "kl": 0.088134765625, "learning_rate": 2e-06, "loss": 0.0018, "reward": 0.6892361044883728, "reward_std": 0.09742275439202785, "rewards/semantic_prob_reward": 0.6892361044883728, "step": 40 }, { "completion_length": 266.39583587646484, "epoch": 0.6666666666666666, "grad_norm": 0.7177326083183289, "kl": 0.080322265625, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.578125, "reward_std": 0.10076737776398659, "rewards/semantic_prob_reward": 0.578125, "step": 41 }, { "completion_length": 253.1354217529297, "epoch": 0.6829268292682927, "grad_norm": 0.6259705424308777, "kl": 0.067626953125, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.7916666865348816, "reward_std": 0.11008695513010025, "rewards/semantic_prob_reward": 0.7916666865348816, "step": 42 }, { "completion_length": 258.71876525878906, "epoch": 0.6991869918699187, "grad_norm": 0.75113844871521, "kl": 0.0693359375, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.6371528208255768, "reward_std": 0.14734797179698944, "rewards/semantic_prob_reward": 0.6371528208255768, "step": 43 }, { "completion_length": 278.90626525878906, "epoch": 0.7154471544715447, "grad_norm": 0.5536551475524902, "kl": 0.067138671875, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.6805555820465088, "reward_std": 0.12111106887459755, "rewards/semantic_prob_reward": 0.6805555820465088, "step": 44 }, { "completion_length": 247.5, "epoch": 0.7317073170731707, "grad_norm": 0.862553060054779, "kl": 0.07568359375, "learning_rate": 2e-06, "loss": 0.0015, "reward": 0.6736111044883728, "reward_std": 0.14401617273688316, "rewards/semantic_prob_reward": 0.6736111044883728, "step": 45 }, { "completion_length": 315.8958435058594, "epoch": 0.7479674796747967, "grad_norm": 0.7693763375282288, "kl": 0.0625, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.444444477558136, "reward_std": 0.18099138885736465, "rewards/semantic_prob_reward": 0.444444477558136, "step": 46 }, { "completion_length": 267.8958435058594, "epoch": 0.7642276422764228, "grad_norm": 0.6072492599487305, "kl": 0.0606689453125, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.682291716337204, "reward_std": 0.13895628601312637, "rewards/semantic_prob_reward": 0.682291716337204, "step": 47 }, { "completion_length": 285.6770935058594, "epoch": 0.7804878048780488, "grad_norm": 0.6549988985061646, "kl": 0.06884765625, "learning_rate": 2e-06, "loss": 0.0014, "reward": 0.569444477558136, "reward_std": 0.09050748497247696, "rewards/semantic_prob_reward": 0.569444477558136, "step": 48 }, { "completion_length": 280.3333435058594, "epoch": 0.7967479674796748, "grad_norm": 0.6826404333114624, "kl": 0.059814453125, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.6267361640930176, "reward_std": 0.06955048628151417, "rewards/semantic_prob_reward": 0.6267361640930176, "step": 49 }, { "completion_length": 331.72918701171875, "epoch": 0.8130081300813008, "grad_norm": 0.6480268836021423, "kl": 0.055419921875, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.4895833432674408, "reward_std": 0.09887155145406723, "rewards/semantic_prob_reward": 0.4895833432674408, "step": 50 }, { "completion_length": 318.3333435058594, "epoch": 0.8292682926829268, "grad_norm": 0.5913366675376892, "kl": 0.0465087890625, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.5954861342906952, "reward_std": 0.09535550139844418, "rewards/semantic_prob_reward": 0.5954861342906952, "step": 51 }, { "completion_length": 317.9270935058594, "epoch": 0.8455284552845529, "grad_norm": 0.6996480822563171, "kl": 0.052001953125, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.388888880610466, "reward_std": 0.122083043679595, "rewards/semantic_prob_reward": 0.388888880610466, "step": 52 }, { "completion_length": 260.84375, "epoch": 0.8617886178861789, "grad_norm": 0.6170709133148193, "kl": 0.0472412109375, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.7499999701976776, "reward_std": 0.0948900431394577, "rewards/semantic_prob_reward": 0.7499999701976776, "step": 53 }, { "completion_length": 304.1145935058594, "epoch": 0.8780487804878049, "grad_norm": 0.6153264045715332, "kl": 0.0499267578125, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.550347238779068, "reward_std": 0.07403821125626564, "rewards/semantic_prob_reward": 0.550347238779068, "step": 54 }, { "completion_length": 331.0416717529297, "epoch": 0.8943089430894309, "grad_norm": 0.48850128054618835, "kl": 0.0430908203125, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.675347238779068, "reward_std": 0.06912494450807571, "rewards/semantic_prob_reward": 0.675347238779068, "step": 55 }, { "completion_length": 309.53125, "epoch": 0.9105691056910569, "grad_norm": 0.5893312692642212, "kl": 0.0594482421875, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.616319477558136, "reward_std": 0.10437992960214615, "rewards/semantic_prob_reward": 0.616319477558136, "step": 56 }, { "completion_length": 321.6458435058594, "epoch": 0.926829268292683, "grad_norm": 0.6194024085998535, "kl": 0.057861328125, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.647569477558136, "reward_std": 0.14092495292425156, "rewards/semantic_prob_reward": 0.647569477558136, "step": 57 }, { "completion_length": 304.1145935058594, "epoch": 0.943089430894309, "grad_norm": 0.6568655967712402, "kl": 0.04931640625, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.7777777910232544, "reward_std": 0.13045287132263184, "rewards/semantic_prob_reward": 0.7777777910232544, "step": 58 }, { "completion_length": 285.6770935058594, "epoch": 0.959349593495935, "grad_norm": 0.6948373317718506, "kl": 0.065185546875, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.5954861342906952, "reward_std": 0.12868088483810425, "rewards/semantic_prob_reward": 0.5954861342906952, "step": 59 }, { "completion_length": 296.3958435058594, "epoch": 0.975609756097561, "grad_norm": 0.6584079265594482, "kl": 0.0615234375, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.6736111640930176, "reward_std": 0.1756286770105362, "rewards/semantic_prob_reward": 0.6736111640930176, "step": 60 }, { "completion_length": 298.2604293823242, "epoch": 0.991869918699187, "grad_norm": 0.6497851014137268, "kl": 0.0648193359375, "learning_rate": 2e-06, "loss": 0.0013, "reward": 0.640625, "reward_std": 0.10011639073491096, "rewards/semantic_prob_reward": 0.640625, "step": 61 }, { "epoch": 0.991869918699187, "step": 61, "total_flos": 0.0, "train_loss": 0.0010827767922910098, "train_runtime": 3385.3213, "train_samples_per_second": 0.145, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 61, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }