{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 82, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 357.7986145019531, "epoch": 0.024390243902439025, "grad_norm": 0.47233226895332336, "kl": 0.0, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.4097222238779068, "reward_std": 0.14688297733664513, "rewards/semantic_prob_reward": 0.4097222238779068, "step": 1 }, { "completion_length": 365.84027099609375, "epoch": 0.04878048780487805, "grad_norm": 0.478311151266098, "kl": 0.00047397613525390625, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.340277761220932, "reward_std": 0.07923426479101181, "rewards/semantic_prob_reward": 0.340277761220932, "step": 2 }, { "completion_length": 326.9861145019531, "epoch": 0.07317073170731707, "grad_norm": 0.4847537577152252, "kl": 0.001033782958984375, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.472222238779068, "reward_std": 0.15099337697029114, "rewards/semantic_prob_reward": 0.472222238779068, "step": 3 }, { "completion_length": 337.96527099609375, "epoch": 0.0975609756097561, "grad_norm": 0.5380074977874756, "kl": 0.001251220703125, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.37384258210659027, "reward_std": 0.12506470456719398, "rewards/semantic_prob_reward": 0.37384258210659027, "step": 4 }, { "completion_length": 309.3402862548828, "epoch": 0.12195121951219512, "grad_norm": 0.574732780456543, "kl": 0.0018157958984375, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.4027777761220932, "reward_std": 0.13711940497159958, "rewards/semantic_prob_reward": 0.4027777761220932, "step": 5 }, { "completion_length": 321.9791717529297, "epoch": 0.14634146341463414, "grad_norm": 0.48638466000556946, "kl": 0.00345611572265625, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.5578703880310059, "reward_std": 0.11088646575808525, "rewards/semantic_prob_reward": 0.5578703880310059, "step": 6 }, { "completion_length": 291.50001525878906, "epoch": 0.17073170731707318, "grad_norm": 0.6627788543701172, "kl": 0.0049285888671875, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.40046297013759613, "reward_std": 0.16729174554347992, "rewards/semantic_prob_reward": 0.40046297013759613, "step": 7 }, { "completion_length": 333.9166564941406, "epoch": 0.1951219512195122, "grad_norm": 0.45172250270843506, "kl": 0.0053558349609375, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.3854166567325592, "reward_std": 0.08161446824669838, "rewards/semantic_prob_reward": 0.3854166567325592, "step": 8 }, { "completion_length": 324.2916717529297, "epoch": 0.21951219512195122, "grad_norm": 0.49190187454223633, "kl": 0.0042724609375, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.5821759402751923, "reward_std": 0.16060803085565567, "rewards/semantic_prob_reward": 0.5821759402751923, "step": 9 }, { "completion_length": 307.59722900390625, "epoch": 0.24390243902439024, "grad_norm": 0.5778583288192749, "kl": 0.0066680908203125, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.35300925374031067, "reward_std": 0.12848389521241188, "rewards/semantic_prob_reward": 0.35300925374031067, "step": 10 }, { "completion_length": 315.5416564941406, "epoch": 0.2682926829268293, "grad_norm": 0.5226218104362488, "kl": 0.0061492919921875, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.49421297013759613, "reward_std": 0.13309864327311516, "rewards/semantic_prob_reward": 0.49421297013759613, "step": 11 }, { "completion_length": 313.30555725097656, "epoch": 0.2926829268292683, "grad_norm": 0.4998522102832794, "kl": 0.0058441162109375, "learning_rate": 2e-06, "loss": 0.0001, "reward": 0.5046296268701553, "reward_std": 0.13224972039461136, "rewards/semantic_prob_reward": 0.5046296268701553, "step": 12 }, { "completion_length": 325.6319580078125, "epoch": 0.3170731707317073, "grad_norm": 0.544158399105072, "kl": 0.00799560546875, "learning_rate": 2e-06, "loss": 0.0002, "reward": 0.4548610895872116, "reward_std": 0.11304211243987083, "rewards/semantic_prob_reward": 0.4548610895872116, "step": 13 }, { "completion_length": 308.4652862548828, "epoch": 0.34146341463414637, "grad_norm": 0.4912228286266327, "kl": 0.00982666015625, "learning_rate": 2e-06, "loss": 0.0002, "reward": 0.47800926864147186, "reward_std": 0.10628756135702133, "rewards/semantic_prob_reward": 0.47800926864147186, "step": 14 }, { "completion_length": 265.4652786254883, "epoch": 0.36585365853658536, "grad_norm": 0.5952423214912415, "kl": 0.0174560546875, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.44328702986240387, "reward_std": 0.11145028471946716, "rewards/semantic_prob_reward": 0.44328702986240387, "step": 15 }, { "completion_length": 280.74305725097656, "epoch": 0.3902439024390244, "grad_norm": 0.5030803084373474, "kl": 0.0162353515625, "learning_rate": 2e-06, "loss": 0.0003, "reward": 0.5590278208255768, "reward_std": 0.08214099518954754, "rewards/semantic_prob_reward": 0.5590278208255768, "step": 16 }, { "completion_length": 239.74305725097656, "epoch": 0.4146341463414634, "grad_norm": 0.654198944568634, "kl": 0.02203369140625, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.35300926864147186, "reward_std": 0.12268669158220291, "rewards/semantic_prob_reward": 0.35300926864147186, "step": 17 }, { "completion_length": 267.5972137451172, "epoch": 0.43902439024390244, "grad_norm": 0.6382430195808411, "kl": 0.023193359375, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.548611119389534, "reward_std": 0.1619417816400528, "rewards/semantic_prob_reward": 0.548611119389534, "step": 18 }, { "completion_length": 234.15277862548828, "epoch": 0.4634146341463415, "grad_norm": 0.7074310183525085, "kl": 0.03375244140625, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.38773147761821747, "reward_std": 0.09990003705024719, "rewards/semantic_prob_reward": 0.38773147761821747, "step": 19 }, { "completion_length": 250.70138549804688, "epoch": 0.4878048780487805, "grad_norm": 0.5313490629196167, "kl": 0.0289306640625, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.4444444626569748, "reward_std": 0.0742310918867588, "rewards/semantic_prob_reward": 0.4444444626569748, "step": 20 }, { "completion_length": 230.83333587646484, "epoch": 0.5121951219512195, "grad_norm": 0.5721469521522522, "kl": 0.079833984375, "learning_rate": 2e-06, "loss": 0.0016, "reward": 0.663194477558136, "reward_std": 0.09703290555626154, "rewards/semantic_prob_reward": 0.663194477558136, "step": 21 }, { "completion_length": 225.63195037841797, "epoch": 0.5365853658536586, "grad_norm": 0.7681702971458435, "kl": 0.0391845703125, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.5740740597248077, "reward_std": 0.11927787587046623, "rewards/semantic_prob_reward": 0.5740740597248077, "step": 22 }, { "completion_length": 248.4513931274414, "epoch": 0.5609756097560976, "grad_norm": 0.6738829612731934, "kl": 0.03350830078125, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.5081018656492233, "reward_std": 0.17548304051160812, "rewards/semantic_prob_reward": 0.5081018656492233, "step": 23 }, { "completion_length": 234.77777862548828, "epoch": 0.5853658536585366, "grad_norm": 0.7024486064910889, "kl": 0.0347900390625, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.49074074625968933, "reward_std": 0.11887246370315552, "rewards/semantic_prob_reward": 0.49074074625968933, "step": 24 }, { "completion_length": 259.51390075683594, "epoch": 0.6097560975609756, "grad_norm": 0.6278256177902222, "kl": 0.0274658203125, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.5023148208856583, "reward_std": 0.1663999781012535, "rewards/semantic_prob_reward": 0.5023148208856583, "step": 25 }, { "completion_length": 238.09027862548828, "epoch": 0.6341463414634146, "grad_norm": 0.49240294098854065, "kl": 0.02777099609375, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.7384259402751923, "reward_std": 0.05578385107219219, "rewards/semantic_prob_reward": 0.7384259402751923, "step": 26 }, { "completion_length": 278.7222213745117, "epoch": 0.6585365853658537, "grad_norm": 0.5482228398323059, "kl": 0.023193359375, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.5208333134651184, "reward_std": 0.08630015514791012, "rewards/semantic_prob_reward": 0.5208333134651184, "step": 27 }, { "completion_length": 304.2291717529297, "epoch": 0.6829268292682927, "grad_norm": 0.42461201548576355, "kl": 0.0185546875, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.8067129552364349, "reward_std": 0.06628867937251925, "rewards/semantic_prob_reward": 0.8067129552364349, "step": 28 }, { "completion_length": 291.0138854980469, "epoch": 0.7073170731707317, "grad_norm": 0.4634277820587158, "kl": 0.01934814453125, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.5474537312984467, "reward_std": 0.06389683112502098, "rewards/semantic_prob_reward": 0.5474537312984467, "step": 29 }, { "completion_length": 269.8541717529297, "epoch": 0.7317073170731707, "grad_norm": 0.5370882153511047, "kl": 0.02099609375, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.5312500298023224, "reward_std": 0.10866545513272285, "rewards/semantic_prob_reward": 0.5312500298023224, "step": 30 }, { "completion_length": 289.8472213745117, "epoch": 0.7560975609756098, "grad_norm": 0.5340102910995483, "kl": 0.020751953125, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.37384262681007385, "reward_std": 0.09463867917656898, "rewards/semantic_prob_reward": 0.37384262681007385, "step": 31 }, { "completion_length": 318.2152862548828, "epoch": 0.7804878048780488, "grad_norm": 0.347016304731369, "kl": 0.01983642578125, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.7199074625968933, "reward_std": 0.07877019420266151, "rewards/semantic_prob_reward": 0.7199074625968933, "step": 32 }, { "completion_length": 321.78472900390625, "epoch": 0.8048780487804879, "grad_norm": 0.43192410469055176, "kl": 0.01873779296875, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.6157407313585281, "reward_std": 0.06844101194292307, "rewards/semantic_prob_reward": 0.6157407313585281, "step": 33 }, { "completion_length": 324.4583435058594, "epoch": 0.8292682926829268, "grad_norm": 0.39936167001724243, "kl": 0.0201416015625, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.7071759104728699, "reward_std": 0.07635667733848095, "rewards/semantic_prob_reward": 0.7071759104728699, "step": 34 }, { "completion_length": 324.2986145019531, "epoch": 0.8536585365853658, "grad_norm": 0.5036412477493286, "kl": 0.02099609375, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.6064814925193787, "reward_std": 0.12485842034220695, "rewards/semantic_prob_reward": 0.6064814925193787, "step": 35 }, { "completion_length": 305.2708282470703, "epoch": 0.8780487804878049, "grad_norm": 0.4643731117248535, "kl": 0.02056884765625, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.5601851642131805, "reward_std": 0.15315568447113037, "rewards/semantic_prob_reward": 0.5601851642131805, "step": 36 }, { "completion_length": 324.7361145019531, "epoch": 0.9024390243902439, "grad_norm": 0.5027931928634644, "kl": 0.019287109375, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.5208333432674408, "reward_std": 0.1526167280972004, "rewards/semantic_prob_reward": 0.5208333432674408, "step": 37 }, { "completion_length": 347.09722900390625, "epoch": 0.926829268292683, "grad_norm": 0.5362846851348877, "kl": 0.022705078125, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.4340277761220932, "reward_std": 0.13655241951346397, "rewards/semantic_prob_reward": 0.4340277761220932, "step": 38 }, { "completion_length": 337.65972900390625, "epoch": 0.9512195121951219, "grad_norm": 0.42662495374679565, "kl": 0.01947021484375, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.6018518656492233, "reward_std": 0.1015152232721448, "rewards/semantic_prob_reward": 0.6018518656492233, "step": 39 }, { "completion_length": 312.63890075683594, "epoch": 0.975609756097561, "grad_norm": 0.4735974073410034, "kl": 0.02545166015625, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.5219907462596893, "reward_std": 0.11485875025391579, "rewards/semantic_prob_reward": 0.5219907462596893, "step": 40 }, { "completion_length": 356.0, "epoch": 1.0, "grad_norm": 0.38108670711517334, "kl": 0.02081298828125, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.75, "reward_std": 0.0, "rewards/semantic_prob_reward": 0.75, "step": 41 }, { "completion_length": 315.9513854980469, "epoch": 1.024390243902439, "grad_norm": 0.3852001130580902, "kl": 0.026123046875, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.7118055522441864, "reward_std": 0.06554049998521805, "rewards/semantic_prob_reward": 0.7118055522441864, "step": 42 }, { "completion_length": 327.96527099609375, "epoch": 1.048780487804878, "grad_norm": 0.4032045006752014, "kl": 0.0242919921875, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.6782407164573669, "reward_std": 0.07121855020523071, "rewards/semantic_prob_reward": 0.6782407164573669, "step": 43 }, { "completion_length": 306.22222900390625, "epoch": 1.0731707317073171, "grad_norm": 0.5639402270317078, "kl": 0.02679443359375, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.4293981343507767, "reward_std": 0.16462695598602295, "rewards/semantic_prob_reward": 0.4293981343507767, "step": 44 }, { "completion_length": 370.96527099609375, "epoch": 1.0975609756097562, "grad_norm": 0.4736635982990265, "kl": 0.019287109375, "learning_rate": 2e-06, "loss": 0.0004, "reward": 0.47800929844379425, "reward_std": 0.10908563435077667, "rewards/semantic_prob_reward": 0.47800929844379425, "step": 45 }, { "completion_length": 351.2708282470703, "epoch": 1.1219512195121952, "grad_norm": 0.3622564375400543, "kl": 0.0260009765625, "learning_rate": 2e-06, "loss": 0.0005, "reward": 0.7557870447635651, "reward_std": 0.07712824456393719, "rewards/semantic_prob_reward": 0.7557870447635651, "step": 46 }, { "completion_length": 310.4166717529297, "epoch": 1.146341463414634, "grad_norm": 0.40313443541526794, "kl": 0.0284423828125, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.7650463283061981, "reward_std": 0.1128253573551774, "rewards/semantic_prob_reward": 0.7650463283061981, "step": 47 }, { "completion_length": 326.4791717529297, "epoch": 1.170731707317073, "grad_norm": 0.3694097697734833, "kl": 0.03057861328125, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.7557870447635651, "reward_std": 0.060931041836738586, "rewards/semantic_prob_reward": 0.7557870447635651, "step": 48 }, { "completion_length": 324.625, "epoch": 1.1951219512195121, "grad_norm": 0.2911110818386078, "kl": 0.0291748046875, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.9027777910232544, "reward_std": 0.053954847157001495, "rewards/semantic_prob_reward": 0.9027777910232544, "step": 49 }, { "completion_length": 362.3125, "epoch": 1.2195121951219512, "grad_norm": 0.3079072833061218, "kl": 0.02984619140625, "learning_rate": 2e-06, "loss": 0.0006, "reward": 0.8067129850387573, "reward_std": 0.05853627622127533, "rewards/semantic_prob_reward": 0.8067129850387573, "step": 50 }, { "completion_length": 319.4097137451172, "epoch": 1.2439024390243902, "grad_norm": 0.27289971709251404, "kl": 0.03289794921875, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.809027761220932, "reward_std": 0.03438824974000454, "rewards/semantic_prob_reward": 0.809027761220932, "step": 51 }, { "completion_length": 295.2152862548828, "epoch": 1.2682926829268293, "grad_norm": 0.29519933462142944, "kl": 0.0426025390625, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.9074074327945709, "reward_std": 0.0566580705344677, "rewards/semantic_prob_reward": 0.9074074327945709, "step": 52 }, { "completion_length": 341.61805725097656, "epoch": 1.2926829268292683, "grad_norm": 0.20022991299629211, "kl": 0.040283203125, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.8854166567325592, "reward_std": 0.025046803057193756, "rewards/semantic_prob_reward": 0.8854166567325592, "step": 53 }, { "completion_length": 351.65972900390625, "epoch": 1.3170731707317074, "grad_norm": 0.11979760229587555, "kl": 0.037841796875, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.9351851642131805, "reward_std": 0.021624969318509102, "rewards/semantic_prob_reward": 0.9351851642131805, "step": 54 }, { "completion_length": 312.8819580078125, "epoch": 1.3414634146341464, "grad_norm": 0.19860704243183136, "kl": 0.0477294921875, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.9872685670852661, "reward_std": 0.020046884194016457, "rewards/semantic_prob_reward": 0.9872685670852661, "step": 55 }, { "completion_length": 332.55555725097656, "epoch": 1.3658536585365852, "grad_norm": 0.13305607438087463, "kl": 0.044921875, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.9768518507480621, "reward_std": 0.021624969318509102, "rewards/semantic_prob_reward": 0.9768518507480621, "step": 56 }, { "completion_length": 309.8333282470703, "epoch": 1.3902439024390243, "grad_norm": 0.2834044396877289, "kl": 0.05224609375, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.9050925672054291, "reward_std": 0.03189620561897755, "rewards/semantic_prob_reward": 0.9050925672054291, "step": 57 }, { "completion_length": 306.2916717529297, "epoch": 1.4146341463414633, "grad_norm": 0.18958237767219543, "kl": 0.0540771484375, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.8946759104728699, "reward_std": 0.028776755090802908, "rewards/semantic_prob_reward": 0.8946759104728699, "step": 58 }, { "completion_length": 336.1527862548828, "epoch": 1.4390243902439024, "grad_norm": 0.38953498005867004, "kl": 0.0679931640625, "learning_rate": 2e-06, "loss": 0.0014, "reward": 1.0, "reward_std": 0.0, "rewards/semantic_prob_reward": 1.0, "step": 59 }, { "completion_length": 304.05555725097656, "epoch": 1.4634146341463414, "grad_norm": 0.1806153953075409, "kl": 0.05517578125, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.9548611044883728, "reward_std": 0.014045829884707928, "rewards/semantic_prob_reward": 0.9548611044883728, "step": 60 }, { "completion_length": 289.61805725097656, "epoch": 1.4878048780487805, "grad_norm": 0.16269907355308533, "kl": 0.0577392578125, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.9594907164573669, "reward_std": 0.007151785772293806, "rewards/semantic_prob_reward": 0.9594907164573669, "step": 61 }, { "completion_length": 278.4166564941406, "epoch": 1.5121951219512195, "grad_norm": 0.18248751759529114, "kl": 0.0499267578125, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.9398148059844971, "reward_std": 0.011210623197257519, "rewards/semantic_prob_reward": 0.9398148059844971, "step": 62 }, { "completion_length": 313.9236145019531, "epoch": 1.5365853658536586, "grad_norm": 0.20052289962768555, "kl": 0.0479736328125, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.8842592537403107, "reward_std": 0.028703952208161354, "rewards/semantic_prob_reward": 0.8842592537403107, "step": 63 }, { "completion_length": 292.07640075683594, "epoch": 1.5609756097560976, "grad_norm": 0.41178181767463684, "kl": 0.0526123046875, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.8414351940155029, "reward_std": 0.08965621888637543, "rewards/semantic_prob_reward": 0.8414351940155029, "step": 64 }, { "completion_length": 304.74305725097656, "epoch": 1.5853658536585367, "grad_norm": 0.17011003196239471, "kl": 0.05224609375, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.9432870447635651, "reward_std": 0.009746242314577103, "rewards/semantic_prob_reward": 0.9432870447635651, "step": 65 }, { "completion_length": 289.375, "epoch": 1.6097560975609757, "grad_norm": 0.27571210265159607, "kl": 0.055419921875, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.8692129552364349, "reward_std": 0.04567318223416805, "rewards/semantic_prob_reward": 0.8692129552364349, "step": 66 }, { "completion_length": 325.9652862548828, "epoch": 1.6341463414634148, "grad_norm": 0.3293726146221161, "kl": 0.0435791015625, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.8576388955116272, "reward_std": 0.06112281419336796, "rewards/semantic_prob_reward": 0.8576388955116272, "step": 67 }, { "completion_length": 285.4375, "epoch": 1.6585365853658538, "grad_norm": 0.21041785180568695, "kl": 0.0584716796875, "learning_rate": 2e-06, "loss": 0.0012, "reward": 0.9456018805503845, "reward_std": 0.013836178928613663, "rewards/semantic_prob_reward": 0.9456018805503845, "step": 68 }, { "completion_length": 311.50694274902344, "epoch": 1.6829268292682928, "grad_norm": 0.30075928568840027, "kl": 0.045166015625, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.887731522321701, "reward_std": 0.0688705276697874, "rewards/semantic_prob_reward": 0.887731522321701, "step": 69 }, { "completion_length": 289.4166717529297, "epoch": 1.7073170731707317, "grad_norm": 0.35588082671165466, "kl": 0.0548095703125, "learning_rate": 2e-06, "loss": 0.0011, "reward": 0.9074074327945709, "reward_std": 0.058881448581814766, "rewards/semantic_prob_reward": 0.9074074327945709, "step": 70 }, { "completion_length": 334.5208282470703, "epoch": 1.7317073170731707, "grad_norm": 0.270826518535614, "kl": 0.037109375, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.7916666567325592, "reward_std": 0.06214603967964649, "rewards/semantic_prob_reward": 0.7916666567325592, "step": 71 }, { "completion_length": 309.9652862548828, "epoch": 1.7560975609756098, "grad_norm": 0.26750263571739197, "kl": 0.0440673828125, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.8946759104728699, "reward_std": 0.05284506734460592, "rewards/semantic_prob_reward": 0.8946759104728699, "step": 72 }, { "completion_length": 311.74305725097656, "epoch": 1.7804878048780488, "grad_norm": 0.21291472017765045, "kl": 0.0474853515625, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.8923611044883728, "reward_std": 0.02916168374940753, "rewards/semantic_prob_reward": 0.8923611044883728, "step": 73 }, { "completion_length": 327.38194274902344, "epoch": 1.8048780487804879, "grad_norm": 0.11419837921857834, "kl": 0.037109375, "learning_rate": 2e-06, "loss": 0.0007, "reward": 0.9594907164573669, "reward_std": 0.007151785772293806, "rewards/semantic_prob_reward": 0.9594907164573669, "step": 74 }, { "completion_length": 308.8541717529297, "epoch": 1.8292682926829267, "grad_norm": 0.30795952677726746, "kl": 0.042724609375, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.8738425672054291, "reward_std": 0.041339725255966187, "rewards/semantic_prob_reward": 0.8738425672054291, "step": 75 }, { "completion_length": 322.28472900390625, "epoch": 1.8536585365853657, "grad_norm": 0.3246361017227173, "kl": 0.0443115234375, "learning_rate": 2e-06, "loss": 0.0009, "reward": 0.8668981790542603, "reward_std": 0.06655931193381548, "rewards/semantic_prob_reward": 0.8668981790542603, "step": 76 }, { "completion_length": 339.7916717529297, "epoch": 1.8780487804878048, "grad_norm": 0.2843785285949707, "kl": 0.039794921875, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.9004630148410797, "reward_std": 0.08244897052645683, "rewards/semantic_prob_reward": 0.9004630148410797, "step": 77 }, { "completion_length": 296.9027862548828, "epoch": 1.9024390243902438, "grad_norm": 0.21712768077850342, "kl": 0.049072265625, "learning_rate": 2e-06, "loss": 0.001, "reward": 0.9398148357868195, "reward_std": 0.02979312650859356, "rewards/semantic_prob_reward": 0.9398148357868195, "step": 78 }, { "completion_length": 327.4444580078125, "epoch": 1.9268292682926829, "grad_norm": 0.24541597068309784, "kl": 0.041748046875, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.881944477558136, "reward_std": 0.03883950226008892, "rewards/semantic_prob_reward": 0.881944477558136, "step": 79 }, { "completion_length": 336.25, "epoch": 1.951219512195122, "grad_norm": 0.15884429216384888, "kl": 0.040283203125, "learning_rate": 2e-06, "loss": 0.0008, "reward": 0.9594907164573669, "reward_std": 0.007151785772293806, "rewards/semantic_prob_reward": 0.9594907164573669, "step": 80 }, { "completion_length": 335.68055725097656, "epoch": 1.975609756097561, "grad_norm": 0.01797318644821644, "kl": 0.042236328125, "learning_rate": 2e-06, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/semantic_prob_reward": 1.0, "step": 81 }, { "completion_length": 307.25, "epoch": 2.0, "grad_norm": 0.21741825342178345, "kl": 0.038330078125, "learning_rate": 2e-06, "loss": 0.0008, "reward": 1.0, "reward_std": 0.0, "rewards/semantic_prob_reward": 1.0, "step": 82 }, { "epoch": 2.0, "step": 82, "total_flos": 0.0, "train_loss": 0.0006075712070133446, "train_runtime": 2993.8628, "train_samples_per_second": 0.327, "train_steps_per_second": 0.027 } ], "logging_steps": 1, "max_steps": 82, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }