{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4, "eval_steps": 500, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 101.90625, "epoch": 0.0006666666666666666, "grad_norm": 3.0352887202837673, "kl": 0.0, "learning_rate": 9.996666666666667e-07, "loss": 0.0, "reward": 1.3406250476837158, "reward_std": 0.2543097734451294, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.49687501788139343, "step": 1 }, { "completion_length": 120.15625, "epoch": 0.0013333333333333333, "grad_norm": 3.952252558157825, "kl": 0.00041961669921875, "learning_rate": 9.993333333333333e-07, "loss": 0.0, "reward": 1.2511160373687744, "reward_std": 0.23065093159675598, "rewards/format_reward": 0.71875, "rewards/iou_reward": 0.5323660373687744, "step": 2 }, { "completion_length": 92.75, "epoch": 0.002, "grad_norm": 5.372625056806963, "kl": 0.0006256103515625, "learning_rate": 9.989999999999999e-07, "loss": 0.0, "reward": 0.5625, "reward_std": 0.32216876745224, "rewards/format_reward": 0.28125, "rewards/iou_reward": 0.28125, "step": 3 }, { "completion_length": 111.15625, "epoch": 0.0026666666666666666, "grad_norm": 5.729635417533713, "kl": 0.000530242919921875, "learning_rate": 9.986666666666667e-07, "loss": 0.0, "reward": 1.5104167461395264, "reward_std": 0.28608438372612, "rewards/format_reward": 0.78125, "rewards/iou_reward": 0.7291666865348816, "step": 4 }, { "completion_length": 96.15625, "epoch": 0.0033333333333333335, "grad_norm": 4.854221974064142, "kl": 0.000606536865234375, "learning_rate": 9.983333333333332e-07, "loss": 0.0, "reward": 1.3020833730697632, "reward_std": 0.48561251163482666, "rewards/format_reward": 0.65625, "rewards/iou_reward": 0.6458333730697632, "step": 5 }, { "completion_length": 131.28125, "epoch": 0.004, "grad_norm": 4.806680681789822, "kl": 0.0005340576171875, "learning_rate": 9.98e-07, "loss": 0.0, "reward": 1.4010417461395264, "reward_std": 0.4656534790992737, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.5260416865348816, "step": 6 }, { "completion_length": 123.09375, "epoch": 0.004666666666666667, "grad_norm": 4.771558222131954, "kl": 0.000789642333984375, "learning_rate": 9.976666666666666e-07, "loss": 0.0, "reward": 1.5656249523162842, "reward_std": 0.23628026247024536, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.690625011920929, "step": 7 }, { "completion_length": 133.40625, "epoch": 0.005333333333333333, "grad_norm": 5.735029838883173, "kl": 0.001129150390625, "learning_rate": 9.973333333333332e-07, "loss": 0.0, "reward": 1.2109375, "reward_std": 0.47322168946266174, "rewards/format_reward": 0.71875, "rewards/iou_reward": 0.4921875, "step": 8 }, { "completion_length": 94.09375, "epoch": 0.006, "grad_norm": 2.2270102117496076, "kl": 0.001373291015625, "learning_rate": 9.97e-07, "loss": 0.0001, "reward": 1.28125, "reward_std": 0.31684717535972595, "rewards/format_reward": 0.6875, "rewards/iou_reward": 0.59375, "step": 9 }, { "completion_length": 109.25, "epoch": 0.006666666666666667, "grad_norm": 5.689586459113548, "kl": 0.0014190673828125, "learning_rate": 9.966666666666667e-07, "loss": 0.0001, "reward": 1.5598958730697632, "reward_std": 0.33001992106437683, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6223958134651184, "step": 10 }, { "completion_length": 116.96875, "epoch": 0.007333333333333333, "grad_norm": 5.480746594877343, "kl": 0.0033111572265625, "learning_rate": 9.963333333333333e-07, "loss": 0.0001, "reward": 1.2681176662445068, "reward_std": 0.3618003726005554, "rewards/format_reward": 0.78125, "rewards/iou_reward": 0.4868675768375397, "step": 11 }, { "completion_length": 104.8125, "epoch": 0.008, "grad_norm": 22.087122922736054, "kl": 0.002777099609375, "learning_rate": 9.959999999999999e-07, "loss": 0.0001, "reward": 1.251562476158142, "reward_std": 0.47693055868148804, "rewards/format_reward": 0.625, "rewards/iou_reward": 0.6265624761581421, "step": 12 }, { "completion_length": 107.375, "epoch": 0.008666666666666666, "grad_norm": 3.9991879671453074, "kl": 0.007568359375, "learning_rate": 9.956666666666666e-07, "loss": 0.0003, "reward": 1.3666667938232422, "reward_std": 0.3738614320755005, "rewards/format_reward": 0.8125, "rewards/iou_reward": 0.5541666746139526, "step": 13 }, { "completion_length": 129.375, "epoch": 0.009333333333333334, "grad_norm": 9.076134851249542, "kl": 0.0096435546875, "learning_rate": 9.953333333333332e-07, "loss": 0.0004, "reward": 1.1927083730697632, "reward_std": 0.41140446066856384, "rewards/format_reward": 0.71875, "rewards/iou_reward": 0.4739583432674408, "step": 14 }, { "completion_length": 107.09375, "epoch": 0.01, "grad_norm": 2.6729315067274793, "kl": 0.006134033203125, "learning_rate": 9.95e-07, "loss": 0.0002, "reward": 1.2321181297302246, "reward_std": 0.47299420833587646, "rewards/format_reward": 0.71875, "rewards/iou_reward": 0.5133680701255798, "step": 15 }, { "completion_length": 115.625, "epoch": 0.010666666666666666, "grad_norm": 3.708406991941987, "kl": 0.0123291015625, "learning_rate": 9.946666666666666e-07, "loss": 0.0005, "reward": 1.2611855268478394, "reward_std": 0.5005860328674316, "rewards/format_reward": 0.625, "rewards/iou_reward": 0.6361855268478394, "step": 16 }, { "completion_length": 124.71875, "epoch": 0.011333333333333334, "grad_norm": 11.751321800900396, "kl": 0.00653076171875, "learning_rate": 9.943333333333331e-07, "loss": 0.0003, "reward": 1.4117324352264404, "reward_std": 0.5311921238899231, "rewards/format_reward": 0.65625, "rewards/iou_reward": 0.7554824948310852, "step": 17 }, { "completion_length": 105.4375, "epoch": 0.012, "grad_norm": 5.674303905095575, "kl": 0.0107421875, "learning_rate": 9.94e-07, "loss": 0.0004, "reward": 1.3666666746139526, "reward_std": 0.3777332603931427, "rewards/format_reward": 0.78125, "rewards/iou_reward": 0.5854166746139526, "step": 18 }, { "completion_length": 119.3125, "epoch": 0.012666666666666666, "grad_norm": 5.496089292094567, "kl": 0.00970458984375, "learning_rate": 9.936666666666667e-07, "loss": 0.0004, "reward": 1.4960813522338867, "reward_std": 0.35393866896629333, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.6523313522338867, "step": 19 }, { "completion_length": 122.84375, "epoch": 0.013333333333333334, "grad_norm": 6.977661812929151, "kl": 0.061279296875, "learning_rate": 9.933333333333333e-07, "loss": 0.0025, "reward": 1.4038631916046143, "reward_std": 0.5247980356216431, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.5601131916046143, "step": 20 }, { "completion_length": 132.0625, "epoch": 0.014, "grad_norm": 3.618879977153046, "kl": 0.0167236328125, "learning_rate": 9.929999999999999e-07, "loss": 0.0007, "reward": 1.3584449291229248, "reward_std": 0.3886280655860901, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.4521949887275696, "step": 21 }, { "completion_length": 126.6875, "epoch": 0.014666666666666666, "grad_norm": 6.442647705437091, "kl": 0.0057373046875, "learning_rate": 9.926666666666666e-07, "loss": 0.0002, "reward": 1.5473958253860474, "reward_std": 0.41308844089508057, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6098958253860474, "step": 22 }, { "completion_length": 121.875, "epoch": 0.015333333333333332, "grad_norm": 2.8959231383645228, "kl": 0.006134033203125, "learning_rate": 9.923333333333332e-07, "loss": 0.0002, "reward": 1.6670758724212646, "reward_std": 0.15621747076511383, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6670758724212646, "step": 23 }, { "completion_length": 111.0, "epoch": 0.016, "grad_norm": 8.945260032621553, "kl": 0.0211181640625, "learning_rate": 9.92e-07, "loss": 0.0008, "reward": 1.671875, "reward_std": 0.4404904842376709, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.765625, "step": 24 }, { "completion_length": 127.53125, "epoch": 0.016666666666666666, "grad_norm": 6.640190767681227, "kl": 0.01318359375, "learning_rate": 9.916666666666666e-07, "loss": 0.0005, "reward": 1.3937499523162842, "reward_std": 0.431326687335968, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.48750001192092896, "step": 25 }, { "completion_length": 97.34375, "epoch": 0.017333333333333333, "grad_norm": 5.896490947883833, "kl": 0.01214599609375, "learning_rate": 9.913333333333333e-07, "loss": 0.0005, "reward": 1.3946428298950195, "reward_std": 0.23205098509788513, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.45714282989501953, "step": 26 }, { "completion_length": 126.53125, "epoch": 0.018, "grad_norm": 6.838078307211532, "kl": 0.0023956298828125, "learning_rate": 9.91e-07, "loss": 0.0001, "reward": 1.699479103088379, "reward_std": 0.2258627861738205, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6994791626930237, "step": 27 }, { "completion_length": 104.625, "epoch": 0.018666666666666668, "grad_norm": 13.623202821462954, "kl": 0.0274658203125, "learning_rate": 9.906666666666667e-07, "loss": 0.0011, "reward": 1.5750000476837158, "reward_std": 0.36375024914741516, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.606249988079071, "step": 28 }, { "completion_length": 109.28125, "epoch": 0.019333333333333334, "grad_norm": 3.040344268362676, "kl": 0.00994873046875, "learning_rate": 9.903333333333333e-07, "loss": 0.0004, "reward": 1.5885417461395264, "reward_std": 0.272108256816864, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6197917461395264, "step": 29 }, { "completion_length": 114.21875, "epoch": 0.02, "grad_norm": 4.369935267967927, "kl": 0.0126953125, "learning_rate": 9.9e-07, "loss": 0.0005, "reward": 1.6119792461395264, "reward_std": 0.17724287509918213, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6432291865348816, "step": 30 }, { "completion_length": 110.96875, "epoch": 0.020666666666666667, "grad_norm": 3.1359565062145434, "kl": 0.0128173828125, "learning_rate": 9.896666666666666e-07, "loss": 0.0005, "reward": 1.5088541507720947, "reward_std": 0.21562500298023224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5401041507720947, "step": 31 }, { "completion_length": 111.9375, "epoch": 0.021333333333333333, "grad_norm": 5.0968509172670045, "kl": 0.00909423828125, "learning_rate": 9.893333333333332e-07, "loss": 0.0004, "reward": 1.8536458015441895, "reward_std": 0.15635381639003754, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.853645920753479, "step": 32 }, { "completion_length": 117.8125, "epoch": 0.022, "grad_norm": 3.6041767166580096, "kl": 0.0174560546875, "learning_rate": 9.89e-07, "loss": 0.0007, "reward": 1.6114583015441895, "reward_std": 0.3868013620376587, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7052083611488342, "step": 33 }, { "completion_length": 130.3125, "epoch": 0.02266666666666667, "grad_norm": 15.015831206771892, "kl": 0.0157470703125, "learning_rate": 9.886666666666665e-07, "loss": 0.0006, "reward": 1.6171875, "reward_std": 0.39245539903640747, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6796875, "step": 34 }, { "completion_length": 135.15625, "epoch": 0.023333333333333334, "grad_norm": 13.032763299682761, "kl": 0.01495361328125, "learning_rate": 9.883333333333333e-07, "loss": 0.0006, "reward": 1.4656250476837158, "reward_std": 0.38596677780151367, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.590624988079071, "step": 35 }, { "completion_length": 102.40625, "epoch": 0.024, "grad_norm": 5.632361056698005, "kl": 0.01092529296875, "learning_rate": 9.88e-07, "loss": 0.0004, "reward": 1.5651042461395264, "reward_std": 0.27788737416267395, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5963541865348816, "step": 36 }, { "completion_length": 127.09375, "epoch": 0.024666666666666667, "grad_norm": 3.7115113463765694, "kl": 0.0164794921875, "learning_rate": 9.876666666666667e-07, "loss": 0.0007, "reward": 1.4057291746139526, "reward_std": 0.424655020236969, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.5307291746139526, "step": 37 }, { "completion_length": 113.65625, "epoch": 0.025333333333333333, "grad_norm": 4.093051002690221, "kl": 0.078125, "learning_rate": 9.873333333333333e-07, "loss": 0.0031, "reward": 1.4947916269302368, "reward_std": 0.6059920787811279, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.6197916865348816, "step": 38 }, { "completion_length": 111.3125, "epoch": 0.026, "grad_norm": 3.498845851761715, "kl": 0.017333984375, "learning_rate": 9.87e-07, "loss": 0.0007, "reward": 1.84375, "reward_std": 0.23325318098068237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 39 }, { "completion_length": 110.1875, "epoch": 0.02666666666666667, "grad_norm": 3.218093816995547, "kl": 0.01422119140625, "learning_rate": 9.866666666666666e-07, "loss": 0.0006, "reward": 1.5515625476837158, "reward_std": 0.1820041835308075, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.551562488079071, "step": 40 }, { "completion_length": 101.5, "epoch": 0.027333333333333334, "grad_norm": 6.5635208326845715, "kl": 0.00897216796875, "learning_rate": 9.863333333333332e-07, "loss": 0.0004, "reward": 1.492708444595337, "reward_std": 0.2810943126678467, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.49270832538604736, "step": 41 }, { "completion_length": 103.59375, "epoch": 0.028, "grad_norm": 3.19476342285546, "kl": 0.03369140625, "learning_rate": 9.86e-07, "loss": 0.0014, "reward": 1.59375, "reward_std": 0.2957531809806824, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.625, "step": 42 }, { "completion_length": 110.46875, "epoch": 0.028666666666666667, "grad_norm": 3.302872472898087, "kl": 0.01507568359375, "learning_rate": 9.856666666666667e-07, "loss": 0.0006, "reward": 1.7864583730697632, "reward_std": 0.31378236413002014, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8489583730697632, "step": 43 }, { "completion_length": 106.71875, "epoch": 0.029333333333333333, "grad_norm": 14.398052130814657, "kl": 0.01336669921875, "learning_rate": 9.853333333333333e-07, "loss": 0.0005, "reward": 1.7713541984558105, "reward_std": 0.10214866697788239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7713541388511658, "step": 44 }, { "completion_length": 122.34375, "epoch": 0.03, "grad_norm": 2.193826705399377, "kl": 0.0035247802734375, "learning_rate": 9.849999999999999e-07, "loss": 0.0001, "reward": 1.5732886791229248, "reward_std": 0.19328995048999786, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6045386791229248, "step": 45 }, { "completion_length": 108.875, "epoch": 0.030666666666666665, "grad_norm": 2.649975362348558, "kl": 0.0269775390625, "learning_rate": 9.846666666666667e-07, "loss": 0.0011, "reward": 1.5520833730697632, "reward_std": 0.1555021107196808, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6145833134651184, "step": 46 }, { "completion_length": 109.03125, "epoch": 0.03133333333333333, "grad_norm": 5.865770660072811, "kl": 0.0050048828125, "learning_rate": 9.843333333333332e-07, "loss": 0.0002, "reward": 1.7208333015441895, "reward_std": 0.09747881442308426, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7208333015441895, "step": 47 }, { "completion_length": 113.4375, "epoch": 0.032, "grad_norm": 3.99465563967742, "kl": 0.0233154296875, "learning_rate": 9.84e-07, "loss": 0.0009, "reward": 1.7220982313156128, "reward_std": 0.24174949526786804, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7220982313156128, "step": 48 }, { "completion_length": 118.15625, "epoch": 0.03266666666666666, "grad_norm": 5.732326596392835, "kl": 0.0162353515625, "learning_rate": 9.836666666666666e-07, "loss": 0.0007, "reward": 1.618749976158142, "reward_std": 0.27827000617980957, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6187499761581421, "step": 49 }, { "completion_length": 106.53125, "epoch": 0.03333333333333333, "grad_norm": 2.372331615633515, "kl": 0.01611328125, "learning_rate": 9.833333333333332e-07, "loss": 0.0006, "reward": 1.796875, "reward_std": 0.1979166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.796875, "step": 50 }, { "completion_length": 121.625, "epoch": 0.034, "grad_norm": 2.5321213307008925, "kl": 0.016845703125, "learning_rate": 9.83e-07, "loss": 0.0007, "reward": 1.8234374523162842, "reward_std": 0.1622893214225769, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.823437511920929, "step": 51 }, { "completion_length": 111.5625, "epoch": 0.034666666666666665, "grad_norm": 6.710579665160341, "kl": 0.0201416015625, "learning_rate": 9.826666666666667e-07, "loss": 0.0008, "reward": 1.7135417461395264, "reward_std": 0.26096415519714355, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7135416865348816, "step": 52 }, { "completion_length": 102.21875, "epoch": 0.035333333333333335, "grad_norm": 29.01839941536559, "kl": 0.019775390625, "learning_rate": 9.823333333333333e-07, "loss": 0.0008, "reward": 1.9406249523162842, "reward_std": 0.006249999161809683, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.940625011920929, "step": 53 }, { "completion_length": 124.25, "epoch": 0.036, "grad_norm": 3.2804345383226674, "kl": 0.0159912109375, "learning_rate": 9.819999999999999e-07, "loss": 0.0006, "reward": 1.734375, "reward_std": 0.3189588189125061, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.765625, "step": 54 }, { "completion_length": 116.8125, "epoch": 0.03666666666666667, "grad_norm": 10.925247029993871, "kl": 0.0167236328125, "learning_rate": 9.816666666666667e-07, "loss": 0.0007, "reward": 1.4932291507720947, "reward_std": 0.25750160217285156, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5244791507720947, "step": 55 }, { "completion_length": 109.1875, "epoch": 0.037333333333333336, "grad_norm": 3.4178439412553923, "kl": 0.01446533203125, "learning_rate": 9.813333333333332e-07, "loss": 0.0006, "reward": 1.801041603088379, "reward_std": 0.047282956540584564, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8010416626930237, "step": 56 }, { "completion_length": 107.625, "epoch": 0.038, "grad_norm": 2.695118172281793, "kl": 0.017578125, "learning_rate": 9.81e-07, "loss": 0.0007, "reward": 1.578125, "reward_std": 0.35341876745224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.578125, "step": 57 }, { "completion_length": 106.03125, "epoch": 0.03866666666666667, "grad_norm": 2.932399590346732, "kl": 0.012451171875, "learning_rate": 9.806666666666666e-07, "loss": 0.0005, "reward": 1.556249976158142, "reward_std": 0.14271336793899536, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5562500357627869, "step": 58 }, { "completion_length": 105.5, "epoch": 0.03933333333333333, "grad_norm": 4.623475140326131, "kl": 0.0194091796875, "learning_rate": 9.803333333333332e-07, "loss": 0.0008, "reward": 1.8255208730697632, "reward_std": 0.15471456944942474, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8255208134651184, "step": 59 }, { "completion_length": 105.5625, "epoch": 0.04, "grad_norm": 5.826848278847118, "kl": 0.0169677734375, "learning_rate": 9.8e-07, "loss": 0.0007, "reward": 1.5553572177886963, "reward_std": 0.16175492107868195, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5866071581840515, "step": 60 }, { "completion_length": 109.03125, "epoch": 0.04066666666666666, "grad_norm": 3.7619425679491654, "kl": 0.01336669921875, "learning_rate": 9.796666666666667e-07, "loss": 0.0005, "reward": 1.808333396911621, "reward_std": 0.15275105834007263, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8083333373069763, "step": 61 }, { "completion_length": 107.125, "epoch": 0.04133333333333333, "grad_norm": 5.263362931647119, "kl": 0.0164794921875, "learning_rate": 9.793333333333333e-07, "loss": 0.0007, "reward": 1.6687500476837158, "reward_std": 0.21241983771324158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.668749988079071, "step": 62 }, { "completion_length": 113.625, "epoch": 0.042, "grad_norm": 5.635427051260688, "kl": 0.016357421875, "learning_rate": 9.789999999999999e-07, "loss": 0.0007, "reward": 1.6791666746139526, "reward_std": 0.10842075198888779, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6791666746139526, "step": 63 }, { "completion_length": 141.84375, "epoch": 0.042666666666666665, "grad_norm": 4.192718329133132, "kl": 0.00933837890625, "learning_rate": 9.786666666666666e-07, "loss": 0.0004, "reward": 1.5416666269302368, "reward_std": 0.3339538872241974, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.6354166865348816, "step": 64 }, { "completion_length": 107.03125, "epoch": 0.043333333333333335, "grad_norm": 3.0223383137633992, "kl": 0.0228271484375, "learning_rate": 9.783333333333334e-07, "loss": 0.0009, "reward": 1.673532247543335, "reward_std": 0.15354402363300323, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7047821879386902, "step": 65 }, { "completion_length": 97.90625, "epoch": 0.044, "grad_norm": 4.1896217999628655, "kl": 0.011962890625, "learning_rate": 9.78e-07, "loss": 0.0005, "reward": 1.7883522510528564, "reward_std": 0.1348692774772644, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7883523106575012, "step": 66 }, { "completion_length": 88.03125, "epoch": 0.04466666666666667, "grad_norm": 11.445819642868669, "kl": 0.0087890625, "learning_rate": 9.776666666666666e-07, "loss": 0.0004, "reward": 1.881250023841858, "reward_std": 0.06203501671552658, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8812500238418579, "step": 67 }, { "completion_length": 99.75, "epoch": 0.04533333333333334, "grad_norm": 3.8075043799894868, "kl": 0.03173828125, "learning_rate": 9.773333333333333e-07, "loss": 0.0013, "reward": 1.7671130895614624, "reward_std": 0.03491881862282753, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7671130895614624, "step": 68 }, { "completion_length": 97.09375, "epoch": 0.046, "grad_norm": 1.9549733285152766, "kl": 0.032470703125, "learning_rate": 9.77e-07, "loss": 0.0013, "reward": 1.9166667461395264, "reward_std": 0.09622503817081451, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9166666269302368, "step": 69 }, { "completion_length": 107.0, "epoch": 0.04666666666666667, "grad_norm": 8.948567292478604, "kl": 0.01007080078125, "learning_rate": 9.766666666666667e-07, "loss": 0.0004, "reward": 1.6713541746139526, "reward_std": 0.22485247254371643, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6713541746139526, "step": 70 }, { "completion_length": 127.4375, "epoch": 0.04733333333333333, "grad_norm": 9.76251285738584, "kl": 0.02099609375, "learning_rate": 9.763333333333333e-07, "loss": 0.0008, "reward": 1.6416666507720947, "reward_std": 0.23632033169269562, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6416666507720947, "step": 71 }, { "completion_length": 104.4375, "epoch": 0.048, "grad_norm": 2.9295509817018246, "kl": 0.0230712890625, "learning_rate": 9.759999999999998e-07, "loss": 0.0009, "reward": 1.671875, "reward_std": 0.09678799659013748, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.671875, "step": 72 }, { "completion_length": 115.96875, "epoch": 0.048666666666666664, "grad_norm": 2.8826789757104487, "kl": 0.0289306640625, "learning_rate": 9.756666666666666e-07, "loss": 0.0012, "reward": 1.4291666746139526, "reward_std": 0.12946277856826782, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.46041667461395264, "step": 73 }, { "completion_length": 106.0, "epoch": 0.04933333333333333, "grad_norm": 1.7878676703488574, "kl": 0.02734375, "learning_rate": 9.753333333333334e-07, "loss": 0.0011, "reward": 1.9406249523162842, "reward_std": 0.011967840604484081, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.940625011920929, "step": 74 }, { "completion_length": 109.78125, "epoch": 0.05, "grad_norm": 2.026816232625581, "kl": 0.0264892578125, "learning_rate": 9.75e-07, "loss": 0.0011, "reward": 1.7104166746139526, "reward_std": 0.21626877784729004, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7416666150093079, "step": 75 }, { "completion_length": 116.9375, "epoch": 0.050666666666666665, "grad_norm": 3.557653750996693, "kl": 0.0281982421875, "learning_rate": 9.746666666666666e-07, "loss": 0.0011, "reward": 1.4453125, "reward_std": 0.21339544653892517, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.4765625, "step": 76 }, { "completion_length": 102.375, "epoch": 0.051333333333333335, "grad_norm": 5.405656916312641, "kl": 0.0250244140625, "learning_rate": 9.743333333333333e-07, "loss": 0.001, "reward": 1.683333396911621, "reward_std": 0.2595481872558594, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6833333373069763, "step": 77 }, { "completion_length": 99.15625, "epoch": 0.052, "grad_norm": 2.892751685694306, "kl": 0.03857421875, "learning_rate": 9.74e-07, "loss": 0.0015, "reward": 1.6088541746139526, "reward_std": 0.057359836995601654, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6088541150093079, "step": 78 }, { "completion_length": 123.625, "epoch": 0.05266666666666667, "grad_norm": 3.2191297886140307, "kl": 0.0198974609375, "learning_rate": 9.736666666666667e-07, "loss": 0.0008, "reward": 1.6572965383529663, "reward_std": 0.3763349652290344, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7197965383529663, "step": 79 }, { "completion_length": 102.875, "epoch": 0.05333333333333334, "grad_norm": 2.8647206070202627, "kl": 0.01336669921875, "learning_rate": 9.733333333333333e-07, "loss": 0.0005, "reward": 1.6458333730697632, "reward_std": 0.12932969629764557, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6458333730697632, "step": 80 }, { "completion_length": 111.59375, "epoch": 0.054, "grad_norm": 2.377974626907879, "kl": 0.033203125, "learning_rate": 9.729999999999998e-07, "loss": 0.0013, "reward": 1.8705357313156128, "reward_std": 0.14209690690040588, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.870535671710968, "step": 81 }, { "completion_length": 106.5625, "epoch": 0.05466666666666667, "grad_norm": 7.823464380163642, "kl": 0.0234375, "learning_rate": 9.726666666666666e-07, "loss": 0.0009, "reward": 1.7234375476837158, "reward_std": 0.3013303279876709, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.723437488079071, "step": 82 }, { "completion_length": 121.53125, "epoch": 0.05533333333333333, "grad_norm": 1.9978705941652486, "kl": 0.032470703125, "learning_rate": 9.723333333333334e-07, "loss": 0.0013, "reward": 1.78125, "reward_std": 0.16108438372612, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 83 }, { "completion_length": 109.0625, "epoch": 0.056, "grad_norm": 1.8599432463385859, "kl": 0.029296875, "learning_rate": 9.72e-07, "loss": 0.0012, "reward": 1.8515625, "reward_std": 0.140625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8515625, "step": 84 }, { "completion_length": 126.40625, "epoch": 0.056666666666666664, "grad_norm": 3.7398144502119983, "kl": 0.0162353515625, "learning_rate": 9.716666666666665e-07, "loss": 0.0007, "reward": 1.5666667222976685, "reward_std": 0.2706969678401947, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5979166626930237, "step": 85 }, { "completion_length": 100.28125, "epoch": 0.05733333333333333, "grad_norm": 0.9138927058573423, "kl": 0.035400390625, "learning_rate": 9.713333333333333e-07, "loss": 0.0014, "reward": 1.8020833730697632, "reward_std": 0.0625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8333333730697632, "step": 86 }, { "completion_length": 106.34375, "epoch": 0.058, "grad_norm": 4.1417488236611755, "kl": 0.0264892578125, "learning_rate": 9.709999999999999e-07, "loss": 0.0011, "reward": 1.6988096237182617, "reward_std": 0.1488431692123413, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6988095045089722, "step": 87 }, { "completion_length": 108.375, "epoch": 0.058666666666666666, "grad_norm": 4.244365536213055, "kl": 0.020751953125, "learning_rate": 9.706666666666667e-07, "loss": 0.0008, "reward": 1.7838542461395264, "reward_std": 0.16694773733615875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7838541269302368, "step": 88 }, { "completion_length": 115.1875, "epoch": 0.059333333333333335, "grad_norm": 4.102893936093592, "kl": 0.029296875, "learning_rate": 9.703333333333332e-07, "loss": 0.0012, "reward": 1.5223958492279053, "reward_std": 0.1686100959777832, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5536458492279053, "step": 89 }, { "completion_length": 108.03125, "epoch": 0.06, "grad_norm": 3.6035186761657245, "kl": 0.017578125, "learning_rate": 9.7e-07, "loss": 0.0007, "reward": 1.7942708730697632, "reward_std": 0.11727401614189148, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7942708730697632, "step": 90 }, { "completion_length": 90.125, "epoch": 0.06066666666666667, "grad_norm": 11.772013258885057, "kl": 0.025634765625, "learning_rate": 9.696666666666666e-07, "loss": 0.001, "reward": 1.7552083730697632, "reward_std": 0.20758545398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7552083730697632, "step": 91 }, { "completion_length": 105.0, "epoch": 0.06133333333333333, "grad_norm": 4.856058253600517, "kl": 0.03857421875, "learning_rate": 9.693333333333334e-07, "loss": 0.0015, "reward": 1.6421875953674316, "reward_std": 0.29073864221572876, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6734374761581421, "step": 92 }, { "completion_length": 102.8125, "epoch": 0.062, "grad_norm": 2.4531985892397934, "kl": 0.0439453125, "learning_rate": 9.69e-07, "loss": 0.0018, "reward": 1.8276041746139526, "reward_std": 0.1543872058391571, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8276041746139526, "step": 93 }, { "completion_length": 98.625, "epoch": 0.06266666666666666, "grad_norm": 5.021751682169003, "kl": 0.046630859375, "learning_rate": 9.686666666666667e-07, "loss": 0.0019, "reward": 1.8046875, "reward_std": 0.1549195945262909, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8046875, "step": 94 }, { "completion_length": 112.8125, "epoch": 0.06333333333333334, "grad_norm": 2.591983202961889, "kl": 0.0238037109375, "learning_rate": 9.683333333333333e-07, "loss": 0.001, "reward": 1.6875, "reward_std": 0.22767089307308197, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6874999403953552, "step": 95 }, { "completion_length": 103.34375, "epoch": 0.064, "grad_norm": 6.848469043039981, "kl": 0.039306640625, "learning_rate": 9.679999999999999e-07, "loss": 0.0016, "reward": 1.670907735824585, "reward_std": 0.12492072582244873, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.670907735824585, "step": 96 }, { "completion_length": 96.1875, "epoch": 0.06466666666666666, "grad_norm": 3.6566717555300063, "kl": 0.03662109375, "learning_rate": 9.676666666666667e-07, "loss": 0.0015, "reward": 1.7588541507720947, "reward_std": 0.2289992868900299, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7588541507720947, "step": 97 }, { "completion_length": 89.875, "epoch": 0.06533333333333333, "grad_norm": 2.8843096223763065, "kl": 0.0380859375, "learning_rate": 9.673333333333332e-07, "loss": 0.0015, "reward": 1.6941964626312256, "reward_std": 0.10833029448986053, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6941964626312256, "step": 98 }, { "completion_length": 115.59375, "epoch": 0.066, "grad_norm": 2.4575446120559232, "kl": 0.036865234375, "learning_rate": 9.67e-07, "loss": 0.0015, "reward": 1.7369792461395264, "reward_std": 0.28305375576019287, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7682291865348816, "step": 99 }, { "completion_length": 111.65625, "epoch": 0.06666666666666667, "grad_norm": 4.658521167466829, "kl": 0.044921875, "learning_rate": 9.666666666666666e-07, "loss": 0.0018, "reward": 1.5463541746139526, "reward_std": 0.35908043384552, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5776041746139526, "step": 100 }, { "completion_length": 101.9375, "epoch": 0.06733333333333333, "grad_norm": 9.342804573261901, "kl": 0.047119140625, "learning_rate": 9.663333333333334e-07, "loss": 0.0019, "reward": 1.6422619819641113, "reward_std": 0.22818556427955627, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6422619223594666, "step": 101 }, { "completion_length": 97.84375, "epoch": 0.068, "grad_norm": 6.655149497000102, "kl": 0.044189453125, "learning_rate": 9.66e-07, "loss": 0.0018, "reward": 1.8541667461395264, "reward_std": 0.13275586068630219, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666865348816, "step": 102 }, { "completion_length": 102.625, "epoch": 0.06866666666666667, "grad_norm": 19.9949550842642, "kl": 0.033447265625, "learning_rate": 9.656666666666667e-07, "loss": 0.0013, "reward": 1.7895833253860474, "reward_std": 0.18346890807151794, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7895833253860474, "step": 103 }, { "completion_length": 100.65625, "epoch": 0.06933333333333333, "grad_norm": 1.9437950375799338, "kl": 0.05029296875, "learning_rate": 9.653333333333333e-07, "loss": 0.002, "reward": 1.8333333730697632, "reward_std": 0.1955089271068573, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8645833134651184, "step": 104 }, { "completion_length": 103.25, "epoch": 0.07, "grad_norm": 2.6039974057742032, "kl": 0.034423828125, "learning_rate": 9.649999999999999e-07, "loss": 0.0014, "reward": 1.6510417461395264, "reward_std": 0.15625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6510416269302368, "step": 105 }, { "completion_length": 129.03125, "epoch": 0.07066666666666667, "grad_norm": 2.9765048247773946, "kl": 0.0277099609375, "learning_rate": 9.646666666666666e-07, "loss": 0.0011, "reward": 1.6875, "reward_std": 0.38655388355255127, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.75, "step": 106 }, { "completion_length": 102.125, "epoch": 0.07133333333333333, "grad_norm": 15.57011221105927, "kl": 0.0224609375, "learning_rate": 9.643333333333334e-07, "loss": 0.0009, "reward": 1.8489583730697632, "reward_std": 0.2074463963508606, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8802083730697632, "step": 107 }, { "completion_length": 107.75, "epoch": 0.072, "grad_norm": 3.3703784098693794, "kl": 0.039794921875, "learning_rate": 9.64e-07, "loss": 0.0016, "reward": 1.6755952835083008, "reward_std": 0.12082208693027496, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6755952835083008, "step": 108 }, { "completion_length": 115.84375, "epoch": 0.07266666666666667, "grad_norm": 44.59290634627873, "kl": 0.03173828125, "learning_rate": 9.636666666666666e-07, "loss": 0.0013, "reward": 1.65234375, "reward_std": 0.2900276184082031, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.65234375, "step": 109 }, { "completion_length": 117.78125, "epoch": 0.07333333333333333, "grad_norm": 2.7694523962679347, "kl": 0.0235595703125, "learning_rate": 9.633333333333334e-07, "loss": 0.0009, "reward": 1.7864583730697632, "reward_std": 0.23044900596141815, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8489583730697632, "step": 110 }, { "completion_length": 95.28125, "epoch": 0.074, "grad_norm": 3.478949773888699, "kl": 0.032470703125, "learning_rate": 9.63e-07, "loss": 0.0013, "reward": 1.6770833730697632, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6770833730697632, "step": 111 }, { "completion_length": 114.625, "epoch": 0.07466666666666667, "grad_norm": 2.8998877467259514, "kl": 0.033203125, "learning_rate": 9.626666666666667e-07, "loss": 0.0013, "reward": 1.5552083253860474, "reward_std": 0.2875972092151642, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5864583253860474, "step": 112 }, { "completion_length": 101.21875, "epoch": 0.07533333333333334, "grad_norm": 3.9532770136401885, "kl": 0.01544189453125, "learning_rate": 9.623333333333333e-07, "loss": 0.0006, "reward": 1.8177083730697632, "reward_std": 0.16428396105766296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083730697632, "step": 113 }, { "completion_length": 98.8125, "epoch": 0.076, "grad_norm": 4.6542542249507965, "kl": 0.024658203125, "learning_rate": 9.619999999999999e-07, "loss": 0.001, "reward": 1.796875, "reward_std": 0.1837744116783142, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7968750596046448, "step": 114 }, { "completion_length": 98.0625, "epoch": 0.07666666666666666, "grad_norm": 2.776056856330233, "kl": 0.0322265625, "learning_rate": 9.616666666666666e-07, "loss": 0.0013, "reward": 1.7057292461395264, "reward_std": 0.21027614176273346, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7057291865348816, "step": 115 }, { "completion_length": 99.6875, "epoch": 0.07733333333333334, "grad_norm": 4.257501084426516, "kl": 0.032958984375, "learning_rate": 9.613333333333334e-07, "loss": 0.0013, "reward": 1.7979166507720947, "reward_std": 0.21421191096305847, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7979166507720947, "step": 116 }, { "completion_length": 109.0, "epoch": 0.078, "grad_norm": 3.32136876667691, "kl": 0.029052734375, "learning_rate": 9.61e-07, "loss": 0.0012, "reward": 1.670562505722046, "reward_std": 0.2284429669380188, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7018124461174011, "step": 117 }, { "completion_length": 98.875, "epoch": 0.07866666666666666, "grad_norm": 14.202204606400672, "kl": 0.035888671875, "learning_rate": 9.606666666666666e-07, "loss": 0.0014, "reward": 1.7937500476837158, "reward_std": 0.09779573976993561, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7937500476837158, "step": 118 }, { "completion_length": 105.5, "epoch": 0.07933333333333334, "grad_norm": 4.371318784835907, "kl": 0.0174560546875, "learning_rate": 9.603333333333333e-07, "loss": 0.0007, "reward": 1.75, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 119 }, { "completion_length": 99.4375, "epoch": 0.08, "grad_norm": 3.924387635591637, "kl": 0.047607421875, "learning_rate": 9.6e-07, "loss": 0.0019, "reward": 1.8062500953674316, "reward_std": 0.11436793208122253, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8062499761581421, "step": 120 }, { "completion_length": 107.125, "epoch": 0.08066666666666666, "grad_norm": 2.308524580463205, "kl": 0.026123046875, "learning_rate": 9.596666666666667e-07, "loss": 0.001, "reward": 1.678497076034546, "reward_std": 0.10066182911396027, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6784970164299011, "step": 121 }, { "completion_length": 101.96875, "epoch": 0.08133333333333333, "grad_norm": 3.7684727537629024, "kl": 0.01708984375, "learning_rate": 9.593333333333333e-07, "loss": 0.0007, "reward": 1.6614583730697632, "reward_std": 0.19561251997947693, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6927083730697632, "step": 122 }, { "completion_length": 114.53125, "epoch": 0.082, "grad_norm": 6.875092048077352, "kl": 0.0537109375, "learning_rate": 9.589999999999998e-07, "loss": 0.0021, "reward": 1.551976203918457, "reward_std": 0.20216986536979675, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5832261443138123, "step": 123 }, { "completion_length": 105.15625, "epoch": 0.08266666666666667, "grad_norm": 1.5181643524541126, "kl": 0.0233154296875, "learning_rate": 9.586666666666666e-07, "loss": 0.0009, "reward": 1.53125, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.53125, "step": 124 }, { "completion_length": 99.3125, "epoch": 0.08333333333333333, "grad_norm": 2.9021964085239795, "kl": 0.032958984375, "learning_rate": 9.583333333333334e-07, "loss": 0.0013, "reward": 1.7567708492279053, "reward_std": 0.08838905394077301, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7567707896232605, "step": 125 }, { "completion_length": 96.09375, "epoch": 0.084, "grad_norm": 1.895819471989893, "kl": 0.034912109375, "learning_rate": 9.58e-07, "loss": 0.0014, "reward": 1.6666667461395264, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6979166865348816, "step": 126 }, { "completion_length": 108.75, "epoch": 0.08466666666666667, "grad_norm": 3.659604643843363, "kl": 0.0311279296875, "learning_rate": 9.576666666666665e-07, "loss": 0.0012, "reward": 1.65625, "reward_std": 0.31684717535972595, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.71875, "step": 127 }, { "completion_length": 105.40625, "epoch": 0.08533333333333333, "grad_norm": 3.4366853402288444, "kl": 0.043212890625, "learning_rate": 9.573333333333333e-07, "loss": 0.0017, "reward": 1.7208333015441895, "reward_std": 0.22436830401420593, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7208333015441895, "step": 128 }, { "completion_length": 100.53125, "epoch": 0.086, "grad_norm": 17.255591893893026, "kl": 0.034423828125, "learning_rate": 9.57e-07, "loss": 0.0014, "reward": 1.78125, "reward_std": 0.23325316607952118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 129 }, { "completion_length": 105.1875, "epoch": 0.08666666666666667, "grad_norm": 3.372096174282969, "kl": 0.039794921875, "learning_rate": 9.566666666666667e-07, "loss": 0.0016, "reward": 1.7529761791229248, "reward_std": 0.2514623701572418, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7529761791229248, "step": 130 }, { "completion_length": 99.25, "epoch": 0.08733333333333333, "grad_norm": 3.7316482702265334, "kl": 0.0260009765625, "learning_rate": 9.563333333333333e-07, "loss": 0.001, "reward": 1.5989583730697632, "reward_std": 0.11544691026210785, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5989583730697632, "step": 131 }, { "completion_length": 110.0, "epoch": 0.088, "grad_norm": 3.5899153877185905, "kl": 0.039794921875, "learning_rate": 9.559999999999998e-07, "loss": 0.0016, "reward": 1.6912946701049805, "reward_std": 0.2744322121143341, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6912946701049805, "step": 132 }, { "completion_length": 103.46875, "epoch": 0.08866666666666667, "grad_norm": 1.649735090364701, "kl": 0.038818359375, "learning_rate": 9.556666666666666e-07, "loss": 0.0015, "reward": 1.946874976158142, "reward_std": 0.043042197823524475, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9468750357627869, "step": 133 }, { "completion_length": 102.65625, "epoch": 0.08933333333333333, "grad_norm": 5.948158957351683, "kl": 0.044189453125, "learning_rate": 9.553333333333334e-07, "loss": 0.0018, "reward": 1.7604167461395264, "reward_std": 0.3192065358161926, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7916667461395264, "step": 134 }, { "completion_length": 95.84375, "epoch": 0.09, "grad_norm": 2.998157383472464, "kl": 0.034423828125, "learning_rate": 9.55e-07, "loss": 0.0014, "reward": 1.8229167461395264, "reward_std": 0.15636569261550903, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229167461395264, "step": 135 }, { "completion_length": 89.6875, "epoch": 0.09066666666666667, "grad_norm": 3.1260213829471533, "kl": 0.03662109375, "learning_rate": 9.546666666666665e-07, "loss": 0.0015, "reward": 1.796875, "reward_std": 0.1899750530719757, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.796875, "step": 136 }, { "completion_length": 101.625, "epoch": 0.09133333333333334, "grad_norm": 2.493921906199596, "kl": 0.051513671875, "learning_rate": 9.543333333333333e-07, "loss": 0.0021, "reward": 1.7218749523162842, "reward_std": 0.19374999403953552, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.753125011920929, "step": 137 }, { "completion_length": 97.4375, "epoch": 0.092, "grad_norm": 5.831174849159176, "kl": 0.047119140625, "learning_rate": 9.539999999999999e-07, "loss": 0.0019, "reward": 1.6979167461395264, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6979166865348816, "step": 138 }, { "completion_length": 94.53125, "epoch": 0.09266666666666666, "grad_norm": 2.2965076105043463, "kl": 0.05224609375, "learning_rate": 9.536666666666667e-07, "loss": 0.0021, "reward": 1.6427083015441895, "reward_std": 0.17490029335021973, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6427083015441895, "step": 139 }, { "completion_length": 106.1875, "epoch": 0.09333333333333334, "grad_norm": 4.521131590780394, "kl": 0.050537109375, "learning_rate": 9.533333333333333e-07, "loss": 0.002, "reward": 1.6636160612106323, "reward_std": 0.209461510181427, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6636160612106323, "step": 140 }, { "completion_length": 112.46875, "epoch": 0.094, "grad_norm": 2.3403608421538866, "kl": 0.05224609375, "learning_rate": 9.529999999999999e-07, "loss": 0.0021, "reward": 1.803125023841858, "reward_std": 0.13190963864326477, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8031250238418579, "step": 141 }, { "completion_length": 104.15625, "epoch": 0.09466666666666666, "grad_norm": 4.089428074295178, "kl": 0.0302734375, "learning_rate": 9.526666666666666e-07, "loss": 0.0012, "reward": 1.5026042461395264, "reward_std": 0.13052211701869965, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5026041269302368, "step": 142 }, { "completion_length": 94.59375, "epoch": 0.09533333333333334, "grad_norm": 2.742975363741501, "kl": 0.04248046875, "learning_rate": 9.523333333333333e-07, "loss": 0.0017, "reward": 1.7286458015441895, "reward_std": 0.03705654293298721, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7286458015441895, "step": 143 }, { "completion_length": 121.15625, "epoch": 0.096, "grad_norm": 1.4148412660229537, "kl": 0.0267333984375, "learning_rate": 9.52e-07, "loss": 0.0011, "reward": 1.7161458730697632, "reward_std": 0.23622477054595947, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7786458730697632, "step": 144 }, { "completion_length": 97.84375, "epoch": 0.09666666666666666, "grad_norm": 31.570863087820637, "kl": 0.9140625, "learning_rate": 9.516666666666666e-07, "loss": 0.0365, "reward": 1.7822916507720947, "reward_std": 0.18088197708129883, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8135416507720947, "step": 145 }, { "completion_length": 114.5, "epoch": 0.09733333333333333, "grad_norm": 2.760421870357055, "kl": 0.02783203125, "learning_rate": 9.513333333333333e-07, "loss": 0.0011, "reward": 1.616964340209961, "reward_std": 0.09507784247398376, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6482142806053162, "step": 146 }, { "completion_length": 97.40625, "epoch": 0.098, "grad_norm": 2.9797147160589734, "kl": 0.03564453125, "learning_rate": 9.509999999999999e-07, "loss": 0.0014, "reward": 1.7520833015441895, "reward_std": 0.08522062003612518, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7520833015441895, "step": 147 }, { "completion_length": 99.4375, "epoch": 0.09866666666666667, "grad_norm": 3.5608700550788015, "kl": 0.04638671875, "learning_rate": 9.506666666666667e-07, "loss": 0.0019, "reward": 1.8136160373687744, "reward_std": 0.10069677233695984, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8136160373687744, "step": 148 }, { "completion_length": 106.96875, "epoch": 0.09933333333333333, "grad_norm": 3.0952618974405435, "kl": 0.026123046875, "learning_rate": 9.503333333333333e-07, "loss": 0.001, "reward": 1.515625, "reward_std": 0.28559717535972595, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.546875, "step": 149 }, { "completion_length": 90.15625, "epoch": 0.1, "grad_norm": 8.444212029448154, "kl": 0.041015625, "learning_rate": 9.499999999999999e-07, "loss": 0.0016, "reward": 1.7885416746139526, "reward_std": 0.1571868509054184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7885416746139526, "step": 150 }, { "completion_length": 96.90625, "epoch": 0.10066666666666667, "grad_norm": 5.453295526634438, "kl": 0.0400390625, "learning_rate": 9.496666666666666e-07, "loss": 0.0016, "reward": 1.8177083730697632, "reward_std": 0.14093759655952454, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8489583134651184, "step": 151 }, { "completion_length": 90.96875, "epoch": 0.10133333333333333, "grad_norm": 2.2883383271865596, "kl": 0.03369140625, "learning_rate": 9.493333333333334e-07, "loss": 0.0014, "reward": 1.890625, "reward_std": 0.21875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.890625, "step": 152 }, { "completion_length": 94.9375, "epoch": 0.102, "grad_norm": 28.983911787037535, "kl": 0.036376953125, "learning_rate": 9.489999999999999e-07, "loss": 0.0015, "reward": 1.7041666507720947, "reward_std": 0.10775899887084961, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7041666507720947, "step": 153 }, { "completion_length": 96.5625, "epoch": 0.10266666666666667, "grad_norm": 7.561932397443963, "kl": 0.0322265625, "learning_rate": 9.486666666666666e-07, "loss": 0.0013, "reward": 1.53125, "reward_std": 0.41456207633018494, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.59375, "step": 154 }, { "completion_length": 97.875, "epoch": 0.10333333333333333, "grad_norm": 8.721610414089684, "kl": 0.037109375, "learning_rate": 9.483333333333333e-07, "loss": 0.0015, "reward": 1.6531250476837158, "reward_std": 0.14993596076965332, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6531250476837158, "step": 155 }, { "completion_length": 99.75, "epoch": 0.104, "grad_norm": 0.8896757542501016, "kl": 0.016357421875, "learning_rate": 9.479999999999999e-07, "loss": 0.0007, "reward": 1.9375, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 156 }, { "completion_length": 72.625, "epoch": 0.10466666666666667, "grad_norm": 2.646041022062639, "kl": 0.036865234375, "learning_rate": 9.476666666666666e-07, "loss": 0.0015, "reward": 1.8020833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8333333730697632, "step": 157 }, { "completion_length": 105.09375, "epoch": 0.10533333333333333, "grad_norm": 9.535064159400097, "kl": 0.04638671875, "learning_rate": 9.473333333333333e-07, "loss": 0.0019, "reward": 1.6458333730697632, "reward_std": 0.3764490485191345, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6770833730697632, "step": 158 }, { "completion_length": 86.0, "epoch": 0.106, "grad_norm": 3.6058861863680587, "kl": 0.031494140625, "learning_rate": 9.469999999999999e-07, "loss": 0.0013, "reward": 1.4343750476837158, "reward_std": 0.33349883556365967, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.590624988079071, "step": 159 }, { "completion_length": 97.71875, "epoch": 0.10666666666666667, "grad_norm": 13.43175032344286, "kl": 0.05712890625, "learning_rate": 9.466666666666666e-07, "loss": 0.0023, "reward": 1.5738095045089722, "reward_std": 0.45403021574020386, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.6675595045089722, "step": 160 }, { "completion_length": 98.6875, "epoch": 0.10733333333333334, "grad_norm": 5.431258267508595, "kl": 0.033935546875, "learning_rate": 9.463333333333334e-07, "loss": 0.0014, "reward": 1.7708333730697632, "reward_std": 0.2940104603767395, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8333333134651184, "step": 161 }, { "completion_length": 99.8125, "epoch": 0.108, "grad_norm": 5.848674208288759, "kl": 0.036376953125, "learning_rate": 9.459999999999999e-07, "loss": 0.0015, "reward": 1.357812523841858, "reward_std": 0.4710971713066101, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.5140625238418579, "step": 162 }, { "completion_length": 96.40625, "epoch": 0.10866666666666666, "grad_norm": 4.476410398380337, "kl": 0.0693359375, "learning_rate": 9.456666666666666e-07, "loss": 0.0028, "reward": 1.742708444595337, "reward_std": 0.45598694682121277, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.8364583253860474, "step": 163 }, { "completion_length": 103.03125, "epoch": 0.10933333333333334, "grad_norm": 1.8416547906612528, "kl": 0.031005859375, "learning_rate": 9.453333333333333e-07, "loss": 0.0012, "reward": 1.65625, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.65625, "step": 164 }, { "completion_length": 90.875, "epoch": 0.11, "grad_norm": 3.1657965238193997, "kl": 0.035888671875, "learning_rate": 9.45e-07, "loss": 0.0014, "reward": 1.7265625, "reward_std": 0.1709880828857422, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7265624403953552, "step": 165 }, { "completion_length": 108.0, "epoch": 0.11066666666666666, "grad_norm": 2.348624956866292, "kl": 0.03369140625, "learning_rate": 9.446666666666666e-07, "loss": 0.0013, "reward": 1.821874976158142, "reward_std": 0.20352619886398315, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8531249761581421, "step": 166 }, { "completion_length": 105.03125, "epoch": 0.11133333333333334, "grad_norm": 5.083300938296901, "kl": 0.0498046875, "learning_rate": 9.443333333333333e-07, "loss": 0.002, "reward": 1.6375000476837158, "reward_std": 0.21424409747123718, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.699999988079071, "step": 167 }, { "completion_length": 91.4375, "epoch": 0.112, "grad_norm": 8.492183559693906, "kl": 0.0380859375, "learning_rate": 9.439999999999999e-07, "loss": 0.0015, "reward": 1.5645833015441895, "reward_std": 0.18701881170272827, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5645833611488342, "step": 168 }, { "completion_length": 98.8125, "epoch": 0.11266666666666666, "grad_norm": 2.330518258829748, "kl": 0.04638671875, "learning_rate": 9.436666666666667e-07, "loss": 0.0019, "reward": 1.7317708730697632, "reward_std": 0.21864327788352966, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7317708134651184, "step": 169 }, { "completion_length": 91.125, "epoch": 0.11333333333333333, "grad_norm": 5.580824754264434, "kl": 0.0673828125, "learning_rate": 9.433333333333333e-07, "loss": 0.0027, "reward": 1.7447917461395264, "reward_std": 0.2797542214393616, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8072916865348816, "step": 170 }, { "completion_length": 91.53125, "epoch": 0.114, "grad_norm": 4.908398456475127, "kl": 0.04248046875, "learning_rate": 9.429999999999999e-07, "loss": 0.0017, "reward": 1.636458396911621, "reward_std": 0.13999196887016296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6364583373069763, "step": 171 }, { "completion_length": 101.1875, "epoch": 0.11466666666666667, "grad_norm": 6.62930187460451, "kl": 0.059814453125, "learning_rate": 9.426666666666666e-07, "loss": 0.0024, "reward": 1.670312523841858, "reward_std": 0.2906249761581421, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7328125238418579, "step": 172 }, { "completion_length": 96.25, "epoch": 0.11533333333333333, "grad_norm": 4.053764339051984, "kl": 0.0419921875, "learning_rate": 9.423333333333333e-07, "loss": 0.0017, "reward": 1.679464340209961, "reward_std": 0.23642607033252716, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6794642806053162, "step": 173 }, { "completion_length": 102.75, "epoch": 0.116, "grad_norm": 2.778918113874751, "kl": 0.0201416015625, "learning_rate": 9.419999999999999e-07, "loss": 0.0008, "reward": 1.6041667461395264, "reward_std": 0.2784034311771393, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6354166269302368, "step": 174 }, { "completion_length": 108.4375, "epoch": 0.11666666666666667, "grad_norm": 0.1615595138321011, "kl": 0.031982421875, "learning_rate": 9.416666666666666e-07, "loss": 0.0013, "reward": 1.8125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 175 }, { "completion_length": 115.5, "epoch": 0.11733333333333333, "grad_norm": 5.64898991257348, "kl": 0.055419921875, "learning_rate": 9.413333333333333e-07, "loss": 0.0022, "reward": 1.6416666507720947, "reward_std": 0.18436621129512787, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6729167103767395, "step": 176 }, { "completion_length": 75.71875, "epoch": 0.118, "grad_norm": 6.773144201972213, "kl": 0.049072265625, "learning_rate": 9.409999999999999e-07, "loss": 0.002, "reward": 1.6593749523162842, "reward_std": 0.16504085063934326, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.659375011920929, "step": 177 }, { "completion_length": 98.4375, "epoch": 0.11866666666666667, "grad_norm": 7.033374188259998, "kl": 0.05322265625, "learning_rate": 9.406666666666666e-07, "loss": 0.0021, "reward": 1.6765625476837158, "reward_std": 0.13490571081638336, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.676562488079071, "step": 178 }, { "completion_length": 90.71875, "epoch": 0.11933333333333333, "grad_norm": 2.5754154078665508, "kl": 0.041748046875, "learning_rate": 9.403333333333333e-07, "loss": 0.0017, "reward": 1.816145896911621, "reward_std": 0.10654377937316895, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8161458373069763, "step": 179 }, { "completion_length": 94.4375, "epoch": 0.12, "grad_norm": 28.33401109999467, "kl": 0.041748046875, "learning_rate": 9.399999999999999e-07, "loss": 0.0017, "reward": 1.6294643878936768, "reward_std": 0.1349318027496338, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6294642686843872, "step": 180 }, { "completion_length": 105.65625, "epoch": 0.12066666666666667, "grad_norm": 2.9260429481800916, "kl": 0.0361328125, "learning_rate": 9.396666666666666e-07, "loss": 0.0014, "reward": 1.7005208730697632, "reward_std": 0.22831040620803833, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7317708730697632, "step": 181 }, { "completion_length": 104.0625, "epoch": 0.12133333333333333, "grad_norm": 3.400706900413098, "kl": 0.041748046875, "learning_rate": 9.393333333333334e-07, "loss": 0.0017, "reward": 1.7526042461395264, "reward_std": 0.21332323551177979, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7838541269302368, "step": 182 }, { "completion_length": 110.28125, "epoch": 0.122, "grad_norm": 3.9645967138484024, "kl": 0.04296875, "learning_rate": 9.389999999999999e-07, "loss": 0.0017, "reward": 1.5177083015441895, "reward_std": 0.3163855969905853, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5489583611488342, "step": 183 }, { "completion_length": 106.8125, "epoch": 0.12266666666666666, "grad_norm": 2.8983264717340815, "kl": 0.057373046875, "learning_rate": 9.386666666666666e-07, "loss": 0.0023, "reward": 1.649999976158142, "reward_std": 0.141310453414917, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6499999761581421, "step": 184 }, { "completion_length": 105.6875, "epoch": 0.12333333333333334, "grad_norm": 2.4836935517505823, "kl": 0.039306640625, "learning_rate": 9.383333333333333e-07, "loss": 0.0016, "reward": 1.8541667461395264, "reward_std": 0.27405625581741333, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.9166666865348816, "step": 185 }, { "completion_length": 106.84375, "epoch": 0.124, "grad_norm": 12.780558211071277, "kl": 0.039794921875, "learning_rate": 9.379999999999998e-07, "loss": 0.0016, "reward": 1.787500023841858, "reward_std": 0.24891288578510284, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.8812500238418579, "step": 186 }, { "completion_length": 98.1875, "epoch": 0.12466666666666666, "grad_norm": 31.405791260883998, "kl": 0.048828125, "learning_rate": 9.376666666666666e-07, "loss": 0.0019, "reward": 1.8286458253860474, "reward_std": 0.14768722653388977, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8286458253860474, "step": 187 }, { "completion_length": 121.40625, "epoch": 0.12533333333333332, "grad_norm": 3.3508506565171885, "kl": 0.035400390625, "learning_rate": 9.373333333333333e-07, "loss": 0.0014, "reward": 1.6427083015441895, "reward_std": 0.3479166626930237, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7052083611488342, "step": 188 }, { "completion_length": 98.3125, "epoch": 0.126, "grad_norm": 1.9873824410572383, "kl": 0.03857421875, "learning_rate": 9.37e-07, "loss": 0.0015, "reward": 1.8337054252624512, "reward_std": 0.026718882843852043, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8337053656578064, "step": 189 }, { "completion_length": 107.59375, "epoch": 0.12666666666666668, "grad_norm": 4.692157501966455, "kl": 0.034912109375, "learning_rate": 9.366666666666666e-07, "loss": 0.0014, "reward": 1.662500023841858, "reward_std": 0.16595885157585144, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6625000238418579, "step": 190 }, { "completion_length": 103.0, "epoch": 0.12733333333333333, "grad_norm": 6.549576927451648, "kl": 0.03857421875, "learning_rate": 9.363333333333333e-07, "loss": 0.0015, "reward": 1.8764880895614624, "reward_std": 0.14158311486244202, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8764880895614624, "step": 191 }, { "completion_length": 99.90625, "epoch": 0.128, "grad_norm": 50.85190630937863, "kl": 0.05078125, "learning_rate": 9.36e-07, "loss": 0.002, "reward": 1.9354166984558105, "reward_std": 0.054166655987501144, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9354166388511658, "step": 192 }, { "completion_length": 99.0625, "epoch": 0.12866666666666668, "grad_norm": 28.340052285056856, "kl": 0.049560546875, "learning_rate": 9.356666666666666e-07, "loss": 0.002, "reward": 1.681249976158142, "reward_std": 0.1324199140071869, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6812499761581421, "step": 193 }, { "completion_length": 110.0625, "epoch": 0.12933333333333333, "grad_norm": 4.846128820289932, "kl": 0.0439453125, "learning_rate": 9.353333333333333e-07, "loss": 0.0018, "reward": 1.3454241752624512, "reward_std": 0.0857219398021698, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.3766741156578064, "step": 194 }, { "completion_length": 99.125, "epoch": 0.13, "grad_norm": 3.007934322932846, "kl": 0.06396484375, "learning_rate": 9.35e-07, "loss": 0.0026, "reward": 1.9319195747375488, "reward_std": 0.03524615615606308, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9319196343421936, "step": 195 }, { "completion_length": 102.0, "epoch": 0.13066666666666665, "grad_norm": 7.58579661072562, "kl": 0.050048828125, "learning_rate": 9.346666666666666e-07, "loss": 0.002, "reward": 1.8020832538604736, "reward_std": 0.14893567562103271, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 196 }, { "completion_length": 97.96875, "epoch": 0.13133333333333333, "grad_norm": 5.690025510841803, "kl": 0.041015625, "learning_rate": 9.343333333333333e-07, "loss": 0.0016, "reward": 1.8125, "reward_std": 0.17955836653709412, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125000596046448, "step": 197 }, { "completion_length": 93.0625, "epoch": 0.132, "grad_norm": 3.1650326595475593, "kl": 0.051025390625, "learning_rate": 9.34e-07, "loss": 0.002, "reward": 1.7933779954910278, "reward_std": 0.07717864960432053, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7933779954910278, "step": 198 }, { "completion_length": 94.34375, "epoch": 0.13266666666666665, "grad_norm": 0.11364790413943804, "kl": 0.04150390625, "learning_rate": 9.336666666666666e-07, "loss": 0.0017, "reward": 1.96875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 199 }, { "completion_length": 92.71875, "epoch": 0.13333333333333333, "grad_norm": 3.4666357477038185, "kl": 0.042236328125, "learning_rate": 9.333333333333333e-07, "loss": 0.0017, "reward": 1.738541603088379, "reward_std": 0.1012147068977356, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7385416626930237, "step": 200 }, { "completion_length": 105.96875, "epoch": 0.134, "grad_norm": 4.230960311339918, "kl": 0.039794921875, "learning_rate": 9.33e-07, "loss": 0.0016, "reward": 1.8156249523162842, "reward_std": 0.24091878533363342, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8468749523162842, "step": 201 }, { "completion_length": 99.03125, "epoch": 0.13466666666666666, "grad_norm": 2.5354137319236933, "kl": 0.03759765625, "learning_rate": 9.326666666666666e-07, "loss": 0.0015, "reward": 1.7916667461395264, "reward_std": 0.18496489524841309, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666269302368, "step": 202 }, { "completion_length": 90.8125, "epoch": 0.13533333333333333, "grad_norm": 2.140883288842938, "kl": 0.0439453125, "learning_rate": 9.323333333333334e-07, "loss": 0.0018, "reward": 1.875, "reward_std": 0.11383545398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8749999403953552, "step": 203 }, { "completion_length": 95.125, "epoch": 0.136, "grad_norm": 3.5058952323412718, "kl": 0.044189453125, "learning_rate": 9.32e-07, "loss": 0.0018, "reward": 1.6783483028411865, "reward_std": 0.11462653428316116, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.678348183631897, "step": 204 }, { "completion_length": 108.21875, "epoch": 0.13666666666666666, "grad_norm": 1.9986785541926297, "kl": 0.050048828125, "learning_rate": 9.316666666666666e-07, "loss": 0.002, "reward": 1.8977677822113037, "reward_std": 0.14099684357643127, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9290179014205933, "step": 205 }, { "completion_length": 91.46875, "epoch": 0.13733333333333334, "grad_norm": 2.4103465861569826, "kl": 0.04345703125, "learning_rate": 9.313333333333333e-07, "loss": 0.0017, "reward": 1.90625, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 206 }, { "completion_length": 99.625, "epoch": 0.138, "grad_norm": 2.035811139337149, "kl": 0.052490234375, "learning_rate": 9.31e-07, "loss": 0.0021, "reward": 1.6744792461395264, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6744791269302368, "step": 207 }, { "completion_length": 87.3125, "epoch": 0.13866666666666666, "grad_norm": 2.726075029514403, "kl": 0.03515625, "learning_rate": 9.306666666666666e-07, "loss": 0.0014, "reward": 1.6854166984558105, "reward_std": 0.0870281308889389, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6854166388511658, "step": 208 }, { "completion_length": 113.875, "epoch": 0.13933333333333334, "grad_norm": 2.026294505142206, "kl": 0.0654296875, "learning_rate": 9.303333333333333e-07, "loss": 0.0026, "reward": 1.6531250476837158, "reward_std": 0.17081207036972046, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.684374988079071, "step": 209 }, { "completion_length": 103.09375, "epoch": 0.14, "grad_norm": 3.0994231347890304, "kl": 0.054443359375, "learning_rate": 9.3e-07, "loss": 0.0022, "reward": 1.7213541269302368, "reward_std": 0.07604166120290756, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7213541269302368, "step": 210 }, { "completion_length": 96.6875, "epoch": 0.14066666666666666, "grad_norm": 63.25053341023618, "kl": 0.040771484375, "learning_rate": 9.296666666666666e-07, "loss": 0.0016, "reward": 1.7119791507720947, "reward_std": 0.30735599994659424, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7119791507720947, "step": 211 }, { "completion_length": 114.40625, "epoch": 0.14133333333333334, "grad_norm": 1.9072100737188098, "kl": 0.034423828125, "learning_rate": 9.293333333333333e-07, "loss": 0.0014, "reward": 1.6953125, "reward_std": 0.19060374796390533, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7265625, "step": 212 }, { "completion_length": 107.1875, "epoch": 0.142, "grad_norm": 2.702969784734906, "kl": 0.06591796875, "learning_rate": 9.29e-07, "loss": 0.0026, "reward": 1.7859375476837158, "reward_std": 0.09866960346698761, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.785937488079071, "step": 213 }, { "completion_length": 105.09375, "epoch": 0.14266666666666666, "grad_norm": 2.885056288051695, "kl": 0.040771484375, "learning_rate": 9.286666666666666e-07, "loss": 0.0016, "reward": 1.7234375476837158, "reward_std": 0.17615234851837158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7234375476837158, "step": 214 }, { "completion_length": 90.96875, "epoch": 0.14333333333333334, "grad_norm": 1.1830621979917098, "kl": 0.033935546875, "learning_rate": 9.283333333333333e-07, "loss": 0.0014, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 215 }, { "completion_length": 95.28125, "epoch": 0.144, "grad_norm": 1.6585782178240749, "kl": 0.042236328125, "learning_rate": 9.28e-07, "loss": 0.0017, "reward": 1.71875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 216 }, { "completion_length": 95.96875, "epoch": 0.14466666666666667, "grad_norm": 3.2013250030885065, "kl": 0.050048828125, "learning_rate": 9.276666666666666e-07, "loss": 0.002, "reward": 1.8697917461395264, "reward_std": 0.1657797247171402, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9010416269302368, "step": 217 }, { "completion_length": 98.34375, "epoch": 0.14533333333333334, "grad_norm": 2.9703153710792702, "kl": 0.044677734375, "learning_rate": 9.273333333333333e-07, "loss": 0.0018, "reward": 1.6979167461395264, "reward_std": 0.14331157505512238, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6979166269302368, "step": 218 }, { "completion_length": 105.15625, "epoch": 0.146, "grad_norm": 3.7396070670642607, "kl": 0.0274658203125, "learning_rate": 9.27e-07, "loss": 0.0011, "reward": 1.6510417461395264, "reward_std": 0.0729166641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6510416269302368, "step": 219 }, { "completion_length": 106.75, "epoch": 0.14666666666666667, "grad_norm": 14.62890246170383, "kl": 0.04296875, "learning_rate": 9.266666666666665e-07, "loss": 0.0017, "reward": 1.6083333492279053, "reward_std": 0.3248142600059509, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6083333492279053, "step": 220 }, { "completion_length": 102.65625, "epoch": 0.14733333333333334, "grad_norm": 8.971206078170704, "kl": 0.037841796875, "learning_rate": 9.263333333333333e-07, "loss": 0.0015, "reward": 1.7395833730697632, "reward_std": 0.25, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 221 }, { "completion_length": 92.96875, "epoch": 0.148, "grad_norm": 2.0277100874100196, "kl": 0.034912109375, "learning_rate": 9.26e-07, "loss": 0.0014, "reward": 1.9322917461395264, "reward_std": 0.019946396350860596, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322917461395264, "step": 222 }, { "completion_length": 116.3125, "epoch": 0.14866666666666667, "grad_norm": 2.904480189581057, "kl": 0.046142578125, "learning_rate": 9.256666666666666e-07, "loss": 0.0019, "reward": 1.7947916984558105, "reward_std": 0.1929364949464798, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8260416984558105, "step": 223 }, { "completion_length": 108.46875, "epoch": 0.14933333333333335, "grad_norm": 2.6782520313472205, "kl": 0.027587890625, "learning_rate": 9.253333333333333e-07, "loss": 0.0011, "reward": 1.7864583730697632, "reward_std": 0.22678396105766296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7864583134651184, "step": 224 }, { "completion_length": 100.0, "epoch": 0.15, "grad_norm": 47.132355036989274, "kl": 0.045166015625, "learning_rate": 9.25e-07, "loss": 0.0018, "reward": 1.6531250476837158, "reward_std": 0.2709237337112427, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.684374988079071, "step": 225 }, { "completion_length": 99.1875, "epoch": 0.15066666666666667, "grad_norm": 2.6159606939678675, "kl": 0.0303955078125, "learning_rate": 9.246666666666666e-07, "loss": 0.0012, "reward": 1.8854167461395264, "reward_std": 0.1763354390859604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8854166865348816, "step": 226 }, { "completion_length": 96.59375, "epoch": 0.15133333333333332, "grad_norm": 4.5740520833246645, "kl": 0.05859375, "learning_rate": 9.243333333333333e-07, "loss": 0.0023, "reward": 1.6848958730697632, "reward_std": 0.16071045398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6848958730697632, "step": 227 }, { "completion_length": 101.90625, "epoch": 0.152, "grad_norm": 74.00849290024082, "kl": 0.045166015625, "learning_rate": 9.24e-07, "loss": 0.0018, "reward": 1.7468750476837158, "reward_std": 0.1703011691570282, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7468750476837158, "step": 228 }, { "completion_length": 121.375, "epoch": 0.15266666666666667, "grad_norm": 4.979675063216591, "kl": 0.03515625, "learning_rate": 9.236666666666666e-07, "loss": 0.0014, "reward": 1.7442708015441895, "reward_std": 0.2780952453613281, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7755208611488342, "step": 229 }, { "completion_length": 98.59375, "epoch": 0.15333333333333332, "grad_norm": 2.198585505462029, "kl": 0.02197265625, "learning_rate": 9.233333333333333e-07, "loss": 0.0009, "reward": 1.7708333730697632, "reward_std": 0.1555021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 230 }, { "completion_length": 102.0625, "epoch": 0.154, "grad_norm": 2.021262812645677, "kl": 0.052734375, "learning_rate": 9.23e-07, "loss": 0.0021, "reward": 1.881250023841858, "reward_std": 0.16353629529476166, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8812500238418579, "step": 231 }, { "completion_length": 95.5, "epoch": 0.15466666666666667, "grad_norm": 67.79620875753693, "kl": 0.0250244140625, "learning_rate": 9.226666666666666e-07, "loss": 0.001, "reward": 1.730208396911621, "reward_std": 0.0364374965429306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7302083969116211, "step": 232 }, { "completion_length": 95.65625, "epoch": 0.15533333333333332, "grad_norm": 2.40210799460135, "kl": 0.03515625, "learning_rate": 9.223333333333333e-07, "loss": 0.0014, "reward": 1.722916603088379, "reward_std": 0.15391379594802856, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7229167222976685, "step": 233 }, { "completion_length": 103.40625, "epoch": 0.156, "grad_norm": 5.485494146630768, "kl": 0.046875, "learning_rate": 9.22e-07, "loss": 0.0019, "reward": 1.6875, "reward_std": 0.28076934814453125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 234 }, { "completion_length": 100.375, "epoch": 0.15666666666666668, "grad_norm": 1.4102594153693595, "kl": 0.0252685546875, "learning_rate": 9.216666666666666e-07, "loss": 0.001, "reward": 1.7864583730697632, "reward_std": 0.1808105856180191, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8177083730697632, "step": 235 }, { "completion_length": 103.3125, "epoch": 0.15733333333333333, "grad_norm": 3.686468289885704, "kl": 0.03076171875, "learning_rate": 9.213333333333333e-07, "loss": 0.0012, "reward": 1.7062499523162842, "reward_std": 0.26066654920578003, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.737500011920929, "step": 236 }, { "completion_length": 127.875, "epoch": 0.158, "grad_norm": 6.475914203835673, "kl": 0.032470703125, "learning_rate": 9.21e-07, "loss": 0.0013, "reward": 1.6124999523162842, "reward_std": 0.29049044847488403, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.737500011920929, "step": 237 }, { "completion_length": 103.25, "epoch": 0.15866666666666668, "grad_norm": 2.3128754020100977, "kl": 0.039794921875, "learning_rate": 9.206666666666666e-07, "loss": 0.0016, "reward": 1.793229103088379, "reward_std": 0.1004544198513031, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7932291626930237, "step": 238 }, { "completion_length": 109.9375, "epoch": 0.15933333333333333, "grad_norm": 2.7668902712919805, "kl": 0.046875, "learning_rate": 9.203333333333333e-07, "loss": 0.0019, "reward": 1.6489583253860474, "reward_std": 0.2935434877872467, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6802083253860474, "step": 239 }, { "completion_length": 98.875, "epoch": 0.16, "grad_norm": 4.571970552387521, "kl": 0.030517578125, "learning_rate": 9.2e-07, "loss": 0.0012, "reward": 1.8385417461395264, "reward_std": 0.15121470391750336, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416269302368, "step": 240 }, { "completion_length": 102.625, "epoch": 0.16066666666666668, "grad_norm": 14.092232028746135, "kl": 0.0439453125, "learning_rate": 9.196666666666666e-07, "loss": 0.0018, "reward": 1.7687499523162842, "reward_std": 0.28225451707839966, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7999999523162842, "step": 241 }, { "completion_length": 100.15625, "epoch": 0.16133333333333333, "grad_norm": 3.095507179351653, "kl": 0.039306640625, "learning_rate": 9.193333333333333e-07, "loss": 0.0016, "reward": 1.53125, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5625, "step": 242 }, { "completion_length": 90.96875, "epoch": 0.162, "grad_norm": 2.85508803289734, "kl": 0.045166015625, "learning_rate": 9.19e-07, "loss": 0.0018, "reward": 1.912500023841858, "reward_std": 0.08643568307161331, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9124999642372131, "step": 243 }, { "completion_length": 107.65625, "epoch": 0.16266666666666665, "grad_norm": 4.656050370447121, "kl": 0.050537109375, "learning_rate": 9.186666666666666e-07, "loss": 0.002, "reward": 1.855729103088379, "reward_std": 0.19336140155792236, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8869791626930237, "step": 244 }, { "completion_length": 113.375, "epoch": 0.16333333333333333, "grad_norm": 3.618689873222807, "kl": 0.05078125, "learning_rate": 9.183333333333333e-07, "loss": 0.002, "reward": 1.6322917938232422, "reward_std": 0.23179635405540466, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6947916746139526, "step": 245 }, { "completion_length": 92.15625, "epoch": 0.164, "grad_norm": 5.783178174335252, "kl": 0.050048828125, "learning_rate": 9.18e-07, "loss": 0.002, "reward": 1.8854167461395264, "reward_std": 0.15872503817081451, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8854166269302368, "step": 246 }, { "completion_length": 93.875, "epoch": 0.16466666666666666, "grad_norm": 2.76451647998699, "kl": 0.036865234375, "learning_rate": 9.176666666666666e-07, "loss": 0.0015, "reward": 1.7720237970352173, "reward_std": 0.13674496114253998, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7720237970352173, "step": 247 }, { "completion_length": 115.875, "epoch": 0.16533333333333333, "grad_norm": 3.606796745422391, "kl": 0.057373046875, "learning_rate": 9.173333333333333e-07, "loss": 0.0023, "reward": 1.7026042938232422, "reward_std": 0.26235929131507874, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7338541746139526, "step": 248 }, { "completion_length": 94.4375, "epoch": 0.166, "grad_norm": 3.7951969539447026, "kl": 0.05419921875, "learning_rate": 9.17e-07, "loss": 0.0022, "reward": 1.5625, "reward_std": 0.22767089307308197, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5625, "step": 249 }, { "completion_length": 91.875, "epoch": 0.16666666666666666, "grad_norm": 2.461579515743762, "kl": 0.041748046875, "learning_rate": 9.166666666666665e-07, "loss": 0.0017, "reward": 1.8515625, "reward_std": 0.109375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8515625, "step": 250 }, { "completion_length": 96.875, "epoch": 0.16733333333333333, "grad_norm": 10.779518213001918, "kl": 0.06298828125, "learning_rate": 9.163333333333333e-07, "loss": 0.0025, "reward": 1.8684524297714233, "reward_std": 0.1055847704410553, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8684523701667786, "step": 251 }, { "completion_length": 97.96875, "epoch": 0.168, "grad_norm": 9.266745007510268, "kl": 0.05859375, "learning_rate": 9.16e-07, "loss": 0.0023, "reward": 1.7838542461395264, "reward_std": 0.15337765216827393, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8151041865348816, "step": 252 }, { "completion_length": 104.96875, "epoch": 0.16866666666666666, "grad_norm": 1.8576254501525191, "kl": 0.0400390625, "learning_rate": 9.156666666666666e-07, "loss": 0.0016, "reward": 1.6614583730697632, "reward_std": 0.149931401014328, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7239583134651184, "step": 253 }, { "completion_length": 90.0625, "epoch": 0.16933333333333334, "grad_norm": 4.407263253238808, "kl": 0.0546875, "learning_rate": 9.153333333333332e-07, "loss": 0.0022, "reward": 1.578125, "reward_std": 0.10223910212516785, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.578125, "step": 254 }, { "completion_length": 95.53125, "epoch": 0.17, "grad_norm": 4.248517093703313, "kl": 0.051513671875, "learning_rate": 9.15e-07, "loss": 0.0021, "reward": 1.8380208015441895, "reward_std": 0.10531838238239288, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8380208015441895, "step": 255 }, { "completion_length": 101.21875, "epoch": 0.17066666666666666, "grad_norm": 11.073014141442423, "kl": 0.0791015625, "learning_rate": 9.146666666666666e-07, "loss": 0.0032, "reward": 1.869270920753479, "reward_std": 0.14972177147865295, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8692708611488342, "step": 256 }, { "completion_length": 99.0, "epoch": 0.17133333333333334, "grad_norm": 2.178600234514558, "kl": 0.04736328125, "learning_rate": 9.143333333333333e-07, "loss": 0.0019, "reward": 1.90625, "reward_std": 0.1041666641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 257 }, { "completion_length": 86.0, "epoch": 0.172, "grad_norm": 6.299622091856718, "kl": 0.052978515625, "learning_rate": 9.14e-07, "loss": 0.0021, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 258 }, { "completion_length": 87.75, "epoch": 0.17266666666666666, "grad_norm": 2.9209317228854403, "kl": 0.05029296875, "learning_rate": 9.136666666666666e-07, "loss": 0.002, "reward": 1.8442708253860474, "reward_std": 0.12956276535987854, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8442708253860474, "step": 259 }, { "completion_length": 104.84375, "epoch": 0.17333333333333334, "grad_norm": 2.935854480180611, "kl": 0.0556640625, "learning_rate": 9.133333333333333e-07, "loss": 0.0022, "reward": 1.7677083015441895, "reward_std": 0.16685625910758972, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7677083611488342, "step": 260 }, { "completion_length": 88.46875, "epoch": 0.174, "grad_norm": 19.90675129095415, "kl": 0.10009765625, "learning_rate": 9.13e-07, "loss": 0.004, "reward": 1.7421875, "reward_std": 0.18218742311000824, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7421875596046448, "step": 261 }, { "completion_length": 90.03125, "epoch": 0.17466666666666666, "grad_norm": 2.250216054864341, "kl": 0.06787109375, "learning_rate": 9.126666666666666e-07, "loss": 0.0027, "reward": 1.6875, "reward_std": 0.25, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 262 }, { "completion_length": 113.8125, "epoch": 0.17533333333333334, "grad_norm": 2.635228398241944, "kl": 0.04931640625, "learning_rate": 9.123333333333333e-07, "loss": 0.002, "reward": 1.8468749523162842, "reward_std": 0.22085149586200714, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.878125011920929, "step": 263 }, { "completion_length": 95.34375, "epoch": 0.176, "grad_norm": 2.5479408379675283, "kl": 0.047119140625, "learning_rate": 9.12e-07, "loss": 0.0019, "reward": 1.7916667461395264, "reward_std": 0.1666666567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666865348816, "step": 264 }, { "completion_length": 97.40625, "epoch": 0.17666666666666667, "grad_norm": 5.848505754917177, "kl": 0.057861328125, "learning_rate": 9.116666666666666e-07, "loss": 0.0023, "reward": 1.8515625, "reward_std": 0.10394902527332306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8515625, "step": 265 }, { "completion_length": 95.15625, "epoch": 0.17733333333333334, "grad_norm": 3.0857401128537187, "kl": 0.05224609375, "learning_rate": 9.113333333333333e-07, "loss": 0.0021, "reward": 1.8390624523162842, "reward_std": 0.18387053906917572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.839062511920929, "step": 266 }, { "completion_length": 100.6875, "epoch": 0.178, "grad_norm": 5.032591464544968, "kl": 0.06591796875, "learning_rate": 9.109999999999999e-07, "loss": 0.0026, "reward": 1.5610119104385376, "reward_std": 0.12951864302158356, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5610119104385376, "step": 267 }, { "completion_length": 105.46875, "epoch": 0.17866666666666667, "grad_norm": 4.178811061742027, "kl": 0.047119140625, "learning_rate": 9.106666666666666e-07, "loss": 0.0019, "reward": 1.7552083730697632, "reward_std": 0.18239547312259674, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7864583730697632, "step": 268 }, { "completion_length": 102.59375, "epoch": 0.17933333333333334, "grad_norm": 13.993578504794353, "kl": 0.0517578125, "learning_rate": 9.103333333333333e-07, "loss": 0.0021, "reward": 1.6041667461395264, "reward_std": 0.22276708483695984, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6041666865348816, "step": 269 }, { "completion_length": 110.15625, "epoch": 0.18, "grad_norm": 2.1550551072668056, "kl": 0.0311279296875, "learning_rate": 9.1e-07, "loss": 0.0012, "reward": 1.4869792461395264, "reward_std": 0.08779378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.4869791567325592, "step": 270 }, { "completion_length": 99.53125, "epoch": 0.18066666666666667, "grad_norm": 4.596448450894119, "kl": 0.042724609375, "learning_rate": 9.096666666666665e-07, "loss": 0.0017, "reward": 1.7489583492279053, "reward_std": 0.1516544371843338, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7489583492279053, "step": 271 }, { "completion_length": 111.875, "epoch": 0.18133333333333335, "grad_norm": 10.966614600190884, "kl": 0.0537109375, "learning_rate": 9.093333333333333e-07, "loss": 0.0021, "reward": 1.773958444595337, "reward_std": 0.13339340686798096, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7739583253860474, "step": 272 }, { "completion_length": 114.46875, "epoch": 0.182, "grad_norm": 1.7076418588035502, "kl": 0.038330078125, "learning_rate": 9.09e-07, "loss": 0.0015, "reward": 1.7083333730697632, "reward_std": 0.14716878533363342, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7395833730697632, "step": 273 }, { "completion_length": 104.40625, "epoch": 0.18266666666666667, "grad_norm": 2.3477343289632233, "kl": 0.043701171875, "learning_rate": 9.086666666666666e-07, "loss": 0.0018, "reward": 1.7364583015441895, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7364583015441895, "step": 274 }, { "completion_length": 94.28125, "epoch": 0.18333333333333332, "grad_norm": 7.923996941629084, "kl": 0.06103515625, "learning_rate": 9.083333333333332e-07, "loss": 0.0024, "reward": 1.8098958730697632, "reward_std": 0.2180699110031128, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8411458730697632, "step": 275 }, { "completion_length": 91.09375, "epoch": 0.184, "grad_norm": 1.8045164795072997, "kl": 0.09130859375, "learning_rate": 9.08e-07, "loss": 0.0036, "reward": 1.8385417461395264, "reward_std": 0.04294900968670845, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416269302368, "step": 276 }, { "completion_length": 90.8125, "epoch": 0.18466666666666667, "grad_norm": 8.759774939348892, "kl": 0.042724609375, "learning_rate": 9.076666666666666e-07, "loss": 0.0017, "reward": 1.837499976158142, "reward_std": 0.08924409747123718, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8375000357627869, "step": 277 }, { "completion_length": 118.40625, "epoch": 0.18533333333333332, "grad_norm": 4.000160228528656, "kl": 0.04833984375, "learning_rate": 9.073333333333333e-07, "loss": 0.0019, "reward": 1.6104166507720947, "reward_std": 0.17388354241847992, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6416666507720947, "step": 278 }, { "completion_length": 107.4375, "epoch": 0.186, "grad_norm": 6.546242234359383, "kl": 0.0284423828125, "learning_rate": 9.07e-07, "loss": 0.0011, "reward": 1.6172618865966797, "reward_std": 0.15857936441898346, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6172619462013245, "step": 279 }, { "completion_length": 93.71875, "epoch": 0.18666666666666668, "grad_norm": 21.87119629083296, "kl": 0.498046875, "learning_rate": 9.066666666666665e-07, "loss": 0.02, "reward": 1.7395833730697632, "reward_std": 0.06680577993392944, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 280 }, { "completion_length": 107.15625, "epoch": 0.18733333333333332, "grad_norm": 15.812527243877218, "kl": 0.052734375, "learning_rate": 9.063333333333333e-07, "loss": 0.0021, "reward": 1.625, "reward_std": 0.3333333134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.625, "step": 281 }, { "completion_length": 104.34375, "epoch": 0.188, "grad_norm": 2.2869964277027637, "kl": 0.0308837890625, "learning_rate": 9.06e-07, "loss": 0.0012, "reward": 1.871354103088379, "reward_std": 0.13384521007537842, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8713542222976685, "step": 282 }, { "completion_length": 102.65625, "epoch": 0.18866666666666668, "grad_norm": 2.8839038440835827, "kl": 0.05224609375, "learning_rate": 9.056666666666666e-07, "loss": 0.0021, "reward": 1.7751116752624512, "reward_std": 0.163817897439003, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7751116156578064, "step": 283 }, { "completion_length": 97.59375, "epoch": 0.18933333333333333, "grad_norm": 12.787751548279791, "kl": 0.041748046875, "learning_rate": 9.053333333333332e-07, "loss": 0.0017, "reward": 1.6661458015441895, "reward_std": 0.1786557137966156, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6661458611488342, "step": 284 }, { "completion_length": 99.65625, "epoch": 0.19, "grad_norm": 1.2678578785591448, "kl": 0.0869140625, "learning_rate": 9.05e-07, "loss": 0.0035, "reward": 1.7468750476837158, "reward_std": 0.01812220923602581, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.746874988079071, "step": 285 }, { "completion_length": 96.46875, "epoch": 0.19066666666666668, "grad_norm": 19.791191223573538, "kl": 0.06005859375, "learning_rate": 9.046666666666666e-07, "loss": 0.0024, "reward": 1.6505208015441895, "reward_std": 0.32222670316696167, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6505208611488342, "step": 286 }, { "completion_length": 94.15625, "epoch": 0.19133333333333333, "grad_norm": 18.170796963205614, "kl": 0.0693359375, "learning_rate": 9.043333333333333e-07, "loss": 0.0028, "reward": 1.8015625476837158, "reward_std": 0.17091664671897888, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.801562488079071, "step": 287 }, { "completion_length": 146.0, "epoch": 0.192, "grad_norm": 3.0022017669072, "kl": 0.04443359375, "learning_rate": 9.039999999999999e-07, "loss": 0.0018, "reward": 1.389062523841858, "reward_std": 0.2930019795894623, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.4203125238418579, "step": 288 }, { "completion_length": 89.9375, "epoch": 0.19266666666666668, "grad_norm": 8.983422917064903, "kl": 0.06689453125, "learning_rate": 9.036666666666666e-07, "loss": 0.0027, "reward": 1.6932291984558105, "reward_std": 0.09270832687616348, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6932291984558105, "step": 289 }, { "completion_length": 95.59375, "epoch": 0.19333333333333333, "grad_norm": 2.1345896116588965, "kl": 0.039794921875, "learning_rate": 9.033333333333333e-07, "loss": 0.0016, "reward": 1.75, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 290 }, { "completion_length": 116.1875, "epoch": 0.194, "grad_norm": 13.157319561450578, "kl": 0.054931640625, "learning_rate": 9.03e-07, "loss": 0.0022, "reward": 1.3604166507720947, "reward_std": 0.27489346265792847, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.3916666805744171, "step": 291 }, { "completion_length": 102.8125, "epoch": 0.19466666666666665, "grad_norm": 2.3921520652129558, "kl": 0.050537109375, "learning_rate": 9.026666666666665e-07, "loss": 0.002, "reward": 1.671875, "reward_std": 0.09523916989564896, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.671875, "step": 292 }, { "completion_length": 107.9375, "epoch": 0.19533333333333333, "grad_norm": 3.1986704968039152, "kl": 0.057373046875, "learning_rate": 9.023333333333333e-07, "loss": 0.0023, "reward": 1.5255208015441895, "reward_std": 0.15047045052051544, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5255208611488342, "step": 293 }, { "completion_length": 114.03125, "epoch": 0.196, "grad_norm": 1.9017987173948723, "kl": 0.040771484375, "learning_rate": 9.02e-07, "loss": 0.0016, "reward": 1.734375, "reward_std": 0.1848391890525818, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.765625, "step": 294 }, { "completion_length": 120.84375, "epoch": 0.19666666666666666, "grad_norm": 5.993080198180405, "kl": 0.060791015625, "learning_rate": 9.016666666666666e-07, "loss": 0.0024, "reward": 1.7109375, "reward_std": 0.2953792214393616, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7421875, "step": 295 }, { "completion_length": 112.8125, "epoch": 0.19733333333333333, "grad_norm": 3.521744115577763, "kl": 0.07080078125, "learning_rate": 9.013333333333333e-07, "loss": 0.0028, "reward": 1.5989583730697632, "reward_std": 0.1479978710412979, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5989583730697632, "step": 296 }, { "completion_length": 106.8125, "epoch": 0.198, "grad_norm": 3.841684336063397, "kl": 0.07373046875, "learning_rate": 9.01e-07, "loss": 0.003, "reward": 1.5166666507720947, "reward_std": 0.29966092109680176, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5479167103767395, "step": 297 }, { "completion_length": 100.875, "epoch": 0.19866666666666666, "grad_norm": 2.590867909190481, "kl": 0.054443359375, "learning_rate": 9.006666666666666e-07, "loss": 0.0022, "reward": 1.5583332777023315, "reward_std": 0.1302136480808258, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5583333373069763, "step": 298 }, { "completion_length": 119.4375, "epoch": 0.19933333333333333, "grad_norm": 11.943319726135588, "kl": 0.037841796875, "learning_rate": 9.003333333333333e-07, "loss": 0.0015, "reward": 1.728124976158142, "reward_std": 0.15271694958209991, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7281249761581421, "step": 299 }, { "completion_length": 114.75, "epoch": 0.2, "grad_norm": 3.5793903364661137, "kl": 0.05419921875, "learning_rate": 9e-07, "loss": 0.0022, "reward": 1.6666667461395264, "reward_std": 0.16572721302509308, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7291667461395264, "step": 300 }, { "completion_length": 112.09375, "epoch": 0.20066666666666666, "grad_norm": 5.180997080007257, "kl": 0.0458984375, "learning_rate": 8.996666666666665e-07, "loss": 0.0018, "reward": 1.6244791746139526, "reward_std": 0.11455857753753662, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6244791746139526, "step": 301 }, { "completion_length": 99.0, "epoch": 0.20133333333333334, "grad_norm": 2.2584657431154347, "kl": 0.07470703125, "learning_rate": 8.993333333333333e-07, "loss": 0.003, "reward": 1.7348958253860474, "reward_std": 0.0757172703742981, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7348958253860474, "step": 302 }, { "completion_length": 96.65625, "epoch": 0.202, "grad_norm": 5.41752183283855, "kl": 0.09619140625, "learning_rate": 8.99e-07, "loss": 0.0039, "reward": 1.9296875, "reward_std": 0.023097459226846695, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 303 }, { "completion_length": 114.21875, "epoch": 0.20266666666666666, "grad_norm": 2.0367995902110967, "kl": 0.031982421875, "learning_rate": 8.986666666666666e-07, "loss": 0.0013, "reward": 1.7883929014205933, "reward_std": 0.19821427762508392, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7883928418159485, "step": 304 }, { "completion_length": 87.46875, "epoch": 0.20333333333333334, "grad_norm": 7.059539715978511, "kl": 0.07763671875, "learning_rate": 8.983333333333332e-07, "loss": 0.0031, "reward": 1.8541667461395264, "reward_std": 0.0416666604578495, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666865348816, "step": 305 }, { "completion_length": 88.5, "epoch": 0.204, "grad_norm": 2.330592887394481, "kl": 0.03955078125, "learning_rate": 8.98e-07, "loss": 0.0016, "reward": 1.8958333730697632, "reward_std": 0.046348851174116135, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 306 }, { "completion_length": 90.34375, "epoch": 0.20466666666666666, "grad_norm": 2.3476226732425367, "kl": 0.0400390625, "learning_rate": 8.976666666666666e-07, "loss": 0.0016, "reward": 1.734375, "reward_std": 0.15625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 307 }, { "completion_length": 91.59375, "epoch": 0.20533333333333334, "grad_norm": 5.6604201993781125, "kl": 0.04736328125, "learning_rate": 8.973333333333333e-07, "loss": 0.0019, "reward": 1.7156250476837158, "reward_std": 0.08958332985639572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7156250476837158, "step": 308 }, { "completion_length": 94.9375, "epoch": 0.206, "grad_norm": 5.458705846331098, "kl": 0.04248046875, "learning_rate": 8.969999999999999e-07, "loss": 0.0017, "reward": 1.7598958015441895, "reward_std": 0.12494659423828125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7598958015441895, "step": 309 }, { "completion_length": 100.28125, "epoch": 0.20666666666666667, "grad_norm": 2.076860316558726, "kl": 0.050537109375, "learning_rate": 8.966666666666666e-07, "loss": 0.002, "reward": 1.8145833015441895, "reward_std": 0.037500008940696716, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8145833611488342, "step": 310 }, { "completion_length": 104.96875, "epoch": 0.20733333333333334, "grad_norm": 3.167190260788374, "kl": 0.06201171875, "learning_rate": 8.963333333333333e-07, "loss": 0.0025, "reward": 1.6781249046325684, "reward_std": 0.2845558524131775, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7406249642372131, "step": 311 }, { "completion_length": 104.0625, "epoch": 0.208, "grad_norm": 7.32782846162845, "kl": 0.0888671875, "learning_rate": 8.96e-07, "loss": 0.0036, "reward": 1.6005208492279053, "reward_std": 0.2029816210269928, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6005208492279053, "step": 312 }, { "completion_length": 97.9375, "epoch": 0.20866666666666667, "grad_norm": 4.9871685343415555, "kl": 0.091796875, "learning_rate": 8.956666666666667e-07, "loss": 0.0037, "reward": 1.7078125476837158, "reward_std": 0.2365349531173706, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.707812488079071, "step": 313 }, { "completion_length": 95.4375, "epoch": 0.20933333333333334, "grad_norm": 8.432316607369641, "kl": 0.05078125, "learning_rate": 8.953333333333332e-07, "loss": 0.002, "reward": 1.8729166984558105, "reward_std": 0.15285272896289825, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8729166984558105, "step": 314 }, { "completion_length": 104.1875, "epoch": 0.21, "grad_norm": 2.620571081133261, "kl": 0.07958984375, "learning_rate": 8.95e-07, "loss": 0.0032, "reward": 1.7578125, "reward_std": 0.2603033781051636, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7890625, "step": 315 }, { "completion_length": 103.03125, "epoch": 0.21066666666666667, "grad_norm": 4.552060460937647, "kl": 0.0732421875, "learning_rate": 8.946666666666667e-07, "loss": 0.0029, "reward": 1.9375, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 316 }, { "completion_length": 88.71875, "epoch": 0.21133333333333335, "grad_norm": 4.201277438621109, "kl": 0.0673828125, "learning_rate": 8.943333333333333e-07, "loss": 0.0027, "reward": 1.6206846237182617, "reward_std": 0.21335925161838531, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6206845045089722, "step": 317 }, { "completion_length": 105.90625, "epoch": 0.212, "grad_norm": 5.405379194790786, "kl": 0.07763671875, "learning_rate": 8.939999999999999e-07, "loss": 0.0031, "reward": 1.7708333730697632, "reward_std": 0.10738959908485413, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333134651184, "step": 318 }, { "completion_length": 96.34375, "epoch": 0.21266666666666667, "grad_norm": 5.232829388789652, "kl": 0.0947265625, "learning_rate": 8.936666666666667e-07, "loss": 0.0038, "reward": 1.7395833730697632, "reward_std": 0.24393707513809204, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 319 }, { "completion_length": 89.5, "epoch": 0.21333333333333335, "grad_norm": 10.305775233860423, "kl": 0.03662109375, "learning_rate": 8.933333333333333e-07, "loss": 0.0015, "reward": 1.6984374523162842, "reward_std": 0.12325052171945572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6984374523162842, "step": 320 }, { "completion_length": 105.375, "epoch": 0.214, "grad_norm": 3.4358157034905, "kl": 0.07958984375, "learning_rate": 8.93e-07, "loss": 0.0032, "reward": 1.7442708015441895, "reward_std": 0.10813502967357635, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7442708015441895, "step": 321 }, { "completion_length": 100.84375, "epoch": 0.21466666666666667, "grad_norm": 6.552353663583116, "kl": 0.06884765625, "learning_rate": 8.926666666666666e-07, "loss": 0.0027, "reward": 1.7708333730697632, "reward_std": 0.17176829278469086, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333134651184, "step": 322 }, { "completion_length": 102.53125, "epoch": 0.21533333333333332, "grad_norm": 2.5664934347088644, "kl": 0.0478515625, "learning_rate": 8.923333333333333e-07, "loss": 0.0019, "reward": 1.808333396911621, "reward_std": 0.3069930076599121, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8395833373069763, "step": 323 }, { "completion_length": 102.71875, "epoch": 0.216, "grad_norm": 5.709896106600502, "kl": 0.10205078125, "learning_rate": 8.92e-07, "loss": 0.0041, "reward": 1.84375, "reward_std": 0.3125, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.9375, "step": 324 }, { "completion_length": 95.875, "epoch": 0.21666666666666667, "grad_norm": 1.2980528792612382, "kl": 0.07373046875, "learning_rate": 8.916666666666667e-07, "loss": 0.003, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 325 }, { "completion_length": 109.0625, "epoch": 0.21733333333333332, "grad_norm": 2.756860412625057, "kl": 0.055419921875, "learning_rate": 8.913333333333332e-07, "loss": 0.0022, "reward": 1.6802083253860474, "reward_std": 0.1833425611257553, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7114583253860474, "step": 326 }, { "completion_length": 102.8125, "epoch": 0.218, "grad_norm": 3.685570692228128, "kl": 0.045166015625, "learning_rate": 8.91e-07, "loss": 0.0018, "reward": 1.7468750476837158, "reward_std": 0.15842358767986298, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.746874988079071, "step": 327 }, { "completion_length": 115.375, "epoch": 0.21866666666666668, "grad_norm": 1.8670998022046774, "kl": 0.02197265625, "learning_rate": 8.906666666666667e-07, "loss": 0.0009, "reward": 1.65625, "reward_std": 0.18157242238521576, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6875000596046448, "step": 328 }, { "completion_length": 96.0, "epoch": 0.21933333333333332, "grad_norm": 3.1023486343564928, "kl": 0.0380859375, "learning_rate": 8.903333333333333e-07, "loss": 0.0015, "reward": 1.890625, "reward_std": 0.12425211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8906249403953552, "step": 329 }, { "completion_length": 97.34375, "epoch": 0.22, "grad_norm": 3.5326876498989597, "kl": 0.032958984375, "learning_rate": 8.9e-07, "loss": 0.0013, "reward": 1.703125, "reward_std": 0.0937499850988388, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.703125, "step": 330 }, { "completion_length": 105.6875, "epoch": 0.22066666666666668, "grad_norm": 7.990616964254744, "kl": 0.06884765625, "learning_rate": 8.896666666666666e-07, "loss": 0.0027, "reward": 1.7625000476837158, "reward_std": 0.10374575108289719, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7625000476837158, "step": 331 }, { "completion_length": 94.90625, "epoch": 0.22133333333333333, "grad_norm": 1.1972463781579885, "kl": 0.0546875, "learning_rate": 8.893333333333333e-07, "loss": 0.0022, "reward": 1.96875, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9687499403953552, "step": 332 }, { "completion_length": 94.0, "epoch": 0.222, "grad_norm": 0.8544607056187763, "kl": 0.04833984375, "learning_rate": 8.89e-07, "loss": 0.0019, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 333 }, { "completion_length": 109.09375, "epoch": 0.22266666666666668, "grad_norm": 3.9381549739004535, "kl": 0.04248046875, "learning_rate": 8.886666666666667e-07, "loss": 0.0017, "reward": 1.7135417461395264, "reward_std": 0.1735902726650238, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7135417461395264, "step": 334 }, { "completion_length": 121.375, "epoch": 0.22333333333333333, "grad_norm": 3.572262980597358, "kl": 0.040283203125, "learning_rate": 8.883333333333332e-07, "loss": 0.0016, "reward": 1.5547618865966797, "reward_std": 0.3287668228149414, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.6485118865966797, "step": 335 }, { "completion_length": 108.75, "epoch": 0.224, "grad_norm": 2.2651746685447427, "kl": 0.07763671875, "learning_rate": 8.88e-07, "loss": 0.0031, "reward": 1.7555515766143799, "reward_std": 0.18703266978263855, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7868016362190247, "step": 336 }, { "completion_length": 133.75, "epoch": 0.22466666666666665, "grad_norm": 3.4503721354231858, "kl": 0.036376953125, "learning_rate": 8.876666666666667e-07, "loss": 0.0015, "reward": 1.84375, "reward_std": 0.2027510702610016, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 337 }, { "completion_length": 98.5, "epoch": 0.22533333333333333, "grad_norm": 1.7517972188963207, "kl": 0.04296875, "learning_rate": 8.873333333333333e-07, "loss": 0.0017, "reward": 1.9140625, "reward_std": 0.140625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9453125, "step": 338 }, { "completion_length": 113.0625, "epoch": 0.226, "grad_norm": 5.020134164630519, "kl": 0.0615234375, "learning_rate": 8.869999999999999e-07, "loss": 0.0025, "reward": 1.648958444595337, "reward_std": 0.363955020904541, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7114582657814026, "step": 339 }, { "completion_length": 102.71875, "epoch": 0.22666666666666666, "grad_norm": 7.108698254426284, "kl": 0.040771484375, "learning_rate": 8.866666666666667e-07, "loss": 0.0016, "reward": 1.6131696701049805, "reward_std": 0.2212633192539215, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6131696701049805, "step": 340 }, { "completion_length": 113.65625, "epoch": 0.22733333333333333, "grad_norm": 2.847388706288219, "kl": 0.0927734375, "learning_rate": 8.863333333333333e-07, "loss": 0.0037, "reward": 1.6822917461395264, "reward_std": 0.19845113158226013, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6822916865348816, "step": 341 }, { "completion_length": 104.625, "epoch": 0.228, "grad_norm": 5.708670139102101, "kl": 0.0771484375, "learning_rate": 8.86e-07, "loss": 0.0031, "reward": 1.8046875, "reward_std": 0.09541931003332138, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8046875596046448, "step": 342 }, { "completion_length": 110.03125, "epoch": 0.22866666666666666, "grad_norm": 4.301232167203465, "kl": 0.06640625, "learning_rate": 8.856666666666666e-07, "loss": 0.0027, "reward": 1.5859375, "reward_std": 0.1894388198852539, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5859375, "step": 343 }, { "completion_length": 99.3125, "epoch": 0.22933333333333333, "grad_norm": 2.054657666837539, "kl": 0.0751953125, "learning_rate": 8.853333333333332e-07, "loss": 0.003, "reward": 1.7807291746139526, "reward_std": 0.061612002551555634, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7807291746139526, "step": 344 }, { "completion_length": 98.78125, "epoch": 0.23, "grad_norm": 2.64387316742671, "kl": 0.10546875, "learning_rate": 8.85e-07, "loss": 0.0042, "reward": 1.8359375, "reward_std": 0.203125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8359375, "step": 345 }, { "completion_length": 95.09375, "epoch": 0.23066666666666666, "grad_norm": 2.949850791597313, "kl": 0.060302734375, "learning_rate": 8.846666666666667e-07, "loss": 0.0024, "reward": 1.9348958730697632, "reward_std": 0.0885416641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9348958730697632, "step": 346 }, { "completion_length": 86.34375, "epoch": 0.23133333333333334, "grad_norm": 1.3479033518315071, "kl": 0.08251953125, "learning_rate": 8.843333333333332e-07, "loss": 0.0033, "reward": 1.9375, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9374999403953552, "step": 347 }, { "completion_length": 99.75, "epoch": 0.232, "grad_norm": 9.75928769621437, "kl": 0.0634765625, "learning_rate": 8.839999999999999e-07, "loss": 0.0025, "reward": 1.7911458015441895, "reward_std": 0.08581355959177017, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7911458015441895, "step": 348 }, { "completion_length": 113.71875, "epoch": 0.23266666666666666, "grad_norm": 4.895181086073219, "kl": 0.05029296875, "learning_rate": 8.836666666666667e-07, "loss": 0.002, "reward": 1.7416666746139526, "reward_std": 0.08542665094137192, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7416666746139526, "step": 349 }, { "completion_length": 107.0, "epoch": 0.23333333333333334, "grad_norm": 2.4599517980623062, "kl": 0.033447265625, "learning_rate": 8.833333333333333e-07, "loss": 0.0013, "reward": 1.8958333730697632, "reward_std": 0.1555021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 350 }, { "completion_length": 108.09375, "epoch": 0.234, "grad_norm": 12.160264355666698, "kl": 0.053955078125, "learning_rate": 8.83e-07, "loss": 0.0022, "reward": 1.784895896911621, "reward_std": 0.16049198806285858, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7848957777023315, "step": 351 }, { "completion_length": 97.625, "epoch": 0.23466666666666666, "grad_norm": 4.480644088357104, "kl": 0.0693359375, "learning_rate": 8.826666666666666e-07, "loss": 0.0028, "reward": 1.7604167461395264, "reward_std": 0.10825316607952118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7604166865348816, "step": 352 }, { "completion_length": 108.75, "epoch": 0.23533333333333334, "grad_norm": 6.718880388990212, "kl": 0.042724609375, "learning_rate": 8.823333333333333e-07, "loss": 0.0017, "reward": 1.6437499523162842, "reward_std": 0.25794684886932373, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.643750011920929, "step": 353 }, { "completion_length": 116.59375, "epoch": 0.236, "grad_norm": 7.276527708962089, "kl": 0.048828125, "learning_rate": 8.82e-07, "loss": 0.002, "reward": 1.695833444595337, "reward_std": 0.20611600577831268, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7270833253860474, "step": 354 }, { "completion_length": 130.84375, "epoch": 0.23666666666666666, "grad_norm": 1.757592533698236, "kl": 0.045166015625, "learning_rate": 8.816666666666667e-07, "loss": 0.0018, "reward": 1.7062499523162842, "reward_std": 0.14030930399894714, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.706250011920929, "step": 355 }, { "completion_length": 96.90625, "epoch": 0.23733333333333334, "grad_norm": 1.8350815271860423, "kl": 0.07177734375, "learning_rate": 8.813333333333332e-07, "loss": 0.0029, "reward": 1.673437476158142, "reward_std": 0.03657718747854233, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6734374761581421, "step": 356 }, { "completion_length": 97.3125, "epoch": 0.238, "grad_norm": 4.487054342786195, "kl": 0.06640625, "learning_rate": 8.81e-07, "loss": 0.0027, "reward": 1.5635416507720947, "reward_std": 0.18573325872421265, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5635416507720947, "step": 357 }, { "completion_length": 108.3125, "epoch": 0.23866666666666667, "grad_norm": 7.909814309074039, "kl": 0.057373046875, "learning_rate": 8.806666666666667e-07, "loss": 0.0023, "reward": 1.6454613208770752, "reward_std": 0.18034188449382782, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6454613208770752, "step": 358 }, { "completion_length": 100.375, "epoch": 0.23933333333333334, "grad_norm": 1.9732271377418562, "kl": 0.06591796875, "learning_rate": 8.803333333333333e-07, "loss": 0.0026, "reward": 1.8333333730697632, "reward_std": 0.09858439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 359 }, { "completion_length": 107.59375, "epoch": 0.24, "grad_norm": 3.0928122180250535, "kl": 0.0478515625, "learning_rate": 8.799999999999999e-07, "loss": 0.0019, "reward": 1.553125023841858, "reward_std": 0.22596687078475952, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5531250238418579, "step": 360 }, { "completion_length": 94.84375, "epoch": 0.24066666666666667, "grad_norm": 3.454582294980594, "kl": 0.0888671875, "learning_rate": 8.796666666666666e-07, "loss": 0.0036, "reward": 1.8781249523162842, "reward_std": 0.013466879725456238, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.878125011920929, "step": 361 }, { "completion_length": 113.90625, "epoch": 0.24133333333333334, "grad_norm": 2.365870912654935, "kl": 0.038330078125, "learning_rate": 8.793333333333333e-07, "loss": 0.0015, "reward": 1.9005208015441895, "reward_std": 0.09194426983594894, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9005208015441895, "step": 362 }, { "completion_length": 97.0, "epoch": 0.242, "grad_norm": 1.7848517539529176, "kl": 0.07177734375, "learning_rate": 8.79e-07, "loss": 0.0029, "reward": 1.9140625, "reward_std": 0.0885416716337204, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9453124403953552, "step": 363 }, { "completion_length": 83.40625, "epoch": 0.24266666666666667, "grad_norm": 12.44874734203815, "kl": 0.08544921875, "learning_rate": 8.786666666666666e-07, "loss": 0.0034, "reward": 1.9427083730697632, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083134651184, "step": 364 }, { "completion_length": 112.75, "epoch": 0.24333333333333335, "grad_norm": 3.061536767841306, "kl": 0.050048828125, "learning_rate": 8.783333333333332e-07, "loss": 0.002, "reward": 1.6923611164093018, "reward_std": 0.2643388509750366, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7236111164093018, "step": 365 }, { "completion_length": 102.375, "epoch": 0.244, "grad_norm": 2.2017497280308387, "kl": 0.055908203125, "learning_rate": 8.78e-07, "loss": 0.0022, "reward": 1.59375, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.625, "step": 366 }, { "completion_length": 99.78125, "epoch": 0.24466666666666667, "grad_norm": 2.983909145983555, "kl": 0.08154296875, "learning_rate": 8.776666666666667e-07, "loss": 0.0033, "reward": 1.807031273841858, "reward_std": 0.16980399191379547, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8070312738418579, "step": 367 }, { "completion_length": 90.4375, "epoch": 0.24533333333333332, "grad_norm": 3.5203961172490645, "kl": 0.061767578125, "learning_rate": 8.773333333333332e-07, "loss": 0.0025, "reward": 1.8671875, "reward_std": 0.1398771107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8671875, "step": 368 }, { "completion_length": 86.96875, "epoch": 0.246, "grad_norm": 3.1114550359945716, "kl": 0.0732421875, "learning_rate": 8.769999999999999e-07, "loss": 0.0029, "reward": 1.8046875, "reward_std": 0.13019686937332153, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8046875, "step": 369 }, { "completion_length": 112.9375, "epoch": 0.24666666666666667, "grad_norm": 7.219673586843124, "kl": 0.0849609375, "learning_rate": 8.766666666666667e-07, "loss": 0.0034, "reward": 1.5989583730697632, "reward_std": 0.2578815817832947, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5989583730697632, "step": 370 }, { "completion_length": 105.5, "epoch": 0.24733333333333332, "grad_norm": 2.91866359044739, "kl": 0.06494140625, "learning_rate": 8.763333333333333e-07, "loss": 0.0026, "reward": 1.734375, "reward_std": 0.21725423634052277, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 371 }, { "completion_length": 96.34375, "epoch": 0.248, "grad_norm": 2.5046101642005265, "kl": 0.06103515625, "learning_rate": 8.76e-07, "loss": 0.0024, "reward": 1.870833396911621, "reward_std": 0.05557724088430405, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8708333373069763, "step": 372 }, { "completion_length": 126.34375, "epoch": 0.24866666666666667, "grad_norm": 1.8451058200669619, "kl": 0.042236328125, "learning_rate": 8.756666666666666e-07, "loss": 0.0017, "reward": 1.681249976158142, "reward_std": 0.4393952488899231, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.8062499761581421, "step": 373 }, { "completion_length": 94.28125, "epoch": 0.24933333333333332, "grad_norm": 3.360632057974314, "kl": 0.0810546875, "learning_rate": 8.753333333333332e-07, "loss": 0.0032, "reward": 1.8536458015441895, "reward_std": 0.09083178639411926, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8536458015441895, "step": 374 }, { "completion_length": 91.3125, "epoch": 0.25, "grad_norm": 9.280564638287403, "kl": 0.0908203125, "learning_rate": 8.75e-07, "loss": 0.0036, "reward": 1.78125, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 375 }, { "completion_length": 94.375, "epoch": 0.25066666666666665, "grad_norm": 5.786801011371701, "kl": 0.055419921875, "learning_rate": 8.746666666666667e-07, "loss": 0.0022, "reward": 1.4796874523162842, "reward_std": 0.11515313386917114, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.47968751192092896, "step": 376 }, { "completion_length": 98.90625, "epoch": 0.25133333333333335, "grad_norm": 25.57792742432225, "kl": 0.068359375, "learning_rate": 8.743333333333332e-07, "loss": 0.0027, "reward": 1.7395833730697632, "reward_std": 0.2725604772567749, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 377 }, { "completion_length": 86.71875, "epoch": 0.252, "grad_norm": 2.7098699404258983, "kl": 0.08154296875, "learning_rate": 8.739999999999999e-07, "loss": 0.0033, "reward": 1.765625, "reward_std": 0.11921681463718414, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.765625, "step": 378 }, { "completion_length": 91.71875, "epoch": 0.25266666666666665, "grad_norm": 2.1473500462718076, "kl": 0.06005859375, "learning_rate": 8.736666666666667e-07, "loss": 0.0024, "reward": 1.7838542461395264, "reward_std": 0.1510416716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7838542461395264, "step": 379 }, { "completion_length": 98.75, "epoch": 0.25333333333333335, "grad_norm": 1.8731447683519789, "kl": 0.05224609375, "learning_rate": 8.733333333333333e-07, "loss": 0.0021, "reward": 1.8162202835083008, "reward_std": 0.03873991593718529, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8162202835083008, "step": 380 }, { "completion_length": 96.875, "epoch": 0.254, "grad_norm": 5.6856163280349215, "kl": 0.04736328125, "learning_rate": 8.729999999999999e-07, "loss": 0.0019, "reward": 1.769270896911621, "reward_std": 0.24842853844165802, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7692708969116211, "step": 381 }, { "completion_length": 83.0, "epoch": 0.25466666666666665, "grad_norm": 1.898686415653219, "kl": 0.059814453125, "learning_rate": 8.726666666666666e-07, "loss": 0.0024, "reward": 1.9270833730697632, "reward_std": 0.07701761275529861, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9583333134651184, "step": 382 }, { "completion_length": 101.71875, "epoch": 0.25533333333333336, "grad_norm": 4.902538021628008, "kl": 0.09033203125, "learning_rate": 8.723333333333333e-07, "loss": 0.0036, "reward": 1.633333444595337, "reward_std": 0.10710735619068146, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6333333253860474, "step": 383 }, { "completion_length": 89.0625, "epoch": 0.256, "grad_norm": 1.6006851537621938, "kl": 0.054443359375, "learning_rate": 8.72e-07, "loss": 0.0022, "reward": 1.765625, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.765625, "step": 384 }, { "completion_length": 102.5625, "epoch": 0.25666666666666665, "grad_norm": 3.3124574515010585, "kl": 0.0791015625, "learning_rate": 8.716666666666667e-07, "loss": 0.0032, "reward": 1.7626116275787354, "reward_std": 0.1054522693157196, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7626116275787354, "step": 385 }, { "completion_length": 88.40625, "epoch": 0.25733333333333336, "grad_norm": 1.891718669918606, "kl": 0.1142578125, "learning_rate": 8.713333333333332e-07, "loss": 0.0046, "reward": 1.9296875, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 386 }, { "completion_length": 97.90625, "epoch": 0.258, "grad_norm": 3.5728013360296904, "kl": 0.0751953125, "learning_rate": 8.71e-07, "loss": 0.003, "reward": 1.6072916984558105, "reward_std": 0.28423935174942017, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6072916984558105, "step": 387 }, { "completion_length": 94.0, "epoch": 0.25866666666666666, "grad_norm": 5.337115981391621, "kl": 0.0830078125, "learning_rate": 8.706666666666667e-07, "loss": 0.0033, "reward": 1.698586344718933, "reward_std": 0.17683136463165283, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7610863447189331, "step": 388 }, { "completion_length": 91.75, "epoch": 0.25933333333333336, "grad_norm": 31.78833476772039, "kl": 0.07958984375, "learning_rate": 8.703333333333333e-07, "loss": 0.0032, "reward": 1.8610118627548218, "reward_std": 0.030103161931037903, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8610118627548218, "step": 389 }, { "completion_length": 98.625, "epoch": 0.26, "grad_norm": 3.8332781754840792, "kl": 0.0654296875, "learning_rate": 8.699999999999999e-07, "loss": 0.0026, "reward": 1.828125, "reward_std": 0.20554219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.828125, "step": 390 }, { "completion_length": 93.03125, "epoch": 0.26066666666666666, "grad_norm": 3.6969301218679878, "kl": 0.10595703125, "learning_rate": 8.696666666666667e-07, "loss": 0.0042, "reward": 1.796875, "reward_std": 0.35341876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.828125, "step": 391 }, { "completion_length": 92.28125, "epoch": 0.2613333333333333, "grad_norm": 6.046780819346233, "kl": 0.056396484375, "learning_rate": 8.693333333333333e-07, "loss": 0.0023, "reward": 1.764657735824585, "reward_std": 0.14479459822177887, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7646577954292297, "step": 392 }, { "completion_length": 88.625, "epoch": 0.262, "grad_norm": 3.704674610604767, "kl": 0.07080078125, "learning_rate": 8.69e-07, "loss": 0.0028, "reward": 1.84375, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 393 }, { "completion_length": 96.5625, "epoch": 0.26266666666666666, "grad_norm": 3.6951564346031796, "kl": 0.087890625, "learning_rate": 8.686666666666666e-07, "loss": 0.0035, "reward": 1.625520944595337, "reward_std": 0.17206206917762756, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6255208253860474, "step": 394 }, { "completion_length": 90.375, "epoch": 0.2633333333333333, "grad_norm": 3.5290826643020274, "kl": 0.07861328125, "learning_rate": 8.683333333333332e-07, "loss": 0.0031, "reward": 1.7708333730697632, "reward_std": 0.1326255202293396, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333134651184, "step": 395 }, { "completion_length": 91.96875, "epoch": 0.264, "grad_norm": 4.943110700907235, "kl": 0.1103515625, "learning_rate": 8.68e-07, "loss": 0.0044, "reward": 1.8036458492279053, "reward_std": 0.03714568912982941, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8036458492279053, "step": 396 }, { "completion_length": 102.125, "epoch": 0.26466666666666666, "grad_norm": 4.875740212514898, "kl": 0.053466796875, "learning_rate": 8.676666666666667e-07, "loss": 0.0021, "reward": 1.8580729961395264, "reward_std": 0.15686875581741333, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8893229365348816, "step": 397 }, { "completion_length": 79.3125, "epoch": 0.2653333333333333, "grad_norm": 11.723602178027432, "kl": 0.06640625, "learning_rate": 8.673333333333332e-07, "loss": 0.0027, "reward": 1.8406250476837158, "reward_std": 0.1068776547908783, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.840624988079071, "step": 398 }, { "completion_length": 85.4375, "epoch": 0.266, "grad_norm": 2.1888808288591637, "kl": 0.095703125, "learning_rate": 8.669999999999999e-07, "loss": 0.0038, "reward": 1.65625, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.65625, "step": 399 }, { "completion_length": 87.5, "epoch": 0.26666666666666666, "grad_norm": 4.931205854329692, "kl": 0.087890625, "learning_rate": 8.666666666666667e-07, "loss": 0.0035, "reward": 1.777083396911621, "reward_std": 0.11108437925577164, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7770832777023315, "step": 400 }, { "completion_length": 94.59375, "epoch": 0.2673333333333333, "grad_norm": 4.427262512249429, "kl": 0.0654296875, "learning_rate": 8.663333333333333e-07, "loss": 0.0026, "reward": 1.7296874523162842, "reward_std": 0.24639824032783508, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.729687511920929, "step": 401 }, { "completion_length": 90.71875, "epoch": 0.268, "grad_norm": 4.39845105638797, "kl": 0.078125, "learning_rate": 8.659999999999999e-07, "loss": 0.0031, "reward": 1.7786458730697632, "reward_std": 0.19482415914535522, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7786458730697632, "step": 402 }, { "completion_length": 87.53125, "epoch": 0.26866666666666666, "grad_norm": 2.363335901004909, "kl": 0.08984375, "learning_rate": 8.656666666666666e-07, "loss": 0.0036, "reward": 1.870833396911621, "reward_std": 0.016925079748034477, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8708333373069763, "step": 403 }, { "completion_length": 93.09375, "epoch": 0.2693333333333333, "grad_norm": 2.910328586307391, "kl": 0.04931640625, "learning_rate": 8.653333333333333e-07, "loss": 0.002, "reward": 1.671875, "reward_std": 0.15625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.671875, "step": 404 }, { "completion_length": 94.4375, "epoch": 0.27, "grad_norm": 4.860430147899286, "kl": 0.05615234375, "learning_rate": 8.65e-07, "loss": 0.0022, "reward": 1.640625, "reward_std": 0.20022058486938477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.671875, "step": 405 }, { "completion_length": 97.9375, "epoch": 0.27066666666666667, "grad_norm": 1.645771140659919, "kl": 0.040771484375, "learning_rate": 8.646666666666667e-07, "loss": 0.0016, "reward": 1.875, "reward_std": 0.08237498998641968, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 406 }, { "completion_length": 99.375, "epoch": 0.2713333333333333, "grad_norm": 4.686585666953656, "kl": 0.041015625, "learning_rate": 8.643333333333332e-07, "loss": 0.0016, "reward": 1.7916667461395264, "reward_std": 0.2690594494342804, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8229166865348816, "step": 407 }, { "completion_length": 89.5625, "epoch": 0.272, "grad_norm": 1.7518852862226164, "kl": 0.0654296875, "learning_rate": 8.639999999999999e-07, "loss": 0.0026, "reward": 1.9114583730697632, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9114583134651184, "step": 408 }, { "completion_length": 90.8125, "epoch": 0.27266666666666667, "grad_norm": 2.419027154685206, "kl": 0.0927734375, "learning_rate": 8.636666666666667e-07, "loss": 0.0037, "reward": 1.765625, "reward_std": 0.0729166641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7656250596046448, "step": 409 }, { "completion_length": 113.5625, "epoch": 0.2733333333333333, "grad_norm": 3.6966786074140354, "kl": 0.045654296875, "learning_rate": 8.633333333333333e-07, "loss": 0.0018, "reward": 1.7083333730697632, "reward_std": 0.20653314888477325, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7395833134651184, "step": 410 }, { "completion_length": 94.0625, "epoch": 0.274, "grad_norm": 2.8384709163404813, "kl": 0.09375, "learning_rate": 8.629999999999999e-07, "loss": 0.0037, "reward": 1.8458333015441895, "reward_std": 0.10000000149011612, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8458333611488342, "step": 411 }, { "completion_length": 94.125, "epoch": 0.27466666666666667, "grad_norm": 3.1372553439847395, "kl": 0.0703125, "learning_rate": 8.626666666666666e-07, "loss": 0.0028, "reward": 1.78125, "reward_std": 0.15701940655708313, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 412 }, { "completion_length": 103.5, "epoch": 0.2753333333333333, "grad_norm": 2.7453285871955866, "kl": 0.07421875, "learning_rate": 8.623333333333333e-07, "loss": 0.003, "reward": 1.7578125, "reward_std": 0.11801779270172119, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7578125596046448, "step": 413 }, { "completion_length": 80.59375, "epoch": 0.276, "grad_norm": 10.546693877610831, "kl": 0.087890625, "learning_rate": 8.62e-07, "loss": 0.0035, "reward": 1.9239583015441895, "reward_std": 0.036562152206897736, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9239583611488342, "step": 414 }, { "completion_length": 99.125, "epoch": 0.27666666666666667, "grad_norm": 11.85945470970928, "kl": 0.08837890625, "learning_rate": 8.616666666666666e-07, "loss": 0.0035, "reward": 1.8357515335083008, "reward_std": 0.1393168717622757, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.835751473903656, "step": 415 }, { "completion_length": 93.09375, "epoch": 0.2773333333333333, "grad_norm": 4.052641162783467, "kl": 0.056396484375, "learning_rate": 8.613333333333332e-07, "loss": 0.0023, "reward": 1.8032739162445068, "reward_std": 0.23154760897159576, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8032737970352173, "step": 416 }, { "completion_length": 111.4375, "epoch": 0.278, "grad_norm": 2.1997006138611797, "kl": 0.0198974609375, "learning_rate": 8.61e-07, "loss": 0.0008, "reward": 1.6875, "reward_std": 0.19716878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 417 }, { "completion_length": 103.53125, "epoch": 0.2786666666666667, "grad_norm": 9.631063838566106, "kl": 0.072265625, "learning_rate": 8.606666666666667e-07, "loss": 0.0029, "reward": 1.8696428537368774, "reward_std": 0.06167963519692421, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8696428537368774, "step": 418 }, { "completion_length": 93.75, "epoch": 0.2793333333333333, "grad_norm": 2.05329354798476, "kl": 0.09716796875, "learning_rate": 8.603333333333332e-07, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 419 }, { "completion_length": 101.4375, "epoch": 0.28, "grad_norm": 2.384249181425087, "kl": 0.087890625, "learning_rate": 8.599999999999999e-07, "loss": 0.0035, "reward": 1.7265625, "reward_std": 0.2783317565917969, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7265625, "step": 420 }, { "completion_length": 102.5, "epoch": 0.2806666666666667, "grad_norm": 2.530623164395373, "kl": 0.032958984375, "learning_rate": 8.596666666666667e-07, "loss": 0.0013, "reward": 1.527083396911621, "reward_std": 0.10910648852586746, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5270833373069763, "step": 421 }, { "completion_length": 104.03125, "epoch": 0.2813333333333333, "grad_norm": 6.820148712765909, "kl": 0.083984375, "learning_rate": 8.593333333333333e-07, "loss": 0.0034, "reward": 1.7619792222976685, "reward_std": 0.20829832553863525, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7932291626930237, "step": 422 }, { "completion_length": 87.40625, "epoch": 0.282, "grad_norm": 0.18020193853730837, "kl": 0.0703125, "learning_rate": 8.59e-07, "loss": 0.0028, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 423 }, { "completion_length": 106.25, "epoch": 0.2826666666666667, "grad_norm": 3.904219970190863, "kl": 0.049560546875, "learning_rate": 8.586666666666666e-07, "loss": 0.002, "reward": 1.7052083015441895, "reward_std": 0.1646508425474167, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7364583015441895, "step": 424 }, { "completion_length": 109.75, "epoch": 0.2833333333333333, "grad_norm": 2.352532970095605, "kl": 0.048828125, "learning_rate": 8.583333333333332e-07, "loss": 0.002, "reward": 1.8489583730697632, "reward_std": 0.1145833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583730697632, "step": 425 }, { "completion_length": 118.96875, "epoch": 0.284, "grad_norm": 3.7593659642672095, "kl": 0.05322265625, "learning_rate": 8.58e-07, "loss": 0.0021, "reward": 1.7062499523162842, "reward_std": 0.17254751920700073, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.737500011920929, "step": 426 }, { "completion_length": 98.03125, "epoch": 0.2846666666666667, "grad_norm": 1.494538456719172, "kl": 0.07373046875, "learning_rate": 8.576666666666667e-07, "loss": 0.003, "reward": 1.6927083730697632, "reward_std": 0.03125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7239583134651184, "step": 427 }, { "completion_length": 110.46875, "epoch": 0.2853333333333333, "grad_norm": 3.9755061615573086, "kl": 0.057861328125, "learning_rate": 8.573333333333332e-07, "loss": 0.0023, "reward": 1.8541667461395264, "reward_std": 0.02405625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541667461395264, "step": 428 }, { "completion_length": 106.1875, "epoch": 0.286, "grad_norm": 4.045425153589551, "kl": 0.03564453125, "learning_rate": 8.569999999999999e-07, "loss": 0.0014, "reward": 1.75, "reward_std": 0.29697883129119873, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 429 }, { "completion_length": 92.1875, "epoch": 0.2866666666666667, "grad_norm": 39.47438372316267, "kl": 0.07470703125, "learning_rate": 8.566666666666667e-07, "loss": 0.003, "reward": 1.7416666746139526, "reward_std": 0.1731092631816864, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7416666746139526, "step": 430 }, { "completion_length": 100.375, "epoch": 0.28733333333333333, "grad_norm": 1.878306293717268, "kl": 0.10009765625, "learning_rate": 8.563333333333333e-07, "loss": 0.004, "reward": 1.7864583730697632, "reward_std": 0.09375, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8177083730697632, "step": 431 }, { "completion_length": 84.40625, "epoch": 0.288, "grad_norm": 0.14033486439685722, "kl": 0.04833984375, "learning_rate": 8.559999999999999e-07, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 432 }, { "completion_length": 89.71875, "epoch": 0.2886666666666667, "grad_norm": 4.183997679675941, "kl": 0.05859375, "learning_rate": 8.556666666666666e-07, "loss": 0.0023, "reward": 1.7708333730697632, "reward_std": 0.0416666716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333134651184, "step": 433 }, { "completion_length": 86.5, "epoch": 0.28933333333333333, "grad_norm": 3.676105773513266, "kl": 0.0908203125, "learning_rate": 8.553333333333333e-07, "loss": 0.0036, "reward": 1.7552083730697632, "reward_std": 0.13950318098068237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7552083730697632, "step": 434 }, { "completion_length": 99.03125, "epoch": 0.29, "grad_norm": 1.4956617100510887, "kl": 0.099609375, "learning_rate": 8.55e-07, "loss": 0.004, "reward": 1.928125023841858, "reward_std": 0.006250003818422556, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9281249642372131, "step": 435 }, { "completion_length": 101.0, "epoch": 0.2906666666666667, "grad_norm": 12.188134830730075, "kl": 0.06298828125, "learning_rate": 8.546666666666666e-07, "loss": 0.0025, "reward": 1.6921875476837158, "reward_std": 0.22789166867733002, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6921875476837158, "step": 436 }, { "completion_length": 109.375, "epoch": 0.29133333333333333, "grad_norm": 1.8241333741727856, "kl": 0.0654296875, "learning_rate": 8.543333333333332e-07, "loss": 0.0026, "reward": 1.75, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 437 }, { "completion_length": 109.125, "epoch": 0.292, "grad_norm": 2.1419761831479835, "kl": 0.09130859375, "learning_rate": 8.539999999999999e-07, "loss": 0.0037, "reward": 1.7994792461395264, "reward_std": 0.04822869971394539, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7994791269302368, "step": 438 }, { "completion_length": 98.65625, "epoch": 0.2926666666666667, "grad_norm": 2.9167986154709937, "kl": 0.0966796875, "learning_rate": 8.536666666666667e-07, "loss": 0.0039, "reward": 1.798437476158142, "reward_std": 0.14784188568592072, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8296874761581421, "step": 439 }, { "completion_length": 98.375, "epoch": 0.29333333333333333, "grad_norm": 3.206621924614535, "kl": 0.060546875, "learning_rate": 8.533333333333334e-07, "loss": 0.0024, "reward": 1.8645833730697632, "reward_std": 0.1666666716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833730697632, "step": 440 }, { "completion_length": 99.4375, "epoch": 0.294, "grad_norm": 13.676268077400573, "kl": 0.0673828125, "learning_rate": 8.529999999999999e-07, "loss": 0.0027, "reward": 1.9296875, "reward_std": 0.046875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 441 }, { "completion_length": 93.8125, "epoch": 0.2946666666666667, "grad_norm": 4.394398962752481, "kl": 0.042724609375, "learning_rate": 8.526666666666666e-07, "loss": 0.0017, "reward": 1.787500023841858, "reward_std": 0.06971687823534012, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7875000238418579, "step": 442 }, { "completion_length": 93.40625, "epoch": 0.29533333333333334, "grad_norm": 1.2856086251880054, "kl": 0.072265625, "learning_rate": 8.523333333333334e-07, "loss": 0.0029, "reward": 1.8020833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 443 }, { "completion_length": 98.21875, "epoch": 0.296, "grad_norm": 4.944119984278421, "kl": 0.07421875, "learning_rate": 8.52e-07, "loss": 0.003, "reward": 1.8416666984558105, "reward_std": 0.14716877043247223, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8416666984558105, "step": 444 }, { "completion_length": 112.34375, "epoch": 0.2966666666666667, "grad_norm": 2.29946390624271, "kl": 0.052734375, "learning_rate": 8.516666666666666e-07, "loss": 0.0021, "reward": 1.751339316368103, "reward_std": 0.08230968564748764, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7513392567634583, "step": 445 }, { "completion_length": 93.59375, "epoch": 0.29733333333333334, "grad_norm": 14.133061867119931, "kl": 0.076171875, "learning_rate": 8.513333333333333e-07, "loss": 0.003, "reward": 1.7645833492279053, "reward_std": 0.05416665971279144, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7645833492279053, "step": 446 }, { "completion_length": 98.3125, "epoch": 0.298, "grad_norm": 5.154866986047095, "kl": 0.0634765625, "learning_rate": 8.51e-07, "loss": 0.0025, "reward": 1.7005208730697632, "reward_std": 0.17697662115097046, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7005208730697632, "step": 447 }, { "completion_length": 110.4375, "epoch": 0.2986666666666667, "grad_norm": 15.105081326427822, "kl": 0.06298828125, "learning_rate": 8.506666666666667e-07, "loss": 0.0025, "reward": 1.774999976158142, "reward_std": 0.03393542394042015, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7750000953674316, "step": 448 }, { "completion_length": 88.75, "epoch": 0.29933333333333334, "grad_norm": 1.269277111356288, "kl": 0.061767578125, "learning_rate": 8.503333333333333e-07, "loss": 0.0025, "reward": 1.9427083730697632, "reward_std": 0.014583338052034378, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083134651184, "step": 449 }, { "completion_length": 104.46875, "epoch": 0.3, "grad_norm": 3.8779143307002277, "kl": 0.1083984375, "learning_rate": 8.499999999999999e-07, "loss": 0.0043, "reward": 1.9505208730697632, "reward_std": 0.0364583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9505208134651184, "step": 450 }, { "completion_length": 94.4375, "epoch": 0.3006666666666667, "grad_norm": 1.5557829787080268, "kl": 0.07421875, "learning_rate": 8.496666666666667e-07, "loss": 0.003, "reward": 1.953125, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.953125, "step": 451 }, { "completion_length": 105.25, "epoch": 0.30133333333333334, "grad_norm": 3.0398084928538913, "kl": 0.06005859375, "learning_rate": 8.493333333333334e-07, "loss": 0.0024, "reward": 1.511458396911621, "reward_std": 0.1180456355214119, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5114583373069763, "step": 452 }, { "completion_length": 112.625, "epoch": 0.302, "grad_norm": 1.7602528692585786, "kl": 0.05126953125, "learning_rate": 8.489999999999999e-07, "loss": 0.0021, "reward": 1.640625, "reward_std": 0.19581207633018494, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.671875, "step": 453 }, { "completion_length": 82.125, "epoch": 0.30266666666666664, "grad_norm": 5.785773764481809, "kl": 0.0751953125, "learning_rate": 8.486666666666666e-07, "loss": 0.003, "reward": 1.6927083730697632, "reward_std": 0.1145833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6927083730697632, "step": 454 }, { "completion_length": 109.09375, "epoch": 0.30333333333333334, "grad_norm": 4.61070889824777, "kl": 0.05224609375, "learning_rate": 8.483333333333333e-07, "loss": 0.0021, "reward": 1.7752976417541504, "reward_std": 0.0383913479745388, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7752976417541504, "step": 455 }, { "completion_length": 98.125, "epoch": 0.304, "grad_norm": 1.1386837406361192, "kl": 0.043701171875, "learning_rate": 8.48e-07, "loss": 0.0017, "reward": 1.7921874523162842, "reward_std": 0.023593232035636902, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.823437511920929, "step": 456 }, { "completion_length": 94.875, "epoch": 0.30466666666666664, "grad_norm": 2.942983720106858, "kl": 0.05517578125, "learning_rate": 8.476666666666666e-07, "loss": 0.0022, "reward": 1.7654762268066406, "reward_std": 0.1618575155735016, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7654761672019958, "step": 457 }, { "completion_length": 115.34375, "epoch": 0.30533333333333335, "grad_norm": 3.029762032482373, "kl": 0.0439453125, "learning_rate": 8.473333333333333e-07, "loss": 0.0018, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 458 }, { "completion_length": 104.53125, "epoch": 0.306, "grad_norm": 2.3356248112057254, "kl": 0.07080078125, "learning_rate": 8.469999999999999e-07, "loss": 0.0028, "reward": 1.938020944595337, "reward_std": 0.024672335013747215, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9380208253860474, "step": 459 }, { "completion_length": 110.625, "epoch": 0.30666666666666664, "grad_norm": 1.7165777628539867, "kl": 0.046142578125, "learning_rate": 8.466666666666667e-07, "loss": 0.0018, "reward": 1.84375, "reward_std": 0.20683756470680237, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 460 }, { "completion_length": 86.65625, "epoch": 0.30733333333333335, "grad_norm": 3.523047956857513, "kl": 0.08349609375, "learning_rate": 8.463333333333334e-07, "loss": 0.0033, "reward": 1.8135416507720947, "reward_std": 0.04233439639210701, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8135416507720947, "step": 461 }, { "completion_length": 103.4375, "epoch": 0.308, "grad_norm": 3.200138339625778, "kl": 0.020263671875, "learning_rate": 8.459999999999999e-07, "loss": 0.0008, "reward": 1.8093750476837158, "reward_std": 0.09685234725475311, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.809374988079071, "step": 462 }, { "completion_length": 109.21875, "epoch": 0.30866666666666664, "grad_norm": 3.663562174746685, "kl": 0.041015625, "learning_rate": 8.456666666666666e-07, "loss": 0.0016, "reward": 1.37693452835083, "reward_std": 0.22906933724880219, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.3769345283508301, "step": 463 }, { "completion_length": 100.6875, "epoch": 0.30933333333333335, "grad_norm": 7.121323988968079, "kl": 0.087890625, "learning_rate": 8.453333333333334e-07, "loss": 0.0035, "reward": 1.8078124523162842, "reward_std": 0.021474510431289673, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8078124523162842, "step": 464 }, { "completion_length": 117.65625, "epoch": 0.31, "grad_norm": 6.429557795215013, "kl": 0.0458984375, "learning_rate": 8.45e-07, "loss": 0.0018, "reward": 1.6276042461395264, "reward_std": 0.10104319453239441, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6276041269302368, "step": 465 }, { "completion_length": 114.25, "epoch": 0.31066666666666665, "grad_norm": 12.605847345037791, "kl": 0.053955078125, "learning_rate": 8.446666666666666e-07, "loss": 0.0022, "reward": 1.875, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 466 }, { "completion_length": 109.15625, "epoch": 0.31133333333333335, "grad_norm": 7.126201541933563, "kl": 0.08642578125, "learning_rate": 8.443333333333333e-07, "loss": 0.0035, "reward": 1.6614583730697632, "reward_std": 0.2482127845287323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6614583730697632, "step": 467 }, { "completion_length": 103.96875, "epoch": 0.312, "grad_norm": 16.32877634418201, "kl": 0.05126953125, "learning_rate": 8.439999999999999e-07, "loss": 0.0021, "reward": 1.8520833253860474, "reward_std": 0.11155000329017639, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8520833253860474, "step": 468 }, { "completion_length": 106.46875, "epoch": 0.31266666666666665, "grad_norm": 1.7861241841426807, "kl": 0.09814453125, "learning_rate": 8.436666666666667e-07, "loss": 0.0039, "reward": 1.78125, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 469 }, { "completion_length": 119.34375, "epoch": 0.31333333333333335, "grad_norm": 2.8578934947356864, "kl": 0.064453125, "learning_rate": 8.433333333333333e-07, "loss": 0.0026, "reward": 1.8255208730697632, "reward_std": 0.2524191737174988, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8567708730697632, "step": 470 }, { "completion_length": 100.53125, "epoch": 0.314, "grad_norm": 2.6433587790176722, "kl": 0.057373046875, "learning_rate": 8.429999999999999e-07, "loss": 0.0023, "reward": 1.8229167461395264, "reward_std": 0.0502961091697216, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166269302368, "step": 471 }, { "completion_length": 119.375, "epoch": 0.31466666666666665, "grad_norm": 5.412283341870455, "kl": 0.0576171875, "learning_rate": 8.426666666666666e-07, "loss": 0.0023, "reward": 1.6592261791229248, "reward_std": 0.12162567675113678, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6592261791229248, "step": 472 }, { "completion_length": 99.4375, "epoch": 0.31533333333333335, "grad_norm": 3.9748381179596715, "kl": 0.0830078125, "learning_rate": 8.423333333333334e-07, "loss": 0.0033, "reward": 1.696874976158142, "reward_std": 0.11049661040306091, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7281249761581421, "step": 473 }, { "completion_length": 109.40625, "epoch": 0.316, "grad_norm": 4.354085484944017, "kl": 0.06494140625, "learning_rate": 8.419999999999999e-07, "loss": 0.0026, "reward": 1.777083396911621, "reward_std": 0.07916667312383652, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7770833373069763, "step": 474 }, { "completion_length": 106.625, "epoch": 0.31666666666666665, "grad_norm": 3.0266877527409504, "kl": 0.0625, "learning_rate": 8.416666666666666e-07, "loss": 0.0025, "reward": 1.6640625, "reward_std": 0.24404378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6640625, "step": 475 }, { "completion_length": 106.65625, "epoch": 0.31733333333333336, "grad_norm": 3.64175496045427, "kl": 0.06396484375, "learning_rate": 8.413333333333333e-07, "loss": 0.0026, "reward": 1.8177083730697632, "reward_std": 0.16591878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083730697632, "step": 476 }, { "completion_length": 112.75, "epoch": 0.318, "grad_norm": 4.129646983169572, "kl": 0.032470703125, "learning_rate": 8.41e-07, "loss": 0.0013, "reward": 1.734375, "reward_std": 0.17204804718494415, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.765625, "step": 477 }, { "completion_length": 103.40625, "epoch": 0.31866666666666665, "grad_norm": 2.9309433207832636, "kl": 0.061767578125, "learning_rate": 8.406666666666667e-07, "loss": 0.0025, "reward": 1.9427083730697632, "reward_std": 0.0475049763917923, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083134651184, "step": 478 }, { "completion_length": 100.5, "epoch": 0.31933333333333336, "grad_norm": 2.8797991539238526, "kl": 0.0625, "learning_rate": 8.403333333333333e-07, "loss": 0.0025, "reward": 1.6953125, "reward_std": 0.19780339300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7265625, "step": 479 }, { "completion_length": 107.5, "epoch": 0.32, "grad_norm": 7.565174979042409, "kl": 0.06787109375, "learning_rate": 8.399999999999999e-07, "loss": 0.0027, "reward": 1.8484002351760864, "reward_std": 0.036122821271419525, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.848400354385376, "step": 480 }, { "completion_length": 100.03125, "epoch": 0.32066666666666666, "grad_norm": 5.206374823347147, "kl": 0.0634765625, "learning_rate": 8.396666666666667e-07, "loss": 0.0025, "reward": 1.7625000476837158, "reward_std": 0.08679220080375671, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.762499988079071, "step": 481 }, { "completion_length": 112.0, "epoch": 0.32133333333333336, "grad_norm": 14.009410339836203, "kl": 0.07080078125, "learning_rate": 8.393333333333334e-07, "loss": 0.0028, "reward": 1.637686014175415, "reward_std": 0.08652792125940323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.637686014175415, "step": 482 }, { "completion_length": 104.90625, "epoch": 0.322, "grad_norm": 4.658225681772625, "kl": 0.072265625, "learning_rate": 8.389999999999999e-07, "loss": 0.0029, "reward": 1.7838542461395264, "reward_std": 0.14241796731948853, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7838541865348816, "step": 483 }, { "completion_length": 105.625, "epoch": 0.32266666666666666, "grad_norm": 1.3687968509248316, "kl": 0.0625, "learning_rate": 8.386666666666666e-07, "loss": 0.0025, "reward": 1.9348958730697632, "reward_std": 0.021474512293934822, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9348958134651184, "step": 484 }, { "completion_length": 120.8125, "epoch": 0.3233333333333333, "grad_norm": 2.5864019972109156, "kl": 0.083984375, "learning_rate": 8.383333333333334e-07, "loss": 0.0034, "reward": 1.53125, "reward_std": 0.38466876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5625, "step": 485 }, { "completion_length": 105.6875, "epoch": 0.324, "grad_norm": 4.089178901427125, "kl": 0.0556640625, "learning_rate": 8.38e-07, "loss": 0.0022, "reward": 1.5906250476837158, "reward_std": 0.4102563261985779, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.621874988079071, "step": 486 }, { "completion_length": 126.375, "epoch": 0.32466666666666666, "grad_norm": 2.246086436960358, "kl": 0.07275390625, "learning_rate": 8.376666666666666e-07, "loss": 0.0029, "reward": 1.6979167461395264, "reward_std": 0.337587833404541, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7916666269302368, "step": 487 }, { "completion_length": 114.15625, "epoch": 0.3253333333333333, "grad_norm": 5.043802099094882, "kl": 0.064453125, "learning_rate": 8.373333333333333e-07, "loss": 0.0026, "reward": 1.7453124523162842, "reward_std": 0.16908210515975952, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.745312511920929, "step": 488 }, { "completion_length": 96.8125, "epoch": 0.326, "grad_norm": 3.5283565468951923, "kl": 0.06689453125, "learning_rate": 8.369999999999999e-07, "loss": 0.0027, "reward": 1.671875, "reward_std": 0.1872895359992981, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.671875, "step": 489 }, { "completion_length": 130.1875, "epoch": 0.32666666666666666, "grad_norm": 2.4677356874082768, "kl": 0.0419921875, "learning_rate": 8.366666666666667e-07, "loss": 0.0017, "reward": 1.6854910850524902, "reward_std": 0.1653963029384613, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7479910850524902, "step": 490 }, { "completion_length": 107.28125, "epoch": 0.3273333333333333, "grad_norm": 1.9828282272607893, "kl": 0.05615234375, "learning_rate": 8.363333333333333e-07, "loss": 0.0022, "reward": 1.732812523841858, "reward_std": 0.17744678258895874, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7640625238418579, "step": 491 }, { "completion_length": 111.75, "epoch": 0.328, "grad_norm": 4.570533912283949, "kl": 0.0703125, "learning_rate": 8.359999999999999e-07, "loss": 0.0028, "reward": 1.4998512268066406, "reward_std": 0.2889426648616791, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5311012268066406, "step": 492 }, { "completion_length": 95.5, "epoch": 0.32866666666666666, "grad_norm": 4.3340221827421885, "kl": 0.03564453125, "learning_rate": 8.356666666666666e-07, "loss": 0.0014, "reward": 1.7265625, "reward_std": 0.18154378235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7578125, "step": 493 }, { "completion_length": 107.90625, "epoch": 0.3293333333333333, "grad_norm": 3.661908981023361, "kl": 0.08203125, "learning_rate": 8.353333333333334e-07, "loss": 0.0033, "reward": 1.90625, "reward_std": 0.1232261210680008, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 494 }, { "completion_length": 99.25, "epoch": 0.33, "grad_norm": 2.2282559545831035, "kl": 0.058837890625, "learning_rate": 8.349999999999999e-07, "loss": 0.0024, "reward": 1.86796236038208, "reward_std": 0.1814166009426117, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8992123007774353, "step": 495 }, { "completion_length": 86.71875, "epoch": 0.33066666666666666, "grad_norm": 4.663200712663093, "kl": 0.0712890625, "learning_rate": 8.346666666666666e-07, "loss": 0.0029, "reward": 1.9010417461395264, "reward_std": 0.03197453171014786, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9010416865348816, "step": 496 }, { "completion_length": 92.5625, "epoch": 0.3313333333333333, "grad_norm": 2.9445395257447267, "kl": 0.04736328125, "learning_rate": 8.343333333333333e-07, "loss": 0.0019, "reward": 1.8484375476837158, "reward_std": 0.10915116220712662, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.848437488079071, "step": 497 }, { "completion_length": 110.21875, "epoch": 0.332, "grad_norm": 3.6822166819795883, "kl": 0.076171875, "learning_rate": 8.34e-07, "loss": 0.0031, "reward": 1.5652530193328857, "reward_std": 0.2518462538719177, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5652530193328857, "step": 498 }, { "completion_length": 120.875, "epoch": 0.33266666666666667, "grad_norm": 3.840688051307915, "kl": 0.06884765625, "learning_rate": 8.336666666666667e-07, "loss": 0.0027, "reward": 1.7750000953674316, "reward_std": 0.06781215220689774, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7750000357627869, "step": 499 }, { "completion_length": 110.90625, "epoch": 0.3333333333333333, "grad_norm": 2.7344058712128185, "kl": 0.06787109375, "learning_rate": 8.333333333333333e-07, "loss": 0.0027, "reward": 1.8333333730697632, "reward_std": 0.19716879725456238, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8645833730697632, "step": 500 }, { "completion_length": 106.09375, "epoch": 0.334, "grad_norm": 2.2508462730830554, "kl": 0.0634765625, "learning_rate": 8.329999999999999e-07, "loss": 0.0025, "reward": 1.640625, "reward_std": 0.14304219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.640625, "step": 501 }, { "completion_length": 85.0625, "epoch": 0.33466666666666667, "grad_norm": 74.18941861948994, "kl": 0.06298828125, "learning_rate": 8.326666666666666e-07, "loss": 0.0025, "reward": 1.8229167461395264, "reward_std": 0.14299382269382477, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166865348816, "step": 502 }, { "completion_length": 104.28125, "epoch": 0.3353333333333333, "grad_norm": 4.082005092880075, "kl": 0.06201171875, "learning_rate": 8.323333333333334e-07, "loss": 0.0025, "reward": 1.7994792461395264, "reward_std": 0.20060735940933228, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7994791269302368, "step": 503 }, { "completion_length": 104.84375, "epoch": 0.336, "grad_norm": 3.346678636488335, "kl": 0.0625, "learning_rate": 8.319999999999999e-07, "loss": 0.0025, "reward": 1.6303571462631226, "reward_std": 0.0972675010561943, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6616071462631226, "step": 504 }, { "completion_length": 106.34375, "epoch": 0.33666666666666667, "grad_norm": 4.195071125998081, "kl": 0.07177734375, "learning_rate": 8.316666666666666e-07, "loss": 0.0029, "reward": 1.7708333730697632, "reward_std": 0.17311251163482666, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 505 }, { "completion_length": 98.625, "epoch": 0.3373333333333333, "grad_norm": 2.1331504132793415, "kl": 0.062255859375, "learning_rate": 8.313333333333333e-07, "loss": 0.0025, "reward": 1.7239583730697632, "reward_std": 0.1770833283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7552083730697632, "step": 506 }, { "completion_length": 104.71875, "epoch": 0.338, "grad_norm": 3.4915549340273615, "kl": 0.064453125, "learning_rate": 8.31e-07, "loss": 0.0026, "reward": 1.6473958492279053, "reward_std": 0.1438538134098053, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6473958492279053, "step": 507 }, { "completion_length": 120.4375, "epoch": 0.33866666666666667, "grad_norm": 3.4522688693897496, "kl": 0.0625, "learning_rate": 8.306666666666666e-07, "loss": 0.0025, "reward": 1.6895833015441895, "reward_std": 0.11384586244821548, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6895833015441895, "step": 508 }, { "completion_length": 115.65625, "epoch": 0.3393333333333333, "grad_norm": 7.25668519157744, "kl": 0.06591796875, "learning_rate": 8.303333333333333e-07, "loss": 0.0026, "reward": 1.693750023841858, "reward_std": 0.30966877937316895, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6937500238418579, "step": 509 }, { "completion_length": 100.875, "epoch": 0.34, "grad_norm": 2.961505806012424, "kl": 0.0595703125, "learning_rate": 8.299999999999999e-07, "loss": 0.0024, "reward": 1.7364583015441895, "reward_std": 0.12860843539237976, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7364583611488342, "step": 510 }, { "completion_length": 116.78125, "epoch": 0.3406666666666667, "grad_norm": 6.679036401689781, "kl": 0.055419921875, "learning_rate": 8.296666666666667e-07, "loss": 0.0022, "reward": 1.7708333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 511 }, { "completion_length": 108.9375, "epoch": 0.3413333333333333, "grad_norm": 2.2420279740806404, "kl": 0.06494140625, "learning_rate": 8.293333333333333e-07, "loss": 0.0026, "reward": 1.5130208730697632, "reward_std": 0.2230713963508606, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5442708730697632, "step": 512 }, { "completion_length": 120.59375, "epoch": 0.342, "grad_norm": 8.935885575841862, "kl": 0.060791015625, "learning_rate": 8.289999999999999e-07, "loss": 0.0024, "reward": 1.5330357551574707, "reward_std": 0.17921116948127747, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5330357551574707, "step": 513 }, { "completion_length": 101.5625, "epoch": 0.3426666666666667, "grad_norm": 2.3241658046776, "kl": 0.057861328125, "learning_rate": 8.286666666666666e-07, "loss": 0.0023, "reward": 1.515625, "reward_std": 0.08054219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.515625, "step": 514 }, { "completion_length": 95.9375, "epoch": 0.3433333333333333, "grad_norm": 1.5365508679231525, "kl": 0.0673828125, "learning_rate": 8.283333333333334e-07, "loss": 0.0027, "reward": 1.7708333730697632, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8020833134651184, "step": 515 }, { "completion_length": 97.625, "epoch": 0.344, "grad_norm": 4.819128406875141, "kl": 0.1611328125, "learning_rate": 8.28e-07, "loss": 0.0065, "reward": 1.712499976158142, "reward_std": 0.2520620822906494, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7437499761581421, "step": 516 }, { "completion_length": 86.875, "epoch": 0.3446666666666667, "grad_norm": 2.5509908386062827, "kl": 0.06689453125, "learning_rate": 8.276666666666666e-07, "loss": 0.0027, "reward": 1.8125, "reward_std": 0.1041666567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 517 }, { "completion_length": 103.09375, "epoch": 0.3453333333333333, "grad_norm": 4.025585875432403, "kl": 0.06103515625, "learning_rate": 8.273333333333333e-07, "loss": 0.0024, "reward": 1.6741070747375488, "reward_std": 0.13439296185970306, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7053571343421936, "step": 518 }, { "completion_length": 101.9375, "epoch": 0.346, "grad_norm": 29.995754846715467, "kl": 0.080078125, "learning_rate": 8.269999999999999e-07, "loss": 0.0032, "reward": 1.71875, "reward_std": 0.19678783416748047, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7187500596046448, "step": 519 }, { "completion_length": 94.90625, "epoch": 0.3466666666666667, "grad_norm": 2.8457253603941775, "kl": 0.06787109375, "learning_rate": 8.266666666666667e-07, "loss": 0.0027, "reward": 1.84375, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 520 }, { "completion_length": 102.59375, "epoch": 0.3473333333333333, "grad_norm": 2.5574245994247358, "kl": 0.0673828125, "learning_rate": 8.263333333333333e-07, "loss": 0.0027, "reward": 1.662500023841858, "reward_std": 0.12689527869224548, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6937500238418579, "step": 521 }, { "completion_length": 91.46875, "epoch": 0.348, "grad_norm": 7.294344499044292, "kl": 0.0712890625, "learning_rate": 8.259999999999999e-07, "loss": 0.0029, "reward": 1.8250000476837158, "reward_std": 0.14166668057441711, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.824999988079071, "step": 522 }, { "completion_length": 102.6875, "epoch": 0.3486666666666667, "grad_norm": 2.9648971652502407, "kl": 0.07373046875, "learning_rate": 8.256666666666666e-07, "loss": 0.003, "reward": 1.8328125476837158, "reward_std": 0.18464422225952148, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8640624284744263, "step": 523 }, { "completion_length": 106.5, "epoch": 0.34933333333333333, "grad_norm": 5.066966181572414, "kl": 0.0693359375, "learning_rate": 8.253333333333334e-07, "loss": 0.0028, "reward": 1.6979167461395264, "reward_std": 0.10706222057342529, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6979166865348816, "step": 524 }, { "completion_length": 99.5, "epoch": 0.35, "grad_norm": 2.734216044213777, "kl": 0.05517578125, "learning_rate": 8.249999999999999e-07, "loss": 0.0022, "reward": 1.84375, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 525 }, { "completion_length": 113.9375, "epoch": 0.3506666666666667, "grad_norm": 2.3866475367730833, "kl": 0.07568359375, "learning_rate": 8.246666666666666e-07, "loss": 0.003, "reward": 1.625, "reward_std": 0.17075318098068237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.625, "step": 526 }, { "completion_length": 91.53125, "epoch": 0.35133333333333333, "grad_norm": 2.735007314366826, "kl": 0.054443359375, "learning_rate": 8.243333333333333e-07, "loss": 0.0022, "reward": 1.90625, "reward_std": 0.16108438372612, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 527 }, { "completion_length": 99.96875, "epoch": 0.352, "grad_norm": 4.039439401257505, "kl": 0.09814453125, "learning_rate": 8.24e-07, "loss": 0.0039, "reward": 1.9130208492279053, "reward_std": 0.06145832687616348, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9130208492279053, "step": 528 }, { "completion_length": 90.5, "epoch": 0.3526666666666667, "grad_norm": 3.7784060819957497, "kl": 0.1005859375, "learning_rate": 8.236666666666666e-07, "loss": 0.004, "reward": 1.6744792461395264, "reward_std": 0.26721417903900146, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7057291865348816, "step": 529 }, { "completion_length": 93.25, "epoch": 0.35333333333333333, "grad_norm": 9.739905795021983, "kl": 0.0771484375, "learning_rate": 8.233333333333333e-07, "loss": 0.0031, "reward": 1.859375, "reward_std": 0.1791265904903412, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 530 }, { "completion_length": 101.9375, "epoch": 0.354, "grad_norm": 2.595750556678461, "kl": 0.05908203125, "learning_rate": 8.229999999999999e-07, "loss": 0.0024, "reward": 1.8019344806671143, "reward_std": 0.1582408845424652, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8019344806671143, "step": 531 }, { "completion_length": 94.40625, "epoch": 0.3546666666666667, "grad_norm": 1.2800352927815961, "kl": 0.111328125, "learning_rate": 8.226666666666666e-07, "loss": 0.0045, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 532 }, { "completion_length": 115.59375, "epoch": 0.35533333333333333, "grad_norm": 2.516034415350756, "kl": 0.0625, "learning_rate": 8.223333333333334e-07, "loss": 0.0025, "reward": 1.514062523841858, "reward_std": 0.15218330919742584, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5140625238418579, "step": 533 }, { "completion_length": 100.09375, "epoch": 0.356, "grad_norm": 3.9346845965881023, "kl": 0.06787109375, "learning_rate": 8.219999999999999e-07, "loss": 0.0027, "reward": 1.609375, "reward_std": 0.26835688948631287, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.609375, "step": 534 }, { "completion_length": 97.34375, "epoch": 0.3566666666666667, "grad_norm": 6.24498171312172, "kl": 0.07763671875, "learning_rate": 8.216666666666666e-07, "loss": 0.0031, "reward": 1.7449777126312256, "reward_std": 0.05971115082502365, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7449777126312256, "step": 535 }, { "completion_length": 89.65625, "epoch": 0.35733333333333334, "grad_norm": 2.1514105566759256, "kl": 0.0615234375, "learning_rate": 8.213333333333333e-07, "loss": 0.0025, "reward": 1.7781250476837158, "reward_std": 0.14663662016391754, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.778124988079071, "step": 536 }, { "completion_length": 111.9375, "epoch": 0.358, "grad_norm": 5.918786371551919, "kl": 0.08837890625, "learning_rate": 8.21e-07, "loss": 0.0035, "reward": 1.8536458015441895, "reward_std": 0.11904378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8536458611488342, "step": 537 }, { "completion_length": 93.1875, "epoch": 0.3586666666666667, "grad_norm": 2.512790107727555, "kl": 0.068359375, "learning_rate": 8.206666666666666e-07, "loss": 0.0027, "reward": 1.859375, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 538 }, { "completion_length": 111.46875, "epoch": 0.35933333333333334, "grad_norm": 3.3251129749116, "kl": 0.056884765625, "learning_rate": 8.203333333333333e-07, "loss": 0.0023, "reward": 1.8020833730697632, "reward_std": 0.27518051862716675, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8333333134651184, "step": 539 }, { "completion_length": 103.65625, "epoch": 0.36, "grad_norm": 2.9636528152585546, "kl": 0.0947265625, "learning_rate": 8.199999999999999e-07, "loss": 0.0038, "reward": 1.8322917222976685, "reward_std": 0.18131792545318604, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8635416626930237, "step": 540 }, { "completion_length": 91.6875, "epoch": 0.3606666666666667, "grad_norm": 7.822437957826098, "kl": 0.06787109375, "learning_rate": 8.196666666666667e-07, "loss": 0.0027, "reward": 1.7890625, "reward_std": 0.029264595359563828, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7890625, "step": 541 }, { "completion_length": 110.15625, "epoch": 0.36133333333333334, "grad_norm": 2.8044424056322073, "kl": 0.0830078125, "learning_rate": 8.193333333333333e-07, "loss": 0.0033, "reward": 1.7390625476837158, "reward_std": 0.09157966077327728, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7390625476837158, "step": 542 }, { "completion_length": 107.25, "epoch": 0.362, "grad_norm": 3.2953443395016486, "kl": 0.049560546875, "learning_rate": 8.189999999999999e-07, "loss": 0.002, "reward": 1.7572916746139526, "reward_std": 0.03365563228726387, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7572916746139526, "step": 543 }, { "completion_length": 98.5, "epoch": 0.3626666666666667, "grad_norm": 2.768849003896332, "kl": 0.05810546875, "learning_rate": 8.186666666666666e-07, "loss": 0.0023, "reward": 1.7239583730697632, "reward_std": 0.272108256816864, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 544 }, { "completion_length": 95.28125, "epoch": 0.36333333333333334, "grad_norm": 4.694148034118447, "kl": 0.07177734375, "learning_rate": 8.183333333333334e-07, "loss": 0.0029, "reward": 1.673437476158142, "reward_std": 0.20141145586967468, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7046875357627869, "step": 545 }, { "completion_length": 92.75, "epoch": 0.364, "grad_norm": 9.94981357064956, "kl": 0.0791015625, "learning_rate": 8.179999999999999e-07, "loss": 0.0032, "reward": 1.734375, "reward_std": 0.18983599543571472, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 546 }, { "completion_length": 103.25, "epoch": 0.36466666666666664, "grad_norm": 1.8663750621102015, "kl": 0.046875, "learning_rate": 8.176666666666666e-07, "loss": 0.0019, "reward": 1.8489583730697632, "reward_std": 0.0824463963508606, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583730697632, "step": 547 }, { "completion_length": 110.65625, "epoch": 0.36533333333333334, "grad_norm": 3.648923015947155, "kl": 0.052978515625, "learning_rate": 8.173333333333333e-07, "loss": 0.0021, "reward": 1.8177083730697632, "reward_std": 0.16591876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8489583730697632, "step": 548 }, { "completion_length": 104.84375, "epoch": 0.366, "grad_norm": 2.9181818373313093, "kl": 0.04931640625, "learning_rate": 8.169999999999999e-07, "loss": 0.002, "reward": 1.8177083730697632, "reward_std": 0.14068284630775452, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083730697632, "step": 549 }, { "completion_length": 102.9375, "epoch": 0.36666666666666664, "grad_norm": 3.421191948589756, "kl": 0.046142578125, "learning_rate": 8.166666666666666e-07, "loss": 0.0018, "reward": 1.765625, "reward_std": 0.22487975656986237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.765625, "step": 550 }, { "completion_length": 115.84375, "epoch": 0.36733333333333335, "grad_norm": 3.9022164958983887, "kl": 0.048095703125, "learning_rate": 8.163333333333333e-07, "loss": 0.0019, "reward": 1.6875, "reward_std": 0.22358438372612, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.71875, "step": 551 }, { "completion_length": 96.875, "epoch": 0.368, "grad_norm": 2.0766182843212766, "kl": 0.09228515625, "learning_rate": 8.159999999999999e-07, "loss": 0.0037, "reward": 1.7921874523162842, "reward_std": 0.09684111177921295, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.792187511920929, "step": 552 }, { "completion_length": 97.75, "epoch": 0.36866666666666664, "grad_norm": 1.0847244217157235, "kl": 0.0703125, "learning_rate": 8.156666666666666e-07, "loss": 0.0028, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 553 }, { "completion_length": 111.53125, "epoch": 0.36933333333333335, "grad_norm": 13.899219110554107, "kl": 0.0615234375, "learning_rate": 8.153333333333334e-07, "loss": 0.0025, "reward": 1.8468749523162842, "reward_std": 0.16041666269302368, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8781250715255737, "step": 554 }, { "completion_length": 87.6875, "epoch": 0.37, "grad_norm": 1.376309683610701, "kl": 0.0703125, "learning_rate": 8.149999999999999e-07, "loss": 0.0028, "reward": 1.7395833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 555 }, { "completion_length": 109.03125, "epoch": 0.37066666666666664, "grad_norm": 2.1582486310110767, "kl": 0.056640625, "learning_rate": 8.146666666666666e-07, "loss": 0.0023, "reward": 1.8380208015441895, "reward_std": 0.16279378533363342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8380208611488342, "step": 556 }, { "completion_length": 99.8125, "epoch": 0.37133333333333335, "grad_norm": 4.51875789301335, "kl": 0.05908203125, "learning_rate": 8.143333333333333e-07, "loss": 0.0024, "reward": 1.7255208492279053, "reward_std": 0.17602482438087463, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7255208492279053, "step": 557 }, { "completion_length": 97.9375, "epoch": 0.372, "grad_norm": 2.9644503752149105, "kl": 0.0693359375, "learning_rate": 8.14e-07, "loss": 0.0028, "reward": 1.6770833730697632, "reward_std": 0.11061252653598785, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6770833134651184, "step": 558 }, { "completion_length": 108.09375, "epoch": 0.37266666666666665, "grad_norm": 1.3111981419024572, "kl": 0.062255859375, "learning_rate": 8.136666666666666e-07, "loss": 0.0025, "reward": 1.796875, "reward_std": 0.0729166716337204, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.828125, "step": 559 }, { "completion_length": 97.03125, "epoch": 0.37333333333333335, "grad_norm": 0.32462424487336816, "kl": 0.04638671875, "learning_rate": 8.133333333333333e-07, "loss": 0.0019, "reward": 1.6666667461395264, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6666666269302368, "step": 560 }, { "completion_length": 114.25, "epoch": 0.374, "grad_norm": 37.86311213780811, "kl": 0.06396484375, "learning_rate": 8.129999999999999e-07, "loss": 0.0026, "reward": 1.662500023841858, "reward_std": 0.11720609664916992, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6625000238418579, "step": 561 }, { "completion_length": 105.71875, "epoch": 0.37466666666666665, "grad_norm": 5.407931644177575, "kl": 0.07763671875, "learning_rate": 8.126666666666666e-07, "loss": 0.0031, "reward": 1.7287201881408691, "reward_std": 0.19243867695331573, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7287201881408691, "step": 562 }, { "completion_length": 122.84375, "epoch": 0.37533333333333335, "grad_norm": 4.818947894229937, "kl": 0.05126953125, "learning_rate": 8.123333333333333e-07, "loss": 0.0021, "reward": 1.5833333730697632, "reward_std": 0.5026708841323853, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6458333730697632, "step": 563 }, { "completion_length": 115.09375, "epoch": 0.376, "grad_norm": 3.3833000134202114, "kl": 0.05859375, "learning_rate": 8.12e-07, "loss": 0.0023, "reward": 1.765364646911621, "reward_std": 0.15260416269302368, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7653645277023315, "step": 564 }, { "completion_length": 108.1875, "epoch": 0.37666666666666665, "grad_norm": 3.8478155156412663, "kl": 0.07177734375, "learning_rate": 8.116666666666666e-07, "loss": 0.0029, "reward": 1.9812500476837158, "reward_std": 0.012500002980232239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9812500476837158, "step": 565 }, { "completion_length": 98.46875, "epoch": 0.37733333333333335, "grad_norm": 2.4086764501123636, "kl": 0.05859375, "learning_rate": 8.113333333333333e-07, "loss": 0.0023, "reward": 1.8250000476837158, "reward_std": 0.1721687763929367, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.824999988079071, "step": 566 }, { "completion_length": 114.5, "epoch": 0.378, "grad_norm": 5.804437846529614, "kl": 0.05859375, "learning_rate": 8.11e-07, "loss": 0.0023, "reward": 1.9197916984558105, "reward_std": 0.06259354948997498, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9197916388511658, "step": 567 }, { "completion_length": 121.8125, "epoch": 0.37866666666666665, "grad_norm": 4.2298376732750524, "kl": 0.07275390625, "learning_rate": 8.106666666666666e-07, "loss": 0.0029, "reward": 1.6927083730697632, "reward_std": 0.23808756470680237, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7552083730697632, "step": 568 }, { "completion_length": 104.96875, "epoch": 0.37933333333333336, "grad_norm": 12.335378695180559, "kl": 0.498046875, "learning_rate": 8.103333333333333e-07, "loss": 0.02, "reward": 1.7916666269302368, "reward_std": 0.08750000596046448, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666865348816, "step": 569 }, { "completion_length": 96.65625, "epoch": 0.38, "grad_norm": 2.3169279618136285, "kl": 0.062255859375, "learning_rate": 8.1e-07, "loss": 0.0025, "reward": 1.71875, "reward_std": 0.1648927927017212, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 570 }, { "completion_length": 107.3125, "epoch": 0.38066666666666665, "grad_norm": 2.445023163069184, "kl": 0.058837890625, "learning_rate": 8.096666666666667e-07, "loss": 0.0024, "reward": 1.6479166746139526, "reward_std": 0.03989279642701149, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6479166746139526, "step": 571 }, { "completion_length": 104.28125, "epoch": 0.38133333333333336, "grad_norm": 4.0541920090581245, "kl": 0.04931640625, "learning_rate": 8.093333333333333e-07, "loss": 0.002, "reward": 1.703125, "reward_std": 0.30760425329208374, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.765625, "step": 572 }, { "completion_length": 117.5, "epoch": 0.382, "grad_norm": 6.608249776688042, "kl": 0.0576171875, "learning_rate": 8.09e-07, "loss": 0.0023, "reward": 1.6953125, "reward_std": 0.24058544635772705, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6953125596046448, "step": 573 }, { "completion_length": 96.8125, "epoch": 0.38266666666666665, "grad_norm": 5.808792735397549, "kl": 0.06787109375, "learning_rate": 8.086666666666666e-07, "loss": 0.0027, "reward": 1.6453125476837158, "reward_std": 0.11912232637405396, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6453125476837158, "step": 574 }, { "completion_length": 104.21875, "epoch": 0.38333333333333336, "grad_norm": 3.234569939817099, "kl": 0.076171875, "learning_rate": 8.083333333333334e-07, "loss": 0.003, "reward": 1.875, "reward_std": 0.2083333283662796, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.9375, "step": 575 }, { "completion_length": 104.9375, "epoch": 0.384, "grad_norm": 3.160716246848833, "kl": 0.06494140625, "learning_rate": 8.08e-07, "loss": 0.0026, "reward": 1.9083333015441895, "reward_std": 0.06001617759466171, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9083333015441895, "step": 576 }, { "completion_length": 104.625, "epoch": 0.38466666666666666, "grad_norm": 2.0124600271146726, "kl": 0.06396484375, "learning_rate": 8.076666666666666e-07, "loss": 0.0026, "reward": 1.84375, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 577 }, { "completion_length": 112.0, "epoch": 0.38533333333333336, "grad_norm": 7.735227864486438, "kl": 0.05908203125, "learning_rate": 8.073333333333333e-07, "loss": 0.0024, "reward": 1.8078124523162842, "reward_std": 0.11936085671186447, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8078125715255737, "step": 578 }, { "completion_length": 97.375, "epoch": 0.386, "grad_norm": 2.2900950958062976, "kl": 0.053955078125, "learning_rate": 8.070000000000001e-07, "loss": 0.0022, "reward": 1.921875, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 579 }, { "completion_length": 97.53125, "epoch": 0.38666666666666666, "grad_norm": 3.5280781071605922, "kl": 0.039306640625, "learning_rate": 8.066666666666666e-07, "loss": 0.0016, "reward": 1.625, "reward_std": 0.14201034605503082, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.625, "step": 580 }, { "completion_length": 116.84375, "epoch": 0.3873333333333333, "grad_norm": 4.377687739963903, "kl": 0.0556640625, "learning_rate": 8.063333333333333e-07, "loss": 0.0022, "reward": 1.7739583253860474, "reward_std": 0.1511169970035553, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8364583849906921, "step": 581 }, { "completion_length": 100.5625, "epoch": 0.388, "grad_norm": 3.4436005234009364, "kl": 0.053955078125, "learning_rate": 8.06e-07, "loss": 0.0022, "reward": 1.7604167461395264, "reward_std": 0.1763354390859604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7604166865348816, "step": 582 }, { "completion_length": 107.5625, "epoch": 0.38866666666666666, "grad_norm": 3.0486250931621215, "kl": 0.056884765625, "learning_rate": 8.056666666666666e-07, "loss": 0.0023, "reward": 1.6145833730697632, "reward_std": 0.2180021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6458333134651184, "step": 583 }, { "completion_length": 112.6875, "epoch": 0.3893333333333333, "grad_norm": 15.466817218987922, "kl": 0.07568359375, "learning_rate": 8.053333333333333e-07, "loss": 0.003, "reward": 1.6830357313156128, "reward_std": 0.1640399992465973, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7142857313156128, "step": 584 }, { "completion_length": 111.03125, "epoch": 0.39, "grad_norm": 2.65355310570935, "kl": 0.044189453125, "learning_rate": 8.05e-07, "loss": 0.0018, "reward": 1.8770833015441895, "reward_std": 0.08598805963993073, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8770833015441895, "step": 585 }, { "completion_length": 105.28125, "epoch": 0.39066666666666666, "grad_norm": 2.719872767425073, "kl": 0.040283203125, "learning_rate": 8.046666666666666e-07, "loss": 0.0016, "reward": 1.6640625, "reward_std": 0.2552083432674408, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6953125, "step": 586 }, { "completion_length": 90.21875, "epoch": 0.3913333333333333, "grad_norm": 3.748003330847342, "kl": 0.04833984375, "learning_rate": 8.043333333333333e-07, "loss": 0.0019, "reward": 1.71875, "reward_std": 0.25966876745224, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.8125, "step": 587 }, { "completion_length": 97.0, "epoch": 0.392, "grad_norm": 3.926246186296298, "kl": 0.05517578125, "learning_rate": 8.04e-07, "loss": 0.0022, "reward": 1.8015625476837158, "reward_std": 0.09035296738147736, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.801562488079071, "step": 588 }, { "completion_length": 114.71875, "epoch": 0.39266666666666666, "grad_norm": 2.2403288068134635, "kl": 0.050048828125, "learning_rate": 8.036666666666666e-07, "loss": 0.002, "reward": 1.796875, "reward_std": 0.20758545398712158, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.828125, "step": 589 }, { "completion_length": 104.46875, "epoch": 0.3933333333333333, "grad_norm": 2.8175135704011476, "kl": 0.045654296875, "learning_rate": 8.033333333333333e-07, "loss": 0.0018, "reward": 1.8333333730697632, "reward_std": 0.14201033115386963, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 590 }, { "completion_length": 98.40625, "epoch": 0.394, "grad_norm": 1.2263611591512713, "kl": 0.056396484375, "learning_rate": 8.03e-07, "loss": 0.0022, "reward": 1.984375, "reward_std": 0.018042195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.984375, "step": 591 }, { "completion_length": 102.59375, "epoch": 0.39466666666666667, "grad_norm": 2.3783227725805016, "kl": 0.03369140625, "learning_rate": 8.026666666666667e-07, "loss": 0.0013, "reward": 1.6875, "reward_std": 0.32216876745224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 592 }, { "completion_length": 112.28125, "epoch": 0.3953333333333333, "grad_norm": 1.9878880189191286, "kl": 0.03955078125, "learning_rate": 8.023333333333333e-07, "loss": 0.0016, "reward": 1.7083333730697632, "reward_std": 0.19716878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333134651184, "step": 593 }, { "completion_length": 97.5625, "epoch": 0.396, "grad_norm": 1.5961969918054713, "kl": 0.07861328125, "learning_rate": 8.02e-07, "loss": 0.0032, "reward": 1.7864583730697632, "reward_std": 0.03447291627526283, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7864583730697632, "step": 594 }, { "completion_length": 132.71875, "epoch": 0.39666666666666667, "grad_norm": 1.91203254765187, "kl": 0.040283203125, "learning_rate": 8.016666666666666e-07, "loss": 0.0016, "reward": 1.7693452835083008, "reward_std": 0.14358630776405334, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7693452835083008, "step": 595 }, { "completion_length": 114.8125, "epoch": 0.3973333333333333, "grad_norm": 2.692400503735617, "kl": 0.0625, "learning_rate": 8.013333333333333e-07, "loss": 0.0025, "reward": 1.6924480199813843, "reward_std": 0.26714515686035156, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7549479603767395, "step": 596 }, { "completion_length": 104.84375, "epoch": 0.398, "grad_norm": 3.4748771318405174, "kl": 0.07080078125, "learning_rate": 8.01e-07, "loss": 0.0028, "reward": 1.7440104484558105, "reward_std": 0.12610068917274475, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7440104484558105, "step": 597 }, { "completion_length": 98.71875, "epoch": 0.39866666666666667, "grad_norm": 2.921817495299893, "kl": 0.06884765625, "learning_rate": 8.006666666666666e-07, "loss": 0.0028, "reward": 1.78125, "reward_std": 0.25966876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 598 }, { "completion_length": 106.03125, "epoch": 0.3993333333333333, "grad_norm": 10.557389268670764, "kl": 0.08349609375, "learning_rate": 8.003333333333333e-07, "loss": 0.0033, "reward": 1.7838542461395264, "reward_std": 0.20570486783981323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7838541269302368, "step": 599 }, { "completion_length": 83.0, "epoch": 0.4, "grad_norm": 0.11885423335850295, "kl": 0.05224609375, "learning_rate": 8e-07, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 600 }, { "completion_length": 100.46875, "epoch": 0.40066666666666667, "grad_norm": 3.394595638909643, "kl": 0.039794921875, "learning_rate": 7.996666666666666e-07, "loss": 0.0016, "reward": 1.696874976158142, "reward_std": 0.1798219531774521, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6968750357627869, "step": 601 }, { "completion_length": 102.09375, "epoch": 0.4013333333333333, "grad_norm": 1.8028797129276335, "kl": 0.0625, "learning_rate": 7.993333333333333e-07, "loss": 0.0025, "reward": 1.9583333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333730697632, "step": 602 }, { "completion_length": 106.40625, "epoch": 0.402, "grad_norm": 4.154030024143636, "kl": 0.051025390625, "learning_rate": 7.99e-07, "loss": 0.002, "reward": 1.6921875476837158, "reward_std": 0.24088816344738007, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.692187488079071, "step": 603 }, { "completion_length": 117.625, "epoch": 0.4026666666666667, "grad_norm": 2.4203748794480036, "kl": 0.08203125, "learning_rate": 7.986666666666666e-07, "loss": 0.0033, "reward": 1.730208396911621, "reward_std": 0.21252527832984924, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7302082777023315, "step": 604 }, { "completion_length": 98.15625, "epoch": 0.4033333333333333, "grad_norm": 10.840287487641572, "kl": 0.04931640625, "learning_rate": 7.983333333333333e-07, "loss": 0.002, "reward": 1.7872395515441895, "reward_std": 0.09864427149295807, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7872395515441895, "step": 605 }, { "completion_length": 114.09375, "epoch": 0.404, "grad_norm": 2.7062936736516, "kl": 0.08056640625, "learning_rate": 7.98e-07, "loss": 0.0032, "reward": 1.754166603088379, "reward_std": 0.3942450284957886, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8166666626930237, "step": 606 }, { "completion_length": 104.5625, "epoch": 0.4046666666666667, "grad_norm": 2.1103005700286337, "kl": 0.053955078125, "learning_rate": 7.976666666666666e-07, "loss": 0.0022, "reward": 1.8125, "reward_std": 0.17075318098068237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 607 }, { "completion_length": 97.4375, "epoch": 0.4053333333333333, "grad_norm": 3.6030459050029098, "kl": 0.0771484375, "learning_rate": 7.973333333333333e-07, "loss": 0.0031, "reward": 1.7052083015441895, "reward_std": 0.06458333134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7052083015441895, "step": 608 }, { "completion_length": 108.28125, "epoch": 0.406, "grad_norm": 1.5254370152778545, "kl": 0.057373046875, "learning_rate": 7.970000000000001e-07, "loss": 0.0023, "reward": 1.8859374523162842, "reward_std": 0.08693695813417435, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.885937511920929, "step": 609 }, { "completion_length": 111.53125, "epoch": 0.4066666666666667, "grad_norm": 3.1203005337614607, "kl": 0.09033203125, "learning_rate": 7.966666666666666e-07, "loss": 0.0036, "reward": 1.6848958730697632, "reward_std": 0.07737711817026138, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6848958134651184, "step": 610 }, { "completion_length": 101.34375, "epoch": 0.4073333333333333, "grad_norm": 2.071898577612033, "kl": 0.0869140625, "learning_rate": 7.963333333333333e-07, "loss": 0.0035, "reward": 1.7708333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8020833730697632, "step": 611 }, { "completion_length": 94.46875, "epoch": 0.408, "grad_norm": 13.294377373222018, "kl": 0.059814453125, "learning_rate": 7.96e-07, "loss": 0.0024, "reward": 1.931249976158142, "reward_std": 0.012500002980232239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9312499761581421, "step": 612 }, { "completion_length": 97.125, "epoch": 0.4086666666666667, "grad_norm": 1.487325682574872, "kl": 0.08837890625, "learning_rate": 7.956666666666666e-07, "loss": 0.0035, "reward": 1.9270833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9270833730697632, "step": 613 }, { "completion_length": 94.03125, "epoch": 0.4093333333333333, "grad_norm": 3.679070161158337, "kl": 0.060302734375, "learning_rate": 7.953333333333333e-07, "loss": 0.0024, "reward": 1.921875, "reward_std": 0.010416666977107525, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 614 }, { "completion_length": 102.375, "epoch": 0.41, "grad_norm": 4.046978210833327, "kl": 0.06591796875, "learning_rate": 7.95e-07, "loss": 0.0026, "reward": 1.868749976158142, "reward_std": 0.07500000298023224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8687499761581421, "step": 615 }, { "completion_length": 111.15625, "epoch": 0.4106666666666667, "grad_norm": 3.9304135541237777, "kl": 0.07177734375, "learning_rate": 7.946666666666666e-07, "loss": 0.0029, "reward": 1.7687499523162842, "reward_std": 0.3462713360786438, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.831250011920929, "step": 616 }, { "completion_length": 109.25, "epoch": 0.41133333333333333, "grad_norm": 5.140760899617319, "kl": 0.08251953125, "learning_rate": 7.943333333333333e-07, "loss": 0.0033, "reward": 1.8666666746139526, "reward_std": 0.09200608730316162, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8666666746139526, "step": 617 }, { "completion_length": 104.0, "epoch": 0.412, "grad_norm": 4.09462829929584, "kl": 0.08642578125, "learning_rate": 7.94e-07, "loss": 0.0035, "reward": 1.7617559432983398, "reward_std": 0.09213188290596008, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7617559432983398, "step": 618 }, { "completion_length": 93.1875, "epoch": 0.4126666666666667, "grad_norm": 3.8093665798356495, "kl": 0.06201171875, "learning_rate": 7.936666666666666e-07, "loss": 0.0025, "reward": 1.8411458730697632, "reward_std": 0.18060964345932007, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8723958730697632, "step": 619 }, { "completion_length": 88.71875, "epoch": 0.41333333333333333, "grad_norm": 2.7775338977429382, "kl": 0.05908203125, "learning_rate": 7.933333333333333e-07, "loss": 0.0024, "reward": 1.7864583730697632, "reward_std": 0.14508545398712158, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8177083730697632, "step": 620 }, { "completion_length": 106.3125, "epoch": 0.414, "grad_norm": 3.118629039228201, "kl": 0.0458984375, "learning_rate": 7.93e-07, "loss": 0.0018, "reward": 1.734375, "reward_std": 0.32081207633018494, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.796875, "step": 621 }, { "completion_length": 134.53125, "epoch": 0.4146666666666667, "grad_norm": 2.7110991340567914, "kl": 0.0673828125, "learning_rate": 7.926666666666666e-07, "loss": 0.0027, "reward": 1.6848958730697632, "reward_std": 0.34794098138809204, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7786458730697632, "step": 622 }, { "completion_length": 98.71875, "epoch": 0.41533333333333333, "grad_norm": 3.0516161709348224, "kl": 0.08837890625, "learning_rate": 7.923333333333333e-07, "loss": 0.0035, "reward": 1.795163631439209, "reward_std": 0.2072899043560028, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7951636910438538, "step": 623 }, { "completion_length": 103.125, "epoch": 0.416, "grad_norm": 8.08497509263122, "kl": 0.06494140625, "learning_rate": 7.92e-07, "loss": 0.0026, "reward": 1.7781250476837158, "reward_std": 0.10208334028720856, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.778124988079071, "step": 624 }, { "completion_length": 97.4375, "epoch": 0.4166666666666667, "grad_norm": 1.6621229086563345, "kl": 0.0791015625, "learning_rate": 7.916666666666666e-07, "loss": 0.0032, "reward": 1.8125, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 625 }, { "completion_length": 99.1875, "epoch": 0.41733333333333333, "grad_norm": 2.0747900933219263, "kl": 0.056884765625, "learning_rate": 7.913333333333332e-07, "loss": 0.0023, "reward": 1.7135417461395264, "reward_std": 0.1450854390859604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7135416865348816, "step": 626 }, { "completion_length": 92.375, "epoch": 0.418, "grad_norm": 4.3061272447244905, "kl": 0.10400390625, "learning_rate": 7.91e-07, "loss": 0.0042, "reward": 1.6416666507720947, "reward_std": 0.1450408399105072, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6416667103767395, "step": 627 }, { "completion_length": 96.78125, "epoch": 0.4186666666666667, "grad_norm": 1.7462771081658204, "kl": 0.04296875, "learning_rate": 7.906666666666666e-07, "loss": 0.0017, "reward": 1.7083333730697632, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333730697632, "step": 628 }, { "completion_length": 95.0, "epoch": 0.41933333333333334, "grad_norm": 11.754879935534913, "kl": 0.059814453125, "learning_rate": 7.903333333333333e-07, "loss": 0.0024, "reward": 1.78125, "reward_std": 0.1660272479057312, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8124999403953552, "step": 629 }, { "completion_length": 88.8125, "epoch": 0.42, "grad_norm": 1.2386078448425344, "kl": 0.07568359375, "learning_rate": 7.9e-07, "loss": 0.003, "reward": 1.90625, "reward_std": 0.012499993667006493, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 630 }, { "completion_length": 103.1875, "epoch": 0.4206666666666667, "grad_norm": 2.611574301911987, "kl": 0.07080078125, "learning_rate": 7.896666666666666e-07, "loss": 0.0028, "reward": 1.765625, "reward_std": 0.04077973961830139, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.765625, "step": 631 }, { "completion_length": 101.15625, "epoch": 0.42133333333333334, "grad_norm": 2.7866139285632414, "kl": 0.0341796875, "learning_rate": 7.893333333333333e-07, "loss": 0.0014, "reward": 1.796875, "reward_std": 0.15625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.828125, "step": 632 }, { "completion_length": 116.5, "epoch": 0.422, "grad_norm": 2.2706426705636407, "kl": 0.057373046875, "learning_rate": 7.89e-07, "loss": 0.0023, "reward": 1.65625, "reward_std": 0.16753023862838745, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6875000596046448, "step": 633 }, { "completion_length": 96.1875, "epoch": 0.4226666666666667, "grad_norm": 1.249248687202498, "kl": 0.04443359375, "learning_rate": 7.886666666666666e-07, "loss": 0.0018, "reward": 1.9479167461395264, "reward_std": 0.012028136290609837, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9479166269302368, "step": 634 }, { "completion_length": 88.96875, "epoch": 0.42333333333333334, "grad_norm": 2.1436129704281046, "kl": 0.0732421875, "learning_rate": 7.883333333333333e-07, "loss": 0.0029, "reward": 1.75, "reward_std": 0.1555021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7499999403953552, "step": 635 }, { "completion_length": 91.15625, "epoch": 0.424, "grad_norm": 4.9152530604180775, "kl": 0.056884765625, "learning_rate": 7.88e-07, "loss": 0.0023, "reward": 1.7572916746139526, "reward_std": 0.17861242592334747, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7572916746139526, "step": 636 }, { "completion_length": 95.6875, "epoch": 0.4246666666666667, "grad_norm": 5.3654527909623, "kl": 0.0712890625, "learning_rate": 7.876666666666666e-07, "loss": 0.0028, "reward": 1.7708333730697632, "reward_std": 0.1587250530719757, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708332538604736, "step": 637 }, { "completion_length": 85.125, "epoch": 0.42533333333333334, "grad_norm": 3.230653765112418, "kl": 0.0927734375, "learning_rate": 7.873333333333333e-07, "loss": 0.0037, "reward": 1.5338542461395264, "reward_std": 0.0348825678229332, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5338541865348816, "step": 638 }, { "completion_length": 95.0, "epoch": 0.426, "grad_norm": 21.93821017533424, "kl": 0.068359375, "learning_rate": 7.87e-07, "loss": 0.0027, "reward": 1.7989583015441895, "reward_std": 0.1466306746006012, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7989583611488342, "step": 639 }, { "completion_length": 102.25, "epoch": 0.4266666666666667, "grad_norm": 6.69980154547221, "kl": 0.09765625, "learning_rate": 7.866666666666666e-07, "loss": 0.0039, "reward": 1.665624976158142, "reward_std": 0.07510441541671753, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6656249761581421, "step": 640 }, { "completion_length": 86.9375, "epoch": 0.42733333333333334, "grad_norm": 1.650776873990449, "kl": 0.08740234375, "learning_rate": 7.863333333333333e-07, "loss": 0.0035, "reward": 1.8671875, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8671875, "step": 641 }, { "completion_length": 83.3125, "epoch": 0.428, "grad_norm": 6.245040188440149, "kl": 0.0791015625, "learning_rate": 7.86e-07, "loss": 0.0032, "reward": 1.9296875, "reward_std": 0.0677083283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 642 }, { "completion_length": 86.59375, "epoch": 0.42866666666666664, "grad_norm": 2.7345873129586886, "kl": 0.060791015625, "learning_rate": 7.856666666666665e-07, "loss": 0.0024, "reward": 1.9375, "reward_std": 0.09858439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 643 }, { "completion_length": 85.0625, "epoch": 0.42933333333333334, "grad_norm": 1.5590307976633704, "kl": 0.056396484375, "learning_rate": 7.853333333333333e-07, "loss": 0.0023, "reward": 1.921875, "reward_std": 0.010416666977107525, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 644 }, { "completion_length": 91.34375, "epoch": 0.43, "grad_norm": 5.405816159054079, "kl": 0.06494140625, "learning_rate": 7.85e-07, "loss": 0.0026, "reward": 1.8723958730697632, "reward_std": 0.05815085768699646, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8723958134651184, "step": 645 }, { "completion_length": 91.46875, "epoch": 0.43066666666666664, "grad_norm": 0.8869438222503405, "kl": 0.0172119140625, "learning_rate": 7.846666666666666e-07, "loss": 0.0007, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 646 }, { "completion_length": 97.78125, "epoch": 0.43133333333333335, "grad_norm": 3.92306538175301, "kl": 0.08447265625, "learning_rate": 7.843333333333332e-07, "loss": 0.0034, "reward": 1.7786458730697632, "reward_std": 0.13485778868198395, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7786458730697632, "step": 647 }, { "completion_length": 98.15625, "epoch": 0.432, "grad_norm": 4.736327780181568, "kl": 0.07568359375, "learning_rate": 7.84e-07, "loss": 0.003, "reward": 1.6492671966552734, "reward_std": 0.2671886384487152, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6805171966552734, "step": 648 }, { "completion_length": 96.84375, "epoch": 0.43266666666666664, "grad_norm": 2.647128097018168, "kl": 0.10009765625, "learning_rate": 7.836666666666666e-07, "loss": 0.004, "reward": 1.7687499523162842, "reward_std": 0.10538912564516068, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.768750011920929, "step": 649 }, { "completion_length": 102.03125, "epoch": 0.43333333333333335, "grad_norm": 1.4538679834739936, "kl": 0.07666015625, "learning_rate": 7.833333333333333e-07, "loss": 0.0031, "reward": 1.765371322631836, "reward_std": 0.12696301937103271, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7966213226318359, "step": 650 }, { "completion_length": 85.78125, "epoch": 0.434, "grad_norm": 5.346531960309788, "kl": 0.1083984375, "learning_rate": 7.83e-07, "loss": 0.0043, "reward": 1.8333332538604736, "reward_std": 0.1127961054444313, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 651 }, { "completion_length": 86.71875, "epoch": 0.43466666666666665, "grad_norm": 2.154347093581524, "kl": 0.055908203125, "learning_rate": 7.826666666666666e-07, "loss": 0.0022, "reward": 1.7239583730697632, "reward_std": 0.14508545398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 652 }, { "completion_length": 86.125, "epoch": 0.43533333333333335, "grad_norm": 0.23056197819632998, "kl": 0.08056640625, "learning_rate": 7.823333333333333e-07, "loss": 0.0032, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 653 }, { "completion_length": 91.46875, "epoch": 0.436, "grad_norm": 3.559622960029036, "kl": 0.09130859375, "learning_rate": 7.82e-07, "loss": 0.0037, "reward": 1.9110863208770752, "reward_std": 0.14728419482707977, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9110863208770752, "step": 654 }, { "completion_length": 90.40625, "epoch": 0.43666666666666665, "grad_norm": 1.8402655275872468, "kl": 0.05078125, "learning_rate": 7.816666666666666e-07, "loss": 0.002, "reward": 1.8333333730697632, "reward_std": 0.04811251908540726, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333332538604736, "step": 655 }, { "completion_length": 103.53125, "epoch": 0.43733333333333335, "grad_norm": 3.017046900487505, "kl": 0.061279296875, "learning_rate": 7.813333333333332e-07, "loss": 0.0025, "reward": 1.7822916507720947, "reward_std": 0.11530856788158417, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7822916507720947, "step": 656 }, { "completion_length": 96.8125, "epoch": 0.438, "grad_norm": 2.737081606713289, "kl": 0.08203125, "learning_rate": 7.81e-07, "loss": 0.0033, "reward": 1.9114583730697632, "reward_std": 0.07708333432674408, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9114583134651184, "step": 657 }, { "completion_length": 98.34375, "epoch": 0.43866666666666665, "grad_norm": 8.010724328195792, "kl": 0.07177734375, "learning_rate": 7.806666666666666e-07, "loss": 0.0029, "reward": 1.6359374523162842, "reward_std": 0.08250976353883743, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6359374523162842, "step": 658 }, { "completion_length": 98.28125, "epoch": 0.43933333333333335, "grad_norm": 3.1810260548976346, "kl": 0.0830078125, "learning_rate": 7.803333333333333e-07, "loss": 0.0033, "reward": 1.9166667461395264, "reward_std": 0.14905625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9166667461395264, "step": 659 }, { "completion_length": 102.3125, "epoch": 0.44, "grad_norm": 1.9465463479667764, "kl": 0.0810546875, "learning_rate": 7.799999999999999e-07, "loss": 0.0033, "reward": 1.8958333730697632, "reward_std": 0.0919627845287323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333134651184, "step": 660 }, { "completion_length": 90.125, "epoch": 0.44066666666666665, "grad_norm": 3.0343283261849043, "kl": 0.0400390625, "learning_rate": 7.796666666666666e-07, "loss": 0.0016, "reward": 1.8468749523162842, "reward_std": 0.020728906616568565, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.846875011920929, "step": 661 }, { "completion_length": 98.0, "epoch": 0.44133333333333336, "grad_norm": 3.5968657477698596, "kl": 0.06884765625, "learning_rate": 7.793333333333333e-07, "loss": 0.0028, "reward": 1.6786458492279053, "reward_std": 0.15715713798999786, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6786458492279053, "step": 662 }, { "completion_length": 100.40625, "epoch": 0.442, "grad_norm": 4.9349451423636035, "kl": 0.11376953125, "learning_rate": 7.79e-07, "loss": 0.0045, "reward": 1.7713541984558105, "reward_std": 0.11893483251333237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7713541388511658, "step": 663 }, { "completion_length": 97.75, "epoch": 0.44266666666666665, "grad_norm": 3.7078612305655803, "kl": 0.07080078125, "learning_rate": 7.786666666666665e-07, "loss": 0.0028, "reward": 1.7864583730697632, "reward_std": 0.010416671633720398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7864583730697632, "step": 664 }, { "completion_length": 106.90625, "epoch": 0.44333333333333336, "grad_norm": 2.0989080632801627, "kl": 0.08544921875, "learning_rate": 7.783333333333333e-07, "loss": 0.0034, "reward": 1.7100818157196045, "reward_std": 0.09594244509935379, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7100818157196045, "step": 665 }, { "completion_length": 91.96875, "epoch": 0.444, "grad_norm": 5.356403237373664, "kl": 0.05908203125, "learning_rate": 7.78e-07, "loss": 0.0024, "reward": 1.8177083730697632, "reward_std": 0.09838348627090454, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083134651184, "step": 666 }, { "completion_length": 96.625, "epoch": 0.44466666666666665, "grad_norm": 1.725773928833938, "kl": 0.06396484375, "learning_rate": 7.776666666666666e-07, "loss": 0.0025, "reward": 1.921875, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 667 }, { "completion_length": 104.21875, "epoch": 0.44533333333333336, "grad_norm": 0.8194271297703175, "kl": 0.0390625, "learning_rate": 7.773333333333333e-07, "loss": 0.0016, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 668 }, { "completion_length": 95.21875, "epoch": 0.446, "grad_norm": 2.2567852004388342, "kl": 0.044677734375, "learning_rate": 7.77e-07, "loss": 0.0018, "reward": 1.5625, "reward_std": 0.26933756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5625, "step": 669 }, { "completion_length": 106.8125, "epoch": 0.44666666666666666, "grad_norm": 24.635233550092558, "kl": 0.06298828125, "learning_rate": 7.766666666666666e-07, "loss": 0.0025, "reward": 1.9328124523162842, "reward_std": 0.07979864627122879, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.932812511920929, "step": 670 }, { "completion_length": 98.65625, "epoch": 0.44733333333333336, "grad_norm": 1.9748176808864137, "kl": 0.09130859375, "learning_rate": 7.763333333333333e-07, "loss": 0.0037, "reward": 1.9364583492279053, "reward_std": 0.02708333171904087, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9364583492279053, "step": 671 }, { "completion_length": 97.6875, "epoch": 0.448, "grad_norm": 5.8797004237179635, "kl": 0.04736328125, "learning_rate": 7.76e-07, "loss": 0.0019, "reward": 1.818750023841858, "reward_std": 0.012500002980232239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8187500238418579, "step": 672 }, { "completion_length": 107.53125, "epoch": 0.44866666666666666, "grad_norm": 2.161539072580386, "kl": 0.08447265625, "learning_rate": 7.756666666666665e-07, "loss": 0.0034, "reward": 1.8876488208770752, "reward_std": 0.0372023805975914, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.88764888048172, "step": 673 }, { "completion_length": 102.875, "epoch": 0.4493333333333333, "grad_norm": 1.6965647465177007, "kl": 0.048828125, "learning_rate": 7.753333333333333e-07, "loss": 0.002, "reward": 1.7843749523162842, "reward_std": 0.15110903978347778, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.815625011920929, "step": 674 }, { "completion_length": 121.34375, "epoch": 0.45, "grad_norm": 2.3934175821087003, "kl": 0.068359375, "learning_rate": 7.75e-07, "loss": 0.0027, "reward": 1.7085756063461304, "reward_std": 0.10398969054222107, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7398255467414856, "step": 675 }, { "completion_length": 100.09375, "epoch": 0.45066666666666666, "grad_norm": 3.829642373829515, "kl": 0.06884765625, "learning_rate": 7.746666666666666e-07, "loss": 0.0028, "reward": 1.6875, "reward_std": 0.2805021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 676 }, { "completion_length": 86.96875, "epoch": 0.4513333333333333, "grad_norm": 3.9093264664140706, "kl": 0.04541015625, "learning_rate": 7.743333333333332e-07, "loss": 0.0018, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 677 }, { "completion_length": 99.0625, "epoch": 0.452, "grad_norm": 8.777708760917479, "kl": 0.04443359375, "learning_rate": 7.74e-07, "loss": 0.0018, "reward": 1.734375, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 678 }, { "completion_length": 101.15625, "epoch": 0.45266666666666666, "grad_norm": 4.404227778910752, "kl": 0.076171875, "learning_rate": 7.736666666666666e-07, "loss": 0.0031, "reward": 1.8463542461395264, "reward_std": 0.0677083283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8463542461395264, "step": 679 }, { "completion_length": 97.0, "epoch": 0.4533333333333333, "grad_norm": 1.6616885695336665, "kl": 0.068359375, "learning_rate": 7.733333333333333e-07, "loss": 0.0027, "reward": 1.8270833492279053, "reward_std": 0.019570570439100266, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8270833492279053, "step": 680 }, { "completion_length": 98.1875, "epoch": 0.454, "grad_norm": 2.032837443011196, "kl": 0.061279296875, "learning_rate": 7.729999999999999e-07, "loss": 0.0025, "reward": 1.9375, "reward_std": 0.05103103816509247, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 681 }, { "completion_length": 116.1875, "epoch": 0.45466666666666666, "grad_norm": 1.7149619790048147, "kl": 0.05908203125, "learning_rate": 7.726666666666666e-07, "loss": 0.0024, "reward": 1.7950148582458496, "reward_std": 0.10756883025169373, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7950148582458496, "step": 682 }, { "completion_length": 109.0625, "epoch": 0.4553333333333333, "grad_norm": 6.420591464760474, "kl": 0.052001953125, "learning_rate": 7.723333333333333e-07, "loss": 0.0021, "reward": 1.5750000476837158, "reward_std": 0.14298191666603088, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5750000476837158, "step": 683 }, { "completion_length": 107.78125, "epoch": 0.456, "grad_norm": 2.056574416980527, "kl": 0.0634765625, "learning_rate": 7.72e-07, "loss": 0.0025, "reward": 1.7578125, "reward_std": 0.06957196444272995, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7578125, "step": 684 }, { "completion_length": 112.96875, "epoch": 0.45666666666666667, "grad_norm": 4.556717423891207, "kl": 0.06005859375, "learning_rate": 7.716666666666665e-07, "loss": 0.0024, "reward": 1.8776042461395264, "reward_std": 0.18080280721187592, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9088541269302368, "step": 685 }, { "completion_length": 116.625, "epoch": 0.4573333333333333, "grad_norm": 8.814873799747478, "kl": 0.07080078125, "learning_rate": 7.713333333333333e-07, "loss": 0.0028, "reward": 1.8984375, "reward_std": 0.07737711071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8984374403953552, "step": 686 }, { "completion_length": 90.34375, "epoch": 0.458, "grad_norm": 1.355205511108117, "kl": 0.06298828125, "learning_rate": 7.71e-07, "loss": 0.0025, "reward": 1.8958333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 687 }, { "completion_length": 99.6875, "epoch": 0.45866666666666667, "grad_norm": 2.6389131317903427, "kl": 0.0673828125, "learning_rate": 7.706666666666667e-07, "loss": 0.0027, "reward": 1.828125, "reward_std": 0.12075906991958618, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.828125, "step": 688 }, { "completion_length": 109.1875, "epoch": 0.4593333333333333, "grad_norm": 17.34978955619206, "kl": 0.062255859375, "learning_rate": 7.703333333333333e-07, "loss": 0.0025, "reward": 1.5072917938232422, "reward_std": 0.1599823385477066, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5072916746139526, "step": 689 }, { "completion_length": 96.53125, "epoch": 0.46, "grad_norm": 2.680343911188758, "kl": 0.08251953125, "learning_rate": 7.699999999999999e-07, "loss": 0.0033, "reward": 1.7135417461395264, "reward_std": 0.15121470391750336, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7135416269302368, "step": 690 }, { "completion_length": 96.875, "epoch": 0.46066666666666667, "grad_norm": 1.6285882474622775, "kl": 0.050537109375, "learning_rate": 7.696666666666667e-07, "loss": 0.002, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 691 }, { "completion_length": 109.4375, "epoch": 0.4613333333333333, "grad_norm": 3.6724093174343335, "kl": 0.0703125, "learning_rate": 7.693333333333333e-07, "loss": 0.0028, "reward": 1.7654762268066406, "reward_std": 0.04210581257939339, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7654762268066406, "step": 692 }, { "completion_length": 103.28125, "epoch": 0.462, "grad_norm": 2.416736658825126, "kl": 0.06005859375, "learning_rate": 7.69e-07, "loss": 0.0024, "reward": 1.8177083730697632, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083730697632, "step": 693 }, { "completion_length": 94.28125, "epoch": 0.46266666666666667, "grad_norm": 2.75654505974085, "kl": 0.04638671875, "learning_rate": 7.686666666666666e-07, "loss": 0.0019, "reward": 1.765625, "reward_std": 0.15625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.796875, "step": 694 }, { "completion_length": 104.5, "epoch": 0.4633333333333333, "grad_norm": 1.6336410985452174, "kl": 0.068359375, "learning_rate": 7.683333333333333e-07, "loss": 0.0027, "reward": 1.8645833730697632, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833730697632, "step": 695 }, { "completion_length": 102.09375, "epoch": 0.464, "grad_norm": 2.1122416259965138, "kl": 0.068359375, "learning_rate": 7.68e-07, "loss": 0.0027, "reward": 1.8333333730697632, "reward_std": 0.13152486085891724, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8645833134651184, "step": 696 }, { "completion_length": 104.0625, "epoch": 0.4646666666666667, "grad_norm": 2.1250573141156406, "kl": 0.06005859375, "learning_rate": 7.676666666666667e-07, "loss": 0.0024, "reward": 1.9100446701049805, "reward_std": 0.11158294230699539, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9412946701049805, "step": 697 }, { "completion_length": 103.28125, "epoch": 0.4653333333333333, "grad_norm": 2.2017448535210273, "kl": 0.053466796875, "learning_rate": 7.673333333333332e-07, "loss": 0.0021, "reward": 1.8697917461395264, "reward_std": 0.09697292000055313, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8697916865348816, "step": 698 }, { "completion_length": 109.75, "epoch": 0.466, "grad_norm": 1.7947089460286716, "kl": 0.0634765625, "learning_rate": 7.67e-07, "loss": 0.0026, "reward": 1.6770833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6770833134651184, "step": 699 }, { "completion_length": 114.1875, "epoch": 0.4666666666666667, "grad_norm": 3.9414536414367194, "kl": 0.050048828125, "learning_rate": 7.666666666666667e-07, "loss": 0.002, "reward": 1.6927083730697632, "reward_std": 0.1427260935306549, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6927083134651184, "step": 700 }, { "completion_length": 105.59375, "epoch": 0.4673333333333333, "grad_norm": 3.938441707547346, "kl": 0.06689453125, "learning_rate": 7.663333333333333e-07, "loss": 0.0027, "reward": 1.6742277145385742, "reward_std": 0.3547779321670532, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.705477774143219, "step": 701 }, { "completion_length": 103.96875, "epoch": 0.468, "grad_norm": 2.90577442075983, "kl": 0.06591796875, "learning_rate": 7.66e-07, "loss": 0.0026, "reward": 1.883333444595337, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8833333253860474, "step": 702 }, { "completion_length": 110.375, "epoch": 0.4686666666666667, "grad_norm": 2.805422430826044, "kl": 0.046875, "learning_rate": 7.656666666666667e-07, "loss": 0.0019, "reward": 1.6984167098999023, "reward_std": 0.24341771006584167, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7296667098999023, "step": 703 }, { "completion_length": 115.875, "epoch": 0.4693333333333333, "grad_norm": 2.325331324764235, "kl": 0.051025390625, "learning_rate": 7.653333333333333e-07, "loss": 0.002, "reward": 1.6875, "reward_std": 0.375, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.75, "step": 704 }, { "completion_length": 99.28125, "epoch": 0.47, "grad_norm": 3.0524568333382893, "kl": 0.072265625, "learning_rate": 7.65e-07, "loss": 0.0029, "reward": 1.9239583015441895, "reward_std": 0.027083327993750572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9239583611488342, "step": 705 }, { "completion_length": 90.8125, "epoch": 0.4706666666666667, "grad_norm": 0.12725603157059379, "kl": 0.04638671875, "learning_rate": 7.646666666666667e-07, "loss": 0.0019, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 706 }, { "completion_length": 106.65625, "epoch": 0.4713333333333333, "grad_norm": 3.6249025821665937, "kl": 0.06005859375, "learning_rate": 7.643333333333332e-07, "loss": 0.0024, "reward": 1.6749999523162842, "reward_std": 0.11666666716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6749999523162842, "step": 707 }, { "completion_length": 100.71875, "epoch": 0.472, "grad_norm": 4.723388767918392, "kl": 0.06591796875, "learning_rate": 7.64e-07, "loss": 0.0026, "reward": 1.9395833015441895, "reward_std": 0.013819272629916668, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9395833015441895, "step": 708 }, { "completion_length": 91.21875, "epoch": 0.4726666666666667, "grad_norm": 4.078869903586357, "kl": 0.062255859375, "learning_rate": 7.636666666666667e-07, "loss": 0.0025, "reward": 1.8958333730697632, "reward_std": 0.02405625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958332538604736, "step": 709 }, { "completion_length": 109.90625, "epoch": 0.47333333333333333, "grad_norm": 5.099142132808174, "kl": 0.1025390625, "learning_rate": 7.633333333333333e-07, "loss": 0.0041, "reward": 1.7994047403335571, "reward_std": 0.1265389621257782, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7994047403335571, "step": 710 }, { "completion_length": 91.21875, "epoch": 0.474, "grad_norm": 3.1491513611506816, "kl": 0.054931640625, "learning_rate": 7.629999999999999e-07, "loss": 0.0022, "reward": 1.71875, "reward_std": 0.1494225263595581, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 711 }, { "completion_length": 107.84375, "epoch": 0.4746666666666667, "grad_norm": 1.329716784063638, "kl": 0.035888671875, "learning_rate": 7.626666666666667e-07, "loss": 0.0014, "reward": 1.7395833730697632, "reward_std": 0.27900636196136475, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8020833730697632, "step": 712 }, { "completion_length": 95.46875, "epoch": 0.47533333333333333, "grad_norm": 4.908943779981386, "kl": 0.05419921875, "learning_rate": 7.623333333333333e-07, "loss": 0.0022, "reward": 1.8781249523162842, "reward_std": 0.030829086899757385, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.878125011920929, "step": 713 }, { "completion_length": 101.53125, "epoch": 0.476, "grad_norm": 2.2604479625634486, "kl": 0.05712890625, "learning_rate": 7.62e-07, "loss": 0.0023, "reward": 1.8046875, "reward_std": 0.203125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8359375, "step": 714 }, { "completion_length": 108.90625, "epoch": 0.4766666666666667, "grad_norm": 1.9921853829299774, "kl": 0.07763671875, "learning_rate": 7.616666666666666e-07, "loss": 0.0031, "reward": 1.7390625476837158, "reward_std": 0.08437500149011612, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.739062488079071, "step": 715 }, { "completion_length": 104.90625, "epoch": 0.47733333333333333, "grad_norm": 3.737914465585726, "kl": 0.049560546875, "learning_rate": 7.613333333333333e-07, "loss": 0.002, "reward": 1.8020833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8333333134651184, "step": 716 }, { "completion_length": 110.96875, "epoch": 0.478, "grad_norm": 4.11922981426467, "kl": 0.03369140625, "learning_rate": 7.61e-07, "loss": 0.0013, "reward": 1.6041667461395264, "reward_std": 0.14201034605503082, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6041666865348816, "step": 717 }, { "completion_length": 115.3125, "epoch": 0.4786666666666667, "grad_norm": 24.533388895402414, "kl": 0.0703125, "learning_rate": 7.606666666666667e-07, "loss": 0.0028, "reward": 1.8328125476837158, "reward_std": 0.1003689169883728, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.832812488079071, "step": 718 }, { "completion_length": 109.21875, "epoch": 0.47933333333333333, "grad_norm": 3.434783722027455, "kl": 0.09375, "learning_rate": 7.603333333333332e-07, "loss": 0.0038, "reward": 1.7520833015441895, "reward_std": 0.20966878533363342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7520833611488342, "step": 719 }, { "completion_length": 121.5, "epoch": 0.48, "grad_norm": 2.518488288369065, "kl": 0.054443359375, "learning_rate": 7.599999999999999e-07, "loss": 0.0022, "reward": 1.803125023841858, "reward_std": 0.030542198568582535, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8031249642372131, "step": 720 }, { "completion_length": 117.4375, "epoch": 0.4806666666666667, "grad_norm": 5.80654131194134, "kl": 0.0693359375, "learning_rate": 7.596666666666667e-07, "loss": 0.0028, "reward": 1.7294384241104126, "reward_std": 0.30821681022644043, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.760688304901123, "step": 721 }, { "completion_length": 106.875, "epoch": 0.48133333333333334, "grad_norm": 1.4888774558160018, "kl": 0.0654296875, "learning_rate": 7.593333333333333e-07, "loss": 0.0026, "reward": 1.84375, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 722 }, { "completion_length": 97.625, "epoch": 0.482, "grad_norm": 2.8556957047762643, "kl": 0.053955078125, "learning_rate": 7.59e-07, "loss": 0.0022, "reward": 1.8333333730697632, "reward_std": 0.11619479954242706, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 723 }, { "completion_length": 101.0625, "epoch": 0.4826666666666667, "grad_norm": 2.226174629938657, "kl": 0.05078125, "learning_rate": 7.586666666666666e-07, "loss": 0.002, "reward": 1.7395833730697632, "reward_std": 0.33183756470680237, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 724 }, { "completion_length": 111.4375, "epoch": 0.48333333333333334, "grad_norm": 6.338282292529872, "kl": 0.06396484375, "learning_rate": 7.583333333333333e-07, "loss": 0.0026, "reward": 1.6640625, "reward_std": 0.2064596563577652, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6640625, "step": 725 }, { "completion_length": 105.25, "epoch": 0.484, "grad_norm": 4.909538742395873, "kl": 0.06787109375, "learning_rate": 7.58e-07, "loss": 0.0027, "reward": 1.578125, "reward_std": 0.16895677149295807, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.578125, "step": 726 }, { "completion_length": 115.6875, "epoch": 0.4846666666666667, "grad_norm": 3.078954818254768, "kl": 0.043701171875, "learning_rate": 7.576666666666667e-07, "loss": 0.0018, "reward": 1.7958333492279053, "reward_std": 0.22767089307308197, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7958332896232605, "step": 727 }, { "completion_length": 105.1875, "epoch": 0.48533333333333334, "grad_norm": 0.9300845474746853, "kl": 0.06591796875, "learning_rate": 7.573333333333332e-07, "loss": 0.0026, "reward": 1.9375, "reward_std": 0.07216878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.96875, "step": 728 }, { "completion_length": 95.875, "epoch": 0.486, "grad_norm": 4.439117951658907, "kl": 0.05224609375, "learning_rate": 7.57e-07, "loss": 0.0021, "reward": 1.6588542461395264, "reward_std": 0.1508232206106186, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6588541269302368, "step": 729 }, { "completion_length": 93.53125, "epoch": 0.4866666666666667, "grad_norm": 3.109480212444913, "kl": 0.07177734375, "learning_rate": 7.566666666666667e-07, "loss": 0.0029, "reward": 1.8229167461395264, "reward_std": 0.1763354390859604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166269302368, "step": 730 }, { "completion_length": 100.75, "epoch": 0.48733333333333334, "grad_norm": 3.1380356283192006, "kl": 0.05224609375, "learning_rate": 7.563333333333333e-07, "loss": 0.0021, "reward": 1.696056604385376, "reward_std": 0.23378220200538635, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6960565447807312, "step": 731 }, { "completion_length": 98.03125, "epoch": 0.488, "grad_norm": 2.2699755181112358, "kl": 0.0830078125, "learning_rate": 7.559999999999999e-07, "loss": 0.0033, "reward": 1.734375, "reward_std": 0.14508545398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 732 }, { "completion_length": 111.0, "epoch": 0.4886666666666667, "grad_norm": 11.27539363593745, "kl": 0.0556640625, "learning_rate": 7.556666666666667e-07, "loss": 0.0022, "reward": 1.8875000476837158, "reward_std": 0.02916252240538597, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8875000476837158, "step": 733 }, { "completion_length": 117.0, "epoch": 0.48933333333333334, "grad_norm": 2.7612964463824587, "kl": 0.048095703125, "learning_rate": 7.553333333333333e-07, "loss": 0.0019, "reward": 1.7625000476837158, "reward_std": 0.11456207185983658, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.793749988079071, "step": 734 }, { "completion_length": 98.6875, "epoch": 0.49, "grad_norm": 9.87960075264911, "kl": 0.439453125, "learning_rate": 7.55e-07, "loss": 0.0177, "reward": 1.7660714387893677, "reward_std": 0.08301085233688354, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7973214387893677, "step": 735 }, { "completion_length": 100.625, "epoch": 0.49066666666666664, "grad_norm": 1.171087110723512, "kl": 0.04833984375, "learning_rate": 7.546666666666666e-07, "loss": 0.0019, "reward": 1.912500023841858, "reward_std": 0.02041240967810154, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9125000238418579, "step": 736 }, { "completion_length": 87.53125, "epoch": 0.49133333333333334, "grad_norm": 3.0304029605427845, "kl": 0.07080078125, "learning_rate": 7.543333333333332e-07, "loss": 0.0028, "reward": 1.9322917461395264, "reward_std": 0.1090010479092598, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322916865348816, "step": 737 }, { "completion_length": 96.46875, "epoch": 0.492, "grad_norm": 7.127482371111232, "kl": 0.07861328125, "learning_rate": 7.54e-07, "loss": 0.0031, "reward": 1.8932292461395264, "reward_std": 0.1294604390859604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8932291865348816, "step": 738 }, { "completion_length": 106.5, "epoch": 0.49266666666666664, "grad_norm": 2.500827014939632, "kl": 0.11083984375, "learning_rate": 7.536666666666667e-07, "loss": 0.0044, "reward": 1.7869791984558105, "reward_std": 0.06256860494613647, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7869791984558105, "step": 739 }, { "completion_length": 95.28125, "epoch": 0.49333333333333335, "grad_norm": 7.165013378995151, "kl": 0.043212890625, "learning_rate": 7.533333333333332e-07, "loss": 0.0017, "reward": 1.5885417461395264, "reward_std": 0.1336427927017212, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5885416865348816, "step": 740 }, { "completion_length": 98.34375, "epoch": 0.494, "grad_norm": 6.234520551393728, "kl": 0.0791015625, "learning_rate": 7.529999999999999e-07, "loss": 0.0032, "reward": 1.600595235824585, "reward_std": 0.21520373225212097, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.631845235824585, "step": 741 }, { "completion_length": 101.90625, "epoch": 0.49466666666666664, "grad_norm": 3.6017470279585435, "kl": 0.036376953125, "learning_rate": 7.526666666666667e-07, "loss": 0.0015, "reward": 1.7682292461395264, "reward_std": 0.0989583358168602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7682291269302368, "step": 742 }, { "completion_length": 99.75, "epoch": 0.49533333333333335, "grad_norm": 2.465882851421736, "kl": 0.06982421875, "learning_rate": 7.523333333333333e-07, "loss": 0.0028, "reward": 1.7552083730697632, "reward_std": 0.1979166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7552083730697632, "step": 743 }, { "completion_length": 111.03125, "epoch": 0.496, "grad_norm": 1.4978778382570934, "kl": 0.0703125, "learning_rate": 7.52e-07, "loss": 0.0028, "reward": 1.7999999523162842, "reward_std": 0.08415063470602036, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.831250011920929, "step": 744 }, { "completion_length": 110.5625, "epoch": 0.49666666666666665, "grad_norm": 3.3928615850600137, "kl": 0.056396484375, "learning_rate": 7.516666666666666e-07, "loss": 0.0023, "reward": 1.7817708253860474, "reward_std": 0.25918370485305786, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8130208253860474, "step": 745 }, { "completion_length": 113.09375, "epoch": 0.49733333333333335, "grad_norm": 2.3596014275434167, "kl": 0.05224609375, "learning_rate": 7.513333333333333e-07, "loss": 0.0021, "reward": 1.8046875, "reward_std": 0.140625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8359375, "step": 746 }, { "completion_length": 109.875, "epoch": 0.498, "grad_norm": 4.9537434361838875, "kl": 0.054931640625, "learning_rate": 7.51e-07, "loss": 0.0022, "reward": 1.3151042461395264, "reward_std": 0.10480783134698868, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.3151041865348816, "step": 747 }, { "completion_length": 110.375, "epoch": 0.49866666666666665, "grad_norm": 6.508565826136786, "kl": 0.058349609375, "learning_rate": 7.506666666666667e-07, "loss": 0.0023, "reward": 1.6979167461395264, "reward_std": 0.14077475666999817, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7604166269302368, "step": 748 }, { "completion_length": 99.09375, "epoch": 0.49933333333333335, "grad_norm": 2.202032127373168, "kl": 0.0615234375, "learning_rate": 7.503333333333332e-07, "loss": 0.0025, "reward": 1.884374976158142, "reward_std": 0.07737797498703003, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8843749761581421, "step": 749 }, { "completion_length": 98.59375, "epoch": 0.5, "grad_norm": 3.624831488382142, "kl": 0.029052734375, "learning_rate": 7.5e-07, "loss": 0.0012, "reward": 1.7109375, "reward_std": 0.29080551862716675, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7421875, "step": 750 }, { "completion_length": 96.40625, "epoch": 0.5006666666666667, "grad_norm": 4.458979094274732, "kl": 0.0615234375, "learning_rate": 7.496666666666667e-07, "loss": 0.0025, "reward": 1.9031994342803955, "reward_std": 0.05498018115758896, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9031993746757507, "step": 751 }, { "completion_length": 92.65625, "epoch": 0.5013333333333333, "grad_norm": 1.699219628554621, "kl": 0.047119140625, "learning_rate": 7.493333333333333e-07, "loss": 0.0019, "reward": 1.75, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 752 }, { "completion_length": 84.71875, "epoch": 0.502, "grad_norm": 0.13036950405788653, "kl": 0.05517578125, "learning_rate": 7.489999999999999e-07, "loss": 0.0022, "reward": 1.9166667461395264, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9166666865348816, "step": 753 }, { "completion_length": 93.40625, "epoch": 0.5026666666666667, "grad_norm": 2.253207476840251, "kl": 0.058837890625, "learning_rate": 7.486666666666666e-07, "loss": 0.0024, "reward": 1.9583333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333134651184, "step": 754 }, { "completion_length": 94.6875, "epoch": 0.5033333333333333, "grad_norm": 2.704927953272205, "kl": 0.09130859375, "learning_rate": 7.483333333333333e-07, "loss": 0.0037, "reward": 1.7395833730697632, "reward_std": 0.1534588634967804, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333134651184, "step": 755 }, { "completion_length": 108.34375, "epoch": 0.504, "grad_norm": 2.197809773223387, "kl": 0.052734375, "learning_rate": 7.48e-07, "loss": 0.0021, "reward": 1.7708333730697632, "reward_std": 0.1555021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 756 }, { "completion_length": 87.625, "epoch": 0.5046666666666667, "grad_norm": 2.8096233051856547, "kl": 0.0517578125, "learning_rate": 7.476666666666667e-07, "loss": 0.0021, "reward": 1.8645833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8958333730697632, "step": 757 }, { "completion_length": 89.78125, "epoch": 0.5053333333333333, "grad_norm": 1.8430548196585312, "kl": 0.06640625, "learning_rate": 7.473333333333332e-07, "loss": 0.0027, "reward": 1.8541667461395264, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8854166865348816, "step": 758 }, { "completion_length": 102.9375, "epoch": 0.506, "grad_norm": 3.703096895565597, "kl": 0.0654296875, "learning_rate": 7.47e-07, "loss": 0.0026, "reward": 1.7151042222976685, "reward_std": 0.13397973775863647, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7463541030883789, "step": 759 }, { "completion_length": 98.3125, "epoch": 0.5066666666666667, "grad_norm": 1.1097550878903992, "kl": 0.06298828125, "learning_rate": 7.466666666666667e-07, "loss": 0.0025, "reward": 1.984375, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.984375, "step": 760 }, { "completion_length": 100.3125, "epoch": 0.5073333333333333, "grad_norm": 2.117700584068544, "kl": 0.037841796875, "learning_rate": 7.463333333333333e-07, "loss": 0.0015, "reward": 1.78125, "reward_std": 0.07693375647068024, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 761 }, { "completion_length": 103.53125, "epoch": 0.508, "grad_norm": 3.6555890080351423, "kl": 0.06884765625, "learning_rate": 7.459999999999999e-07, "loss": 0.0027, "reward": 1.78125, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 762 }, { "completion_length": 95.375, "epoch": 0.5086666666666667, "grad_norm": 1.8139898218464985, "kl": 0.07177734375, "learning_rate": 7.456666666666667e-07, "loss": 0.0029, "reward": 1.8562500476837158, "reward_std": 0.08291241526603699, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.856249988079071, "step": 763 }, { "completion_length": 107.09375, "epoch": 0.5093333333333333, "grad_norm": 4.469082531075757, "kl": 0.0634765625, "learning_rate": 7.453333333333333e-07, "loss": 0.0025, "reward": 1.7497106790542603, "reward_std": 0.15679530799388885, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7497106790542603, "step": 764 }, { "completion_length": 99.96875, "epoch": 0.51, "grad_norm": 5.862644874893255, "kl": 0.06640625, "learning_rate": 7.45e-07, "loss": 0.0026, "reward": 1.7645833492279053, "reward_std": 0.16275840997695923, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7645833492279053, "step": 765 }, { "completion_length": 94.40625, "epoch": 0.5106666666666667, "grad_norm": 6.645714742784713, "kl": 0.0908203125, "learning_rate": 7.446666666666666e-07, "loss": 0.0036, "reward": 1.765625, "reward_std": 0.2492521107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.796875, "step": 766 }, { "completion_length": 113.65625, "epoch": 0.5113333333333333, "grad_norm": 1.5082705846023334, "kl": 0.07763671875, "learning_rate": 7.443333333333332e-07, "loss": 0.0031, "reward": 1.8794642686843872, "reward_std": 0.11913228034973145, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8794642686843872, "step": 767 }, { "completion_length": 96.71875, "epoch": 0.512, "grad_norm": 5.037121135191664, "kl": 0.052490234375, "learning_rate": 7.44e-07, "loss": 0.0021, "reward": 1.734375, "reward_std": 0.1979166716337204, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7968749403953552, "step": 768 }, { "completion_length": 109.125, "epoch": 0.5126666666666667, "grad_norm": 3.8368275215212364, "kl": 0.0830078125, "learning_rate": 7.436666666666667e-07, "loss": 0.0033, "reward": 1.5458333492279053, "reward_std": 0.18399296700954437, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6083333492279053, "step": 769 }, { "completion_length": 95.03125, "epoch": 0.5133333333333333, "grad_norm": 3.7999526289661727, "kl": 0.076171875, "learning_rate": 7.433333333333332e-07, "loss": 0.003, "reward": 1.8959821462631226, "reward_std": 0.09905412793159485, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8959821462631226, "step": 770 }, { "completion_length": 97.59375, "epoch": 0.514, "grad_norm": 3.134137041107172, "kl": 0.0238037109375, "learning_rate": 7.429999999999999e-07, "loss": 0.0009, "reward": 1.59375, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.59375, "step": 771 }, { "completion_length": 93.90625, "epoch": 0.5146666666666667, "grad_norm": 6.885413581477516, "kl": 0.07568359375, "learning_rate": 7.426666666666667e-07, "loss": 0.003, "reward": 1.8229167461395264, "reward_std": 0.23443284630775452, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166269302368, "step": 772 }, { "completion_length": 86.5, "epoch": 0.5153333333333333, "grad_norm": 2.277041507307254, "kl": 0.06298828125, "learning_rate": 7.423333333333333e-07, "loss": 0.0025, "reward": 1.6380208730697632, "reward_std": 0.19678336381912231, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7005208730697632, "step": 773 }, { "completion_length": 90.5, "epoch": 0.516, "grad_norm": 2.499959391949406, "kl": 0.080078125, "learning_rate": 7.42e-07, "loss": 0.0032, "reward": 1.8046875, "reward_std": 0.08779378235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8359375, "step": 774 }, { "completion_length": 104.96875, "epoch": 0.5166666666666667, "grad_norm": 5.3471430492102705, "kl": 0.0634765625, "learning_rate": 7.416666666666666e-07, "loss": 0.0025, "reward": 1.7916667461395264, "reward_std": 0.2499999850988388, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8229166865348816, "step": 775 }, { "completion_length": 99.6875, "epoch": 0.5173333333333333, "grad_norm": 2.5383376654116745, "kl": 0.04443359375, "learning_rate": 7.413333333333333e-07, "loss": 0.0018, "reward": 1.8489583730697632, "reward_std": 0.11921682208776474, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583730697632, "step": 776 }, { "completion_length": 89.46875, "epoch": 0.518, "grad_norm": 1.7234449071337439, "kl": 0.05517578125, "learning_rate": 7.41e-07, "loss": 0.0022, "reward": 1.78125, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 777 }, { "completion_length": 82.1875, "epoch": 0.5186666666666667, "grad_norm": 2.0886337198432456, "kl": 0.07958984375, "learning_rate": 7.406666666666667e-07, "loss": 0.0032, "reward": 1.90625, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 778 }, { "completion_length": 91.8125, "epoch": 0.5193333333333333, "grad_norm": 4.5933978666777895, "kl": 0.05029296875, "learning_rate": 7.403333333333332e-07, "loss": 0.002, "reward": 1.8008928298950195, "reward_std": 0.10420062392950058, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8008928894996643, "step": 779 }, { "completion_length": 90.1875, "epoch": 0.52, "grad_norm": 4.050414687023766, "kl": 0.040283203125, "learning_rate": 7.4e-07, "loss": 0.0016, "reward": 1.8002976179122925, "reward_std": 0.1640157401561737, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8002976179122925, "step": 780 }, { "completion_length": 90.28125, "epoch": 0.5206666666666667, "grad_norm": 2.2879761037054553, "kl": 0.060791015625, "learning_rate": 7.396666666666667e-07, "loss": 0.0024, "reward": 1.703125, "reward_std": 0.15625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.734375, "step": 781 }, { "completion_length": 106.28125, "epoch": 0.5213333333333333, "grad_norm": 6.602897693001174, "kl": 0.091796875, "learning_rate": 7.393333333333333e-07, "loss": 0.0037, "reward": 1.7181919813156128, "reward_std": 0.014597686007618904, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7181919813156128, "step": 782 }, { "completion_length": 87.125, "epoch": 0.522, "grad_norm": 4.078808710363924, "kl": 0.06884765625, "learning_rate": 7.389999999999999e-07, "loss": 0.0028, "reward": 1.953125, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.953125, "step": 783 }, { "completion_length": 108.34375, "epoch": 0.5226666666666666, "grad_norm": 5.385637433170453, "kl": 0.0634765625, "learning_rate": 7.386666666666666e-07, "loss": 0.0025, "reward": 1.7239583730697632, "reward_std": 0.20264878869056702, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7552083730697632, "step": 784 }, { "completion_length": 91.0625, "epoch": 0.5233333333333333, "grad_norm": 2.161885294564776, "kl": 0.0458984375, "learning_rate": 7.383333333333333e-07, "loss": 0.0018, "reward": 1.9010417461395264, "reward_std": 0.031249988824129105, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9010416269302368, "step": 785 }, { "completion_length": 98.28125, "epoch": 0.524, "grad_norm": 15.347799051828193, "kl": 0.06787109375, "learning_rate": 7.38e-07, "loss": 0.0027, "reward": 1.683333396911621, "reward_std": 0.15303795039653778, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6833333373069763, "step": 786 }, { "completion_length": 93.15625, "epoch": 0.5246666666666666, "grad_norm": 2.1389806231236426, "kl": 0.061279296875, "learning_rate": 7.376666666666666e-07, "loss": 0.0025, "reward": 1.8645833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8958333134651184, "step": 787 }, { "completion_length": 102.75, "epoch": 0.5253333333333333, "grad_norm": 0.5137615399819253, "kl": 0.058837890625, "learning_rate": 7.373333333333332e-07, "loss": 0.0024, "reward": 1.6458333730697632, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6770833134651184, "step": 788 }, { "completion_length": 101.0, "epoch": 0.526, "grad_norm": 2.614400113749134, "kl": 0.0595703125, "learning_rate": 7.37e-07, "loss": 0.0024, "reward": 1.5834821462631226, "reward_std": 0.1444183588027954, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5834821462631226, "step": 789 }, { "completion_length": 109.59375, "epoch": 0.5266666666666666, "grad_norm": 2.5021977170857577, "kl": 0.05224609375, "learning_rate": 7.366666666666667e-07, "loss": 0.0021, "reward": 1.7083333730697632, "reward_std": 0.2805021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7395833730697632, "step": 790 }, { "completion_length": 91.90625, "epoch": 0.5273333333333333, "grad_norm": 3.4649247834612544, "kl": 0.0732421875, "learning_rate": 7.363333333333332e-07, "loss": 0.0029, "reward": 1.8603050708770752, "reward_std": 0.10382197797298431, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8603050708770752, "step": 791 }, { "completion_length": 115.0625, "epoch": 0.528, "grad_norm": 3.7475261481303717, "kl": 0.08251953125, "learning_rate": 7.359999999999999e-07, "loss": 0.0033, "reward": 1.5479166507720947, "reward_std": 0.3147531747817993, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5791666507720947, "step": 792 }, { "completion_length": 98.875, "epoch": 0.5286666666666666, "grad_norm": 2.513846249203066, "kl": 0.06298828125, "learning_rate": 7.356666666666667e-07, "loss": 0.0025, "reward": 1.7395833730697632, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 793 }, { "completion_length": 90.1875, "epoch": 0.5293333333333333, "grad_norm": 4.713201460906924, "kl": 0.068359375, "learning_rate": 7.353333333333333e-07, "loss": 0.0027, "reward": 1.7666666507720947, "reward_std": 0.2571783661842346, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8291667103767395, "step": 794 }, { "completion_length": 100.25, "epoch": 0.53, "grad_norm": 3.061892614431919, "kl": 0.06396484375, "learning_rate": 7.35e-07, "loss": 0.0026, "reward": 1.5130207538604736, "reward_std": 0.17897579073905945, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.5755208730697632, "step": 795 }, { "completion_length": 87.1875, "epoch": 0.5306666666666666, "grad_norm": 1.5988351557019245, "kl": 0.0556640625, "learning_rate": 7.346666666666666e-07, "loss": 0.0022, "reward": 1.8828125, "reward_std": 0.0924195945262909, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8828125, "step": 796 }, { "completion_length": 98.0, "epoch": 0.5313333333333333, "grad_norm": 2.3230050435988066, "kl": 0.09716796875, "learning_rate": 7.343333333333332e-07, "loss": 0.0039, "reward": 1.7552083730697632, "reward_std": 0.28559717535972595, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7864583134651184, "step": 797 }, { "completion_length": 90.96875, "epoch": 0.532, "grad_norm": 9.45198753308534, "kl": 0.06201171875, "learning_rate": 7.34e-07, "loss": 0.0025, "reward": 1.84375, "reward_std": 0.25966876745224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 798 }, { "completion_length": 86.96875, "epoch": 0.5326666666666666, "grad_norm": 2.9081676765926137, "kl": 0.05908203125, "learning_rate": 7.336666666666667e-07, "loss": 0.0024, "reward": 1.834375023841858, "reward_std": 0.08125000447034836, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8343750238418579, "step": 799 }, { "completion_length": 107.96875, "epoch": 0.5333333333333333, "grad_norm": 3.3177681198563858, "kl": 0.06787109375, "learning_rate": 7.333333333333332e-07, "loss": 0.0027, "reward": 1.850000023841858, "reward_std": 0.22957530617713928, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8812500238418579, "step": 800 }, { "completion_length": 82.3125, "epoch": 0.534, "grad_norm": 6.1246191522909665, "kl": 0.0771484375, "learning_rate": 7.329999999999999e-07, "loss": 0.0031, "reward": 1.5625, "reward_std": 0.27169692516326904, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.59375, "step": 801 }, { "completion_length": 92.28125, "epoch": 0.5346666666666666, "grad_norm": 7.042623731944093, "kl": 0.051513671875, "learning_rate": 7.326666666666667e-07, "loss": 0.0021, "reward": 1.7447917461395264, "reward_std": 0.18527019023895264, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7760416865348816, "step": 802 }, { "completion_length": 83.28125, "epoch": 0.5353333333333333, "grad_norm": 6.650548533250386, "kl": 0.095703125, "learning_rate": 7.323333333333333e-07, "loss": 0.0038, "reward": 1.7687499523162842, "reward_std": 0.05610043182969093, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7687500715255737, "step": 803 }, { "completion_length": 92.0625, "epoch": 0.536, "grad_norm": 17.86383752635268, "kl": 0.05029296875, "learning_rate": 7.319999999999999e-07, "loss": 0.002, "reward": 1.7734375, "reward_std": 0.40661248564720154, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8359375, "step": 804 }, { "completion_length": 90.6875, "epoch": 0.5366666666666666, "grad_norm": 14.41308935166233, "kl": 0.0771484375, "learning_rate": 7.316666666666666e-07, "loss": 0.0031, "reward": 1.9036458730697632, "reward_std": 0.15109659731388092, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9036458730697632, "step": 805 }, { "completion_length": 88.3125, "epoch": 0.5373333333333333, "grad_norm": 1.5399924888157182, "kl": 0.047119140625, "learning_rate": 7.313333333333333e-07, "loss": 0.0019, "reward": 1.8979166746139526, "reward_std": 0.08655625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8979166746139526, "step": 806 }, { "completion_length": 95.5, "epoch": 0.538, "grad_norm": 19.844219012405762, "kl": 0.0966796875, "learning_rate": 7.31e-07, "loss": 0.0039, "reward": 1.7338542938232422, "reward_std": 0.1196601539850235, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7338541746139526, "step": 807 }, { "completion_length": 96.8125, "epoch": 0.5386666666666666, "grad_norm": 3.9039855541064847, "kl": 0.05419921875, "learning_rate": 7.306666666666666e-07, "loss": 0.0022, "reward": 1.8333333730697632, "reward_std": 0.3333333134651184, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8958333134651184, "step": 808 }, { "completion_length": 95.375, "epoch": 0.5393333333333333, "grad_norm": 5.097681284115407, "kl": 0.0908203125, "learning_rate": 7.303333333333332e-07, "loss": 0.0036, "reward": 1.816145896911621, "reward_std": 0.10501229017972946, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8161458373069763, "step": 809 }, { "completion_length": 89.8125, "epoch": 0.54, "grad_norm": 2.4876888627634, "kl": 0.0576171875, "learning_rate": 7.3e-07, "loss": 0.0023, "reward": 1.875, "reward_std": 0.2273927927017212, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 810 }, { "completion_length": 80.03125, "epoch": 0.5406666666666666, "grad_norm": 5.316649876551583, "kl": 0.08984375, "learning_rate": 7.296666666666667e-07, "loss": 0.0036, "reward": 1.90625, "reward_std": 0.1194177195429802, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 811 }, { "completion_length": 96.4375, "epoch": 0.5413333333333333, "grad_norm": 4.849663776087787, "kl": 0.087890625, "learning_rate": 7.293333333333332e-07, "loss": 0.0035, "reward": 1.6651041507720947, "reward_std": 0.18571044504642487, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6963541507720947, "step": 812 }, { "completion_length": 100.5625, "epoch": 0.542, "grad_norm": 3.76856713549597, "kl": 0.08837890625, "learning_rate": 7.289999999999999e-07, "loss": 0.0035, "reward": 1.6140625476837158, "reward_std": 0.08017023652791977, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6140625476837158, "step": 813 }, { "completion_length": 90.875, "epoch": 0.5426666666666666, "grad_norm": 2.9952666719371326, "kl": 0.07470703125, "learning_rate": 7.286666666666666e-07, "loss": 0.003, "reward": 1.9505208730697632, "reward_std": 0.04377468302845955, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9505208730697632, "step": 814 }, { "completion_length": 77.25, "epoch": 0.5433333333333333, "grad_norm": 0.11008193436209421, "kl": 0.051513671875, "learning_rate": 7.283333333333334e-07, "loss": 0.0021, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 815 }, { "completion_length": 84.4375, "epoch": 0.544, "grad_norm": 1.8204695773624977, "kl": 0.06640625, "learning_rate": 7.28e-07, "loss": 0.0026, "reward": 1.8442420959472656, "reward_std": 0.18651574850082397, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8754920959472656, "step": 816 }, { "completion_length": 86.875, "epoch": 0.5446666666666666, "grad_norm": 4.901392909363632, "kl": 0.0693359375, "learning_rate": 7.276666666666666e-07, "loss": 0.0028, "reward": 1.859002947807312, "reward_std": 0.0944940447807312, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859002947807312, "step": 817 }, { "completion_length": 101.25, "epoch": 0.5453333333333333, "grad_norm": 3.415345307771005, "kl": 0.052978515625, "learning_rate": 7.273333333333333e-07, "loss": 0.0021, "reward": 1.7046875953674316, "reward_std": 0.17746803164482117, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7046874761581421, "step": 818 }, { "completion_length": 87.9375, "epoch": 0.546, "grad_norm": 6.620120712008824, "kl": 0.115234375, "learning_rate": 7.27e-07, "loss": 0.0046, "reward": 1.7958333492279053, "reward_std": 0.1798742562532425, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7958333492279053, "step": 819 }, { "completion_length": 87.46875, "epoch": 0.5466666666666666, "grad_norm": 3.2957686019894084, "kl": 0.08349609375, "learning_rate": 7.266666666666667e-07, "loss": 0.0033, "reward": 1.7239583730697632, "reward_std": 0.16033649444580078, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 820 }, { "completion_length": 92.875, "epoch": 0.5473333333333333, "grad_norm": 6.014120773061448, "kl": 0.0771484375, "learning_rate": 7.263333333333333e-07, "loss": 0.0031, "reward": 1.8010417222976685, "reward_std": 0.07772792130708694, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8010416626930237, "step": 821 }, { "completion_length": 93.09375, "epoch": 0.548, "grad_norm": 2.604031358969273, "kl": 0.051025390625, "learning_rate": 7.259999999999999e-07, "loss": 0.002, "reward": 1.65625, "reward_std": 0.1458333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.65625, "step": 822 }, { "completion_length": 84.4375, "epoch": 0.5486666666666666, "grad_norm": 4.34368219661778, "kl": 0.0654296875, "learning_rate": 7.256666666666667e-07, "loss": 0.0026, "reward": 1.7395833730697632, "reward_std": 0.15625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 823 }, { "completion_length": 89.8125, "epoch": 0.5493333333333333, "grad_norm": 2.919825403477114, "kl": 0.09130859375, "learning_rate": 7.253333333333334e-07, "loss": 0.0037, "reward": 1.751562476158142, "reward_std": 0.16197282075881958, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7828124761581421, "step": 824 }, { "completion_length": 93.125, "epoch": 0.55, "grad_norm": 3.132816917447675, "kl": 0.06884765625, "learning_rate": 7.249999999999999e-07, "loss": 0.0027, "reward": 1.8958333730697632, "reward_std": 0.1666666567325592, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9270833730697632, "step": 825 }, { "completion_length": 89.6875, "epoch": 0.5506666666666666, "grad_norm": 2.421250124456766, "kl": 0.03955078125, "learning_rate": 7.246666666666666e-07, "loss": 0.0016, "reward": 1.649999976158142, "reward_std": 0.10550211369991302, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6500000953674316, "step": 826 }, { "completion_length": 85.09375, "epoch": 0.5513333333333333, "grad_norm": 2.5003881082716175, "kl": 0.025390625, "learning_rate": 7.243333333333334e-07, "loss": 0.001, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 827 }, { "completion_length": 85.71875, "epoch": 0.552, "grad_norm": 3.386123872049028, "kl": 0.06298828125, "learning_rate": 7.24e-07, "loss": 0.0025, "reward": 1.9635417461395264, "reward_std": 0.026214702054858208, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9635416865348816, "step": 828 }, { "completion_length": 75.875, "epoch": 0.5526666666666666, "grad_norm": 2.0459152718786275, "kl": 0.08544921875, "learning_rate": 7.236666666666666e-07, "loss": 0.0034, "reward": 1.696874976158142, "reward_std": 0.006250003818422556, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6968750357627869, "step": 829 }, { "completion_length": 103.5, "epoch": 0.5533333333333333, "grad_norm": 1.666300976543401, "kl": 0.047119140625, "learning_rate": 7.233333333333333e-07, "loss": 0.0019, "reward": 1.96875, "reward_std": 0.03608439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 830 }, { "completion_length": 92.65625, "epoch": 0.554, "grad_norm": 13.23886744673945, "kl": 0.07421875, "learning_rate": 7.229999999999999e-07, "loss": 0.003, "reward": 1.730208396911621, "reward_std": 0.2577306926250458, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7302082777023315, "step": 831 }, { "completion_length": 96.4375, "epoch": 0.5546666666666666, "grad_norm": 1.5060305174339466, "kl": 0.060302734375, "learning_rate": 7.226666666666667e-07, "loss": 0.0024, "reward": 1.7864583730697632, "reward_std": 0.010416671633720398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7864583730697632, "step": 832 }, { "completion_length": 95.40625, "epoch": 0.5553333333333333, "grad_norm": 4.952882841290324, "kl": 0.06982421875, "learning_rate": 7.223333333333334e-07, "loss": 0.0028, "reward": 1.8333333730697632, "reward_std": 0.2180021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8645833730697632, "step": 833 }, { "completion_length": 106.3125, "epoch": 0.556, "grad_norm": 4.179778282515002, "kl": 0.1044921875, "learning_rate": 7.219999999999999e-07, "loss": 0.0042, "reward": 1.724367618560791, "reward_std": 0.19583511352539062, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7243675589561462, "step": 834 }, { "completion_length": 100.1875, "epoch": 0.5566666666666666, "grad_norm": 8.048044961487731, "kl": 0.07177734375, "learning_rate": 7.216666666666666e-07, "loss": 0.0029, "reward": 1.6479166746139526, "reward_std": 0.26503169536590576, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6479166746139526, "step": 835 }, { "completion_length": 88.8125, "epoch": 0.5573333333333333, "grad_norm": 2.508325256396268, "kl": 0.06640625, "learning_rate": 7.213333333333334e-07, "loss": 0.0027, "reward": 1.96875, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 836 }, { "completion_length": 95.0625, "epoch": 0.558, "grad_norm": 7.082447836756185, "kl": 0.062255859375, "learning_rate": 7.21e-07, "loss": 0.0025, "reward": 1.7166666984558105, "reward_std": 0.1242058277130127, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7166666984558105, "step": 837 }, { "completion_length": 109.1875, "epoch": 0.5586666666666666, "grad_norm": 3.9626535416626067, "kl": 0.05322265625, "learning_rate": 7.206666666666666e-07, "loss": 0.0021, "reward": 1.7859375476837158, "reward_std": 0.20877143740653992, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.785937488079071, "step": 838 }, { "completion_length": 113.21875, "epoch": 0.5593333333333333, "grad_norm": 1.5628199186523943, "kl": 0.0810546875, "learning_rate": 7.203333333333333e-07, "loss": 0.0032, "reward": 1.6875, "reward_std": 0.20683756470680237, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.71875, "step": 839 }, { "completion_length": 97.15625, "epoch": 0.56, "grad_norm": 2.467472485159238, "kl": 0.07568359375, "learning_rate": 7.2e-07, "loss": 0.003, "reward": 1.9197916984558105, "reward_std": 0.07708334177732468, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9197916388511658, "step": 840 }, { "completion_length": 104.46875, "epoch": 0.5606666666666666, "grad_norm": 1.6169328551216013, "kl": 0.0703125, "learning_rate": 7.196666666666667e-07, "loss": 0.0028, "reward": 1.7476823329925537, "reward_std": 0.14903804659843445, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7789323925971985, "step": 841 }, { "completion_length": 98.40625, "epoch": 0.5613333333333334, "grad_norm": 1.0228869305408108, "kl": 0.05517578125, "learning_rate": 7.193333333333333e-07, "loss": 0.0022, "reward": 1.8645833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833134651184, "step": 842 }, { "completion_length": 92.75, "epoch": 0.562, "grad_norm": 2.78709404387182, "kl": 0.06396484375, "learning_rate": 7.189999999999999e-07, "loss": 0.0026, "reward": 1.8381695747375488, "reward_std": 0.024173453450202942, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8381696343421936, "step": 843 }, { "completion_length": 100.0625, "epoch": 0.5626666666666666, "grad_norm": 2.4351038817767527, "kl": 0.06591796875, "learning_rate": 7.186666666666667e-07, "loss": 0.0026, "reward": 1.921875, "reward_std": 0.05681329220533371, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 844 }, { "completion_length": 107.21875, "epoch": 0.5633333333333334, "grad_norm": 4.672112617841763, "kl": 0.091796875, "learning_rate": 7.183333333333334e-07, "loss": 0.0037, "reward": 1.640625, "reward_std": 0.26039621233940125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6718750596046448, "step": 845 }, { "completion_length": 96.96875, "epoch": 0.564, "grad_norm": 1.6055253504163038, "kl": 0.040771484375, "learning_rate": 7.179999999999999e-07, "loss": 0.0016, "reward": 1.921875, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 846 }, { "completion_length": 104.15625, "epoch": 0.5646666666666667, "grad_norm": 6.499313033315428, "kl": 0.057861328125, "learning_rate": 7.176666666666666e-07, "loss": 0.0023, "reward": 1.7463542222976685, "reward_std": 0.05748751387000084, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7463541626930237, "step": 847 }, { "completion_length": 99.15625, "epoch": 0.5653333333333334, "grad_norm": 5.593157857810672, "kl": 0.0810546875, "learning_rate": 7.173333333333333e-07, "loss": 0.0032, "reward": 1.7869791984558105, "reward_std": 0.2041110396385193, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8182291388511658, "step": 848 }, { "completion_length": 119.0, "epoch": 0.566, "grad_norm": 4.060906055197428, "kl": 0.06640625, "learning_rate": 7.17e-07, "loss": 0.0027, "reward": 1.6750000715255737, "reward_std": 0.16688883304595947, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7687499523162842, "step": 849 }, { "completion_length": 91.0625, "epoch": 0.5666666666666667, "grad_norm": 2.739417674331969, "kl": 0.10693359375, "learning_rate": 7.166666666666667e-07, "loss": 0.0043, "reward": 1.6354167461395264, "reward_std": 0.2933938205242157, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6979166269302368, "step": 850 }, { "completion_length": 102.40625, "epoch": 0.5673333333333334, "grad_norm": 2.535379262254511, "kl": 0.06982421875, "learning_rate": 7.163333333333333e-07, "loss": 0.0028, "reward": 1.7546875476837158, "reward_std": 0.2292420119047165, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7859375476837158, "step": 851 }, { "completion_length": 94.625, "epoch": 0.568, "grad_norm": 5.462940738184966, "kl": 0.11181640625, "learning_rate": 7.159999999999999e-07, "loss": 0.0045, "reward": 1.7218749523162842, "reward_std": 0.17291666567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.721875011920929, "step": 852 }, { "completion_length": 87.96875, "epoch": 0.5686666666666667, "grad_norm": 2.566410729710649, "kl": 0.08544921875, "learning_rate": 7.156666666666667e-07, "loss": 0.0034, "reward": 1.7470238208770752, "reward_std": 0.06741928309202194, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.74702388048172, "step": 853 }, { "completion_length": 110.4375, "epoch": 0.5693333333333334, "grad_norm": 48.172280412601026, "kl": 0.09375, "learning_rate": 7.153333333333334e-07, "loss": 0.0037, "reward": 1.8484375476837158, "reward_std": 0.14871002733707428, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8796875476837158, "step": 854 }, { "completion_length": 104.96875, "epoch": 0.57, "grad_norm": 6.317361109334244, "kl": 0.083984375, "learning_rate": 7.149999999999999e-07, "loss": 0.0034, "reward": 1.5692708492279053, "reward_std": 0.18849249184131622, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6005208492279053, "step": 855 }, { "completion_length": 101.78125, "epoch": 0.5706666666666667, "grad_norm": 3.683899306962854, "kl": 0.0732421875, "learning_rate": 7.146666666666666e-07, "loss": 0.0029, "reward": 1.5989583730697632, "reward_std": 0.14508545398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5989583730697632, "step": 856 }, { "completion_length": 96.40625, "epoch": 0.5713333333333334, "grad_norm": 6.034554694679321, "kl": 0.07568359375, "learning_rate": 7.143333333333334e-07, "loss": 0.003, "reward": 1.7578125, "reward_std": 0.046875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7578125, "step": 857 }, { "completion_length": 95.0, "epoch": 0.572, "grad_norm": 0.26704780932459077, "kl": 0.0810546875, "learning_rate": 7.14e-07, "loss": 0.0032, "reward": 1.84375, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 858 }, { "completion_length": 103.6875, "epoch": 0.5726666666666667, "grad_norm": 1.6425154204860077, "kl": 0.03515625, "learning_rate": 7.136666666666666e-07, "loss": 0.0014, "reward": 1.8333333730697632, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 859 }, { "completion_length": 84.8125, "epoch": 0.5733333333333334, "grad_norm": 3.168986696018615, "kl": 0.08544921875, "learning_rate": 7.133333333333333e-07, "loss": 0.0034, "reward": 1.8229167461395264, "reward_std": 0.2898927927017212, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8541666865348816, "step": 860 }, { "completion_length": 97.25, "epoch": 0.574, "grad_norm": 4.022449835951297, "kl": 0.0712890625, "learning_rate": 7.129999999999999e-07, "loss": 0.0028, "reward": 1.8229167461395264, "reward_std": 0.17633545398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166865348816, "step": 861 }, { "completion_length": 98.78125, "epoch": 0.5746666666666667, "grad_norm": 2.9271978934714373, "kl": 0.076171875, "learning_rate": 7.126666666666667e-07, "loss": 0.0031, "reward": 1.7395833730697632, "reward_std": 0.05746470391750336, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 862 }, { "completion_length": 91.5, "epoch": 0.5753333333333334, "grad_norm": 4.203144135550193, "kl": 0.1005859375, "learning_rate": 7.123333333333333e-07, "loss": 0.004, "reward": 1.9083333015441895, "reward_std": 0.07631926983594894, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9083333015441895, "step": 863 }, { "completion_length": 102.03125, "epoch": 0.576, "grad_norm": 3.083058203178422, "kl": 0.076171875, "learning_rate": 7.119999999999999e-07, "loss": 0.003, "reward": 1.8098958730697632, "reward_std": 0.171875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8098958730697632, "step": 864 }, { "completion_length": 99.90625, "epoch": 0.5766666666666667, "grad_norm": 7.842765243760025, "kl": 0.05419921875, "learning_rate": 7.116666666666666e-07, "loss": 0.0022, "reward": 1.7421875, "reward_std": 0.1770426332950592, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8046875, "step": 865 }, { "completion_length": 93.53125, "epoch": 0.5773333333333334, "grad_norm": 6.658654493433317, "kl": 0.06787109375, "learning_rate": 7.113333333333334e-07, "loss": 0.0027, "reward": 1.8645833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833134651184, "step": 866 }, { "completion_length": 89.84375, "epoch": 0.578, "grad_norm": 4.211387146096345, "kl": 0.09619140625, "learning_rate": 7.11e-07, "loss": 0.0038, "reward": 1.7578125, "reward_std": 0.17977949976921082, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7578125, "step": 867 }, { "completion_length": 93.65625, "epoch": 0.5786666666666667, "grad_norm": 2.1147264010611844, "kl": 0.08544921875, "learning_rate": 7.106666666666666e-07, "loss": 0.0034, "reward": 1.7291667461395264, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7291666865348816, "step": 868 }, { "completion_length": 105.8125, "epoch": 0.5793333333333334, "grad_norm": 2.849396928201652, "kl": 0.058837890625, "learning_rate": 7.103333333333333e-07, "loss": 0.0024, "reward": 1.734375, "reward_std": 0.3049485683441162, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7968750596046448, "step": 869 }, { "completion_length": 90.3125, "epoch": 0.58, "grad_norm": 3.7438732959508023, "kl": 0.07958984375, "learning_rate": 7.1e-07, "loss": 0.0032, "reward": 1.796875, "reward_std": 0.15625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.828125, "step": 870 }, { "completion_length": 90.0625, "epoch": 0.5806666666666667, "grad_norm": 0.11997112188491844, "kl": 0.08447265625, "learning_rate": 7.096666666666667e-07, "loss": 0.0034, "reward": 1.9249999523162842, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.925000011920929, "step": 871 }, { "completion_length": 105.125, "epoch": 0.5813333333333334, "grad_norm": 4.452265469724954, "kl": 0.07763671875, "learning_rate": 7.093333333333333e-07, "loss": 0.0031, "reward": 1.7395832538604736, "reward_std": 0.1666666716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833134651184, "step": 872 }, { "completion_length": 102.09375, "epoch": 0.582, "grad_norm": 3.1406831585351864, "kl": 0.07421875, "learning_rate": 7.089999999999999e-07, "loss": 0.003, "reward": 1.7291667461395264, "reward_std": 0.06985540688037872, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7291666865348816, "step": 873 }, { "completion_length": 88.375, "epoch": 0.5826666666666667, "grad_norm": 0.10281829436938549, "kl": 0.04833984375, "learning_rate": 7.086666666666667e-07, "loss": 0.0019, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 874 }, { "completion_length": 96.03125, "epoch": 0.5833333333333334, "grad_norm": 2.141640039281006, "kl": 0.08447265625, "learning_rate": 7.083333333333334e-07, "loss": 0.0034, "reward": 1.797247052192688, "reward_std": 0.02864699810743332, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7972469329833984, "step": 875 }, { "completion_length": 88.25, "epoch": 0.584, "grad_norm": 1.3538488906249366, "kl": 0.045654296875, "learning_rate": 7.079999999999999e-07, "loss": 0.0018, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 876 }, { "completion_length": 100.75, "epoch": 0.5846666666666667, "grad_norm": 7.200905026060537, "kl": 0.07470703125, "learning_rate": 7.076666666666666e-07, "loss": 0.003, "reward": 1.792708396911621, "reward_std": 0.08360882103443146, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7927083373069763, "step": 877 }, { "completion_length": 91.3125, "epoch": 0.5853333333333334, "grad_norm": 1.8596067402770937, "kl": 0.057861328125, "learning_rate": 7.073333333333333e-07, "loss": 0.0023, "reward": 1.7916667461395264, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666269302368, "step": 878 }, { "completion_length": 89.28125, "epoch": 0.586, "grad_norm": 1.0698622078028144, "kl": 0.06982421875, "learning_rate": 7.07e-07, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 879 }, { "completion_length": 102.5, "epoch": 0.5866666666666667, "grad_norm": 3.1387058614374026, "kl": 0.06201171875, "learning_rate": 7.066666666666666e-07, "loss": 0.0025, "reward": 1.8390624523162842, "reward_std": 0.13437500596046448, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.839062511920929, "step": 880 }, { "completion_length": 96.09375, "epoch": 0.5873333333333334, "grad_norm": 1.5222866302243128, "kl": 0.08203125, "learning_rate": 7.063333333333333e-07, "loss": 0.0033, "reward": 1.834375023841858, "reward_std": 0.06875000149011612, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8343750238418579, "step": 881 }, { "completion_length": 85.03125, "epoch": 0.588, "grad_norm": 1.9066177917694374, "kl": 0.083984375, "learning_rate": 7.059999999999999e-07, "loss": 0.0034, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 882 }, { "completion_length": 100.125, "epoch": 0.5886666666666667, "grad_norm": 2.8268711333542633, "kl": 0.083984375, "learning_rate": 7.056666666666667e-07, "loss": 0.0034, "reward": 1.71875, "reward_std": 0.1194177195429802, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 883 }, { "completion_length": 96.21875, "epoch": 0.5893333333333334, "grad_norm": 2.607625595906785, "kl": 0.05908203125, "learning_rate": 7.053333333333333e-07, "loss": 0.0024, "reward": 1.875, "reward_std": 0.014433760195970535, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 884 }, { "completion_length": 98.375, "epoch": 0.59, "grad_norm": 3.46722925664578, "kl": 0.041259765625, "learning_rate": 7.049999999999999e-07, "loss": 0.0016, "reward": 1.734375, "reward_std": 0.3950854539871216, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.796875, "step": 885 }, { "completion_length": 96.6875, "epoch": 0.5906666666666667, "grad_norm": 2.6968642903546445, "kl": 0.043212890625, "learning_rate": 7.046666666666666e-07, "loss": 0.0017, "reward": 1.734375, "reward_std": 0.21664540469646454, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.765625, "step": 886 }, { "completion_length": 92.5, "epoch": 0.5913333333333334, "grad_norm": 1.2861943503327726, "kl": 0.06201171875, "learning_rate": 7.043333333333334e-07, "loss": 0.0025, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 887 }, { "completion_length": 93.53125, "epoch": 0.592, "grad_norm": 2.5471466717727327, "kl": 0.04736328125, "learning_rate": 7.04e-07, "loss": 0.0019, "reward": 1.8489583730697632, "reward_std": 0.2074463963508606, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8802083730697632, "step": 888 }, { "completion_length": 96.90625, "epoch": 0.5926666666666667, "grad_norm": 34.938551629833654, "kl": 0.087890625, "learning_rate": 7.036666666666666e-07, "loss": 0.0035, "reward": 1.7734375, "reward_std": 0.0989583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7734374403953552, "step": 889 }, { "completion_length": 109.375, "epoch": 0.5933333333333334, "grad_norm": 4.107253079306405, "kl": 0.08203125, "learning_rate": 7.033333333333333e-07, "loss": 0.0033, "reward": 1.7718749046325684, "reward_std": 0.10554219037294388, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7718750238418579, "step": 890 }, { "completion_length": 92.34375, "epoch": 0.594, "grad_norm": 5.468792424093218, "kl": 0.06640625, "learning_rate": 7.029999999999999e-07, "loss": 0.0026, "reward": 1.71875, "reward_std": 0.21886569261550903, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 891 }, { "completion_length": 97.59375, "epoch": 0.5946666666666667, "grad_norm": 6.078348736823284, "kl": 0.058349609375, "learning_rate": 7.026666666666667e-07, "loss": 0.0023, "reward": 1.8328125476837158, "reward_std": 0.17491298913955688, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.832812488079071, "step": 892 }, { "completion_length": 90.875, "epoch": 0.5953333333333334, "grad_norm": 2.7609401360686845, "kl": 0.057373046875, "learning_rate": 7.023333333333333e-07, "loss": 0.0023, "reward": 1.693750023841858, "reward_std": 0.20966878533363342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6937500238418579, "step": 893 }, { "completion_length": 99.71875, "epoch": 0.596, "grad_norm": 8.443457798230865, "kl": 0.05859375, "learning_rate": 7.019999999999999e-07, "loss": 0.0023, "reward": 1.796875, "reward_std": 0.17055226862430573, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7968750596046448, "step": 894 }, { "completion_length": 99.6875, "epoch": 0.5966666666666667, "grad_norm": 0.934289011982526, "kl": 0.043212890625, "learning_rate": 7.016666666666666e-07, "loss": 0.0017, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 895 }, { "completion_length": 90.0625, "epoch": 0.5973333333333334, "grad_norm": 2.414220046520026, "kl": 0.049072265625, "learning_rate": 7.013333333333334e-07, "loss": 0.002, "reward": 1.9302083253860474, "reward_std": 0.08958332985639572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9302083253860474, "step": 896 }, { "completion_length": 79.125, "epoch": 0.598, "grad_norm": 1.14989125305251, "kl": 0.056640625, "learning_rate": 7.009999999999999e-07, "loss": 0.0023, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 897 }, { "completion_length": 93.96875, "epoch": 0.5986666666666667, "grad_norm": 5.620802967386947, "kl": 0.0654296875, "learning_rate": 7.006666666666666e-07, "loss": 0.0026, "reward": 1.921875, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 898 }, { "completion_length": 87.90625, "epoch": 0.5993333333333334, "grad_norm": 2.98884415524732, "kl": 0.06298828125, "learning_rate": 7.003333333333333e-07, "loss": 0.0025, "reward": 1.78125, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 899 }, { "completion_length": 104.28125, "epoch": 0.6, "grad_norm": 3.6416454146806783, "kl": 0.08447265625, "learning_rate": 7e-07, "loss": 0.0034, "reward": 1.6197917461395264, "reward_std": 0.16709844768047333, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6510416865348816, "step": 900 }, { "completion_length": 105.0625, "epoch": 0.6006666666666667, "grad_norm": 21.52917869044563, "kl": 0.06298828125, "learning_rate": 6.996666666666666e-07, "loss": 0.0025, "reward": 1.8427083492279053, "reward_std": 0.13987797498703003, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8427083492279053, "step": 901 }, { "completion_length": 97.65625, "epoch": 0.6013333333333334, "grad_norm": 2.662888938912276, "kl": 0.03515625, "learning_rate": 6.993333333333333e-07, "loss": 0.0014, "reward": 1.7604167461395264, "reward_std": 0.151679128408432, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7604167461395264, "step": 902 }, { "completion_length": 95.40625, "epoch": 0.602, "grad_norm": 4.172461298826297, "kl": 0.06640625, "learning_rate": 6.989999999999999e-07, "loss": 0.0027, "reward": 1.9135416746139526, "reward_std": 0.09906215220689774, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9135416746139526, "step": 903 }, { "completion_length": 91.25, "epoch": 0.6026666666666667, "grad_norm": 2.9021347267126862, "kl": 0.0712890625, "learning_rate": 6.986666666666667e-07, "loss": 0.0029, "reward": 1.8125, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 904 }, { "completion_length": 102.03125, "epoch": 0.6033333333333334, "grad_norm": 3.53200948323955, "kl": 0.043701171875, "learning_rate": 6.983333333333334e-07, "loss": 0.0017, "reward": 1.6633927822113037, "reward_std": 0.2202741801738739, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6633928418159485, "step": 905 }, { "completion_length": 98.875, "epoch": 0.604, "grad_norm": 7.000379015504136, "kl": 0.11572265625, "learning_rate": 6.979999999999999e-07, "loss": 0.0046, "reward": 1.6541666984558105, "reward_std": 0.152337446808815, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6854166984558105, "step": 906 }, { "completion_length": 106.875, "epoch": 0.6046666666666667, "grad_norm": 2.001963935674285, "kl": 0.036376953125, "learning_rate": 6.976666666666666e-07, "loss": 0.0014, "reward": 1.7999999523162842, "reward_std": 0.14716878533363342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7999999523162842, "step": 907 }, { "completion_length": 98.875, "epoch": 0.6053333333333333, "grad_norm": 3.4711991649653506, "kl": 0.05078125, "learning_rate": 6.973333333333333e-07, "loss": 0.002, "reward": 1.8963541984558105, "reward_std": 0.04970378056168556, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8963541388511658, "step": 908 }, { "completion_length": 100.71875, "epoch": 0.606, "grad_norm": 23.57016973667994, "kl": 0.0537109375, "learning_rate": 6.97e-07, "loss": 0.0022, "reward": 1.7815475463867188, "reward_std": 0.0941489040851593, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7815476059913635, "step": 909 }, { "completion_length": 97.125, "epoch": 0.6066666666666667, "grad_norm": 5.647105510966143, "kl": 0.0771484375, "learning_rate": 6.966666666666666e-07, "loss": 0.0031, "reward": 1.9140625, "reward_std": 0.09885390102863312, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9453125, "step": 910 }, { "completion_length": 102.1875, "epoch": 0.6073333333333333, "grad_norm": 2.685908151804429, "kl": 0.07666015625, "learning_rate": 6.963333333333333e-07, "loss": 0.0031, "reward": 1.5671875476837158, "reward_std": 0.3092547655105591, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.598437488079071, "step": 911 }, { "completion_length": 102.0, "epoch": 0.608, "grad_norm": 13.382467514521409, "kl": 0.083984375, "learning_rate": 6.959999999999999e-07, "loss": 0.0034, "reward": 1.7411458492279053, "reward_std": 0.16412922739982605, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8036458492279053, "step": 912 }, { "completion_length": 94.34375, "epoch": 0.6086666666666667, "grad_norm": 0.20923590966865888, "kl": 0.052490234375, "learning_rate": 6.956666666666667e-07, "loss": 0.0021, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 913 }, { "completion_length": 93.09375, "epoch": 0.6093333333333333, "grad_norm": 1.4655347219750219, "kl": 0.031494140625, "learning_rate": 6.953333333333333e-07, "loss": 0.0013, "reward": 1.8125, "reward_std": 0.0833333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 914 }, { "completion_length": 105.03125, "epoch": 0.61, "grad_norm": 1.7026741572768835, "kl": 0.04736328125, "learning_rate": 6.949999999999999e-07, "loss": 0.0019, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 915 }, { "completion_length": 84.3125, "epoch": 0.6106666666666667, "grad_norm": 3.189732309896307, "kl": 0.09033203125, "learning_rate": 6.946666666666666e-07, "loss": 0.0036, "reward": 1.8875000476837158, "reward_std": 0.1586044579744339, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.887499988079071, "step": 916 }, { "completion_length": 95.0625, "epoch": 0.6113333333333333, "grad_norm": 4.247376127476828, "kl": 0.09130859375, "learning_rate": 6.943333333333334e-07, "loss": 0.0037, "reward": 1.8453125953674316, "reward_std": 0.10855265706777573, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8453124761581421, "step": 917 }, { "completion_length": 100.5, "epoch": 0.612, "grad_norm": 2.287182107749346, "kl": 0.0625, "learning_rate": 6.939999999999999e-07, "loss": 0.0025, "reward": 1.7156250476837158, "reward_std": 0.17057538032531738, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.715624988079071, "step": 918 }, { "completion_length": 96.65625, "epoch": 0.6126666666666667, "grad_norm": 3.5881045960441464, "kl": 0.08154296875, "learning_rate": 6.936666666666666e-07, "loss": 0.0033, "reward": 1.8541667461395264, "reward_std": 0.04697880893945694, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666865348816, "step": 919 }, { "completion_length": 112.96875, "epoch": 0.6133333333333333, "grad_norm": 1.7739692783729615, "kl": 0.056396484375, "learning_rate": 6.933333333333333e-07, "loss": 0.0023, "reward": 1.7916667461395264, "reward_std": 0.2083333432674408, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8229166269302368, "step": 920 }, { "completion_length": 102.71875, "epoch": 0.614, "grad_norm": 9.909824955480149, "kl": 0.0712890625, "learning_rate": 6.929999999999999e-07, "loss": 0.0028, "reward": 1.7729167938232422, "reward_std": 0.2047049105167389, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7729166746139526, "step": 921 }, { "completion_length": 103.84375, "epoch": 0.6146666666666667, "grad_norm": 6.79194459971564, "kl": 0.053466796875, "learning_rate": 6.926666666666666e-07, "loss": 0.0021, "reward": 1.640625, "reward_std": 0.047680724412202835, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.640625, "step": 922 }, { "completion_length": 85.09375, "epoch": 0.6153333333333333, "grad_norm": 1.818440282139317, "kl": 0.1005859375, "learning_rate": 6.923333333333333e-07, "loss": 0.004, "reward": 1.8645833730697632, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8958333730697632, "step": 923 }, { "completion_length": 93.3125, "epoch": 0.616, "grad_norm": 3.366864707032528, "kl": 0.050537109375, "learning_rate": 6.919999999999999e-07, "loss": 0.002, "reward": 1.7239583730697632, "reward_std": 0.17794691026210785, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 924 }, { "completion_length": 100.5, "epoch": 0.6166666666666667, "grad_norm": 2.068969351583713, "kl": 0.10400390625, "learning_rate": 6.916666666666666e-07, "loss": 0.0042, "reward": 1.875, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 925 }, { "completion_length": 100.1875, "epoch": 0.6173333333333333, "grad_norm": 1.3310975371036584, "kl": 0.052001953125, "learning_rate": 6.913333333333334e-07, "loss": 0.0021, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 926 }, { "completion_length": 117.4375, "epoch": 0.618, "grad_norm": 2.581741444537859, "kl": 0.0751953125, "learning_rate": 6.909999999999999e-07, "loss": 0.003, "reward": 1.7760417461395264, "reward_std": 0.22459951043128967, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8072917461395264, "step": 927 }, { "completion_length": 88.0625, "epoch": 0.6186666666666667, "grad_norm": 2.288141146639471, "kl": 0.07666015625, "learning_rate": 6.906666666666666e-07, "loss": 0.0031, "reward": 1.6692708730697632, "reward_std": 0.005208328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6692708730697632, "step": 928 }, { "completion_length": 108.0, "epoch": 0.6193333333333333, "grad_norm": 19.01820606187008, "kl": 0.11572265625, "learning_rate": 6.903333333333333e-07, "loss": 0.0046, "reward": 1.6630208492279053, "reward_std": 0.20927056670188904, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7567707896232605, "step": 929 }, { "completion_length": 93.65625, "epoch": 0.62, "grad_norm": 2.6553078789595475, "kl": 0.0673828125, "learning_rate": 6.9e-07, "loss": 0.0027, "reward": 1.7239583730697632, "reward_std": 0.15625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7552083730697632, "step": 930 }, { "completion_length": 81.34375, "epoch": 0.6206666666666667, "grad_norm": 2.005678438338836, "kl": 0.08056640625, "learning_rate": 6.896666666666666e-07, "loss": 0.0032, "reward": 1.84375, "reward_std": 0.16456207633018494, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.90625, "step": 931 }, { "completion_length": 106.875, "epoch": 0.6213333333333333, "grad_norm": 9.21890352383369, "kl": 0.08447265625, "learning_rate": 6.893333333333333e-07, "loss": 0.0034, "reward": 1.810286521911621, "reward_std": 0.10290548205375671, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8102864623069763, "step": 932 }, { "completion_length": 93.03125, "epoch": 0.622, "grad_norm": 2.2598367574737455, "kl": 0.05029296875, "learning_rate": 6.889999999999999e-07, "loss": 0.002, "reward": 1.609375, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.609375, "step": 933 }, { "completion_length": 111.625, "epoch": 0.6226666666666667, "grad_norm": 3.204699166810905, "kl": 0.0830078125, "learning_rate": 6.886666666666667e-07, "loss": 0.0033, "reward": 1.8385417461395264, "reward_std": 0.16591878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416269302368, "step": 934 }, { "completion_length": 85.40625, "epoch": 0.6233333333333333, "grad_norm": 5.567428482545694, "kl": 0.08203125, "learning_rate": 6.883333333333333e-07, "loss": 0.0033, "reward": 1.8703124523162842, "reward_std": 0.16008900105953217, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9015625715255737, "step": 935 }, { "completion_length": 94.3125, "epoch": 0.624, "grad_norm": 3.253674188180038, "kl": 0.08154296875, "learning_rate": 6.879999999999999e-07, "loss": 0.0033, "reward": 1.863020896911621, "reward_std": 0.14356878399848938, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8942708373069763, "step": 936 }, { "completion_length": 101.46875, "epoch": 0.6246666666666667, "grad_norm": 5.6150425333503495, "kl": 0.04296875, "learning_rate": 6.876666666666666e-07, "loss": 0.0017, "reward": 1.443750023841858, "reward_std": 0.45795589685440063, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.5375000238418579, "step": 937 }, { "completion_length": 87.21875, "epoch": 0.6253333333333333, "grad_norm": 1.7504758722486293, "kl": 0.09716796875, "learning_rate": 6.873333333333334e-07, "loss": 0.0039, "reward": 1.875, "reward_std": 0.18217839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 938 }, { "completion_length": 109.0, "epoch": 0.626, "grad_norm": 4.695388383363293, "kl": 0.0615234375, "learning_rate": 6.87e-07, "loss": 0.0025, "reward": 1.870634913444519, "reward_std": 0.12124886363744736, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9018849730491638, "step": 939 }, { "completion_length": 100.375, "epoch": 0.6266666666666667, "grad_norm": 13.360374200564008, "kl": 0.08251953125, "learning_rate": 6.866666666666666e-07, "loss": 0.0033, "reward": 1.8392857313156128, "reward_std": 0.15176568925380707, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8392857313156128, "step": 940 }, { "completion_length": 90.3125, "epoch": 0.6273333333333333, "grad_norm": 18.824301695600965, "kl": 0.091796875, "learning_rate": 6.863333333333333e-07, "loss": 0.0037, "reward": 1.8154761791229248, "reward_std": 0.11361434310674667, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8154761791229248, "step": 941 }, { "completion_length": 121.71875, "epoch": 0.628, "grad_norm": 3.92525471061555, "kl": 0.048828125, "learning_rate": 6.86e-07, "loss": 0.002, "reward": 1.7676711082458496, "reward_std": 0.25045377016067505, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.8614211678504944, "step": 942 }, { "completion_length": 110.875, "epoch": 0.6286666666666667, "grad_norm": 2.199333855164596, "kl": 0.0751953125, "learning_rate": 6.856666666666667e-07, "loss": 0.003, "reward": 1.836458444595337, "reward_std": 0.07332531362771988, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8364583849906921, "step": 943 }, { "completion_length": 87.8125, "epoch": 0.6293333333333333, "grad_norm": 1.432823358086253, "kl": 0.05810546875, "learning_rate": 6.853333333333333e-07, "loss": 0.0023, "reward": 1.8645833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833730697632, "step": 944 }, { "completion_length": 92.9375, "epoch": 0.63, "grad_norm": 3.6592226084459814, "kl": 0.08447265625, "learning_rate": 6.85e-07, "loss": 0.0034, "reward": 1.722395896911621, "reward_std": 0.22125336527824402, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7848958373069763, "step": 945 }, { "completion_length": 82.15625, "epoch": 0.6306666666666667, "grad_norm": 7.701588824840256, "kl": 0.072265625, "learning_rate": 6.846666666666666e-07, "loss": 0.0029, "reward": 1.8781249523162842, "reward_std": 0.19091877341270447, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.878125011920929, "step": 946 }, { "completion_length": 103.875, "epoch": 0.6313333333333333, "grad_norm": 5.170394038539576, "kl": 0.07373046875, "learning_rate": 6.843333333333334e-07, "loss": 0.0029, "reward": 1.7291667461395264, "reward_std": 0.19952812790870667, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7604166865348816, "step": 947 }, { "completion_length": 98.21875, "epoch": 0.632, "grad_norm": 2.9922501907097363, "kl": 0.05224609375, "learning_rate": 6.84e-07, "loss": 0.0021, "reward": 1.8020833730697632, "reward_std": 0.2169627845287323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 948 }, { "completion_length": 102.21875, "epoch": 0.6326666666666667, "grad_norm": 2.537793611566122, "kl": 0.08642578125, "learning_rate": 6.836666666666666e-07, "loss": 0.0035, "reward": 1.7053570747375488, "reward_std": 0.1411585956811905, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7366071343421936, "step": 949 }, { "completion_length": 89.28125, "epoch": 0.6333333333333333, "grad_norm": 2.0500399615176077, "kl": 0.0888671875, "learning_rate": 6.833333333333333e-07, "loss": 0.0035, "reward": 1.8984375, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8984375, "step": 950 }, { "completion_length": 103.28125, "epoch": 0.634, "grad_norm": 2.1556482086235604, "kl": 0.0703125, "learning_rate": 6.830000000000001e-07, "loss": 0.0028, "reward": 1.7239583730697632, "reward_std": 0.19642089307308197, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 951 }, { "completion_length": 104.34375, "epoch": 0.6346666666666667, "grad_norm": 5.094889774733232, "kl": 0.12158203125, "learning_rate": 6.826666666666666e-07, "loss": 0.0049, "reward": 1.7000000476837158, "reward_std": 0.2668471932411194, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7312500476837158, "step": 952 }, { "completion_length": 96.9375, "epoch": 0.6353333333333333, "grad_norm": 1.6173777775443547, "kl": 0.06640625, "learning_rate": 6.823333333333333e-07, "loss": 0.0026, "reward": 1.8020833730697632, "reward_std": 0.0919627845287323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 953 }, { "completion_length": 118.96875, "epoch": 0.636, "grad_norm": 2.81010322855961, "kl": 0.062255859375, "learning_rate": 6.82e-07, "loss": 0.0025, "reward": 1.7346100807189941, "reward_std": 0.21909199655056, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7658599615097046, "step": 954 }, { "completion_length": 93.53125, "epoch": 0.6366666666666667, "grad_norm": 7.614110574619496, "kl": 0.08740234375, "learning_rate": 6.816666666666666e-07, "loss": 0.0035, "reward": 1.8192708492279053, "reward_std": 0.10383900254964828, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8192707896232605, "step": 955 }, { "completion_length": 91.46875, "epoch": 0.6373333333333333, "grad_norm": 4.679652544398324, "kl": 0.119140625, "learning_rate": 6.813333333333333e-07, "loss": 0.0048, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 956 }, { "completion_length": 91.40625, "epoch": 0.638, "grad_norm": 5.60642251498064, "kl": 0.08642578125, "learning_rate": 6.81e-07, "loss": 0.0035, "reward": 1.8411458730697632, "reward_std": 0.10162988305091858, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8411458134651184, "step": 957 }, { "completion_length": 98.21875, "epoch": 0.6386666666666667, "grad_norm": 2.597739349128229, "kl": 0.1171875, "learning_rate": 6.806666666666666e-07, "loss": 0.0047, "reward": 1.9406249523162842, "reward_std": 0.08490592241287231, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.940625011920929, "step": 958 }, { "completion_length": 89.03125, "epoch": 0.6393333333333333, "grad_norm": 2.832070800913049, "kl": 0.0966796875, "learning_rate": 6.803333333333333e-07, "loss": 0.0039, "reward": 1.788690447807312, "reward_std": 0.2167084813117981, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.788690447807312, "step": 959 }, { "completion_length": 92.625, "epoch": 0.64, "grad_norm": 5.032035225891278, "kl": 0.0908203125, "learning_rate": 6.800000000000001e-07, "loss": 0.0036, "reward": 1.7727679014205933, "reward_std": 0.08363094180822372, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7727678418159485, "step": 960 }, { "completion_length": 97.90625, "epoch": 0.6406666666666667, "grad_norm": 5.6196718106673815, "kl": 0.09130859375, "learning_rate": 6.796666666666666e-07, "loss": 0.0037, "reward": 1.8125, "reward_std": 0.19716878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 961 }, { "completion_length": 85.375, "epoch": 0.6413333333333333, "grad_norm": 21.46990200166488, "kl": 0.0849609375, "learning_rate": 6.793333333333333e-07, "loss": 0.0034, "reward": 1.823958396911621, "reward_std": 0.08340559899806976, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8239583373069763, "step": 962 }, { "completion_length": 100.3125, "epoch": 0.642, "grad_norm": 5.335591067308332, "kl": 0.08251953125, "learning_rate": 6.79e-07, "loss": 0.0033, "reward": 1.7182291746139526, "reward_std": 0.10821044445037842, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7182291746139526, "step": 963 }, { "completion_length": 97.78125, "epoch": 0.6426666666666667, "grad_norm": 8.328965927173561, "kl": 0.04296875, "learning_rate": 6.786666666666667e-07, "loss": 0.0017, "reward": 1.7708333730697632, "reward_std": 0.22767089307308197, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333134651184, "step": 964 }, { "completion_length": 102.5, "epoch": 0.6433333333333333, "grad_norm": 3.277230282906284, "kl": 0.07373046875, "learning_rate": 6.783333333333333e-07, "loss": 0.003, "reward": 1.8718750476837158, "reward_std": 0.10385178029537201, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.903124988079071, "step": 965 }, { "completion_length": 113.6875, "epoch": 0.644, "grad_norm": 5.6048208295488235, "kl": 0.058349609375, "learning_rate": 6.78e-07, "loss": 0.0023, "reward": 1.5572917461395264, "reward_std": 0.3331198990345001, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.7135417461395264, "step": 966 }, { "completion_length": 87.25, "epoch": 0.6446666666666667, "grad_norm": 1.059097664820336, "kl": 0.064453125, "learning_rate": 6.776666666666666e-07, "loss": 0.0026, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 967 }, { "completion_length": 137.84375, "epoch": 0.6453333333333333, "grad_norm": 6.997677209883652, "kl": 0.06201171875, "learning_rate": 6.773333333333334e-07, "loss": 0.0025, "reward": 1.5750000476837158, "reward_std": 0.42430007457733154, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.668749988079071, "step": 968 }, { "completion_length": 98.625, "epoch": 0.646, "grad_norm": 6.364613703643921, "kl": 0.0703125, "learning_rate": 6.77e-07, "loss": 0.0028, "reward": 1.6822917461395264, "reward_std": 0.27051427960395813, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7135416269302368, "step": 969 }, { "completion_length": 89.125, "epoch": 0.6466666666666666, "grad_norm": 2.0158214463840256, "kl": 0.06591796875, "learning_rate": 6.766666666666666e-07, "loss": 0.0026, "reward": 1.748437523841858, "reward_std": 0.14327162504196167, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8109375238418579, "step": 970 }, { "completion_length": 127.3125, "epoch": 0.6473333333333333, "grad_norm": 1.4691976255285404, "kl": 0.05810546875, "learning_rate": 6.763333333333333e-07, "loss": 0.0023, "reward": 1.8333333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8645833730697632, "step": 971 }, { "completion_length": 107.3125, "epoch": 0.648, "grad_norm": 3.9224726656696665, "kl": 0.0732421875, "learning_rate": 6.76e-07, "loss": 0.0029, "reward": 1.587499976158142, "reward_std": 0.25187498331069946, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.6812500357627869, "step": 972 }, { "completion_length": 92.375, "epoch": 0.6486666666666666, "grad_norm": 3.1718899790020405, "kl": 0.038330078125, "learning_rate": 6.756666666666666e-07, "loss": 0.0015, "reward": 1.9296875, "reward_std": 0.140625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9609375, "step": 973 }, { "completion_length": 110.71875, "epoch": 0.6493333333333333, "grad_norm": 1.5230975306772572, "kl": 0.08642578125, "learning_rate": 6.753333333333333e-07, "loss": 0.0035, "reward": 1.6145833730697632, "reward_std": 0.2217404693365097, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6770833730697632, "step": 974 }, { "completion_length": 114.3125, "epoch": 0.65, "grad_norm": 4.9602307451787, "kl": 0.07470703125, "learning_rate": 6.75e-07, "loss": 0.003, "reward": 1.6218750476837158, "reward_std": 0.2790796756744385, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.621874988079071, "step": 975 }, { "completion_length": 96.8125, "epoch": 0.6506666666666666, "grad_norm": 7.227266429887916, "kl": 0.0908203125, "learning_rate": 6.746666666666666e-07, "loss": 0.0036, "reward": 1.719270944595337, "reward_std": 0.1661882847547531, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7192709445953369, "step": 976 }, { "completion_length": 98.375, "epoch": 0.6513333333333333, "grad_norm": 4.089529382000932, "kl": 0.07421875, "learning_rate": 6.743333333333333e-07, "loss": 0.003, "reward": 1.707291603088379, "reward_std": 0.0453990213572979, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7072916626930237, "step": 977 }, { "completion_length": 92.1875, "epoch": 0.652, "grad_norm": 1.220031844027701, "kl": 0.0634765625, "learning_rate": 6.74e-07, "loss": 0.0025, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 978 }, { "completion_length": 88.34375, "epoch": 0.6526666666666666, "grad_norm": 3.8535388388971996, "kl": 0.060302734375, "learning_rate": 6.736666666666666e-07, "loss": 0.0024, "reward": 1.859375, "reward_std": 0.14304219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 979 }, { "completion_length": 102.0, "epoch": 0.6533333333333333, "grad_norm": 2.4927777135885596, "kl": 0.08837890625, "learning_rate": 6.733333333333333e-07, "loss": 0.0035, "reward": 1.7912945747375488, "reward_std": 0.084077388048172, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8225446343421936, "step": 980 }, { "completion_length": 101.625, "epoch": 0.654, "grad_norm": 2.7526280382820794, "kl": 0.0673828125, "learning_rate": 6.730000000000001e-07, "loss": 0.0027, "reward": 1.859375, "reward_std": 0.13009506464004517, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.921875, "step": 981 }, { "completion_length": 110.4375, "epoch": 0.6546666666666666, "grad_norm": 4.903983007197228, "kl": 0.09619140625, "learning_rate": 6.726666666666666e-07, "loss": 0.0038, "reward": 1.7979166507720947, "reward_std": 0.16800212860107422, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8291666507720947, "step": 982 }, { "completion_length": 94.25, "epoch": 0.6553333333333333, "grad_norm": 1.3727974602341975, "kl": 0.0849609375, "learning_rate": 6.723333333333333e-07, "loss": 0.0034, "reward": 1.7105655670166016, "reward_std": 0.004464283585548401, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7105655670166016, "step": 983 }, { "completion_length": 95.5625, "epoch": 0.656, "grad_norm": 17.333911717408334, "kl": 0.07470703125, "learning_rate": 6.72e-07, "loss": 0.003, "reward": 1.6739583015441895, "reward_std": 0.12444992363452911, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6739583611488342, "step": 984 }, { "completion_length": 102.21875, "epoch": 0.6566666666666666, "grad_norm": 2.3734184627881323, "kl": 0.072265625, "learning_rate": 6.716666666666666e-07, "loss": 0.0029, "reward": 1.7239583730697632, "reward_std": 0.14317253232002258, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 985 }, { "completion_length": 104.46875, "epoch": 0.6573333333333333, "grad_norm": 0.9007561158323755, "kl": 0.06103515625, "learning_rate": 6.713333333333333e-07, "loss": 0.0024, "reward": 1.9270833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9270833730697632, "step": 986 }, { "completion_length": 106.15625, "epoch": 0.658, "grad_norm": 3.390663352896947, "kl": 0.06884765625, "learning_rate": 6.71e-07, "loss": 0.0028, "reward": 1.6875, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 987 }, { "completion_length": 113.65625, "epoch": 0.6586666666666666, "grad_norm": 1.248846713691231, "kl": 0.04541015625, "learning_rate": 6.706666666666666e-07, "loss": 0.0018, "reward": 1.6354167461395264, "reward_std": 0.28956207633018494, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.6979166865348816, "step": 988 }, { "completion_length": 88.59375, "epoch": 0.6593333333333333, "grad_norm": 3.42761555911554, "kl": 0.07373046875, "learning_rate": 6.703333333333333e-07, "loss": 0.003, "reward": 1.709375023841858, "reward_std": 0.14374999701976776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7093750238418579, "step": 989 }, { "completion_length": 105.8125, "epoch": 0.66, "grad_norm": 1.8368939690421284, "kl": 0.0625, "learning_rate": 6.7e-07, "loss": 0.0025, "reward": 1.8627582788467407, "reward_std": 0.24351754784584045, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8940082788467407, "step": 990 }, { "completion_length": 106.53125, "epoch": 0.6606666666666666, "grad_norm": 3.4810779833056396, "kl": 0.08251953125, "learning_rate": 6.696666666666666e-07, "loss": 0.0033, "reward": 1.6822917461395264, "reward_std": 0.17440789937973022, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6822916269302368, "step": 991 }, { "completion_length": 96.8125, "epoch": 0.6613333333333333, "grad_norm": 5.093361348770527, "kl": 0.0712890625, "learning_rate": 6.693333333333333e-07, "loss": 0.0028, "reward": 1.935156226158142, "reward_std": 0.07329048961400986, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9351563453674316, "step": 992 }, { "completion_length": 89.75, "epoch": 0.662, "grad_norm": 5.609506526380558, "kl": 0.06103515625, "learning_rate": 6.69e-07, "loss": 0.0024, "reward": 1.8733630180358887, "reward_std": 0.07877519726753235, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8733630776405334, "step": 993 }, { "completion_length": 94.6875, "epoch": 0.6626666666666666, "grad_norm": 2.028174533972563, "kl": 0.052978515625, "learning_rate": 6.686666666666666e-07, "loss": 0.0021, "reward": 1.78125, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 994 }, { "completion_length": 105.90625, "epoch": 0.6633333333333333, "grad_norm": 3.279698276667852, "kl": 0.0546875, "learning_rate": 6.683333333333333e-07, "loss": 0.0022, "reward": 1.5234375, "reward_std": 0.18154378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5234375, "step": 995 }, { "completion_length": 105.96875, "epoch": 0.664, "grad_norm": 1.8050007531080046, "kl": 0.046142578125, "learning_rate": 6.68e-07, "loss": 0.0018, "reward": 1.9083333015441895, "reward_std": 0.07631926983594894, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9083333015441895, "step": 996 }, { "completion_length": 95.96875, "epoch": 0.6646666666666666, "grad_norm": 13.384907672659214, "kl": 0.07568359375, "learning_rate": 6.676666666666666e-07, "loss": 0.003, "reward": 1.7427083253860474, "reward_std": 0.20453429222106934, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7427083253860474, "step": 997 }, { "completion_length": 116.96875, "epoch": 0.6653333333333333, "grad_norm": 3.6243502554581264, "kl": 0.052001953125, "learning_rate": 6.673333333333334e-07, "loss": 0.0021, "reward": 1.7218749523162842, "reward_std": 0.35496991872787476, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7531249523162842, "step": 998 }, { "completion_length": 103.5, "epoch": 0.666, "grad_norm": 14.360916274431947, "kl": 0.107421875, "learning_rate": 6.67e-07, "loss": 0.0043, "reward": 1.8427083492279053, "reward_std": 0.1069745346903801, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8427083492279053, "step": 999 }, { "completion_length": 97.78125, "epoch": 0.6666666666666666, "grad_norm": 4.696360215773675, "kl": 0.04638671875, "learning_rate": 6.666666666666666e-07, "loss": 0.0019, "reward": 1.5677083730697632, "reward_std": 0.1770833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5677083730697632, "step": 1000 }, { "completion_length": 87.5, "epoch": 0.6673333333333333, "grad_norm": 4.548871255234566, "kl": 0.0537109375, "learning_rate": 6.663333333333333e-07, "loss": 0.0022, "reward": 1.8286458253860474, "reward_std": 0.10600503534078598, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8286458849906921, "step": 1001 }, { "completion_length": 109.96875, "epoch": 0.668, "grad_norm": 1.9745027248055331, "kl": 0.047119140625, "learning_rate": 6.66e-07, "loss": 0.0019, "reward": 1.828125, "reward_std": 0.09375, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.859375, "step": 1002 }, { "completion_length": 95.03125, "epoch": 0.6686666666666666, "grad_norm": 3.4342256219334506, "kl": 0.0634765625, "learning_rate": 6.656666666666666e-07, "loss": 0.0025, "reward": 1.640625, "reward_std": 0.07110428810119629, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6406250596046448, "step": 1003 }, { "completion_length": 93.0625, "epoch": 0.6693333333333333, "grad_norm": 4.81896022666555, "kl": 0.09033203125, "learning_rate": 6.653333333333333e-07, "loss": 0.0036, "reward": 1.7781250476837158, "reward_std": 0.0595480352640152, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7781250476837158, "step": 1004 }, { "completion_length": 102.0625, "epoch": 0.67, "grad_norm": 3.746982968403197, "kl": 0.06689453125, "learning_rate": 6.65e-07, "loss": 0.0027, "reward": 1.769270896911621, "reward_std": 0.10230620950460434, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8005208373069763, "step": 1005 }, { "completion_length": 107.3125, "epoch": 0.6706666666666666, "grad_norm": 1.4817232664135442, "kl": 0.05419921875, "learning_rate": 6.646666666666666e-07, "loss": 0.0022, "reward": 1.8385417461395264, "reward_std": 0.0729166641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416269302368, "step": 1006 }, { "completion_length": 99.65625, "epoch": 0.6713333333333333, "grad_norm": 11.931096713159697, "kl": 0.08544921875, "learning_rate": 6.643333333333333e-07, "loss": 0.0034, "reward": 1.9270833730697632, "reward_std": 0.041666656732559204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9270833730697632, "step": 1007 }, { "completion_length": 101.84375, "epoch": 0.672, "grad_norm": 3.7419581025483857, "kl": 0.06787109375, "learning_rate": 6.64e-07, "loss": 0.0027, "reward": 1.9296875, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 1008 }, { "completion_length": 103.125, "epoch": 0.6726666666666666, "grad_norm": 5.769297125224972, "kl": 0.09423828125, "learning_rate": 6.636666666666666e-07, "loss": 0.0038, "reward": 1.8614583015441895, "reward_std": 0.026375528424978256, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8614583015441895, "step": 1009 }, { "completion_length": 97.6875, "epoch": 0.6733333333333333, "grad_norm": 2.0228128167246435, "kl": 0.07080078125, "learning_rate": 6.633333333333333e-07, "loss": 0.0028, "reward": 1.8041666746139526, "reward_std": 0.1165376752614975, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8041666150093079, "step": 1010 }, { "completion_length": 100.0, "epoch": 0.674, "grad_norm": 9.869974727045806, "kl": 0.0791015625, "learning_rate": 6.63e-07, "loss": 0.0032, "reward": 1.6796875, "reward_std": 0.1510416716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6796875, "step": 1011 }, { "completion_length": 86.84375, "epoch": 0.6746666666666666, "grad_norm": 2.9517746331635317, "kl": 0.043701171875, "learning_rate": 6.626666666666666e-07, "loss": 0.0017, "reward": 1.78125, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1012 }, { "completion_length": 102.1875, "epoch": 0.6753333333333333, "grad_norm": 3.3352189086765556, "kl": 0.07763671875, "learning_rate": 6.623333333333333e-07, "loss": 0.0031, "reward": 1.785416603088379, "reward_std": 0.12278340756893158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7854166626930237, "step": 1013 }, { "completion_length": 95.71875, "epoch": 0.676, "grad_norm": 17.261805500287924, "kl": 0.05859375, "learning_rate": 6.62e-07, "loss": 0.0023, "reward": 1.8036458492279053, "reward_std": 0.05096535384654999, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8036458492279053, "step": 1014 }, { "completion_length": 105.5625, "epoch": 0.6766666666666666, "grad_norm": 3.7299655609098705, "kl": 0.080078125, "learning_rate": 6.616666666666665e-07, "loss": 0.0032, "reward": 1.951562523841858, "reward_std": 0.03437500074505806, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9515625238418579, "step": 1015 }, { "completion_length": 90.375, "epoch": 0.6773333333333333, "grad_norm": 4.2034180239315795, "kl": 0.064453125, "learning_rate": 6.613333333333333e-07, "loss": 0.0026, "reward": 1.8984375, "reward_std": 0.12013056874275208, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8984375, "step": 1016 }, { "completion_length": 121.75, "epoch": 0.678, "grad_norm": 2.905318930229861, "kl": 0.09765625, "learning_rate": 6.61e-07, "loss": 0.0039, "reward": 1.71875, "reward_std": 0.17265737056732178, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 1017 }, { "completion_length": 91.875, "epoch": 0.6786666666666666, "grad_norm": 2.6102041843237145, "kl": 0.07177734375, "learning_rate": 6.606666666666666e-07, "loss": 0.0029, "reward": 1.829687476158142, "reward_std": 0.15110599994659424, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8296874761581421, "step": 1018 }, { "completion_length": 108.5, "epoch": 0.6793333333333333, "grad_norm": 2.0410288849401312, "kl": 0.0712890625, "learning_rate": 6.603333333333333e-07, "loss": 0.0029, "reward": 1.6145833730697632, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6145833730697632, "step": 1019 }, { "completion_length": 113.75, "epoch": 0.68, "grad_norm": 2.4020897836850312, "kl": 0.072265625, "learning_rate": 6.6e-07, "loss": 0.0029, "reward": 1.5322916507720947, "reward_std": 0.31408804655075073, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.5947916507720947, "step": 1020 }, { "completion_length": 117.03125, "epoch": 0.6806666666666666, "grad_norm": 3.230479593440142, "kl": 0.08203125, "learning_rate": 6.596666666666666e-07, "loss": 0.0033, "reward": 1.7776042222976685, "reward_std": 0.07064713537693024, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7776042222976685, "step": 1021 }, { "completion_length": 108.03125, "epoch": 0.6813333333333333, "grad_norm": 4.538726826565661, "kl": 0.08837890625, "learning_rate": 6.593333333333333e-07, "loss": 0.0035, "reward": 1.8291666507720947, "reward_std": 0.10738959163427353, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8291666507720947, "step": 1022 }, { "completion_length": 93.21875, "epoch": 0.682, "grad_norm": 1.1478155515041317, "kl": 0.0595703125, "learning_rate": 6.59e-07, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1023 }, { "completion_length": 100.84375, "epoch": 0.6826666666666666, "grad_norm": 4.52347977327865, "kl": 0.06689453125, "learning_rate": 6.586666666666666e-07, "loss": 0.0027, "reward": 1.894270896911621, "reward_std": 0.0233013778924942, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8942708373069763, "step": 1024 }, { "completion_length": 90.84375, "epoch": 0.6833333333333333, "grad_norm": 5.423116157570121, "kl": 0.043701171875, "learning_rate": 6.583333333333333e-07, "loss": 0.0018, "reward": 1.6354167461395264, "reward_std": 0.44567298889160156, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6666666865348816, "step": 1025 }, { "completion_length": 96.5625, "epoch": 0.684, "grad_norm": 7.072882508882533, "kl": 0.060791015625, "learning_rate": 6.58e-07, "loss": 0.0024, "reward": 1.953125, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.953125, "step": 1026 }, { "completion_length": 90.03125, "epoch": 0.6846666666666666, "grad_norm": 2.416088014998812, "kl": 0.0771484375, "learning_rate": 6.576666666666666e-07, "loss": 0.0031, "reward": 1.7161458730697632, "reward_std": 0.15029378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7161458730697632, "step": 1027 }, { "completion_length": 100.6875, "epoch": 0.6853333333333333, "grad_norm": 2.417266486111616, "kl": 0.0732421875, "learning_rate": 6.573333333333333e-07, "loss": 0.0029, "reward": 1.75, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 1028 }, { "completion_length": 102.21875, "epoch": 0.686, "grad_norm": 1.3131873188656416, "kl": 0.0556640625, "learning_rate": 6.57e-07, "loss": 0.0022, "reward": 1.84375, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1029 }, { "completion_length": 105.59375, "epoch": 0.6866666666666666, "grad_norm": 3.3112846801509144, "kl": 0.06640625, "learning_rate": 6.566666666666666e-07, "loss": 0.0027, "reward": 1.874627947807312, "reward_std": 0.15423652529716492, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.905877947807312, "step": 1030 }, { "completion_length": 105.15625, "epoch": 0.6873333333333334, "grad_norm": 2.446704894618676, "kl": 0.0576171875, "learning_rate": 6.563333333333333e-07, "loss": 0.0023, "reward": 1.517187476158142, "reward_std": 0.2917999029159546, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.5796874761581421, "step": 1031 }, { "completion_length": 98.21875, "epoch": 0.688, "grad_norm": 2.508587443362537, "kl": 0.0537109375, "learning_rate": 6.56e-07, "loss": 0.0022, "reward": 1.8203125, "reward_std": 0.140625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203125, "step": 1032 }, { "completion_length": 112.15625, "epoch": 0.6886666666666666, "grad_norm": 2.9405393242661964, "kl": 0.05615234375, "learning_rate": 6.556666666666666e-07, "loss": 0.0023, "reward": 1.878645896911621, "reward_std": 0.06461097300052643, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8786458373069763, "step": 1033 }, { "completion_length": 103.21875, "epoch": 0.6893333333333334, "grad_norm": 11.946358585150248, "kl": 0.0654296875, "learning_rate": 6.553333333333333e-07, "loss": 0.0026, "reward": 1.8203125, "reward_std": 0.203125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8515625, "step": 1034 }, { "completion_length": 83.75, "epoch": 0.69, "grad_norm": 1.173217015148218, "kl": 0.051025390625, "learning_rate": 6.55e-07, "loss": 0.002, "reward": 1.9375, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1035 }, { "completion_length": 95.1875, "epoch": 0.6906666666666667, "grad_norm": 3.1425569421324506, "kl": 0.0517578125, "learning_rate": 6.546666666666665e-07, "loss": 0.0021, "reward": 1.953125, "reward_std": 0.08054219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.953125, "step": 1036 }, { "completion_length": 112.3125, "epoch": 0.6913333333333334, "grad_norm": 4.338805057773137, "kl": 0.07958984375, "learning_rate": 6.543333333333333e-07, "loss": 0.0032, "reward": 1.573958396911621, "reward_std": 0.24989575147628784, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6052083373069763, "step": 1037 }, { "completion_length": 115.15625, "epoch": 0.692, "grad_norm": 1.8456118943170197, "kl": 0.05078125, "learning_rate": 6.54e-07, "loss": 0.002, "reward": 1.8760416507720947, "reward_std": 0.07818283885717392, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8760416507720947, "step": 1038 }, { "completion_length": 91.53125, "epoch": 0.6926666666666667, "grad_norm": 5.404980672325098, "kl": 0.058837890625, "learning_rate": 6.536666666666666e-07, "loss": 0.0024, "reward": 1.6328125, "reward_std": 0.18738672137260437, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6640625, "step": 1039 }, { "completion_length": 104.65625, "epoch": 0.6933333333333334, "grad_norm": 2.3132095396804657, "kl": 0.072265625, "learning_rate": 6.533333333333333e-07, "loss": 0.0029, "reward": 1.7395833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 1040 }, { "completion_length": 115.625, "epoch": 0.694, "grad_norm": 4.263303082081205, "kl": 0.08984375, "learning_rate": 6.53e-07, "loss": 0.0036, "reward": 1.7817708253860474, "reward_std": 0.12395833432674408, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7817708253860474, "step": 1041 }, { "completion_length": 116.8125, "epoch": 0.6946666666666667, "grad_norm": 7.4968082521242545, "kl": 0.068359375, "learning_rate": 6.526666666666666e-07, "loss": 0.0027, "reward": 1.7223098278045654, "reward_std": 0.18108505010604858, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7535597681999207, "step": 1042 }, { "completion_length": 98.8125, "epoch": 0.6953333333333334, "grad_norm": 4.613851966352614, "kl": 0.06494140625, "learning_rate": 6.523333333333333e-07, "loss": 0.0026, "reward": 1.8203125, "reward_std": 0.18551458418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203124403953552, "step": 1043 }, { "completion_length": 97.875, "epoch": 0.696, "grad_norm": 4.199929875109541, "kl": 0.08740234375, "learning_rate": 6.52e-07, "loss": 0.0035, "reward": 1.6614583730697632, "reward_std": 0.1131451427936554, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6614583730697632, "step": 1044 }, { "completion_length": 119.6875, "epoch": 0.6966666666666667, "grad_norm": 3.4196130922448695, "kl": 0.078125, "learning_rate": 6.516666666666666e-07, "loss": 0.0031, "reward": 1.8286458253860474, "reward_std": 0.05822284519672394, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8286458849906921, "step": 1045 }, { "completion_length": 117.9375, "epoch": 0.6973333333333334, "grad_norm": 17.895816598482337, "kl": 0.07373046875, "learning_rate": 6.513333333333333e-07, "loss": 0.003, "reward": 1.6696428060531616, "reward_std": 0.09652995318174362, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6696429252624512, "step": 1046 }, { "completion_length": 100.09375, "epoch": 0.698, "grad_norm": 2.469492653251995, "kl": 0.056640625, "learning_rate": 6.51e-07, "loss": 0.0023, "reward": 1.8958333730697632, "reward_std": 0.09806472063064575, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 1047 }, { "completion_length": 101.28125, "epoch": 0.6986666666666667, "grad_norm": 2.5328994352118204, "kl": 0.058349609375, "learning_rate": 6.506666666666666e-07, "loss": 0.0023, "reward": 1.7546875476837158, "reward_std": 0.03161248192191124, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.754687488079071, "step": 1048 }, { "completion_length": 106.3125, "epoch": 0.6993333333333334, "grad_norm": 6.565018280956239, "kl": 0.059326171875, "learning_rate": 6.503333333333332e-07, "loss": 0.0024, "reward": 1.7510416507720947, "reward_std": 0.05947291851043701, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7510416507720947, "step": 1049 }, { "completion_length": 123.75, "epoch": 0.7, "grad_norm": 2.581689976494823, "kl": 0.044189453125, "learning_rate": 6.5e-07, "loss": 0.0018, "reward": 1.703125, "reward_std": 0.20554219186306, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.734375, "step": 1050 }, { "completion_length": 97.4375, "epoch": 0.7006666666666667, "grad_norm": 4.4086783303877475, "kl": 0.040771484375, "learning_rate": 6.496666666666666e-07, "loss": 0.0016, "reward": 1.8385417461395264, "reward_std": 0.1145833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416269302368, "step": 1051 }, { "completion_length": 122.28125, "epoch": 0.7013333333333334, "grad_norm": 3.215650243083319, "kl": 0.10107421875, "learning_rate": 6.493333333333333e-07, "loss": 0.004, "reward": 1.7596042156219482, "reward_std": 0.19902774691581726, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7908541560173035, "step": 1052 }, { "completion_length": 106.1875, "epoch": 0.702, "grad_norm": 3.370912839542088, "kl": 0.0634765625, "learning_rate": 6.49e-07, "loss": 0.0025, "reward": 1.8515625, "reward_std": 0.07737711817026138, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8515625, "step": 1053 }, { "completion_length": 105.71875, "epoch": 0.7026666666666667, "grad_norm": 9.3413649989688, "kl": 0.08251953125, "learning_rate": 6.486666666666666e-07, "loss": 0.0033, "reward": 1.5937871932983398, "reward_std": 0.15580108761787415, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6250371932983398, "step": 1054 }, { "completion_length": 110.875, "epoch": 0.7033333333333334, "grad_norm": 3.5414308911214607, "kl": 0.046142578125, "learning_rate": 6.483333333333333e-07, "loss": 0.0018, "reward": 1.9036458730697632, "reward_std": 0.0989583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9036458134651184, "step": 1055 }, { "completion_length": 97.5625, "epoch": 0.704, "grad_norm": 3.2867373964411692, "kl": 0.061279296875, "learning_rate": 6.48e-07, "loss": 0.0025, "reward": 1.8019344806671143, "reward_std": 0.20579972863197327, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.833184540271759, "step": 1056 }, { "completion_length": 103.46875, "epoch": 0.7046666666666667, "grad_norm": 3.5130477289204847, "kl": 0.0888671875, "learning_rate": 6.476666666666666e-07, "loss": 0.0036, "reward": 1.8302083015441895, "reward_std": 0.050667714327573776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8302083611488342, "step": 1057 }, { "completion_length": 101.75, "epoch": 0.7053333333333334, "grad_norm": 2.322428732226174, "kl": 0.049560546875, "learning_rate": 6.473333333333333e-07, "loss": 0.002, "reward": 1.84375, "reward_std": 0.2180021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 1058 }, { "completion_length": 106.15625, "epoch": 0.706, "grad_norm": 2.983219009902211, "kl": 0.05615234375, "learning_rate": 6.47e-07, "loss": 0.0022, "reward": 1.7479166984558105, "reward_std": 0.19076919555664062, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7479166984558105, "step": 1059 }, { "completion_length": 99.125, "epoch": 0.7066666666666667, "grad_norm": 2.1987026922482027, "kl": 0.07666015625, "learning_rate": 6.466666666666666e-07, "loss": 0.0031, "reward": 1.681249976158142, "reward_std": 0.14397059381008148, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7124999761581421, "step": 1060 }, { "completion_length": 108.46875, "epoch": 0.7073333333333334, "grad_norm": 3.5932131421219813, "kl": 0.08544921875, "learning_rate": 6.463333333333333e-07, "loss": 0.0034, "reward": 1.8854167461395264, "reward_std": 0.03663136810064316, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8854166865348816, "step": 1061 }, { "completion_length": 111.40625, "epoch": 0.708, "grad_norm": 2.4565338941638597, "kl": 0.05322265625, "learning_rate": 6.46e-07, "loss": 0.0021, "reward": 1.8541667461395264, "reward_std": 0.2388354390859604, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8854166865348816, "step": 1062 }, { "completion_length": 108.03125, "epoch": 0.7086666666666667, "grad_norm": 3.767367480178452, "kl": 0.068359375, "learning_rate": 6.456666666666667e-07, "loss": 0.0027, "reward": 1.7296874523162842, "reward_std": 0.07790182530879974, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.729687511920929, "step": 1063 }, { "completion_length": 108.03125, "epoch": 0.7093333333333334, "grad_norm": 1.419692064625191, "kl": 0.048583984375, "learning_rate": 6.453333333333333e-07, "loss": 0.0019, "reward": 1.711458444595337, "reward_std": 0.0784187838435173, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7114582657814026, "step": 1064 }, { "completion_length": 105.3125, "epoch": 0.71, "grad_norm": 1.4715444992160853, "kl": 0.06396484375, "learning_rate": 6.45e-07, "loss": 0.0026, "reward": 1.90625, "reward_std": 0.053791437298059464, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1065 }, { "completion_length": 99.21875, "epoch": 0.7106666666666667, "grad_norm": 2.174173499042298, "kl": 0.07177734375, "learning_rate": 6.446666666666666e-07, "loss": 0.0029, "reward": 1.7604167461395264, "reward_std": 0.10048859566450119, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7604166865348816, "step": 1066 }, { "completion_length": 105.46875, "epoch": 0.7113333333333334, "grad_norm": 1.9290155040622599, "kl": 0.068359375, "learning_rate": 6.443333333333333e-07, "loss": 0.0027, "reward": 1.8828125, "reward_std": 0.15029378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8828125, "step": 1067 }, { "completion_length": 109.75, "epoch": 0.712, "grad_norm": 1.1114560571006318, "kl": 0.043212890625, "learning_rate": 6.44e-07, "loss": 0.0017, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1068 }, { "completion_length": 106.65625, "epoch": 0.7126666666666667, "grad_norm": 4.141285971038736, "kl": 0.0634765625, "learning_rate": 6.436666666666667e-07, "loss": 0.0025, "reward": 1.8546874523162842, "reward_std": 0.10572704672813416, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8546874523162842, "step": 1069 }, { "completion_length": 101.0625, "epoch": 0.7133333333333334, "grad_norm": 6.97803437168503, "kl": 0.052490234375, "learning_rate": 6.433333333333332e-07, "loss": 0.0021, "reward": 1.7265625, "reward_std": 0.22708837687969208, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7578125, "step": 1070 }, { "completion_length": 100.625, "epoch": 0.714, "grad_norm": 2.307420570898336, "kl": 0.04296875, "learning_rate": 6.43e-07, "loss": 0.0017, "reward": 1.875, "reward_std": 0.10239279270172119, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 1071 }, { "completion_length": 100.78125, "epoch": 0.7146666666666667, "grad_norm": 1.8775294117175372, "kl": 0.07177734375, "learning_rate": 6.426666666666667e-07, "loss": 0.0029, "reward": 1.814062476158142, "reward_std": 0.06562499701976776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8140624761581421, "step": 1072 }, { "completion_length": 101.1875, "epoch": 0.7153333333333334, "grad_norm": 1.6251742431740397, "kl": 0.046630859375, "learning_rate": 6.423333333333333e-07, "loss": 0.0019, "reward": 1.920312523841858, "reward_std": 0.02468431554734707, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9203125238418579, "step": 1073 }, { "completion_length": 95.53125, "epoch": 0.716, "grad_norm": 1.8834217546011065, "kl": 0.06787109375, "learning_rate": 6.42e-07, "loss": 0.0027, "reward": 1.9635416269302368, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9635416269302368, "step": 1074 }, { "completion_length": 93.34375, "epoch": 0.7166666666666667, "grad_norm": 2.409863201855667, "kl": 0.07373046875, "learning_rate": 6.416666666666667e-07, "loss": 0.0029, "reward": 1.8468750715255737, "reward_std": 0.028956202790141106, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.846875011920929, "step": 1075 }, { "completion_length": 96.78125, "epoch": 0.7173333333333334, "grad_norm": 3.8119375571386924, "kl": 0.06689453125, "learning_rate": 6.413333333333333e-07, "loss": 0.0027, "reward": 1.7994792461395264, "reward_std": 0.02991960011422634, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7994791865348816, "step": 1076 }, { "completion_length": 107.34375, "epoch": 0.718, "grad_norm": 9.977378144236049, "kl": 0.06591796875, "learning_rate": 6.41e-07, "loss": 0.0026, "reward": 1.8854167461395264, "reward_std": 0.09365712106227875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8854167461395264, "step": 1077 }, { "completion_length": 96.65625, "epoch": 0.7186666666666667, "grad_norm": 3.675952592756745, "kl": 0.06494140625, "learning_rate": 6.406666666666667e-07, "loss": 0.0026, "reward": 1.7421875, "reward_std": 0.1743500530719757, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7421875596046448, "step": 1078 }, { "completion_length": 103.59375, "epoch": 0.7193333333333334, "grad_norm": 1.553420829420041, "kl": 0.072265625, "learning_rate": 6.403333333333332e-07, "loss": 0.0029, "reward": 1.9083333015441895, "reward_std": 0.022047923877835274, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9083333611488342, "step": 1079 }, { "completion_length": 110.40625, "epoch": 0.72, "grad_norm": 1.9880256671927923, "kl": 0.061279296875, "learning_rate": 6.4e-07, "loss": 0.0025, "reward": 1.65625, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6875, "step": 1080 }, { "completion_length": 117.15625, "epoch": 0.7206666666666667, "grad_norm": 4.221539090910846, "kl": 0.041015625, "learning_rate": 6.396666666666667e-07, "loss": 0.0016, "reward": 1.7937500476837158, "reward_std": 0.21530932188034058, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.824999988079071, "step": 1081 }, { "completion_length": 106.9375, "epoch": 0.7213333333333334, "grad_norm": 5.024611162498005, "kl": 0.0478515625, "learning_rate": 6.393333333333333e-07, "loss": 0.0019, "reward": 1.890625, "reward_std": 0.0729166641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8906250596046448, "step": 1082 }, { "completion_length": 101.15625, "epoch": 0.722, "grad_norm": 2.1995183618000476, "kl": 0.05126953125, "learning_rate": 6.389999999999999e-07, "loss": 0.002, "reward": 1.8958333730697632, "reward_std": 0.2083333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9270833730697632, "step": 1083 }, { "completion_length": 106.75, "epoch": 0.7226666666666667, "grad_norm": 2.684589508758036, "kl": 0.05419921875, "learning_rate": 6.386666666666667e-07, "loss": 0.0022, "reward": 1.9442708492279053, "reward_std": 0.09977055341005325, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9755208492279053, "step": 1084 }, { "completion_length": 109.78125, "epoch": 0.7233333333333334, "grad_norm": 6.464776236140039, "kl": 0.1572265625, "learning_rate": 6.383333333333333e-07, "loss": 0.0063, "reward": 1.9296131134033203, "reward_std": 0.07722216844558716, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296130537986755, "step": 1085 }, { "completion_length": 98.75, "epoch": 0.724, "grad_norm": 4.576597404728882, "kl": 0.0849609375, "learning_rate": 6.38e-07, "loss": 0.0034, "reward": 1.7921874523162842, "reward_std": 0.14479167759418488, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7921874523162842, "step": 1086 }, { "completion_length": 117.09375, "epoch": 0.7246666666666667, "grad_norm": 2.7915486624333687, "kl": 0.0439453125, "learning_rate": 6.376666666666666e-07, "loss": 0.0018, "reward": 1.7074404954910278, "reward_std": 0.2510165572166443, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7386904954910278, "step": 1087 }, { "completion_length": 99.78125, "epoch": 0.7253333333333334, "grad_norm": 9.487167476744414, "kl": 0.06689453125, "learning_rate": 6.373333333333333e-07, "loss": 0.0027, "reward": 1.6963541507720947, "reward_std": 0.171920508146286, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6963541507720947, "step": 1088 }, { "completion_length": 102.3125, "epoch": 0.726, "grad_norm": 3.9538608796438366, "kl": 0.0615234375, "learning_rate": 6.37e-07, "loss": 0.0025, "reward": 1.7708333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 1089 }, { "completion_length": 95.75, "epoch": 0.7266666666666667, "grad_norm": 9.231033906590145, "kl": 0.0771484375, "learning_rate": 6.366666666666667e-07, "loss": 0.0031, "reward": 1.710416555404663, "reward_std": 0.24583330750465393, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7104166746139526, "step": 1090 }, { "completion_length": 101.15625, "epoch": 0.7273333333333334, "grad_norm": 4.611288957621085, "kl": 0.06787109375, "learning_rate": 6.363333333333332e-07, "loss": 0.0027, "reward": 1.8286458253860474, "reward_std": 0.22997337579727173, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8598958253860474, "step": 1091 }, { "completion_length": 116.1875, "epoch": 0.728, "grad_norm": 2.5614108651981478, "kl": 0.060791015625, "learning_rate": 6.36e-07, "loss": 0.0024, "reward": 1.8020833730697632, "reward_std": 0.012028127908706665, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833134651184, "step": 1092 }, { "completion_length": 92.8125, "epoch": 0.7286666666666667, "grad_norm": 1.9133680505664878, "kl": 0.0595703125, "learning_rate": 6.356666666666667e-07, "loss": 0.0024, "reward": 1.826388955116272, "reward_std": 0.06374241411685944, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8263888955116272, "step": 1093 }, { "completion_length": 101.71875, "epoch": 0.7293333333333333, "grad_norm": 3.073626124918754, "kl": 0.04296875, "learning_rate": 6.353333333333333e-07, "loss": 0.0017, "reward": 1.7135417461395264, "reward_std": 0.2700854539871216, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7447916865348816, "step": 1094 }, { "completion_length": 109.25, "epoch": 0.73, "grad_norm": 5.331195088103986, "kl": 0.060546875, "learning_rate": 6.35e-07, "loss": 0.0024, "reward": 1.8197916746139526, "reward_std": 0.14418618381023407, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8197916746139526, "step": 1095 }, { "completion_length": 106.0, "epoch": 0.7306666666666667, "grad_norm": 2.6551137099313817, "kl": 0.115234375, "learning_rate": 6.346666666666666e-07, "loss": 0.0046, "reward": 1.6017940044403076, "reward_std": 0.15057870745658875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6330440044403076, "step": 1096 }, { "completion_length": 96.96875, "epoch": 0.7313333333333333, "grad_norm": 10.424682290541002, "kl": 0.07421875, "learning_rate": 6.343333333333333e-07, "loss": 0.003, "reward": 1.90625, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1097 }, { "completion_length": 100.84375, "epoch": 0.732, "grad_norm": 7.067690210559755, "kl": 0.048095703125, "learning_rate": 6.34e-07, "loss": 0.0019, "reward": 1.5963542461395264, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5963541269302368, "step": 1098 }, { "completion_length": 91.6875, "epoch": 0.7326666666666667, "grad_norm": 3.1015644630970924, "kl": 0.06201171875, "learning_rate": 6.336666666666667e-07, "loss": 0.0025, "reward": 1.984375, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.984375, "step": 1099 }, { "completion_length": 105.0625, "epoch": 0.7333333333333333, "grad_norm": 1.978060989363142, "kl": 0.064453125, "learning_rate": 6.333333333333332e-07, "loss": 0.0026, "reward": 1.84375, "reward_std": 0.012499993667006493, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1100 }, { "completion_length": 95.3125, "epoch": 0.734, "grad_norm": 1.1878446933041555, "kl": 0.05419921875, "learning_rate": 6.33e-07, "loss": 0.0022, "reward": 1.9375, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1101 }, { "completion_length": 98.25, "epoch": 0.7346666666666667, "grad_norm": 0.09636232637382396, "kl": 0.06640625, "learning_rate": 6.326666666666667e-07, "loss": 0.0027, "reward": 1.8125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1102 }, { "completion_length": 108.96875, "epoch": 0.7353333333333333, "grad_norm": 1.864024449477312, "kl": 0.041015625, "learning_rate": 6.323333333333333e-07, "loss": 0.0016, "reward": 1.8072917461395264, "reward_std": 0.1090010553598404, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8072916269302368, "step": 1103 }, { "completion_length": 102.21875, "epoch": 0.736, "grad_norm": 2.071805911851158, "kl": 0.07470703125, "learning_rate": 6.319999999999999e-07, "loss": 0.003, "reward": 1.9479167461395264, "reward_std": 0.020833337679505348, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9479167461395264, "step": 1104 }, { "completion_length": 95.15625, "epoch": 0.7366666666666667, "grad_norm": 7.557163692141627, "kl": 0.06396484375, "learning_rate": 6.316666666666667e-07, "loss": 0.0026, "reward": 1.7416666746139526, "reward_std": 0.19285784661769867, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7729166150093079, "step": 1105 }, { "completion_length": 106.84375, "epoch": 0.7373333333333333, "grad_norm": 2.246798061258956, "kl": 0.07470703125, "learning_rate": 6.313333333333333e-07, "loss": 0.003, "reward": 1.859909176826477, "reward_std": 0.1551816165447235, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.891159176826477, "step": 1106 }, { "completion_length": 103.90625, "epoch": 0.738, "grad_norm": 4.629320522960924, "kl": 0.07421875, "learning_rate": 6.31e-07, "loss": 0.003, "reward": 1.8038194179534912, "reward_std": 0.28190183639526367, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.835069477558136, "step": 1107 }, { "completion_length": 99.71875, "epoch": 0.7386666666666667, "grad_norm": 0.31519503740991334, "kl": 0.044677734375, "learning_rate": 6.306666666666666e-07, "loss": 0.0018, "reward": 1.975000023841858, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9750000238418579, "step": 1108 }, { "completion_length": 108.25, "epoch": 0.7393333333333333, "grad_norm": 4.940326065263452, "kl": 0.05224609375, "learning_rate": 6.303333333333332e-07, "loss": 0.0021, "reward": 1.7958333492279053, "reward_std": 0.2451845407485962, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7958333492279053, "step": 1109 }, { "completion_length": 113.21875, "epoch": 0.74, "grad_norm": 10.067915400023573, "kl": 0.064453125, "learning_rate": 6.3e-07, "loss": 0.0026, "reward": 1.8838541507720947, "reward_std": 0.07740195095539093, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8838541507720947, "step": 1110 }, { "completion_length": 116.21875, "epoch": 0.7406666666666667, "grad_norm": 3.4010543933423536, "kl": 0.054443359375, "learning_rate": 6.296666666666667e-07, "loss": 0.0022, "reward": 1.7265625, "reward_std": 0.26560309529304504, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7265625, "step": 1111 }, { "completion_length": 101.21875, "epoch": 0.7413333333333333, "grad_norm": 4.31567648616534, "kl": 0.06494140625, "learning_rate": 6.293333333333333e-07, "loss": 0.0026, "reward": 1.7078125476837158, "reward_std": 0.23077766597270966, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.739062488079071, "step": 1112 }, { "completion_length": 92.65625, "epoch": 0.742, "grad_norm": 4.896485998172797, "kl": 0.059814453125, "learning_rate": 6.289999999999999e-07, "loss": 0.0024, "reward": 1.828125, "reward_std": 0.21875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.828125, "step": 1113 }, { "completion_length": 109.3125, "epoch": 0.7426666666666667, "grad_norm": 4.024146383125046, "kl": 0.05517578125, "learning_rate": 6.286666666666667e-07, "loss": 0.0022, "reward": 1.6197917461395264, "reward_std": 0.10664170980453491, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6197916865348816, "step": 1114 }, { "completion_length": 107.34375, "epoch": 0.7433333333333333, "grad_norm": 5.7699497585910295, "kl": 0.08544921875, "learning_rate": 6.283333333333333e-07, "loss": 0.0034, "reward": 1.634374976158142, "reward_std": 0.2584989666938782, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6656249761581421, "step": 1115 }, { "completion_length": 100.90625, "epoch": 0.744, "grad_norm": 1.3619609512867177, "kl": 0.043701171875, "learning_rate": 6.28e-07, "loss": 0.0017, "reward": 1.7083333730697632, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333134651184, "step": 1116 }, { "completion_length": 110.25, "epoch": 0.7446666666666667, "grad_norm": 1.7506582150857675, "kl": 0.05712890625, "learning_rate": 6.276666666666666e-07, "loss": 0.0023, "reward": 1.84375, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1117 }, { "completion_length": 104.1875, "epoch": 0.7453333333333333, "grad_norm": 1.322128872850834, "kl": 0.0556640625, "learning_rate": 6.273333333333333e-07, "loss": 0.0022, "reward": 1.84375, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1118 }, { "completion_length": 122.15625, "epoch": 0.746, "grad_norm": 2.8975184182069893, "kl": 0.05859375, "learning_rate": 6.27e-07, "loss": 0.0023, "reward": 1.6770833730697632, "reward_std": 0.2083333432674408, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7083333134651184, "step": 1119 }, { "completion_length": 98.28125, "epoch": 0.7466666666666667, "grad_norm": 1.8895753114772729, "kl": 0.049072265625, "learning_rate": 6.266666666666667e-07, "loss": 0.002, "reward": 1.71875, "reward_std": 0.1458333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 1120 }, { "completion_length": 99.0, "epoch": 0.7473333333333333, "grad_norm": 9.909660360778025, "kl": 0.0634765625, "learning_rate": 6.263333333333332e-07, "loss": 0.0025, "reward": 1.7234375476837158, "reward_std": 0.1456502377986908, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7234375476837158, "step": 1121 }, { "completion_length": 86.0625, "epoch": 0.748, "grad_norm": 32.30498729715363, "kl": 0.0576171875, "learning_rate": 6.26e-07, "loss": 0.0023, "reward": 1.8135416507720947, "reward_std": 0.027742967009544373, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8135416507720947, "step": 1122 }, { "completion_length": 113.53125, "epoch": 0.7486666666666667, "grad_norm": 5.595604438919309, "kl": 0.051025390625, "learning_rate": 6.256666666666667e-07, "loss": 0.002, "reward": 1.7395833730697632, "reward_std": 0.27518051862716675, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8020833134651184, "step": 1123 }, { "completion_length": 99.625, "epoch": 0.7493333333333333, "grad_norm": 4.543413748227606, "kl": 0.056884765625, "learning_rate": 6.253333333333333e-07, "loss": 0.0023, "reward": 1.8463542461395264, "reward_std": 0.1162172332406044, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8463541865348816, "step": 1124 }, { "completion_length": 114.5625, "epoch": 0.75, "grad_norm": 2.2331243904212394, "kl": 0.060302734375, "learning_rate": 6.249999999999999e-07, "loss": 0.0024, "reward": 1.8541667461395264, "reward_std": 0.19952814280986786, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8854167461395264, "step": 1125 }, { "completion_length": 104.4375, "epoch": 0.7506666666666667, "grad_norm": 1.2183965077063166, "kl": 0.052490234375, "learning_rate": 6.246666666666667e-07, "loss": 0.0021, "reward": 1.8703124523162842, "reward_std": 0.018662994727492332, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.870312511920929, "step": 1126 }, { "completion_length": 86.40625, "epoch": 0.7513333333333333, "grad_norm": 4.808271370735808, "kl": 0.057861328125, "learning_rate": 6.243333333333333e-07, "loss": 0.0023, "reward": 1.8958333730697632, "reward_std": 0.11383543908596039, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333134651184, "step": 1127 }, { "completion_length": 100.34375, "epoch": 0.752, "grad_norm": 15.316944428065979, "kl": 0.05078125, "learning_rate": 6.24e-07, "loss": 0.002, "reward": 1.8411458730697632, "reward_std": 0.1757529228925705, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8723958730697632, "step": 1128 }, { "completion_length": 109.21875, "epoch": 0.7526666666666667, "grad_norm": 1.8845750761263482, "kl": 0.056884765625, "learning_rate": 6.236666666666667e-07, "loss": 0.0023, "reward": 1.7880208492279053, "reward_std": 0.008899999782443047, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7880208492279053, "step": 1129 }, { "completion_length": 103.0625, "epoch": 0.7533333333333333, "grad_norm": 3.7666559653362928, "kl": 0.07421875, "learning_rate": 6.233333333333332e-07, "loss": 0.003, "reward": 1.633333444595337, "reward_std": 0.10330983996391296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6333333253860474, "step": 1130 }, { "completion_length": 112.03125, "epoch": 0.754, "grad_norm": 8.60228657387015, "kl": 0.058837890625, "learning_rate": 6.23e-07, "loss": 0.0024, "reward": 1.7041666507720947, "reward_std": 0.1403263509273529, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7041667103767395, "step": 1131 }, { "completion_length": 99.40625, "epoch": 0.7546666666666667, "grad_norm": 2.81729052348384, "kl": 0.0732421875, "learning_rate": 6.226666666666667e-07, "loss": 0.0029, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1132 }, { "completion_length": 102.46875, "epoch": 0.7553333333333333, "grad_norm": 1.646582374871284, "kl": 0.050537109375, "learning_rate": 6.223333333333333e-07, "loss": 0.002, "reward": 1.8312499523162842, "reward_std": 0.21247810125350952, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.862500011920929, "step": 1133 }, { "completion_length": 112.03125, "epoch": 0.756, "grad_norm": 7.458329806839589, "kl": 0.0576171875, "learning_rate": 6.219999999999999e-07, "loss": 0.0023, "reward": 1.728124976158142, "reward_std": 0.20136070251464844, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7906250357627869, "step": 1134 }, { "completion_length": 86.625, "epoch": 0.7566666666666667, "grad_norm": 1.1966753928557237, "kl": 0.053955078125, "learning_rate": 6.216666666666667e-07, "loss": 0.0022, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1135 }, { "completion_length": 102.5, "epoch": 0.7573333333333333, "grad_norm": 4.661271644452035, "kl": 0.0390625, "learning_rate": 6.213333333333333e-07, "loss": 0.0016, "reward": 1.8104166984558105, "reward_std": 0.16567063331604004, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8104166388511658, "step": 1136 }, { "completion_length": 100.0625, "epoch": 0.758, "grad_norm": 2.283417784677246, "kl": 0.0888671875, "learning_rate": 6.21e-07, "loss": 0.0035, "reward": 1.8645833730697632, "reward_std": 0.08466877788305283, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833730697632, "step": 1137 }, { "completion_length": 91.84375, "epoch": 0.7586666666666667, "grad_norm": 1.9461476327328822, "kl": 0.059814453125, "learning_rate": 6.206666666666666e-07, "loss": 0.0024, "reward": 1.9375, "reward_std": 0.09858439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1138 }, { "completion_length": 99.5625, "epoch": 0.7593333333333333, "grad_norm": 5.799713493033583, "kl": 0.05078125, "learning_rate": 6.203333333333333e-07, "loss": 0.002, "reward": 1.6437499523162842, "reward_std": 0.23947890102863312, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.643750011920929, "step": 1139 }, { "completion_length": 99.90625, "epoch": 0.76, "grad_norm": 1.4941801959134204, "kl": 0.07958984375, "learning_rate": 6.2e-07, "loss": 0.0032, "reward": 1.7842261791229248, "reward_std": 0.07512690871953964, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7842261791229248, "step": 1140 }, { "completion_length": 99.9375, "epoch": 0.7606666666666667, "grad_norm": 6.566622214061034, "kl": 0.05029296875, "learning_rate": 6.196666666666667e-07, "loss": 0.002, "reward": 1.6843006610870361, "reward_std": 0.17696979641914368, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6843006014823914, "step": 1141 }, { "completion_length": 109.875, "epoch": 0.7613333333333333, "grad_norm": 4.022218829919972, "kl": 0.0654296875, "learning_rate": 6.193333333333332e-07, "loss": 0.0026, "reward": 1.8703124523162842, "reward_std": 0.15259283781051636, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.901562511920929, "step": 1142 }, { "completion_length": 96.125, "epoch": 0.762, "grad_norm": 2.1313597709106333, "kl": 0.0830078125, "learning_rate": 6.189999999999999e-07, "loss": 0.0033, "reward": 1.8583333492279053, "reward_std": 0.024292198941111565, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8583333492279053, "step": 1143 }, { "completion_length": 103.09375, "epoch": 0.7626666666666667, "grad_norm": 5.81252216740045, "kl": 0.0654296875, "learning_rate": 6.186666666666667e-07, "loss": 0.0026, "reward": 1.766145944595337, "reward_std": 0.1028965562582016, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7661458253860474, "step": 1144 }, { "completion_length": 94.375, "epoch": 0.7633333333333333, "grad_norm": 2.3451678695887135, "kl": 0.06689453125, "learning_rate": 6.183333333333333e-07, "loss": 0.0027, "reward": 1.899999976158142, "reward_std": 0.11682654172182083, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8999999761581421, "step": 1145 }, { "completion_length": 102.28125, "epoch": 0.764, "grad_norm": 5.930674605380688, "kl": 0.051025390625, "learning_rate": 6.18e-07, "loss": 0.002, "reward": 1.642187476158142, "reward_std": 0.01770833134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6421874761581421, "step": 1146 }, { "completion_length": 99.125, "epoch": 0.7646666666666667, "grad_norm": 2.5693617771475132, "kl": 0.04638671875, "learning_rate": 6.176666666666666e-07, "loss": 0.0019, "reward": 1.8489583730697632, "reward_std": 0.179977148771286, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583134651184, "step": 1147 }, { "completion_length": 122.78125, "epoch": 0.7653333333333333, "grad_norm": 2.3886942374420896, "kl": 0.06005859375, "learning_rate": 6.173333333333333e-07, "loss": 0.0024, "reward": 1.7734375, "reward_std": 0.1640559881925583, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8046875, "step": 1148 }, { "completion_length": 88.25, "epoch": 0.766, "grad_norm": 4.167065162968694, "kl": 0.06591796875, "learning_rate": 6.17e-07, "loss": 0.0026, "reward": 1.5932291746139526, "reward_std": 0.2987888753414154, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6244791746139526, "step": 1149 }, { "completion_length": 90.90625, "epoch": 0.7666666666666667, "grad_norm": 5.42623792421679, "kl": 0.07666015625, "learning_rate": 6.166666666666667e-07, "loss": 0.0031, "reward": 1.875, "reward_std": 0.13815811276435852, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8749999403953552, "step": 1150 }, { "completion_length": 102.21875, "epoch": 0.7673333333333333, "grad_norm": 2.694023220340184, "kl": 0.0654296875, "learning_rate": 6.163333333333332e-07, "loss": 0.0026, "reward": 1.8802083730697632, "reward_std": 0.018042195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8802083730697632, "step": 1151 }, { "completion_length": 95.15625, "epoch": 0.768, "grad_norm": 2.618335321980928, "kl": 0.050048828125, "learning_rate": 6.16e-07, "loss": 0.002, "reward": 1.890625, "reward_std": 0.1145833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8906249403953552, "step": 1152 }, { "completion_length": 103.8125, "epoch": 0.7686666666666667, "grad_norm": 1.1918930579979377, "kl": 0.0703125, "learning_rate": 6.156666666666667e-07, "loss": 0.0028, "reward": 1.7999999523162842, "reward_std": 0.08923543244600296, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.831250011920929, "step": 1153 }, { "completion_length": 113.0, "epoch": 0.7693333333333333, "grad_norm": 5.01526656998149, "kl": 0.06640625, "learning_rate": 6.153333333333333e-07, "loss": 0.0027, "reward": 1.7151042222976685, "reward_std": 0.05104167386889458, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7151041626930237, "step": 1154 }, { "completion_length": 99.59375, "epoch": 0.77, "grad_norm": 1.2991809994024004, "kl": 0.0458984375, "learning_rate": 6.149999999999999e-07, "loss": 0.0018, "reward": 1.9322917461395264, "reward_std": 0.010416661389172077, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322916865348816, "step": 1155 }, { "completion_length": 118.0625, "epoch": 0.7706666666666667, "grad_norm": 2.289407673258514, "kl": 0.072265625, "learning_rate": 6.146666666666667e-07, "loss": 0.0029, "reward": 1.5572916269302368, "reward_std": 0.07837249338626862, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5572916865348816, "step": 1156 }, { "completion_length": 95.3125, "epoch": 0.7713333333333333, "grad_norm": 1.5501642780993903, "kl": 0.05908203125, "learning_rate": 6.143333333333333e-07, "loss": 0.0024, "reward": 1.90625, "reward_std": 0.09300211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9062499403953552, "step": 1157 }, { "completion_length": 97.5, "epoch": 0.772, "grad_norm": 5.462891730269292, "kl": 0.06494140625, "learning_rate": 6.14e-07, "loss": 0.0026, "reward": 1.5750000476837158, "reward_std": 0.2510744631290436, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.574999988079071, "step": 1158 }, { "completion_length": 108.15625, "epoch": 0.7726666666666666, "grad_norm": 2.0724543000777516, "kl": 0.068359375, "learning_rate": 6.136666666666666e-07, "loss": 0.0027, "reward": 1.5416667461395264, "reward_std": 0.13702812790870667, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5416666865348816, "step": 1159 }, { "completion_length": 95.9375, "epoch": 0.7733333333333333, "grad_norm": 2.4183039982461216, "kl": 0.051025390625, "learning_rate": 6.133333333333332e-07, "loss": 0.002, "reward": 1.921875, "reward_std": 0.07822882384061813, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1160 }, { "completion_length": 106.0625, "epoch": 0.774, "grad_norm": 5.406859001496124, "kl": 0.0732421875, "learning_rate": 6.13e-07, "loss": 0.0029, "reward": 1.6510417461395264, "reward_std": 0.2413104921579361, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6510416269302368, "step": 1161 }, { "completion_length": 95.15625, "epoch": 0.7746666666666666, "grad_norm": 1.5218524592285545, "kl": 0.083984375, "learning_rate": 6.126666666666667e-07, "loss": 0.0034, "reward": 1.9296875, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 1162 }, { "completion_length": 115.34375, "epoch": 0.7753333333333333, "grad_norm": 2.416767989658881, "kl": 0.05517578125, "learning_rate": 6.123333333333332e-07, "loss": 0.0022, "reward": 1.6848958730697632, "reward_std": 0.16071045398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6848958134651184, "step": 1163 }, { "completion_length": 101.0625, "epoch": 0.776, "grad_norm": 4.467329627609828, "kl": 0.0771484375, "learning_rate": 6.119999999999999e-07, "loss": 0.0031, "reward": 1.8020833730697632, "reward_std": 0.031039537861943245, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 1164 }, { "completion_length": 97.34375, "epoch": 0.7766666666666666, "grad_norm": 0.16749804055397477, "kl": 0.0546875, "learning_rate": 6.116666666666667e-07, "loss": 0.0022, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1165 }, { "completion_length": 85.125, "epoch": 0.7773333333333333, "grad_norm": 1.0263908524241179, "kl": 0.047119140625, "learning_rate": 6.113333333333333e-07, "loss": 0.0019, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1166 }, { "completion_length": 96.09375, "epoch": 0.778, "grad_norm": 1.5132621240699589, "kl": 0.060302734375, "learning_rate": 6.11e-07, "loss": 0.0024, "reward": 1.7708333730697632, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8020833134651184, "step": 1167 }, { "completion_length": 105.28125, "epoch": 0.7786666666666666, "grad_norm": 2.9372520298112246, "kl": 0.0771484375, "learning_rate": 6.106666666666666e-07, "loss": 0.0031, "reward": 1.725000023841858, "reward_std": 0.1639954000711441, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7562500238418579, "step": 1168 }, { "completion_length": 109.875, "epoch": 0.7793333333333333, "grad_norm": 5.500088079456028, "kl": 0.09716796875, "learning_rate": 6.103333333333333e-07, "loss": 0.0039, "reward": 1.5360863208770752, "reward_std": 0.1119513213634491, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5360863208770752, "step": 1169 }, { "completion_length": 104.5625, "epoch": 0.78, "grad_norm": 9.92709123483477, "kl": 0.0654296875, "learning_rate": 6.1e-07, "loss": 0.0026, "reward": 1.7755208015441895, "reward_std": 0.22871257364749908, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7755208611488342, "step": 1170 }, { "completion_length": 98.625, "epoch": 0.7806666666666666, "grad_norm": 2.4268999496910624, "kl": 0.04931640625, "learning_rate": 6.096666666666667e-07, "loss": 0.002, "reward": 1.868749976158142, "reward_std": 0.15966877341270447, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8999999761581421, "step": 1171 }, { "completion_length": 99.4375, "epoch": 0.7813333333333333, "grad_norm": 4.8059549045828245, "kl": 0.0849609375, "learning_rate": 6.093333333333332e-07, "loss": 0.0034, "reward": 1.771875023841858, "reward_std": 0.23780955374240875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8031250238418579, "step": 1172 }, { "completion_length": 85.125, "epoch": 0.782, "grad_norm": 2.630222184037779, "kl": 0.150390625, "learning_rate": 6.089999999999999e-07, "loss": 0.006, "reward": 1.90625, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9375, "step": 1173 }, { "completion_length": 91.75, "epoch": 0.7826666666666666, "grad_norm": 0.22656001945964185, "kl": 0.09033203125, "learning_rate": 6.086666666666667e-07, "loss": 0.0036, "reward": 1.9166667461395264, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9166666269302368, "step": 1174 }, { "completion_length": 117.96875, "epoch": 0.7833333333333333, "grad_norm": 3.5498740240407143, "kl": 0.15625, "learning_rate": 6.083333333333333e-07, "loss": 0.0062, "reward": 1.6875, "reward_std": 0.26933756470680237, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.71875, "step": 1175 }, { "completion_length": 109.875, "epoch": 0.784, "grad_norm": 6.779515559770789, "kl": 0.0654296875, "learning_rate": 6.079999999999999e-07, "loss": 0.0026, "reward": 1.5567708015441895, "reward_std": 0.07663612812757492, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5567708611488342, "step": 1176 }, { "completion_length": 99.0625, "epoch": 0.7846666666666666, "grad_norm": 5.261521679093019, "kl": 0.07861328125, "learning_rate": 6.076666666666666e-07, "loss": 0.0031, "reward": 1.6770833730697632, "reward_std": 0.1041666567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6770833730697632, "step": 1177 }, { "completion_length": 96.875, "epoch": 0.7853333333333333, "grad_norm": 2.263823491561841, "kl": 0.055419921875, "learning_rate": 6.073333333333333e-07, "loss": 0.0022, "reward": 1.8958333730697632, "reward_std": 0.1555021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333134651184, "step": 1178 }, { "completion_length": 102.03125, "epoch": 0.786, "grad_norm": 1.9003922994050717, "kl": 0.056396484375, "learning_rate": 6.07e-07, "loss": 0.0023, "reward": 1.796875, "reward_std": 0.010416666977107525, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7968750596046448, "step": 1179 }, { "completion_length": 97.46875, "epoch": 0.7866666666666666, "grad_norm": 6.565799896471397, "kl": 0.0576171875, "learning_rate": 6.066666666666666e-07, "loss": 0.0023, "reward": 1.853124976158142, "reward_std": 0.1370512694120407, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8531249761581421, "step": 1180 }, { "completion_length": 102.75, "epoch": 0.7873333333333333, "grad_norm": 5.387385529945051, "kl": 0.0849609375, "learning_rate": 6.063333333333332e-07, "loss": 0.0034, "reward": 1.7447917461395264, "reward_std": 0.13055890798568726, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7447916269302368, "step": 1181 }, { "completion_length": 90.875, "epoch": 0.788, "grad_norm": 1.995888121759287, "kl": 0.0751953125, "learning_rate": 6.06e-07, "loss": 0.003, "reward": 1.890625, "reward_std": 0.010416661389172077, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.890625, "step": 1182 }, { "completion_length": 96.0, "epoch": 0.7886666666666666, "grad_norm": 4.141885954020584, "kl": 0.12451171875, "learning_rate": 6.056666666666667e-07, "loss": 0.005, "reward": 1.6515624523162842, "reward_std": 0.11568933725357056, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.651562511920929, "step": 1183 }, { "completion_length": 96.40625, "epoch": 0.7893333333333333, "grad_norm": 4.518677458114242, "kl": 0.10888671875, "learning_rate": 6.053333333333332e-07, "loss": 0.0044, "reward": 1.7526042461395264, "reward_std": 0.14647451043128967, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7838541269302368, "step": 1184 }, { "completion_length": 93.96875, "epoch": 0.79, "grad_norm": 2.90420911905763, "kl": 0.060546875, "learning_rate": 6.049999999999999e-07, "loss": 0.0024, "reward": 1.5812499523162842, "reward_std": 0.15966878831386566, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.581250011920929, "step": 1185 }, { "completion_length": 87.125, "epoch": 0.7906666666666666, "grad_norm": 4.169226591977732, "kl": 0.057373046875, "learning_rate": 6.046666666666667e-07, "loss": 0.0023, "reward": 1.7468750476837158, "reward_std": 0.126811683177948, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7468750476837158, "step": 1186 }, { "completion_length": 91.78125, "epoch": 0.7913333333333333, "grad_norm": 2.326293736959065, "kl": 0.083984375, "learning_rate": 6.043333333333333e-07, "loss": 0.0034, "reward": 1.84375, "reward_std": 0.16108438372612, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 1187 }, { "completion_length": 98.34375, "epoch": 0.792, "grad_norm": 2.3143451085420623, "kl": 0.0732421875, "learning_rate": 6.04e-07, "loss": 0.0029, "reward": 1.837499976158142, "reward_std": 0.1586044579744339, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8687499761581421, "step": 1188 }, { "completion_length": 97.75, "epoch": 0.7926666666666666, "grad_norm": 4.15332750508888, "kl": 0.10546875, "learning_rate": 6.036666666666666e-07, "loss": 0.0042, "reward": 1.693973183631897, "reward_std": 0.26534923911094666, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.725223183631897, "step": 1189 }, { "completion_length": 105.09375, "epoch": 0.7933333333333333, "grad_norm": 2.9317116237284244, "kl": 0.06396484375, "learning_rate": 6.033333333333333e-07, "loss": 0.0026, "reward": 1.7239583730697632, "reward_std": 0.16979670524597168, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7239583730697632, "step": 1190 }, { "completion_length": 98.125, "epoch": 0.794, "grad_norm": 1.9860971080634613, "kl": 0.06689453125, "learning_rate": 6.03e-07, "loss": 0.0027, "reward": 1.7838542461395264, "reward_std": 0.0918872058391571, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7838541865348816, "step": 1191 }, { "completion_length": 101.1875, "epoch": 0.7946666666666666, "grad_norm": 1.8051273259288312, "kl": 0.0556640625, "learning_rate": 6.026666666666667e-07, "loss": 0.0022, "reward": 1.7720832824707031, "reward_std": 0.17971301078796387, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8033333420753479, "step": 1192 }, { "completion_length": 94.75, "epoch": 0.7953333333333333, "grad_norm": 1.2276580282288707, "kl": 0.07470703125, "learning_rate": 6.023333333333333e-07, "loss": 0.003, "reward": 1.8177083730697632, "reward_std": 0.010416671633720398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083730697632, "step": 1193 }, { "completion_length": 97.40625, "epoch": 0.796, "grad_norm": 1.5273703397230398, "kl": 0.0927734375, "learning_rate": 6.019999999999999e-07, "loss": 0.0037, "reward": 1.9187500476837158, "reward_std": 0.08291241526603699, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9187500476837158, "step": 1194 }, { "completion_length": 101.46875, "epoch": 0.7966666666666666, "grad_norm": 6.8441685048404795, "kl": 0.140625, "learning_rate": 6.016666666666667e-07, "loss": 0.0056, "reward": 1.761458396911621, "reward_std": 0.2340402603149414, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7614583373069763, "step": 1195 }, { "completion_length": 98.375, "epoch": 0.7973333333333333, "grad_norm": 1.5279169182699095, "kl": 0.055908203125, "learning_rate": 6.013333333333334e-07, "loss": 0.0022, "reward": 1.7446428537368774, "reward_std": 0.0714285746216774, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7758928537368774, "step": 1196 }, { "completion_length": 104.53125, "epoch": 0.798, "grad_norm": 2.840988501033963, "kl": 0.06982421875, "learning_rate": 6.009999999999999e-07, "loss": 0.0028, "reward": 1.71875, "reward_std": 0.31684717535972595, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.78125, "step": 1197 }, { "completion_length": 101.3125, "epoch": 0.7986666666666666, "grad_norm": 2.0141736387439932, "kl": 0.06396484375, "learning_rate": 6.006666666666666e-07, "loss": 0.0026, "reward": 1.759300708770752, "reward_std": 0.07560735195875168, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7593006491661072, "step": 1198 }, { "completion_length": 101.78125, "epoch": 0.7993333333333333, "grad_norm": 17.24877579644996, "kl": 0.09423828125, "learning_rate": 6.003333333333334e-07, "loss": 0.0038, "reward": 1.7561756372451782, "reward_std": 0.1465774029493332, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7874256372451782, "step": 1199 }, { "completion_length": 94.28125, "epoch": 0.8, "grad_norm": 7.449108569585182, "kl": 0.091796875, "learning_rate": 6e-07, "loss": 0.0037, "reward": 1.5229167938232422, "reward_std": 0.12916666269302368, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5229166746139526, "step": 1200 }, { "completion_length": 84.1875, "epoch": 0.8006666666666666, "grad_norm": 3.534310320062781, "kl": 0.08251953125, "learning_rate": 5.996666666666666e-07, "loss": 0.0033, "reward": 1.78125, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 1201 }, { "completion_length": 112.125, "epoch": 0.8013333333333333, "grad_norm": 5.165117321025068, "kl": 0.0859375, "learning_rate": 5.993333333333333e-07, "loss": 0.0034, "reward": 1.6005208492279053, "reward_std": 0.13590984046459198, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6005208492279053, "step": 1202 }, { "completion_length": 105.34375, "epoch": 0.802, "grad_norm": 1.553866822467803, "kl": 0.052978515625, "learning_rate": 5.989999999999999e-07, "loss": 0.0021, "reward": 1.84375, "reward_std": 0.25, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.90625, "step": 1203 }, { "completion_length": 98.9375, "epoch": 0.8026666666666666, "grad_norm": 4.8019995626662535, "kl": 0.05908203125, "learning_rate": 5.986666666666667e-07, "loss": 0.0024, "reward": 1.633333444595337, "reward_std": 0.10894809663295746, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6333333253860474, "step": 1204 }, { "completion_length": 105.90625, "epoch": 0.8033333333333333, "grad_norm": 6.453824015873036, "kl": 0.06005859375, "learning_rate": 5.983333333333334e-07, "loss": 0.0024, "reward": 1.714583396911621, "reward_std": 0.20966877043247223, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7458333969116211, "step": 1205 }, { "completion_length": 103.03125, "epoch": 0.804, "grad_norm": 2.1909841738839333, "kl": 0.056396484375, "learning_rate": 5.979999999999999e-07, "loss": 0.0023, "reward": 1.9395833015441895, "reward_std": 0.023935679346323013, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9395833015441895, "step": 1206 }, { "completion_length": 104.84375, "epoch": 0.8046666666666666, "grad_norm": 2.9631273581276063, "kl": 0.0517578125, "learning_rate": 5.976666666666666e-07, "loss": 0.0021, "reward": 1.732812523841858, "reward_std": 0.1383022665977478, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7328125238418579, "step": 1207 }, { "completion_length": 87.78125, "epoch": 0.8053333333333333, "grad_norm": 1.133785995751624, "kl": 0.07080078125, "learning_rate": 5.973333333333334e-07, "loss": 0.0028, "reward": 1.8645833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645833134651184, "step": 1208 }, { "completion_length": 108.46875, "epoch": 0.806, "grad_norm": 2.974507726926626, "kl": 0.0869140625, "learning_rate": 5.97e-07, "loss": 0.0035, "reward": 1.7312500476837158, "reward_std": 0.21331222355365753, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7625000476837158, "step": 1209 }, { "completion_length": 110.21875, "epoch": 0.8066666666666666, "grad_norm": 1.4914991155806776, "kl": 0.095703125, "learning_rate": 5.966666666666666e-07, "loss": 0.0038, "reward": 1.9583333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333134651184, "step": 1210 }, { "completion_length": 99.375, "epoch": 0.8073333333333333, "grad_norm": 3.541926356893117, "kl": 0.12353515625, "learning_rate": 5.963333333333333e-07, "loss": 0.0049, "reward": 1.8499999046325684, "reward_std": 0.09541241824626923, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8499999642372131, "step": 1211 }, { "completion_length": 95.6875, "epoch": 0.808, "grad_norm": 3.653289653841634, "kl": 0.048583984375, "learning_rate": 5.96e-07, "loss": 0.0019, "reward": 1.7921874523162842, "reward_std": 0.20753690600395203, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.792187511920929, "step": 1212 }, { "completion_length": 110.90625, "epoch": 0.8086666666666666, "grad_norm": 2.354310248801495, "kl": 0.0517578125, "learning_rate": 5.956666666666667e-07, "loss": 0.0021, "reward": 1.6848958730697632, "reward_std": 0.16071045398712158, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6848958134651184, "step": 1213 }, { "completion_length": 102.75, "epoch": 0.8093333333333333, "grad_norm": 1.8301328934748022, "kl": 0.06494140625, "learning_rate": 5.953333333333333e-07, "loss": 0.0026, "reward": 1.9119791984558105, "reward_std": 0.08437499403953552, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9119791984558105, "step": 1214 }, { "completion_length": 91.21875, "epoch": 0.81, "grad_norm": 2.2813898690702565, "kl": 0.06494140625, "learning_rate": 5.949999999999999e-07, "loss": 0.0026, "reward": 1.90625, "reward_std": 0.1875, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.96875, "step": 1215 }, { "completion_length": 111.4375, "epoch": 0.8106666666666666, "grad_norm": 2.7134643805762755, "kl": 0.0771484375, "learning_rate": 5.946666666666667e-07, "loss": 0.0031, "reward": 1.6177083253860474, "reward_std": 0.13775646686553955, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6177083253860474, "step": 1216 }, { "completion_length": 124.875, "epoch": 0.8113333333333334, "grad_norm": 3.3796173642746314, "kl": 0.048828125, "learning_rate": 5.943333333333334e-07, "loss": 0.002, "reward": 1.7197916507720947, "reward_std": 0.19845297932624817, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7510416507720947, "step": 1217 }, { "completion_length": 101.71875, "epoch": 0.812, "grad_norm": 5.383346525292572, "kl": 0.0615234375, "learning_rate": 5.939999999999999e-07, "loss": 0.0025, "reward": 1.6339285373687744, "reward_std": 0.24100764095783234, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6339285969734192, "step": 1218 }, { "completion_length": 114.0625, "epoch": 0.8126666666666666, "grad_norm": 6.199360202904313, "kl": 0.05712890625, "learning_rate": 5.936666666666666e-07, "loss": 0.0023, "reward": 1.9505208730697632, "reward_std": 0.08397450298070908, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9505208730697632, "step": 1219 }, { "completion_length": 108.46875, "epoch": 0.8133333333333334, "grad_norm": 1.847065414493251, "kl": 0.059814453125, "learning_rate": 5.933333333333334e-07, "loss": 0.0024, "reward": 1.8828125, "reward_std": 0.203125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9140625, "step": 1220 }, { "completion_length": 102.375, "epoch": 0.814, "grad_norm": 3.6801673120365974, "kl": 0.076171875, "learning_rate": 5.93e-07, "loss": 0.003, "reward": 1.917708396911621, "reward_std": 0.010825316421687603, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9177083969116211, "step": 1221 }, { "completion_length": 114.1875, "epoch": 0.8146666666666667, "grad_norm": 2.7590831015824495, "kl": 0.0634765625, "learning_rate": 5.926666666666667e-07, "loss": 0.0025, "reward": 1.792708396911621, "reward_std": 0.16403505206108093, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7927083969116211, "step": 1222 }, { "completion_length": 110.5625, "epoch": 0.8153333333333334, "grad_norm": 1.2547165296342944, "kl": 0.04296875, "learning_rate": 5.923333333333333e-07, "loss": 0.0017, "reward": 1.75, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1223 }, { "completion_length": 98.125, "epoch": 0.816, "grad_norm": 11.424967048740642, "kl": 0.08447265625, "learning_rate": 5.919999999999999e-07, "loss": 0.0034, "reward": 1.9427083730697632, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083134651184, "step": 1224 }, { "completion_length": 101.90625, "epoch": 0.8166666666666667, "grad_norm": 4.275975581941719, "kl": 0.0654296875, "learning_rate": 5.916666666666667e-07, "loss": 0.0026, "reward": 1.6817708015441895, "reward_std": 0.19062499701976776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6817708611488342, "step": 1225 }, { "completion_length": 99.1875, "epoch": 0.8173333333333334, "grad_norm": 3.229368450716725, "kl": 0.06494140625, "learning_rate": 5.913333333333334e-07, "loss": 0.0026, "reward": 1.8390624523162842, "reward_std": 0.009375001303851604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.839062511920929, "step": 1226 }, { "completion_length": 102.4375, "epoch": 0.818, "grad_norm": 7.069089893789342, "kl": 0.06982421875, "learning_rate": 5.909999999999999e-07, "loss": 0.0028, "reward": 1.5958333015441895, "reward_std": 0.25824224948883057, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5958333611488342, "step": 1227 }, { "completion_length": 110.625, "epoch": 0.8186666666666667, "grad_norm": 3.7818621113659954, "kl": 0.06494140625, "learning_rate": 5.906666666666666e-07, "loss": 0.0026, "reward": 1.701562523841858, "reward_std": 0.17812499403953552, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7015625238418579, "step": 1228 }, { "completion_length": 98.90625, "epoch": 0.8193333333333334, "grad_norm": 1.8413663956966009, "kl": 0.049072265625, "learning_rate": 5.903333333333334e-07, "loss": 0.002, "reward": 1.6354167461395264, "reward_std": 0.08655625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6354166865348816, "step": 1229 }, { "completion_length": 104.75, "epoch": 0.82, "grad_norm": 4.2400128791314735, "kl": 0.06494140625, "learning_rate": 5.9e-07, "loss": 0.0026, "reward": 1.6729166507720947, "reward_std": 0.19439370930194855, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6729166507720947, "step": 1230 }, { "completion_length": 94.96875, "epoch": 0.8206666666666667, "grad_norm": 4.993451742894648, "kl": 0.07470703125, "learning_rate": 5.896666666666666e-07, "loss": 0.003, "reward": 1.7873512506484985, "reward_std": 0.019128823652863503, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7873512506484985, "step": 1231 }, { "completion_length": 109.75, "epoch": 0.8213333333333334, "grad_norm": 2.364595710352995, "kl": 0.0576171875, "learning_rate": 5.893333333333333e-07, "loss": 0.0023, "reward": 1.53125, "reward_std": 0.151679128408432, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.53125, "step": 1232 }, { "completion_length": 113.25, "epoch": 0.822, "grad_norm": 3.1325228260337576, "kl": 0.10595703125, "learning_rate": 5.89e-07, "loss": 0.0042, "reward": 1.746837854385376, "reward_std": 0.2400388866662979, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7780877947807312, "step": 1233 }, { "completion_length": 94.90625, "epoch": 0.8226666666666667, "grad_norm": 10.769334371113688, "kl": 0.07958984375, "learning_rate": 5.886666666666667e-07, "loss": 0.0032, "reward": 1.6927083730697632, "reward_std": 0.17045171558856964, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6927083730697632, "step": 1234 }, { "completion_length": 125.8125, "epoch": 0.8233333333333334, "grad_norm": 2.524519290005704, "kl": 0.047119140625, "learning_rate": 5.883333333333333e-07, "loss": 0.0019, "reward": 1.4708333015441895, "reward_std": 0.14723435044288635, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5020833015441895, "step": 1235 }, { "completion_length": 108.96875, "epoch": 0.824, "grad_norm": 1.9144340515546612, "kl": 0.0576171875, "learning_rate": 5.879999999999999e-07, "loss": 0.0023, "reward": 1.649999976158142, "reward_std": 0.2193375676870346, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6499999761581421, "step": 1236 }, { "completion_length": 102.71875, "epoch": 0.8246666666666667, "grad_norm": 207.835054578787, "kl": 0.076171875, "learning_rate": 5.876666666666666e-07, "loss": 0.003, "reward": 1.953125, "reward_std": 0.07613958418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.953125, "step": 1237 }, { "completion_length": 100.5, "epoch": 0.8253333333333334, "grad_norm": 0.2575159581176103, "kl": 0.09521484375, "learning_rate": 5.873333333333334e-07, "loss": 0.0038, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 1238 }, { "completion_length": 104.65625, "epoch": 0.826, "grad_norm": 4.3134095841662985, "kl": 0.08203125, "learning_rate": 5.87e-07, "loss": 0.0033, "reward": 1.8302083015441895, "reward_std": 0.18980032205581665, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8302083611488342, "step": 1239 }, { "completion_length": 109.21875, "epoch": 0.8266666666666667, "grad_norm": 2.631613819016054, "kl": 0.07666015625, "learning_rate": 5.866666666666666e-07, "loss": 0.0031, "reward": 1.9130208492279053, "reward_std": 0.043434299528598785, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9130208492279053, "step": 1240 }, { "completion_length": 97.125, "epoch": 0.8273333333333334, "grad_norm": 1.5921595090289178, "kl": 0.0712890625, "learning_rate": 5.863333333333333e-07, "loss": 0.0029, "reward": 1.75, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 1241 }, { "completion_length": 107.6875, "epoch": 0.828, "grad_norm": 2.168346150694175, "kl": 0.060546875, "learning_rate": 5.86e-07, "loss": 0.0024, "reward": 1.9322917461395264, "reward_std": 0.03447292745113373, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322916269302368, "step": 1242 }, { "completion_length": 102.875, "epoch": 0.8286666666666667, "grad_norm": 3.782073332399985, "kl": 0.07373046875, "learning_rate": 5.856666666666667e-07, "loss": 0.003, "reward": 1.6229166984558105, "reward_std": 0.17315879464149475, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6229166984558105, "step": 1243 }, { "completion_length": 103.125, "epoch": 0.8293333333333334, "grad_norm": 19.96035351239073, "kl": 0.08056640625, "learning_rate": 5.853333333333333e-07, "loss": 0.0032, "reward": 1.783333420753479, "reward_std": 0.21027246117591858, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8145833015441895, "step": 1244 }, { "completion_length": 102.28125, "epoch": 0.83, "grad_norm": 4.912569032859135, "kl": 0.05712890625, "learning_rate": 5.849999999999999e-07, "loss": 0.0023, "reward": 1.5839285850524902, "reward_std": 0.05734420567750931, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5839285850524902, "step": 1245 }, { "completion_length": 95.3125, "epoch": 0.8306666666666667, "grad_norm": 3.1493029652577627, "kl": 0.07470703125, "learning_rate": 5.846666666666667e-07, "loss": 0.003, "reward": 1.734375, "reward_std": 0.15271097421646118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 1246 }, { "completion_length": 110.15625, "epoch": 0.8313333333333334, "grad_norm": 8.745267641504523, "kl": 0.059814453125, "learning_rate": 5.843333333333334e-07, "loss": 0.0024, "reward": 1.779036521911621, "reward_std": 0.22249168157577515, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8102865219116211, "step": 1247 }, { "completion_length": 112.1875, "epoch": 0.832, "grad_norm": 3.2525138604898247, "kl": 0.07080078125, "learning_rate": 5.839999999999999e-07, "loss": 0.0028, "reward": 1.8203125, "reward_std": 0.195602148771286, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203125, "step": 1248 }, { "completion_length": 102.40625, "epoch": 0.8326666666666667, "grad_norm": 1.8655330801508312, "kl": 0.0712890625, "learning_rate": 5.836666666666666e-07, "loss": 0.0029, "reward": 1.7395833730697632, "reward_std": 0.0624999962747097, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 1249 }, { "completion_length": 106.25, "epoch": 0.8333333333333334, "grad_norm": 3.94500291509454, "kl": 0.08544921875, "learning_rate": 5.833333333333334e-07, "loss": 0.0034, "reward": 1.8286458253860474, "reward_std": 0.04366099089384079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8286458253860474, "step": 1250 }, { "completion_length": 104.125, "epoch": 0.834, "grad_norm": 2.7599240674789263, "kl": 0.08154296875, "learning_rate": 5.83e-07, "loss": 0.0033, "reward": 1.8333333730697632, "reward_std": 0.15576279163360596, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8645833730697632, "step": 1251 }, { "completion_length": 95.5, "epoch": 0.8346666666666667, "grad_norm": 4.0396983443084595, "kl": 0.054931640625, "learning_rate": 5.826666666666666e-07, "loss": 0.0022, "reward": 1.8214285373687744, "reward_std": 0.17824122309684753, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8214285373687744, "step": 1252 }, { "completion_length": 102.5625, "epoch": 0.8353333333333334, "grad_norm": 3.1132557857274503, "kl": 0.058349609375, "learning_rate": 5.823333333333333e-07, "loss": 0.0023, "reward": 1.59375, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.59375, "step": 1253 }, { "completion_length": 100.5625, "epoch": 0.836, "grad_norm": 0.8864224915339075, "kl": 0.08349609375, "learning_rate": 5.819999999999999e-07, "loss": 0.0033, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1254 }, { "completion_length": 92.3125, "epoch": 0.8366666666666667, "grad_norm": 1.7035159694353166, "kl": 0.08251953125, "learning_rate": 5.816666666666667e-07, "loss": 0.0033, "reward": 1.984375, "reward_std": 0.018042195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.984375, "step": 1255 }, { "completion_length": 110.0, "epoch": 0.8373333333333334, "grad_norm": 1.6345459041428099, "kl": 0.0634765625, "learning_rate": 5.813333333333334e-07, "loss": 0.0025, "reward": 1.875, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1256 }, { "completion_length": 99.46875, "epoch": 0.838, "grad_norm": 3.257246882748313, "kl": 0.0439453125, "learning_rate": 5.809999999999999e-07, "loss": 0.0018, "reward": 1.6813616752624512, "reward_std": 0.2605289816856384, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6813616156578064, "step": 1257 }, { "completion_length": 100.40625, "epoch": 0.8386666666666667, "grad_norm": 1.673788979550631, "kl": 0.06884765625, "learning_rate": 5.806666666666666e-07, "loss": 0.0028, "reward": 1.7872023582458496, "reward_std": 0.069373220205307, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7872023582458496, "step": 1258 }, { "completion_length": 112.21875, "epoch": 0.8393333333333334, "grad_norm": 15.564141196142897, "kl": 0.09375, "learning_rate": 5.803333333333334e-07, "loss": 0.0037, "reward": 1.646875023841858, "reward_std": 0.30665940046310425, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6781250238418579, "step": 1259 }, { "completion_length": 105.65625, "epoch": 0.84, "grad_norm": 2.2703499044198714, "kl": 0.04443359375, "learning_rate": 5.8e-07, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.012028136290609837, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9062500596046448, "step": 1260 }, { "completion_length": 100.3125, "epoch": 0.8406666666666667, "grad_norm": 2.001187663406032, "kl": 0.059814453125, "learning_rate": 5.796666666666666e-07, "loss": 0.0024, "reward": 1.9708333015441895, "reward_std": 0.008333325386047363, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9708333015441895, "step": 1261 }, { "completion_length": 103.8125, "epoch": 0.8413333333333334, "grad_norm": 2.361428556867434, "kl": 0.06494140625, "learning_rate": 5.793333333333333e-07, "loss": 0.0026, "reward": 1.8958333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333134651184, "step": 1262 }, { "completion_length": 101.375, "epoch": 0.842, "grad_norm": 2.7914384302744595, "kl": 0.0634765625, "learning_rate": 5.79e-07, "loss": 0.0025, "reward": 1.857812523841858, "reward_std": 0.01354166865348816, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8578124642372131, "step": 1263 }, { "completion_length": 103.59375, "epoch": 0.8426666666666667, "grad_norm": 5.549002341316093, "kl": 0.05419921875, "learning_rate": 5.786666666666667e-07, "loss": 0.0022, "reward": 1.890625, "reward_std": 0.16591878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.921875, "step": 1264 }, { "completion_length": 95.5, "epoch": 0.8433333333333334, "grad_norm": 2.5334474844960324, "kl": 0.10400390625, "learning_rate": 5.783333333333333e-07, "loss": 0.0042, "reward": 1.8125, "reward_std": 0.32216876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 1265 }, { "completion_length": 97.0, "epoch": 0.844, "grad_norm": 2.2012045194239516, "kl": 0.08154296875, "learning_rate": 5.779999999999999e-07, "loss": 0.0033, "reward": 1.7916667461395264, "reward_std": 0.22122503817081451, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8229166269302368, "step": 1266 }, { "completion_length": 104.4375, "epoch": 0.8446666666666667, "grad_norm": 1.6425614030413467, "kl": 0.060546875, "learning_rate": 5.776666666666666e-07, "loss": 0.0024, "reward": 1.8645833730697632, "reward_std": 0.12289540469646454, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8958333730697632, "step": 1267 }, { "completion_length": 117.03125, "epoch": 0.8453333333333334, "grad_norm": 2.453754538534212, "kl": 0.07080078125, "learning_rate": 5.773333333333334e-07, "loss": 0.0028, "reward": 1.8098958730697632, "reward_std": 0.10685735195875168, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8098958730697632, "step": 1268 }, { "completion_length": 95.65625, "epoch": 0.846, "grad_norm": 3.5933938089243447, "kl": 0.07470703125, "learning_rate": 5.769999999999999e-07, "loss": 0.003, "reward": 1.7657737731933594, "reward_std": 0.13775409758090973, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7657738327980042, "step": 1269 }, { "completion_length": 112.125, "epoch": 0.8466666666666667, "grad_norm": 1.9764098632238087, "kl": 0.0712890625, "learning_rate": 5.766666666666666e-07, "loss": 0.0029, "reward": 1.7890625, "reward_std": 0.0989583358168602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7890625, "step": 1270 }, { "completion_length": 99.4375, "epoch": 0.8473333333333334, "grad_norm": 4.406249985667551, "kl": 0.07275390625, "learning_rate": 5.763333333333333e-07, "loss": 0.0029, "reward": 1.7890625, "reward_std": 0.18263056874275208, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7890625, "step": 1271 }, { "completion_length": 98.28125, "epoch": 0.848, "grad_norm": 38.28433088939034, "kl": 0.107421875, "learning_rate": 5.76e-07, "loss": 0.0043, "reward": 1.800520896911621, "reward_std": 0.22205547988414764, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8317708373069763, "step": 1272 }, { "completion_length": 89.90625, "epoch": 0.8486666666666667, "grad_norm": 3.3421014607795683, "kl": 0.0673828125, "learning_rate": 5.756666666666666e-07, "loss": 0.0027, "reward": 1.7078125476837158, "reward_std": 0.12462606281042099, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.707812488079071, "step": 1273 }, { "completion_length": 106.65625, "epoch": 0.8493333333333334, "grad_norm": 26.279925498019363, "kl": 0.057861328125, "learning_rate": 5.753333333333333e-07, "loss": 0.0023, "reward": 1.6901042461395264, "reward_std": 0.14381834864616394, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6901041865348816, "step": 1274 }, { "completion_length": 98.75, "epoch": 0.85, "grad_norm": 1.5330370606797195, "kl": 0.0693359375, "learning_rate": 5.749999999999999e-07, "loss": 0.0028, "reward": 1.850000023841858, "reward_std": 0.18217839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8812500238418579, "step": 1275 }, { "completion_length": 95.59375, "epoch": 0.8506666666666667, "grad_norm": 3.112666575344946, "kl": 0.080078125, "learning_rate": 5.746666666666667e-07, "loss": 0.0032, "reward": 1.7729166746139526, "reward_std": 0.04999999329447746, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7729166746139526, "step": 1276 }, { "completion_length": 102.9375, "epoch": 0.8513333333333334, "grad_norm": 1.8270646765633598, "kl": 0.07763671875, "learning_rate": 5.743333333333334e-07, "loss": 0.0031, "reward": 1.8854167461395264, "reward_std": 0.08419691771268845, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8854166269302368, "step": 1277 }, { "completion_length": 90.96875, "epoch": 0.852, "grad_norm": 0.7890873817077493, "kl": 0.05126953125, "learning_rate": 5.739999999999999e-07, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1278 }, { "completion_length": 97.0625, "epoch": 0.8526666666666667, "grad_norm": 1.0023521305833754, "kl": 0.052490234375, "learning_rate": 5.736666666666666e-07, "loss": 0.0021, "reward": 1.9270833730697632, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9270833730697632, "step": 1279 }, { "completion_length": 120.9375, "epoch": 0.8533333333333334, "grad_norm": 8.725168262870303, "kl": 0.04541015625, "learning_rate": 5.733333333333334e-07, "loss": 0.0018, "reward": 1.6979167461395264, "reward_std": 0.1951255202293396, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6979166269302368, "step": 1280 }, { "completion_length": 101.375, "epoch": 0.854, "grad_norm": 3.887632208672108, "kl": 0.08203125, "learning_rate": 5.73e-07, "loss": 0.0033, "reward": 1.7630952596664429, "reward_std": 0.18777546286582947, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7630953192710876, "step": 1281 }, { "completion_length": 91.90625, "epoch": 0.8546666666666667, "grad_norm": 2.2542470407035453, "kl": 0.0537109375, "learning_rate": 5.726666666666666e-07, "loss": 0.0021, "reward": 1.90625, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9375, "step": 1282 }, { "completion_length": 100.0625, "epoch": 0.8553333333333333, "grad_norm": 3.5477061965400996, "kl": 0.06982421875, "learning_rate": 5.723333333333333e-07, "loss": 0.0028, "reward": 1.7447917461395264, "reward_std": 0.09005148708820343, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7447917461395264, "step": 1283 }, { "completion_length": 111.0, "epoch": 0.856, "grad_norm": 2.916331252575323, "kl": 0.04345703125, "learning_rate": 5.719999999999999e-07, "loss": 0.0017, "reward": 1.8359375, "reward_std": 0.1510416716337204, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8671874403953552, "step": 1284 }, { "completion_length": 99.59375, "epoch": 0.8566666666666667, "grad_norm": 2.9301247535146735, "kl": 0.076171875, "learning_rate": 5.716666666666667e-07, "loss": 0.003, "reward": 1.9375, "reward_std": 0.08537659049034119, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1285 }, { "completion_length": 99.46875, "epoch": 0.8573333333333333, "grad_norm": 7.689659204187606, "kl": 0.0654296875, "learning_rate": 5.713333333333333e-07, "loss": 0.0026, "reward": 1.7494791746139526, "reward_std": 0.1705901026725769, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7494792342185974, "step": 1286 }, { "completion_length": 97.90625, "epoch": 0.858, "grad_norm": 4.907591751363132, "kl": 0.11474609375, "learning_rate": 5.709999999999999e-07, "loss": 0.0046, "reward": 1.8802083730697632, "reward_std": 0.11009378731250763, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8802083134651184, "step": 1287 }, { "completion_length": 91.71875, "epoch": 0.8586666666666667, "grad_norm": 2.6606692535783805, "kl": 0.055908203125, "learning_rate": 5.706666666666666e-07, "loss": 0.0022, "reward": 1.859375, "reward_std": 0.1770833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 1288 }, { "completion_length": 97.25, "epoch": 0.8593333333333333, "grad_norm": 1.6092704483789395, "kl": 0.08740234375, "learning_rate": 5.703333333333334e-07, "loss": 0.0035, "reward": 1.951562523841858, "reward_std": 0.012884705327451229, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9515625238418579, "step": 1289 }, { "completion_length": 99.21875, "epoch": 0.86, "grad_norm": 2.74270724523421, "kl": 0.05615234375, "learning_rate": 5.699999999999999e-07, "loss": 0.0022, "reward": 1.7208333015441895, "reward_std": 0.027809306979179382, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7208333611488342, "step": 1290 }, { "completion_length": 117.03125, "epoch": 0.8606666666666667, "grad_norm": 1.1412921947852692, "kl": 0.064453125, "learning_rate": 5.696666666666666e-07, "loss": 0.0026, "reward": 1.875, "reward_std": 0.18217839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 1291 }, { "completion_length": 103.1875, "epoch": 0.8613333333333333, "grad_norm": 4.533074251848977, "kl": 0.0615234375, "learning_rate": 5.693333333333333e-07, "loss": 0.0025, "reward": 1.7135417461395264, "reward_std": 0.21371470391750336, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7447916865348816, "step": 1292 }, { "completion_length": 94.28125, "epoch": 0.862, "grad_norm": 2.0946796022020644, "kl": 0.04833984375, "learning_rate": 5.69e-07, "loss": 0.0019, "reward": 1.8723958730697632, "reward_std": 0.0885416641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8723958134651184, "step": 1293 }, { "completion_length": 105.46875, "epoch": 0.8626666666666667, "grad_norm": 4.136354534728849, "kl": 0.06884765625, "learning_rate": 5.686666666666667e-07, "loss": 0.0028, "reward": 1.8687500953674316, "reward_std": 0.11460646241903305, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8687500357627869, "step": 1294 }, { "completion_length": 91.5, "epoch": 0.8633333333333333, "grad_norm": 3.5784530100959167, "kl": 0.04150390625, "learning_rate": 5.683333333333333e-07, "loss": 0.0017, "reward": 1.875, "reward_std": 0.14079803228378296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1295 }, { "completion_length": 97.03125, "epoch": 0.864, "grad_norm": 1.135919797963186, "kl": 0.0673828125, "learning_rate": 5.679999999999999e-07, "loss": 0.0027, "reward": 1.9270833730697632, "reward_std": 0.03608439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9270833730697632, "step": 1296 }, { "completion_length": 113.21875, "epoch": 0.8646666666666667, "grad_norm": 8.408953902333783, "kl": 0.06640625, "learning_rate": 5.676666666666666e-07, "loss": 0.0027, "reward": 1.7999999523162842, "reward_std": 0.26443374156951904, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.831250011920929, "step": 1297 }, { "completion_length": 102.15625, "epoch": 0.8653333333333333, "grad_norm": 1.56258998056477, "kl": 0.057861328125, "learning_rate": 5.673333333333334e-07, "loss": 0.0023, "reward": 1.703125, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.703125, "step": 1298 }, { "completion_length": 110.9375, "epoch": 0.866, "grad_norm": 2.9937559872002164, "kl": 0.0986328125, "learning_rate": 5.669999999999999e-07, "loss": 0.004, "reward": 1.8588541746139526, "reward_std": 0.03968124836683273, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8588541746139526, "step": 1299 }, { "completion_length": 102.71875, "epoch": 0.8666666666666667, "grad_norm": 2.4171074083142043, "kl": 0.0439453125, "learning_rate": 5.666666666666666e-07, "loss": 0.0018, "reward": 1.9114583730697632, "reward_std": 0.1145833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9114583730697632, "step": 1300 }, { "completion_length": 107.875, "epoch": 0.8673333333333333, "grad_norm": 5.512400081827362, "kl": 0.08349609375, "learning_rate": 5.663333333333333e-07, "loss": 0.0033, "reward": 1.7609374523162842, "reward_std": 0.09556056559085846, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7609375715255737, "step": 1301 }, { "completion_length": 103.15625, "epoch": 0.868, "grad_norm": 5.632991116219441, "kl": 0.091796875, "learning_rate": 5.66e-07, "loss": 0.0037, "reward": 1.688020944595337, "reward_std": 0.18142303824424744, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6880208253860474, "step": 1302 }, { "completion_length": 98.6875, "epoch": 0.8686666666666667, "grad_norm": 2.0039106137855143, "kl": 0.064453125, "learning_rate": 5.656666666666666e-07, "loss": 0.0026, "reward": 1.9796874523162842, "reward_std": 0.024672331288456917, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.979687511920929, "step": 1303 }, { "completion_length": 111.3125, "epoch": 0.8693333333333333, "grad_norm": 1.3899514615280648, "kl": 0.044677734375, "learning_rate": 5.653333333333333e-07, "loss": 0.0018, "reward": 1.9328124523162842, "reward_std": 0.02858421579003334, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.932812511920929, "step": 1304 }, { "completion_length": 99.875, "epoch": 0.87, "grad_norm": 3.1802474693552694, "kl": 0.0673828125, "learning_rate": 5.649999999999999e-07, "loss": 0.0027, "reward": 1.881250023841858, "reward_std": 0.07693374902009964, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8812499642372131, "step": 1305 }, { "completion_length": 110.5, "epoch": 0.8706666666666667, "grad_norm": 21.653379929645148, "kl": 0.10888671875, "learning_rate": 5.646666666666667e-07, "loss": 0.0044, "reward": 1.7885416746139526, "reward_std": 0.22406214475631714, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8197916746139526, "step": 1306 }, { "completion_length": 107.15625, "epoch": 0.8713333333333333, "grad_norm": 2.9788490324429624, "kl": 0.05078125, "learning_rate": 5.643333333333333e-07, "loss": 0.002, "reward": 1.6796875, "reward_std": 0.22729694843292236, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7421875, "step": 1307 }, { "completion_length": 97.53125, "epoch": 0.872, "grad_norm": 1.2270776605716045, "kl": 0.07470703125, "learning_rate": 5.639999999999999e-07, "loss": 0.003, "reward": 1.9427083730697632, "reward_std": 0.018042195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083134651184, "step": 1308 }, { "completion_length": 111.25, "epoch": 0.8726666666666667, "grad_norm": 1.121825450654655, "kl": 0.032958984375, "learning_rate": 5.636666666666666e-07, "loss": 0.0013, "reward": 1.65625, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6875, "step": 1309 }, { "completion_length": 107.4375, "epoch": 0.8733333333333333, "grad_norm": 1.674870770500917, "kl": 0.05078125, "learning_rate": 5.633333333333334e-07, "loss": 0.002, "reward": 1.9250000715255737, "reward_std": 0.03465259075164795, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.925000011920929, "step": 1310 }, { "completion_length": 97.9375, "epoch": 0.874, "grad_norm": 3.37236740451422, "kl": 0.055908203125, "learning_rate": 5.629999999999999e-07, "loss": 0.0022, "reward": 1.7786458730697632, "reward_std": 0.07828401029109955, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7786458730697632, "step": 1311 }, { "completion_length": 103.03125, "epoch": 0.8746666666666667, "grad_norm": 2.1318018313275875, "kl": 0.040283203125, "learning_rate": 5.626666666666666e-07, "loss": 0.0016, "reward": 1.8776042461395264, "reward_std": 0.11522451043128967, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8776041865348816, "step": 1312 }, { "completion_length": 98.75, "epoch": 0.8753333333333333, "grad_norm": 2.3116581181439306, "kl": 0.0634765625, "learning_rate": 5.623333333333333e-07, "loss": 0.0025, "reward": 1.9609375, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9609375, "step": 1313 }, { "completion_length": 97.5, "epoch": 0.876, "grad_norm": 1.9285141825001877, "kl": 0.048828125, "learning_rate": 5.620000000000001e-07, "loss": 0.0019, "reward": 1.7946312427520752, "reward_std": 0.1809525489807129, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8258811831474304, "step": 1314 }, { "completion_length": 100.625, "epoch": 0.8766666666666667, "grad_norm": 2.290565234849001, "kl": 0.0732421875, "learning_rate": 5.616666666666667e-07, "loss": 0.0029, "reward": 1.7395833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 1315 }, { "completion_length": 104.09375, "epoch": 0.8773333333333333, "grad_norm": 1.6628147324313471, "kl": 0.064453125, "learning_rate": 5.613333333333333e-07, "loss": 0.0026, "reward": 1.8671875, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8671875, "step": 1316 }, { "completion_length": 102.09375, "epoch": 0.878, "grad_norm": 1.9085354019402525, "kl": 0.057861328125, "learning_rate": 5.61e-07, "loss": 0.0023, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1317 }, { "completion_length": 116.21875, "epoch": 0.8786666666666667, "grad_norm": 2.0266610593756225, "kl": 0.07666015625, "learning_rate": 5.606666666666666e-07, "loss": 0.0031, "reward": 1.84375, "reward_std": 0.15244734287261963, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1318 }, { "completion_length": 99.5, "epoch": 0.8793333333333333, "grad_norm": 5.757921510256819, "kl": 0.0625, "learning_rate": 5.603333333333334e-07, "loss": 0.0025, "reward": 1.610937476158142, "reward_std": 0.02812500298023224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6109374761581421, "step": 1319 }, { "completion_length": 103.5, "epoch": 0.88, "grad_norm": 0.07072082627126106, "kl": 0.060791015625, "learning_rate": 5.6e-07, "loss": 0.0024, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1320 }, { "completion_length": 86.125, "epoch": 0.8806666666666667, "grad_norm": 0.8323365966911984, "kl": 0.06640625, "learning_rate": 5.596666666666666e-07, "loss": 0.0027, "reward": 1.71875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 1321 }, { "completion_length": 111.875, "epoch": 0.8813333333333333, "grad_norm": 2.031698952994187, "kl": 0.0849609375, "learning_rate": 5.593333333333333e-07, "loss": 0.0034, "reward": 1.9337053298950195, "reward_std": 0.05101045221090317, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9337053894996643, "step": 1322 }, { "completion_length": 103.65625, "epoch": 0.882, "grad_norm": 3.2568085171878987, "kl": 0.06298828125, "learning_rate": 5.590000000000001e-07, "loss": 0.0025, "reward": 1.6822917461395264, "reward_std": 0.0729166567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6822916269302368, "step": 1323 }, { "completion_length": 99.53125, "epoch": 0.8826666666666667, "grad_norm": 4.109442983850265, "kl": 0.1328125, "learning_rate": 5.586666666666666e-07, "loss": 0.0053, "reward": 1.8854167461395264, "reward_std": 0.21155625581741333, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9166666269302368, "step": 1324 }, { "completion_length": 92.46875, "epoch": 0.8833333333333333, "grad_norm": 16.26335035134262, "kl": 0.058349609375, "learning_rate": 5.583333333333333e-07, "loss": 0.0023, "reward": 1.871354103088379, "reward_std": 0.12522053718566895, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8713541626930237, "step": 1325 }, { "completion_length": 98.28125, "epoch": 0.884, "grad_norm": 2.0228951724073228, "kl": 0.0947265625, "learning_rate": 5.58e-07, "loss": 0.0038, "reward": 1.8781249523162842, "reward_std": 0.13684068620204926, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8781249523162842, "step": 1326 }, { "completion_length": 96.84375, "epoch": 0.8846666666666667, "grad_norm": 4.043989639347246, "kl": 0.0751953125, "learning_rate": 5.576666666666667e-07, "loss": 0.003, "reward": 1.834970235824585, "reward_std": 0.1457929015159607, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.834970235824585, "step": 1327 }, { "completion_length": 113.96875, "epoch": 0.8853333333333333, "grad_norm": 4.30925523652543, "kl": 0.0693359375, "learning_rate": 5.573333333333333e-07, "loss": 0.0028, "reward": 1.9354166984558105, "reward_std": 0.03936556726694107, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9354166984558105, "step": 1328 }, { "completion_length": 123.53125, "epoch": 0.886, "grad_norm": 1.2569549404613636, "kl": 0.03759765625, "learning_rate": 5.57e-07, "loss": 0.0015, "reward": 1.71875, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.75, "step": 1329 }, { "completion_length": 93.4375, "epoch": 0.8866666666666667, "grad_norm": 5.395882203168452, "kl": 0.0927734375, "learning_rate": 5.566666666666666e-07, "loss": 0.0037, "reward": 1.8078124523162842, "reward_std": 0.10618552565574646, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.807812511920929, "step": 1330 }, { "completion_length": 95.6875, "epoch": 0.8873333333333333, "grad_norm": 3.492757212362924, "kl": 0.08056640625, "learning_rate": 5.563333333333333e-07, "loss": 0.0032, "reward": 1.7895833253860474, "reward_std": 0.17695289850234985, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7895833253860474, "step": 1331 }, { "completion_length": 93.21875, "epoch": 0.888, "grad_norm": 2.5824620600852737, "kl": 0.06787109375, "learning_rate": 5.560000000000001e-07, "loss": 0.0027, "reward": 1.75, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 1332 }, { "completion_length": 103.34375, "epoch": 0.8886666666666667, "grad_norm": 2.6551271792530513, "kl": 0.04638671875, "learning_rate": 5.556666666666666e-07, "loss": 0.0019, "reward": 1.607812523841858, "reward_std": 0.1739690899848938, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6078125238418579, "step": 1333 }, { "completion_length": 95.375, "epoch": 0.8893333333333333, "grad_norm": 0.9046715162975852, "kl": 0.07470703125, "learning_rate": 5.553333333333333e-07, "loss": 0.003, "reward": 1.8458333015441895, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8458333611488342, "step": 1334 }, { "completion_length": 105.96875, "epoch": 0.89, "grad_norm": 1.0565527061846285, "kl": 0.040283203125, "learning_rate": 5.55e-07, "loss": 0.0016, "reward": 1.9375, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1335 }, { "completion_length": 103.71875, "epoch": 0.8906666666666667, "grad_norm": 5.201283204613273, "kl": 0.056640625, "learning_rate": 5.546666666666667e-07, "loss": 0.0023, "reward": 1.5776042938232422, "reward_std": 0.12353657931089401, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6088541746139526, "step": 1336 }, { "completion_length": 93.96875, "epoch": 0.8913333333333333, "grad_norm": 0.328254879155955, "kl": 0.055419921875, "learning_rate": 5.543333333333333e-07, "loss": 0.0022, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1337 }, { "completion_length": 100.46875, "epoch": 0.892, "grad_norm": 4.269430619111832, "kl": 0.047119140625, "learning_rate": 5.54e-07, "loss": 0.0019, "reward": 1.734375, "reward_std": 0.010416671633720398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 1338 }, { "completion_length": 86.6875, "epoch": 0.8926666666666667, "grad_norm": 5.473510441708784, "kl": 0.06494140625, "learning_rate": 5.536666666666666e-07, "loss": 0.0026, "reward": 1.890625, "reward_std": 0.10341878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.890625, "step": 1339 }, { "completion_length": 109.28125, "epoch": 0.8933333333333333, "grad_norm": 1.9855291914674005, "kl": 0.041015625, "learning_rate": 5.533333333333334e-07, "loss": 0.0016, "reward": 1.875, "reward_std": 0.18217839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 1340 }, { "completion_length": 93.28125, "epoch": 0.894, "grad_norm": 5.790186448190846, "kl": 0.0625, "learning_rate": 5.53e-07, "loss": 0.0025, "reward": 1.806249976158142, "reward_std": 0.07942508161067963, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8062499761581421, "step": 1341 }, { "completion_length": 94.28125, "epoch": 0.8946666666666667, "grad_norm": 3.5588163836222346, "kl": 0.06591796875, "learning_rate": 5.526666666666666e-07, "loss": 0.0026, "reward": 1.7567708492279053, "reward_std": 0.06169174611568451, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7567708492279053, "step": 1342 }, { "completion_length": 110.59375, "epoch": 0.8953333333333333, "grad_norm": 8.687462859074897, "kl": 0.060546875, "learning_rate": 5.523333333333333e-07, "loss": 0.0024, "reward": 1.4473958015441895, "reward_std": 0.41615572571754456, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.5411458611488342, "step": 1343 }, { "completion_length": 99.5, "epoch": 0.896, "grad_norm": 1.800861610990724, "kl": 0.0654296875, "learning_rate": 5.520000000000001e-07, "loss": 0.0026, "reward": 1.9557292461395264, "reward_std": 0.013107352890074253, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9557291865348816, "step": 1344 }, { "completion_length": 99.65625, "epoch": 0.8966666666666666, "grad_norm": 5.752188823429821, "kl": 0.044189453125, "learning_rate": 5.516666666666666e-07, "loss": 0.0018, "reward": 1.703125, "reward_std": 0.15271097421646118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.703125, "step": 1345 }, { "completion_length": 97.8125, "epoch": 0.8973333333333333, "grad_norm": 1.513773479379011, "kl": 0.05419921875, "learning_rate": 5.513333333333333e-07, "loss": 0.0022, "reward": 1.8125, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 1346 }, { "completion_length": 96.5625, "epoch": 0.898, "grad_norm": 7.49840042598453, "kl": 0.050048828125, "learning_rate": 5.51e-07, "loss": 0.002, "reward": 1.7265625, "reward_std": 0.15426458418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7265625, "step": 1347 }, { "completion_length": 96.4375, "epoch": 0.8986666666666666, "grad_norm": 4.518840998867073, "kl": 0.08056640625, "learning_rate": 5.506666666666666e-07, "loss": 0.0032, "reward": 1.6848958730697632, "reward_std": 0.16963425278663635, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7161458730697632, "step": 1348 }, { "completion_length": 94.0625, "epoch": 0.8993333333333333, "grad_norm": 7.7661428787246605, "kl": 0.076171875, "learning_rate": 5.503333333333334e-07, "loss": 0.003, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1349 }, { "completion_length": 93.75, "epoch": 0.9, "grad_norm": 4.878151785449189, "kl": 0.0859375, "learning_rate": 5.5e-07, "loss": 0.0034, "reward": 1.7979166507720947, "reward_std": 0.2145201563835144, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.82916659116745, "step": 1350 }, { "completion_length": 95.15625, "epoch": 0.9006666666666666, "grad_norm": 2.6834249196262494, "kl": 0.0419921875, "learning_rate": 5.496666666666666e-07, "loss": 0.0017, "reward": 1.8171875476837158, "reward_std": 0.0677083283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.817187488079071, "step": 1351 }, { "completion_length": 94.0625, "epoch": 0.9013333333333333, "grad_norm": 5.949209493065658, "kl": 0.07763671875, "learning_rate": 5.493333333333333e-07, "loss": 0.0031, "reward": 1.8562500476837158, "reward_std": 0.2380698323249817, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8875000476837158, "step": 1352 }, { "completion_length": 81.4375, "epoch": 0.902, "grad_norm": 46.11187563779079, "kl": 0.039306640625, "learning_rate": 5.490000000000001e-07, "loss": 0.0016, "reward": 1.6041667461395264, "reward_std": 0.10738958418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6041666865348816, "step": 1353 }, { "completion_length": 105.59375, "epoch": 0.9026666666666666, "grad_norm": 3.9446160136676123, "kl": 0.08740234375, "learning_rate": 5.486666666666666e-07, "loss": 0.0035, "reward": 1.676041603088379, "reward_std": 0.15002527832984924, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7072917222976685, "step": 1354 }, { "completion_length": 100.21875, "epoch": 0.9033333333333333, "grad_norm": 2.198910287485149, "kl": 0.06640625, "learning_rate": 5.483333333333333e-07, "loss": 0.0027, "reward": 1.9088542461395264, "reward_std": 0.04183970391750336, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9088541865348816, "step": 1355 }, { "completion_length": 86.0625, "epoch": 0.904, "grad_norm": 2.9133986415145525, "kl": 0.03466796875, "learning_rate": 5.48e-07, "loss": 0.0014, "reward": 1.828125, "reward_std": 0.17991724610328674, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.828125, "step": 1356 }, { "completion_length": 100.96875, "epoch": 0.9046666666666666, "grad_norm": 3.4492109841892122, "kl": 0.0859375, "learning_rate": 5.476666666666667e-07, "loss": 0.0034, "reward": 1.8899552822113037, "reward_std": 0.05107562616467476, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8899553418159485, "step": 1357 }, { "completion_length": 96.0625, "epoch": 0.9053333333333333, "grad_norm": 4.119880082969663, "kl": 0.0830078125, "learning_rate": 5.473333333333333e-07, "loss": 0.0033, "reward": 1.5645833015441895, "reward_std": 0.2959929406642914, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5645833611488342, "step": 1358 }, { "completion_length": 92.15625, "epoch": 0.906, "grad_norm": 5.149888919093567, "kl": 0.0791015625, "learning_rate": 5.47e-07, "loss": 0.0032, "reward": 1.7703125476837158, "reward_std": 0.2718749940395355, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.832812488079071, "step": 1359 }, { "completion_length": 82.84375, "epoch": 0.9066666666666666, "grad_norm": 3.7271406261078166, "kl": 0.078125, "learning_rate": 5.466666666666666e-07, "loss": 0.0031, "reward": 1.7734375, "reward_std": 0.170988067984581, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8046875, "step": 1360 }, { "completion_length": 100.53125, "epoch": 0.9073333333333333, "grad_norm": 3.8308436020111944, "kl": 0.060302734375, "learning_rate": 5.463333333333333e-07, "loss": 0.0024, "reward": 1.6848958730697632, "reward_std": 0.2239583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6848958730697632, "step": 1361 }, { "completion_length": 86.34375, "epoch": 0.908, "grad_norm": 30.422038328799008, "kl": 0.0830078125, "learning_rate": 5.46e-07, "loss": 0.0033, "reward": 1.6635416746139526, "reward_std": 0.11723804473876953, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6635416746139526, "step": 1362 }, { "completion_length": 94.40625, "epoch": 0.9086666666666666, "grad_norm": 1.8904131345893538, "kl": 0.041015625, "learning_rate": 5.456666666666666e-07, "loss": 0.0016, "reward": 1.5833333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5833333730697632, "step": 1363 }, { "completion_length": 94.0625, "epoch": 0.9093333333333333, "grad_norm": 1.809557143257448, "kl": 0.060546875, "learning_rate": 5.453333333333333e-07, "loss": 0.0024, "reward": 1.890625, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.890625, "step": 1364 }, { "completion_length": 96.8125, "epoch": 0.91, "grad_norm": 1.5955782873706978, "kl": 0.048828125, "learning_rate": 5.45e-07, "loss": 0.002, "reward": 1.921875, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1365 }, { "completion_length": 94.90625, "epoch": 0.9106666666666666, "grad_norm": 1.4726398033000772, "kl": 0.0693359375, "learning_rate": 5.446666666666666e-07, "loss": 0.0028, "reward": 1.84375, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1366 }, { "completion_length": 89.375, "epoch": 0.9113333333333333, "grad_norm": 3.3422770259315806, "kl": 0.06103515625, "learning_rate": 5.443333333333333e-07, "loss": 0.0024, "reward": 1.8125, "reward_std": 0.19716878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1367 }, { "completion_length": 82.125, "epoch": 0.912, "grad_norm": 2.4216320386533345, "kl": 0.053466796875, "learning_rate": 5.44e-07, "loss": 0.0021, "reward": 1.7843749523162842, "reward_std": 0.11709193885326385, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.815625011920929, "step": 1368 }, { "completion_length": 105.15625, "epoch": 0.9126666666666666, "grad_norm": 3.7251345447471227, "kl": 0.0654296875, "learning_rate": 5.436666666666666e-07, "loss": 0.0026, "reward": 1.7916667461395264, "reward_std": 0.13662631809711456, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666269302368, "step": 1369 }, { "completion_length": 93.03125, "epoch": 0.9133333333333333, "grad_norm": 4.907344212102432, "kl": 0.072265625, "learning_rate": 5.433333333333334e-07, "loss": 0.0029, "reward": 1.8671875, "reward_std": 0.11420938372612, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8671874403953552, "step": 1370 }, { "completion_length": 101.78125, "epoch": 0.914, "grad_norm": 4.29359987634364, "kl": 0.036376953125, "learning_rate": 5.43e-07, "loss": 0.0015, "reward": 1.7630208730697632, "reward_std": 0.1510416716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7630208730697632, "step": 1371 }, { "completion_length": 76.34375, "epoch": 0.9146666666666666, "grad_norm": 4.835744519147403, "kl": 0.051513671875, "learning_rate": 5.426666666666666e-07, "loss": 0.0021, "reward": 1.921875, "reward_std": 0.059839196503162384, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1372 }, { "completion_length": 105.9375, "epoch": 0.9153333333333333, "grad_norm": 2.000323647886971, "kl": 0.060791015625, "learning_rate": 5.423333333333333e-07, "loss": 0.0024, "reward": 1.8546874523162842, "reward_std": 0.10777028650045395, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8546875715255737, "step": 1373 }, { "completion_length": 93.90625, "epoch": 0.916, "grad_norm": 1.8492403826612505, "kl": 0.08251953125, "learning_rate": 5.420000000000001e-07, "loss": 0.0033, "reward": 1.859375, "reward_std": 0.08258544653654099, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8593750596046448, "step": 1374 }, { "completion_length": 107.59375, "epoch": 0.9166666666666666, "grad_norm": 2.558817373354404, "kl": 0.08935546875, "learning_rate": 5.416666666666666e-07, "loss": 0.0036, "reward": 1.886979103088379, "reward_std": 0.10104166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8869791626930237, "step": 1375 }, { "completion_length": 99.71875, "epoch": 0.9173333333333333, "grad_norm": 45.26074413406574, "kl": 0.078125, "learning_rate": 5.413333333333333e-07, "loss": 0.0031, "reward": 1.7713541984558105, "reward_std": 0.20213362574577332, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7713541984558105, "step": 1376 }, { "completion_length": 105.0625, "epoch": 0.918, "grad_norm": 4.049687280249501, "kl": 0.07763671875, "learning_rate": 5.41e-07, "loss": 0.0031, "reward": 1.84375, "reward_std": 0.04488958418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1377 }, { "completion_length": 88.875, "epoch": 0.9186666666666666, "grad_norm": 2.1726937592945257, "kl": 0.07421875, "learning_rate": 5.406666666666666e-07, "loss": 0.003, "reward": 1.7416666746139526, "reward_std": 0.09905625879764557, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7416666746139526, "step": 1378 }, { "completion_length": 111.03125, "epoch": 0.9193333333333333, "grad_norm": 2.187075940923063, "kl": 0.0595703125, "learning_rate": 5.403333333333333e-07, "loss": 0.0024, "reward": 1.8020833730697632, "reward_std": 0.23733967542648315, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 1379 }, { "completion_length": 89.25, "epoch": 0.92, "grad_norm": 1.2653751895696677, "kl": 0.07861328125, "learning_rate": 5.4e-07, "loss": 0.0031, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1380 }, { "completion_length": 99.0625, "epoch": 0.9206666666666666, "grad_norm": 2.5981525788350375, "kl": 0.054931640625, "learning_rate": 5.396666666666666e-07, "loss": 0.0022, "reward": 1.8333333730697632, "reward_std": 0.2180021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 1381 }, { "completion_length": 90.15625, "epoch": 0.9213333333333333, "grad_norm": 3.3917682512650646, "kl": 0.083984375, "learning_rate": 5.393333333333333e-07, "loss": 0.0034, "reward": 1.8776042461395264, "reward_std": 0.0820029228925705, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8776041865348816, "step": 1382 }, { "completion_length": 100.03125, "epoch": 0.922, "grad_norm": 3.600786270847861, "kl": 0.10400390625, "learning_rate": 5.39e-07, "loss": 0.0042, "reward": 1.6927082538604736, "reward_std": 0.15341879427433014, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6927083134651184, "step": 1383 }, { "completion_length": 98.625, "epoch": 0.9226666666666666, "grad_norm": 3.1379214656479144, "kl": 0.06298828125, "learning_rate": 5.386666666666666e-07, "loss": 0.0025, "reward": 1.738541603088379, "reward_std": 0.054598454385995865, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7385417222976685, "step": 1384 }, { "completion_length": 92.25, "epoch": 0.9233333333333333, "grad_norm": 4.116854631164324, "kl": 0.043212890625, "learning_rate": 5.383333333333333e-07, "loss": 0.0017, "reward": 1.845312476158142, "reward_std": 0.0831502377986908, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8453124761581421, "step": 1385 }, { "completion_length": 96.1875, "epoch": 0.924, "grad_norm": 2.8905893657264015, "kl": 0.0703125, "learning_rate": 5.38e-07, "loss": 0.0028, "reward": 1.75, "reward_std": 0.1666666716337204, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8125, "step": 1386 }, { "completion_length": 83.84375, "epoch": 0.9246666666666666, "grad_norm": 9.735948399347192, "kl": 0.1025390625, "learning_rate": 5.376666666666667e-07, "loss": 0.0041, "reward": 1.706770896911621, "reward_std": 0.12440761178731918, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7380207777023315, "step": 1387 }, { "completion_length": 104.375, "epoch": 0.9253333333333333, "grad_norm": 2.169377572200425, "kl": 0.04248046875, "learning_rate": 5.373333333333333e-07, "loss": 0.0017, "reward": 1.75, "reward_std": 0.37934717535972595, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8125, "step": 1388 }, { "completion_length": 104.03125, "epoch": 0.926, "grad_norm": 1.995419469411153, "kl": 0.051513671875, "learning_rate": 5.37e-07, "loss": 0.0021, "reward": 1.71875, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 1389 }, { "completion_length": 91.8125, "epoch": 0.9266666666666666, "grad_norm": 3.645345852959116, "kl": 0.059326171875, "learning_rate": 5.366666666666666e-07, "loss": 0.0024, "reward": 1.7999999523162842, "reward_std": 0.2749781012535095, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.800000011920929, "step": 1390 }, { "completion_length": 91.03125, "epoch": 0.9273333333333333, "grad_norm": 3.300670350452552, "kl": 0.050048828125, "learning_rate": 5.363333333333333e-07, "loss": 0.002, "reward": 1.8989583253860474, "reward_std": 0.18292967975139618, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9302083253860474, "step": 1391 }, { "completion_length": 108.96875, "epoch": 0.928, "grad_norm": 5.439941344144598, "kl": 0.052490234375, "learning_rate": 5.36e-07, "loss": 0.0021, "reward": 1.7083333730697632, "reward_std": 0.11001245677471161, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333134651184, "step": 1392 }, { "completion_length": 88.375, "epoch": 0.9286666666666666, "grad_norm": 1.9185927765147557, "kl": 0.06884765625, "learning_rate": 5.356666666666666e-07, "loss": 0.0028, "reward": 1.8541667461395264, "reward_std": 0.14905625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666269302368, "step": 1393 }, { "completion_length": 105.34375, "epoch": 0.9293333333333333, "grad_norm": 2.6337602102115776, "kl": 0.05322265625, "learning_rate": 5.353333333333333e-07, "loss": 0.0021, "reward": 1.8020833730697632, "reward_std": 0.09536145627498627, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833134651184, "step": 1394 }, { "completion_length": 101.25, "epoch": 0.93, "grad_norm": 23.33950713030375, "kl": 0.06103515625, "learning_rate": 5.35e-07, "loss": 0.0024, "reward": 1.7765624523162842, "reward_std": 0.10851817578077316, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.776562511920929, "step": 1395 }, { "completion_length": 107.875, "epoch": 0.9306666666666666, "grad_norm": 5.211724957030164, "kl": 0.0673828125, "learning_rate": 5.346666666666666e-07, "loss": 0.0027, "reward": 1.6916667222976685, "reward_std": 0.17539171874523163, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7229167222976685, "step": 1396 }, { "completion_length": 92.5625, "epoch": 0.9313333333333333, "grad_norm": 1.7043355849901263, "kl": 0.0751953125, "learning_rate": 5.343333333333333e-07, "loss": 0.003, "reward": 1.8510416746139526, "reward_std": 0.07894822955131531, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8510416746139526, "step": 1397 }, { "completion_length": 92.3125, "epoch": 0.932, "grad_norm": 1.997333423677732, "kl": 0.034912109375, "learning_rate": 5.34e-07, "loss": 0.0014, "reward": 1.65625, "reward_std": 0.1875, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.75, "step": 1398 }, { "completion_length": 97.8125, "epoch": 0.9326666666666666, "grad_norm": 3.427331012762009, "kl": 0.062255859375, "learning_rate": 5.336666666666666e-07, "loss": 0.0025, "reward": 1.6927083730697632, "reward_std": 0.09807968884706497, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7239583134651184, "step": 1399 }, { "completion_length": 89.6875, "epoch": 0.9333333333333333, "grad_norm": 4.8707781387956075, "kl": 0.08935546875, "learning_rate": 5.333333333333333e-07, "loss": 0.0036, "reward": 1.8854167461395264, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8854166865348816, "step": 1400 }, { "completion_length": 94.59375, "epoch": 0.934, "grad_norm": 5.711852058819851, "kl": 0.060546875, "learning_rate": 5.33e-07, "loss": 0.0024, "reward": 1.743749976158142, "reward_std": 0.10344192385673523, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7437500953674316, "step": 1401 }, { "completion_length": 89.28125, "epoch": 0.9346666666666666, "grad_norm": 2.720495926791351, "kl": 0.0732421875, "learning_rate": 5.326666666666666e-07, "loss": 0.0029, "reward": 1.8145833015441895, "reward_std": 0.16249999403953552, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.814583420753479, "step": 1402 }, { "completion_length": 97.21875, "epoch": 0.9353333333333333, "grad_norm": 8.688733908460824, "kl": 0.0986328125, "learning_rate": 5.323333333333333e-07, "loss": 0.0039, "reward": 1.777604103088379, "reward_std": 0.13596303761005402, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8088541626930237, "step": 1403 }, { "completion_length": 87.96875, "epoch": 0.936, "grad_norm": 3.4789866972188213, "kl": 0.08154296875, "learning_rate": 5.32e-07, "loss": 0.0033, "reward": 1.9739583730697632, "reward_std": 0.0388755239546299, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9739583134651184, "step": 1404 }, { "completion_length": 108.84375, "epoch": 0.9366666666666666, "grad_norm": 8.297419431833834, "kl": 0.0693359375, "learning_rate": 5.316666666666666e-07, "loss": 0.0028, "reward": 1.8666666746139526, "reward_std": 0.21666665375232697, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8979166746139526, "step": 1405 }, { "completion_length": 107.25, "epoch": 0.9373333333333334, "grad_norm": 3.3975591048763443, "kl": 0.07958984375, "learning_rate": 5.313333333333333e-07, "loss": 0.0032, "reward": 1.860937476158142, "reward_std": 0.14143723249435425, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8921874761581421, "step": 1406 }, { "completion_length": 85.03125, "epoch": 0.938, "grad_norm": 1.217967876189561, "kl": 0.07275390625, "learning_rate": 5.31e-07, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1407 }, { "completion_length": 95.34375, "epoch": 0.9386666666666666, "grad_norm": 3.153413630491554, "kl": 0.0771484375, "learning_rate": 5.306666666666665e-07, "loss": 0.0031, "reward": 1.8110119104385376, "reward_std": 0.1924692988395691, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8110119104385376, "step": 1408 }, { "completion_length": 99.28125, "epoch": 0.9393333333333334, "grad_norm": 2.9056972016641653, "kl": 0.06396484375, "learning_rate": 5.303333333333333e-07, "loss": 0.0026, "reward": 1.8583333492279053, "reward_std": 0.10244926810264587, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8583333492279053, "step": 1409 }, { "completion_length": 94.40625, "epoch": 0.94, "grad_norm": 1.4472038040090685, "kl": 0.0830078125, "learning_rate": 5.3e-07, "loss": 0.0033, "reward": 1.7395833730697632, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 1410 }, { "completion_length": 104.0, "epoch": 0.9406666666666667, "grad_norm": 2.687185673249364, "kl": 0.07958984375, "learning_rate": 5.296666666666666e-07, "loss": 0.0032, "reward": 1.8125, "reward_std": 0.14905625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125000596046448, "step": 1411 }, { "completion_length": 94.65625, "epoch": 0.9413333333333334, "grad_norm": 1.88868870678509, "kl": 0.052490234375, "learning_rate": 5.293333333333333e-07, "loss": 0.0021, "reward": 1.7890625, "reward_std": 0.15029378235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8203125, "step": 1412 }, { "completion_length": 107.40625, "epoch": 0.942, "grad_norm": 30.223712106556768, "kl": 0.08203125, "learning_rate": 5.29e-07, "loss": 0.0033, "reward": 1.9052083492279053, "reward_std": 0.04375000298023224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9052083492279053, "step": 1413 }, { "completion_length": 87.28125, "epoch": 0.9426666666666667, "grad_norm": 2.8612498548424523, "kl": 0.06982421875, "learning_rate": 5.286666666666666e-07, "loss": 0.0028, "reward": 1.9270833730697632, "reward_std": 0.09300211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9270833730697632, "step": 1414 }, { "completion_length": 96.40625, "epoch": 0.9433333333333334, "grad_norm": 0.7829319497712943, "kl": 0.06982421875, "learning_rate": 5.283333333333333e-07, "loss": 0.0028, "reward": 1.78125, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1415 }, { "completion_length": 101.78125, "epoch": 0.944, "grad_norm": 3.894978398090345, "kl": 0.08740234375, "learning_rate": 5.28e-07, "loss": 0.0035, "reward": 1.798437476158142, "reward_std": 0.15164342522621155, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7984375357627869, "step": 1416 }, { "completion_length": 95.0, "epoch": 0.9446666666666667, "grad_norm": 4.3815773156100235, "kl": 0.08251953125, "learning_rate": 5.276666666666666e-07, "loss": 0.0033, "reward": 1.7989583015441895, "reward_std": 0.15567253530025482, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.798958420753479, "step": 1417 }, { "completion_length": 83.3125, "epoch": 0.9453333333333334, "grad_norm": 1.983228701771683, "kl": 0.0712890625, "learning_rate": 5.273333333333333e-07, "loss": 0.0028, "reward": 1.7317708730697632, "reward_std": 0.14647451043128967, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7317708730697632, "step": 1418 }, { "completion_length": 101.90625, "epoch": 0.946, "grad_norm": 3.0511174948418422, "kl": 0.0400390625, "learning_rate": 5.27e-07, "loss": 0.0016, "reward": 1.7574405670166016, "reward_std": 0.1339285671710968, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.8511905670166016, "step": 1419 }, { "completion_length": 83.84375, "epoch": 0.9466666666666667, "grad_norm": 3.1374116671911616, "kl": 0.07080078125, "learning_rate": 5.266666666666666e-07, "loss": 0.0028, "reward": 1.6395833492279053, "reward_std": 0.07693374902009964, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6708333492279053, "step": 1420 }, { "completion_length": 94.03125, "epoch": 0.9473333333333334, "grad_norm": 3.4624652725555833, "kl": 0.051025390625, "learning_rate": 5.263333333333333e-07, "loss": 0.002, "reward": 1.6848958730697632, "reward_std": 0.2577776610851288, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6848958730697632, "step": 1421 }, { "completion_length": 90.5, "epoch": 0.948, "grad_norm": 12.274028487064518, "kl": 0.08154296875, "learning_rate": 5.26e-07, "loss": 0.0033, "reward": 1.9093749523162842, "reward_std": 0.09304219484329224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.940625011920929, "step": 1422 }, { "completion_length": 97.875, "epoch": 0.9486666666666667, "grad_norm": 1.38558793766315, "kl": 0.1005859375, "learning_rate": 5.256666666666666e-07, "loss": 0.004, "reward": 1.5989583730697632, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5989583134651184, "step": 1423 }, { "completion_length": 88.0625, "epoch": 0.9493333333333334, "grad_norm": 5.5183401246129105, "kl": 0.2001953125, "learning_rate": 5.253333333333333e-07, "loss": 0.008, "reward": 1.7916667461395264, "reward_std": 0.051196396350860596, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666865348816, "step": 1424 }, { "completion_length": 93.15625, "epoch": 0.95, "grad_norm": 6.940517349249131, "kl": 0.11083984375, "learning_rate": 5.25e-07, "loss": 0.0044, "reward": 1.7109375, "reward_std": 0.13151061534881592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7109375, "step": 1425 }, { "completion_length": 94.15625, "epoch": 0.9506666666666667, "grad_norm": 1.9867970294701287, "kl": 0.08349609375, "learning_rate": 5.246666666666666e-07, "loss": 0.0033, "reward": 1.9666666984558105, "reward_std": 0.009622495621442795, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9666666984558105, "step": 1426 }, { "completion_length": 98.375, "epoch": 0.9513333333333334, "grad_norm": 8.180517344809365, "kl": 0.10400390625, "learning_rate": 5.243333333333333e-07, "loss": 0.0041, "reward": 1.671875, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.671875, "step": 1427 }, { "completion_length": 84.21875, "epoch": 0.952, "grad_norm": 2.97961625113234, "kl": 0.05517578125, "learning_rate": 5.24e-07, "loss": 0.0022, "reward": 1.921875, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1428 }, { "completion_length": 90.0625, "epoch": 0.9526666666666667, "grad_norm": 2.959764321950468, "kl": 0.091796875, "learning_rate": 5.236666666666666e-07, "loss": 0.0037, "reward": 1.7479166984558105, "reward_std": 0.0877341777086258, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7479166984558105, "step": 1429 }, { "completion_length": 96.625, "epoch": 0.9533333333333334, "grad_norm": 5.4371951333349715, "kl": 0.06982421875, "learning_rate": 5.233333333333333e-07, "loss": 0.0028, "reward": 1.7942708730697632, "reward_std": 0.0953838899731636, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7942708730697632, "step": 1430 }, { "completion_length": 94.96875, "epoch": 0.954, "grad_norm": 1.6494244567871301, "kl": 0.055419921875, "learning_rate": 5.23e-07, "loss": 0.0022, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1431 }, { "completion_length": 101.25, "epoch": 0.9546666666666667, "grad_norm": 1.436663243534042, "kl": 0.08251953125, "learning_rate": 5.226666666666666e-07, "loss": 0.0033, "reward": 1.921875, "reward_std": 0.010416666977107525, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1432 }, { "completion_length": 104.125, "epoch": 0.9553333333333334, "grad_norm": 12.067857762622182, "kl": 0.1015625, "learning_rate": 5.223333333333333e-07, "loss": 0.0041, "reward": 1.8526785373687744, "reward_std": 0.06621253490447998, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8526785969734192, "step": 1433 }, { "completion_length": 109.625, "epoch": 0.956, "grad_norm": 2.9697425884815285, "kl": 0.130859375, "learning_rate": 5.22e-07, "loss": 0.0052, "reward": 1.837499976158142, "reward_std": 0.19646097719669342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8374999761581421, "step": 1434 }, { "completion_length": 84.625, "epoch": 0.9566666666666667, "grad_norm": 5.163984356419959, "kl": 0.07861328125, "learning_rate": 5.216666666666666e-07, "loss": 0.0032, "reward": 1.7161458730697632, "reward_std": 0.16468125581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7161458730697632, "step": 1435 }, { "completion_length": 107.0, "epoch": 0.9573333333333334, "grad_norm": 2.0173869858436073, "kl": 0.0703125, "learning_rate": 5.213333333333333e-07, "loss": 0.0028, "reward": 1.8671875, "reward_std": 0.13530339300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8984375, "step": 1436 }, { "completion_length": 100.65625, "epoch": 0.958, "grad_norm": 2.317240643226758, "kl": 0.087890625, "learning_rate": 5.21e-07, "loss": 0.0035, "reward": 1.8697917461395264, "reward_std": 0.1770833432674408, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.9322917461395264, "step": 1437 }, { "completion_length": 104.8125, "epoch": 0.9586666666666667, "grad_norm": 2.0549395983480596, "kl": 0.0771484375, "learning_rate": 5.206666666666667e-07, "loss": 0.0031, "reward": 1.868749976158142, "reward_std": 0.16250000894069672, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9000000357627869, "step": 1438 }, { "completion_length": 105.46875, "epoch": 0.9593333333333334, "grad_norm": 6.3084281841354946, "kl": 0.365234375, "learning_rate": 5.203333333333333e-07, "loss": 0.0146, "reward": 1.787500023841858, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8187500238418579, "step": 1439 }, { "completion_length": 108.5625, "epoch": 0.96, "grad_norm": 1.2369580774120317, "kl": 0.09130859375, "learning_rate": 5.2e-07, "loss": 0.0036, "reward": 1.6458333730697632, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6770833730697632, "step": 1440 }, { "completion_length": 98.21875, "epoch": 0.9606666666666667, "grad_norm": 4.173405264865265, "kl": 0.0849609375, "learning_rate": 5.196666666666667e-07, "loss": 0.0034, "reward": 1.7364583015441895, "reward_std": 0.27404090762138367, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7989583611488342, "step": 1441 }, { "completion_length": 91.8125, "epoch": 0.9613333333333334, "grad_norm": 2.2607670380514002, "kl": 0.052490234375, "learning_rate": 5.193333333333332e-07, "loss": 0.0021, "reward": 1.78125, "reward_std": 0.28466877341270447, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 1442 }, { "completion_length": 98.96875, "epoch": 0.962, "grad_norm": 4.960158212948423, "kl": 0.0771484375, "learning_rate": 5.19e-07, "loss": 0.0031, "reward": 1.6546874046325684, "reward_std": 0.46985238790512085, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6859374642372131, "step": 1443 }, { "completion_length": 87.65625, "epoch": 0.9626666666666667, "grad_norm": 3.249371977860067, "kl": 0.1162109375, "learning_rate": 5.186666666666667e-07, "loss": 0.0046, "reward": 1.8781249523162842, "reward_std": 0.18125000596046448, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.909375011920929, "step": 1444 }, { "completion_length": 99.0625, "epoch": 0.9633333333333334, "grad_norm": 24.09423087382169, "kl": 0.10302734375, "learning_rate": 5.183333333333333e-07, "loss": 0.0041, "reward": 1.693750023841858, "reward_std": 0.40400636196136475, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7875000238418579, "step": 1445 }, { "completion_length": 106.9375, "epoch": 0.964, "grad_norm": 2.1550123152864034, "kl": 0.05712890625, "learning_rate": 5.18e-07, "loss": 0.0023, "reward": 1.7294270992279053, "reward_std": 0.1990073025226593, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7606770992279053, "step": 1446 }, { "completion_length": 105.71875, "epoch": 0.9646666666666667, "grad_norm": 4.24201697061723, "kl": 0.05126953125, "learning_rate": 5.176666666666667e-07, "loss": 0.002, "reward": 1.7374999523162842, "reward_std": 0.20280930399894714, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.737500011920929, "step": 1447 }, { "completion_length": 96.0625, "epoch": 0.9653333333333334, "grad_norm": 1.7897487269776517, "kl": 0.043701171875, "learning_rate": 5.173333333333333e-07, "loss": 0.0017, "reward": 1.8958333730697632, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9270833730697632, "step": 1448 }, { "completion_length": 101.625, "epoch": 0.966, "grad_norm": 4.066047561531194, "kl": 0.064453125, "learning_rate": 5.17e-07, "loss": 0.0026, "reward": 1.8177661895751953, "reward_std": 0.16248711943626404, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8490162491798401, "step": 1449 }, { "completion_length": 99.40625, "epoch": 0.9666666666666667, "grad_norm": 6.875062894839696, "kl": 0.0537109375, "learning_rate": 5.166666666666667e-07, "loss": 0.0021, "reward": 1.7999999523162842, "reward_std": 0.2273927927017212, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.831250011920929, "step": 1450 }, { "completion_length": 94.75, "epoch": 0.9673333333333334, "grad_norm": 4.970650324185926, "kl": 0.091796875, "learning_rate": 5.163333333333333e-07, "loss": 0.0037, "reward": 1.7166666984558105, "reward_std": 0.2596225142478943, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.8104166984558105, "step": 1451 }, { "completion_length": 103.625, "epoch": 0.968, "grad_norm": 3.0034075801515137, "kl": 0.072265625, "learning_rate": 5.16e-07, "loss": 0.0029, "reward": 1.625, "reward_std": 0.5236847400665283, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.75, "step": 1452 }, { "completion_length": 94.3125, "epoch": 0.9686666666666667, "grad_norm": 2.9107915213505273, "kl": 0.0576171875, "learning_rate": 5.156666666666667e-07, "loss": 0.0023, "reward": 1.9025297164916992, "reward_std": 0.10498328506946564, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9025297164916992, "step": 1453 }, { "completion_length": 104.96875, "epoch": 0.9693333333333334, "grad_norm": 1.7118450762689859, "kl": 0.06298828125, "learning_rate": 5.153333333333333e-07, "loss": 0.0025, "reward": 1.7395833730697632, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 1454 }, { "completion_length": 98.90625, "epoch": 0.97, "grad_norm": 2.5886857031188786, "kl": 0.0830078125, "learning_rate": 5.149999999999999e-07, "loss": 0.0033, "reward": 1.8781249523162842, "reward_std": 0.16656222939491272, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9093750715255737, "step": 1455 }, { "completion_length": 98.78125, "epoch": 0.9706666666666667, "grad_norm": 10.176797500774578, "kl": 0.08056640625, "learning_rate": 5.146666666666667e-07, "loss": 0.0032, "reward": 1.8177083730697632, "reward_std": 0.2759283781051636, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8802083134651184, "step": 1456 }, { "completion_length": 105.28125, "epoch": 0.9713333333333334, "grad_norm": 17.81036702164623, "kl": 0.062255859375, "learning_rate": 5.143333333333333e-07, "loss": 0.0025, "reward": 1.78125, "reward_std": 0.28483179211616516, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 1457 }, { "completion_length": 101.21875, "epoch": 0.972, "grad_norm": 29.817742407755354, "kl": 0.09375, "learning_rate": 5.14e-07, "loss": 0.0038, "reward": 1.738020896911621, "reward_std": 0.15689393877983093, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7692708373069763, "step": 1458 }, { "completion_length": 117.53125, "epoch": 0.9726666666666667, "grad_norm": 0.8488220020584453, "kl": 0.03466796875, "learning_rate": 5.136666666666666e-07, "loss": 0.0014, "reward": 1.71875, "reward_std": 0.1875, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.875, "step": 1459 }, { "completion_length": 106.25, "epoch": 0.9733333333333334, "grad_norm": 6.985655971005839, "kl": 0.07568359375, "learning_rate": 5.133333333333333e-07, "loss": 0.003, "reward": 1.7183034420013428, "reward_std": 0.11835549771785736, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7495536208152771, "step": 1460 }, { "completion_length": 93.9375, "epoch": 0.974, "grad_norm": 4.743396838164001, "kl": 0.08984375, "learning_rate": 5.13e-07, "loss": 0.0036, "reward": 1.8599703311920166, "reward_std": 0.05432291701436043, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859970211982727, "step": 1461 }, { "completion_length": 92.15625, "epoch": 0.9746666666666667, "grad_norm": 4.86552377088903, "kl": 0.08642578125, "learning_rate": 5.126666666666667e-07, "loss": 0.0035, "reward": 1.9718750715255737, "reward_std": 0.044495441019535065, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9718749523162842, "step": 1462 }, { "completion_length": 87.65625, "epoch": 0.9753333333333334, "grad_norm": 3.067622983401742, "kl": 0.068359375, "learning_rate": 5.123333333333332e-07, "loss": 0.0027, "reward": 1.84375, "reward_std": 0.24467839300632477, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.90625, "step": 1463 }, { "completion_length": 97.9375, "epoch": 0.976, "grad_norm": 4.430152547639064, "kl": 0.08447265625, "learning_rate": 5.12e-07, "loss": 0.0034, "reward": 1.8338541984558105, "reward_std": 0.06435371935367584, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8338541388511658, "step": 1464 }, { "completion_length": 111.28125, "epoch": 0.9766666666666667, "grad_norm": 3.312939163609083, "kl": 0.06494140625, "learning_rate": 5.116666666666667e-07, "loss": 0.0026, "reward": 1.659895896911621, "reward_std": 0.2206602543592453, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7223958373069763, "step": 1465 }, { "completion_length": 91.78125, "epoch": 0.9773333333333334, "grad_norm": 2.4143389943116, "kl": 0.080078125, "learning_rate": 5.113333333333333e-07, "loss": 0.0032, "reward": 1.8203125, "reward_std": 0.04218100756406784, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203125, "step": 1466 }, { "completion_length": 113.25, "epoch": 0.978, "grad_norm": 12.03860616786337, "kl": 0.056884765625, "learning_rate": 5.11e-07, "loss": 0.0023, "reward": 1.796875, "reward_std": 0.10341878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.796875, "step": 1467 }, { "completion_length": 87.28125, "epoch": 0.9786666666666667, "grad_norm": 12.105503367052462, "kl": 0.06298828125, "learning_rate": 5.106666666666667e-07, "loss": 0.0025, "reward": 1.6796875, "reward_std": 0.09821044653654099, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6796875, "step": 1468 }, { "completion_length": 92.75, "epoch": 0.9793333333333333, "grad_norm": 5.935693346103569, "kl": 0.08740234375, "learning_rate": 5.103333333333333e-07, "loss": 0.0035, "reward": 1.6864583492279053, "reward_std": 0.08752527087926865, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6864583492279053, "step": 1469 }, { "completion_length": 109.3125, "epoch": 0.98, "grad_norm": 2.0699053707078394, "kl": 0.07275390625, "learning_rate": 5.1e-07, "loss": 0.0029, "reward": 1.9010416269302368, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9010416865348816, "step": 1470 }, { "completion_length": 100.1875, "epoch": 0.9806666666666667, "grad_norm": 4.160705850081456, "kl": 0.0537109375, "learning_rate": 5.096666666666667e-07, "loss": 0.0021, "reward": 1.8480653762817383, "reward_std": 0.12124966084957123, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8480654954910278, "step": 1471 }, { "completion_length": 79.90625, "epoch": 0.9813333333333333, "grad_norm": 1.3842845534210007, "kl": 0.048583984375, "learning_rate": 5.093333333333332e-07, "loss": 0.0019, "reward": 1.8229167461395264, "reward_std": 0.020833337679505348, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166269302368, "step": 1472 }, { "completion_length": 91.8125, "epoch": 0.982, "grad_norm": 106.38597303532552, "kl": 0.0625, "learning_rate": 5.09e-07, "loss": 0.0025, "reward": 1.904687523841858, "reward_std": 0.12617579102516174, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9046875238418579, "step": 1473 }, { "completion_length": 104.40625, "epoch": 0.9826666666666667, "grad_norm": 0.9935898481612351, "kl": 0.038330078125, "learning_rate": 5.086666666666667e-07, "loss": 0.0015, "reward": 1.912500023841858, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9125000238418579, "step": 1474 }, { "completion_length": 91.84375, "epoch": 0.9833333333333333, "grad_norm": 7.469220758622371, "kl": 0.06640625, "learning_rate": 5.083333333333333e-07, "loss": 0.0027, "reward": 1.8624999523162842, "reward_std": 0.06722996383905411, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8625000715255737, "step": 1475 }, { "completion_length": 102.09375, "epoch": 0.984, "grad_norm": 3.6726544394655694, "kl": 0.05126953125, "learning_rate": 5.079999999999999e-07, "loss": 0.0021, "reward": 1.7630208730697632, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7630208730697632, "step": 1476 }, { "completion_length": 106.0, "epoch": 0.9846666666666667, "grad_norm": 2.795695314475665, "kl": 0.0595703125, "learning_rate": 5.076666666666667e-07, "loss": 0.0024, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1477 }, { "completion_length": 104.75, "epoch": 0.9853333333333333, "grad_norm": 3.2293259897112847, "kl": 0.0947265625, "learning_rate": 5.073333333333333e-07, "loss": 0.0038, "reward": 1.691666603088379, "reward_std": 0.24339382350444794, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7541666626930237, "step": 1478 }, { "completion_length": 91.9375, "epoch": 0.986, "grad_norm": 2.9112083671888054, "kl": 0.08984375, "learning_rate": 5.07e-07, "loss": 0.0036, "reward": 1.7630208730697632, "reward_std": 0.0989583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7630208730697632, "step": 1479 }, { "completion_length": 111.40625, "epoch": 0.9866666666666667, "grad_norm": 3.1810682783459745, "kl": 0.0693359375, "learning_rate": 5.066666666666667e-07, "loss": 0.0028, "reward": 1.7000000476837158, "reward_std": 0.20000000298023224, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7625000476837158, "step": 1480 }, { "completion_length": 99.15625, "epoch": 0.9873333333333333, "grad_norm": 2.7211727472475618, "kl": 0.07568359375, "learning_rate": 5.063333333333333e-07, "loss": 0.003, "reward": 1.7197916507720947, "reward_std": 0.1796986162662506, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7510417103767395, "step": 1481 }, { "completion_length": 96.21875, "epoch": 0.988, "grad_norm": 2.151868973498801, "kl": 0.0654296875, "learning_rate": 5.06e-07, "loss": 0.0026, "reward": 1.9213541746139526, "reward_std": 0.03529377654194832, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9213541746139526, "step": 1482 }, { "completion_length": 105.5625, "epoch": 0.9886666666666667, "grad_norm": 3.6643620293568153, "kl": 0.06640625, "learning_rate": 5.056666666666667e-07, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.08750000596046448, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1483 }, { "completion_length": 93.375, "epoch": 0.9893333333333333, "grad_norm": 2.3160509298907668, "kl": 0.06396484375, "learning_rate": 5.053333333333333e-07, "loss": 0.0026, "reward": 1.875, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1484 }, { "completion_length": 89.3125, "epoch": 0.99, "grad_norm": 2.0757511164715106, "kl": 0.037353515625, "learning_rate": 5.049999999999999e-07, "loss": 0.0015, "reward": 1.8989583253860474, "reward_std": 0.025256488472223282, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8989583253860474, "step": 1485 }, { "completion_length": 95.90625, "epoch": 0.9906666666666667, "grad_norm": 3.599894525960627, "kl": 0.05810546875, "learning_rate": 5.046666666666667e-07, "loss": 0.0023, "reward": 1.837499976158142, "reward_std": 0.18111848831176758, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8375000357627869, "step": 1486 }, { "completion_length": 108.71875, "epoch": 0.9913333333333333, "grad_norm": 2.235032767965405, "kl": 0.068359375, "learning_rate": 5.043333333333333e-07, "loss": 0.0027, "reward": 1.75, "reward_std": 0.1555021107196808, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1487 }, { "completion_length": 99.46875, "epoch": 0.992, "grad_norm": 2.867749739946645, "kl": 0.058349609375, "learning_rate": 5.04e-07, "loss": 0.0023, "reward": 1.7083333730697632, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333730697632, "step": 1488 }, { "completion_length": 108.125, "epoch": 0.9926666666666667, "grad_norm": 3.202920019444098, "kl": 0.07763671875, "learning_rate": 5.036666666666666e-07, "loss": 0.0031, "reward": 1.7890625, "reward_std": 0.20721149444580078, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7890625, "step": 1489 }, { "completion_length": 112.09375, "epoch": 0.9933333333333333, "grad_norm": 2.084374857475752, "kl": 0.0458984375, "learning_rate": 5.033333333333333e-07, "loss": 0.0018, "reward": 1.8427083492279053, "reward_std": 0.13595813512802124, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8427083492279053, "step": 1490 }, { "completion_length": 90.96875, "epoch": 0.994, "grad_norm": 7.374413806999838, "kl": 0.09716796875, "learning_rate": 5.03e-07, "loss": 0.0039, "reward": 1.852083444595337, "reward_std": 0.0472952164709568, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8520833253860474, "step": 1491 }, { "completion_length": 97.5625, "epoch": 0.9946666666666667, "grad_norm": 3.254387383219098, "kl": 0.078125, "learning_rate": 5.026666666666667e-07, "loss": 0.0031, "reward": 1.8807291984558105, "reward_std": 0.07509791851043701, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8807291984558105, "step": 1492 }, { "completion_length": 103.09375, "epoch": 0.9953333333333333, "grad_norm": 3.3785042220512804, "kl": 0.057373046875, "learning_rate": 5.023333333333332e-07, "loss": 0.0023, "reward": 1.59375, "reward_std": 0.23325318098068237, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.625, "step": 1493 }, { "completion_length": 88.625, "epoch": 0.996, "grad_norm": 12.12783422221258, "kl": 0.134765625, "learning_rate": 5.02e-07, "loss": 0.0054, "reward": 1.5385416746139526, "reward_std": 0.14025351405143738, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5385416746139526, "step": 1494 }, { "completion_length": 96.96875, "epoch": 0.9966666666666667, "grad_norm": 17.785233620302122, "kl": 0.09814453125, "learning_rate": 5.016666666666667e-07, "loss": 0.0039, "reward": 1.7969756126403809, "reward_std": 0.17319706082344055, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8282255530357361, "step": 1495 }, { "completion_length": 99.6875, "epoch": 0.9973333333333333, "grad_norm": 4.4518451201154265, "kl": 0.06005859375, "learning_rate": 5.013333333333333e-07, "loss": 0.0024, "reward": 1.8791667222976685, "reward_std": 0.02805020846426487, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8791666626930237, "step": 1496 }, { "completion_length": 105.09375, "epoch": 0.998, "grad_norm": 2.3106498211094264, "kl": 0.0546875, "learning_rate": 5.009999999999999e-07, "loss": 0.0022, "reward": 1.9166667461395264, "reward_std": 0.012028136290609837, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9166667461395264, "step": 1497 }, { "completion_length": 94.65625, "epoch": 0.9986666666666667, "grad_norm": 40.07727886579413, "kl": 0.10400390625, "learning_rate": 5.006666666666667e-07, "loss": 0.0042, "reward": 1.6979167461395264, "reward_std": 0.25098347663879395, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7291666865348816, "step": 1498 }, { "completion_length": 109.6875, "epoch": 0.9993333333333333, "grad_norm": 4.594618306309033, "kl": 0.048095703125, "learning_rate": 5.003333333333333e-07, "loss": 0.0019, "reward": 1.9208333492279053, "reward_std": 0.10374574363231659, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9208333492279053, "step": 1499 }, { "completion_length": 98.9375, "epoch": 1.0, "grad_norm": 2.7039249139074313, "kl": 0.080078125, "learning_rate": 5e-07, "loss": 0.0032, "reward": 1.671875, "reward_std": 0.24775634706020355, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.734375, "step": 1500 }, { "completion_length": 99.6875, "epoch": 1.0006666666666666, "grad_norm": 1.29431392788463, "kl": 0.043701171875, "learning_rate": 4.996666666666667e-07, "loss": 0.0017, "reward": 1.75, "reward_std": 0.19716878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1501 }, { "completion_length": 96.25, "epoch": 1.0013333333333334, "grad_norm": 2.823542754937458, "kl": 0.06103515625, "learning_rate": 4.993333333333333e-07, "loss": 0.0024, "reward": 1.9140625, "reward_std": 0.140625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9140625, "step": 1502 }, { "completion_length": 92.96875, "epoch": 1.002, "grad_norm": 4.190583704414798, "kl": 0.0732421875, "learning_rate": 4.99e-07, "loss": 0.0029, "reward": 1.6713541746139526, "reward_std": 0.08411861211061478, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6713541746139526, "step": 1503 }, { "completion_length": 92.59375, "epoch": 1.0026666666666666, "grad_norm": 2.6172881395729695, "kl": 0.0791015625, "learning_rate": 4.986666666666666e-07, "loss": 0.0032, "reward": 1.7604167461395264, "reward_std": 0.2687346339225769, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8229166865348816, "step": 1504 }, { "completion_length": 103.15625, "epoch": 1.0033333333333334, "grad_norm": 1.6652669414648757, "kl": 0.05517578125, "learning_rate": 4.983333333333334e-07, "loss": 0.0022, "reward": 1.71875, "reward_std": 0.16456207633018494, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.75, "step": 1505 }, { "completion_length": 102.0625, "epoch": 1.004, "grad_norm": 3.0045952339427155, "kl": 0.0634765625, "learning_rate": 4.979999999999999e-07, "loss": 0.0025, "reward": 1.6744792461395264, "reward_std": 0.3153773546218872, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7057291865348816, "step": 1506 }, { "completion_length": 121.0, "epoch": 1.0046666666666666, "grad_norm": 2.2693378558836543, "kl": 0.05908203125, "learning_rate": 4.976666666666666e-07, "loss": 0.0024, "reward": 1.7473958730697632, "reward_std": 0.3578792214393616, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8098958730697632, "step": 1507 }, { "completion_length": 105.625, "epoch": 1.0053333333333334, "grad_norm": 3.2581258000771856, "kl": 0.0810546875, "learning_rate": 4.973333333333333e-07, "loss": 0.0032, "reward": 1.8390624523162842, "reward_std": 0.041737549006938934, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8390624523162842, "step": 1508 }, { "completion_length": 123.03125, "epoch": 1.006, "grad_norm": 2.3089874218949604, "kl": 0.0458984375, "learning_rate": 4.97e-07, "loss": 0.0018, "reward": 1.890625, "reward_std": 0.20554219186306, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.953125, "step": 1509 }, { "completion_length": 113.21875, "epoch": 1.0066666666666666, "grad_norm": 2.40562496173657, "kl": 0.0478515625, "learning_rate": 4.966666666666666e-07, "loss": 0.0019, "reward": 1.834375023841858, "reward_std": 0.13582530617713928, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8656250238418579, "step": 1510 }, { "completion_length": 98.75, "epoch": 1.0073333333333334, "grad_norm": 5.576039171356128, "kl": 0.08056640625, "learning_rate": 4.963333333333333e-07, "loss": 0.0032, "reward": 1.828125, "reward_std": 0.1354166567325592, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.859375, "step": 1511 }, { "completion_length": 87.1875, "epoch": 1.008, "grad_norm": 16.965857732545274, "kl": 0.07666015625, "learning_rate": 4.96e-07, "loss": 0.0031, "reward": 1.9375, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.96875, "step": 1512 }, { "completion_length": 98.09375, "epoch": 1.0086666666666666, "grad_norm": 1.9941811028036271, "kl": 0.0634765625, "learning_rate": 4.956666666666667e-07, "loss": 0.0025, "reward": 1.6770833730697632, "reward_std": 0.25966876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7083333730697632, "step": 1513 }, { "completion_length": 108.375, "epoch": 1.0093333333333334, "grad_norm": 3.3228232448067634, "kl": 0.060302734375, "learning_rate": 4.953333333333333e-07, "loss": 0.0024, "reward": 1.7410714626312256, "reward_std": 0.12196988612413406, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7410714626312256, "step": 1514 }, { "completion_length": 86.53125, "epoch": 1.01, "grad_norm": 5.094854526296945, "kl": 0.068359375, "learning_rate": 4.95e-07, "loss": 0.0027, "reward": 1.9036458730697632, "reward_std": 0.1302083283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9348958730697632, "step": 1515 }, { "completion_length": 100.59375, "epoch": 1.0106666666666666, "grad_norm": 4.891464021244395, "kl": 0.07421875, "learning_rate": 4.946666666666666e-07, "loss": 0.003, "reward": 1.754166603088379, "reward_std": 0.06492941081523895, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7541666626930237, "step": 1516 }, { "completion_length": 102.25, "epoch": 1.0113333333333334, "grad_norm": 5.330005946523682, "kl": 0.059814453125, "learning_rate": 4.943333333333333e-07, "loss": 0.0024, "reward": 1.8203125, "reward_std": 0.029264584183692932, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203124403953552, "step": 1517 }, { "completion_length": 103.25, "epoch": 1.012, "grad_norm": 3.1956624305221975, "kl": 0.05908203125, "learning_rate": 4.94e-07, "loss": 0.0024, "reward": 1.9479167461395264, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9479166865348816, "step": 1518 }, { "completion_length": 81.375, "epoch": 1.0126666666666666, "grad_norm": 3.2967043180565336, "kl": 0.08251953125, "learning_rate": 4.936666666666666e-07, "loss": 0.0033, "reward": 1.9192708730697632, "reward_std": 0.07576563954353333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9192708730697632, "step": 1519 }, { "completion_length": 106.375, "epoch": 1.0133333333333334, "grad_norm": 12.070680802088972, "kl": 0.07958984375, "learning_rate": 4.933333333333333e-07, "loss": 0.0032, "reward": 1.8583333492279053, "reward_std": 0.15305021405220032, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8895833492279053, "step": 1520 }, { "completion_length": 105.78125, "epoch": 1.014, "grad_norm": 51.41640888747982, "kl": 0.048583984375, "learning_rate": 4.93e-07, "loss": 0.0019, "reward": 1.984375, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.984375, "step": 1521 }, { "completion_length": 102.375, "epoch": 1.0146666666666666, "grad_norm": 1.9985993504766661, "kl": 0.06298828125, "learning_rate": 4.926666666666667e-07, "loss": 0.0025, "reward": 1.7843749523162842, "reward_std": 0.18582531809806824, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.815625011920929, "step": 1522 }, { "completion_length": 86.59375, "epoch": 1.0153333333333334, "grad_norm": 1.0402929860602146, "kl": 0.044189453125, "learning_rate": 4.923333333333333e-07, "loss": 0.0018, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1523 }, { "completion_length": 111.875, "epoch": 1.016, "grad_norm": 0.7259154208198787, "kl": 0.05517578125, "learning_rate": 4.92e-07, "loss": 0.0022, "reward": 1.7604167461395264, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7604166269302368, "step": 1524 }, { "completion_length": 91.71875, "epoch": 1.0166666666666666, "grad_norm": 2.284494884676374, "kl": 0.053466796875, "learning_rate": 4.916666666666666e-07, "loss": 0.0021, "reward": 1.515625, "reward_std": 0.20554219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.515625, "step": 1525 }, { "completion_length": 100.84375, "epoch": 1.0173333333333334, "grad_norm": 2.210240408811141, "kl": 0.0712890625, "learning_rate": 4.913333333333334e-07, "loss": 0.0028, "reward": 1.917708396911621, "reward_std": 0.02708333171904087, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9177082777023315, "step": 1526 }, { "completion_length": 82.875, "epoch": 1.018, "grad_norm": 10.912584757018848, "kl": 0.06494140625, "learning_rate": 4.909999999999999e-07, "loss": 0.0026, "reward": 1.853124976158142, "reward_std": 0.11591878533363342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8531249761581421, "step": 1527 }, { "completion_length": 92.875, "epoch": 1.0186666666666666, "grad_norm": 6.795091893717162, "kl": 0.08642578125, "learning_rate": 4.906666666666666e-07, "loss": 0.0035, "reward": 1.921875, "reward_std": 0.15625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.953125, "step": 1528 }, { "completion_length": 91.375, "epoch": 1.0193333333333334, "grad_norm": 2.001766674100266, "kl": 0.056396484375, "learning_rate": 4.903333333333333e-07, "loss": 0.0023, "reward": 1.6796875, "reward_std": 0.140625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6796875, "step": 1529 }, { "completion_length": 89.5, "epoch": 1.02, "grad_norm": 4.2365446623586696, "kl": 0.07275390625, "learning_rate": 4.9e-07, "loss": 0.0029, "reward": 1.7057292461395264, "reward_std": 0.0781249925494194, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7057291865348816, "step": 1530 }, { "completion_length": 89.375, "epoch": 1.0206666666666666, "grad_norm": 7.777116973536991, "kl": 0.0712890625, "learning_rate": 4.896666666666666e-07, "loss": 0.0028, "reward": 1.7510416507720947, "reward_std": 0.21780626475811005, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7510416507720947, "step": 1531 }, { "completion_length": 101.8125, "epoch": 1.0213333333333334, "grad_norm": 5.600841791209826, "kl": 0.052490234375, "learning_rate": 4.893333333333333e-07, "loss": 0.0021, "reward": 1.875, "reward_std": 0.1666666567325592, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 1532 }, { "completion_length": 94.1875, "epoch": 1.022, "grad_norm": 3.5494381265728814, "kl": 0.04541015625, "learning_rate": 4.89e-07, "loss": 0.0018, "reward": 1.671875, "reward_std": 0.3647119998931885, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.703125, "step": 1533 }, { "completion_length": 86.25, "epoch": 1.0226666666666666, "grad_norm": 1.8109437108674824, "kl": 0.0693359375, "learning_rate": 4.886666666666667e-07, "loss": 0.0028, "reward": 1.7833333015441895, "reward_std": 0.019543396309018135, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7833333015441895, "step": 1534 }, { "completion_length": 89.65625, "epoch": 1.0233333333333334, "grad_norm": 2.060145537234263, "kl": 0.06689453125, "learning_rate": 4.883333333333334e-07, "loss": 0.0027, "reward": 1.766369104385376, "reward_std": 0.02703244984149933, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7663690447807312, "step": 1535 }, { "completion_length": 108.1875, "epoch": 1.024, "grad_norm": 5.601688247963329, "kl": 0.060791015625, "learning_rate": 4.879999999999999e-07, "loss": 0.0024, "reward": 1.7199616432189941, "reward_std": 0.2234019935131073, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7512115240097046, "step": 1536 }, { "completion_length": 77.84375, "epoch": 1.0246666666666666, "grad_norm": 7.880148889276826, "kl": 0.0771484375, "learning_rate": 4.876666666666667e-07, "loss": 0.0031, "reward": 1.8802083730697632, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8802083730697632, "step": 1537 }, { "completion_length": 99.625, "epoch": 1.0253333333333334, "grad_norm": 2.3463686062173745, "kl": 0.09130859375, "learning_rate": 4.873333333333333e-07, "loss": 0.0036, "reward": 1.84375, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 1538 }, { "completion_length": 94.15625, "epoch": 1.026, "grad_norm": 1.4555852669236735, "kl": 0.0703125, "learning_rate": 4.87e-07, "loss": 0.0028, "reward": 1.9375, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1539 }, { "completion_length": 89.0, "epoch": 1.0266666666666666, "grad_norm": 3.752253461868893, "kl": 0.10107421875, "learning_rate": 4.866666666666666e-07, "loss": 0.004, "reward": 1.8000000715255737, "reward_std": 0.034652598202228546, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.800000011920929, "step": 1540 }, { "completion_length": 84.96875, "epoch": 1.0273333333333334, "grad_norm": 4.7166235355626585, "kl": 0.0703125, "learning_rate": 4.863333333333333e-07, "loss": 0.0028, "reward": 1.8947917222976685, "reward_std": 0.09280625730752945, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8947916626930237, "step": 1541 }, { "completion_length": 106.0, "epoch": 1.028, "grad_norm": 2.657213646761315, "kl": 0.05517578125, "learning_rate": 4.86e-07, "loss": 0.0022, "reward": 1.75, "reward_std": 0.21650634706020355, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8125, "step": 1542 }, { "completion_length": 93.78125, "epoch": 1.0286666666666666, "grad_norm": 3.90650782141693, "kl": 0.1142578125, "learning_rate": 4.856666666666667e-07, "loss": 0.0046, "reward": 1.8588541746139526, "reward_std": 0.07395833730697632, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8588541746139526, "step": 1543 }, { "completion_length": 94.90625, "epoch": 1.0293333333333334, "grad_norm": 2.746065819621612, "kl": 0.0693359375, "learning_rate": 4.853333333333333e-07, "loss": 0.0028, "reward": 1.78125, "reward_std": 0.19362975656986237, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.84375, "step": 1544 }, { "completion_length": 98.40625, "epoch": 1.03, "grad_norm": 2.44425596789566, "kl": 0.0634765625, "learning_rate": 4.85e-07, "loss": 0.0025, "reward": 1.6770833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7083333134651184, "step": 1545 }, { "completion_length": 106.03125, "epoch": 1.0306666666666666, "grad_norm": 2.5286055826309632, "kl": 0.0947265625, "learning_rate": 4.846666666666667e-07, "loss": 0.0038, "reward": 1.7974780797958374, "reward_std": 0.18837720155715942, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8287280797958374, "step": 1546 }, { "completion_length": 103.46875, "epoch": 1.0313333333333334, "grad_norm": 1.8211709958349172, "kl": 0.03955078125, "learning_rate": 4.843333333333334e-07, "loss": 0.0016, "reward": 1.75, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1547 }, { "completion_length": 87.40625, "epoch": 1.032, "grad_norm": 4.478505447713254, "kl": 0.06982421875, "learning_rate": 4.839999999999999e-07, "loss": 0.0028, "reward": 1.9401042461395264, "reward_std": 0.0364583358168602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9401041269302368, "step": 1548 }, { "completion_length": 85.375, "epoch": 1.0326666666666666, "grad_norm": 1.1882906870022858, "kl": 0.0537109375, "learning_rate": 4.836666666666666e-07, "loss": 0.0022, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1549 }, { "completion_length": 85.53125, "epoch": 1.0333333333333334, "grad_norm": 6.93704527170352, "kl": 0.07568359375, "learning_rate": 4.833333333333333e-07, "loss": 0.003, "reward": 1.8234374523162842, "reward_std": 0.23779377341270447, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.854687511920929, "step": 1550 }, { "completion_length": 97.125, "epoch": 1.034, "grad_norm": 2.0559445858427536, "kl": 0.07861328125, "learning_rate": 4.83e-07, "loss": 0.0032, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1551 }, { "completion_length": 86.5, "epoch": 1.0346666666666666, "grad_norm": 3.4569713518579883, "kl": 0.08349609375, "learning_rate": 4.826666666666666e-07, "loss": 0.0033, "reward": 1.753645896911621, "reward_std": 0.13680440187454224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7536457777023315, "step": 1552 }, { "completion_length": 108.0625, "epoch": 1.0353333333333334, "grad_norm": 5.903809498111754, "kl": 0.0810546875, "learning_rate": 4.823333333333333e-07, "loss": 0.0032, "reward": 1.7317708730697632, "reward_std": 0.15549421310424805, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7630208134651184, "step": 1553 }, { "completion_length": 99.5, "epoch": 1.036, "grad_norm": 10.028189510689224, "kl": 0.09912109375, "learning_rate": 4.82e-07, "loss": 0.004, "reward": 1.7729166746139526, "reward_std": 0.35133546590805054, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8354166746139526, "step": 1554 }, { "completion_length": 102.5, "epoch": 1.0366666666666666, "grad_norm": 1.856666882566485, "kl": 0.047119140625, "learning_rate": 4.816666666666667e-07, "loss": 0.0019, "reward": 1.8838541507720947, "reward_std": 0.16979166865348816, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9151041507720947, "step": 1555 }, { "completion_length": 97.21875, "epoch": 1.0373333333333334, "grad_norm": 4.293473168693523, "kl": 0.072265625, "learning_rate": 4.813333333333334e-07, "loss": 0.0029, "reward": 1.9812500476837158, "reward_std": 0.012500002980232239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9812500476837158, "step": 1556 }, { "completion_length": 106.5625, "epoch": 1.038, "grad_norm": 2.5245849911781892, "kl": 0.047119140625, "learning_rate": 4.809999999999999e-07, "loss": 0.0019, "reward": 1.828125, "reward_std": 0.21342839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.859375, "step": 1557 }, { "completion_length": 83.8125, "epoch": 1.0386666666666666, "grad_norm": 2.211907060332901, "kl": 0.07373046875, "learning_rate": 4.806666666666667e-07, "loss": 0.0029, "reward": 1.7135417461395264, "reward_std": 0.1979166567325592, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7447916865348816, "step": 1558 }, { "completion_length": 100.625, "epoch": 1.0393333333333334, "grad_norm": 10.2248740462783, "kl": 0.0625, "learning_rate": 4.803333333333333e-07, "loss": 0.0025, "reward": 1.7911458015441895, "reward_std": 0.20289263129234314, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7911458015441895, "step": 1559 }, { "completion_length": 101.0, "epoch": 1.04, "grad_norm": 0.20615002280971054, "kl": 0.072265625, "learning_rate": 4.8e-07, "loss": 0.0029, "reward": 1.8125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1560 }, { "completion_length": 97.53125, "epoch": 1.0406666666666666, "grad_norm": 1.3256210864253422, "kl": 0.0703125, "learning_rate": 4.796666666666666e-07, "loss": 0.0028, "reward": 1.90625, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1561 }, { "completion_length": 97.0, "epoch": 1.0413333333333332, "grad_norm": 2.4176924017507835, "kl": 0.07470703125, "learning_rate": 4.793333333333333e-07, "loss": 0.003, "reward": 1.7916667461395264, "reward_std": 0.27169692516326904, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8229167461395264, "step": 1562 }, { "completion_length": 89.1875, "epoch": 1.042, "grad_norm": 33.30750227953641, "kl": 0.091796875, "learning_rate": 4.79e-07, "loss": 0.0037, "reward": 1.8520833253860474, "reward_std": 0.030849505215883255, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8520833253860474, "step": 1563 }, { "completion_length": 111.8125, "epoch": 1.0426666666666666, "grad_norm": 2.5366363859089978, "kl": 0.0869140625, "learning_rate": 4.786666666666667e-07, "loss": 0.0035, "reward": 1.859375, "reward_std": 0.14304219186306, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.890625, "step": 1564 }, { "completion_length": 100.5625, "epoch": 1.0433333333333334, "grad_norm": 4.330106513329277, "kl": 0.07666015625, "learning_rate": 4.783333333333333e-07, "loss": 0.0031, "reward": 1.8468749523162842, "reward_std": 0.16521096229553223, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8468749523162842, "step": 1565 }, { "completion_length": 89.96875, "epoch": 1.044, "grad_norm": 5.7606942717292915, "kl": 0.06298828125, "learning_rate": 4.779999999999999e-07, "loss": 0.0025, "reward": 1.7859375476837158, "reward_std": 0.12812499701976776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7859375476837158, "step": 1566 }, { "completion_length": 102.03125, "epoch": 1.0446666666666666, "grad_norm": 2.006691349473339, "kl": 0.051513671875, "learning_rate": 4.776666666666667e-07, "loss": 0.0021, "reward": 1.84375, "reward_std": 0.09300211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8437500596046448, "step": 1567 }, { "completion_length": 86.15625, "epoch": 1.0453333333333332, "grad_norm": 3.079300995059884, "kl": 0.08984375, "learning_rate": 4.773333333333333e-07, "loss": 0.0036, "reward": 1.8052083253860474, "reward_std": 0.051794592291116714, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8052083849906921, "step": 1568 }, { "completion_length": 98.15625, "epoch": 1.046, "grad_norm": 1.9403529199626917, "kl": 0.078125, "learning_rate": 4.769999999999999e-07, "loss": 0.0031, "reward": 1.8958333730697632, "reward_std": 0.1310943365097046, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 1569 }, { "completion_length": 95.5, "epoch": 1.0466666666666666, "grad_norm": 3.2619320796777664, "kl": 0.0927734375, "learning_rate": 4.7666666666666667e-07, "loss": 0.0037, "reward": 1.8670387268066406, "reward_std": 0.05811561271548271, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8670387268066406, "step": 1570 }, { "completion_length": 78.46875, "epoch": 1.0473333333333332, "grad_norm": 4.330083127940947, "kl": 0.072265625, "learning_rate": 4.763333333333333e-07, "loss": 0.0029, "reward": 1.625, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.65625, "step": 1571 }, { "completion_length": 82.4375, "epoch": 1.048, "grad_norm": 1.291344310041839, "kl": 0.05908203125, "learning_rate": 4.76e-07, "loss": 0.0024, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 1572 }, { "completion_length": 104.40625, "epoch": 1.0486666666666666, "grad_norm": 4.795211242807727, "kl": 0.0859375, "learning_rate": 4.7566666666666665e-07, "loss": 0.0034, "reward": 1.6218750476837158, "reward_std": 0.1409187763929367, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.621874988079071, "step": 1573 }, { "completion_length": 101.9375, "epoch": 1.0493333333333332, "grad_norm": 2.5523631620339233, "kl": 0.056396484375, "learning_rate": 4.7533333333333333e-07, "loss": 0.0022, "reward": 1.7983630895614624, "reward_std": 0.07424873113632202, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7983630895614624, "step": 1574 }, { "completion_length": 86.96875, "epoch": 1.05, "grad_norm": 21.40328328246686, "kl": 0.080078125, "learning_rate": 4.7499999999999995e-07, "loss": 0.0032, "reward": 1.7464284896850586, "reward_std": 0.020444253459572792, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7464285492897034, "step": 1575 }, { "completion_length": 98.9375, "epoch": 1.0506666666666666, "grad_norm": 6.3786624464734025, "kl": 0.06396484375, "learning_rate": 4.746666666666667e-07, "loss": 0.0025, "reward": 1.8302083015441895, "reward_std": 0.08958332985639572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8302083611488342, "step": 1576 }, { "completion_length": 92.8125, "epoch": 1.0513333333333332, "grad_norm": 5.546552742529377, "kl": 0.06689453125, "learning_rate": 4.743333333333333e-07, "loss": 0.0027, "reward": 1.75, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1577 }, { "completion_length": 99.84375, "epoch": 1.052, "grad_norm": 3.7102778948177155, "kl": 0.0751953125, "learning_rate": 4.7399999999999993e-07, "loss": 0.003, "reward": 1.7140624523162842, "reward_std": 0.20089209079742432, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7140624523162842, "step": 1578 }, { "completion_length": 108.15625, "epoch": 1.0526666666666666, "grad_norm": 2.6845137162482455, "kl": 0.10888671875, "learning_rate": 4.7366666666666666e-07, "loss": 0.0044, "reward": 1.8541667461395264, "reward_std": 0.22358438372612, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8854166865348816, "step": 1579 }, { "completion_length": 96.96875, "epoch": 1.0533333333333332, "grad_norm": 2.47898427697115, "kl": 0.0791015625, "learning_rate": 4.733333333333333e-07, "loss": 0.0032, "reward": 1.836309552192688, "reward_std": 0.10755520313978195, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.836309552192688, "step": 1580 }, { "completion_length": 95.0625, "epoch": 1.054, "grad_norm": 2.6922725852588405, "kl": 0.060302734375, "learning_rate": 4.7299999999999996e-07, "loss": 0.0024, "reward": 1.745833396911621, "reward_std": 0.2377961128950119, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7770833373069763, "step": 1581 }, { "completion_length": 96.5625, "epoch": 1.0546666666666666, "grad_norm": 2.0130975736025314, "kl": 0.0732421875, "learning_rate": 4.7266666666666664e-07, "loss": 0.0029, "reward": 1.6864583492279053, "reward_std": 0.10818375647068024, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6864583492279053, "step": 1582 }, { "completion_length": 86.65625, "epoch": 1.0553333333333332, "grad_norm": 7.049486590150991, "kl": 0.099609375, "learning_rate": 4.723333333333333e-07, "loss": 0.004, "reward": 1.742708444595337, "reward_std": 0.022916670888662338, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7427082657814026, "step": 1583 }, { "completion_length": 93.78125, "epoch": 1.056, "grad_norm": 0.12892854401647894, "kl": 0.0712890625, "learning_rate": 4.7199999999999994e-07, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 1584 }, { "completion_length": 86.5, "epoch": 1.0566666666666666, "grad_norm": 2.8001712476059715, "kl": 0.068359375, "learning_rate": 4.7166666666666667e-07, "loss": 0.0027, "reward": 1.8359375, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8359375, "step": 1585 }, { "completion_length": 90.0625, "epoch": 1.0573333333333332, "grad_norm": 4.872293579206793, "kl": 0.0673828125, "learning_rate": 4.713333333333333e-07, "loss": 0.0027, "reward": 1.8182291984558105, "reward_std": 0.10385098308324814, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8182291984558105, "step": 1586 }, { "completion_length": 96.0625, "epoch": 1.058, "grad_norm": 1.7009433918440182, "kl": 0.08251953125, "learning_rate": 4.7099999999999997e-07, "loss": 0.0033, "reward": 1.8541667461395264, "reward_std": 0.02405625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666865348816, "step": 1587 }, { "completion_length": 94.90625, "epoch": 1.0586666666666666, "grad_norm": 2.1622652550597614, "kl": 0.08740234375, "learning_rate": 4.7066666666666665e-07, "loss": 0.0035, "reward": 1.8156249523162842, "reward_std": 0.13696783781051636, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.846875011920929, "step": 1588 }, { "completion_length": 99.6875, "epoch": 1.0593333333333332, "grad_norm": 1.3555017850459383, "kl": 0.078125, "learning_rate": 4.703333333333333e-07, "loss": 0.0031, "reward": 1.6927083730697632, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6927083134651184, "step": 1589 }, { "completion_length": 96.71875, "epoch": 1.06, "grad_norm": 7.43467500435708, "kl": 0.267578125, "learning_rate": 4.6999999999999995e-07, "loss": 0.0108, "reward": 1.792708396911621, "reward_std": 0.19148236513137817, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8239583373069763, "step": 1590 }, { "completion_length": 104.84375, "epoch": 1.0606666666666666, "grad_norm": 3.9023346669395615, "kl": 0.0927734375, "learning_rate": 4.696666666666667e-07, "loss": 0.0037, "reward": 1.7218749523162842, "reward_std": 0.2131471484899521, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.753125011920929, "step": 1591 }, { "completion_length": 91.4375, "epoch": 1.0613333333333332, "grad_norm": 0.4161899307017755, "kl": 0.06787109375, "learning_rate": 4.693333333333333e-07, "loss": 0.0027, "reward": 1.9375, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.96875, "step": 1592 }, { "completion_length": 94.0625, "epoch": 1.062, "grad_norm": 0.08038045368336012, "kl": 0.062255859375, "learning_rate": 4.689999999999999e-07, "loss": 0.0025, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 1593 }, { "completion_length": 83.375, "epoch": 1.0626666666666666, "grad_norm": 1.7553108505960793, "kl": 0.03759765625, "learning_rate": 4.6866666666666665e-07, "loss": 0.0015, "reward": 1.6875, "reward_std": 0.0416666641831398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875000596046448, "step": 1594 }, { "completion_length": 90.6875, "epoch": 1.0633333333333332, "grad_norm": 2.714573437255074, "kl": 0.06689453125, "learning_rate": 4.683333333333333e-07, "loss": 0.0027, "reward": 1.8963541984558105, "reward_std": 0.1268632858991623, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8963541984558105, "step": 1595 }, { "completion_length": 95.84375, "epoch": 1.064, "grad_norm": 1.916487307025546, "kl": 0.0830078125, "learning_rate": 4.68e-07, "loss": 0.0033, "reward": 1.8677455186843872, "reward_std": 0.0680803582072258, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8677455186843872, "step": 1596 }, { "completion_length": 108.25, "epoch": 1.0646666666666667, "grad_norm": 12.965269680434744, "kl": 0.1298828125, "learning_rate": 4.6766666666666663e-07, "loss": 0.0052, "reward": 1.7572916746139526, "reward_std": 0.2239268273115158, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7885416746139526, "step": 1597 }, { "completion_length": 98.03125, "epoch": 1.0653333333333332, "grad_norm": 2.6452398421147403, "kl": 0.06298828125, "learning_rate": 4.673333333333333e-07, "loss": 0.0025, "reward": 1.7213542461395264, "reward_std": 0.2135416716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7213541865348816, "step": 1598 }, { "completion_length": 86.53125, "epoch": 1.066, "grad_norm": 4.303529781289758, "kl": 0.0673828125, "learning_rate": 4.67e-07, "loss": 0.0027, "reward": 1.828125, "reward_std": 0.0729166567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.828125, "step": 1599 }, { "completion_length": 90.25, "epoch": 1.0666666666666667, "grad_norm": 5.456038813849112, "kl": 0.0732421875, "learning_rate": 4.6666666666666666e-07, "loss": 0.0029, "reward": 1.7135417461395264, "reward_std": 0.2823058068752289, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7760416865348816, "step": 1600 }, { "completion_length": 89.125, "epoch": 1.0673333333333332, "grad_norm": 1.2416301857338004, "kl": 0.0458984375, "learning_rate": 4.663333333333333e-07, "loss": 0.0018, "reward": 1.7083333730697632, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7395833730697632, "step": 1601 }, { "completion_length": 89.53125, "epoch": 1.068, "grad_norm": 1.9512527803052573, "kl": 0.07177734375, "learning_rate": 4.66e-07, "loss": 0.0029, "reward": 1.90625, "reward_std": 0.020833343267440796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9062500596046448, "step": 1602 }, { "completion_length": 87.03125, "epoch": 1.0686666666666667, "grad_norm": 2.7587305201424908, "kl": 0.038330078125, "learning_rate": 4.6566666666666664e-07, "loss": 0.0015, "reward": 1.7447917461395264, "reward_std": 0.09144821763038635, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7447916269302368, "step": 1603 }, { "completion_length": 93.3125, "epoch": 1.0693333333333332, "grad_norm": 4.74839398382346, "kl": 0.0634765625, "learning_rate": 4.653333333333333e-07, "loss": 0.0025, "reward": 1.8802083730697632, "reward_std": 0.11411616206169128, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8802083134651184, "step": 1604 }, { "completion_length": 93.84375, "epoch": 1.07, "grad_norm": 1.559091926825019, "kl": 0.061767578125, "learning_rate": 4.65e-07, "loss": 0.0025, "reward": 1.6979167461395264, "reward_std": 0.08655625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6979166865348816, "step": 1605 }, { "completion_length": 87.03125, "epoch": 1.0706666666666667, "grad_norm": 2.868852648957287, "kl": 0.0732421875, "learning_rate": 4.6466666666666667e-07, "loss": 0.0029, "reward": 1.9479167461395264, "reward_std": 0.03608439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9479166865348816, "step": 1606 }, { "completion_length": 94.40625, "epoch": 1.0713333333333332, "grad_norm": 4.0753612131775885, "kl": 0.0654296875, "learning_rate": 4.643333333333333e-07, "loss": 0.0026, "reward": 1.850000023841858, "reward_std": 0.09377313405275345, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8500000238418579, "step": 1607 }, { "completion_length": 97.90625, "epoch": 1.072, "grad_norm": 2.1413622295727888, "kl": 0.053955078125, "learning_rate": 4.64e-07, "loss": 0.0022, "reward": 1.8489583730697632, "reward_std": 0.20554219186306, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8802083134651184, "step": 1608 }, { "completion_length": 110.34375, "epoch": 1.0726666666666667, "grad_norm": 0.887684091648187, "kl": 0.0654296875, "learning_rate": 4.6366666666666665e-07, "loss": 0.0026, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1609 }, { "completion_length": 116.71875, "epoch": 1.0733333333333333, "grad_norm": 1.6894843858482826, "kl": 0.056640625, "learning_rate": 4.633333333333333e-07, "loss": 0.0023, "reward": 1.59375, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.59375, "step": 1610 }, { "completion_length": 101.53125, "epoch": 1.074, "grad_norm": 4.261594570089079, "kl": 0.0986328125, "learning_rate": 4.63e-07, "loss": 0.0039, "reward": 1.7291667461395264, "reward_std": 0.03333333134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7291666865348816, "step": 1611 }, { "completion_length": 100.09375, "epoch": 1.0746666666666667, "grad_norm": 10.074271184453785, "kl": 0.0908203125, "learning_rate": 4.6266666666666663e-07, "loss": 0.0036, "reward": 1.839583396911621, "reward_std": 0.04793979972600937, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8395833373069763, "step": 1612 }, { "completion_length": 89.5625, "epoch": 1.0753333333333333, "grad_norm": 2.6682192408752408, "kl": 0.06982421875, "learning_rate": 4.623333333333333e-07, "loss": 0.0028, "reward": 1.9052083492279053, "reward_std": 0.10818374902009964, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9052083492279053, "step": 1613 }, { "completion_length": 106.28125, "epoch": 1.076, "grad_norm": 0.6790883513246131, "kl": 0.052734375, "learning_rate": 4.62e-07, "loss": 0.0021, "reward": 1.625, "reward_std": 0.10206207633018494, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.65625, "step": 1614 }, { "completion_length": 105.46875, "epoch": 1.0766666666666667, "grad_norm": 4.259993720395056, "kl": 0.109375, "learning_rate": 4.6166666666666666e-07, "loss": 0.0044, "reward": 1.8010417222976685, "reward_std": 0.10275696218013763, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8010416626930237, "step": 1615 }, { "completion_length": 85.875, "epoch": 1.0773333333333333, "grad_norm": 3.8410505677058695, "kl": 0.052734375, "learning_rate": 4.613333333333333e-07, "loss": 0.0021, "reward": 1.8125, "reward_std": 0.25, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1616 }, { "completion_length": 97.28125, "epoch": 1.078, "grad_norm": 4.335818881034851, "kl": 0.08935546875, "learning_rate": 4.61e-07, "loss": 0.0036, "reward": 1.9629464149475098, "reward_std": 0.03319448605179787, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9629464149475098, "step": 1617 }, { "completion_length": 108.28125, "epoch": 1.0786666666666667, "grad_norm": 4.154132506925075, "kl": 0.05712890625, "learning_rate": 4.6066666666666664e-07, "loss": 0.0023, "reward": 1.727529764175415, "reward_std": 0.12019617855548859, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.727529764175415, "step": 1618 }, { "completion_length": 105.78125, "epoch": 1.0793333333333333, "grad_norm": 16.123209597159242, "kl": 0.0439453125, "learning_rate": 4.603333333333333e-07, "loss": 0.0018, "reward": 1.6416666507720947, "reward_std": 0.07499999552965164, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6729167103767395, "step": 1619 }, { "completion_length": 103.59375, "epoch": 1.08, "grad_norm": 2.4324198996946866, "kl": 0.07373046875, "learning_rate": 4.6e-07, "loss": 0.003, "reward": 1.6688988208770752, "reward_std": 0.150297611951828, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6688988208770752, "step": 1620 }, { "completion_length": 96.5, "epoch": 1.0806666666666667, "grad_norm": 2.5653582635392547, "kl": 0.12109375, "learning_rate": 4.5966666666666667e-07, "loss": 0.0048, "reward": 1.759374976158142, "reward_std": 0.2755875587463379, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7906250357627869, "step": 1621 }, { "completion_length": 116.96875, "epoch": 1.0813333333333333, "grad_norm": 1.8966257948360046, "kl": 0.057373046875, "learning_rate": 4.593333333333333e-07, "loss": 0.0023, "reward": 1.609375, "reward_std": 0.24537084996700287, "rewards/format_reward": 0.84375, "rewards/iou_reward": 0.7656250596046448, "step": 1622 }, { "completion_length": 94.25, "epoch": 1.082, "grad_norm": 3.3795734160770725, "kl": 0.06591796875, "learning_rate": 4.59e-07, "loss": 0.0026, "reward": 1.9296875, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9296875, "step": 1623 }, { "completion_length": 100.28125, "epoch": 1.0826666666666667, "grad_norm": 3.0054502429888683, "kl": 0.0830078125, "learning_rate": 4.5866666666666664e-07, "loss": 0.0033, "reward": 1.5677082538604736, "reward_std": 0.14842961728572845, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5989583134651184, "step": 1624 }, { "completion_length": 108.46875, "epoch": 1.0833333333333333, "grad_norm": 1.883742980476822, "kl": 0.0888671875, "learning_rate": 4.5833333333333327e-07, "loss": 0.0036, "reward": 1.8984375, "reward_std": 0.13530339300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9296875, "step": 1625 }, { "completion_length": 92.84375, "epoch": 1.084, "grad_norm": 7.059903856144121, "kl": 0.080078125, "learning_rate": 4.58e-07, "loss": 0.0032, "reward": 1.75, "reward_std": 0.10326046496629715, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1626 }, { "completion_length": 104.3125, "epoch": 1.0846666666666667, "grad_norm": 11.903462530711954, "kl": 0.046875, "learning_rate": 4.576666666666666e-07, "loss": 0.0019, "reward": 1.6276042461395264, "reward_std": 0.3586271107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6588541865348816, "step": 1627 }, { "completion_length": 78.8125, "epoch": 1.0853333333333333, "grad_norm": 2.8317523483503684, "kl": 0.078125, "learning_rate": 4.573333333333333e-07, "loss": 0.0031, "reward": 1.8802083730697632, "reward_std": 0.09375, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9114583134651184, "step": 1628 }, { "completion_length": 83.9375, "epoch": 1.086, "grad_norm": 4.289929835286131, "kl": 0.06640625, "learning_rate": 4.57e-07, "loss": 0.0027, "reward": 1.7761905193328857, "reward_std": 0.04720054194331169, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.776190459728241, "step": 1629 }, { "completion_length": 107.25, "epoch": 1.0866666666666667, "grad_norm": 3.1362864240366983, "kl": 0.07421875, "learning_rate": 4.5666666666666665e-07, "loss": 0.003, "reward": 1.765625, "reward_std": 0.1354166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7656249403953552, "step": 1630 }, { "completion_length": 86.9375, "epoch": 1.0873333333333333, "grad_norm": 3.351327157359746, "kl": 0.06591796875, "learning_rate": 4.563333333333333e-07, "loss": 0.0026, "reward": 1.8203125, "reward_std": 0.0958106741309166, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203125, "step": 1631 }, { "completion_length": 102.0, "epoch": 1.088, "grad_norm": 1.8247193586085253, "kl": 0.10205078125, "learning_rate": 4.56e-07, "loss": 0.0041, "reward": 1.7604166269302368, "reward_std": 0.09300211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7604166269302368, "step": 1632 }, { "completion_length": 99.5625, "epoch": 1.0886666666666667, "grad_norm": 2.307540357141836, "kl": 0.125, "learning_rate": 4.5566666666666663e-07, "loss": 0.005, "reward": 1.8020833730697632, "reward_std": 0.16456207633018494, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8645833730697632, "step": 1633 }, { "completion_length": 103.40625, "epoch": 1.0893333333333333, "grad_norm": 2.69930968749193, "kl": 0.044921875, "learning_rate": 4.553333333333333e-07, "loss": 0.0018, "reward": 1.7109375, "reward_std": 0.203125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7421875, "step": 1634 }, { "completion_length": 87.78125, "epoch": 1.09, "grad_norm": 2.374855140215003, "kl": 0.07373046875, "learning_rate": 4.55e-07, "loss": 0.0029, "reward": 1.7447917461395264, "reward_std": 0.15461517870426178, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7447916269302368, "step": 1635 }, { "completion_length": 98.21875, "epoch": 1.0906666666666667, "grad_norm": 5.978316193605684, "kl": 0.0732421875, "learning_rate": 4.5466666666666666e-07, "loss": 0.0029, "reward": 1.8869792222976685, "reward_std": 0.04964974895119667, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8869791626930237, "step": 1636 }, { "completion_length": 100.1875, "epoch": 1.0913333333333333, "grad_norm": 10.033877031071796, "kl": 0.08984375, "learning_rate": 4.543333333333333e-07, "loss": 0.0036, "reward": 1.5703125, "reward_std": 0.10822412371635437, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5703125, "step": 1637 }, { "completion_length": 91.71875, "epoch": 1.092, "grad_norm": 4.312449877243337, "kl": 0.06884765625, "learning_rate": 4.54e-07, "loss": 0.0027, "reward": 1.8411458730697632, "reward_std": 0.14241987466812134, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8411458730697632, "step": 1638 }, { "completion_length": 97.6875, "epoch": 1.0926666666666667, "grad_norm": 63.2868443795919, "kl": 0.0771484375, "learning_rate": 4.5366666666666664e-07, "loss": 0.0031, "reward": 1.9072916507720947, "reward_std": 0.006909634452313185, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9072917103767395, "step": 1639 }, { "completion_length": 94.375, "epoch": 1.0933333333333333, "grad_norm": 2.8355157357005716, "kl": 0.0419921875, "learning_rate": 4.5333333333333326e-07, "loss": 0.0017, "reward": 1.7447917461395264, "reward_std": 0.10544901341199875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7447916865348816, "step": 1640 }, { "completion_length": 83.90625, "epoch": 1.094, "grad_norm": 3.310627411013284, "kl": 0.05908203125, "learning_rate": 4.53e-07, "loss": 0.0024, "reward": 1.859375, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 1641 }, { "completion_length": 97.28125, "epoch": 1.0946666666666667, "grad_norm": 6.125224298238855, "kl": 0.08251953125, "learning_rate": 4.526666666666666e-07, "loss": 0.0033, "reward": 1.7750000953674316, "reward_std": 0.09905624389648438, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7750000357627869, "step": 1642 }, { "completion_length": 87.46875, "epoch": 1.0953333333333333, "grad_norm": 5.323438310883723, "kl": 0.1142578125, "learning_rate": 4.523333333333333e-07, "loss": 0.0046, "reward": 1.5130208730697632, "reward_std": 0.1927083283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5130208730697632, "step": 1643 }, { "completion_length": 105.5625, "epoch": 1.096, "grad_norm": 5.92688615038045, "kl": 0.08203125, "learning_rate": 4.5199999999999997e-07, "loss": 0.0033, "reward": 1.6744792461395264, "reward_std": 0.21099074184894562, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7057291865348816, "step": 1644 }, { "completion_length": 97.34375, "epoch": 1.0966666666666667, "grad_norm": 4.381452275775399, "kl": 0.07958984375, "learning_rate": 4.5166666666666665e-07, "loss": 0.0032, "reward": 1.8958333730697632, "reward_std": 0.10738958418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 1645 }, { "completion_length": 84.15625, "epoch": 1.0973333333333333, "grad_norm": 3.584371003376744, "kl": 0.05126953125, "learning_rate": 4.5133333333333327e-07, "loss": 0.0021, "reward": 1.8116071224212646, "reward_std": 0.20178571343421936, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8116071224212646, "step": 1646 }, { "completion_length": 94.21875, "epoch": 1.098, "grad_norm": 0.8343225108838831, "kl": 0.044677734375, "learning_rate": 4.51e-07, "loss": 0.0018, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1647 }, { "completion_length": 108.40625, "epoch": 1.0986666666666667, "grad_norm": 5.990708443670674, "kl": 0.06884765625, "learning_rate": 4.506666666666666e-07, "loss": 0.0028, "reward": 1.8937499523162842, "reward_std": 0.10571783781051636, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.893750011920929, "step": 1648 }, { "completion_length": 99.59375, "epoch": 1.0993333333333333, "grad_norm": 0.8181675223463568, "kl": 0.043212890625, "learning_rate": 4.503333333333333e-07, "loss": 0.0017, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1649 }, { "completion_length": 92.21875, "epoch": 1.1, "grad_norm": 3.95205529062572, "kl": 0.10546875, "learning_rate": 4.5e-07, "loss": 0.0042, "reward": 1.7313988208770752, "reward_std": 0.26979804039001465, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7626488208770752, "step": 1650 }, { "completion_length": 105.0, "epoch": 1.1006666666666667, "grad_norm": 2.736768166087945, "kl": 0.07568359375, "learning_rate": 4.4966666666666666e-07, "loss": 0.003, "reward": 1.5572917461395264, "reward_std": 0.1767178177833557, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5885416865348816, "step": 1651 }, { "completion_length": 100.3125, "epoch": 1.1013333333333333, "grad_norm": 1.2975212067636672, "kl": 0.0625, "learning_rate": 4.493333333333333e-07, "loss": 0.0025, "reward": 1.7291667461395264, "reward_std": 0.02405625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7291666269302368, "step": 1652 }, { "completion_length": 103.78125, "epoch": 1.102, "grad_norm": 2.1400172592616213, "kl": 0.06640625, "learning_rate": 4.49e-07, "loss": 0.0027, "reward": 1.8468749523162842, "reward_std": 0.17930270731449127, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.846875011920929, "step": 1653 }, { "completion_length": 88.84375, "epoch": 1.1026666666666667, "grad_norm": 13.56740376348041, "kl": 0.1875, "learning_rate": 4.4866666666666663e-07, "loss": 0.0075, "reward": 1.8020833730697632, "reward_std": 0.09032616019248962, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8020833730697632, "step": 1654 }, { "completion_length": 99.1875, "epoch": 1.1033333333333333, "grad_norm": 5.32204647353279, "kl": 0.083984375, "learning_rate": 4.483333333333333e-07, "loss": 0.0033, "reward": 1.659895896911621, "reward_std": 0.10680850595235825, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6598958373069763, "step": 1655 }, { "completion_length": 89.90625, "epoch": 1.104, "grad_norm": 5.339556219204595, "kl": 0.0888671875, "learning_rate": 4.48e-07, "loss": 0.0036, "reward": 1.771875023841858, "reward_std": 0.09791666269302368, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7718750238418579, "step": 1656 }, { "completion_length": 98.5, "epoch": 1.1046666666666667, "grad_norm": 1.9931518644296857, "kl": 0.0849609375, "learning_rate": 4.476666666666666e-07, "loss": 0.0034, "reward": 1.90625, "reward_std": 0.01979013904929161, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9062500596046448, "step": 1657 }, { "completion_length": 110.34375, "epoch": 1.1053333333333333, "grad_norm": 1.8746633373014707, "kl": 0.06396484375, "learning_rate": 4.4733333333333334e-07, "loss": 0.0026, "reward": 1.9166667461395264, "reward_std": 0.09622503817081451, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9166666865348816, "step": 1658 }, { "completion_length": 99.09375, "epoch": 1.106, "grad_norm": 3.37375900628684, "kl": 0.09326171875, "learning_rate": 4.4699999999999997e-07, "loss": 0.0037, "reward": 1.6322917938232422, "reward_std": 0.186069518327713, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6322916150093079, "step": 1659 }, { "completion_length": 101.34375, "epoch": 1.1066666666666667, "grad_norm": 2.241692673047674, "kl": 0.048828125, "learning_rate": 4.4666666666666664e-07, "loss": 0.002, "reward": 1.859375, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 1660 }, { "completion_length": 100.78125, "epoch": 1.1073333333333333, "grad_norm": 13.970520291441568, "kl": 0.0859375, "learning_rate": 4.463333333333333e-07, "loss": 0.0034, "reward": 1.6583333015441895, "reward_std": 0.2928248643875122, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7208333015441895, "step": 1661 }, { "completion_length": 85.125, "epoch": 1.108, "grad_norm": 0.16355331950704216, "kl": 0.05029296875, "learning_rate": 4.46e-07, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 1662 }, { "completion_length": 94.34375, "epoch": 1.1086666666666667, "grad_norm": 1.840507253563673, "kl": 0.0791015625, "learning_rate": 4.456666666666666e-07, "loss": 0.0032, "reward": 1.8697917461395264, "reward_std": 0.026214702054858208, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8697917461395264, "step": 1663 }, { "completion_length": 109.0625, "epoch": 1.1093333333333333, "grad_norm": 2.680652986103106, "kl": 0.05859375, "learning_rate": 4.4533333333333335e-07, "loss": 0.0023, "reward": 1.7348958253860474, "reward_std": 0.14404664933681488, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7661458849906921, "step": 1664 }, { "completion_length": 95.84375, "epoch": 1.11, "grad_norm": 0.8233631171270558, "kl": 0.053466796875, "learning_rate": 4.45e-07, "loss": 0.0021, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1665 }, { "completion_length": 99.40625, "epoch": 1.1106666666666667, "grad_norm": 4.668496877923659, "kl": 0.11669921875, "learning_rate": 4.4466666666666665e-07, "loss": 0.0047, "reward": 1.8177083730697632, "reward_std": 0.20370778441429138, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8489583134651184, "step": 1666 }, { "completion_length": 100.28125, "epoch": 1.1113333333333333, "grad_norm": 5.28936287128506, "kl": 0.06689453125, "learning_rate": 4.4433333333333333e-07, "loss": 0.0027, "reward": 1.644270896911621, "reward_std": 0.04033626988530159, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6442708373069763, "step": 1667 }, { "completion_length": 84.15625, "epoch": 1.112, "grad_norm": 8.572207283119942, "kl": 0.046875, "learning_rate": 4.44e-07, "loss": 0.0019, "reward": 1.912500023841858, "reward_std": 0.014433751814067364, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9125000238418579, "step": 1668 }, { "completion_length": 98.21875, "epoch": 1.1126666666666667, "grad_norm": 4.6853743961746925, "kl": 0.07177734375, "learning_rate": 4.4366666666666663e-07, "loss": 0.0029, "reward": 1.65625, "reward_std": 0.3060176372528076, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6875, "step": 1669 }, { "completion_length": 108.0625, "epoch": 1.1133333333333333, "grad_norm": 5.949887682980396, "kl": 0.051513671875, "learning_rate": 4.4333333333333336e-07, "loss": 0.0021, "reward": 1.6075148582458496, "reward_std": 0.31269365549087524, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6387648582458496, "step": 1670 }, { "completion_length": 102.03125, "epoch": 1.114, "grad_norm": 2.644417671826536, "kl": 0.037841796875, "learning_rate": 4.43e-07, "loss": 0.0015, "reward": 1.78125, "reward_std": 0.1666666716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7812499403953552, "step": 1671 }, { "completion_length": 99.6875, "epoch": 1.1146666666666667, "grad_norm": 1.2912335192437692, "kl": 0.06640625, "learning_rate": 4.426666666666666e-07, "loss": 0.0027, "reward": 1.84375, "reward_std": 0.023935673758387566, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1672 }, { "completion_length": 95.53125, "epoch": 1.1153333333333333, "grad_norm": 3.863544231951972, "kl": 0.10009765625, "learning_rate": 4.4233333333333334e-07, "loss": 0.004, "reward": 1.8515625, "reward_std": 0.22708837687969208, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8828125, "step": 1673 }, { "completion_length": 109.65625, "epoch": 1.116, "grad_norm": 5.760485682045722, "kl": 0.0673828125, "learning_rate": 4.4199999999999996e-07, "loss": 0.0027, "reward": 1.8156249523162842, "reward_std": 0.16591878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.815625011920929, "step": 1674 }, { "completion_length": 104.96875, "epoch": 1.1166666666666667, "grad_norm": 3.4795387727118, "kl": 0.0830078125, "learning_rate": 4.4166666666666664e-07, "loss": 0.0033, "reward": 1.7807291746139526, "reward_std": 0.2140415608882904, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8432291746139526, "step": 1675 }, { "completion_length": 99.03125, "epoch": 1.1173333333333333, "grad_norm": 3.5341116340599967, "kl": 0.09130859375, "learning_rate": 4.413333333333333e-07, "loss": 0.0036, "reward": 1.8229167461395264, "reward_std": 0.20743322372436523, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229167461395264, "step": 1676 }, { "completion_length": 96.65625, "epoch": 1.1179999999999999, "grad_norm": 3.908615404849252, "kl": 0.0712890625, "learning_rate": 4.41e-07, "loss": 0.0029, "reward": 1.863020896911621, "reward_std": 0.04891005903482437, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8630208373069763, "step": 1677 }, { "completion_length": 120.46875, "epoch": 1.1186666666666667, "grad_norm": 2.3877452930170793, "kl": 0.0517578125, "learning_rate": 4.406666666666666e-07, "loss": 0.0021, "reward": 1.9505208730697632, "reward_std": 0.0989583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9505208730697632, "step": 1678 }, { "completion_length": 109.3125, "epoch": 1.1193333333333333, "grad_norm": 1.7738107614199403, "kl": 0.0810546875, "learning_rate": 4.4033333333333335e-07, "loss": 0.0032, "reward": 1.71875, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.75, "step": 1679 }, { "completion_length": 109.28125, "epoch": 1.12, "grad_norm": 7.170691257940478, "kl": 0.052978515625, "learning_rate": 4.3999999999999997e-07, "loss": 0.0021, "reward": 1.9520833492279053, "reward_std": 0.08291241526603699, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9520833492279053, "step": 1680 }, { "completion_length": 97.53125, "epoch": 1.1206666666666667, "grad_norm": 3.080464465195273, "kl": 0.09130859375, "learning_rate": 4.3966666666666665e-07, "loss": 0.0036, "reward": 1.8385417461395264, "reward_std": 0.16591878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416865348816, "step": 1681 }, { "completion_length": 101.03125, "epoch": 1.1213333333333333, "grad_norm": 13.566514291302838, "kl": 0.10205078125, "learning_rate": 4.393333333333333e-07, "loss": 0.0041, "reward": 1.832663655281067, "reward_std": 0.13940191268920898, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8326636552810669, "step": 1682 }, { "completion_length": 104.9375, "epoch": 1.1219999999999999, "grad_norm": 6.530492279318521, "kl": 0.0771484375, "learning_rate": 4.39e-07, "loss": 0.0031, "reward": 1.4817708730697632, "reward_std": 0.171875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.4817708432674408, "step": 1683 }, { "completion_length": 97.90625, "epoch": 1.1226666666666667, "grad_norm": 4.854881422739464, "kl": 0.052490234375, "learning_rate": 4.386666666666666e-07, "loss": 0.0021, "reward": 1.5697916746139526, "reward_std": 0.35795456171035767, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6010416746139526, "step": 1684 }, { "completion_length": 96.40625, "epoch": 1.1233333333333333, "grad_norm": 1.5486425735448006, "kl": 0.051513671875, "learning_rate": 4.3833333333333335e-07, "loss": 0.0021, "reward": 1.9583333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333730697632, "step": 1685 }, { "completion_length": 110.15625, "epoch": 1.124, "grad_norm": 4.271075568544819, "kl": 0.0986328125, "learning_rate": 4.38e-07, "loss": 0.0039, "reward": 1.6279761791229248, "reward_std": 0.3769040107727051, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6592262387275696, "step": 1686 }, { "completion_length": 96.40625, "epoch": 1.1246666666666667, "grad_norm": 5.579043150944625, "kl": 0.06787109375, "learning_rate": 4.376666666666666e-07, "loss": 0.0027, "reward": 1.90625, "reward_std": 0.028458867222070694, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9062499403953552, "step": 1687 }, { "completion_length": 96.4375, "epoch": 1.1253333333333333, "grad_norm": 3.833717116299089, "kl": 0.07861328125, "learning_rate": 4.3733333333333333e-07, "loss": 0.0031, "reward": 1.892187476158142, "reward_std": 0.10795938968658447, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9234374761581421, "step": 1688 }, { "completion_length": 90.625, "epoch": 1.126, "grad_norm": 1.3305437005896055, "kl": 0.044677734375, "learning_rate": 4.3699999999999996e-07, "loss": 0.0018, "reward": 1.90625, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1689 }, { "completion_length": 91.0625, "epoch": 1.1266666666666667, "grad_norm": 1.5602593566142975, "kl": 0.051513671875, "learning_rate": 4.3666666666666663e-07, "loss": 0.0021, "reward": 1.920163631439209, "reward_std": 0.04866071045398712, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9201637506484985, "step": 1690 }, { "completion_length": 100.09375, "epoch": 1.1273333333333333, "grad_norm": 0.9572719923447384, "kl": 0.07373046875, "learning_rate": 4.363333333333333e-07, "loss": 0.003, "reward": 1.8562500476837158, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.856249988079071, "step": 1691 }, { "completion_length": 89.03125, "epoch": 1.1280000000000001, "grad_norm": 2.7493387552349917, "kl": 0.07861328125, "learning_rate": 4.36e-07, "loss": 0.0031, "reward": 1.75, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.75, "step": 1692 }, { "completion_length": 98.40625, "epoch": 1.1286666666666667, "grad_norm": 2.938694273600485, "kl": 0.09228515625, "learning_rate": 4.356666666666666e-07, "loss": 0.0037, "reward": 1.7965030670166016, "reward_std": 0.05670531466603279, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.796502947807312, "step": 1693 }, { "completion_length": 106.71875, "epoch": 1.1293333333333333, "grad_norm": 3.6104950591523863, "kl": 0.06396484375, "learning_rate": 4.3533333333333334e-07, "loss": 0.0026, "reward": 1.8565564155578613, "reward_std": 0.1868872493505478, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8878063559532166, "step": 1694 }, { "completion_length": 98.21875, "epoch": 1.13, "grad_norm": 2.9456106436428002, "kl": 0.052001953125, "learning_rate": 4.3499999999999996e-07, "loss": 0.0021, "reward": 1.921875, "reward_std": 0.14304219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1695 }, { "completion_length": 86.28125, "epoch": 1.1306666666666667, "grad_norm": 5.919270408250594, "kl": 0.087890625, "learning_rate": 4.3466666666666664e-07, "loss": 0.0035, "reward": 1.9734375476837158, "reward_std": 0.02812500298023224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9734375476837158, "step": 1696 }, { "completion_length": 92.25, "epoch": 1.1313333333333333, "grad_norm": 10.862546943263823, "kl": 0.061767578125, "learning_rate": 4.343333333333333e-07, "loss": 0.0025, "reward": 1.9375, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1697 }, { "completion_length": 88.90625, "epoch": 1.1320000000000001, "grad_norm": 4.67499022777308, "kl": 0.072265625, "learning_rate": 4.34e-07, "loss": 0.0029, "reward": 1.734375, "reward_std": 0.30294692516326904, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7656250596046448, "step": 1698 }, { "completion_length": 94.65625, "epoch": 1.1326666666666667, "grad_norm": 4.682689156915478, "kl": 0.0546875, "learning_rate": 4.336666666666666e-07, "loss": 0.0022, "reward": 1.9322917461395264, "reward_std": 0.010416661389172077, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322916865348816, "step": 1699 }, { "completion_length": 86.65625, "epoch": 1.1333333333333333, "grad_norm": 5.656563520713145, "kl": 0.0703125, "learning_rate": 4.3333333333333335e-07, "loss": 0.0028, "reward": 1.956770896911621, "reward_std": 0.0599394217133522, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9567707777023315, "step": 1700 }, { "completion_length": 96.96875, "epoch": 1.134, "grad_norm": 3.1543187128192267, "kl": 0.0703125, "learning_rate": 4.3299999999999997e-07, "loss": 0.0028, "reward": 1.7098958492279053, "reward_std": 0.03854167088866234, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7098957896232605, "step": 1701 }, { "completion_length": 96.9375, "epoch": 1.1346666666666667, "grad_norm": 3.0074481029398155, "kl": 0.0810546875, "learning_rate": 4.3266666666666665e-07, "loss": 0.0032, "reward": 1.8125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1702 }, { "completion_length": 103.0, "epoch": 1.1353333333333333, "grad_norm": 2.6050523912262515, "kl": 0.05224609375, "learning_rate": 4.3233333333333333e-07, "loss": 0.0021, "reward": 1.65625, "reward_std": 0.3430021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6875, "step": 1703 }, { "completion_length": 101.90625, "epoch": 1.1360000000000001, "grad_norm": 1.0993728494575372, "kl": 0.0634765625, "learning_rate": 4.3199999999999995e-07, "loss": 0.0025, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1704 }, { "completion_length": 79.53125, "epoch": 1.1366666666666667, "grad_norm": 1.15657801583109, "kl": 0.0703125, "learning_rate": 4.3166666666666663e-07, "loss": 0.0028, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1705 }, { "completion_length": 109.75, "epoch": 1.1373333333333333, "grad_norm": 1.4389940376842465, "kl": 0.060791015625, "learning_rate": 4.313333333333333e-07, "loss": 0.0024, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1706 }, { "completion_length": 95.3125, "epoch": 1.138, "grad_norm": 2.2533466498881727, "kl": 0.049560546875, "learning_rate": 4.31e-07, "loss": 0.002, "reward": 1.84375, "reward_std": 0.1458333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1707 }, { "completion_length": 98.15625, "epoch": 1.1386666666666667, "grad_norm": 4.97287360303785, "kl": 0.11279296875, "learning_rate": 4.306666666666666e-07, "loss": 0.0045, "reward": 1.827604055404663, "reward_std": 0.17353491485118866, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8276041746139526, "step": 1708 }, { "completion_length": 87.5, "epoch": 1.1393333333333333, "grad_norm": 2.484366418165338, "kl": 0.0830078125, "learning_rate": 4.3033333333333334e-07, "loss": 0.0033, "reward": 1.703125, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.734375, "step": 1709 }, { "completion_length": 113.65625, "epoch": 1.1400000000000001, "grad_norm": 2.295943088858196, "kl": 0.045166015625, "learning_rate": 4.2999999999999996e-07, "loss": 0.0018, "reward": 1.625, "reward_std": 0.2395627796649933, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.625, "step": 1710 }, { "completion_length": 93.5625, "epoch": 1.1406666666666667, "grad_norm": 2.2412917072018765, "kl": 0.047607421875, "learning_rate": 4.2966666666666664e-07, "loss": 0.0019, "reward": 1.7916667461395264, "reward_std": 0.18496489524841309, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666865348816, "step": 1711 }, { "completion_length": 91.09375, "epoch": 1.1413333333333333, "grad_norm": 2.1534403360276837, "kl": 0.05908203125, "learning_rate": 4.293333333333333e-07, "loss": 0.0024, "reward": 1.5, "reward_std": 0.19716878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5, "step": 1712 }, { "completion_length": 95.3125, "epoch": 1.142, "grad_norm": 2.1118395584851988, "kl": 0.08349609375, "learning_rate": 4.29e-07, "loss": 0.0033, "reward": 1.860937476158142, "reward_std": 0.08596115559339523, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8609375357627869, "step": 1713 }, { "completion_length": 98.375, "epoch": 1.1426666666666667, "grad_norm": 5.434130839935186, "kl": 0.09716796875, "learning_rate": 4.286666666666666e-07, "loss": 0.0039, "reward": 1.84375, "reward_std": 0.24467839300632477, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.90625, "step": 1714 }, { "completion_length": 102.75, "epoch": 1.1433333333333333, "grad_norm": 1.6943294744007118, "kl": 0.08203125, "learning_rate": 4.2833333333333334e-07, "loss": 0.0033, "reward": 1.8359375, "reward_std": 0.01762310042977333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8359375, "step": 1715 }, { "completion_length": 96.59375, "epoch": 1.144, "grad_norm": 5.046336848709212, "kl": 0.08203125, "learning_rate": 4.2799999999999997e-07, "loss": 0.0033, "reward": 1.8880208730697632, "reward_std": 0.06051459163427353, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8880208730697632, "step": 1716 }, { "completion_length": 97.03125, "epoch": 1.1446666666666667, "grad_norm": 4.040895284083568, "kl": 0.0751953125, "learning_rate": 4.2766666666666664e-07, "loss": 0.003, "reward": 1.7963541746139526, "reward_std": 0.201351135969162, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8276041746139526, "step": 1717 }, { "completion_length": 94.25, "epoch": 1.1453333333333333, "grad_norm": 2.3223548973493413, "kl": 0.07861328125, "learning_rate": 4.273333333333333e-07, "loss": 0.0031, "reward": 1.9328124523162842, "reward_std": 0.09885390847921371, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.932812511920929, "step": 1718 }, { "completion_length": 86.0, "epoch": 1.146, "grad_norm": 5.259438986084818, "kl": 0.09912109375, "learning_rate": 4.2699999999999995e-07, "loss": 0.004, "reward": 1.9187500476837158, "reward_std": 0.09583333134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.918749988079071, "step": 1719 }, { "completion_length": 103.6875, "epoch": 1.1466666666666667, "grad_norm": 1.6027472335449435, "kl": 0.0791015625, "learning_rate": 4.266666666666667e-07, "loss": 0.0032, "reward": 1.859375, "reward_std": 0.03125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 1720 }, { "completion_length": 103.9375, "epoch": 1.1473333333333333, "grad_norm": 2.0483992110097673, "kl": 0.10009765625, "learning_rate": 4.263333333333333e-07, "loss": 0.004, "reward": 1.850000023841858, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8812500238418579, "step": 1721 }, { "completion_length": 110.40625, "epoch": 1.148, "grad_norm": 3.5993080381561904, "kl": 0.0595703125, "learning_rate": 4.26e-07, "loss": 0.0024, "reward": 1.6458333730697632, "reward_std": 0.29017090797424316, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6770833730697632, "step": 1722 }, { "completion_length": 94.5625, "epoch": 1.1486666666666667, "grad_norm": 2.831518349324288, "kl": 0.06787109375, "learning_rate": 4.2566666666666665e-07, "loss": 0.0027, "reward": 1.6947916746139526, "reward_std": 0.11771160364151001, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7260416746139526, "step": 1723 }, { "completion_length": 101.4375, "epoch": 1.1493333333333333, "grad_norm": 3.101476221411046, "kl": 0.060546875, "learning_rate": 4.2533333333333333e-07, "loss": 0.0024, "reward": 1.6270833015441895, "reward_std": 0.2711769938468933, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6270833611488342, "step": 1724 }, { "completion_length": 95.59375, "epoch": 1.15, "grad_norm": 1.917670511495202, "kl": 0.083984375, "learning_rate": 4.2499999999999995e-07, "loss": 0.0034, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1725 }, { "completion_length": 98.625, "epoch": 1.1506666666666667, "grad_norm": 3.1886557542977196, "kl": 0.06787109375, "learning_rate": 4.246666666666667e-07, "loss": 0.0027, "reward": 1.75, "reward_std": 0.18217839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 1726 }, { "completion_length": 83.9375, "epoch": 1.1513333333333333, "grad_norm": 2.158753518842095, "kl": 0.06787109375, "learning_rate": 4.243333333333333e-07, "loss": 0.0027, "reward": 1.9484374523162842, "reward_std": 0.04062500223517418, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.948437511920929, "step": 1727 }, { "completion_length": 96.03125, "epoch": 1.152, "grad_norm": 2.448645985951796, "kl": 0.0830078125, "learning_rate": 4.24e-07, "loss": 0.0033, "reward": 1.8880208730697632, "reward_std": 0.10862711071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8880208730697632, "step": 1728 }, { "completion_length": 91.375, "epoch": 1.1526666666666667, "grad_norm": 0.9968969101905834, "kl": 0.047607421875, "learning_rate": 4.2366666666666666e-07, "loss": 0.0019, "reward": 1.6458333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6458333730697632, "step": 1729 }, { "completion_length": 94.21875, "epoch": 1.1533333333333333, "grad_norm": 3.1047625126781644, "kl": 0.06640625, "learning_rate": 4.2333333333333334e-07, "loss": 0.0027, "reward": 1.7625000476837158, "reward_std": 0.06072612479329109, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7625000476837158, "step": 1730 }, { "completion_length": 96.90625, "epoch": 1.154, "grad_norm": 0.09001260987299557, "kl": 0.0712890625, "learning_rate": 4.2299999999999996e-07, "loss": 0.0029, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1731 }, { "completion_length": 88.03125, "epoch": 1.1546666666666667, "grad_norm": 1.9505988882455154, "kl": 0.05224609375, "learning_rate": 4.226666666666667e-07, "loss": 0.0021, "reward": 1.8958333730697632, "reward_std": 0.07326273620128632, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 1732 }, { "completion_length": 101.3125, "epoch": 1.1553333333333333, "grad_norm": 4.010535551052484, "kl": 0.09423828125, "learning_rate": 4.223333333333333e-07, "loss": 0.0038, "reward": 1.837499976158142, "reward_std": 0.11666667461395264, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8375000953674316, "step": 1733 }, { "completion_length": 110.15625, "epoch": 1.156, "grad_norm": 2.6438395069847935, "kl": 0.0712890625, "learning_rate": 4.2199999999999994e-07, "loss": 0.0028, "reward": 1.6749999523162842, "reward_std": 0.2230161726474762, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.675000011920929, "step": 1734 }, { "completion_length": 97.0, "epoch": 1.1566666666666667, "grad_norm": 1.8805880230251375, "kl": 0.072265625, "learning_rate": 4.2166666666666667e-07, "loss": 0.0029, "reward": 1.8828125, "reward_std": 0.078125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8828125, "step": 1735 }, { "completion_length": 94.78125, "epoch": 1.1573333333333333, "grad_norm": 2.286168033843526, "kl": 0.05908203125, "learning_rate": 4.213333333333333e-07, "loss": 0.0024, "reward": 1.84375, "reward_std": 0.19158649444580078, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1736 }, { "completion_length": 95.65625, "epoch": 1.158, "grad_norm": 2.0194223297078127, "kl": 0.0810546875, "learning_rate": 4.2099999999999997e-07, "loss": 0.0032, "reward": 1.859375, "reward_std": 0.11662659049034119, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.859375, "step": 1737 }, { "completion_length": 86.28125, "epoch": 1.1586666666666667, "grad_norm": 1.5670117391086325, "kl": 0.06640625, "learning_rate": 4.2066666666666665e-07, "loss": 0.0027, "reward": 1.875, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 1738 }, { "completion_length": 90.46875, "epoch": 1.1593333333333333, "grad_norm": 2.8441325291560062, "kl": 0.09326171875, "learning_rate": 4.203333333333333e-07, "loss": 0.0037, "reward": 1.882440447807312, "reward_std": 0.1517857164144516, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.913690447807312, "step": 1739 }, { "completion_length": 108.21875, "epoch": 1.16, "grad_norm": 3.195437255671178, "kl": 0.05615234375, "learning_rate": 4.1999999999999995e-07, "loss": 0.0022, "reward": 1.7459821701049805, "reward_std": 0.16885854303836823, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7459821105003357, "step": 1740 }, { "completion_length": 104.3125, "epoch": 1.1606666666666667, "grad_norm": 3.0961031703371105, "kl": 0.0732421875, "learning_rate": 4.196666666666667e-07, "loss": 0.0029, "reward": 1.898958444595337, "reward_std": 0.09530116617679596, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8989583253860474, "step": 1741 }, { "completion_length": 90.4375, "epoch": 1.1613333333333333, "grad_norm": 3.843071179818084, "kl": 0.07958984375, "learning_rate": 4.193333333333333e-07, "loss": 0.0032, "reward": 1.7937500476837158, "reward_std": 0.09103886783123016, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.793749988079071, "step": 1742 }, { "completion_length": 103.71875, "epoch": 1.162, "grad_norm": 1.0900757450524838, "kl": 0.0517578125, "learning_rate": 4.19e-07, "loss": 0.0021, "reward": 1.7083333730697632, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333730697632, "step": 1743 }, { "completion_length": 86.28125, "epoch": 1.1626666666666667, "grad_norm": 13.886393393325886, "kl": 0.0869140625, "learning_rate": 4.1866666666666666e-07, "loss": 0.0035, "reward": 1.798437476158142, "reward_std": 0.03437499701976776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7984374761581421, "step": 1744 }, { "completion_length": 104.34375, "epoch": 1.1633333333333333, "grad_norm": 2.9010348834738773, "kl": 0.056396484375, "learning_rate": 4.1833333333333333e-07, "loss": 0.0023, "reward": 1.6614583730697632, "reward_std": 0.3422542214393616, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6614583134651184, "step": 1745 }, { "completion_length": 85.40625, "epoch": 1.164, "grad_norm": 1.9125832373341134, "kl": 0.0703125, "learning_rate": 4.1799999999999996e-07, "loss": 0.0028, "reward": 1.9296875, "reward_std": 0.140625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9609375, "step": 1746 }, { "completion_length": 96.8125, "epoch": 1.1646666666666667, "grad_norm": 1.3483117873512975, "kl": 0.072265625, "learning_rate": 4.176666666666667e-07, "loss": 0.0029, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1747 }, { "completion_length": 94.59375, "epoch": 1.1653333333333333, "grad_norm": 1.0629695805410428, "kl": 0.0771484375, "learning_rate": 4.173333333333333e-07, "loss": 0.0031, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1748 }, { "completion_length": 101.53125, "epoch": 1.166, "grad_norm": 5.36436319690539, "kl": 0.060546875, "learning_rate": 4.17e-07, "loss": 0.0024, "reward": 1.8781249523162842, "reward_std": 0.02572597563266754, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.878125011920929, "step": 1749 }, { "completion_length": 93.65625, "epoch": 1.1666666666666667, "grad_norm": 3.1289338607942576, "kl": 0.07177734375, "learning_rate": 4.1666666666666667e-07, "loss": 0.0029, "reward": 1.730208396911621, "reward_std": 0.03958332911133766, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7302083969116211, "step": 1750 }, { "completion_length": 95.4375, "epoch": 1.1673333333333333, "grad_norm": 1.1097678696577633, "kl": 0.0712890625, "learning_rate": 4.163333333333333e-07, "loss": 0.0029, "reward": 1.828125, "reward_std": 0.010416671633720398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.828125, "step": 1751 }, { "completion_length": 96.09375, "epoch": 1.168, "grad_norm": 14.244321761354609, "kl": 0.08837890625, "learning_rate": 4.1599999999999997e-07, "loss": 0.0035, "reward": 1.8776042461395264, "reward_std": 0.0714031308889389, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8776041865348816, "step": 1752 }, { "completion_length": 102.8125, "epoch": 1.1686666666666667, "grad_norm": 3.6295505091519153, "kl": 0.062255859375, "learning_rate": 4.1566666666666664e-07, "loss": 0.0025, "reward": 1.7864583730697632, "reward_std": 0.12425211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7864583730697632, "step": 1753 }, { "completion_length": 94.71875, "epoch": 1.1693333333333333, "grad_norm": 0.7865281727935776, "kl": 0.0654296875, "learning_rate": 4.153333333333333e-07, "loss": 0.0026, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1754 }, { "completion_length": 110.71875, "epoch": 1.17, "grad_norm": 4.322444627967889, "kl": 0.054443359375, "learning_rate": 4.1499999999999994e-07, "loss": 0.0022, "reward": 1.671875, "reward_std": 0.29091876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.703125, "step": 1755 }, { "completion_length": 111.59375, "epoch": 1.1706666666666667, "grad_norm": 2.427796509553445, "kl": 0.04931640625, "learning_rate": 4.146666666666667e-07, "loss": 0.002, "reward": 1.7630208730697632, "reward_std": 0.2114429920911789, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7942708730697632, "step": 1756 }, { "completion_length": 88.34375, "epoch": 1.1713333333333333, "grad_norm": 4.146424039249912, "kl": 0.07177734375, "learning_rate": 4.143333333333333e-07, "loss": 0.0029, "reward": 1.845312476158142, "reward_std": 0.05346832424402237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8453124761581421, "step": 1757 }, { "completion_length": 93.125, "epoch": 1.172, "grad_norm": 3.8173430627258393, "kl": 0.061767578125, "learning_rate": 4.14e-07, "loss": 0.0025, "reward": 1.7239583730697632, "reward_std": 0.09375, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7552083730697632, "step": 1758 }, { "completion_length": 101.65625, "epoch": 1.1726666666666667, "grad_norm": 3.17484492654084, "kl": 0.07275390625, "learning_rate": 4.1366666666666665e-07, "loss": 0.0029, "reward": 1.7395833730697632, "reward_std": 0.2805021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708332538604736, "step": 1759 }, { "completion_length": 102.9375, "epoch": 1.1733333333333333, "grad_norm": 5.3821533423525585, "kl": 0.0400390625, "learning_rate": 4.1333333333333333e-07, "loss": 0.0016, "reward": 1.8302083015441895, "reward_std": 0.12446298450231552, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8302083611488342, "step": 1760 }, { "completion_length": 101.8125, "epoch": 1.174, "grad_norm": 3.8262453150610503, "kl": 0.058349609375, "learning_rate": 4.1299999999999995e-07, "loss": 0.0023, "reward": 1.7625000476837158, "reward_std": 0.0907980427145958, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.762499988079071, "step": 1761 }, { "completion_length": 99.6875, "epoch": 1.1746666666666667, "grad_norm": 2.1517308945868976, "kl": 0.061279296875, "learning_rate": 4.126666666666667e-07, "loss": 0.0025, "reward": 1.8937499523162842, "reward_std": 0.13750000298023224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.893750011920929, "step": 1762 }, { "completion_length": 87.71875, "epoch": 1.1753333333333333, "grad_norm": 3.868359706956447, "kl": 0.07275390625, "learning_rate": 4.123333333333333e-07, "loss": 0.0029, "reward": 1.7395833730697632, "reward_std": 0.25966876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 1763 }, { "completion_length": 104.125, "epoch": 1.176, "grad_norm": 4.866644820748072, "kl": 0.103515625, "learning_rate": 4.12e-07, "loss": 0.0041, "reward": 1.6841518878936768, "reward_std": 0.15716701745986938, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6841517686843872, "step": 1764 }, { "completion_length": 101.46875, "epoch": 1.1766666666666667, "grad_norm": 3.7538182496252794, "kl": 0.0751953125, "learning_rate": 4.1166666666666666e-07, "loss": 0.003, "reward": 1.6145833730697632, "reward_std": 0.2083333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6145833730697632, "step": 1765 }, { "completion_length": 89.8125, "epoch": 1.1773333333333333, "grad_norm": 6.511212956513368, "kl": 0.0810546875, "learning_rate": 4.113333333333333e-07, "loss": 0.0033, "reward": 1.7708333730697632, "reward_std": 0.10503024607896805, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 1766 }, { "completion_length": 106.59375, "epoch": 1.178, "grad_norm": 3.960715759616578, "kl": 0.053955078125, "learning_rate": 4.1099999999999996e-07, "loss": 0.0022, "reward": 1.734375, "reward_std": 0.08494480699300766, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.734375, "step": 1767 }, { "completion_length": 97.3125, "epoch": 1.1786666666666668, "grad_norm": 6.279905604888964, "kl": 0.07080078125, "learning_rate": 4.1066666666666664e-07, "loss": 0.0028, "reward": 1.9453125, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9453125, "step": 1768 }, { "completion_length": 99.21875, "epoch": 1.1793333333333333, "grad_norm": 1.0092379770626034, "kl": 0.04736328125, "learning_rate": 4.103333333333333e-07, "loss": 0.0019, "reward": 1.7083333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7083333134651184, "step": 1769 }, { "completion_length": 100.59375, "epoch": 1.18, "grad_norm": 4.0410797504175795, "kl": 0.07861328125, "learning_rate": 4.0999999999999994e-07, "loss": 0.0031, "reward": 1.640625, "reward_std": 0.20554219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.640625, "step": 1770 }, { "completion_length": 105.34375, "epoch": 1.1806666666666668, "grad_norm": 5.445352946967957, "kl": 0.10498046875, "learning_rate": 4.0966666666666667e-07, "loss": 0.0042, "reward": 1.5692708492279053, "reward_std": 0.05794697254896164, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5692708492279053, "step": 1771 }, { "completion_length": 96.125, "epoch": 1.1813333333333333, "grad_norm": 5.666093400906191, "kl": 0.12890625, "learning_rate": 4.093333333333333e-07, "loss": 0.0051, "reward": 1.8255208730697632, "reward_std": 0.24801458418369293, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8567708730697632, "step": 1772 }, { "completion_length": 92.75, "epoch": 1.182, "grad_norm": 2.8172982936212447, "kl": 0.07861328125, "learning_rate": 4.0899999999999997e-07, "loss": 0.0031, "reward": 1.8718750476837158, "reward_std": 0.006249999161809683, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.871874988079071, "step": 1773 }, { "completion_length": 110.375, "epoch": 1.1826666666666668, "grad_norm": 2.716110569030617, "kl": 0.06787109375, "learning_rate": 4.0866666666666665e-07, "loss": 0.0027, "reward": 1.921875, "reward_std": 0.018042195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 1774 }, { "completion_length": 104.96875, "epoch": 1.1833333333333333, "grad_norm": 1.536237041357145, "kl": 0.050537109375, "learning_rate": 4.083333333333333e-07, "loss": 0.002, "reward": 1.96875, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9687499403953552, "step": 1775 }, { "completion_length": 85.0, "epoch": 1.184, "grad_norm": 4.263038581856662, "kl": 0.06787109375, "learning_rate": 4.0799999999999995e-07, "loss": 0.0027, "reward": 1.8958333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333134651184, "step": 1776 }, { "completion_length": 95.28125, "epoch": 1.1846666666666668, "grad_norm": 3.7365761959030084, "kl": 0.0546875, "learning_rate": 4.076666666666667e-07, "loss": 0.0022, "reward": 1.821874976158142, "reward_std": 0.08911146223545074, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8218749761581421, "step": 1777 }, { "completion_length": 107.53125, "epoch": 1.1853333333333333, "grad_norm": 1.42194462803486, "kl": 0.07275390625, "learning_rate": 4.073333333333333e-07, "loss": 0.0029, "reward": 1.8098958730697632, "reward_std": 0.015625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8098958730697632, "step": 1778 }, { "completion_length": 83.59375, "epoch": 1.186, "grad_norm": 1.072679290643272, "kl": 0.05810546875, "learning_rate": 4.07e-07, "loss": 0.0023, "reward": 1.8958333730697632, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 1779 }, { "completion_length": 101.0625, "epoch": 1.1866666666666668, "grad_norm": 2.6542024850840873, "kl": 0.08203125, "learning_rate": 4.0666666666666666e-07, "loss": 0.0033, "reward": 1.8098958730697632, "reward_std": 0.07535238564014435, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8098958730697632, "step": 1780 }, { "completion_length": 93.53125, "epoch": 1.1873333333333334, "grad_norm": 3.046286205750069, "kl": 0.12451171875, "learning_rate": 4.063333333333333e-07, "loss": 0.005, "reward": 1.6796875, "reward_std": 0.26997217535972595, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7109375, "step": 1781 }, { "completion_length": 99.78125, "epoch": 1.188, "grad_norm": 2.8564144824404374, "kl": 0.0791015625, "learning_rate": 4.06e-07, "loss": 0.0032, "reward": 1.84375, "reward_std": 0.25966876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.875, "step": 1782 }, { "completion_length": 104.25, "epoch": 1.1886666666666668, "grad_norm": 3.316589106796601, "kl": 0.0830078125, "learning_rate": 4.0566666666666663e-07, "loss": 0.0033, "reward": 1.7395833730697632, "reward_std": 0.21032969653606415, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833730697632, "step": 1783 }, { "completion_length": 100.34375, "epoch": 1.1893333333333334, "grad_norm": 2.591118001528191, "kl": 0.08056640625, "learning_rate": 4.053333333333333e-07, "loss": 0.0032, "reward": 1.845312476158142, "reward_std": 0.11018722504377365, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8453124761581421, "step": 1784 }, { "completion_length": 95.8125, "epoch": 1.19, "grad_norm": 2.6675635752441242, "kl": 0.1181640625, "learning_rate": 4.05e-07, "loss": 0.0047, "reward": 1.962499976158142, "reward_std": 0.04612797498703003, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9624999761581421, "step": 1785 }, { "completion_length": 91.09375, "epoch": 1.1906666666666668, "grad_norm": 2.441694785451235, "kl": 0.045654296875, "learning_rate": 4.0466666666666666e-07, "loss": 0.0018, "reward": 1.8240530490875244, "reward_std": 0.10189393907785416, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8553030490875244, "step": 1786 }, { "completion_length": 89.59375, "epoch": 1.1913333333333334, "grad_norm": 1.2178375890594662, "kl": 0.10791015625, "learning_rate": 4.043333333333333e-07, "loss": 0.0043, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1787 }, { "completion_length": 91.96875, "epoch": 1.192, "grad_norm": 4.3061920351891105, "kl": 0.0771484375, "learning_rate": 4.04e-07, "loss": 0.0031, "reward": 1.7578125, "reward_std": 0.27529376745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7890625, "step": 1788 }, { "completion_length": 104.84375, "epoch": 1.1926666666666668, "grad_norm": 3.6473212723400876, "kl": 0.040771484375, "learning_rate": 4.0366666666666664e-07, "loss": 0.0016, "reward": 1.7453124523162842, "reward_std": 0.10725778341293335, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.745312511920929, "step": 1789 }, { "completion_length": 104.96875, "epoch": 1.1933333333333334, "grad_norm": 4.210483720311097, "kl": 0.04833984375, "learning_rate": 4.033333333333333e-07, "loss": 0.0019, "reward": 1.6749999523162842, "reward_std": 0.17358440160751343, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.675000011920929, "step": 1790 }, { "completion_length": 89.9375, "epoch": 1.194, "grad_norm": 3.176576923120261, "kl": 0.0537109375, "learning_rate": 4.03e-07, "loss": 0.0022, "reward": 1.743749976158142, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7437499761581421, "step": 1791 }, { "completion_length": 100.375, "epoch": 1.1946666666666665, "grad_norm": 1.5632176896814272, "kl": 0.0595703125, "learning_rate": 4.0266666666666667e-07, "loss": 0.0024, "reward": 1.773214340209961, "reward_std": 0.02348991669714451, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7732142806053162, "step": 1792 }, { "completion_length": 106.15625, "epoch": 1.1953333333333334, "grad_norm": 1.3375543005512638, "kl": 0.04345703125, "learning_rate": 4.023333333333333e-07, "loss": 0.0017, "reward": 1.78125, "reward_std": 0.20683756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1793 }, { "completion_length": 85.84375, "epoch": 1.196, "grad_norm": 3.295440241981515, "kl": 0.09375, "learning_rate": 4.02e-07, "loss": 0.0037, "reward": 1.9140625, "reward_std": 0.026041671633720398, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9140624403953552, "step": 1794 }, { "completion_length": 96.0625, "epoch": 1.1966666666666668, "grad_norm": 8.391945187522504, "kl": 0.1044921875, "learning_rate": 4.0166666666666665e-07, "loss": 0.0042, "reward": 1.7630208730697632, "reward_std": 0.13343125581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7630208134651184, "step": 1795 }, { "completion_length": 89.625, "epoch": 1.1973333333333334, "grad_norm": 1.1122841794596419, "kl": 0.049560546875, "learning_rate": 4.0133333333333333e-07, "loss": 0.002, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1796 }, { "completion_length": 96.53125, "epoch": 1.198, "grad_norm": 2.7861342500827226, "kl": 0.09521484375, "learning_rate": 4.01e-07, "loss": 0.0038, "reward": 1.671875, "reward_std": 0.1979166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.671875, "step": 1797 }, { "completion_length": 88.65625, "epoch": 1.1986666666666665, "grad_norm": 4.591041530128822, "kl": 0.052734375, "learning_rate": 4.0066666666666663e-07, "loss": 0.0021, "reward": 1.8291666507720947, "reward_std": 0.06666666269302368, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8291667103767395, "step": 1798 }, { "completion_length": 98.90625, "epoch": 1.1993333333333334, "grad_norm": 5.48234039120641, "kl": 0.0869140625, "learning_rate": 4.003333333333333e-07, "loss": 0.0035, "reward": 1.8859374523162842, "reward_std": 0.18343330919742584, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.917187511920929, "step": 1799 }, { "completion_length": 90.625, "epoch": 1.2, "grad_norm": 9.54444502973234, "kl": 0.08203125, "learning_rate": 4e-07, "loss": 0.0033, "reward": 1.7010416984558105, "reward_std": 0.1762310266494751, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7010416984558105, "step": 1800 }, { "completion_length": 99.09375, "epoch": 1.2006666666666668, "grad_norm": 3.710169777362074, "kl": 0.0546875, "learning_rate": 3.9966666666666666e-07, "loss": 0.0022, "reward": 1.96875, "reward_std": 0.025515519082546234, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1801 }, { "completion_length": 97.15625, "epoch": 1.2013333333333334, "grad_norm": 2.6972232459384062, "kl": 0.10009765625, "learning_rate": 3.993333333333333e-07, "loss": 0.004, "reward": 1.939062476158142, "reward_std": 0.04062499850988388, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9390625357627869, "step": 1802 }, { "completion_length": 102.375, "epoch": 1.202, "grad_norm": 5.163553568398121, "kl": 0.09423828125, "learning_rate": 3.99e-07, "loss": 0.0038, "reward": 1.8151041269302368, "reward_std": 0.11145833134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8151041269302368, "step": 1803 }, { "completion_length": 96.3125, "epoch": 1.2026666666666666, "grad_norm": 1.4028524229133097, "kl": 0.06689453125, "learning_rate": 3.9866666666666664e-07, "loss": 0.0027, "reward": 1.625, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.65625, "step": 1804 }, { "completion_length": 105.90625, "epoch": 1.2033333333333334, "grad_norm": 1.6942491942734816, "kl": 0.04736328125, "learning_rate": 3.983333333333333e-07, "loss": 0.0019, "reward": 1.5625, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.59375, "step": 1805 }, { "completion_length": 103.59375, "epoch": 1.204, "grad_norm": 3.0941014268519096, "kl": 0.0849609375, "learning_rate": 3.98e-07, "loss": 0.0034, "reward": 1.675520896911621, "reward_std": 0.18655292689800262, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6755208373069763, "step": 1806 }, { "completion_length": 107.0, "epoch": 1.2046666666666668, "grad_norm": 0.7175852813039209, "kl": 0.050048828125, "learning_rate": 3.9766666666666667e-07, "loss": 0.002, "reward": 1.90625, "reward_std": 0.11967839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9375, "step": 1807 }, { "completion_length": 78.78125, "epoch": 1.2053333333333334, "grad_norm": 2.381814174849463, "kl": 0.10009765625, "learning_rate": 3.973333333333333e-07, "loss": 0.004, "reward": 1.84375, "reward_std": 0.0777510553598404, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1808 }, { "completion_length": 99.25, "epoch": 1.206, "grad_norm": 3.666245170439755, "kl": 0.06640625, "learning_rate": 3.97e-07, "loss": 0.0026, "reward": 1.8416666984558105, "reward_std": 0.06123931705951691, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8416666388511658, "step": 1809 }, { "completion_length": 88.84375, "epoch": 1.2066666666666666, "grad_norm": 3.6684909084000203, "kl": 0.059326171875, "learning_rate": 3.9666666666666665e-07, "loss": 0.0024, "reward": 1.5281250476837158, "reward_std": 0.1827297806739807, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5281250476837158, "step": 1810 }, { "completion_length": 106.625, "epoch": 1.2073333333333334, "grad_norm": 3.196839674596228, "kl": 0.08203125, "learning_rate": 3.963333333333333e-07, "loss": 0.0033, "reward": 1.742708444595337, "reward_std": 0.15208332240581512, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7427083253860474, "step": 1811 }, { "completion_length": 94.46875, "epoch": 1.208, "grad_norm": 3.2691691462288466, "kl": 0.1943359375, "learning_rate": 3.96e-07, "loss": 0.0078, "reward": 1.779687523841858, "reward_std": 0.12142627686262131, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7796875238418579, "step": 1812 }, { "completion_length": 103.25, "epoch": 1.2086666666666668, "grad_norm": 2.209116124681983, "kl": 0.0693359375, "learning_rate": 3.956666666666666e-07, "loss": 0.0028, "reward": 1.9140625, "reward_std": 0.04387339949607849, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9140625, "step": 1813 }, { "completion_length": 96.21875, "epoch": 1.2093333333333334, "grad_norm": 1.9801022671147723, "kl": 0.05615234375, "learning_rate": 3.953333333333333e-07, "loss": 0.0022, "reward": 1.8489583730697632, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583730697632, "step": 1814 }, { "completion_length": 93.3125, "epoch": 1.21, "grad_norm": 2.4885434297664197, "kl": 0.0791015625, "learning_rate": 3.95e-07, "loss": 0.0032, "reward": 1.993749976158142, "reward_std": 0.012500002980232239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9937499761581421, "step": 1815 }, { "completion_length": 96.71875, "epoch": 1.2106666666666666, "grad_norm": 1.7371665047610723, "kl": 0.08544921875, "learning_rate": 3.9466666666666665e-07, "loss": 0.0034, "reward": 1.96875, "reward_std": 0.049292195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1816 }, { "completion_length": 95.875, "epoch": 1.2113333333333334, "grad_norm": 5.142174900186736, "kl": 0.083984375, "learning_rate": 3.943333333333333e-07, "loss": 0.0034, "reward": 1.8468749523162842, "reward_std": 0.13124999403953552, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8468749523162842, "step": 1817 }, { "completion_length": 96.34375, "epoch": 1.212, "grad_norm": 2.6087674646880266, "kl": 0.08642578125, "learning_rate": 3.94e-07, "loss": 0.0035, "reward": 1.8325892686843872, "reward_std": 0.20294085144996643, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8325892686843872, "step": 1818 }, { "completion_length": 99.96875, "epoch": 1.2126666666666668, "grad_norm": 9.642093780035623, "kl": 0.06591796875, "learning_rate": 3.9366666666666663e-07, "loss": 0.0026, "reward": 1.7307292222976685, "reward_std": 0.1565437763929367, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7307292222976685, "step": 1819 }, { "completion_length": 96.4375, "epoch": 1.2133333333333334, "grad_norm": 1.4986619621424586, "kl": 0.049560546875, "learning_rate": 3.933333333333333e-07, "loss": 0.002, "reward": 1.8515625, "reward_std": 0.029919598251581192, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8515625, "step": 1820 }, { "completion_length": 97.59375, "epoch": 1.214, "grad_norm": 3.777499762645589, "kl": 0.11181640625, "learning_rate": 3.93e-07, "loss": 0.0045, "reward": 1.8611234426498413, "reward_std": 0.11295534670352936, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8611235022544861, "step": 1821 }, { "completion_length": 106.65625, "epoch": 1.2146666666666666, "grad_norm": 1.942949439376628, "kl": 0.05810546875, "learning_rate": 3.9266666666666666e-07, "loss": 0.0023, "reward": 1.8489583730697632, "reward_std": 0.0312499962747097, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583134651184, "step": 1822 }, { "completion_length": 101.4375, "epoch": 1.2153333333333334, "grad_norm": 2.6452915800997006, "kl": 0.06396484375, "learning_rate": 3.923333333333333e-07, "loss": 0.0026, "reward": 1.8406250476837158, "reward_std": 0.09585151076316833, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.871874988079071, "step": 1823 }, { "completion_length": 89.09375, "epoch": 1.216, "grad_norm": 2.5364171691422124, "kl": 0.08349609375, "learning_rate": 3.92e-07, "loss": 0.0033, "reward": 1.9114583730697632, "reward_std": 0.15625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9114583730697632, "step": 1824 }, { "completion_length": 91.5, "epoch": 1.2166666666666668, "grad_norm": 6.84479428979368, "kl": 0.052978515625, "learning_rate": 3.9166666666666664e-07, "loss": 0.0021, "reward": 1.8229167461395264, "reward_std": 0.19158650934696198, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229167461395264, "step": 1825 }, { "completion_length": 88.4375, "epoch": 1.2173333333333334, "grad_norm": 2.7385105270742636, "kl": 0.0576171875, "learning_rate": 3.913333333333333e-07, "loss": 0.0023, "reward": 1.9328124523162842, "reward_std": 0.07187499850988388, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.932812511920929, "step": 1826 }, { "completion_length": 94.125, "epoch": 1.218, "grad_norm": 3.130901739273101, "kl": 0.060546875, "learning_rate": 3.91e-07, "loss": 0.0024, "reward": 1.8403273820877075, "reward_std": 0.08914965391159058, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8403273820877075, "step": 1827 }, { "completion_length": 100.0, "epoch": 1.2186666666666666, "grad_norm": 4.555390086878766, "kl": 0.06689453125, "learning_rate": 3.906666666666666e-07, "loss": 0.0027, "reward": 1.875, "reward_std": 0.1666666567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1828 }, { "completion_length": 95.9375, "epoch": 1.2193333333333334, "grad_norm": 1.08166573964295, "kl": 0.0625, "learning_rate": 3.903333333333333e-07, "loss": 0.0025, "reward": 1.8770833015441895, "reward_std": 0.020412415266036987, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8770833611488342, "step": 1829 }, { "completion_length": 109.96875, "epoch": 1.22, "grad_norm": 1.4631095996702705, "kl": 0.072265625, "learning_rate": 3.8999999999999997e-07, "loss": 0.0029, "reward": 1.78125, "reward_std": 0.10750514268875122, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1830 }, { "completion_length": 88.90625, "epoch": 1.2206666666666668, "grad_norm": 3.0680886136151027, "kl": 0.0732421875, "learning_rate": 3.8966666666666665e-07, "loss": 0.0029, "reward": 1.9364583492279053, "reward_std": 0.015728827565908432, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9364583492279053, "step": 1831 }, { "completion_length": 101.03125, "epoch": 1.2213333333333334, "grad_norm": 1.726263347786674, "kl": 0.064453125, "learning_rate": 3.8933333333333327e-07, "loss": 0.0026, "reward": 1.9583333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333134651184, "step": 1832 }, { "completion_length": 104.46875, "epoch": 1.222, "grad_norm": 4.331910094798244, "kl": 0.099609375, "learning_rate": 3.89e-07, "loss": 0.004, "reward": 1.9635417461395264, "reward_std": 0.026214702054858208, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9635416269302368, "step": 1833 }, { "completion_length": 103.65625, "epoch": 1.2226666666666666, "grad_norm": 1.6942862698408037, "kl": 0.049072265625, "learning_rate": 3.8866666666666663e-07, "loss": 0.002, "reward": 1.8125, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8124999403953552, "step": 1834 }, { "completion_length": 112.125, "epoch": 1.2233333333333334, "grad_norm": 1.2105398462696524, "kl": 0.08642578125, "learning_rate": 3.883333333333333e-07, "loss": 0.0035, "reward": 1.78125, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 1835 }, { "completion_length": 99.34375, "epoch": 1.224, "grad_norm": 0.27464098116243996, "kl": 0.061767578125, "learning_rate": 3.88e-07, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 1836 }, { "completion_length": 117.28125, "epoch": 1.2246666666666666, "grad_norm": 2.361536336627649, "kl": 0.0712890625, "learning_rate": 3.8766666666666666e-07, "loss": 0.0028, "reward": 1.7135417461395264, "reward_std": 0.14508545398712158, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7447916865348816, "step": 1837 }, { "completion_length": 92.03125, "epoch": 1.2253333333333334, "grad_norm": 3.023650447591506, "kl": 0.052734375, "learning_rate": 3.873333333333333e-07, "loss": 0.0021, "reward": 1.8359375, "reward_std": 0.27529376745224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8359375, "step": 1838 }, { "completion_length": 92.28125, "epoch": 1.226, "grad_norm": 1.9623398039363102, "kl": 0.072265625, "learning_rate": 3.87e-07, "loss": 0.0029, "reward": 1.84375, "reward_std": 0.09300211817026138, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8437500596046448, "step": 1839 }, { "completion_length": 86.53125, "epoch": 1.2266666666666666, "grad_norm": 3.2436587612694256, "kl": 0.0322265625, "learning_rate": 3.8666666666666664e-07, "loss": 0.0013, "reward": 1.9479167461395264, "reward_std": 0.012028136290609837, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9479166269302368, "step": 1840 }, { "completion_length": 101.09375, "epoch": 1.2273333333333334, "grad_norm": 2.490074970664843, "kl": 0.06494140625, "learning_rate": 3.863333333333333e-07, "loss": 0.0026, "reward": 1.859375, "reward_std": 0.1354166567325592, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8906250596046448, "step": 1841 }, { "completion_length": 96.9375, "epoch": 1.228, "grad_norm": 2.5252459430993803, "kl": 0.06884765625, "learning_rate": 3.86e-07, "loss": 0.0027, "reward": 1.703125, "reward_std": 0.35341876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.734375, "step": 1842 }, { "completion_length": 96.71875, "epoch": 1.2286666666666666, "grad_norm": 2.2512510498358576, "kl": 0.083984375, "learning_rate": 3.8566666666666667e-07, "loss": 0.0034, "reward": 1.8875000476837158, "reward_std": 0.11085229367017746, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.887499988079071, "step": 1843 }, { "completion_length": 95.84375, "epoch": 1.2293333333333334, "grad_norm": 6.765201274189515, "kl": 0.0751953125, "learning_rate": 3.8533333333333334e-07, "loss": 0.003, "reward": 1.8135416507720947, "reward_std": 0.15250736474990845, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8447916507720947, "step": 1844 }, { "completion_length": 98.4375, "epoch": 1.23, "grad_norm": 2.8037373960178784, "kl": 0.0693359375, "learning_rate": 3.8499999999999997e-07, "loss": 0.0028, "reward": 1.8072917461395264, "reward_std": 0.19606778025627136, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8072916269302368, "step": 1845 }, { "completion_length": 93.65625, "epoch": 1.2306666666666666, "grad_norm": 5.095174032042471, "kl": 0.08154296875, "learning_rate": 3.8466666666666664e-07, "loss": 0.0033, "reward": 1.7713541984558105, "reward_std": 0.24802134931087494, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7713541984558105, "step": 1846 }, { "completion_length": 107.375, "epoch": 1.2313333333333334, "grad_norm": 6.794827507800707, "kl": 0.08447265625, "learning_rate": 3.843333333333333e-07, "loss": 0.0034, "reward": 1.8546874523162842, "reward_std": 0.09746210277080536, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.854687511920929, "step": 1847 }, { "completion_length": 105.5625, "epoch": 1.232, "grad_norm": 24.48606311941005, "kl": 0.1005859375, "learning_rate": 3.84e-07, "loss": 0.004, "reward": 1.8046875, "reward_std": 0.07742579281330109, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8046875, "step": 1848 }, { "completion_length": 91.625, "epoch": 1.2326666666666666, "grad_norm": 0.1221194646741975, "kl": 0.058837890625, "learning_rate": 3.836666666666666e-07, "loss": 0.0023, "reward": 1.875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1849 }, { "completion_length": 115.53125, "epoch": 1.2333333333333334, "grad_norm": 5.436777677129636, "kl": 0.0703125, "learning_rate": 3.8333333333333335e-07, "loss": 0.0028, "reward": 1.8666666746139526, "reward_std": 0.11631220579147339, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8666666746139526, "step": 1850 }, { "completion_length": 95.09375, "epoch": 1.234, "grad_norm": 1.7371434911531416, "kl": 0.07763671875, "learning_rate": 3.83e-07, "loss": 0.0031, "reward": 1.8125, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 1851 }, { "completion_length": 111.78125, "epoch": 1.2346666666666666, "grad_norm": 1.1027942053762847, "kl": 0.037841796875, "learning_rate": 3.8266666666666665e-07, "loss": 0.0015, "reward": 1.7708333730697632, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333134651184, "step": 1852 }, { "completion_length": 91.1875, "epoch": 1.2353333333333334, "grad_norm": 3.066613052104343, "kl": 0.0849609375, "learning_rate": 3.8233333333333333e-07, "loss": 0.0034, "reward": 1.847916603088379, "reward_std": 0.18251954019069672, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8791667222976685, "step": 1853 }, { "completion_length": 131.625, "epoch": 1.236, "grad_norm": 4.0760269604440875, "kl": 0.055419921875, "learning_rate": 3.82e-07, "loss": 0.0022, "reward": 1.6328125, "reward_std": 0.40029376745224, "rewards/format_reward": 0.875, "rewards/iou_reward": 0.7578125, "step": 1854 }, { "completion_length": 98.3125, "epoch": 1.2366666666666666, "grad_norm": 2.5607341923610347, "kl": 0.058349609375, "learning_rate": 3.8166666666666663e-07, "loss": 0.0023, "reward": 1.890625, "reward_std": 0.09375, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.890625, "step": 1855 }, { "completion_length": 120.21875, "epoch": 1.2373333333333334, "grad_norm": 2.087544893284126, "kl": 0.06494140625, "learning_rate": 3.8133333333333336e-07, "loss": 0.0026, "reward": 1.6484375, "reward_std": 0.265625, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.7421875, "step": 1856 }, { "completion_length": 107.8125, "epoch": 1.238, "grad_norm": 0.5950577911393694, "kl": 0.040771484375, "learning_rate": 3.81e-07, "loss": 0.0016, "reward": 1.6875, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6875, "step": 1857 }, { "completion_length": 99.0, "epoch": 1.2386666666666666, "grad_norm": 1.5428589351822617, "kl": 0.0693359375, "learning_rate": 3.8066666666666666e-07, "loss": 0.0028, "reward": 1.8333333730697632, "reward_std": 0.0833333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 1858 }, { "completion_length": 82.28125, "epoch": 1.2393333333333334, "grad_norm": 1.8985741781462984, "kl": 0.048828125, "learning_rate": 3.8033333333333334e-07, "loss": 0.0019, "reward": 1.9322917461395264, "reward_std": 0.010416661389172077, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322916865348816, "step": 1859 }, { "completion_length": 99.3125, "epoch": 1.24, "grad_norm": 0.2408459551014951, "kl": 0.06103515625, "learning_rate": 3.7999999999999996e-07, "loss": 0.0024, "reward": 1.9583333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333730697632, "step": 1860 }, { "completion_length": 111.15625, "epoch": 1.2406666666666666, "grad_norm": 4.051502055093499, "kl": 0.07861328125, "learning_rate": 3.7966666666666664e-07, "loss": 0.0031, "reward": 1.8235118389129639, "reward_std": 0.19337248802185059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8235118985176086, "step": 1861 }, { "completion_length": 96.78125, "epoch": 1.2413333333333334, "grad_norm": 2.634326828154094, "kl": 0.08154296875, "learning_rate": 3.793333333333333e-07, "loss": 0.0033, "reward": 1.941666603088379, "reward_std": 0.03849000856280327, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9416667222976685, "step": 1862 }, { "completion_length": 99.1875, "epoch": 1.242, "grad_norm": 0.13034194046963762, "kl": 0.0634765625, "learning_rate": 3.79e-07, "loss": 0.0025, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 1863 }, { "completion_length": 93.0625, "epoch": 1.2426666666666666, "grad_norm": 5.094214001847422, "kl": 0.0595703125, "learning_rate": 3.786666666666666e-07, "loss": 0.0024, "reward": 1.7109375, "reward_std": 0.15996256470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7109375, "step": 1864 }, { "completion_length": 104.28125, "epoch": 1.2433333333333334, "grad_norm": 22.956150037547307, "kl": 0.05859375, "learning_rate": 3.7833333333333335e-07, "loss": 0.0024, "reward": 1.8411458730697632, "reward_std": 0.11904378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8411458730697632, "step": 1865 }, { "completion_length": 85.84375, "epoch": 1.244, "grad_norm": 0.3268722372290383, "kl": 0.06884765625, "learning_rate": 3.7799999999999997e-07, "loss": 0.0028, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 1866 }, { "completion_length": 101.875, "epoch": 1.2446666666666666, "grad_norm": 1.785362418982276, "kl": 0.09521484375, "learning_rate": 3.7766666666666665e-07, "loss": 0.0038, "reward": 1.7708333730697632, "reward_std": 0.0416666716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7708333730697632, "step": 1867 }, { "completion_length": 90.625, "epoch": 1.2453333333333334, "grad_norm": 46.58086701374594, "kl": 0.061279296875, "learning_rate": 3.773333333333333e-07, "loss": 0.0024, "reward": 1.831770896911621, "reward_std": 0.033645376563072205, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8317707777023315, "step": 1868 }, { "completion_length": 89.3125, "epoch": 1.246, "grad_norm": 4.200886500608005, "kl": 0.08837890625, "learning_rate": 3.77e-07, "loss": 0.0035, "reward": 1.761979103088379, "reward_std": 0.15531902015209198, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7619791626930237, "step": 1869 }, { "completion_length": 97.09375, "epoch": 1.2466666666666666, "grad_norm": 1.2875958196805557, "kl": 0.08203125, "learning_rate": 3.766666666666666e-07, "loss": 0.0033, "reward": 1.90625, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1870 }, { "completion_length": 84.96875, "epoch": 1.2473333333333334, "grad_norm": 2.5251609583896135, "kl": 0.037353515625, "learning_rate": 3.7633333333333335e-07, "loss": 0.0015, "reward": 1.7447917461395264, "reward_std": 0.1354166567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7447916865348816, "step": 1871 }, { "completion_length": 98.875, "epoch": 1.248, "grad_norm": 2.6894537907413043, "kl": 0.05810546875, "learning_rate": 3.76e-07, "loss": 0.0023, "reward": 1.8333333730697632, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 1872 }, { "completion_length": 103.5625, "epoch": 1.2486666666666666, "grad_norm": 2.570491830561605, "kl": 0.061279296875, "learning_rate": 3.7566666666666666e-07, "loss": 0.0024, "reward": 1.9562500715255737, "reward_std": 0.030455823987722397, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9562500715255737, "step": 1873 }, { "completion_length": 109.5, "epoch": 1.2493333333333334, "grad_norm": 6.0606770604468485, "kl": 0.07763671875, "learning_rate": 3.7533333333333333e-07, "loss": 0.0031, "reward": 1.6541666984558105, "reward_std": 0.13394075632095337, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6541666984558105, "step": 1874 }, { "completion_length": 100.0, "epoch": 1.25, "grad_norm": 6.69422070928353, "kl": 0.06591796875, "learning_rate": 3.75e-07, "loss": 0.0026, "reward": 1.8098958730697632, "reward_std": 0.1427895426750183, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8098957538604736, "step": 1875 }, { "completion_length": 106.75, "epoch": 1.2506666666666666, "grad_norm": 2.624824760959978, "kl": 0.05322265625, "learning_rate": 3.7466666666666663e-07, "loss": 0.0021, "reward": 1.8624999523162842, "reward_std": 0.10051293671131134, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.862500011920929, "step": 1876 }, { "completion_length": 82.75, "epoch": 1.2513333333333334, "grad_norm": 4.125885793909869, "kl": 0.11767578125, "learning_rate": 3.743333333333333e-07, "loss": 0.0047, "reward": 1.8604166507720947, "reward_std": 0.09945249557495117, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8604166507720947, "step": 1877 }, { "completion_length": 92.875, "epoch": 1.252, "grad_norm": 1.0013307453629534, "kl": 0.07470703125, "learning_rate": 3.74e-07, "loss": 0.003, "reward": 1.8583333492279053, "reward_std": 0.054006174206733704, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8583332896232605, "step": 1878 }, { "completion_length": 104.78125, "epoch": 1.2526666666666666, "grad_norm": 5.725026581204204, "kl": 0.05810546875, "learning_rate": 3.736666666666666e-07, "loss": 0.0023, "reward": 1.6729166507720947, "reward_std": 0.07693374902009964, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6729166507720947, "step": 1879 }, { "completion_length": 108.28125, "epoch": 1.2533333333333334, "grad_norm": 2.9878502814351844, "kl": 0.07666015625, "learning_rate": 3.7333333333333334e-07, "loss": 0.0031, "reward": 1.5385416746139526, "reward_std": 0.5789482593536377, "rewards/format_reward": 0.90625, "rewards/iou_reward": 0.6322916746139526, "step": 1880 }, { "completion_length": 94.625, "epoch": 1.254, "grad_norm": 24.314005364169816, "kl": 0.0771484375, "learning_rate": 3.7299999999999997e-07, "loss": 0.0031, "reward": 1.9036458730697632, "reward_std": 0.07737711071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9036458730697632, "step": 1881 }, { "completion_length": 93.25, "epoch": 1.2546666666666666, "grad_norm": 6.601323155869658, "kl": 0.09033203125, "learning_rate": 3.7266666666666664e-07, "loss": 0.0036, "reward": 1.640625, "reward_std": 0.15051552653312683, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.640625, "step": 1882 }, { "completion_length": 99.84375, "epoch": 1.2553333333333334, "grad_norm": 0.06927971638906934, "kl": 0.0634765625, "learning_rate": 3.723333333333333e-07, "loss": 0.0025, "reward": 1.975000023841858, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9750000238418579, "step": 1883 }, { "completion_length": 92.21875, "epoch": 1.256, "grad_norm": 3.743586052311708, "kl": 0.0791015625, "learning_rate": 3.72e-07, "loss": 0.0032, "reward": 1.8575892448425293, "reward_std": 0.03287185728549957, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8575892448425293, "step": 1884 }, { "completion_length": 92.625, "epoch": 1.2566666666666666, "grad_norm": 1.69589932436707, "kl": 0.04052734375, "learning_rate": 3.716666666666666e-07, "loss": 0.0016, "reward": 1.8125, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 1885 }, { "completion_length": 100.28125, "epoch": 1.2573333333333334, "grad_norm": 1.431338965539198, "kl": 0.07275390625, "learning_rate": 3.7133333333333335e-07, "loss": 0.0029, "reward": 1.7161458730697632, "reward_std": 0.11904378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7161458134651184, "step": 1886 }, { "completion_length": 91.09375, "epoch": 1.258, "grad_norm": 5.210225997142745, "kl": 0.07666015625, "learning_rate": 3.71e-07, "loss": 0.0031, "reward": 1.906770944595337, "reward_std": 0.05686085671186447, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9067708849906921, "step": 1887 }, { "completion_length": 104.375, "epoch": 1.2586666666666666, "grad_norm": 2.2177373008648087, "kl": 0.042724609375, "learning_rate": 3.7066666666666665e-07, "loss": 0.0017, "reward": 1.8020833730697632, "reward_std": 0.2180021107196808, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8333333730697632, "step": 1888 }, { "completion_length": 104.96875, "epoch": 1.2593333333333334, "grad_norm": 2.267496479319022, "kl": 0.06884765625, "learning_rate": 3.7033333333333333e-07, "loss": 0.0028, "reward": 1.6640625, "reward_std": 0.17991960048675537, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6953125, "step": 1889 }, { "completion_length": 90.375, "epoch": 1.26, "grad_norm": 4.669611332337522, "kl": 0.06640625, "learning_rate": 3.7e-07, "loss": 0.0027, "reward": 1.651711344718933, "reward_std": 0.11750097572803497, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6829612851142883, "step": 1890 }, { "completion_length": 107.71875, "epoch": 1.2606666666666666, "grad_norm": 3.1304043053135304, "kl": 0.07177734375, "learning_rate": 3.6966666666666663e-07, "loss": 0.0029, "reward": 1.6593749523162842, "reward_std": 0.13820861279964447, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6593749523162842, "step": 1891 }, { "completion_length": 109.8125, "epoch": 1.2613333333333334, "grad_norm": 2.9528037328393664, "kl": 0.06787109375, "learning_rate": 3.693333333333333e-07, "loss": 0.0027, "reward": 1.7447917461395264, "reward_std": 0.2630212604999542, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7760416269302368, "step": 1892 }, { "completion_length": 90.1875, "epoch": 1.262, "grad_norm": 3.777073448223815, "kl": 0.0634765625, "learning_rate": 3.69e-07, "loss": 0.0025, "reward": 1.9713542461395264, "reward_std": 0.03968125581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9713542461395264, "step": 1893 }, { "completion_length": 100.9375, "epoch": 1.2626666666666666, "grad_norm": 0.18850696811386133, "kl": 0.05322265625, "learning_rate": 3.686666666666666e-07, "loss": 0.0021, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 1894 }, { "completion_length": 93.0625, "epoch": 1.2633333333333332, "grad_norm": 2.519525214682489, "kl": 0.07763671875, "learning_rate": 3.6833333333333334e-07, "loss": 0.0031, "reward": 1.7979166507720947, "reward_std": 0.16988958418369293, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7979167103767395, "step": 1895 }, { "completion_length": 94.03125, "epoch": 1.264, "grad_norm": 3.1428844632556365, "kl": 0.1318359375, "learning_rate": 3.6799999999999996e-07, "loss": 0.0053, "reward": 1.9895833730697632, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9895833730697632, "step": 1896 }, { "completion_length": 93.8125, "epoch": 1.2646666666666666, "grad_norm": 1.3805968945271403, "kl": 0.0859375, "learning_rate": 3.6766666666666664e-07, "loss": 0.0034, "reward": 1.8562500476837158, "reward_std": 0.012500002980232239, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8562500476837158, "step": 1897 }, { "completion_length": 106.96875, "epoch": 1.2653333333333334, "grad_norm": 4.181006661137894, "kl": 0.07958984375, "learning_rate": 3.673333333333333e-07, "loss": 0.0032, "reward": 1.9505208730697632, "reward_std": 0.0989583283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9817708134651184, "step": 1898 }, { "completion_length": 93.03125, "epoch": 1.266, "grad_norm": 3.8334163174195126, "kl": 0.06396484375, "learning_rate": 3.67e-07, "loss": 0.0026, "reward": 1.8270833492279053, "reward_std": 0.02428511157631874, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8270833492279053, "step": 1899 }, { "completion_length": 93.875, "epoch": 1.2666666666666666, "grad_norm": 3.159082960065842, "kl": 0.0771484375, "learning_rate": 3.666666666666666e-07, "loss": 0.0031, "reward": 1.5187499523162842, "reward_std": 0.1383839249610901, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.518750011920929, "step": 1900 }, { "completion_length": 92.03125, "epoch": 1.2673333333333332, "grad_norm": 16.264228112038335, "kl": 0.07470703125, "learning_rate": 3.6633333333333334e-07, "loss": 0.003, "reward": 1.9380208253860474, "reward_std": 0.04825052618980408, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9380208253860474, "step": 1901 }, { "completion_length": 105.90625, "epoch": 1.268, "grad_norm": 4.079260557007177, "kl": 0.1103515625, "learning_rate": 3.6599999999999997e-07, "loss": 0.0044, "reward": 1.9442708492279053, "reward_std": 0.09977055341005325, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9442708492279053, "step": 1902 }, { "completion_length": 110.34375, "epoch": 1.2686666666666666, "grad_norm": 5.194336797535965, "kl": 0.08837890625, "learning_rate": 3.6566666666666665e-07, "loss": 0.0035, "reward": 1.6552083492279053, "reward_std": 0.2562499940395355, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6552082896232605, "step": 1903 }, { "completion_length": 104.0625, "epoch": 1.2693333333333334, "grad_norm": 2.7317559905348436, "kl": 0.0634765625, "learning_rate": 3.653333333333333e-07, "loss": 0.0025, "reward": 1.7395833730697632, "reward_std": 0.25966876745224, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7395833134651184, "step": 1904 }, { "completion_length": 82.625, "epoch": 1.27, "grad_norm": 1.5725858907483727, "kl": 0.0576171875, "learning_rate": 3.65e-07, "loss": 0.0023, "reward": 1.9609375, "reward_std": 0.078125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9921875, "step": 1905 }, { "completion_length": 92.65625, "epoch": 1.2706666666666666, "grad_norm": 12.810661551282601, "kl": 0.08349609375, "learning_rate": 3.646666666666666e-07, "loss": 0.0033, "reward": 1.7687500715255737, "reward_std": 0.014583339914679527, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7687499523162842, "step": 1906 }, { "completion_length": 93.53125, "epoch": 1.2713333333333332, "grad_norm": 5.243030914056081, "kl": 0.061279296875, "learning_rate": 3.643333333333333e-07, "loss": 0.0025, "reward": 1.9427083730697632, "reward_std": 0.0388755239546299, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083134651184, "step": 1907 }, { "completion_length": 96.0, "epoch": 1.272, "grad_norm": 4.348479478360791, "kl": 0.061767578125, "learning_rate": 3.64e-07, "loss": 0.0025, "reward": 1.7395833730697632, "reward_std": 0.1875, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 1908 }, { "completion_length": 100.8125, "epoch": 1.2726666666666666, "grad_norm": 3.077322707428849, "kl": 0.07177734375, "learning_rate": 3.6366666666666665e-07, "loss": 0.0029, "reward": 1.859375, "reward_std": 0.25483438372612, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.921875, "step": 1909 }, { "completion_length": 114.65625, "epoch": 1.2733333333333334, "grad_norm": 1.6413535487189996, "kl": 0.07275390625, "learning_rate": 3.6333333333333333e-07, "loss": 0.0029, "reward": 1.9375, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1910 }, { "completion_length": 96.40625, "epoch": 1.274, "grad_norm": 3.8037597850912648, "kl": 0.09521484375, "learning_rate": 3.6299999999999995e-07, "loss": 0.0038, "reward": 1.7109375, "reward_std": 0.12623751163482666, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7109375, "step": 1911 }, { "completion_length": 94.40625, "epoch": 1.2746666666666666, "grad_norm": 5.119045653567309, "kl": 0.0654296875, "learning_rate": 3.626666666666667e-07, "loss": 0.0026, "reward": 1.7489583492279053, "reward_std": 0.033589817583560944, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7489583492279053, "step": 1912 }, { "completion_length": 101.0625, "epoch": 1.2753333333333332, "grad_norm": 2.3784910625240796, "kl": 0.0634765625, "learning_rate": 3.623333333333333e-07, "loss": 0.0025, "reward": 1.9505208730697632, "reward_std": 0.0364583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9505208134651184, "step": 1913 }, { "completion_length": 100.96875, "epoch": 1.276, "grad_norm": 2.619619824835672, "kl": 0.0595703125, "learning_rate": 3.62e-07, "loss": 0.0024, "reward": 1.9375, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.96875, "step": 1914 }, { "completion_length": 92.5, "epoch": 1.2766666666666666, "grad_norm": 4.795595000468826, "kl": 0.043212890625, "learning_rate": 3.6166666666666666e-07, "loss": 0.0017, "reward": 1.84375, "reward_std": 0.10478722304105759, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1915 }, { "completion_length": 100.25, "epoch": 1.2773333333333334, "grad_norm": 1.9002552120710823, "kl": 0.05126953125, "learning_rate": 3.6133333333333334e-07, "loss": 0.0021, "reward": 1.9114583730697632, "reward_std": 0.08054219186306, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9114583730697632, "step": 1916 }, { "completion_length": 102.90625, "epoch": 1.278, "grad_norm": 9.16310555136084, "kl": 0.0625, "learning_rate": 3.6099999999999996e-07, "loss": 0.0025, "reward": 1.75, "reward_std": 0.19072291254997253, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7500000596046448, "step": 1917 }, { "completion_length": 79.8125, "epoch": 1.2786666666666666, "grad_norm": 6.983125970149697, "kl": 0.060302734375, "learning_rate": 3.606666666666667e-07, "loss": 0.0024, "reward": 1.8666666746139526, "reward_std": 0.0624999962747097, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8666666746139526, "step": 1918 }, { "completion_length": 100.75, "epoch": 1.2793333333333332, "grad_norm": 5.037257984647024, "kl": 0.06396484375, "learning_rate": 3.603333333333333e-07, "loss": 0.0026, "reward": 1.8723958730697632, "reward_std": 0.1523953676223755, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8723958730697632, "step": 1919 }, { "completion_length": 106.15625, "epoch": 1.28, "grad_norm": 4.237005486909313, "kl": 0.04638671875, "learning_rate": 3.6e-07, "loss": 0.0019, "reward": 1.8203125, "reward_std": 0.18154378235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8203125, "step": 1920 }, { "completion_length": 94.0625, "epoch": 1.2806666666666666, "grad_norm": 0.2863053916328159, "kl": 0.07763671875, "learning_rate": 3.5966666666666667e-07, "loss": 0.0031, "reward": 1.9375, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9375, "step": 1921 }, { "completion_length": 97.1875, "epoch": 1.2813333333333334, "grad_norm": 2.691081617317944, "kl": 0.04833984375, "learning_rate": 3.5933333333333335e-07, "loss": 0.0019, "reward": 1.829315423965454, "reward_std": 0.09074495732784271, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8293154239654541, "step": 1922 }, { "completion_length": 100.09375, "epoch": 1.282, "grad_norm": 2.680959723161791, "kl": 0.0673828125, "learning_rate": 3.5899999999999997e-07, "loss": 0.0027, "reward": 1.6588542461395264, "reward_std": 0.14545938372612, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6588541865348816, "step": 1923 }, { "completion_length": 98.21875, "epoch": 1.2826666666666666, "grad_norm": 3.6896627952662313, "kl": 0.072265625, "learning_rate": 3.5866666666666665e-07, "loss": 0.0029, "reward": 1.7072917222976685, "reward_std": 0.22588221728801727, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7072916626930237, "step": 1924 }, { "completion_length": 88.8125, "epoch": 1.2833333333333332, "grad_norm": 0.08984892998461501, "kl": 0.044921875, "learning_rate": 3.583333333333333e-07, "loss": 0.0018, "reward": 1.96875, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1925 }, { "completion_length": 90.96875, "epoch": 1.284, "grad_norm": 4.991298692808747, "kl": 0.1708984375, "learning_rate": 3.5799999999999995e-07, "loss": 0.0069, "reward": 1.7552083730697632, "reward_std": 0.20212478935718536, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8177083134651184, "step": 1926 }, { "completion_length": 100.46875, "epoch": 1.2846666666666666, "grad_norm": 2.94349087200918, "kl": 0.0810546875, "learning_rate": 3.576666666666667e-07, "loss": 0.0032, "reward": 1.7833333015441895, "reward_std": 0.022243909537792206, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.783333420753479, "step": 1927 }, { "completion_length": 110.5625, "epoch": 1.2853333333333334, "grad_norm": 3.6558988044474674, "kl": 0.0771484375, "learning_rate": 3.573333333333333e-07, "loss": 0.0031, "reward": 1.7505208253860474, "reward_std": 0.17267829179763794, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7505208253860474, "step": 1928 }, { "completion_length": 88.3125, "epoch": 1.286, "grad_norm": 2.154812146179765, "kl": 0.07861328125, "learning_rate": 3.57e-07, "loss": 0.0031, "reward": 1.890625, "reward_std": 0.1354166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.890625, "step": 1929 }, { "completion_length": 100.90625, "epoch": 1.2866666666666666, "grad_norm": 6.568870086759852, "kl": 0.0517578125, "learning_rate": 3.5666666666666666e-07, "loss": 0.0021, "reward": 1.8194196224212646, "reward_std": 0.1205972209572792, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8194196820259094, "step": 1930 }, { "completion_length": 101.71875, "epoch": 1.2873333333333332, "grad_norm": 0.5751205389088779, "kl": 0.033447265625, "learning_rate": 3.5633333333333333e-07, "loss": 0.0013, "reward": 1.8541667461395264, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8854166269302368, "step": 1931 }, { "completion_length": 112.125, "epoch": 1.288, "grad_norm": 24.925952172998297, "kl": 0.07373046875, "learning_rate": 3.5599999999999996e-07, "loss": 0.0029, "reward": 1.8039435148239136, "reward_std": 0.16044270992279053, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8039435148239136, "step": 1932 }, { "completion_length": 96.78125, "epoch": 1.2886666666666666, "grad_norm": 2.924990645765162, "kl": 0.07568359375, "learning_rate": 3.556666666666667e-07, "loss": 0.003, "reward": 1.8385417461395264, "reward_std": 0.1562499850988388, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8697916269302368, "step": 1933 }, { "completion_length": 108.3125, "epoch": 1.2893333333333334, "grad_norm": 2.3117350452432635, "kl": 0.05810546875, "learning_rate": 3.553333333333333e-07, "loss": 0.0023, "reward": 1.796875, "reward_std": 0.15475423634052277, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8593750596046448, "step": 1934 }, { "completion_length": 99.34375, "epoch": 1.29, "grad_norm": 3.4791896774546, "kl": 0.080078125, "learning_rate": 3.55e-07, "loss": 0.0032, "reward": 1.8854167461395264, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9166666269302368, "step": 1935 }, { "completion_length": 96.625, "epoch": 1.2906666666666666, "grad_norm": 1.9514564123453662, "kl": 0.07177734375, "learning_rate": 3.5466666666666667e-07, "loss": 0.0029, "reward": 1.9322917461395264, "reward_std": 0.010416661389172077, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9322916865348816, "step": 1936 }, { "completion_length": 106.3125, "epoch": 1.2913333333333332, "grad_norm": 1.5223574713797225, "kl": 0.06689453125, "learning_rate": 3.5433333333333334e-07, "loss": 0.0027, "reward": 1.7625000476837158, "reward_std": 0.09225594997406006, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.762499988079071, "step": 1937 }, { "completion_length": 120.34375, "epoch": 1.292, "grad_norm": 1.8547058999625612, "kl": 0.06640625, "learning_rate": 3.5399999999999997e-07, "loss": 0.0027, "reward": 1.78125, "reward_std": 0.2103152573108673, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 1938 }, { "completion_length": 114.75, "epoch": 1.2926666666666666, "grad_norm": 3.7897211296952533, "kl": 0.0634765625, "learning_rate": 3.5366666666666664e-07, "loss": 0.0025, "reward": 1.8223958015441895, "reward_std": 0.03635391220450401, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8223958015441895, "step": 1939 }, { "completion_length": 103.9375, "epoch": 1.2933333333333334, "grad_norm": 2.3395073805020394, "kl": 0.06982421875, "learning_rate": 3.533333333333333e-07, "loss": 0.0028, "reward": 1.8104166984558105, "reward_std": 0.14429128170013428, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8104166984558105, "step": 1940 }, { "completion_length": 96.46875, "epoch": 1.294, "grad_norm": 1.5856603460730254, "kl": 0.05810546875, "learning_rate": 3.5299999999999994e-07, "loss": 0.0023, "reward": 1.8541667461395264, "reward_std": 0.125, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666269302368, "step": 1941 }, { "completion_length": 104.875, "epoch": 1.2946666666666666, "grad_norm": 13.003961707211037, "kl": 0.078125, "learning_rate": 3.526666666666667e-07, "loss": 0.0031, "reward": 1.7265625, "reward_std": 0.1614583283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7265625, "step": 1942 }, { "completion_length": 103.25, "epoch": 1.2953333333333332, "grad_norm": 3.0975898533368165, "kl": 0.0712890625, "learning_rate": 3.523333333333333e-07, "loss": 0.0029, "reward": 1.8937499523162842, "reward_std": 0.05833332613110542, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8937499523162842, "step": 1943 }, { "completion_length": 100.5625, "epoch": 1.296, "grad_norm": 2.520860153519375, "kl": 0.0791015625, "learning_rate": 3.52e-07, "loss": 0.0032, "reward": 1.9104167222976685, "reward_std": 0.13270895183086395, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9416666030883789, "step": 1944 }, { "completion_length": 108.53125, "epoch": 1.2966666666666666, "grad_norm": 6.8308287844572915, "kl": 0.0732421875, "learning_rate": 3.5166666666666665e-07, "loss": 0.0029, "reward": 1.6640625, "reward_std": 0.1579701453447342, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6640625, "step": 1945 }, { "completion_length": 105.21875, "epoch": 1.2973333333333334, "grad_norm": 6.875832734250909, "kl": 0.07421875, "learning_rate": 3.5133333333333333e-07, "loss": 0.003, "reward": 1.828125, "reward_std": 0.12425211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8281250596046448, "step": 1946 }, { "completion_length": 89.28125, "epoch": 1.298, "grad_norm": 9.881283300623299, "kl": 0.07275390625, "learning_rate": 3.5099999999999995e-07, "loss": 0.0029, "reward": 1.8020833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8333333134651184, "step": 1947 }, { "completion_length": 96.25, "epoch": 1.2986666666666666, "grad_norm": 3.1683353098062605, "kl": 0.053955078125, "learning_rate": 3.506666666666667e-07, "loss": 0.0022, "reward": 1.7729166746139526, "reward_std": 0.11049193143844604, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7729166746139526, "step": 1948 }, { "completion_length": 102.875, "epoch": 1.2993333333333332, "grad_norm": 2.547477137485717, "kl": 0.06982421875, "learning_rate": 3.503333333333333e-07, "loss": 0.0028, "reward": 1.8177083730697632, "reward_std": 0.0388755239546299, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8177083730697632, "step": 1949 }, { "completion_length": 123.5625, "epoch": 1.3, "grad_norm": 1.632093756594164, "kl": 0.052734375, "learning_rate": 3.5e-07, "loss": 0.0021, "reward": 1.796875, "reward_std": 0.10341878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.796875, "step": 1950 }, { "completion_length": 107.90625, "epoch": 1.3006666666666666, "grad_norm": 2.5156166010957857, "kl": 0.0595703125, "learning_rate": 3.4966666666666666e-07, "loss": 0.0024, "reward": 1.889062523841858, "reward_std": 0.08505348861217499, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8890625238418579, "step": 1951 }, { "completion_length": 101.5625, "epoch": 1.3013333333333335, "grad_norm": 6.992293955939115, "kl": 0.06884765625, "learning_rate": 3.4933333333333334e-07, "loss": 0.0028, "reward": 1.8586939573287964, "reward_std": 0.13663730025291443, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8899439573287964, "step": 1952 }, { "completion_length": 96.84375, "epoch": 1.302, "grad_norm": 0.14601693062234267, "kl": 0.049072265625, "learning_rate": 3.4899999999999996e-07, "loss": 0.002, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 1953 }, { "completion_length": 102.75, "epoch": 1.3026666666666666, "grad_norm": 0.7466121547445317, "kl": 0.047607421875, "learning_rate": 3.4866666666666664e-07, "loss": 0.0019, "reward": 1.96875, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 1954 }, { "completion_length": 102.0, "epoch": 1.3033333333333332, "grad_norm": 2.3510992142365184, "kl": 0.12451171875, "learning_rate": 3.483333333333333e-07, "loss": 0.005, "reward": 1.8411458730697632, "reward_std": 0.13530339300632477, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.9036458730697632, "step": 1955 }, { "completion_length": 98.1875, "epoch": 1.304, "grad_norm": 0.13767275612963786, "kl": 0.058349609375, "learning_rate": 3.4799999999999994e-07, "loss": 0.0023, "reward": 1.8333333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 1956 }, { "completion_length": 118.0625, "epoch": 1.3046666666666666, "grad_norm": 12.082182846865651, "kl": 0.056640625, "learning_rate": 3.4766666666666667e-07, "loss": 0.0023, "reward": 1.8255208730697632, "reward_std": 0.1197916567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8255208730697632, "step": 1957 }, { "completion_length": 111.03125, "epoch": 1.3053333333333335, "grad_norm": 2.502673948313421, "kl": 0.0908203125, "learning_rate": 3.473333333333333e-07, "loss": 0.0036, "reward": 1.7270833253860474, "reward_std": 0.22083333134651184, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7583333253860474, "step": 1958 }, { "completion_length": 99.84375, "epoch": 1.306, "grad_norm": 20.980490596742182, "kl": 0.08642578125, "learning_rate": 3.4699999999999997e-07, "loss": 0.0035, "reward": 1.8104166984558105, "reward_std": 0.12827971577644348, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8104166984558105, "step": 1959 }, { "completion_length": 99.8125, "epoch": 1.3066666666666666, "grad_norm": 0.20286686600301682, "kl": 0.068359375, "learning_rate": 3.4666666666666665e-07, "loss": 0.0027, "reward": 1.78125, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1960 }, { "completion_length": 111.21875, "epoch": 1.3073333333333332, "grad_norm": 9.233787651117757, "kl": 0.07080078125, "learning_rate": 3.463333333333333e-07, "loss": 0.0028, "reward": 1.875, "reward_std": 0.22358438372612, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.9375, "step": 1961 }, { "completion_length": 113.46875, "epoch": 1.308, "grad_norm": 3.5265372592584567, "kl": 0.09033203125, "learning_rate": 3.4599999999999995e-07, "loss": 0.0036, "reward": 1.5183780193328857, "reward_std": 0.20792587101459503, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5496280193328857, "step": 1962 }, { "completion_length": 85.25, "epoch": 1.3086666666666666, "grad_norm": 2.7286938609160734, "kl": 0.060546875, "learning_rate": 3.456666666666667e-07, "loss": 0.0024, "reward": 1.9114583730697632, "reward_std": 0.08244640380144119, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9114583134651184, "step": 1963 }, { "completion_length": 93.8125, "epoch": 1.3093333333333335, "grad_norm": 2.822697347342541, "kl": 0.0947265625, "learning_rate": 3.453333333333333e-07, "loss": 0.0038, "reward": 1.8671875, "reward_std": 0.203125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8984375, "step": 1964 }, { "completion_length": 89.0, "epoch": 1.31, "grad_norm": 1.4708033831474763, "kl": 0.07763671875, "learning_rate": 3.45e-07, "loss": 0.0031, "reward": 1.8125, "reward_std": 0.18217839300632477, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 1965 }, { "completion_length": 107.65625, "epoch": 1.3106666666666666, "grad_norm": 3.5542613636473384, "kl": 0.091796875, "learning_rate": 3.4466666666666666e-07, "loss": 0.0037, "reward": 1.7084821462631226, "reward_std": 0.2105484902858734, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7397320866584778, "step": 1966 }, { "completion_length": 102.625, "epoch": 1.3113333333333332, "grad_norm": 2.561412207308422, "kl": 0.1025390625, "learning_rate": 3.4433333333333333e-07, "loss": 0.0041, "reward": 1.875, "reward_std": 0.25, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 1967 }, { "completion_length": 92.5, "epoch": 1.312, "grad_norm": 2.0190013013635935, "kl": 0.0966796875, "learning_rate": 3.4399999999999996e-07, "loss": 0.0039, "reward": 1.90625, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 1968 }, { "completion_length": 102.125, "epoch": 1.3126666666666666, "grad_norm": 0.5945601275953329, "kl": 0.05126953125, "learning_rate": 3.436666666666667e-07, "loss": 0.0021, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1969 }, { "completion_length": 108.71875, "epoch": 1.3133333333333335, "grad_norm": 2.835562997159618, "kl": 0.07080078125, "learning_rate": 3.433333333333333e-07, "loss": 0.0028, "reward": 1.7760417461395264, "reward_std": 0.1013755351305008, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8072917461395264, "step": 1970 }, { "completion_length": 100.375, "epoch": 1.314, "grad_norm": 9.591471720841252, "kl": 0.080078125, "learning_rate": 3.43e-07, "loss": 0.0032, "reward": 1.8364583253860474, "reward_std": 0.1545688956975937, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8364583253860474, "step": 1971 }, { "completion_length": 93.75, "epoch": 1.3146666666666667, "grad_norm": 3.172891485144474, "kl": 0.0625, "learning_rate": 3.4266666666666666e-07, "loss": 0.0025, "reward": 1.8312499523162842, "reward_std": 0.11364547908306122, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.831250011920929, "step": 1972 }, { "completion_length": 110.9375, "epoch": 1.3153333333333332, "grad_norm": 5.266690143499945, "kl": 0.08154296875, "learning_rate": 3.423333333333333e-07, "loss": 0.0033, "reward": 1.5890624523162842, "reward_std": 0.1618286669254303, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.620312511920929, "step": 1973 }, { "completion_length": 105.03125, "epoch": 1.316, "grad_norm": 2.762428400982101, "kl": 0.06640625, "learning_rate": 3.42e-07, "loss": 0.0026, "reward": 1.8541667461395264, "reward_std": 0.13017091155052185, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8541666865348816, "step": 1974 }, { "completion_length": 100.65625, "epoch": 1.3166666666666667, "grad_norm": 6.25848717144362, "kl": 0.06005859375, "learning_rate": 3.4166666666666664e-07, "loss": 0.0024, "reward": 1.7520833015441895, "reward_std": 0.2097439169883728, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7833333015441895, "step": 1975 }, { "completion_length": 103.1875, "epoch": 1.3173333333333335, "grad_norm": 8.38822658888461, "kl": 0.07421875, "learning_rate": 3.413333333333333e-07, "loss": 0.003, "reward": 1.7927141189575195, "reward_std": 0.19544164836406708, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8239641189575195, "step": 1976 }, { "completion_length": 97.125, "epoch": 1.318, "grad_norm": 7.267485190070478, "kl": 0.09033203125, "learning_rate": 3.41e-07, "loss": 0.0036, "reward": 1.8921875953674316, "reward_std": 0.1618310511112213, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8921874761581421, "step": 1977 }, { "completion_length": 96.71875, "epoch": 1.3186666666666667, "grad_norm": 3.4021329152598616, "kl": 0.044677734375, "learning_rate": 3.4066666666666667e-07, "loss": 0.0018, "reward": 1.8984375, "reward_std": 0.07737711071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8984375, "step": 1978 }, { "completion_length": 98.25, "epoch": 1.3193333333333332, "grad_norm": 4.362102785423955, "kl": 0.0849609375, "learning_rate": 3.403333333333333e-07, "loss": 0.0034, "reward": 1.8359375, "reward_std": 0.24444013833999634, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8671875, "step": 1979 }, { "completion_length": 108.71875, "epoch": 1.32, "grad_norm": 1.564866008668422, "kl": 0.03173828125, "learning_rate": 3.4000000000000003e-07, "loss": 0.0013, "reward": 1.7604167461395264, "reward_std": 0.1458333432674408, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7916666269302368, "step": 1980 }, { "completion_length": 107.59375, "epoch": 1.3206666666666667, "grad_norm": 3.766706666705986, "kl": 0.07763671875, "learning_rate": 3.3966666666666665e-07, "loss": 0.0031, "reward": 1.796875, "reward_std": 0.16331222653388977, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.796875, "step": 1981 }, { "completion_length": 100.6875, "epoch": 1.3213333333333335, "grad_norm": 1.6999870148892633, "kl": 0.07080078125, "learning_rate": 3.3933333333333333e-07, "loss": 0.0028, "reward": 1.875, "reward_std": 0.14433756470680237, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.875, "step": 1982 }, { "completion_length": 114.90625, "epoch": 1.322, "grad_norm": 2.095552301159818, "kl": 0.07421875, "learning_rate": 3.39e-07, "loss": 0.003, "reward": 1.8963541984558105, "reward_std": 0.025831211358308792, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8963541984558105, "step": 1983 }, { "completion_length": 95.5, "epoch": 1.3226666666666667, "grad_norm": 3.2112554194294436, "kl": 0.05078125, "learning_rate": 3.386666666666667e-07, "loss": 0.002, "reward": 1.8130208253860474, "reward_std": 0.12655460834503174, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8130208253860474, "step": 1984 }, { "completion_length": 113.65625, "epoch": 1.3233333333333333, "grad_norm": 11.334545629207222, "kl": 0.06298828125, "learning_rate": 3.383333333333333e-07, "loss": 0.0025, "reward": 1.7979166507720947, "reward_std": 0.1758635938167572, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7979167103767395, "step": 1985 }, { "completion_length": 105.40625, "epoch": 1.324, "grad_norm": 2.8051991231269287, "kl": 0.0712890625, "learning_rate": 3.38e-07, "loss": 0.0029, "reward": 1.8645833730697632, "reward_std": 0.1458333283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8645832538604736, "step": 1986 }, { "completion_length": 112.09375, "epoch": 1.3246666666666667, "grad_norm": 1.181119999079246, "kl": 0.031494140625, "learning_rate": 3.3766666666666666e-07, "loss": 0.0013, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 1987 }, { "completion_length": 108.625, "epoch": 1.3253333333333333, "grad_norm": 4.825434370267353, "kl": 0.0693359375, "learning_rate": 3.373333333333333e-07, "loss": 0.0028, "reward": 1.7979166507720947, "reward_std": 0.02916666492819786, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7979167103767395, "step": 1988 }, { "completion_length": 109.6875, "epoch": 1.326, "grad_norm": 2.066439512202386, "kl": 0.04541015625, "learning_rate": 3.37e-07, "loss": 0.0018, "reward": 1.5864455699920654, "reward_std": 0.31594419479370117, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.648945689201355, "step": 1989 }, { "completion_length": 111.875, "epoch": 1.3266666666666667, "grad_norm": 1.5302116241485983, "kl": 0.058349609375, "learning_rate": 3.3666666666666664e-07, "loss": 0.0023, "reward": 1.625, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.65625, "step": 1990 }, { "completion_length": 99.15625, "epoch": 1.3273333333333333, "grad_norm": 1.4641515703102403, "kl": 0.06640625, "learning_rate": 3.363333333333333e-07, "loss": 0.0027, "reward": 1.9187500476837158, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.918749988079071, "step": 1991 }, { "completion_length": 95.65625, "epoch": 1.328, "grad_norm": 2.7307585075807244, "kl": 0.04296875, "learning_rate": 3.36e-07, "loss": 0.0017, "reward": 1.7552083730697632, "reward_std": 0.11926551163196564, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7552083134651184, "step": 1992 }, { "completion_length": 122.84375, "epoch": 1.3286666666666667, "grad_norm": 5.445173560765713, "kl": 0.051025390625, "learning_rate": 3.3566666666666667e-07, "loss": 0.002, "reward": 1.790624976158142, "reward_std": 0.15518566966056824, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8218749761581421, "step": 1993 }, { "completion_length": 111.875, "epoch": 1.3293333333333333, "grad_norm": 3.578704305592164, "kl": 0.054443359375, "learning_rate": 3.353333333333333e-07, "loss": 0.0022, "reward": 1.948958396911621, "reward_std": 0.023514799773693085, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9489583969116211, "step": 1994 }, { "completion_length": 105.21875, "epoch": 1.33, "grad_norm": 3.9813710848115735, "kl": 0.057373046875, "learning_rate": 3.35e-07, "loss": 0.0023, "reward": 1.846428632736206, "reward_std": 0.04782798886299133, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8464285731315613, "step": 1995 }, { "completion_length": 108.0625, "epoch": 1.3306666666666667, "grad_norm": 10.936014540852037, "kl": 0.07080078125, "learning_rate": 3.3466666666666665e-07, "loss": 0.0028, "reward": 1.829687476158142, "reward_std": 0.2593749761581421, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8921874761581421, "step": 1996 }, { "completion_length": 108.6875, "epoch": 1.3313333333333333, "grad_norm": 1.3805253601500658, "kl": 0.08056640625, "learning_rate": 3.343333333333333e-07, "loss": 0.0032, "reward": 1.84375, "reward_std": 0.0625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 1997 }, { "completion_length": 99.71875, "epoch": 1.332, "grad_norm": 17.947224672025357, "kl": 0.07666015625, "learning_rate": 3.34e-07, "loss": 0.0031, "reward": 1.8270833492279053, "reward_std": 0.11202813684940338, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8270833492279053, "step": 1998 }, { "completion_length": 97.5625, "epoch": 1.3326666666666667, "grad_norm": 1.9006005281656495, "kl": 0.07568359375, "learning_rate": 3.336666666666667e-07, "loss": 0.003, "reward": 1.8333333730697632, "reward_std": 0.15612266957759857, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333134651184, "step": 1999 }, { "completion_length": 105.09375, "epoch": 1.3333333333333333, "grad_norm": 3.53847250076022, "kl": 0.140625, "learning_rate": 3.333333333333333e-07, "loss": 0.0056, "reward": 1.921875, "reward_std": 0.10624999552965164, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.953125, "step": 2000 }, { "completion_length": 105.78125, "epoch": 1.334, "grad_norm": 2.7764325763165885, "kl": 0.07275390625, "learning_rate": 3.33e-07, "loss": 0.0029, "reward": 1.9427083730697632, "reward_std": 0.1145833283662796, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9427083730697632, "step": 2001 }, { "completion_length": 98.1875, "epoch": 1.3346666666666667, "grad_norm": 2.4154382728658277, "kl": 0.08544921875, "learning_rate": 3.3266666666666665e-07, "loss": 0.0034, "reward": 1.8234374523162842, "reward_std": 0.027417197823524475, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.823437511920929, "step": 2002 }, { "completion_length": 113.59375, "epoch": 1.3353333333333333, "grad_norm": 2.666317675955444, "kl": 0.06982421875, "learning_rate": 3.323333333333333e-07, "loss": 0.0028, "reward": 1.6906249523162842, "reward_std": 0.17124442756175995, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.721875011920929, "step": 2003 }, { "completion_length": 107.875, "epoch": 1.336, "grad_norm": 2.469524579303206, "kl": 0.06640625, "learning_rate": 3.32e-07, "loss": 0.0026, "reward": 1.8666666746139526, "reward_std": 0.09166665375232697, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8666666746139526, "step": 2004 }, { "completion_length": 112.40625, "epoch": 1.3366666666666667, "grad_norm": 3.501773577020674, "kl": 0.0673828125, "learning_rate": 3.3166666666666663e-07, "loss": 0.0027, "reward": 1.671279788017273, "reward_std": 0.12456458061933517, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6712797284126282, "step": 2005 }, { "completion_length": 91.53125, "epoch": 1.3373333333333333, "grad_norm": 3.7587213032184374, "kl": 0.061279296875, "learning_rate": 3.313333333333333e-07, "loss": 0.0025, "reward": 1.8697917461395264, "reward_std": 0.010416661389172077, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8697916865348816, "step": 2006 }, { "completion_length": 103.25, "epoch": 1.338, "grad_norm": 11.64810079913596, "kl": 0.0869140625, "learning_rate": 3.31e-07, "loss": 0.0035, "reward": 1.931249976158142, "reward_std": 0.05416667461395264, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9312500357627869, "step": 2007 }, { "completion_length": 107.9375, "epoch": 1.3386666666666667, "grad_norm": 2.8144247222554495, "kl": 0.07080078125, "learning_rate": 3.3066666666666666e-07, "loss": 0.0028, "reward": 1.605208396911621, "reward_std": 0.12150105834007263, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6364583969116211, "step": 2008 }, { "completion_length": 99.0, "epoch": 1.3393333333333333, "grad_norm": 0.9737528514819069, "kl": 0.0654296875, "learning_rate": 3.303333333333333e-07, "loss": 0.0026, "reward": 1.9375, "reward_std": 0.0625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.96875, "step": 2009 }, { "completion_length": 108.15625, "epoch": 1.34, "grad_norm": 6.367123866853232, "kl": 0.0673828125, "learning_rate": 3.3e-07, "loss": 0.0027, "reward": 1.5, "reward_std": 0.37660688161849976, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.5625, "step": 2010 }, { "completion_length": 104.59375, "epoch": 1.3406666666666667, "grad_norm": 0.11834081705490689, "kl": 0.044921875, "learning_rate": 3.2966666666666664e-07, "loss": 0.0018, "reward": 1.625, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.625, "step": 2011 }, { "completion_length": 98.0, "epoch": 1.3413333333333333, "grad_norm": 3.634639014368979, "kl": 0.0615234375, "learning_rate": 3.293333333333333e-07, "loss": 0.0025, "reward": 1.8234374523162842, "reward_std": 0.09343431144952774, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.823437511920929, "step": 2012 }, { "completion_length": 105.03125, "epoch": 1.342, "grad_norm": 2.181446600495226, "kl": 0.0498046875, "learning_rate": 3.29e-07, "loss": 0.002, "reward": 1.8125, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 2013 }, { "completion_length": 99.28125, "epoch": 1.3426666666666667, "grad_norm": 2.63635217630195, "kl": 0.06982421875, "learning_rate": 3.2866666666666667e-07, "loss": 0.0028, "reward": 1.8958333730697632, "reward_std": 0.17311251163482666, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9270833730697632, "step": 2014 }, { "completion_length": 108.28125, "epoch": 1.3433333333333333, "grad_norm": 1.2906902047714082, "kl": 0.047607421875, "learning_rate": 3.283333333333333e-07, "loss": 0.0019, "reward": 1.8125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8125, "step": 2015 }, { "completion_length": 92.34375, "epoch": 1.3439999999999999, "grad_norm": 2.428576719271932, "kl": 0.10595703125, "learning_rate": 3.28e-07, "loss": 0.0042, "reward": 1.71875, "reward_std": 0.3125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.75, "step": 2016 }, { "completion_length": 107.0625, "epoch": 1.3446666666666667, "grad_norm": 4.433267748143834, "kl": 0.048583984375, "learning_rate": 3.2766666666666665e-07, "loss": 0.0019, "reward": 1.84375, "reward_std": 0.18396097421646118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.84375, "step": 2017 }, { "completion_length": 104.5, "epoch": 1.3453333333333333, "grad_norm": 1.941773419274935, "kl": 0.076171875, "learning_rate": 3.2733333333333327e-07, "loss": 0.003, "reward": 1.9765625, "reward_std": 0.029919598251581192, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9765625, "step": 2018 }, { "completion_length": 103.1875, "epoch": 1.346, "grad_norm": 2.346650552516021, "kl": 0.068359375, "learning_rate": 3.27e-07, "loss": 0.0027, "reward": 1.7109375, "reward_std": 0.005208328366279602, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7421875, "step": 2019 }, { "completion_length": 91.3125, "epoch": 1.3466666666666667, "grad_norm": 26.81386251509788, "kl": 0.10595703125, "learning_rate": 3.2666666666666663e-07, "loss": 0.0042, "reward": 1.8546874523162842, "reward_std": 0.09062499552965164, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.885937511920929, "step": 2020 }, { "completion_length": 96.71875, "epoch": 1.3473333333333333, "grad_norm": 7.047809290966178, "kl": 0.0595703125, "learning_rate": 3.263333333333333e-07, "loss": 0.0024, "reward": 1.555208444595337, "reward_std": 0.28392088413238525, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5864583253860474, "step": 2021 }, { "completion_length": 113.53125, "epoch": 1.3479999999999999, "grad_norm": 13.456941960028526, "kl": 0.07958984375, "learning_rate": 3.26e-07, "loss": 0.0032, "reward": 1.6552083492279053, "reward_std": 0.22291666269302368, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6864583492279053, "step": 2022 }, { "completion_length": 102.40625, "epoch": 1.3486666666666667, "grad_norm": 2.8005839366700327, "kl": 0.05908203125, "learning_rate": 3.2566666666666666e-07, "loss": 0.0024, "reward": 1.8567708730697632, "reward_std": 0.1510416716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8567708730697632, "step": 2023 }, { "completion_length": 103.96875, "epoch": 1.3493333333333333, "grad_norm": 2.9392354552698903, "kl": 0.08544921875, "learning_rate": 3.253333333333333e-07, "loss": 0.0034, "reward": 1.7781250476837158, "reward_std": 0.2235240936279297, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8093750476837158, "step": 2024 }, { "completion_length": 104.65625, "epoch": 1.35, "grad_norm": 1.4591125587699267, "kl": 0.03857421875, "learning_rate": 3.25e-07, "loss": 0.0015, "reward": 1.6875, "reward_std": 0.26933756470680237, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.71875, "step": 2025 }, { "completion_length": 95.84375, "epoch": 1.3506666666666667, "grad_norm": 3.6234333102527394, "kl": 0.1025390625, "learning_rate": 3.2466666666666664e-07, "loss": 0.0041, "reward": 1.921875, "reward_std": 0.0729166716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.921875, "step": 2026 }, { "completion_length": 121.53125, "epoch": 1.3513333333333333, "grad_norm": 5.347467265734969, "kl": 0.0751953125, "learning_rate": 3.243333333333333e-07, "loss": 0.003, "reward": 1.6489583253860474, "reward_std": 0.12841877341270447, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6802083253860474, "step": 2027 }, { "completion_length": 90.03125, "epoch": 1.3519999999999999, "grad_norm": 3.3920491325851296, "kl": 0.048828125, "learning_rate": 3.24e-07, "loss": 0.0019, "reward": 1.8333333730697632, "reward_std": 0.1535891890525818, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 2028 }, { "completion_length": 121.125, "epoch": 1.3526666666666667, "grad_norm": 3.8930444110129163, "kl": 0.060546875, "learning_rate": 3.2366666666666667e-07, "loss": 0.0024, "reward": 1.6156249046325684, "reward_std": 0.2041398286819458, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6156250238418579, "step": 2029 }, { "completion_length": 103.53125, "epoch": 1.3533333333333333, "grad_norm": 2.4572219635085712, "kl": 0.051025390625, "learning_rate": 3.233333333333333e-07, "loss": 0.002, "reward": 1.8385417461395264, "reward_std": 0.10341878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8385416865348816, "step": 2030 }, { "completion_length": 105.5625, "epoch": 1.354, "grad_norm": 3.1568230227283522, "kl": 0.048828125, "learning_rate": 3.23e-07, "loss": 0.002, "reward": 1.6171875, "reward_std": 0.15215739607810974, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6171875, "step": 2031 }, { "completion_length": 96.46875, "epoch": 1.3546666666666667, "grad_norm": 3.033148172276476, "kl": 0.06396484375, "learning_rate": 3.2266666666666664e-07, "loss": 0.0026, "reward": 1.5398437976837158, "reward_std": 0.13100676238536835, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.5398437976837158, "step": 2032 }, { "completion_length": 90.875, "epoch": 1.3553333333333333, "grad_norm": 5.505107042269364, "kl": 0.0966796875, "learning_rate": 3.223333333333333e-07, "loss": 0.0039, "reward": 1.879166603088379, "reward_std": 0.027638545259833336, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8791667222976685, "step": 2033 }, { "completion_length": 101.15625, "epoch": 1.3559999999999999, "grad_norm": 7.4261217091963125, "kl": 0.1064453125, "learning_rate": 3.22e-07, "loss": 0.0043, "reward": 1.7572916746139526, "reward_std": 0.15525397658348083, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7885416150093079, "step": 2034 }, { "completion_length": 96.28125, "epoch": 1.3566666666666667, "grad_norm": 7.906139770771684, "kl": 0.08154296875, "learning_rate": 3.216666666666666e-07, "loss": 0.0033, "reward": 1.96875, "reward_std": 0.049292195588350296, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.96875, "step": 2035 }, { "completion_length": 109.3125, "epoch": 1.3573333333333333, "grad_norm": 5.097390264678242, "kl": 0.181640625, "learning_rate": 3.2133333333333335e-07, "loss": 0.0073, "reward": 1.7401041984558105, "reward_std": 0.2331584095954895, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7713541984558105, "step": 2036 }, { "completion_length": 81.40625, "epoch": 1.358, "grad_norm": 3.4625493740394573, "kl": 0.060546875, "learning_rate": 3.21e-07, "loss": 0.0024, "reward": 1.890625, "reward_std": 0.16591878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.921875, "step": 2037 }, { "completion_length": 91.65625, "epoch": 1.3586666666666667, "grad_norm": 1.5550930001065628, "kl": 0.07763671875, "learning_rate": 3.2066666666666665e-07, "loss": 0.0031, "reward": 1.875, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 2038 }, { "completion_length": 89.71875, "epoch": 1.3593333333333333, "grad_norm": 8.8873725148151, "kl": 0.119140625, "learning_rate": 3.2033333333333333e-07, "loss": 0.0048, "reward": 1.8958333730697632, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9270833730697632, "step": 2039 }, { "completion_length": 102.09375, "epoch": 1.3599999999999999, "grad_norm": 3.447722265419072, "kl": 0.07470703125, "learning_rate": 3.2e-07, "loss": 0.003, "reward": 1.7140624523162842, "reward_std": 0.2572740614414215, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.745312511920929, "step": 2040 }, { "completion_length": 91.3125, "epoch": 1.3606666666666667, "grad_norm": 23.702912722632576, "kl": 0.0703125, "learning_rate": 3.1966666666666663e-07, "loss": 0.0028, "reward": 1.8828125, "reward_std": 0.1510416716337204, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8828125, "step": 2041 }, { "completion_length": 101.65625, "epoch": 1.3613333333333333, "grad_norm": 2.6587001538059534, "kl": 0.062255859375, "learning_rate": 3.1933333333333336e-07, "loss": 0.0025, "reward": 1.8958332538604736, "reward_std": 0.05677473545074463, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8958333730697632, "step": 2042 }, { "completion_length": 113.21875, "epoch": 1.362, "grad_norm": 4.417185448294158, "kl": 0.0771484375, "learning_rate": 3.19e-07, "loss": 0.0031, "reward": 1.7645833492279053, "reward_std": 0.2120281308889389, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.8270833492279053, "step": 2043 }, { "completion_length": 102.125, "epoch": 1.3626666666666667, "grad_norm": 1.6966909499651746, "kl": 0.09033203125, "learning_rate": 3.1866666666666666e-07, "loss": 0.0036, "reward": 1.9197916984558105, "reward_std": 0.047766223549842834, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9197916984558105, "step": 2044 }, { "completion_length": 111.0, "epoch": 1.3633333333333333, "grad_norm": 12.245990831642777, "kl": 0.04345703125, "learning_rate": 3.1833333333333334e-07, "loss": 0.0017, "reward": 1.6588542461395264, "reward_std": 0.11145833134651184, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6588541865348816, "step": 2045 }, { "completion_length": 103.6875, "epoch": 1.3639999999999999, "grad_norm": 4.37074593414578, "kl": 0.061767578125, "learning_rate": 3.18e-07, "loss": 0.0025, "reward": 1.7817708253860474, "reward_std": 0.11540654301643372, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7817708253860474, "step": 2046 }, { "completion_length": 110.15625, "epoch": 1.3646666666666667, "grad_norm": 1.280811846491508, "kl": 0.044677734375, "learning_rate": 3.1766666666666664e-07, "loss": 0.0018, "reward": 1.9791667461395264, "reward_std": 0.02405625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9791666269302368, "step": 2047 }, { "completion_length": 99.375, "epoch": 1.3653333333333333, "grad_norm": 1.1795539112601743, "kl": 0.0986328125, "learning_rate": 3.173333333333333e-07, "loss": 0.004, "reward": 1.8041666746139526, "reward_std": 0.022047923877835274, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8041666746139526, "step": 2048 }, { "completion_length": 110.21875, "epoch": 1.366, "grad_norm": 1.06725257138705, "kl": 0.04736328125, "learning_rate": 3.17e-07, "loss": 0.0019, "reward": 1.78125, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 2049 }, { "completion_length": 100.5, "epoch": 1.3666666666666667, "grad_norm": 4.9484164852429755, "kl": 0.04443359375, "learning_rate": 3.166666666666666e-07, "loss": 0.0018, "reward": 1.9239583015441895, "reward_std": 0.07554227858781815, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9239583015441895, "step": 2050 }, { "completion_length": 86.75, "epoch": 1.3673333333333333, "grad_norm": 0.1292918812255821, "kl": 0.0673828125, "learning_rate": 3.1633333333333335e-07, "loss": 0.0027, "reward": 2.0, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 1.0, "step": 2051 }, { "completion_length": 102.875, "epoch": 1.3679999999999999, "grad_norm": 5.542249607587478, "kl": 0.083984375, "learning_rate": 3.1599999999999997e-07, "loss": 0.0034, "reward": 1.9578125476837158, "reward_std": 0.03437499701976776, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9578125476837158, "step": 2052 }, { "completion_length": 91.21875, "epoch": 1.3686666666666667, "grad_norm": 1.3971975872143259, "kl": 0.06494140625, "learning_rate": 3.1566666666666665e-07, "loss": 0.0026, "reward": 1.881250023841858, "reward_std": 0.13466878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8812500238418579, "step": 2053 }, { "completion_length": 110.8125, "epoch": 1.3693333333333333, "grad_norm": 11.546228449850469, "kl": 0.07080078125, "learning_rate": 3.153333333333333e-07, "loss": 0.0028, "reward": 1.75, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.78125, "step": 2054 }, { "completion_length": 99.46875, "epoch": 1.37, "grad_norm": 1.0163402931207584, "kl": 0.08642578125, "learning_rate": 3.15e-07, "loss": 0.0035, "reward": 1.8125, "reward_std": 0.07216878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.84375, "step": 2055 }, { "completion_length": 97.71875, "epoch": 1.3706666666666667, "grad_norm": 3.7806631195379357, "kl": 0.099609375, "learning_rate": 3.146666666666666e-07, "loss": 0.004, "reward": 1.6609375476837158, "reward_std": 0.22550055384635925, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.692187488079071, "step": 2056 }, { "completion_length": 99.8125, "epoch": 1.3713333333333333, "grad_norm": 2.518089775327889, "kl": 0.07470703125, "learning_rate": 3.1433333333333336e-07, "loss": 0.003, "reward": 1.8078124523162842, "reward_std": 0.03878508508205414, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8078125715255737, "step": 2057 }, { "completion_length": 94.375, "epoch": 1.3719999999999999, "grad_norm": 3.7014555867227537, "kl": 0.08544921875, "learning_rate": 3.14e-07, "loss": 0.0034, "reward": 1.8489583730697632, "reward_std": 0.1770833283662796, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8802083730697632, "step": 2058 }, { "completion_length": 103.375, "epoch": 1.3726666666666667, "grad_norm": 3.073161671838763, "kl": 0.058837890625, "learning_rate": 3.1366666666666666e-07, "loss": 0.0024, "reward": 1.7916667461395264, "reward_std": 0.19952812790870667, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8229166269302368, "step": 2059 }, { "completion_length": 93.71875, "epoch": 1.3733333333333333, "grad_norm": 5.967419224720931, "kl": 0.09375, "learning_rate": 3.1333333333333333e-07, "loss": 0.0037, "reward": 1.8333333730697632, "reward_std": 0.0208333320915699, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 2060 }, { "completion_length": 95.34375, "epoch": 1.374, "grad_norm": 2.4870866147238524, "kl": 0.0859375, "learning_rate": 3.13e-07, "loss": 0.0034, "reward": 1.9088542461395264, "reward_std": 0.08098871260881424, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9088541269302368, "step": 2061 }, { "completion_length": 100.75, "epoch": 1.3746666666666667, "grad_norm": 3.06139831421547, "kl": 0.052734375, "learning_rate": 3.1266666666666663e-07, "loss": 0.0021, "reward": 1.7890625, "reward_std": 0.16502517461776733, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7890625, "step": 2062 }, { "completion_length": 99.46875, "epoch": 1.3753333333333333, "grad_norm": 1.9717800244336405, "kl": 0.06396484375, "learning_rate": 3.1233333333333336e-07, "loss": 0.0026, "reward": 1.946874976158142, "reward_std": 0.029321927577257156, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9468750357627869, "step": 2063 }, { "completion_length": 98.53125, "epoch": 1.376, "grad_norm": 2.403007691298716, "kl": 0.07177734375, "learning_rate": 3.12e-07, "loss": 0.0029, "reward": 1.9114583730697632, "reward_std": 0.0729166716337204, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.9427083134651184, "step": 2064 }, { "completion_length": 93.46875, "epoch": 1.3766666666666667, "grad_norm": 3.910734506675865, "kl": 0.07373046875, "learning_rate": 3.116666666666666e-07, "loss": 0.003, "reward": 1.7291667461395264, "reward_std": 0.15692904591560364, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7604166269302368, "step": 2065 }, { "completion_length": 95.5, "epoch": 1.3773333333333333, "grad_norm": 1.35062290261048, "kl": 0.07763671875, "learning_rate": 3.1133333333333334e-07, "loss": 0.0031, "reward": 1.8229167461395264, "reward_std": 0.08655625581741333, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8229166269302368, "step": 2066 }, { "completion_length": 100.96875, "epoch": 1.3780000000000001, "grad_norm": 2.849124474568225, "kl": 0.062255859375, "learning_rate": 3.1099999999999997e-07, "loss": 0.0025, "reward": 1.8671875, "reward_std": 0.140625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8984375, "step": 2067 }, { "completion_length": 90.03125, "epoch": 1.3786666666666667, "grad_norm": 2.5460931107128437, "kl": 0.050048828125, "learning_rate": 3.1066666666666664e-07, "loss": 0.002, "reward": 1.6875, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.71875, "step": 2068 }, { "completion_length": 109.96875, "epoch": 1.3793333333333333, "grad_norm": 1.7829092109995426, "kl": 0.07275390625, "learning_rate": 3.103333333333333e-07, "loss": 0.0029, "reward": 1.7999999523162842, "reward_std": 0.24149583280086517, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.862500011920929, "step": 2069 }, { "completion_length": 108.28125, "epoch": 1.38, "grad_norm": 2.978362513770888, "kl": 0.09521484375, "learning_rate": 3.1e-07, "loss": 0.0038, "reward": 1.6302083730697632, "reward_std": 0.23296192288398743, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.6302083730697632, "step": 2070 }, { "completion_length": 97.0625, "epoch": 1.3806666666666667, "grad_norm": 45.31627541991653, "kl": 0.0654296875, "learning_rate": 3.096666666666666e-07, "loss": 0.0026, "reward": 1.91015625, "reward_std": 0.040061552077531815, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.91015625, "step": 2071 }, { "completion_length": 96.25, "epoch": 1.3813333333333333, "grad_norm": 1.275968052564133, "kl": 0.064453125, "learning_rate": 3.0933333333333335e-07, "loss": 0.0026, "reward": 1.8567708730697632, "reward_std": 0.021474504843354225, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8567708730697632, "step": 2072 }, { "completion_length": 85.53125, "epoch": 1.3820000000000001, "grad_norm": 2.3364707230857, "kl": 0.038330078125, "learning_rate": 3.09e-07, "loss": 0.0015, "reward": 1.8333333730697632, "reward_std": 0.09858439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8333333730697632, "step": 2073 }, { "completion_length": 85.40625, "epoch": 1.3826666666666667, "grad_norm": 1.2758431950503, "kl": 0.048828125, "learning_rate": 3.0866666666666665e-07, "loss": 0.002, "reward": 1.9895833730697632, "reward_std": 0.020833328366279602, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9895833730697632, "step": 2074 }, { "completion_length": 106.3125, "epoch": 1.3833333333333333, "grad_norm": 4.1999307873944955, "kl": 0.0556640625, "learning_rate": 3.0833333333333333e-07, "loss": 0.0022, "reward": 1.7578125, "reward_std": 0.140625, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7578125, "step": 2075 }, { "completion_length": 90.90625, "epoch": 1.384, "grad_norm": 2.9407152558991325, "kl": 0.06591796875, "learning_rate": 3.08e-07, "loss": 0.0026, "reward": 1.7125000953674316, "reward_std": 0.04124574363231659, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7124999761581421, "step": 2076 }, { "completion_length": 109.71875, "epoch": 1.3846666666666667, "grad_norm": 2.5703790018505135, "kl": 0.083984375, "learning_rate": 3.0766666666666663e-07, "loss": 0.0034, "reward": 1.7999999523162842, "reward_std": 0.07780931144952774, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.800000011920929, "step": 2077 }, { "completion_length": 96.75, "epoch": 1.3853333333333333, "grad_norm": 5.525008282855642, "kl": 0.0732421875, "learning_rate": 3.0733333333333336e-07, "loss": 0.0029, "reward": 1.6822917461395264, "reward_std": 0.35926172137260437, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7447916269302368, "step": 2078 }, { "completion_length": 87.1875, "epoch": 1.3860000000000001, "grad_norm": 2.068431200143079, "kl": 0.07861328125, "learning_rate": 3.07e-07, "loss": 0.0031, "reward": 1.8489583730697632, "reward_std": 0.12425211071968079, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8489583730697632, "step": 2079 }, { "completion_length": 110.25, "epoch": 1.3866666666666667, "grad_norm": 3.975081216048427, "kl": 0.1376953125, "learning_rate": 3.066666666666666e-07, "loss": 0.0055, "reward": 1.6586050987243652, "reward_std": 0.3607383072376251, "rewards/format_reward": 0.9375, "rewards/iou_reward": 0.7211050987243652, "step": 2080 }, { "completion_length": 96.34375, "epoch": 1.3873333333333333, "grad_norm": 2.147285094590146, "kl": 0.08203125, "learning_rate": 3.0633333333333334e-07, "loss": 0.0033, "reward": 1.875, "reward_std": 0.19716878235340118, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.90625, "step": 2081 }, { "completion_length": 106.375, "epoch": 1.388, "grad_norm": 2.6145570425574043, "kl": 0.08349609375, "learning_rate": 3.0599999999999996e-07, "loss": 0.0033, "reward": 1.6786458492279053, "reward_std": 0.17267119884490967, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7098958492279053, "step": 2082 }, { "completion_length": 89.8125, "epoch": 1.3886666666666667, "grad_norm": 3.3866814464781507, "kl": 0.06884765625, "learning_rate": 3.0566666666666664e-07, "loss": 0.0028, "reward": 1.71875, "reward_std": 0.25966876745224, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.75, "step": 2083 }, { "completion_length": 108.40625, "epoch": 1.3893333333333333, "grad_norm": 5.151559623298027, "kl": 0.10009765625, "learning_rate": 3.053333333333333e-07, "loss": 0.004, "reward": 1.7541667222976685, "reward_std": 0.12066175043582916, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7541666626930237, "step": 2084 }, { "completion_length": 103.34375, "epoch": 1.3900000000000001, "grad_norm": 4.931641752361429, "kl": 0.052734375, "learning_rate": 3.05e-07, "loss": 0.0021, "reward": 1.78125, "reward_std": 0.07216878235340118, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.78125, "step": 2085 }, { "completion_length": 93.4375, "epoch": 1.3906666666666667, "grad_norm": 1.5989249865792254, "kl": 0.0625, "learning_rate": 3.046666666666666e-07, "loss": 0.0025, "reward": 1.71875, "reward_std": 0.03608439117670059, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.71875, "step": 2086 }, { "completion_length": 106.15625, "epoch": 1.3913333333333333, "grad_norm": 1.0579562227675734, "kl": 0.0576171875, "learning_rate": 3.0433333333333335e-07, "loss": 0.0023, "reward": 1.5520833730697632, "reward_std": 0.0625, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.5833333730697632, "step": 2087 }, { "completion_length": 96.84375, "epoch": 1.392, "grad_norm": 2.074401701686187, "kl": 0.07080078125, "learning_rate": 3.0399999999999997e-07, "loss": 0.0028, "reward": 1.9229166507720947, "reward_std": 0.1041666567325592, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9229166507720947, "step": 2088 }, { "completion_length": 110.8125, "epoch": 1.3926666666666667, "grad_norm": 2.094186821354729, "kl": 0.0537109375, "learning_rate": 3.0366666666666665e-07, "loss": 0.0022, "reward": 1.90625, "reward_std": 0.1875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.90625, "step": 2089 }, { "completion_length": 95.71875, "epoch": 1.3933333333333333, "grad_norm": 4.387949804731543, "kl": 0.056396484375, "learning_rate": 3.033333333333333e-07, "loss": 0.0023, "reward": 1.879166603088379, "reward_std": 0.09920123964548111, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8791667222976685, "step": 2090 }, { "completion_length": 113.3125, "epoch": 1.3940000000000001, "grad_norm": 0.6342822071683273, "kl": 0.051025390625, "learning_rate": 3.03e-07, "loss": 0.002, "reward": 1.78125, "reward_std": 0.125, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.8125, "step": 2091 }, { "completion_length": 89.90625, "epoch": 1.3946666666666667, "grad_norm": 2.131332104020387, "kl": 0.041748046875, "learning_rate": 3.026666666666666e-07, "loss": 0.0017, "reward": 1.7916667461395264, "reward_std": 0.2169627845287323, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.7916666865348816, "step": 2092 }, { "completion_length": 103.21875, "epoch": 1.3953333333333333, "grad_norm": 20.82165795019999, "kl": 0.064453125, "learning_rate": 3.0233333333333335e-07, "loss": 0.0026, "reward": 1.9479167461395264, "reward_std": 0.0797588899731636, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9479167461395264, "step": 2093 }, { "completion_length": 95.03125, "epoch": 1.396, "grad_norm": 0.275030257432285, "kl": 0.078125, "learning_rate": 3.02e-07, "loss": 0.0031, "reward": 1.9583333730697632, "reward_std": 0.0, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9583333134651184, "step": 2094 }, { "completion_length": 97.46875, "epoch": 1.3966666666666667, "grad_norm": 3.3278541732416, "kl": 0.10302734375, "learning_rate": 3.0166666666666665e-07, "loss": 0.0041, "reward": 1.8307292461395264, "reward_std": 0.1231372058391571, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8307291865348816, "step": 2095 }, { "completion_length": 99.90625, "epoch": 1.3973333333333333, "grad_norm": 2.102478432188712, "kl": 0.0791015625, "learning_rate": 3.0133333333333333e-07, "loss": 0.0032, "reward": 1.9453125, "reward_std": 0.046875, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.9453125, "step": 2096 }, { "completion_length": 114.9375, "epoch": 1.3980000000000001, "grad_norm": 3.6125415060569455, "kl": 0.052001953125, "learning_rate": 3.0099999999999996e-07, "loss": 0.0021, "reward": 1.659895896911621, "reward_std": 0.2294604480266571, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.6911458373069763, "step": 2097 }, { "completion_length": 105.25, "epoch": 1.3986666666666667, "grad_norm": 1.4839878564236268, "kl": 0.04736328125, "learning_rate": 3.006666666666667e-07, "loss": 0.0019, "reward": 1.7395833730697632, "reward_std": 0.16108438372612, "rewards/format_reward": 0.96875, "rewards/iou_reward": 0.7708333730697632, "step": 2098 }, { "completion_length": 100.75, "epoch": 1.3993333333333333, "grad_norm": 5.57492795330445, "kl": 0.06884765625, "learning_rate": 3.003333333333333e-07, "loss": 0.0028, "reward": 1.800520896911621, "reward_std": 0.10432139784097672, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8005208373069763, "step": 2099 }, { "completion_length": 95.28125, "epoch": 1.4, "grad_norm": 3.5131056453411826, "kl": 0.0712890625, "learning_rate": 3e-07, "loss": 0.0028, "reward": 1.8666666746139526, "reward_std": 0.09905625879764557, "rewards/format_reward": 1.0, "rewards/iou_reward": 0.8666666746139526, "step": 2100 } ], "logging_steps": 1.0, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }