{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5263157894736842, "eval_steps": 50, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.125, "completions/max_terminated_length": 451.125, "completions/mean_length": 241.560546875, "completions/mean_terminated_length": 241.560546875, "completions/min_length": 111.75, "completions/min_terminated_length": 111.75, "epoch": 0.002105263157894737, "grad_norm": 0.0383942686021328, "learning_rate": 0.0, "loss": -0.0025, "num_tokens": 484639.0, "reward": 0.9500823765993118, "reward_std": 0.6353622525930405, "rewards/format_reward_embodied/mean": 0.501953125, "rewards/format_reward_embodied/std": 0.4904305227100849, "rewards/stop_prediction_reward/mean": 0.439453125, "rewards/stop_prediction_reward/std": 0.3919360339641571, "rewards/waypoint_pred_accuracy/mean": 0.004338064874811504, "rewards/waypoint_pred_accuracy/std": 0.012508220294698669, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 985.25, "completions/max_terminated_length": 985.25, "completions/mean_length": 266.818359375, "completions/mean_terminated_length": 267.813928604126, "completions/min_length": 94.0, "completions/min_terminated_length": 119.375, "epoch": 0.004210526315789474, "grad_norm": 0.041216954588890076, "learning_rate": 2.083333333333333e-08, "loss": 0.0025, "num_tokens": 982274.0, "reward": 0.9463644102215767, "reward_std": 0.6123590245842934, "rewards/format_reward_embodied/mean": 0.50390625, "rewards/format_reward_embodied/std": 0.49139947816729546, "rewards/stop_prediction_reward/mean": 0.42578125, "rewards/stop_prediction_reward/std": 0.3781757093966007, "rewards/waypoint_pred_accuracy/mean": 0.008338454590998856, "rewards/waypoint_pred_accuracy/std": 0.02023129865005227, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 247.1015625, "completions/mean_terminated_length": 247.1015625, "completions/min_length": 117.875, "completions/min_terminated_length": 117.875, "epoch": 0.00631578947368421, "grad_norm": 0.035625893622636795, "learning_rate": 4.166666666666666e-08, "loss": -0.001, "num_tokens": 1467318.0, "reward": 0.9623514339327812, "reward_std": 0.6189808771014214, "rewards/format_reward_embodied/mean": 0.498046875, "rewards/format_reward_embodied/std": 0.4842093959450722, "rewards/stop_prediction_reward/mean": 0.443359375, "rewards/stop_prediction_reward/std": 0.44101808220148087, "rewards/waypoint_pred_accuracy/mean": 0.010472597823421942, "rewards/waypoint_pred_accuracy/std": 0.03422736286900763, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.5, "completions/max_terminated_length": 509.5, "completions/mean_length": 237.654296875, "completions/mean_terminated_length": 237.654296875, "completions/min_length": 113.375, "completions/min_terminated_length": 113.375, "epoch": 0.008421052631578947, "grad_norm": 0.03751353174448013, "learning_rate": 6.25e-08, "loss": -0.0035, "num_tokens": 1950021.0, "reward": 0.9695519432425499, "reward_std": 0.6773256361484528, "rewards/format_reward_embodied/mean": 0.4765625, "rewards/format_reward_embodied/std": 0.4892418272793293, "rewards/stop_prediction_reward/mean": 0.421875, "rewards/stop_prediction_reward/std": 0.3986336216330528, "rewards/waypoint_pred_accuracy/mean": 0.03555722400778907, "rewards/waypoint_pred_accuracy/std": 0.08721220167353771, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.375, "completions/max_terminated_length": 813.375, "completions/mean_length": 265.58984375, "completions/mean_terminated_length": 265.58984375, "completions/min_length": 116.625, "completions/min_terminated_length": 116.625, "epoch": 0.010526315789473684, "grad_norm": 0.0453697107732296, "learning_rate": 8.333333333333333e-08, "loss": 0.0003, "num_tokens": 2446643.0, "reward": 0.9188483878970146, "reward_std": 0.610739640891552, "rewards/format_reward_embodied/mean": 0.5625, "rewards/format_reward_embodied/std": 0.4938563257455826, "rewards/stop_prediction_reward/mean": 0.345703125, "rewards/stop_prediction_reward/std": 0.4086693823337555, "rewards/waypoint_pred_accuracy/mean": 0.005322630658819445, "rewards/waypoint_pred_accuracy/std": 0.016403043182279513, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 252.169921875, "completions/mean_terminated_length": 252.66043663024902, "completions/min_length": 102.375, "completions/min_terminated_length": 118.25, "epoch": 0.01263157894736842, "grad_norm": 0.049299150705337524, "learning_rate": 1.0416666666666667e-07, "loss": 0.0011, "num_tokens": 2935818.0, "reward": 0.877131775021553, "reward_std": 0.6145607680082321, "rewards/format_reward_embodied/mean": 0.509765625, "rewards/format_reward_embodied/std": 0.4843035563826561, "rewards/stop_prediction_reward/mean": 0.3671875, "rewards/stop_prediction_reward/std": 0.38137195259332657, "rewards/waypoint_pred_accuracy/mean": 8.932340400540724e-05, "rewards/waypoint_pred_accuracy/std": 0.0005101569792639827, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.75, "completions/max_terminated_length": 1163.75, "completions/mean_length": 255.529296875, "completions/mean_terminated_length": 255.529296875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.014736842105263158, "grad_norm": 0.036646511405706406, "learning_rate": 1.25e-07, "loss": 0.001, "num_tokens": 3427097.0, "reward": 0.6614178493618965, "reward_std": 0.6105708554387093, "rewards/format_reward_embodied/mean": 0.46484375, "rewards/format_reward_embodied/std": 0.48722705617547035, "rewards/stop_prediction_reward/mean": 0.173828125, "rewards/stop_prediction_reward/std": 0.3640986457467079, "rewards/waypoint_pred_accuracy/mean": 0.011372994726074323, "rewards/waypoint_pred_accuracy/std": 0.04579899070052374, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.25, "completions/max_terminated_length": 467.25, "completions/mean_length": 246.98828125, "completions/mean_terminated_length": 246.98828125, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "epoch": 0.016842105263157894, "grad_norm": 0.047037359327077866, "learning_rate": 1.4583333333333335e-07, "loss": -0.0023, "num_tokens": 3915411.0, "reward": 0.9769175350666046, "reward_std": 0.6314118355512619, "rewards/format_reward_embodied/mean": 0.568359375, "rewards/format_reward_embodied/std": 0.4869700260460377, "rewards/stop_prediction_reward/mean": 0.376953125, "rewards/stop_prediction_reward/std": 0.4093479886651039, "rewards/waypoint_pred_accuracy/mean": 0.015802525533167768, "rewards/waypoint_pred_accuracy/std": 0.032816135895782333, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.125, "completions/max_terminated_length": 481.125, "completions/mean_length": 241.935546875, "completions/mean_terminated_length": 241.935546875, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.018947368421052633, "grad_norm": 0.04171831160783768, "learning_rate": 1.6666666666666665e-07, "loss": -0.0015, "num_tokens": 4397042.0, "reward": 0.9027341902256012, "reward_std": 0.6853612437844276, "rewards/format_reward_embodied/mean": 0.48828125, "rewards/format_reward_embodied/std": 0.4951612576842308, "rewards/stop_prediction_reward/mean": 0.322265625, "rewards/stop_prediction_reward/std": 0.4115743637084961, "rewards/waypoint_pred_accuracy/mean": 0.04609366483055356, "rewards/waypoint_pred_accuracy/std": 0.09995413944127579, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.625, "completions/max_terminated_length": 695.625, "completions/mean_length": 261.609375, "completions/mean_terminated_length": 261.609375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.021052631578947368, "grad_norm": 0.03897935897111893, "learning_rate": 1.875e-07, "loss": 0.0002, "num_tokens": 4893354.0, "reward": 1.0423217862844467, "reward_std": 0.6282145008444786, "rewards/format_reward_embodied/mean": 0.548828125, "rewards/format_reward_embodied/std": 0.48774589598178864, "rewards/stop_prediction_reward/mean": 0.478515625, "rewards/stop_prediction_reward/std": 0.4023555275052786, "rewards/waypoint_pred_accuracy/mean": 0.007489029231998282, "rewards/waypoint_pred_accuracy/std": 0.0251983865261218, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.375, "completions/max_terminated_length": 470.375, "completions/mean_length": 253.248046875, "completions/mean_terminated_length": 253.248046875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.023157894736842106, "grad_norm": 0.039429888129234314, "learning_rate": 2.0833333333333333e-07, "loss": -0.0022, "num_tokens": 5384873.0, "reward": 0.7660543769598007, "reward_std": 0.5604145936667919, "rewards/format_reward_embodied/mean": 0.607421875, "rewards/format_reward_embodied/std": 0.479397177696228, "rewards/stop_prediction_reward/mean": 0.15625, "rewards/stop_prediction_reward/std": 0.3427934180945158, "rewards/waypoint_pred_accuracy/mean": 0.0011912494257570604, "rewards/waypoint_pred_accuracy/std": 0.00802605507872417, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 263.59375, "completions/mean_terminated_length": 264.0594940185547, "completions/min_length": 99.75, "completions/min_terminated_length": 114.625, "epoch": 0.02526315789473684, "grad_norm": 0.030105428770184517, "learning_rate": 2.2916666666666663e-07, "loss": 0.0033, "num_tokens": 5878681.0, "reward": 1.0901148244738579, "reward_std": 0.6814222931861877, "rewards/format_reward_embodied/mean": 0.6171875, "rewards/format_reward_embodied/std": 0.46372338756918907, "rewards/stop_prediction_reward/mean": 0.400390625, "rewards/stop_prediction_reward/std": 0.4396743141114712, "rewards/waypoint_pred_accuracy/mean": 0.036268358699724924, "rewards/waypoint_pred_accuracy/std": 0.07070713029423034, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.875, "completions/max_terminated_length": 476.875, "completions/mean_length": 257.3125, "completions/mean_terminated_length": 257.3125, "completions/min_length": 118.625, "completions/min_terminated_length": 118.625, "epoch": 0.02736842105263158, "grad_norm": 0.03954648971557617, "learning_rate": 2.5e-07, "loss": -0.0013, "num_tokens": 6370745.0, "reward": 1.1011288091540337, "reward_std": 0.6459922045469284, "rewards/format_reward_embodied/mean": 0.634765625, "rewards/format_reward_embodied/std": 0.47677353397011757, "rewards/stop_prediction_reward/mean": 0.44140625, "rewards/stop_prediction_reward/std": 0.42084217444062233, "rewards/waypoint_pred_accuracy/mean": 0.012478479564244083, "rewards/waypoint_pred_accuracy/std": 0.04567733465950141, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.5, "completions/max_terminated_length": 1179.5, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 120.375, "completions/min_terminated_length": 120.375, "epoch": 0.029473684210526315, "grad_norm": 0.03879372030496597, "learning_rate": 2.708333333333333e-07, "loss": 0.0013, "num_tokens": 6872585.0, "reward": 1.069977581501007, "reward_std": 0.6529600322246552, "rewards/format_reward_embodied/mean": 0.626953125, "rewards/format_reward_embodied/std": 0.46956589445471764, "rewards/stop_prediction_reward/mean": 0.412109375, "rewards/stop_prediction_reward/std": 0.42556022480130196, "rewards/waypoint_pred_accuracy/mean": 0.015457541714965616, "rewards/waypoint_pred_accuracy/std": 0.05261327166544845, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.75, "completions/max_terminated_length": 456.75, "completions/mean_length": 247.5078125, "completions/mean_terminated_length": 247.5078125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.031578947368421054, "grad_norm": 0.03746689483523369, "learning_rate": 2.916666666666667e-07, "loss": -0.004, "num_tokens": 7358669.0, "reward": 0.816399596631527, "reward_std": 0.6439896002411842, "rewards/format_reward_embodied/mean": 0.5546875, "rewards/format_reward_embodied/std": 0.49549105390906334, "rewards/stop_prediction_reward/mean": 0.216796875, "rewards/stop_prediction_reward/std": 0.3949273619800806, "rewards/waypoint_pred_accuracy/mean": 0.022457610069225337, "rewards/waypoint_pred_accuracy/std": 0.04548973154789149, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.25, "completions/max_terminated_length": 501.25, "completions/mean_length": 254.716796875, "completions/mean_terminated_length": 254.716796875, "completions/min_length": 120.375, "completions/min_terminated_length": 120.375, "epoch": 0.03368421052631579, "grad_norm": 0.053112324327230453, "learning_rate": 3.1249999999999997e-07, "loss": 0.0002, "num_tokens": 7852156.0, "reward": 1.2096271365880966, "reward_std": 0.6100753545761108, "rewards/format_reward_embodied/mean": 0.76171875, "rewards/format_reward_embodied/std": 0.4093044362962246, "rewards/stop_prediction_reward/mean": 0.4453125, "rewards/stop_prediction_reward/std": 0.4225916638970375, "rewards/waypoint_pred_accuracy/mean": 0.0012979521083353873, "rewards/waypoint_pred_accuracy/std": 0.009194809671299708, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.375, "completions/max_terminated_length": 472.375, "completions/mean_length": 247.376953125, "completions/mean_terminated_length": 247.376953125, "completions/min_length": 116.25, "completions/min_terminated_length": 116.25, "epoch": 0.035789473684210524, "grad_norm": 0.03221385180950165, "learning_rate": 3.333333333333333e-07, "loss": -0.0006, "num_tokens": 8339517.0, "reward": 1.1803481727838516, "reward_std": 0.5297410599887371, "rewards/format_reward_embodied/mean": 0.814453125, "rewards/format_reward_embodied/std": 0.3874172270298004, "rewards/stop_prediction_reward/mean": 0.345703125, "rewards/stop_prediction_reward/std": 0.3674583863466978, "rewards/waypoint_pred_accuracy/mean": 0.010095963222276419, "rewards/waypoint_pred_accuracy/std": 0.0323235778856652, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.875, "completions/max_terminated_length": 811.875, "completions/mean_length": 254.984375, "completions/mean_terminated_length": 254.984375, "completions/min_length": 112.625, "completions/min_terminated_length": 112.625, "epoch": 0.037894736842105266, "grad_norm": 0.034303538501262665, "learning_rate": 3.541666666666667e-07, "loss": 0.0065, "num_tokens": 8828533.0, "reward": 1.0827482342720032, "reward_std": 0.49626101925969124, "rewards/format_reward_embodied/mean": 0.84765625, "rewards/format_reward_embodied/std": 0.35438157618045807, "rewards/stop_prediction_reward/mean": 0.234375, "rewards/stop_prediction_reward/std": 0.360489659011364, "rewards/waypoint_pred_accuracy/mean": 0.00035850519751079446, "rewards/waypoint_pred_accuracy/std": 0.000990356254078506, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 469.5, "completions/max_terminated_length": 469.5, "completions/mean_length": 242.603515625, "completions/mean_terminated_length": 243.05292129516602, "completions/min_length": 99.5, "completions/min_terminated_length": 107.75, "epoch": 0.04, "grad_norm": 0.03210434690117836, "learning_rate": 3.75e-07, "loss": 0.0002, "num_tokens": 9313706.0, "reward": 1.1656895354390144, "reward_std": 0.5405256152153015, "rewards/format_reward_embodied/mean": 0.841796875, "rewards/format_reward_embodied/std": 0.35758682526648045, "rewards/stop_prediction_reward/mean": 0.322265625, "rewards/stop_prediction_reward/std": 0.4072440378367901, "rewards/waypoint_pred_accuracy/mean": 0.0008135107927961789, "rewards/waypoint_pred_accuracy/std": 0.004946927132555481, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.5, "completions/max_terminated_length": 887.5, "completions/mean_length": 257.341796875, "completions/mean_terminated_length": 257.341796875, "completions/min_length": 117.75, "completions/min_terminated_length": 117.75, "epoch": 0.042105263157894736, "grad_norm": 0.023722035810351372, "learning_rate": 3.958333333333333e-07, "loss": 0.0063, "num_tokens": 9807001.0, "reward": 1.2326279431581497, "reward_std": 0.5118205770850182, "rewards/format_reward_embodied/mean": 0.880859375, "rewards/format_reward_embodied/std": 0.31777896732091904, "rewards/stop_prediction_reward/mean": 0.29296875, "rewards/stop_prediction_reward/std": 0.3428589329123497, "rewards/waypoint_pred_accuracy/mean": 0.029399914224090686, "rewards/waypoint_pred_accuracy/std": 0.06371456215110564, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.25, "completions/max_terminated_length": 1251.25, "completions/mean_length": 258.982421875, "completions/mean_terminated_length": 258.982421875, "completions/min_length": 107.75, "completions/min_terminated_length": 107.75, "epoch": 0.04421052631578947, "grad_norm": 0.027127819135785103, "learning_rate": 4.1666666666666667e-07, "loss": 0.0107, "num_tokens": 10300304.0, "reward": 1.3775597661733627, "reward_std": 0.5649962350726128, "rewards/format_reward_embodied/mean": 0.880859375, "rewards/format_reward_embodied/std": 0.3204925637692213, "rewards/stop_prediction_reward/mean": 0.451171875, "rewards/stop_prediction_reward/std": 0.4135790057480335, "rewards/waypoint_pred_accuracy/mean": 0.02276425497597198, "rewards/waypoint_pred_accuracy/std": 0.053010769921375774, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.5, "completions/max_terminated_length": 764.5, "completions/mean_length": 247.626953125, "completions/mean_terminated_length": 247.626953125, "completions/min_length": 120.625, "completions/min_terminated_length": 120.625, "epoch": 0.04631578947368421, "grad_norm": 0.026827372610569, "learning_rate": 4.375e-07, "loss": 0.0046, "num_tokens": 10787473.0, "reward": 1.2097989320755005, "reward_std": 0.4106667507439852, "rewards/format_reward_embodied/mean": 0.9609375, "rewards/format_reward_embodied/std": 0.1782014612108469, "rewards/stop_prediction_reward/mean": 0.248046875, "rewards/stop_prediction_reward/std": 0.3576600421220064, "rewards/waypoint_pred_accuracy/mean": 0.0004072752802812829, "rewards/waypoint_pred_accuracy/std": 0.001209557721267629, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.125, "completions/max_terminated_length": 699.125, "completions/mean_length": 247.458984375, "completions/mean_terminated_length": 247.458984375, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.04842105263157895, "grad_norm": 0.024113576859235764, "learning_rate": 4.5833333333333327e-07, "loss": 0.0032, "num_tokens": 11273276.0, "reward": 1.3881124705076218, "reward_std": 0.5006838031113148, "rewards/format_reward_embodied/mean": 0.943359375, "rewards/format_reward_embodied/std": 0.22258390858769417, "rewards/stop_prediction_reward/mean": 0.42578125, "rewards/stop_prediction_reward/std": 0.4117406941950321, "rewards/waypoint_pred_accuracy/mean": 0.00948592593158537, "rewards/waypoint_pred_accuracy/std": 0.03859481842846435, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1426.0, "completions/max_terminated_length": 1426.0, "completions/mean_length": 270.671875, "completions/mean_terminated_length": 271.2647590637207, "completions/min_length": 100.0, "completions/min_terminated_length": 115.5, "epoch": 0.05052631578947368, "grad_norm": 0.021521741524338722, "learning_rate": 4.791666666666667e-07, "loss": 0.0138, "num_tokens": 11772628.0, "reward": 1.412862166762352, "reward_std": 0.4836365692317486, "rewards/format_reward_embodied/mean": 0.94140625, "rewards/format_reward_embodied/std": 0.22402114421129227, "rewards/stop_prediction_reward/mean": 0.46875, "rewards/stop_prediction_reward/std": 0.4050610587000847, "rewards/waypoint_pred_accuracy/mean": 0.0013529594958612285, "rewards/waypoint_pred_accuracy/std": 0.008442860726859186, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.375, "completions/max_terminated_length": 503.375, "completions/mean_length": 252.7265625, "completions/mean_terminated_length": 252.7265625, "completions/min_length": 115.75, "completions/min_terminated_length": 115.75, "epoch": 0.05263157894736842, "grad_norm": 0.023303357884287834, "learning_rate": 5e-07, "loss": 0.0022, "num_tokens": 12260936.0, "reward": 1.3870358616113663, "reward_std": 0.4634270928800106, "rewards/format_reward_embodied/mean": 0.97265625, "rewards/format_reward_embodied/std": 0.14094455912709236, "rewards/stop_prediction_reward/mean": 0.404296875, "rewards/stop_prediction_reward/std": 0.41460882127285004, "rewards/waypoint_pred_accuracy/mean": 0.005041355174832356, "rewards/waypoint_pred_accuracy/std": 0.030215839982022222, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 802.125, "completions/max_terminated_length": 802.125, "completions/mean_length": 252.96484375, "completions/mean_terminated_length": 252.96484375, "completions/min_length": 117.625, "completions/min_terminated_length": 117.625, "epoch": 0.05473684210526316, "grad_norm": 0.022509992122650146, "learning_rate": 5.208333333333334e-07, "loss": 0.0072, "num_tokens": 12752118.0, "reward": 1.3777707070112228, "reward_std": 0.4540855921804905, "rewards/format_reward_embodied/mean": 0.962890625, "rewards/format_reward_embodied/std": 0.1879090555012226, "rewards/stop_prediction_reward/mean": 0.388671875, "rewards/stop_prediction_reward/std": 0.3826281502842903, "rewards/waypoint_pred_accuracy/mean": 0.013104110299219339, "rewards/waypoint_pred_accuracy/std": 0.023219284697190704, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1812.5, "completions/max_terminated_length": 1812.5, "completions/mean_length": 276.763671875, "completions/mean_terminated_length": 277.41750717163086, "completions/min_length": 101.75, "completions/min_terminated_length": 118.5, "epoch": 0.056842105263157895, "grad_norm": 0.025979243218898773, "learning_rate": 5.416666666666666e-07, "loss": 0.0201, "num_tokens": 13257853.0, "reward": 1.195212036371231, "reward_std": 0.4585261270403862, "rewards/format_reward_embodied/mean": 0.9453125, "rewards/format_reward_embodied/std": 0.21002393402159214, "rewards/stop_prediction_reward/mean": 0.248046875, "rewards/stop_prediction_reward/std": 0.3809744007885456, "rewards/waypoint_pred_accuracy/mean": 0.0009263323242719891, "rewards/waypoint_pred_accuracy/std": 0.0071097194065939074, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.875, "completions/max_terminated_length": 1423.875, "completions/mean_length": 266.958984375, "completions/mean_terminated_length": 266.958984375, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "epoch": 0.05894736842105263, "grad_norm": 0.02617989294230938, "learning_rate": 5.625e-07, "loss": 0.0123, "num_tokens": 13756648.0, "reward": 1.4108145833015442, "reward_std": 0.4676571935415268, "rewards/format_reward_embodied/mean": 0.96484375, "rewards/format_reward_embodied/std": 0.1660303734242916, "rewards/stop_prediction_reward/mean": 0.427734375, "rewards/stop_prediction_reward/std": 0.40000360645353794, "rewards/waypoint_pred_accuracy/mean": 0.009118233890500024, "rewards/waypoint_pred_accuracy/std": 0.03189600147387662, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.375, "completions/max_terminated_length": 790.375, "completions/mean_length": 253.384765625, "completions/mean_terminated_length": 253.384765625, "completions/min_length": 113.125, "completions/min_terminated_length": 113.125, "epoch": 0.061052631578947365, "grad_norm": 0.0284319706261158, "learning_rate": 5.833333333333334e-07, "loss": 0.004, "num_tokens": 14246829.0, "reward": 1.3669871091842651, "reward_std": 0.476172287017107, "rewards/format_reward_embodied/mean": 0.9609375, "rewards/format_reward_embodied/std": 0.19018890894949436, "rewards/stop_prediction_reward/mean": 0.376953125, "rewards/stop_prediction_reward/std": 0.38546351715922356, "rewards/waypoint_pred_accuracy/mean": 0.014548240964149528, "rewards/waypoint_pred_accuracy/std": 0.048405178813538896, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.125, "completions/max_terminated_length": 946.125, "completions/mean_length": 267.513671875, "completions/mean_terminated_length": 267.513671875, "completions/min_length": 118.25, "completions/min_terminated_length": 118.25, "epoch": 0.06315789473684211, "grad_norm": 0.02758762799203396, "learning_rate": 6.041666666666666e-07, "loss": 0.0087, "num_tokens": 14742708.0, "reward": 1.3311925828456879, "reward_std": 0.5349335558712482, "rewards/format_reward_embodied/mean": 0.94921875, "rewards/format_reward_embodied/std": 0.21471346728503704, "rewards/stop_prediction_reward/mean": 0.375, "rewards/stop_prediction_reward/std": 0.4617812857031822, "rewards/waypoint_pred_accuracy/mean": 0.003486919093547547, "rewards/waypoint_pred_accuracy/std": 0.022435040602790033, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.875, "completions/max_terminated_length": 468.875, "completions/mean_length": 247.80859375, "completions/mean_terminated_length": 247.80859375, "completions/min_length": 116.125, "completions/min_terminated_length": 116.125, "epoch": 0.06526315789473684, "grad_norm": 0.024238400161266327, "learning_rate": 6.249999999999999e-07, "loss": 0.0006, "num_tokens": 15230930.0, "reward": 1.4348655045032501, "reward_std": 0.4678279310464859, "rewards/format_reward_embodied/mean": 0.982421875, "rewards/format_reward_embodied/std": 0.09929289110004902, "rewards/stop_prediction_reward/mean": 0.439453125, "rewards/stop_prediction_reward/std": 0.4299692139029503, "rewards/waypoint_pred_accuracy/mean": 0.006495264507384302, "rewards/waypoint_pred_accuracy/std": 0.03605599632994938, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.5, "completions/max_terminated_length": 701.5, "completions/mean_length": 247.302734375, "completions/mean_terminated_length": 247.302734375, "completions/min_length": 102.625, "completions/min_terminated_length": 102.625, "epoch": 0.06736842105263158, "grad_norm": 0.03117675893008709, "learning_rate": 6.458333333333333e-07, "loss": 0.0048, "num_tokens": 15717165.0, "reward": 1.276307299733162, "reward_std": 0.4745783172547817, "rewards/format_reward_embodied/mean": 0.9765625, "rewards/format_reward_embodied/std": 0.11361248232424259, "rewards/stop_prediction_reward/mean": 0.287109375, "rewards/stop_prediction_reward/std": 0.4378196634352207, "rewards/waypoint_pred_accuracy/mean": 0.006317712715827028, "rewards/waypoint_pred_accuracy/std": 0.02758992835879357, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.625, "completions/max_terminated_length": 736.625, "completions/mean_length": 259.27734375, "completions/mean_terminated_length": 259.27734375, "completions/min_length": 132.125, "completions/min_terminated_length": 132.125, "epoch": 0.06947368421052631, "grad_norm": 0.0244793351739645, "learning_rate": 6.666666666666666e-07, "loss": 0.0053, "num_tokens": 16211835.0, "reward": 1.540584921836853, "reward_std": 0.49051226675510406, "rewards/format_reward_embodied/mean": 0.96484375, "rewards/format_reward_embodied/std": 0.18235719576478004, "rewards/stop_prediction_reward/mean": 0.57421875, "rewards/stop_prediction_reward/std": 0.43413203582167625, "rewards/waypoint_pred_accuracy/mean": 0.0007612065199357397, "rewards/waypoint_pred_accuracy/std": 0.005327658200312494, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 881.375, "completions/max_terminated_length": 881.375, "completions/mean_length": 264.103515625, "completions/mean_terminated_length": 264.58401679992676, "completions/min_length": 101.375, "completions/min_terminated_length": 113.25, "epoch": 0.07157894736842105, "grad_norm": 0.0207452904433012, "learning_rate": 6.875e-07, "loss": 0.0068, "num_tokens": 16706928.0, "reward": 1.4342780411243439, "reward_std": 0.5214410163462162, "rewards/format_reward_embodied/mean": 0.9765625, "rewards/format_reward_embodied/std": 0.12835253402590752, "rewards/stop_prediction_reward/mean": 0.453125, "rewards/stop_prediction_reward/std": 0.47532549500465393, "rewards/waypoint_pred_accuracy/mean": 0.0022952774760399775, "rewards/waypoint_pred_accuracy/std": 0.016314118760955294, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.75, "completions/max_terminated_length": 544.75, "completions/mean_length": 247.744140625, "completions/mean_terminated_length": 247.744140625, "completions/min_length": 115.375, "completions/min_terminated_length": 115.375, "epoch": 0.07368421052631578, "grad_norm": 0.02019825391471386, "learning_rate": 7.083333333333334e-07, "loss": 0.0026, "num_tokens": 17196205.0, "reward": 1.4062748402357101, "reward_std": 0.5094473846256733, "rewards/format_reward_embodied/mean": 0.97265625, "rewards/format_reward_embodied/std": 0.15960253402590752, "rewards/stop_prediction_reward/mean": 0.43359375, "rewards/stop_prediction_reward/std": 0.4651285596191883, "rewards/waypoint_pred_accuracy/mean": 1.2417779022436235e-05, "rewards/waypoint_pred_accuracy/std": 2.711040125229227e-05, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.875, "completions/max_terminated_length": 1784.875, "completions/mean_length": 291.40625, "completions/mean_terminated_length": 291.40625, "completions/min_length": 128.75, "completions/min_terminated_length": 128.75, "epoch": 0.07578947368421053, "grad_norm": 0.038221534341573715, "learning_rate": 7.291666666666666e-07, "loss": 0.0175, "num_tokens": 17710461.0, "reward": 1.4911664873361588, "reward_std": 0.5095744393765926, "rewards/format_reward_embodied/mean": 0.978515625, "rewards/format_reward_embodied/std": 0.11029814556241035, "rewards/stop_prediction_reward/mean": 0.498046875, "rewards/stop_prediction_reward/std": 0.46525831148028374, "rewards/waypoint_pred_accuracy/mean": 0.007301997149683504, "rewards/waypoint_pred_accuracy/std": 0.024017843105910822, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.75, "completions/max_terminated_length": 472.75, "completions/mean_length": 248.03125, "completions/mean_terminated_length": 248.03125, "completions/min_length": 117.375, "completions/min_terminated_length": 117.375, "epoch": 0.07789473684210527, "grad_norm": 0.02083674818277359, "learning_rate": 7.5e-07, "loss": 0.0023, "num_tokens": 18197965.0, "reward": 1.4861425906419754, "reward_std": 0.5738211683928967, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.42578125, "rewards/stop_prediction_reward/std": 0.4866231083869934, "rewards/waypoint_pred_accuracy/mean": 0.03408691443473799, "rewards/waypoint_pred_accuracy/std": 0.08652095211436972, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.625, "completions/max_terminated_length": 884.625, "completions/mean_length": 258.75390625, "completions/mean_terminated_length": 258.75390625, "completions/min_length": 125.5, "completions/min_terminated_length": 125.5, "epoch": 0.08, "grad_norm": 0.021931249648332596, "learning_rate": 7.708333333333333e-07, "loss": 0.0048, "num_tokens": 18692239.0, "reward": 1.6138971894979477, "reward_std": 0.5961987935006618, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.07509202510118484, "rewards/stop_prediction_reward/mean": 0.513671875, "rewards/stop_prediction_reward/std": 0.488056443631649, "rewards/waypoint_pred_accuracy/mean": 0.05597203067737698, "rewards/waypoint_pred_accuracy/std": 0.09776196270839843, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.875, "completions/max_terminated_length": 755.875, "completions/mean_length": 250.787109375, "completions/mean_terminated_length": 250.787109375, "completions/min_length": 119.125, "completions/min_terminated_length": 119.125, "epoch": 0.08210526315789474, "grad_norm": 0.15428805351257324, "learning_rate": 7.916666666666666e-07, "loss": 0.0043, "num_tokens": 19183074.0, "reward": 1.4990187734365463, "reward_std": 0.49899255111813545, "rewards/format_reward_embodied/mean": 0.98046875, "rewards/format_reward_embodied/std": 0.1064315214753151, "rewards/stop_prediction_reward/mean": 0.517578125, "rewards/stop_prediction_reward/std": 0.4714191108942032, "rewards/waypoint_pred_accuracy/mean": 0.00048596067600919097, "rewards/waypoint_pred_accuracy/std": 0.0015497554879284957, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 246.05078125, "completions/mean_terminated_length": 246.05078125, "completions/min_length": 117.125, "completions/min_terminated_length": 117.125, "epoch": 0.08421052631578947, "grad_norm": 0.033689290285110474, "learning_rate": 8.125e-07, "loss": 0.0002, "num_tokens": 19674556.0, "reward": 1.52012699842453, "reward_std": 0.5320924893021584, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.09375, "rewards/stop_prediction_reward/mean": 0.494140625, "rewards/stop_prediction_reward/std": 0.4897289089858532, "rewards/waypoint_pred_accuracy/mean": 0.018852562323445468, "rewards/waypoint_pred_accuracy/std": 0.04221876614610676, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.625, "completions/max_terminated_length": 451.625, "completions/mean_length": 233.865234375, "completions/mean_terminated_length": 233.865234375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.0863157894736842, "grad_norm": 0.025640549138188362, "learning_rate": 8.333333333333333e-07, "loss": 0.0011, "num_tokens": 20156151.0, "reward": 1.591551125049591, "reward_std": 0.5175898559391499, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.08442101255059242, "rewards/stop_prediction_reward/mean": 0.5703125, "rewards/stop_prediction_reward/std": 0.4659374840557575, "rewards/waypoint_pred_accuracy/mean": 0.016478684779628456, "rewards/waypoint_pred_accuracy/std": 0.05339623770077609, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 242.478515625, "completions/mean_terminated_length": 242.478515625, "completions/min_length": 112.25, "completions/min_terminated_length": 112.25, "epoch": 0.08842105263157894, "grad_norm": 0.026281312108039856, "learning_rate": 8.541666666666666e-07, "loss": -0.0, "num_tokens": 20639788.0, "reward": 1.5611481666564941, "reward_std": 0.4784001186490059, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.560546875, "rewards/stop_prediction_reward/std": 0.4726823903620243, "rewards/waypoint_pred_accuracy/mean": 0.002253779932873278, "rewards/waypoint_pred_accuracy/std": 0.015716358982028673, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.875, "completions/max_terminated_length": 810.875, "completions/mean_length": 249.939453125, "completions/mean_terminated_length": 249.939453125, "completions/min_length": 122.875, "completions/min_terminated_length": 122.875, "epoch": 0.09052631578947369, "grad_norm": 0.021544892340898514, "learning_rate": 8.75e-07, "loss": 0.0029, "num_tokens": 21128013.0, "reward": 1.4784268736839294, "reward_std": 0.48740382865071297, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.06879601255059242, "rewards/stop_prediction_reward/mean": 0.484375, "rewards/stop_prediction_reward/std": 0.4655333496630192, "rewards/waypoint_pred_accuracy/mean": 0.0019087545888396566, "rewards/waypoint_pred_accuracy/std": 0.010897019660660593, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 643.625, "completions/max_terminated_length": 643.625, "completions/mean_length": 245.134765625, "completions/mean_terminated_length": 245.66865158081055, "completions/min_length": 101.25, "completions/min_terminated_length": 117.75, "epoch": 0.09263157894736843, "grad_norm": 0.025143882259726524, "learning_rate": 8.958333333333334e-07, "loss": 0.0038, "num_tokens": 21614866.0, "reward": 1.6192794144153595, "reward_std": 0.4844088666141033, "rewards/format_reward_embodied/mean": 0.982421875, "rewards/format_reward_embodied/std": 0.11263803765177727, "rewards/stop_prediction_reward/mean": 0.63671875, "rewards/stop_prediction_reward/std": 0.4543136991560459, "rewards/waypoint_pred_accuracy/mean": 6.939472140954406e-05, "rewards/waypoint_pred_accuracy/std": 0.00043998260686706566, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.375, "completions/max_terminated_length": 479.375, "completions/mean_length": 238.673828125, "completions/mean_terminated_length": 238.673828125, "completions/min_length": 115.625, "completions/min_terminated_length": 115.625, "epoch": 0.09473684210526316, "grad_norm": 0.023476244881749153, "learning_rate": 9.166666666666665e-07, "loss": 0.0007, "num_tokens": 22098731.0, "reward": 1.512737661600113, "reward_std": 0.5365940853953362, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.026630254462361336, "rewards/stop_prediction_reward/mean": 0.45703125, "rewards/stop_prediction_reward/std": 0.48733755201101303, "rewards/waypoint_pred_accuracy/mean": 0.03078289819380149, "rewards/waypoint_pred_accuracy/std": 0.043302708805491946, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.875, "completions/max_terminated_length": 641.875, "completions/mean_length": 240.724609375, "completions/mean_terminated_length": 240.724609375, "completions/min_length": 118.625, "completions/min_terminated_length": 118.625, "epoch": 0.0968421052631579, "grad_norm": 0.020364033058285713, "learning_rate": 9.374999999999999e-07, "loss": 0.0029, "num_tokens": 22583070.0, "reward": 1.6677764654159546, "reward_std": 0.5154935717582703, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.07350525446236134, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.44941750913858414, "rewards/waypoint_pred_accuracy/mean": 0.02627104918529134, "rewards/waypoint_pred_accuracy/std": 0.0397095277659446, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.375, "completions/max_terminated_length": 657.375, "completions/mean_length": 239.5546875, "completions/mean_terminated_length": 239.5546875, "completions/min_length": 118.875, "completions/min_terminated_length": 118.875, "epoch": 0.09894736842105263, "grad_norm": 0.023410305380821228, "learning_rate": 9.583333333333334e-07, "loss": 0.0039, "num_tokens": 23067066.0, "reward": 1.7648355215787888, "reward_std": 0.5304676033556461, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.685546875, "rewards/stop_prediction_reward/std": 0.45773619785904884, "rewards/waypoint_pred_accuracy/mean": 0.042574012356195935, "rewards/waypoint_pred_accuracy/std": 0.07306883804197034, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.625, "completions/max_terminated_length": 423.625, "completions/mean_length": 236.12890625, "completions/mean_terminated_length": 236.12890625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.10105263157894737, "grad_norm": 0.023688504472374916, "learning_rate": 9.791666666666667e-07, "loss": -0.0006, "num_tokens": 23549436.0, "reward": 1.70817232131958, "reward_std": 0.5296522080898285, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.06879601255059242, "rewards/stop_prediction_reward/mean": 0.63671875, "rewards/stop_prediction_reward/std": 0.45957546308636665, "rewards/waypoint_pred_accuracy/mean": 0.04060960025526583, "rewards/waypoint_pred_accuracy/std": 0.07721152482554317, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.5, "completions/max_terminated_length": 439.5, "completions/mean_length": 239.724609375, "completions/mean_terminated_length": 239.724609375, "completions/min_length": 112.625, "completions/min_terminated_length": 112.625, "epoch": 0.1031578947368421, "grad_norm": 0.04075814038515091, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 24036719.0, "reward": 1.5996688604354858, "reward_std": 0.43832388520240784, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.607421875, "rewards/stop_prediction_reward/std": 0.4225967414677143, "rewards/waypoint_pred_accuracy/mean": 2.974685131453066e-05, "rewards/waypoint_pred_accuracy/std": 0.00014841147029464143, "step": 49 }, { "epoch": 0.10526315789473684, "grad_norm": 0.02866051159799099, "learning_rate": 9.999878206375666e-07, "loss": 0.0015, "step": 50 }, { "epoch": 0.10526315789473684, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.00015625, "eval_completions/max_length": 683.96, "eval_completions/max_terminated_length": 683.96, "eval_completions/mean_length": 238.32913192749024, "eval_completions/mean_terminated_length": 238.36774322509766, "eval_completions/min_length": 112.41, "eval_completions/min_terminated_length": 113.78, "eval_loss": 0.003631497733294964, "eval_num_tokens": 24520656.0, "eval_reward": 1.6925481045246125, "eval_reward_std": 0.4437328398227692, "eval_rewards/format_reward_embodied/mean": 0.99328125, "eval_rewards/format_reward_embodied/std": 0.04765250638127327, "eval_rewards/stop_prediction_reward/mean": 0.66328125, "eval_rewards/stop_prediction_reward/std": 0.40405205205082895, "eval_rewards/waypoint_pred_accuracy/mean": 0.017992802756573712, "eval_rewards/waypoint_pred_accuracy/std": 0.04001634344926445, "eval_runtime": 1355.6164, "eval_samples_per_second": 0.074, "eval_steps_per_second": 0.001, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.375, "completions/max_terminated_length": 450.375, "completions/mean_length": 239.6259765625, "completions/mean_terminated_length": 239.6259765625, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.10736842105263159, "grad_norm": 0.02182384580373764, "learning_rate": 9.999512832095417e-07, "loss": 0.0005, "num_tokens": 25006064.0, "reward": 1.68679628521204, "reward_std": 0.4470259975641966, "rewards/format_reward_embodied/mean": 0.9931640625, "rewards/format_reward_embodied/std": 0.05002300627529621, "rewards/stop_prediction_reward/mean": 0.65234375, "rewards/stop_prediction_reward/std": 0.4022445324808359, "rewards/waypoint_pred_accuracy/mean": 0.02064424328141934, "rewards/waypoint_pred_accuracy/std": 0.05010251636930185, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 427.5, "completions/max_terminated_length": 427.5, "completions/mean_length": 230.73828125, "completions/mean_terminated_length": 231.21041297912598, "completions/min_length": 99.875, "completions/min_terminated_length": 114.375, "epoch": 0.10947368421052632, "grad_norm": 0.02381049655377865, "learning_rate": 9.998903896937148e-07, "loss": 0.0012, "num_tokens": 25485546.0, "reward": 1.7381224930286407, "reward_std": 0.4321938529610634, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.716796875, "rewards/stop_prediction_reward/std": 0.3914393372833729, "rewards/waypoint_pred_accuracy/mean": 0.01261594578698389, "rewards/waypoint_pred_accuracy/std": 0.03588760504630355, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.5, "completions/max_terminated_length": 450.5, "completions/mean_length": 234.287109375, "completions/mean_terminated_length": 234.287109375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.11157894736842106, "grad_norm": 0.05357426404953003, "learning_rate": 9.998051433862818e-07, "loss": -0.0001, "num_tokens": 25966589.0, "reward": 1.6857829988002777, "reward_std": 0.3670726828277111, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.69140625, "rewards/stop_prediction_reward/std": 0.3526647798717022, "rewards/waypoint_pred_accuracy/mean": 0.00011806207834793561, "rewards/waypoint_pred_accuracy/std": 0.0009230129067357363, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.75, "completions/max_terminated_length": 413.75, "completions/mean_length": 226.93359375, "completions/mean_terminated_length": 226.93359375, "completions/min_length": 108.125, "completions/min_terminated_length": 108.125, "epoch": 0.11368421052631579, "grad_norm": 0.024311864748597145, "learning_rate": 9.996955489016681e-07, "loss": 0.0006, "num_tokens": 26444507.0, "reward": 1.5553173422813416, "reward_std": 0.38553351908922195, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5234375, "rewards/stop_prediction_reward/std": 0.3467428870499134, "rewards/waypoint_pred_accuracy/mean": 0.015939914255370412, "rewards/waypoint_pred_accuracy/std": 0.039412272800977984, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.5, "completions/max_terminated_length": 667.5, "completions/mean_length": 246.060546875, "completions/mean_terminated_length": 246.060546875, "completions/min_length": 111.125, "completions/min_terminated_length": 111.125, "epoch": 0.11578947368421053, "grad_norm": 0.020382562652230263, "learning_rate": 9.995616121722783e-07, "loss": 0.0024, "num_tokens": 26931770.0, "reward": 1.543064832687378, "reward_std": 0.35424431413412094, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.06879601255059242, "rewards/stop_prediction_reward/mean": 0.548828125, "rewards/stop_prediction_reward/std": 0.3284119311720133, "rewards/waypoint_pred_accuracy/mean": 0.002001162469869708, "rewards/waypoint_pred_accuracy/std": 0.01592240231514961, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.75, "completions/max_terminated_length": 462.75, "completions/mean_length": 225.984375, "completions/mean_terminated_length": 225.984375, "completions/min_length": 113.875, "completions/min_terminated_length": 113.875, "epoch": 0.11789473684210526, "grad_norm": 0.020005574449896812, "learning_rate": 9.994033404481736e-07, "loss": 0.0004, "num_tokens": 27406770.0, "reward": 1.6594459414482117, "reward_std": 0.3725608382374048, "rewards/format_reward_embodied/mean": 0.986328125, "rewards/format_reward_embodied/std": 0.07980126701295376, "rewards/stop_prediction_reward/mean": 0.6640625, "rewards/stop_prediction_reward/std": 0.33192422799766064, "rewards/waypoint_pred_accuracy/mean": 0.004527658324403952, "rewards/waypoint_pred_accuracy/std": 0.017977507960307844, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.5, "completions/max_terminated_length": 450.5, "completions/mean_length": 234.10546875, "completions/mean_terminated_length": 234.10546875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.12, "grad_norm": 0.02229822427034378, "learning_rate": 9.992207422966824e-07, "loss": -0.0001, "num_tokens": 27889640.0, "reward": 1.4391246140003204, "reward_std": 0.35483869537711143, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.4375, "rewards/stop_prediction_reward/std": 0.34185592643916607, "rewards/waypoint_pred_accuracy/mean": 0.002765428128381, "rewards/waypoint_pred_accuracy/std": 0.014523094108374813, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 747.625, "completions/max_terminated_length": 747.625, "completions/mean_length": 233.369140625, "completions/mean_terminated_length": 233.90501022338867, "completions/min_length": 99.75, "completions/min_terminated_length": 113.5, "epoch": 0.12210526315789473, "grad_norm": 0.025426389649510384, "learning_rate": 9.990138276019335e-07, "loss": 0.0038, "num_tokens": 28370213.0, "reward": 1.6101858913898468, "reward_std": 0.3749941308051348, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.59375, "rewards/stop_prediction_reward/std": 0.32783540338277817, "rewards/waypoint_pred_accuracy/mean": 0.012124196402905074, "rewards/waypoint_pred_accuracy/std": 0.041881935120967384, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1494.75, "completions/max_terminated_length": 1494.75, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 119.125, "completions/min_terminated_length": 119.125, "epoch": 0.12421052631578948, "grad_norm": 0.029843533411622047, "learning_rate": 9.987826075643228e-07, "loss": 0.0225, "num_tokens": 28866405.0, "reward": 1.608642503619194, "reward_std": 0.37335721030831337, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.07509202510118484, "rewards/stop_prediction_reward/mean": 0.619140625, "rewards/stop_prediction_reward/std": 0.33958676643669605, "rewards/waypoint_pred_accuracy/mean": 0.0006103267239083258, "rewards/waypoint_pred_accuracy/std": 0.003143552353714926, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.875, "completions/max_terminated_length": 460.875, "completions/mean_length": 250.77734375, "completions/mean_terminated_length": 250.77734375, "completions/min_length": 122.25, "completions/min_terminated_length": 122.25, "epoch": 0.12631578947368421, "grad_norm": 0.02350872941315174, "learning_rate": 9.985270946999066e-07, "loss": -0.0006, "num_tokens": 29357939.0, "reward": 1.5302962958812714, "reward_std": 0.39041563123464584, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.51171875, "rewards/stop_prediction_reward/std": 0.3654658328741789, "rewards/waypoint_pred_accuracy/mean": 0.01026534708216827, "rewards/waypoint_pred_accuracy/std": 0.03301238967106072, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 254.34765625, "completions/mean_terminated_length": 254.9265899658203, "completions/min_length": 107.25, "completions/min_terminated_length": 128.25, "epoch": 0.12842105263157894, "grad_norm": 0.028141073882579803, "learning_rate": 9.982473028397236e-07, "loss": 0.004, "num_tokens": 29850341.0, "reward": 1.908710554242134, "reward_std": 0.4512513056397438, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.07509202510118484, "rewards/stop_prediction_reward/mean": 0.8515625, "rewards/stop_prediction_reward/std": 0.34670688211917877, "rewards/waypoint_pred_accuracy/mean": 0.034433397710529334, "rewards/waypoint_pred_accuracy/std": 0.06185820607151982, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 255.419921875, "completions/mean_terminated_length": 255.419921875, "completions/min_length": 119.125, "completions/min_terminated_length": 119.125, "epoch": 0.13052631578947368, "grad_norm": 0.0289030522108078, "learning_rate": 9.979432471290472e-07, "loss": 0.0055, "num_tokens": 30342588.0, "reward": 1.9491963237524033, "reward_std": 0.4529041275382042, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.802734375, "rewards/stop_prediction_reward/std": 0.3396564405411482, "rewards/waypoint_pred_accuracy/mean": 0.07518408738549646, "rewards/waypoint_pred_accuracy/std": 0.09874280986073203, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.625, "completions/max_terminated_length": 485.625, "completions/mean_length": 243.8125, "completions/mean_terminated_length": 243.8125, "completions/min_length": 104.25, "completions/min_terminated_length": 104.25, "epoch": 0.13263157894736843, "grad_norm": 0.02499276027083397, "learning_rate": 9.97614944026565e-07, "loss": -0.0005, "num_tokens": 30829532.0, "reward": 1.6152346730232239, "reward_std": 0.3560887239873409, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.043842025101184845, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.3392041679471731, "rewards/waypoint_pred_accuracy/mean": 1.5068036063656496e-07, "rewards/waypoint_pred_accuracy/std": 1.1848213716913133e-06, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.75, "completions/max_terminated_length": 487.75, "completions/mean_length": 242.8671875, "completions/mean_terminated_length": 242.8671875, "completions/min_length": 116.875, "completions/min_terminated_length": 116.875, "epoch": 0.13473684210526315, "grad_norm": 0.08438508957624435, "learning_rate": 9.97262411303488e-07, "loss": -0.0008, "num_tokens": 31316376.0, "reward": 1.676452487707138, "reward_std": 0.38882749900221825, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.654296875, "rewards/stop_prediction_reward/std": 0.3573997803032398, "rewards/waypoint_pred_accuracy/mean": 0.014007493944973248, "rewards/waypoint_pred_accuracy/std": 0.033498459750262555, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.75, "completions/max_terminated_length": 715.75, "completions/mean_length": 239.49609375, "completions/mean_terminated_length": 239.49609375, "completions/min_length": 118.25, "completions/min_terminated_length": 118.25, "epoch": 0.1368421052631579, "grad_norm": 0.027747681364417076, "learning_rate": 9.968856680425886e-07, "loss": 0.0053, "num_tokens": 31798614.0, "reward": 1.7896013855934143, "reward_std": 0.33628559671342373, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.7890625, "rewards/stop_prediction_reward/std": 0.3223333489149809, "rewards/waypoint_pred_accuracy/mean": 0.0022225715887884694, "rewards/waypoint_pred_accuracy/std": 0.00479067146991952, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.75, "completions/max_terminated_length": 639.75, "completions/mean_length": 251.96875, "completions/mean_terminated_length": 251.96875, "completions/min_length": 115.625, "completions/min_terminated_length": 115.625, "epoch": 0.13894736842105262, "grad_norm": 0.028032371774315834, "learning_rate": 9.964847346371676e-07, "loss": 0.004, "num_tokens": 32286790.0, "reward": 1.8506833761930466, "reward_std": 0.3642146345227957, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.830078125, "rewards/stop_prediction_reward/std": 0.3009116370230913, "rewards/waypoint_pred_accuracy/mean": 0.013232328761660028, "rewards/waypoint_pred_accuracy/std": 0.03860976189025678, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 481.25, "completions/max_terminated_length": 481.25, "completions/mean_length": 247.291015625, "completions/mean_terminated_length": 247.7331371307373, "completions/min_length": 100.25, "completions/min_terminated_length": 115.125, "epoch": 0.14105263157894737, "grad_norm": 0.021038714796304703, "learning_rate": 9.96059632789951e-07, "loss": -0.0006, "num_tokens": 32774747.0, "reward": 1.5779267400503159, "reward_std": 0.42914118245244026, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.505859375, "rewards/stop_prediction_reward/std": 0.3645100612193346, "rewards/waypoint_pred_accuracy/mean": 0.039939936966421924, "rewards/waypoint_pred_accuracy/std": 0.058027153559010024, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.875, "completions/max_terminated_length": 748.875, "completions/mean_length": 253.40234375, "completions/mean_terminated_length": 253.40234375, "completions/min_length": 119.875, "completions/min_terminated_length": 119.875, "epoch": 0.1431578947368421, "grad_norm": 0.024472616612911224, "learning_rate": 9.956103855119138e-07, "loss": 0.0036, "num_tokens": 33265961.0, "reward": 1.6555908024311066, "reward_std": 0.385839419439435, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.078125, "rewards/stop_prediction_reward/mean": 0.6640625, "rewards/stop_prediction_reward/std": 0.36797660402953625, "rewards/waypoint_pred_accuracy/mean": 0.0006469660570671656, "rewards/waypoint_pred_accuracy/std": 0.003268853310080662, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.375, "completions/max_terminated_length": 496.375, "completions/mean_length": 253.37109375, "completions/mean_terminated_length": 253.37109375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.14526315789473684, "grad_norm": 0.038161348551511765, "learning_rate": 9.951370171210359e-07, "loss": 0.002, "num_tokens": 33757543.0, "reward": 1.755501314997673, "reward_std": 0.34114088118076324, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.32307766377925873, "rewards/waypoint_pred_accuracy/mean": 0.004703786010358479, "rewards/waypoint_pred_accuracy/std": 0.016449308837348298, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.25, "completions/max_terminated_length": 833.25, "completions/mean_length": 262.56640625, "completions/mean_terminated_length": 262.56640625, "completions/min_length": 109.875, "completions/min_terminated_length": 109.875, "epoch": 0.14736842105263157, "grad_norm": 0.024708310142159462, "learning_rate": 9.946395532409847e-07, "loss": 0.0085, "num_tokens": 34253513.0, "reward": 1.7724248170852661, "reward_std": 0.354646734893322, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.06417626701295376, "rewards/stop_prediction_reward/mean": 0.7734375, "rewards/stop_prediction_reward/std": 0.3255934212356806, "rewards/waypoint_pred_accuracy/mean": 0.005353036525642015, "rewards/waypoint_pred_accuracy/std": 0.013709596910865714, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.25, "completions/max_terminated_length": 497.25, "completions/mean_length": 261.169921875, "completions/mean_terminated_length": 261.169921875, "completions/min_length": 127.25, "completions/min_terminated_length": 127.25, "epoch": 0.14947368421052631, "grad_norm": 0.02306721918284893, "learning_rate": 9.941180207997288e-07, "loss": -0.0001, "num_tokens": 34747616.0, "reward": 1.8229519128799438, "reward_std": 0.33919697254896164, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.81640625, "rewards/stop_prediction_reward/std": 0.3285626471042633, "rewards/waypoint_pred_accuracy/mean": 0.003272829200625438, "rewards/waypoint_pred_accuracy/std": 0.01800350467010503, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 244.701171875, "completions/mean_terminated_length": 244.701171875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.15157894736842106, "grad_norm": 0.03545621410012245, "learning_rate": 9.935724480280795e-07, "loss": 0.0, "num_tokens": 35235719.0, "reward": 1.7902133017778397, "reward_std": 0.45963282138109207, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.6796875, "rewards/stop_prediction_reward/std": 0.38505756109952927, "rewards/waypoint_pred_accuracy/mean": 0.056239478069983306, "rewards/waypoint_pred_accuracy/std": 0.08321574779984076, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.125, "completions/max_terminated_length": 423.125, "completions/mean_length": 241.056640625, "completions/mean_terminated_length": 241.056640625, "completions/min_length": 115.875, "completions/min_terminated_length": 115.875, "epoch": 0.15368421052631578, "grad_norm": 0.025743646547198296, "learning_rate": 9.93002864458164e-07, "loss": 0.0017, "num_tokens": 35717284.0, "reward": 1.7117034643888474, "reward_std": 0.34931979328393936, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.7109375, "rewards/stop_prediction_reward/std": 0.3488571159541607, "rewards/waypoint_pred_accuracy/mean": 0.00038298743026394556, "rewards/waypoint_pred_accuracy/std": 0.0022788381146801144, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.375, "completions/max_terminated_length": 484.375, "completions/mean_length": 236.037109375, "completions/mean_terminated_length": 236.037109375, "completions/min_length": 110.375, "completions/min_terminated_length": 110.375, "epoch": 0.15578947368421053, "grad_norm": 0.031387291848659515, "learning_rate": 9.924093009218252e-07, "loss": -0.0001, "num_tokens": 36196791.0, "reward": 1.53607939183712, "reward_std": 0.3938233330845833, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.04855126701295376, "rewards/stop_prediction_reward/mean": 0.515625, "rewards/stop_prediction_reward/std": 0.35425616055727005, "rewards/waypoint_pred_accuracy/mean": 0.01511000881408156, "rewards/waypoint_pred_accuracy/std": 0.024702310350773107, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 682.875, "completions/max_terminated_length": 682.875, "completions/mean_length": 246.58203125, "completions/mean_terminated_length": 247.04151344299316, "completions/min_length": 102.375, "completions/min_terminated_length": 120.375, "epoch": 0.15789473684210525, "grad_norm": 0.026448730379343033, "learning_rate": 9.917917895489542e-07, "loss": 0.0036, "num_tokens": 36683937.0, "reward": 1.7273263335227966, "reward_std": 0.3566751927137375, "rewards/format_reward_embodied/mean": 0.98046875, "rewards/format_reward_embodied/std": 0.12826303765177727, "rewards/stop_prediction_reward/mean": 0.7109375, "rewards/stop_prediction_reward/std": 0.2865038700401783, "rewards/waypoint_pred_accuracy/mean": 0.01796005329746431, "rewards/waypoint_pred_accuracy/std": 0.03300400403918848, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.5, "completions/max_terminated_length": 466.5, "completions/mean_length": 246.384765625, "completions/mean_terminated_length": 246.384765625, "completions/min_length": 122.625, "completions/min_terminated_length": 122.625, "epoch": 0.16, "grad_norm": 0.026203418150544167, "learning_rate": 9.9115036376575e-07, "loss": 0.0007, "num_tokens": 37170854.0, "reward": 1.4314483553171158, "reward_std": 0.3206907380372286, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.435546875, "rewards/stop_prediction_reward/std": 0.31111637130379677, "rewards/waypoint_pred_accuracy/mean": 0.0008804261656223389, "rewards/waypoint_pred_accuracy/std": 0.007043409327252448, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 461.375, "completions/max_terminated_length": 461.375, "completions/mean_length": 245.017578125, "completions/mean_terminated_length": 245.51959609985352, "completions/min_length": 97.0, "completions/min_terminated_length": 110.875, "epoch": 0.16210526315789472, "grad_norm": 0.021599190309643745, "learning_rate": 9.904850582929109e-07, "loss": -0.001, "num_tokens": 37659375.0, "reward": 1.7587501555681229, "reward_std": 0.45513000525534153, "rewards/format_reward_embodied/mean": 0.9765625, "rewards/format_reward_embodied/std": 0.12835253402590752, "rewards/stop_prediction_reward/mean": 0.6328125, "rewards/stop_prediction_reward/std": 0.3041498549282551, "rewards/waypoint_pred_accuracy/mean": 0.07468759280809832, "rewards/waypoint_pred_accuracy/std": 0.08736424083447313, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.375, "completions/max_terminated_length": 431.375, "completions/mean_length": 251.69921875, "completions/mean_terminated_length": 251.69921875, "completions/min_length": 118.125, "completions/min_terminated_length": 118.125, "epoch": 0.16421052631578947, "grad_norm": 0.023476749658584595, "learning_rate": 9.897959091437545e-07, "loss": -0.0017, "num_tokens": 38147157.0, "reward": 1.7629946172237396, "reward_std": 0.3216873835772276, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.72265625, "rewards/stop_prediction_reward/std": 0.2610730957239866, "rewards/waypoint_pred_accuracy/mean": 0.02212230950329659, "rewards/waypoint_pred_accuracy/std": 0.03588474957179061, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.875, "completions/max_terminated_length": 464.875, "completions/mean_length": 238.640625, "completions/mean_terminated_length": 238.640625, "completions/min_length": 118.375, "completions/min_terminated_length": 118.375, "epoch": 0.16631578947368422, "grad_norm": 0.03286973387002945, "learning_rate": 9.890829536222686e-07, "loss": -0.0025, "num_tokens": 38629981.0, "reward": 1.7297946512699127, "reward_std": 0.2887336425483227, "rewards/format_reward_embodied/mean": 0.974609375, "rewards/format_reward_embodied/std": 0.12744012847542763, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.23198767006397247, "rewards/waypoint_pred_accuracy/mean": 0.004545772711879333, "rewards/waypoint_pred_accuracy/std": 0.008970102488097233, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.125, "completions/max_terminated_length": 476.125, "completions/mean_length": 233.671875, "completions/mean_terminated_length": 233.671875, "completions/min_length": 97.875, "completions/min_terminated_length": 97.875, "epoch": 0.16842105263157894, "grad_norm": 0.022502202540636063, "learning_rate": 9.88346230321092e-07, "loss": 0.0003, "num_tokens": 39111733.0, "reward": 1.6976664066314697, "reward_std": 0.36001696437597275, "rewards/format_reward_embodied/mean": 0.986328125, "rewards/format_reward_embodied/std": 0.09071702510118484, "rewards/stop_prediction_reward/mean": 0.68359375, "rewards/stop_prediction_reward/std": 0.3152890168130398, "rewards/waypoint_pred_accuracy/mean": 0.013872268769774525, "rewards/waypoint_pred_accuracy/std": 0.03006945856032351, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.875, "completions/max_terminated_length": 428.875, "completions/mean_length": 236.8515625, "completions/mean_terminated_length": 236.8515625, "completions/min_length": 110.125, "completions/min_terminated_length": 110.125, "epoch": 0.1705263157894737, "grad_norm": 0.03461969271302223, "learning_rate": 9.875857791194251e-07, "loss": 0.0001, "num_tokens": 39593449.0, "reward": 1.8557344675064087, "reward_std": 0.3654259257018566, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.798828125, "rewards/stop_prediction_reward/std": 0.2945959325879812, "rewards/waypoint_pred_accuracy/mean": 0.032359407392959616, "rewards/waypoint_pred_accuracy/std": 0.051165802276623415, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.75, "completions/max_terminated_length": 773.75, "completions/mean_length": 242.849609375, "completions/mean_terminated_length": 242.849609375, "completions/min_length": 111.625, "completions/min_terminated_length": 111.625, "epoch": 0.1726315789473684, "grad_norm": 0.025375094264745712, "learning_rate": 9.868016411808711e-07, "loss": 0.008, "num_tokens": 40080732.0, "reward": 1.8185276985168457, "reward_std": 0.2778010666370392, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.822265625, "rewards/stop_prediction_reward/std": 0.26144965551793575, "rewards/waypoint_pred_accuracy/mean": 8.415357677159232e-05, "rewards/waypoint_pred_accuracy/std": 0.0006628100762255675, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 239.771484375, "completions/mean_terminated_length": 239.771484375, "completions/min_length": 118.125, "completions/min_terminated_length": 118.125, "epoch": 0.17473684210526316, "grad_norm": 0.02282722294330597, "learning_rate": 9.85993858951209e-07, "loss": -0.0, "num_tokens": 40564327.0, "reward": 1.8071236461400986, "reward_std": 0.3150872718542814, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.7109375, "rewards/stop_prediction_reward/std": 0.2674474287778139, "rewards/waypoint_pred_accuracy/mean": 0.04809306015794699, "rewards/waypoint_pred_accuracy/std": 0.0513845629028903, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.875, "completions/max_terminated_length": 428.875, "completions/mean_length": 237.19140625, "completions/mean_terminated_length": 237.19140625, "completions/min_length": 117.75, "completions/min_terminated_length": 117.75, "epoch": 0.17684210526315788, "grad_norm": 0.024264369159936905, "learning_rate": 9.851624761560941e-07, "loss": 0.0003, "num_tokens": 41047305.0, "reward": 1.6365228593349457, "reward_std": 0.2356659732758999, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.603515625, "rewards/stop_prediction_reward/std": 0.21942270919680595, "rewards/waypoint_pred_accuracy/mean": 0.016503638941257396, "rewards/waypoint_pred_accuracy/std": 0.019282840457430492, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.125, "completions/max_terminated_length": 443.125, "completions/mean_length": 241.103515625, "completions/mean_terminated_length": 241.103515625, "completions/min_length": 119.125, "completions/min_terminated_length": 119.125, "epoch": 0.17894736842105263, "grad_norm": 0.020238297060132027, "learning_rate": 9.843075377986927e-07, "loss": 0.0003, "num_tokens": 41531134.0, "reward": 1.74451445043087, "reward_std": 0.27907660976052284, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.693359375, "rewards/stop_prediction_reward/std": 0.22875236719846725, "rewards/waypoint_pred_accuracy/mean": 0.025577549161396262, "rewards/waypoint_pred_accuracy/std": 0.03110517306985683, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.5, "completions/max_terminated_length": 419.5, "completions/mean_length": 231.287109375, "completions/mean_terminated_length": 231.287109375, "completions/min_length": 118.875, "completions/min_terminated_length": 118.875, "epoch": 0.18105263157894738, "grad_norm": 0.02773391455411911, "learning_rate": 9.834290901572454e-07, "loss": 0.0005, "num_tokens": 42010385.0, "reward": 1.8274007737636566, "reward_std": 0.2518170941621065, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.822265625, "rewards/stop_prediction_reward/std": 0.2328737936913967, "rewards/waypoint_pred_accuracy/mean": 0.0035441384432527657, "rewards/waypoint_pred_accuracy/std": 0.011852368814229316, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.25, "completions/max_terminated_length": 486.25, "completions/mean_length": 242.701171875, "completions/mean_terminated_length": 242.701171875, "completions/min_length": 112.125, "completions/min_terminated_length": 112.125, "epoch": 0.1831578947368421, "grad_norm": 0.017116429284214973, "learning_rate": 9.82527180782562e-07, "loss": 0.0007, "num_tokens": 42495544.0, "reward": 1.7215514183044434, "reward_std": 0.24272998422384262, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.71875, "rewards/stop_prediction_reward/std": 0.22877886332571507, "rewards/waypoint_pred_accuracy/mean": 0.0033538464465563867, "rewards/waypoint_pred_accuracy/std": 0.008977477041369256, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.625, "completions/max_terminated_length": 407.625, "completions/mean_length": 227.666015625, "completions/mean_terminated_length": 227.666015625, "completions/min_length": 124.125, "completions/min_terminated_length": 124.125, "epoch": 0.18526315789473685, "grad_norm": 0.025679390877485275, "learning_rate": 9.816018584954474e-07, "loss": -0.0001, "num_tokens": 42972109.0, "reward": 1.543754830956459, "reward_std": 0.3515052441507578, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.423828125, "rewards/stop_prediction_reward/std": 0.25312468968331814, "rewards/waypoint_pred_accuracy/mean": 0.06093992558778956, "rewards/waypoint_pred_accuracy/std": 0.090891926485829, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 229.5078125, "completions/mean_terminated_length": 229.5078125, "completions/min_length": 114.375, "completions/min_terminated_length": 114.375, "epoch": 0.18736842105263157, "grad_norm": 0.025896085426211357, "learning_rate": 9.806531733840594e-07, "loss": 0.0009, "num_tokens": 43453905.0, "reward": 1.9336253255605698, "reward_std": 0.3347325623035431, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.82421875, "rewards/stop_prediction_reward/std": 0.24601666443049908, "rewards/waypoint_pred_accuracy/mean": 0.05470328652882017, "rewards/waypoint_pred_accuracy/std": 0.07181658712215722, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.375, "completions/max_terminated_length": 432.375, "completions/mean_length": 235.38671875, "completions/mean_terminated_length": 235.38671875, "completions/min_length": 115.875, "completions/min_terminated_length": 115.875, "epoch": 0.18947368421052632, "grad_norm": 0.03356796130537987, "learning_rate": 9.796811768011975e-07, "loss": -0.0006, "num_tokens": 43934935.0, "reward": 1.7414738535881042, "reward_std": 0.30686922930181026, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.607421875, "rewards/stop_prediction_reward/std": 0.2297498807311058, "rewards/waypoint_pred_accuracy/mean": 0.06702600460093983, "rewards/waypoint_pred_accuracy/std": 0.05166525034704193, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.25, "completions/max_terminated_length": 406.25, "completions/mean_length": 222.359375, "completions/mean_terminated_length": 222.359375, "completions/min_length": 118.875, "completions/min_terminated_length": 118.875, "epoch": 0.19157894736842104, "grad_norm": 0.01943252608180046, "learning_rate": 9.78685921361522e-07, "loss": -0.0005, "num_tokens": 44409551.0, "reward": 1.6487830728292465, "reward_std": 0.30546887032687664, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.58203125, "rewards/stop_prediction_reward/std": 0.2547878734767437, "rewards/waypoint_pred_accuracy/mean": 0.03337590532140797, "rewards/waypoint_pred_accuracy/std": 0.052433368229147945, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.375, "completions/max_terminated_length": 385.375, "completions/mean_length": 217.119140625, "completions/mean_terminated_length": 217.119140625, "completions/min_length": 113.5, "completions/min_terminated_length": 113.5, "epoch": 0.1936842105263158, "grad_norm": 0.021023401990532875, "learning_rate": 9.776674609387076e-07, "loss": -0.0006, "num_tokens": 44880844.0, "reward": 1.756135731935501, "reward_std": 0.264364130795002, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.7109375, "rewards/stop_prediction_reward/std": 0.23147857002913952, "rewards/waypoint_pred_accuracy/mean": 0.022599116048866108, "rewards/waypoint_pred_accuracy/std": 0.03211659606313333, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.125, "completions/max_terminated_length": 400.125, "completions/mean_length": 218.37109375, "completions/mean_terminated_length": 218.37109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.1957894736842105, "grad_norm": 0.020804792642593384, "learning_rate": 9.766258506625257e-07, "loss": 0.0009, "num_tokens": 45354762.0, "reward": 1.703179121017456, "reward_std": 0.31406174413859844, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.2401380892843008, "rewards/waypoint_pred_accuracy/mean": 0.04104270155312406, "rewards/waypoint_pred_accuracy/std": 0.047771165754966205, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.125, "completions/max_terminated_length": 385.125, "completions/mean_length": 220.767578125, "completions/mean_terminated_length": 220.767578125, "completions/min_length": 108.25, "completions/min_terminated_length": 108.25, "epoch": 0.19789473684210526, "grad_norm": 0.05812095105648041, "learning_rate": 9.75561146915861e-07, "loss": 0.0007, "num_tokens": 45828691.0, "reward": 1.7804509848356247, "reward_std": 0.24969635531306267, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.728515625, "rewards/stop_prediction_reward/std": 0.2045249417424202, "rewards/waypoint_pred_accuracy/mean": 0.02596768177553266, "rewards/waypoint_pred_accuracy/std": 0.03161624588427525, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.125, "completions/max_terminated_length": 406.125, "completions/mean_length": 221.861328125, "completions/mean_terminated_length": 221.861328125, "completions/min_length": 118.75, "completions/min_terminated_length": 118.75, "epoch": 0.2, "grad_norm": 0.019957855343818665, "learning_rate": 9.744734073316595e-07, "loss": -0.0003, "num_tokens": 46303244.0, "reward": 1.6937783360481262, "reward_std": 0.18066157400608063, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.6171875, "rewards/stop_prediction_reward/std": 0.12835253402590752, "rewards/waypoint_pred_accuracy/mean": 0.03829541802406311, "rewards/waypoint_pred_accuracy/std": 0.038537144660949735, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.5, "completions/max_terminated_length": 407.5, "completions/mean_length": 213.76953125, "completions/mean_terminated_length": 213.76953125, "completions/min_length": 101.625, "completions/min_terminated_length": 101.625, "epoch": 0.20210526315789473, "grad_norm": 0.02102278172969818, "learning_rate": 9.73362690789808e-07, "loss": -0.001, "num_tokens": 46770710.0, "reward": 1.7461132854223251, "reward_std": 0.28678105026483536, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.640625, "rewards/stop_prediction_reward/std": 0.21690494194626808, "rewards/waypoint_pred_accuracy/mean": 0.05372073073522188, "rewards/waypoint_pred_accuracy/std": 0.054212798771914095, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 219.494140625, "completions/mean_terminated_length": 219.494140625, "completions/min_length": 107.75, "completions/min_terminated_length": 107.75, "epoch": 0.20421052631578948, "grad_norm": 0.023958692327141762, "learning_rate": 9.722290574139486e-07, "loss": -0.0001, "num_tokens": 47243155.0, "reward": 1.747300535440445, "reward_std": 0.17760824598371983, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.7421875, "rewards/stop_prediction_reward/std": 0.15745450742542744, "rewards/waypoint_pred_accuracy/mean": 0.0035330769751453772, "rewards/waypoint_pred_accuracy/std": 0.017783919582143426, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.375, "completions/max_terminated_length": 380.375, "completions/mean_length": 215.083984375, "completions/mean_terminated_length": 215.083984375, "completions/min_length": 109.125, "completions/min_terminated_length": 109.125, "epoch": 0.2063157894736842, "grad_norm": 0.016943305730819702, "learning_rate": 9.71072568568222e-07, "loss": -0.0005, "num_tokens": 47713406.0, "reward": 1.851816326379776, "reward_std": 0.2104954868555069, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.849609375, "rewards/stop_prediction_reward/std": 0.1879090555012226, "rewards/waypoint_pred_accuracy/mean": 0.0020800431666430333, "rewards/waypoint_pred_accuracy/std": 0.011219009378692333, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.625, "completions/max_terminated_length": 392.625, "completions/mean_length": 211.724609375, "completions/mean_terminated_length": 211.724609375, "completions/min_length": 104.125, "completions/min_terminated_length": 104.125, "epoch": 0.20842105263157895, "grad_norm": 0.014324544928967953, "learning_rate": 9.698932868539475e-07, "loss": -0.0006, "num_tokens": 48180849.0, "reward": 1.6212971061468124, "reward_std": 0.16710768891550742, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.498046875, "rewards/stop_prediction_reward/std": 0.07980126701295376, "rewards/waypoint_pred_accuracy/mean": 0.06260167788853366, "rewards/waypoint_pred_accuracy/std": 0.04448781939231594, "step": 99 }, { "epoch": 0.21052631578947367, "grad_norm": 0.016340678557753563, "learning_rate": 9.686912761062337e-07, "loss": -0.001, "step": 100 }, { "epoch": 0.21052631578947367, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.00015625, "eval_completions/max_length": 457.89, "eval_completions/max_terminated_length": 457.89, "eval_completions/mean_length": 219.56675354003906, "eval_completions/mean_terminated_length": 219.59981658935547, "eval_completions/min_length": 112.23, "eval_completions/min_terminated_length": 113.11, "eval_loss": 0.0013147207209840417, "eval_num_tokens": 48647110.0, "eval_reward": 1.8446430933475495, "eval_reward_std": 0.20415274247365858, "eval_rewards/format_reward_embodied/mean": 0.99921875, "eval_rewards/format_reward_embodied/std": 0.00625, "eval_rewards/stop_prediction_reward/mean": 0.75734375, "eval_rewards/stop_prediction_reward/std": 0.1349847713112831, "eval_rewards/waypoint_pred_accuracy/mean": 0.04404030321765386, "eval_rewards/waypoint_pred_accuracy/std": 0.04408785154897764, "eval_runtime": 1143.7465, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.002, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.1875, "completions/max_terminated_length": 387.1875, "completions/mean_length": 211.564453125, "completions/mean_terminated_length": 211.564453125, "completions/min_length": 109.25, "completions/min_terminated_length": 109.25, "epoch": 0.21263157894736842, "grad_norm": 7.573169568786398e-05, "learning_rate": 9.674666013905223e-07, "loss": 0.0, "num_tokens": 49119859.0, "reward": 1.8795468658208847, "reward_std": 0.13644913337626896, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.8642578125, "rewards/stop_prediction_reward/std": 0.10762263275682926, "rewards/waypoint_pred_accuracy/mean": 0.007644527649440409, "rewards/waypoint_pred_accuracy/std": 0.02138127497060599, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.625, "completions/max_terminated_length": 423.625, "completions/mean_length": 216.150390625, "completions/mean_terminated_length": 216.150390625, "completions/min_length": 112.875, "completions/min_terminated_length": 112.875, "epoch": 0.21473684210526317, "grad_norm": 0.02024409919977188, "learning_rate": 9.662193289990683e-07, "loss": 0.0001, "num_tokens": 49589696.0, "reward": 1.9582752585411072, "reward_std": 0.170343800484261, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.059467025101184845, "rewards/waypoint_pred_accuracy/mean": 0.04359079086862039, "rewards/waypoint_pred_accuracy/std": 0.05229037126991898, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.875, "completions/max_terminated_length": 419.875, "completions/mean_length": 220.193359375, "completions/mean_terminated_length": 220.193359375, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.2168421052631579, "grad_norm": 0.020008524879813194, "learning_rate": 9.649495264473496e-07, "loss": -0.0008, "num_tokens": 50062179.0, "reward": 1.4985756427049637, "reward_std": 0.15706685557961464, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.15027354657649994, "rewards/waypoint_pred_accuracy/mean": 0.00026439113654092976, "rewards/waypoint_pred_accuracy/std": 0.0021146056694039514, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.375, "completions/max_terminated_length": 389.375, "completions/mean_length": 224.572265625, "completions/mean_terminated_length": 224.572265625, "completions/min_length": 113.5, "completions/min_terminated_length": 113.5, "epoch": 0.21894736842105264, "grad_norm": 0.010618757456541061, "learning_rate": 9.636572624704126e-07, "loss": 0.0008, "num_tokens": 50537928.0, "reward": 2.029994383454323, "reward_std": 0.16062275879085064, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.98046875, "rewards/stop_prediction_reward/std": 0.11734727956354618, "rewards/waypoint_pred_accuracy/mean": 0.02476281741601838, "rewards/waypoint_pred_accuracy/std": 0.035420115480633285, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.75, "completions/max_terminated_length": 375.75, "completions/mean_length": 208.478515625, "completions/mean_terminated_length": 208.478515625, "completions/min_length": 112.875, "completions/min_terminated_length": 112.875, "epoch": 0.22105263157894736, "grad_norm": 0.0, "learning_rate": 9.62342607019152e-07, "loss": 0.0003, "num_tokens": 51004477.0, "reward": 1.7948767840862274, "reward_std": 0.0999540267221164, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.734375, "rewards/stop_prediction_reward/std": 0.0729166679084301, "rewards/waypoint_pred_accuracy/mean": 0.03025089303362355, "rewards/waypoint_pred_accuracy/std": 0.024821761748978988, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.25, "completions/max_terminated_length": 391.25, "completions/mean_length": 218.388671875, "completions/mean_terminated_length": 218.388671875, "completions/min_length": 113.375, "completions/min_terminated_length": 113.375, "epoch": 0.2231578947368421, "grad_norm": 0.024067817255854607, "learning_rate": 9.610056312565245e-07, "loss": 0.0004, "num_tokens": 51477636.0, "reward": 2.026669681072235, "reward_std": 0.1696237076412217, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.8671875, "rewards/stop_prediction_reward/std": 0.09542626701295376, "rewards/waypoint_pred_accuracy/mean": 0.07974108902908483, "rewards/waypoint_pred_accuracy/std": 0.04533190939855558, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.75, "completions/max_terminated_length": 420.75, "completions/mean_length": 217.77734375, "completions/mean_terminated_length": 217.77734375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.22526315789473683, "grad_norm": 0.013704587705433369, "learning_rate": 9.596464075536963e-07, "loss": 0.0003, "num_tokens": 51951186.0, "reward": 1.8652345538139343, "reward_std": 0.09080793828513833, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.865234375, "rewards/stop_prediction_reward/std": 0.0908065214753151, "rewards/waypoint_pred_accuracy/mean": 9.960914946921718e-08, "rewards/waypoint_pred_accuracy/std": 7.087302705995463e-07, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 403.875, "completions/max_terminated_length": 403.875, "completions/mean_length": 217.728515625, "completions/mean_terminated_length": 218.15721321105957, "completions/min_length": 100.25, "completions/min_terminated_length": 116.625, "epoch": 0.22736842105263158, "grad_norm": 0.02499217353761196, "learning_rate": 9.582650094861256e-07, "loss": -0.0001, "num_tokens": 52426055.0, "reward": 1.8535159230232239, "reward_std": 0.1513118724361675, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.857421875, "rewards/stop_prediction_reward/std": 0.1338059287518263, "rewards/waypoint_pred_accuracy/mean": 1.5656563736666032e-07, "rewards/waypoint_pred_accuracy/std": 8.466679199507776e-07, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 217.58984375, "completions/mean_terminated_length": 217.58984375, "completions/min_length": 108.5, "completions/min_terminated_length": 108.5, "epoch": 0.2294736842105263, "grad_norm": 0.018018925562500954, "learning_rate": 9.568615118295798e-07, "loss": 0.0002, "num_tokens": 52898613.0, "reward": 1.5074883997440338, "reward_std": 0.08622953689905444, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.505859375, "rewards/stop_prediction_reward/std": 0.07980126701295376, "rewards/waypoint_pred_accuracy/mean": 0.0008145241825951426, "rewards/waypoint_pred_accuracy/std": 0.003929519949304638, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.625, "completions/max_terminated_length": 410.625, "completions/mean_length": 220.09375, "completions/mean_terminated_length": 220.09375, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.23157894736842105, "grad_norm": 0.026054121553897858, "learning_rate": 9.554359905560885e-07, "loss": 0.0009, "num_tokens": 53371493.0, "reward": 1.6272786408662796, "reward_std": 0.1345605030655861, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.607421875, "rewards/stop_prediction_reward/std": 0.10731646977365017, "rewards/waypoint_pred_accuracy/mean": 0.009928377814540968, "rewards/waypoint_pred_accuracy/std": 0.021277805810678764, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.5, "completions/max_terminated_length": 451.5, "completions/mean_length": 226.404296875, "completions/mean_terminated_length": 226.404296875, "completions/min_length": 104.75, "completions/min_terminated_length": 104.75, "epoch": 0.2336842105263158, "grad_norm": 0.019862722605466843, "learning_rate": 9.53988522829831e-07, "loss": -0.0002, "num_tokens": 53849844.0, "reward": 1.7356369495391846, "reward_std": 0.17475299397483468, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.6328125, "rewards/stop_prediction_reward/std": 0.0929968785494566, "rewards/waypoint_pred_accuracy/mean": 0.05238880167820055, "rewards/waypoint_pred_accuracy/std": 0.03885504556402302, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.125, "completions/max_terminated_length": 429.125, "completions/mean_length": 228.794921875, "completions/mean_terminated_length": 228.794921875, "completions/min_length": 102.625, "completions/min_terminated_length": 102.625, "epoch": 0.23578947368421052, "grad_norm": 0.013491553254425526, "learning_rate": 9.52519187002958e-07, "loss": -0.0013, "num_tokens": 54326667.0, "reward": 2.0176089107990265, "reward_std": 0.09997917944565415, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.990234375, "rewards/stop_prediction_reward/std": 0.06879601255059242, "rewards/waypoint_pred_accuracy/mean": 0.013687264542047125, "rewards/waypoint_pred_accuracy/std": 0.02499246856624292, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.25, "completions/max_terminated_length": 419.25, "completions/mean_length": 223.84765625, "completions/mean_terminated_length": 223.84765625, "completions/min_length": 116.125, "completions/min_terminated_length": 116.125, "epoch": 0.23789473684210527, "grad_norm": 0.0, "learning_rate": 9.510280626113524e-07, "loss": 0.0002, "num_tokens": 54804477.0, "reward": 2.0416500568389893, "reward_std": 0.15113097801804543, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.87109375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.08527813665590386, "rewards/waypoint_pred_accuracy/std": 0.06661079078944249, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 418.375, "completions/max_terminated_length": 418.375, "completions/mean_length": 230.408203125, "completions/mean_terminated_length": 230.9030590057373, "completions/min_length": 102.375, "completions/min_terminated_length": 116.25, "epoch": 0.24, "grad_norm": 0.027175500988960266, "learning_rate": 9.495152303703225e-07, "loss": -0.0, "num_tokens": 55282254.0, "reward": 1.9152479320764542, "reward_std": 0.15890819625928998, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.08138803765177727, "rewards/waypoint_pred_accuracy/mean": 0.024030234897509217, "rewards/waypoint_pred_accuracy/std": 0.038782567949965596, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.5, "completions/max_terminated_length": 379.5, "completions/mean_length": 218.421875, "completions/mean_terminated_length": 218.421875, "completions/min_length": 108.625, "completions/min_terminated_length": 108.625, "epoch": 0.24210526315789474, "grad_norm": 0.0, "learning_rate": 9.479807721702337e-07, "loss": 0.0009, "num_tokens": 55753702.0, "reward": 1.964790791273117, "reward_std": 0.14700112864375114, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.04880167031660676, "rewards/waypoint_pred_accuracy/std": 0.04630230367183685, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.75, "completions/max_terminated_length": 418.75, "completions/mean_length": 234.615234375, "completions/mean_terminated_length": 234.615234375, "completions/min_length": 107.25, "completions/min_terminated_length": 107.25, "epoch": 0.24421052631578946, "grad_norm": 0.01868380233645439, "learning_rate": 9.46424771072075e-07, "loss": -0.0016, "num_tokens": 56236001.0, "reward": 1.7634397000074387, "reward_std": 0.11784930247813463, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.744140625, "rewards/stop_prediction_reward/std": 0.08138803765177727, "rewards/waypoint_pred_accuracy/mean": 0.009649543033447117, "rewards/waypoint_pred_accuracy/std": 0.01907464297255501, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.5, "completions/max_terminated_length": 393.5, "completions/mean_length": 218.193359375, "completions/mean_terminated_length": 218.193359375, "completions/min_length": 111.75, "completions/min_terminated_length": 111.75, "epoch": 0.2463157894736842, "grad_norm": 0.013114881701767445, "learning_rate": 9.448473113029633e-07, "loss": 0.0009, "num_tokens": 56707844.0, "reward": 1.6348197907209396, "reward_std": 0.15209808605868602, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.513671875, "rewards/stop_prediction_reward/std": 0.08837713301181793, "rewards/waypoint_pred_accuracy/mean": 0.0605739434017778, "rewards/waypoint_pred_accuracy/std": 0.03603998847574985, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.625, "completions/max_terminated_length": 385.625, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 115.125, "completions/min_terminated_length": 115.125, "epoch": 0.24842105263157896, "grad_norm": 0.021751945838332176, "learning_rate": 9.432484782515842e-07, "loss": 0.0011, "num_tokens": 57177540.0, "reward": 1.8951680064201355, "reward_std": 0.09731905919034034, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.87109375, "rewards/stop_prediction_reward/std": 0.05317101255059242, "rewards/waypoint_pred_accuracy/mean": 0.012037134467476562, "rewards/waypoint_pred_accuracy/std": 0.022074746749979113, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.875, "completions/max_terminated_length": 733.875, "completions/mean_length": 230.4921875, "completions/mean_terminated_length": 230.4921875, "completions/min_length": 114.125, "completions/min_terminated_length": 114.125, "epoch": 0.2505263157894737, "grad_norm": 0.017523573711514473, "learning_rate": 9.416283584635699e-07, "loss": 0.0073, "num_tokens": 57653632.0, "reward": 1.7819086909294128, "reward_std": 0.13167815032647923, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.0625, "rewards/waypoint_pred_accuracy/mean": 0.018884045333834365, "rewards/waypoint_pred_accuracy/std": 0.04270522284787148, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.75, "completions/max_terminated_length": 383.75, "completions/mean_length": 230.603515625, "completions/mean_terminated_length": 230.603515625, "completions/min_length": 109.25, "completions/min_terminated_length": 109.25, "epoch": 0.25263157894736843, "grad_norm": 0.020878760144114494, "learning_rate": 9.399870396368137e-07, "loss": -0.0003, "num_tokens": 58132533.0, "reward": 1.75615593791008, "reward_std": 0.08499195147633398, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.744140625, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 0.006007667677934543, "rewards/waypoint_pred_accuracy/std": 0.023722964530922208, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.75, "completions/max_terminated_length": 393.75, "completions/mean_length": 228.326171875, "completions/mean_terminated_length": 228.326171875, "completions/min_length": 110.875, "completions/min_terminated_length": 110.875, "epoch": 0.25473684210526315, "grad_norm": 0.030050212517380714, "learning_rate": 9.383246106167244e-07, "loss": -0.0007, "num_tokens": 58610652.0, "reward": 2.005710780620575, "reward_std": 0.1616469284220443, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.8671875, "rewards/stop_prediction_reward/std": 0.05317101255059242, "rewards/waypoint_pred_accuracy/mean": 0.06926164017690084, "rewards/waypoint_pred_accuracy/std": 0.06174082592511354, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.75, "completions/max_terminated_length": 411.75, "completions/mean_length": 235.259765625, "completions/mean_terminated_length": 235.259765625, "completions/min_length": 119.75, "completions/min_terminated_length": 119.75, "epoch": 0.25684210526315787, "grad_norm": 0.05214720964431763, "learning_rate": 9.366411613914151e-07, "loss": 0.0, "num_tokens": 59091681.0, "reward": 2.0128368139266968, "reward_std": 0.041966003568632004, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.998046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.007394973285158812, "rewards/waypoint_pred_accuracy/std": 0.013188560594699084, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.375, "completions/max_terminated_length": 738.375, "completions/mean_length": 229.73828125, "completions/mean_terminated_length": 229.73828125, "completions/min_length": 113.125, "completions/min_terminated_length": 113.125, "epoch": 0.25894736842105265, "grad_norm": 0.020252572372555733, "learning_rate": 9.349367830868338e-07, "loss": 0.0065, "num_tokens": 59571867.0, "reward": 1.7482015490531921, "reward_std": 0.07813655398786068, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.75390625, "rewards/stop_prediction_reward/std": 0.042255254462361336, "rewards/waypoint_pred_accuracy/mean": 7.734416431048885e-05, "rewards/waypoint_pred_accuracy/std": 0.00014155825192574412, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.5, "completions/max_terminated_length": 421.5, "completions/mean_length": 232.515625, "completions/mean_terminated_length": 232.515625, "completions/min_length": 116.625, "completions/min_terminated_length": 116.625, "epoch": 0.26105263157894737, "grad_norm": 0.012004735879600048, "learning_rate": 9.332115679618299e-07, "loss": -0.0002, "num_tokens": 60051875.0, "reward": 1.87109375, "reward_std": 0.06475212238729, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.059467025101184845, "rewards/waypoint_pred_accuracy/mean": 3.7687339174058405e-25, "rewards/waypoint_pred_accuracy/std": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.375, "completions/max_terminated_length": 373.375, "completions/mean_length": 221.771484375, "completions/mean_terminated_length": 221.771484375, "completions/min_length": 113.625, "completions/min_terminated_length": 113.625, "epoch": 0.2631578947368421, "grad_norm": 0.01381534244865179, "learning_rate": 9.3146560940316e-07, "loss": -0.0001, "num_tokens": 60525038.0, "reward": 1.8031584918498993, "reward_std": 0.08486801406252198, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.021921012550592422, "rewards/waypoint_pred_accuracy/mean": 0.02853236788253366, "rewards/waypoint_pred_accuracy/std": 0.03149077049183591, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.125, "completions/max_terminated_length": 379.125, "completions/mean_length": 225.11328125, "completions/mean_terminated_length": 225.11328125, "completions/min_length": 106.375, "completions/min_terminated_length": 106.375, "epoch": 0.26526315789473687, "grad_norm": 0.0, "learning_rate": 9.296990019204335e-07, "loss": 0.0003, "num_tokens": 61001576.0, "reward": 1.838148683309555, "reward_std": 0.13138162437826395, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.1065743277722504, "rewards/waypoint_pred_accuracy/std": 0.058355900342576206, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.5, "completions/max_terminated_length": 596.5, "completions/mean_length": 223.703125, "completions/mean_terminated_length": 223.703125, "completions/min_length": 122.375, "completions/min_terminated_length": 122.375, "epoch": 0.2673684210526316, "grad_norm": 0.015371584333479404, "learning_rate": 9.279118411409962e-07, "loss": 0.0076, "num_tokens": 61475152.0, "reward": 2.0289103388786316, "reward_std": 0.14714059105608612, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.986328125, "rewards/stop_prediction_reward/std": 0.05234810337424278, "rewards/waypoint_pred_accuracy/mean": 0.02226770008679324, "rewards/waypoint_pred_accuracy/std": 0.03958419876477137, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 225.896484375, "completions/mean_terminated_length": 225.896484375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2694736842105263, "grad_norm": 0.0, "learning_rate": 9.261042238047539e-07, "loss": 0.0003, "num_tokens": 61948507.0, "reward": 1.7672365009784698, "reward_std": 0.08540481339514372, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07111826360536146, "rewards/waypoint_pred_accuracy/std": 0.04270240558859675, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 229.087890625, "completions/mean_terminated_length": 229.087890625, "completions/min_length": 122.25, "completions/min_terminated_length": 122.25, "epoch": 0.27157894736842103, "grad_norm": 0.0, "learning_rate": 9.242762477589369e-07, "loss": -0.0008, "num_tokens": 62425864.0, "reward": 1.7555950731039047, "reward_std": 0.150370123796165, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.06725065042865234, "rewards/waypoint_pred_accuracy/std": 0.060643474211165796, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 377.25, "completions/max_terminated_length": 377.25, "completions/mean_length": 218.873046875, "completions/mean_terminated_length": 219.36554527282715, "completions/min_length": 95.875, "completions/min_terminated_length": 110.5, "epoch": 0.2736842105263158, "grad_norm": 0.015500541776418686, "learning_rate": 9.224280119528013e-07, "loss": -0.0006, "num_tokens": 62900679.0, "reward": 1.86328125, "reward_std": 0.08442101255059242, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 5.195248799303594e-14, "rewards/waypoint_pred_accuracy/std": 3.9519006556770764e-13, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.375, "completions/max_terminated_length": 389.375, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 116.375, "completions/min_terminated_length": 116.375, "epoch": 0.27578947368421053, "grad_norm": 0.0, "learning_rate": 9.205596164322753e-07, "loss": 0.0005, "num_tokens": 63375047.0, "reward": 1.8198472261428833, "reward_std": 0.06600932776927948, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.03687673434615135, "rewards/waypoint_pred_accuracy/std": 0.023463066667318344, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.875, "completions/max_terminated_length": 386.875, "completions/mean_length": 223.994140625, "completions/mean_terminated_length": 223.994140625, "completions/min_length": 106.125, "completions/min_terminated_length": 106.125, "epoch": 0.27789473684210525, "grad_norm": 5.267659071250819e-05, "learning_rate": 9.186711623345419e-07, "loss": 0.0007, "num_tokens": 63848068.0, "reward": 1.8460404872894287, "reward_std": 0.07753154253146377, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.04802026903570478, "rewards/waypoint_pred_accuracy/std": 0.03876577728947339, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.625, "completions/max_terminated_length": 370.625, "completions/mean_length": 216.8359375, "completions/mean_terminated_length": 216.8359375, "completions/min_length": 113.25, "completions/min_terminated_length": 113.25, "epoch": 0.28, "grad_norm": 0.019873064011335373, "learning_rate": 9.167627518825651e-07, "loss": -0.0003, "num_tokens": 64319408.0, "reward": 1.66434708237648, "reward_std": 0.10717772238422185, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.619140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.022603242181673977, "rewards/waypoint_pred_accuracy/std": 0.030156509747939708, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.75, "completions/max_terminated_length": 394.75, "completions/mean_length": 221.185546875, "completions/mean_terminated_length": 221.185546875, "completions/min_length": 105.75, "completions/min_terminated_length": 105.75, "epoch": 0.28210526315789475, "grad_norm": 0.016748478636145592, "learning_rate": 9.148344883795563e-07, "loss": 0.0002, "num_tokens": 64797519.0, "reward": 1.6543543934822083, "reward_std": 0.09143044333904982, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0625, "rewards/waypoint_pred_accuracy/mean": 0.014677208887757254, "rewards/waypoint_pred_accuracy/std": 0.02070214717129737, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.125, "completions/max_terminated_length": 347.125, "completions/mean_length": 207.6796875, "completions/mean_terminated_length": 207.6796875, "completions/min_length": 100.75, "completions/min_terminated_length": 100.75, "epoch": 0.28421052631578947, "grad_norm": 0.018281355500221252, "learning_rate": 9.128864762033824e-07, "loss": 0.0009, "num_tokens": 65264811.0, "reward": 1.8661604225635529, "reward_std": 0.13888043258339167, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 0.05905676480875123, "rewards/waypoint_pred_accuracy/std": 0.053127349918608825, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.625, "completions/max_terminated_length": 354.625, "completions/mean_length": 213.609375, "completions/mean_terminated_length": 213.609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.2863157894736842, "grad_norm": 0.01513614784926176, "learning_rate": 9.10918820800916e-07, "loss": -0.0, "num_tokens": 65734563.0, "reward": 1.7799546718597412, "reward_std": 0.05648380851107504, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.751953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.014000790968564693, "rewards/waypoint_pred_accuracy/std": 0.0204294033423741, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.5, "completions/max_terminated_length": 359.5, "completions/mean_length": 209.94140625, "completions/mean_terminated_length": 209.94140625, "completions/min_length": 112.625, "completions/min_terminated_length": 112.625, "epoch": 0.28842105263157897, "grad_norm": 0.014532121829688549, "learning_rate": 9.089316286823274e-07, "loss": -0.0006, "num_tokens": 66202821.0, "reward": 1.8316712975502014, "reward_std": 0.14127142806610715, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.042788804041257456, "rewards/waypoint_pred_accuracy/std": 0.05501071766701504, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.75, "completions/max_terminated_length": 384.75, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.2905263157894737, "grad_norm": 0.013312633149325848, "learning_rate": 9.069250074153191e-07, "loss": -0.0001, "num_tokens": 66681989.0, "reward": 1.9901870042085648, "reward_std": 0.08008617826271802, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.98828125, "rewards/stop_prediction_reward/std": 0.07509202510118484, "rewards/waypoint_pred_accuracy/mean": 0.0009528863083687822, "rewards/waypoint_pred_accuracy/std": 0.007327620231080863, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.375, "completions/max_terminated_length": 373.375, "completions/mean_length": 206.775390625, "completions/mean_terminated_length": 206.775390625, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.2926315789473684, "grad_norm": 0.0, "learning_rate": 9.048990656193024e-07, "loss": 0.0, "num_tokens": 67151378.0, "reward": 1.8710939586162567, "reward_std": 0.03125098500925105, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.87109375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 1.0124033455018937e-07, "rewards/waypoint_pred_accuracy/std": 4.912120291946227e-07, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.25, "completions/max_terminated_length": 365.25, "completions/mean_length": 223.37109375, "completions/mean_terminated_length": 223.37109375, "completions/min_length": 115.875, "completions/min_terminated_length": 115.875, "epoch": 0.29473684210526313, "grad_norm": 0.020622270181775093, "learning_rate": 9.028539129595197e-07, "loss": -0.0001, "num_tokens": 67625936.0, "reward": 1.9263398349285126, "reward_std": 0.07740109786391258, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.025669913738965988, "rewards/waypoint_pred_accuracy/std": 0.028465650044381622, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.75, "completions/max_terminated_length": 378.75, "completions/mean_length": 213.451171875, "completions/mean_terminated_length": 213.451171875, "completions/min_length": 107.875, "completions/min_terminated_length": 107.875, "epoch": 0.2968421052631579, "grad_norm": 0.01621430739760399, "learning_rate": 9.00789660141106e-07, "loss": -0.0004, "num_tokens": 68094391.0, "reward": 1.8975248336791992, "reward_std": 0.056784010463506895, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.012238975709265076, "rewards/waypoint_pred_accuracy/std": 0.02289922521287524, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.125, "completions/max_terminated_length": 366.125, "completions/mean_length": 207.638671875, "completions/mean_terminated_length": 207.638671875, "completions/min_length": 111.25, "completions/min_terminated_length": 111.25, "epoch": 0.29894736842105263, "grad_norm": 0.0012398953549563885, "learning_rate": 8.987064189030983e-07, "loss": -0.0, "num_tokens": 68561854.0, "reward": 1.809123456478119, "reward_std": 0.05541337472914165, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.029561737023570345, "rewards/waypoint_pred_accuracy/std": 0.027706685019552424, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.5, "completions/max_terminated_length": 336.5, "completions/mean_length": 201.529296875, "completions/mean_terminated_length": 201.529296875, "completions/min_length": 111.875, "completions/min_terminated_length": 111.875, "epoch": 0.30105263157894735, "grad_norm": 0.0, "learning_rate": 8.966043020123855e-07, "loss": 0.0001, "num_tokens": 69026509.0, "reward": 2.1186273992061615, "reward_std": 0.09686689289469541, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.994140625, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 0.06224340945057809, "rewards/waypoint_pred_accuracy/std": 0.029665624278586974, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.5, "completions/max_terminated_length": 371.5, "completions/mean_length": 212.501953125, "completions/mean_terminated_length": 212.501953125, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.3031578947368421, "grad_norm": 0.0, "learning_rate": 8.944834232576054e-07, "loss": 0.0002, "num_tokens": 69494414.0, "reward": 2.0669292509555817, "reward_std": 0.09019226813688874, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.99609375, "rewards/stop_prediction_reward/std": 0.021921012550592422, "rewards/waypoint_pred_accuracy/mean": 0.03541775744817477, "rewards/waypoint_pred_accuracy/std": 0.03677688956680679, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.875, "completions/max_terminated_length": 377.875, "completions/mean_length": 214.53125, "completions/mean_terminated_length": 214.53125, "completions/min_length": 111.875, "completions/min_terminated_length": 111.875, "epoch": 0.30526315789473685, "grad_norm": 0.0, "learning_rate": 8.923438974429849e-07, "loss": -0.0, "num_tokens": 69966238.0, "reward": 1.500933289527893, "reward_std": 0.022255118004977703, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.498046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0014432001626119018, "rewards/waypoint_pred_accuracy/std": 0.0033150608651340008, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.375, "completions/max_terminated_length": 341.375, "completions/mean_length": 207.609375, "completions/mean_terminated_length": 207.609375, "completions/min_length": 108.625, "completions/min_terminated_length": 108.625, "epoch": 0.30736842105263157, "grad_norm": 0.01606505736708641, "learning_rate": 8.901858403821253e-07, "loss": -0.0003, "num_tokens": 70430934.0, "reward": 1.9067281186580658, "reward_std": 0.11152400076389313, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 0.08031718447636607, "rewards/waypoint_pred_accuracy/std": 0.041061242358370054, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.125, "completions/max_terminated_length": 339.125, "completions/mean_length": 200.755859375, "completions/mean_terminated_length": 200.755859375, "completions/min_length": 108.125, "completions/min_terminated_length": 108.125, "epoch": 0.3094736842105263, "grad_norm": 0.022619424387812614, "learning_rate": 8.880093688917338e-07, "loss": 0.0006, "num_tokens": 70895897.0, "reward": 1.979694738984108, "reward_std": 0.10517071333015338, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.052347380133141996, "rewards/waypoint_pred_accuracy/std": 0.05258536203473341, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.25, "completions/max_terminated_length": 351.25, "completions/mean_length": 209.3203125, "completions/mean_terminated_length": 209.3203125, "completions/min_length": 106.375, "completions/min_terminated_length": 106.375, "epoch": 0.31157894736842107, "grad_norm": 0.015080302953720093, "learning_rate": 8.858146007853e-07, "loss": 0.0007, "num_tokens": 71362301.0, "reward": 1.7541356086730957, "reward_std": 0.0744232046417892, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.003044351096450817, "rewards/waypoint_pred_accuracy/std": 0.014287834603145175, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.25, "completions/max_terminated_length": 339.25, "completions/mean_length": 204.564453125, "completions/mean_terminated_length": 204.564453125, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.3136842105263158, "grad_norm": 0.014518975280225277, "learning_rate": 8.836016548667178e-07, "loss": 0.0, "num_tokens": 71831198.0, "reward": 1.7602195739746094, "reward_std": 0.04601290519349277, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.751953125, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 0.004133254632145133, "rewards/waypoint_pred_accuracy/std": 0.004251232765162753, "step": 149 }, { "epoch": 0.3157894736842105, "grad_norm": 0.0, "learning_rate": 8.813706509238558e-07, "loss": -0.0002, "step": 150 }, { "epoch": 0.3157894736842105, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 410.41, "eval_completions/max_terminated_length": 410.41, "eval_completions/mean_length": 207.72815979003906, "eval_completions/mean_terminated_length": 207.72815979003906, "eval_completions/min_length": 111.67, "eval_completions/min_terminated_length": 111.67, "eval_loss": 0.0016332893865182996, "eval_num_tokens": 72298062.0, "eval_reward": 1.8651788556575775, "eval_reward_std": 0.10482742591684201, "eval_rewards/format_reward_embodied/mean": 0.9990625, "eval_rewards/format_reward_embodied/std": 0.0075, "eval_rewards/stop_prediction_reward/mean": 0.76859375, "eval_rewards/stop_prediction_reward/std": 0.020503681004047394, "eval_rewards/waypoint_pred_accuracy/mean": 0.04876130852479674, "eval_rewards/waypoint_pred_accuracy/std": 0.0403332443083783, "eval_runtime": 1073.7169, "eval_samples_per_second": 0.093, "eval_steps_per_second": 0.002, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.75, "completions/max_terminated_length": 359.75, "completions/mean_length": 208.6748046875, "completions/mean_terminated_length": 208.6748046875, "completions/min_length": 112.5, "completions/min_terminated_length": 112.5, "epoch": 0.3178947368421053, "grad_norm": 0.013742033392190933, "learning_rate": 8.791217097220724e-07, "loss": -0.0001, "num_tokens": 72761681.0, "reward": 1.875585325062275, "reward_std": 0.023173605810638875, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.8720703125, "rewards/stop_prediction_reward/std": 0.01877300627529621, "rewards/waypoint_pred_accuracy/mean": 0.0017575172029685837, "rewards/waypoint_pred_accuracy/std": 0.0022003025424544072, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.375, "completions/max_terminated_length": 350.375, "completions/mean_length": 210.005859375, "completions/mean_terminated_length": 210.005859375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.32, "grad_norm": 0.010572736151516438, "learning_rate": 8.768549529976783e-07, "loss": 0.0006, "num_tokens": 73228180.0, "reward": 1.9192677438259125, "reward_std": 0.049104438461654354, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.022133867223146808, "rewards/waypoint_pred_accuracy/std": 0.02455222301614413, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 197.775390625, "completions/mean_terminated_length": 197.775390625, "completions/min_length": 107.625, "completions/min_terminated_length": 107.625, "epoch": 0.32210526315789473, "grad_norm": 0.01287839561700821, "learning_rate": 8.74570503451348e-07, "loss": 0.0003, "num_tokens": 73691105.0, "reward": 1.9361660480499268, "reward_std": 0.10891422609623902, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.87109375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.03253614324701938, "rewards/waypoint_pred_accuracy/std": 0.03883211300195525, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 202.033203125, "completions/mean_terminated_length": 202.033203125, "completions/min_length": 115.75, "completions/min_terminated_length": 115.75, "epoch": 0.32421052631578945, "grad_norm": 0.018562331795692444, "learning_rate": 8.72268484741477e-07, "loss": -0.0005, "num_tokens": 74156146.0, "reward": 1.8251054883003235, "reward_std": 0.1710243321698499, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.1000527671312958, "rewards/waypoint_pred_accuracy/std": 0.07437628054339515, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 204.990234375, "completions/mean_terminated_length": 204.990234375, "completions/min_length": 115.75, "completions/min_terminated_length": 115.75, "epoch": 0.3263157894736842, "grad_norm": 0.018523743376135826, "learning_rate": 8.699490214774881e-07, "loss": 0.0002, "num_tokens": 74622701.0, "reward": 1.7455661296844482, "reward_std": 0.05205453363100787, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.744140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.0007127649293749982, "rewards/waypoint_pred_accuracy/std": 0.0025897676093791233, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.75, "completions/max_terminated_length": 333.75, "completions/mean_length": 193.486328125, "completions/mean_terminated_length": 193.486328125, "completions/min_length": 102.625, "completions/min_terminated_length": 102.625, "epoch": 0.32842105263157895, "grad_norm": 0.0, "learning_rate": 8.676122392130872e-07, "loss": 0.0, "num_tokens": 75081254.0, "reward": 1.623046875, "reward_std": 0.015625, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 7.546804675536426e-25, "rewards/waypoint_pred_accuracy/std": 4.67924008313652e-24, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.25, "completions/max_terminated_length": 352.25, "completions/mean_length": 199.650390625, "completions/mean_terminated_length": 199.650390625, "completions/min_length": 109.75, "completions/min_terminated_length": 109.75, "epoch": 0.33052631578947367, "grad_norm": 0.0, "learning_rate": 8.652582644394657e-07, "loss": 0.001, "num_tokens": 75543923.0, "reward": 1.763193666934967, "reward_std": 0.043326430561137386, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.007573404463160793, "rewards/waypoint_pred_accuracy/std": 0.013850717227414266, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.375, "completions/max_terminated_length": 351.375, "completions/mean_length": 211.6484375, "completions/mean_terminated_length": 211.6484375, "completions/min_length": 105.125, "completions/min_terminated_length": 105.125, "epoch": 0.33263157894736844, "grad_norm": 0.015552366152405739, "learning_rate": 8.628872245784545e-07, "loss": 0.0007, "num_tokens": 76014975.0, "reward": 1.7972655892372131, "reward_std": 0.11691518849693239, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.025585949169908424, "rewards/waypoint_pred_accuracy/std": 0.027767218511144182, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 200.451171875, "completions/mean_terminated_length": 200.451171875, "completions/min_length": 113.625, "completions/min_terminated_length": 113.625, "epoch": 0.33473684210526317, "grad_norm": 0.012406791560351849, "learning_rate": 8.60499247975626e-07, "loss": 0.0001, "num_tokens": 76479526.0, "reward": 1.692464992403984, "reward_std": 0.10753844678401947, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0327559362485772, "rewards/waypoint_pred_accuracy/std": 0.0465778008219786, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.875, "completions/max_terminated_length": 320.875, "completions/mean_length": 193.51171875, "completions/mean_terminated_length": 193.51171875, "completions/min_length": 102.875, "completions/min_terminated_length": 102.875, "epoch": 0.3368421052631579, "grad_norm": 0.030549675226211548, "learning_rate": 8.58094463893347e-07, "loss": 0.0002, "num_tokens": 76938732.0, "reward": 1.9054777026176453, "reward_std": 0.05647589443033718, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.016215412090739154, "rewards/waypoint_pred_accuracy/std": 0.02042544638196753, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.125, "completions/max_terminated_length": 334.125, "completions/mean_length": 199.755859375, "completions/mean_terminated_length": 199.755859375, "completions/min_length": 113.875, "completions/min_terminated_length": 113.875, "epoch": 0.3389473684210526, "grad_norm": 0.014368158765137196, "learning_rate": 8.556730025037819e-07, "loss": -0.0002, "num_tokens": 77399855.0, "reward": 1.9117814898490906, "reward_std": 0.06853678584000633, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.020343898673879646, "rewards/waypoint_pred_accuracy/std": 0.018643394690372794, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.125, "completions/max_terminated_length": 311.125, "completions/mean_length": 194.9609375, "completions/mean_terminated_length": 194.9609375, "completions/min_length": 111.875, "completions/min_terminated_length": 111.875, "epoch": 0.3410526315789474, "grad_norm": 0.0, "learning_rate": 8.532349948818453e-07, "loss": 0.0001, "num_tokens": 77858715.0, "reward": 1.5617362409830093, "reward_std": 0.09266455079254143, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.030868121356284917, "rewards/waypoint_pred_accuracy/std": 0.04633227763988046, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.125, "completions/max_terminated_length": 359.125, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 114.25, "completions/min_terminated_length": 114.25, "epoch": 0.3431578947368421, "grad_norm": 0.0, "learning_rate": 8.507805729981081e-07, "loss": -0.0003, "num_tokens": 78320411.0, "reward": 1.5313882529735565, "reward_std": 0.13795647164806724, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.375, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07819415128325216, "rewards/waypoint_pred_accuracy/std": 0.06897824443884724, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.125, "completions/max_terminated_length": 356.125, "completions/mean_length": 200.798828125, "completions/mean_terminated_length": 200.798828125, "completions/min_length": 112.75, "completions/min_terminated_length": 112.75, "epoch": 0.3452631578947368, "grad_norm": 0.0, "learning_rate": 8.483098697116535e-07, "loss": -0.0002, "num_tokens": 78785204.0, "reward": 1.8663674592971802, "reward_std": 0.0740682063976692, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05818372880702327, "rewards/waypoint_pred_accuracy/std": 0.03703410336356683, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 184.767578125, "completions/mean_terminated_length": 184.767578125, "completions/min_length": 108.125, "completions/min_terminated_length": 108.125, "epoch": 0.3473684210526316, "grad_norm": 0.014639639295637608, "learning_rate": 8.45823018762885e-07, "loss": 0.0004, "num_tokens": 79241469.0, "reward": 1.5147821009159088, "reward_std": 0.028108830246765137, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.007391049890770773, "rewards/waypoint_pred_accuracy/std": 0.01405441654196693, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.125, "completions/max_terminated_length": 327.125, "completions/mean_length": 188.45703125, "completions/mean_terminated_length": 188.45703125, "completions/min_length": 108.75, "completions/min_terminated_length": 108.75, "epoch": 0.3494736842105263, "grad_norm": 0.01364043541252613, "learning_rate": 8.43320154766287e-07, "loss": 0.0, "num_tokens": 79696807.0, "reward": 1.9586426615715027, "reward_std": 0.09136595235713685, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.04182136098216205, "rewards/waypoint_pred_accuracy/std": 0.04568298065798615, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.375, "completions/max_terminated_length": 332.375, "completions/mean_length": 192.078125, "completions/mean_terminated_length": 192.078125, "completions/min_length": 112.125, "completions/min_terminated_length": 112.125, "epoch": 0.35157894736842105, "grad_norm": 0.0016441630432382226, "learning_rate": 8.408014132031385e-07, "loss": -0.0002, "num_tokens": 80154511.0, "reward": 1.753662645816803, "reward_std": 0.020446277248106215, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0018313333685417293, "rewards/waypoint_pred_accuracy/std": 0.010223136260532173, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.625, "completions/max_terminated_length": 350.625, "completions/mean_length": 195.697265625, "completions/mean_terminated_length": 195.697265625, "completions/min_length": 111.25, "completions/min_terminated_length": 111.25, "epoch": 0.35368421052631577, "grad_norm": 0.0191575326025486, "learning_rate": 8.382669304141789e-07, "loss": 0.0002, "num_tokens": 80616180.0, "reward": 1.7313858270645142, "reward_std": 0.07452307981657214, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05319291944942961, "rewards/waypoint_pred_accuracy/std": 0.03726154523974404, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.125, "completions/max_terminated_length": 339.125, "completions/mean_length": 199.1796875, "completions/mean_terminated_length": 199.1796875, "completions/min_length": 113.125, "completions/min_terminated_length": 113.125, "epoch": 0.35578947368421054, "grad_norm": 0.0011054413625970483, "learning_rate": 8.35716843592228e-07, "loss": -0.0002, "num_tokens": 81079184.0, "reward": 1.877403125166893, "reward_std": 0.014616520323585291, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0012015849925077877, "rewards/waypoint_pred_accuracy/std": 0.007308262612468781, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.625, "completions/max_terminated_length": 345.625, "completions/mean_length": 194.3984375, "completions/mean_terminated_length": 194.3984375, "completions/min_length": 118.875, "completions/min_terminated_length": 118.875, "epoch": 0.35789473684210527, "grad_norm": 0.04289071634411812, "learning_rate": 8.331512907747596e-07, "loss": -0.0002, "num_tokens": 81539356.0, "reward": 1.8431425243616104, "reward_std": 0.17017098766780236, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.10907126411490253, "rewards/waypoint_pred_accuracy/std": 0.0850854907983205, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.125, "completions/max_terminated_length": 312.125, "completions/mean_length": 189.50390625, "completions/mean_terminated_length": 189.50390625, "completions/min_length": 114.375, "completions/min_terminated_length": 114.375, "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 8.305704108364301e-07, "loss": 0.0003, "num_tokens": 81995934.0, "reward": 1.8737435936927795, "reward_std": 0.02003212797418996, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0003483741428915721, "rewards/waypoint_pred_accuracy/std": 0.0022035639689050868, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.625, "completions/max_terminated_length": 353.625, "completions/mean_length": 196.31640625, "completions/mean_terminated_length": 196.31640625, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "epoch": 0.36210526315789476, "grad_norm": 0.014004958793520927, "learning_rate": 8.279743434815599e-07, "loss": 0.0, "num_tokens": 82457920.0, "reward": 1.6893496811389923, "reward_std": 0.07697248342446983, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.03217484994092959, "rewards/waypoint_pred_accuracy/std": 0.038486242177896396, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.375, "completions/max_terminated_length": 310.375, "completions/mean_length": 184.642578125, "completions/mean_terminated_length": 184.642578125, "completions/min_length": 110.875, "completions/min_terminated_length": 110.875, "epoch": 0.3642105263157895, "grad_norm": 0.013178675435483456, "learning_rate": 8.253632292365726e-07, "loss": 0.0, "num_tokens": 82915145.0, "reward": 2.0115868896245956, "reward_std": 0.08305720053613186, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06829345226288144, "rewards/waypoint_pred_accuracy/std": 0.041528596542799806, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.125, "completions/max_terminated_length": 318.125, "completions/mean_length": 190.236328125, "completions/mean_terminated_length": 190.236328125, "completions/min_length": 112.125, "completions/min_terminated_length": 112.125, "epoch": 0.3663157894736842, "grad_norm": 0.02363615669310093, "learning_rate": 8.227372094423864e-07, "loss": -0.0001, "num_tokens": 83374914.0, "reward": 2.0038606971502304, "reward_std": 0.14384562149643898, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0644303746521473, "rewards/waypoint_pred_accuracy/std": 0.07192281540483236, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.125, "completions/max_terminated_length": 332.125, "completions/mean_length": 180.150390625, "completions/mean_terminated_length": 180.150390625, "completions/min_length": 106.875, "completions/min_terminated_length": 106.875, "epoch": 0.3684210526315789, "grad_norm": 0.0, "learning_rate": 8.200964262467656e-07, "loss": 0.0001, "num_tokens": 83825615.0, "reward": 1.4097924530506134, "reward_std": 0.04181510955095291, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.375, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0173962339758873, "rewards/waypoint_pred_accuracy/std": 0.020907556638121605, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.625, "completions/max_terminated_length": 307.625, "completions/mean_length": 184.662109375, "completions/mean_terminated_length": 184.662109375, "completions/min_length": 108.25, "completions/min_terminated_length": 108.25, "epoch": 0.3705263157894737, "grad_norm": 0.01097350474447012, "learning_rate": 8.174410225966239e-07, "loss": 0.0001, "num_tokens": 84280098.0, "reward": 2.0639708340168, "reward_std": 0.17662093978196225, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.09448541931482657, "rewards/waypoint_pred_accuracy/std": 0.0883104762174689, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.375, "completions/max_terminated_length": 311.375, "completions/mean_length": 179.3046875, "completions/mean_terminated_length": 179.3046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3726315789473684, "grad_norm": 0.0, "learning_rate": 8.147711422302881e-07, "loss": 0.0, "num_tokens": 84732926.0, "reward": 1.5250985324382782, "reward_std": 0.028973333232215737, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.01254925754006564, "rewards/waypoint_pred_accuracy/std": 0.014486670532335214, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.625, "completions/max_terminated_length": 315.625, "completions/mean_length": 182.26171875, "completions/mean_terminated_length": 182.26171875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.37473684210526315, "grad_norm": 0.010854351334273815, "learning_rate": 8.120869296697162e-07, "loss": -0.0, "num_tokens": 85187204.0, "reward": 1.7528592348098755, "reward_std": 0.01418565196615873, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0014296227405005812, "rewards/waypoint_pred_accuracy/std": 0.007092827672981208, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 185.01953125, "completions/mean_terminated_length": 185.01953125, "completions/min_length": 105.25, "completions/min_terminated_length": 105.25, "epoch": 0.37684210526315787, "grad_norm": 0.022924024611711502, "learning_rate": 8.093885302126754e-07, "loss": 0.0073, "num_tokens": 85641038.0, "reward": 1.7433076351881027, "reward_std": 0.06334595192311099, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.744140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.0005600605727522634, "rewards/waypoint_pred_accuracy/std": 0.003230394551792415, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 179.8359375, "completions/mean_terminated_length": 179.8359375, "completions/min_length": 108.625, "completions/min_terminated_length": 108.625, "epoch": 0.37894736842105264, "grad_norm": 0.0, "learning_rate": 8.06676089924877e-07, "loss": 0.0004, "num_tokens": 86092666.0, "reward": 1.8267612159252167, "reward_std": 0.1466955652579145, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.10088061677343774, "rewards/waypoint_pred_accuracy/std": 0.0733477777656617, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.75, "completions/max_terminated_length": 298.75, "completions/mean_length": 184.97265625, "completions/mean_terminated_length": 184.97265625, "completions/min_length": 111.625, "completions/min_terminated_length": 111.625, "epoch": 0.38105263157894737, "grad_norm": 0.0, "learning_rate": 8.03949755632069e-07, "loss": -0.0004, "num_tokens": 86546476.0, "reward": 1.7814702987670898, "reward_std": 0.08937373897060752, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07823515147902071, "rewards/waypoint_pred_accuracy/std": 0.04468687262851745, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.625, "completions/max_terminated_length": 309.625, "completions/mean_length": 179.16015625, "completions/mean_terminated_length": 179.16015625, "completions/min_length": 105.625, "completions/min_terminated_length": 105.625, "epoch": 0.3831578947368421, "grad_norm": 0.02093261480331421, "learning_rate": 8.01209674912089e-07, "loss": 0.0, "num_tokens": 86998398.0, "reward": 1.9096409678459167, "reward_std": 0.0670090508647263, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07982050045393407, "rewards/waypoint_pred_accuracy/std": 0.033504527527838945, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.375, "completions/max_terminated_length": 286.375, "completions/mean_length": 171.54296875, "completions/mean_terminated_length": 171.54296875, "completions/min_length": 102.875, "completions/min_terminated_length": 102.875, "epoch": 0.38526315789473686, "grad_norm": 0.01432411465793848, "learning_rate": 7.984559960868759e-07, "loss": -0.0004, "num_tokens": 87445908.0, "reward": 1.6769181191921234, "reward_std": 0.10013374220579863, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.02595905796625718, "rewards/waypoint_pred_accuracy/std": 0.05006687459543879, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.375, "completions/max_terminated_length": 317.375, "completions/mean_length": 181.97265625, "completions/mean_terminated_length": 181.97265625, "completions/min_length": 106.875, "completions/min_terminated_length": 106.875, "epoch": 0.3873684210526316, "grad_norm": 0.0, "learning_rate": 7.956888682144403e-07, "loss": 0.0001, "num_tokens": 87901126.0, "reward": 2.113648146390915, "reward_std": 0.08789801027160138, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05682408035500002, "rewards/waypoint_pred_accuracy/std": 0.04394900894840466, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.375, "completions/max_terminated_length": 297.375, "completions/mean_length": 170.03125, "completions/mean_terminated_length": 170.03125, "completions/min_length": 107.25, "completions/min_terminated_length": 107.25, "epoch": 0.3894736842105263, "grad_norm": 0.00041478071943856776, "learning_rate": 7.929084410807964e-07, "loss": -0.0, "num_tokens": 88348630.0, "reward": 1.8102026730775833, "reward_std": 0.08258083421748097, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.032054444891858935, "rewards/waypoint_pred_accuracy/std": 0.025665418093367975, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.125, "completions/max_terminated_length": 285.125, "completions/mean_length": 167.068359375, "completions/mean_terminated_length": 167.068359375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.391578947368421, "grad_norm": 0.010830595158040524, "learning_rate": 7.90114865191855e-07, "loss": -0.0002, "num_tokens": 88793017.0, "reward": 1.899033010005951, "reward_std": 0.05657581372270215, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.012016504072278167, "rewards/waypoint_pred_accuracy/std": 0.02828790664943881, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.625, "completions/max_terminated_length": 270.625, "completions/mean_length": 170.416015625, "completions/mean_terminated_length": 170.416015625, "completions/min_length": 110.5, "completions/min_terminated_length": 110.5, "epoch": 0.3936842105263158, "grad_norm": 0.02063567005097866, "learning_rate": 7.873082917652743e-07, "loss": -0.0001, "num_tokens": 89241230.0, "reward": 1.6706158965826035, "reward_std": 0.083322549238801, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.023784521038484087, "rewards/waypoint_pred_accuracy/std": 0.03384877370171229, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.25, "completions/max_terminated_length": 305.25, "completions/mean_length": 181.23046875, "completions/mean_terminated_length": 181.23046875, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.3957894736842105, "grad_norm": 0.0, "learning_rate": 7.844888727222768e-07, "loss": 0.0004, "num_tokens": 89692484.0, "reward": 1.630469560623169, "reward_std": 0.031483914237469435, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0037113359924784914, "rewards/waypoint_pred_accuracy/std": 0.00797604240300253, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.875, "completions/max_terminated_length": 331.875, "completions/mean_length": 173.87109375, "completions/mean_terminated_length": 173.87109375, "completions/min_length": 113.75, "completions/min_terminated_length": 113.75, "epoch": 0.39789473684210525, "grad_norm": 0.042730070650577545, "learning_rate": 7.816567606794239e-07, "loss": -0.0002, "num_tokens": 90142082.0, "reward": 1.7876133099198341, "reward_std": 0.06541969033423811, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0197832117555663, "rewards/waypoint_pred_accuracy/std": 0.02541783277411014, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.125, "completions/max_terminated_length": 302.125, "completions/mean_length": 182.103515625, "completions/mean_terminated_length": 182.103515625, "completions/min_length": 115.25, "completions/min_terminated_length": 115.25, "epoch": 0.4, "grad_norm": 0.018826643005013466, "learning_rate": 7.788121089403557e-07, "loss": 0.0001, "num_tokens": 90596087.0, "reward": 1.805692046880722, "reward_std": 0.06880141280907992, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.027846033899431428, "rewards/waypoint_pred_accuracy/std": 0.03440070046775373, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.875, "completions/max_terminated_length": 267.875, "completions/mean_length": 175.751953125, "completions/mean_terminated_length": 175.751953125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.40210526315789474, "grad_norm": 0.013081498444080353, "learning_rate": 7.759550714874924e-07, "loss": -0.0004, "num_tokens": 91046072.0, "reward": 2.0215499103069305, "reward_std": 0.13955920189619064, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.07522808883824439, "rewards/waypoint_pred_accuracy/std": 0.05415468077226393, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.5, "completions/max_terminated_length": 278.5, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 114.75, "completions/min_terminated_length": 114.75, "epoch": 0.40421052631578946, "grad_norm": 0.0, "learning_rate": 7.730858029736989e-07, "loss": 0.0, "num_tokens": 91491928.0, "reward": 1.75, "reward_std": 0.0, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 1.474326771777612e-10, "rewards/waypoint_pred_accuracy/std": 1.1786529530155576e-09, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.875, "completions/max_terminated_length": 306.875, "completions/mean_length": 177.765625, "completions/mean_terminated_length": 177.765625, "completions/min_length": 114.875, "completions/min_terminated_length": 114.875, "epoch": 0.4063157894736842, "grad_norm": 0.0, "learning_rate": 7.702044587139137e-07, "loss": 0.0002, "num_tokens": 91941856.0, "reward": 1.7487657219171524, "reward_std": 0.01981517393141985, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0003594244599299801, "rewards/waypoint_pred_accuracy/std": 0.002140052256436602, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 303.5, "completions/max_terminated_length": 303.5, "completions/mean_length": 171.814453125, "completions/mean_terminated_length": 172.19029235839844, "completions/min_length": 92.375, "completions/min_terminated_length": 106.75, "epoch": 0.40842105263157896, "grad_norm": 0.016274407505989075, "learning_rate": 7.673111946767413e-07, "loss": -0.0001, "num_tokens": 92389185.0, "reward": 2.0959380865097046, "reward_std": 0.12629193731117994, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.11046904484828701, "rewards/waypoint_pred_accuracy/std": 0.06314596923766658, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.125, "completions/max_terminated_length": 290.125, "completions/mean_length": 180.162109375, "completions/mean_terminated_length": 180.162109375, "completions/min_length": 107.5, "completions/min_terminated_length": 107.5, "epoch": 0.4105263157894737, "grad_norm": 0.020258145406842232, "learning_rate": 7.644061674760101e-07, "loss": 0.0001, "num_tokens": 92840532.0, "reward": 2.294894278049469, "reward_std": 0.23648597935971338, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.998046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.1494002838226665, "rewards/waypoint_pred_accuracy/std": 0.11227664479537411, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.75, "completions/max_terminated_length": 299.75, "completions/mean_length": 169.263671875, "completions/mean_terminated_length": 169.263671875, "completions/min_length": 113.25, "completions/min_terminated_length": 113.25, "epoch": 0.4126315789473684, "grad_norm": 0.04483957961201668, "learning_rate": 7.61489534362294e-07, "loss": -0.0002, "num_tokens": 93288091.0, "reward": 1.5267165899276733, "reward_std": 0.05776224182045553, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.501953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.012381742581055732, "rewards/waypoint_pred_accuracy/std": 0.02106862150685629, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.125, "completions/max_terminated_length": 264.125, "completions/mean_length": 165.439453125, "completions/mean_terminated_length": 165.439453125, "completions/min_length": 107.625, "completions/min_terminated_length": 107.625, "epoch": 0.4147368421052632, "grad_norm": 0.0, "learning_rate": 7.585614532144007e-07, "loss": 0.0002, "num_tokens": 93733372.0, "reward": 1.80105559527874, "reward_std": 0.17875106693827547, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.08705123437539442, "rewards/waypoint_pred_accuracy/std": 0.08156303651776398, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.5, "completions/max_terminated_length": 328.5, "completions/mean_length": 170.4609375, "completions/mean_terminated_length": 170.4609375, "completions/min_length": 103.125, "completions/min_terminated_length": 103.125, "epoch": 0.4168421052631579, "grad_norm": 0.009551014751195908, "learning_rate": 7.556220825308261e-07, "loss": 0.0002, "num_tokens": 94180968.0, "reward": 1.8914762139320374, "reward_std": 0.05954795209981967, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.009214683991414209, "rewards/waypoint_pred_accuracy/std": 0.021961479235898466, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.125, "completions/max_terminated_length": 315.125, "completions/mean_length": 176.51171875, "completions/mean_terminated_length": 176.51171875, "completions/min_length": 119.25, "completions/min_terminated_length": 119.25, "epoch": 0.4189473684210526, "grad_norm": 0.0, "learning_rate": 7.526715814211739e-07, "loss": 0.0001, "num_tokens": 94629294.0, "reward": 2.0064243376255035, "reward_std": 0.03297104453667998, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.004188739636447281, "rewards/waypoint_pred_accuracy/std": 0.008869364886777475, "step": 199 }, { "epoch": 0.42105263157894735, "grad_norm": 0.0, "learning_rate": 7.49710109597544e-07, "loss": 0.0002, "step": 200 }, { "epoch": 0.42105263157894735, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 290.91, "eval_completions/max_terminated_length": 290.91, "eval_completions/mean_length": 172.36274307250977, "eval_completions/mean_terminated_length": 172.36274307250977, "eval_completions/min_length": 109.78, "eval_completions/min_terminated_length": 109.78, "eval_loss": -4.892464494332671e-05, "eval_num_tokens": 95080220.0, "eval_reward": 1.8596287977695465, "eval_reward_std": 0.08604613540126138, "eval_rewards/format_reward_embodied/mean": 0.9996875, "eval_rewards/format_reward_embodied/std": 0.0025, "eval_rewards/stop_prediction_reward/mean": 0.7696875, "eval_rewards/stop_prediction_reward/std": 0.005, "eval_rewards/waypoint_pred_accuracy/mean": 0.045126906880960875, "eval_rewards/waypoint_pred_accuracy/std": 0.039966236149646756, "eval_runtime": 941.9095, "eval_samples_per_second": 0.106, "eval_steps_per_second": 0.002, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.9375, "completions/max_terminated_length": 292.9375, "completions/mean_length": 174.2548828125, "completions/mean_terminated_length": 174.2548828125, "completions/min_length": 110.5625, "completions/min_terminated_length": 110.5625, "epoch": 0.4231578947368421, "grad_norm": 0.0, "learning_rate": 7.467378273658856e-07, "loss": 0.0005, "num_tokens": 95528819.0, "reward": 1.9443908333778381, "reward_std": 0.04222029652737547, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.8740234375, "rewards/stop_prediction_reward/std": 0.0078125, "rewards/waypoint_pred_accuracy/mean": 0.03518370707206486, "rewards/waypoint_pred_accuracy/std": 0.017203900250024166, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.5, "completions/max_terminated_length": 305.5, "completions/mean_length": 170.724609375, "completions/mean_terminated_length": 170.724609375, "completions/min_length": 112.375, "completions/min_terminated_length": 112.375, "epoch": 0.42526315789473684, "grad_norm": 0.012682443484663963, "learning_rate": 7.437548956173213e-07, "loss": -0.0004, "num_tokens": 95979302.0, "reward": 1.9156423211097717, "reward_std": 0.1263027695240453, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.751953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0818446125079697, "rewards/waypoint_pred_accuracy/std": 0.055338893387193444, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 293.875, "completions/max_terminated_length": 293.875, "completions/mean_length": 176.544921875, "completions/mean_terminated_length": 176.8624153137207, "completions/min_length": 101.625, "completions/min_terminated_length": 111.625, "epoch": 0.42736842105263156, "grad_norm": 0.0, "learning_rate": 7.407614758194373e-07, "loss": -0.0006, "num_tokens": 96431805.0, "reward": 1.707748532295227, "reward_std": 0.10709417768262597, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.04332738941334985, "rewards/waypoint_pred_accuracy/std": 0.03793920004429459, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.5, "completions/max_terminated_length": 321.5, "completions/mean_length": 176.828125, "completions/mean_terminated_length": 176.828125, "completions/min_length": 115.625, "completions/min_terminated_length": 115.625, "epoch": 0.42947368421052634, "grad_norm": 0.0, "learning_rate": 7.377577300075431e-07, "loss": 0.0, "num_tokens": 96881189.0, "reward": 1.760355144739151, "reward_std": 0.02672452749078502, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.005177572603687428, "rewards/waypoint_pred_accuracy/std": 0.013362265083738705, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.75, "completions/max_terminated_length": 304.75, "completions/mean_length": 175.419921875, "completions/mean_terminated_length": 175.419921875, "completions/min_length": 117.75, "completions/min_terminated_length": 117.75, "epoch": 0.43157894736842106, "grad_norm": 0.015611842274665833, "learning_rate": 7.347438207759002e-07, "loss": -0.0002, "num_tokens": 97333116.0, "reward": 1.9850184619426727, "reward_std": 0.06815559589631448, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.055009242740135744, "rewards/waypoint_pred_accuracy/std": 0.0340777950465283, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.875, "completions/max_terminated_length": 306.875, "completions/mean_length": 174.68359375, "completions/mean_terminated_length": 174.68359375, "completions/min_length": 107.125, "completions/min_terminated_length": 107.125, "epoch": 0.4336842105263158, "grad_norm": 0.0, "learning_rate": 7.317199112689219e-07, "loss": -0.0003, "num_tokens": 97780314.0, "reward": 1.6253042817115784, "reward_std": 0.0009640372365589123, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0001521414744787989, "rewards/waypoint_pred_accuracy/std": 0.00048202241833421994, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.625, "completions/max_terminated_length": 294.625, "completions/mean_length": 178.39453125, "completions/mean_terminated_length": 178.39453125, "completions/min_length": 117.125, "completions/min_terminated_length": 117.125, "epoch": 0.4357894736842105, "grad_norm": 0.0, "learning_rate": 7.286861651723403e-07, "loss": 0.0, "num_tokens": 98230564.0, "reward": 1.8962776064872742, "reward_std": 0.06431814459938323, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07313878076765164, "rewards/waypoint_pred_accuracy/std": 0.03215907396928667, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.125, "completions/max_terminated_length": 297.125, "completions/mean_length": 170.755859375, "completions/mean_terminated_length": 170.755859375, "completions/min_length": 108.125, "completions/min_terminated_length": 108.125, "epoch": 0.4378947368421053, "grad_norm": 0.02003273367881775, "learning_rate": 7.256427467043479e-07, "loss": 0.0001, "num_tokens": 98680935.0, "reward": 1.7773285955190659, "reward_std": 0.045251342578694675, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.013664303890479346, "rewards/waypoint_pred_accuracy/std": 0.022625670864954373, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.625, "completions/max_terminated_length": 307.625, "completions/mean_length": 175.458984375, "completions/mean_terminated_length": 175.458984375, "completions/min_length": 109.75, "completions/min_terminated_length": 109.75, "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 7.225898206067071e-07, "loss": 0.0, "num_tokens": 99131986.0, "reward": 2.000874102115631, "reward_std": 0.07641031977254897, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06293705673306249, "rewards/waypoint_pred_accuracy/std": 0.038205162913072854, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.25, "completions/max_terminated_length": 295.25, "completions/mean_length": 177.837890625, "completions/mean_terminated_length": 177.837890625, "completions/min_length": 118.75, "completions/min_terminated_length": 118.75, "epoch": 0.4421052631578947, "grad_norm": 0.01405387930572033, "learning_rate": 7.195275521358332e-07, "loss": -0.0003, "num_tokens": 99580031.0, "reward": 2.028854936361313, "reward_std": 0.11646178726022072, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07692747169583214, "rewards/waypoint_pred_accuracy/std": 0.05823090146415666, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.5, "completions/max_terminated_length": 307.5, "completions/mean_length": 177.76171875, "completions/mean_terminated_length": 177.76171875, "completions/min_length": 115.5, "completions/min_terminated_length": 115.5, "epoch": 0.4442105263157895, "grad_norm": 0.0, "learning_rate": 7.164561070538488e-07, "loss": 0.0003, "num_tokens": 100033989.0, "reward": 1.7999018132686615, "reward_std": 0.21613861247897148, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.14995091408491135, "rewards/waypoint_pred_accuracy/std": 0.10806930996477604, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.875, "completions/max_terminated_length": 431.875, "completions/mean_length": 174.076171875, "completions/mean_terminated_length": 174.076171875, "completions/min_length": 112.375, "completions/min_terminated_length": 112.375, "epoch": 0.4463157894736842, "grad_norm": 0.04999241605401039, "learning_rate": 7.133756516196107e-07, "loss": -0.0003, "num_tokens": 100487340.0, "reward": 1.7891514897346497, "reward_std": 0.0568979331983428, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.01957578670394547, "rewards/waypoint_pred_accuracy/std": 0.028448972130328657, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 171.359375, "completions/mean_terminated_length": 171.359375, "completions/min_length": 109.375, "completions/min_terminated_length": 109.375, "epoch": 0.44842105263157894, "grad_norm": 0.0, "learning_rate": 7.102863525797112e-07, "loss": -0.0001, "num_tokens": 100937124.0, "reward": 1.9840654134750366, "reward_std": 0.17244431003928185, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.17953270860016346, "rewards/waypoint_pred_accuracy/std": 0.08622215129435062, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.375, "completions/max_terminated_length": 291.375, "completions/mean_length": 175.158203125, "completions/mean_terminated_length": 175.158203125, "completions/min_length": 113.25, "completions/min_terminated_length": 113.25, "epoch": 0.45052631578947366, "grad_norm": 0.01138628926128149, "learning_rate": 7.071883771594509e-07, "loss": 0.0, "num_tokens": 101387957.0, "reward": 1.8491481095552444, "reward_std": 0.1147658722824616, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.049574058981537675, "rewards/waypoint_pred_accuracy/std": 0.05738293592632728, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.125, "completions/max_terminated_length": 327.125, "completions/mean_length": 187.966796875, "completions/mean_terminated_length": 187.966796875, "completions/min_length": 114.625, "completions/min_terminated_length": 114.625, "epoch": 0.45263157894736844, "grad_norm": 0.011563337408006191, "learning_rate": 7.040818930537874e-07, "loss": -0.0003, "num_tokens": 101845412.0, "reward": 1.6611975878477097, "reward_std": 0.0494131935941482, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.018098798598909838, "rewards/waypoint_pred_accuracy/std": 0.02470659996008351, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.25, "completions/max_terminated_length": 287.25, "completions/mean_length": 171.841796875, "completions/mean_terminated_length": 171.841796875, "completions/min_length": 107.125, "completions/min_terminated_length": 107.125, "epoch": 0.45473684210526316, "grad_norm": 2.7086246063845465e-06, "learning_rate": 7.009670684182576e-07, "loss": -0.0, "num_tokens": 102293587.0, "reward": 1.5, "reward_std": 3.251700020356907e-09, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 4.380867890674267e-10, "rewards/waypoint_pred_accuracy/std": 1.9065319947775272e-09, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.75, "completions/max_terminated_length": 286.75, "completions/mean_length": 176.490234375, "completions/mean_terminated_length": 176.490234375, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "epoch": 0.4568421052631579, "grad_norm": 0.0, "learning_rate": 6.978440718598756e-07, "loss": 0.0002, "num_tokens": 102744270.0, "reward": 1.7497325837612152, "reward_std": 0.06904890944133513, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06236628795954857, "rewards/waypoint_pred_accuracy/std": 0.03452445384209568, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 182.732421875, "completions/mean_terminated_length": 182.732421875, "completions/min_length": 111.375, "completions/min_terminated_length": 111.375, "epoch": 0.4589473684210526, "grad_norm": 0.029011964797973633, "learning_rate": 6.947130724280057e-07, "loss": 0.0, "num_tokens": 103198789.0, "reward": 1.7562294006347656, "reward_std": 0.013579967227997258, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0031146957662715976, "rewards/waypoint_pred_accuracy/std": 0.006789985702200646, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.375, "completions/max_terminated_length": 270.375, "completions/mean_length": 172.20703125, "completions/mean_terminated_length": 172.20703125, "completions/min_length": 111.875, "completions/min_terminated_length": 111.875, "epoch": 0.4610526315789474, "grad_norm": 0.013365295715630054, "learning_rate": 6.915742396052115e-07, "loss": -0.0001, "num_tokens": 103649519.0, "reward": 1.5070368647575378, "reward_std": 0.023722524622826313, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.501953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.002541873606073473, "rewards/waypoint_pred_accuracy/std": 0.004048763120422751, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.375, "completions/max_terminated_length": 291.375, "completions/mean_length": 176.5546875, "completions/mean_terminated_length": 176.5546875, "completions/min_length": 110.875, "completions/min_terminated_length": 110.875, "epoch": 0.4631578947368421, "grad_norm": 0.0, "learning_rate": 6.884277432980825e-07, "loss": 0.0001, "num_tokens": 104099915.0, "reward": 1.8983599245548248, "reward_std": 0.022330745094222948, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.011679970655677607, "rewards/waypoint_pred_accuracy/std": 0.01116537469351897, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.875, "completions/max_terminated_length": 272.875, "completions/mean_length": 176.126953125, "completions/mean_terminated_length": 176.126953125, "completions/min_length": 118.875, "completions/min_terminated_length": 118.875, "epoch": 0.4652631578947368, "grad_norm": 0.017878547310829163, "learning_rate": 6.852737538280359e-07, "loss": -0.0001, "num_tokens": 104550732.0, "reward": 1.7993512451648712, "reward_std": 0.045344060357820126, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.024675624659721507, "rewards/waypoint_pred_accuracy/std": 0.022672027718726895, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.5, "completions/max_terminated_length": 305.5, "completions/mean_length": 181.193359375, "completions/mean_terminated_length": 181.193359375, "completions/min_length": 113.875, "completions/min_terminated_length": 113.875, "epoch": 0.4673684210526316, "grad_norm": 0.01165629643946886, "learning_rate": 6.821124419220978e-07, "loss": 0.0002, "num_tokens": 105002991.0, "reward": 1.8998810648918152, "reward_std": 0.040346091078029334, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.012440536318531065, "rewards/waypoint_pred_accuracy/std": 0.020173047916574705, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.375, "completions/max_terminated_length": 288.375, "completions/mean_length": 175.021484375, "completions/mean_terminated_length": 175.021484375, "completions/min_length": 109.75, "completions/min_terminated_length": 109.75, "epoch": 0.4694736842105263, "grad_norm": 0.0, "learning_rate": 6.789439787036614e-07, "loss": -0.0001, "num_tokens": 105453626.0, "reward": 1.9986501336097717, "reward_std": 0.019883400294929743, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.998046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0003016188566107303, "rewards/waypoint_pred_accuracy/std": 0.0021291994489729404, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.375, "completions/max_terminated_length": 309.375, "completions/mean_length": 179.6640625, "completions/mean_terminated_length": 179.6640625, "completions/min_length": 112.75, "completions/min_terminated_length": 112.75, "epoch": 0.47157894736842104, "grad_norm": 0.015561181120574474, "learning_rate": 6.757685356832242e-07, "loss": -0.0, "num_tokens": 105904398.0, "reward": 2.101376533508301, "reward_std": 0.06816481053829193, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05068826675858098, "rewards/waypoint_pred_accuracy/std": 0.0340824015760622, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 175.58203125, "completions/mean_terminated_length": 175.58203125, "completions/min_length": 109.625, "completions/min_terminated_length": 109.625, "epoch": 0.47368421052631576, "grad_norm": 0.013554830104112625, "learning_rate": 6.725862847491034e-07, "loss": 0.0002, "num_tokens": 106353592.0, "reward": 1.7516418248414993, "reward_std": 0.0034361608559265733, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0008209160487240297, "rewards/waypoint_pred_accuracy/std": 0.0017180802678922191, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.5, "completions/max_terminated_length": 306.5, "completions/mean_length": 183.095703125, "completions/mean_terminated_length": 183.095703125, "completions/min_length": 108.125, "completions/min_terminated_length": 108.125, "epoch": 0.47578947368421054, "grad_norm": 0.00029330080724321306, "learning_rate": 6.693973981581324e-07, "loss": 0.0, "num_tokens": 106808553.0, "reward": 1.6250907480716705, "reward_std": 0.00024353076181782285, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 4.53742529797907e-05, "rewards/waypoint_pred_accuracy/std": 0.00012176844195366245, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.5, "completions/max_terminated_length": 292.5, "completions/mean_length": 181.337890625, "completions/mean_terminated_length": 181.337890625, "completions/min_length": 119.625, "completions/min_terminated_length": 119.625, "epoch": 0.47789473684210526, "grad_norm": 0.0, "learning_rate": 6.662020485263358e-07, "loss": -0.0001, "num_tokens": 107258774.0, "reward": 1.625608280301094, "reward_std": 0.0014176478143781424, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.00030413969053576185, "rewards/waypoint_pred_accuracy/std": 0.00070882499138026, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.25, "completions/max_terminated_length": 291.25, "completions/mean_length": 173.318359375, "completions/mean_terminated_length": 173.318359375, "completions/min_length": 107.875, "completions/min_terminated_length": 107.875, "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 6.630004088195858e-07, "loss": 0.0002, "num_tokens": 107708793.0, "reward": 1.876689851284027, "reward_std": 0.0026773642748594284, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0008449186572583362, "rewards/waypoint_pred_accuracy/std": 0.0013386833028957005, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.5, "completions/max_terminated_length": 308.5, "completions/mean_length": 176.880859375, "completions/mean_terminated_length": 176.880859375, "completions/min_length": 117.25, "completions/min_terminated_length": 117.25, "epoch": 0.48210526315789476, "grad_norm": 0.0, "learning_rate": 6.597926523442398e-07, "loss": 0.0, "num_tokens": 108161148.0, "reward": 1.509762555360794, "reward_std": 0.021483093870038772, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0048812878156252685, "rewards/waypoint_pred_accuracy/std": 0.01074154757443697, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.125, "completions/max_terminated_length": 312.125, "completions/mean_length": 173.00390625, "completions/mean_terminated_length": 173.00390625, "completions/min_length": 110.25, "completions/min_terminated_length": 110.25, "epoch": 0.4842105263157895, "grad_norm": 0.013146106153726578, "learning_rate": 6.565789527377587e-07, "loss": -0.0005, "num_tokens": 108611454.0, "reward": 1.7406707108020782, "reward_std": 0.09716045763343573, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05783534119836986, "rewards/waypoint_pred_accuracy/std": 0.048580223228782415, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.625, "completions/max_terminated_length": 286.625, "completions/mean_length": 172.7578125, "completions/mean_terminated_length": 172.7578125, "completions/min_length": 111.75, "completions/min_terminated_length": 111.75, "epoch": 0.4863157894736842, "grad_norm": 0.00952562689781189, "learning_rate": 6.533594839593081e-07, "loss": 0.0001, "num_tokens": 109059522.0, "reward": 1.7838719189167023, "reward_std": 0.09348210319876671, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07943596761145057, "rewards/waypoint_pred_accuracy/std": 0.04674104697497585, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.25, "completions/max_terminated_length": 293.25, "completions/mean_length": 175.3828125, "completions/mean_terminated_length": 175.3828125, "completions/min_length": 114.75, "completions/min_terminated_length": 114.75, "epoch": 0.4884210526315789, "grad_norm": 0.0, "learning_rate": 6.501344202803414e-07, "loss": 0.0001, "num_tokens": 109511942.0, "reward": 1.5068429559469223, "reward_std": 0.023580931854667142, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.003421477313622745, "rewards/waypoint_pred_accuracy/std": 0.011790467111495673, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 178.611328125, "completions/mean_terminated_length": 178.9438877105713, "completions/min_length": 97.375, "completions/min_terminated_length": 110.125, "epoch": 0.4905263157894737, "grad_norm": 0.010548449121415615, "learning_rate": 6.469039362751677e-07, "loss": -0.0002, "num_tokens": 109963455.0, "reward": 1.7827188670635223, "reward_std": 0.06010658299783245, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.018312573189177783, "rewards/waypoint_pred_accuracy/std": 0.024115337153489236, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 301.125, "completions/max_terminated_length": 301.125, "completions/mean_length": 179.953125, "completions/mean_terminated_length": 180.33076095581055, "completions/min_length": 101.375, "completions/min_terminated_length": 115.75, "epoch": 0.4926315789473684, "grad_norm": 0.0, "learning_rate": 6.436682068115002e-07, "loss": -0.0007, "num_tokens": 110416615.0, "reward": 1.99609375, "reward_std": 0.03125, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.998046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 2.0173116242793235e-14, "rewards/waypoint_pred_accuracy/std": 1.1195781985957193e-13, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 173.30859375, "completions/mean_terminated_length": 173.30859375, "completions/min_length": 112.75, "completions/min_terminated_length": 112.75, "epoch": 0.49473684210526314, "grad_norm": 0.023477498441934586, "learning_rate": 6.404274070409915e-07, "loss": 0.0002, "num_tokens": 110865797.0, "reward": 1.8803559094667435, "reward_std": 0.11074852512797406, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06517796947535714, "rewards/waypoint_pred_accuracy/std": 0.05537425884915592, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.875, "completions/max_terminated_length": 299.875, "completions/mean_length": 176.0390625, "completions/mean_terminated_length": 176.0390625, "completions/min_length": 114.375, "completions/min_terminated_length": 114.375, "epoch": 0.4968421052631579, "grad_norm": 0.008372652344405651, "learning_rate": 6.371817123897528e-07, "loss": 0.0006, "num_tokens": 111319513.0, "reward": 1.8294343054294586, "reward_std": 0.08038223959738389, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.040693737070411296, "rewards/waypoint_pred_accuracy/std": 0.03237862857612228, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.25, "completions/max_terminated_length": 251.25, "completions/mean_length": 166.775390625, "completions/mean_terminated_length": 166.775390625, "completions/min_length": 109.5, "completions/min_terminated_length": 109.5, "epoch": 0.49894736842105264, "grad_norm": 0.0, "learning_rate": 6.339312985488576e-07, "loss": -0.0001, "num_tokens": 111764710.0, "reward": 1.8851255774497986, "reward_std": 0.04906696546822786, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.005062782554890241, "rewards/waypoint_pred_accuracy/std": 0.024533490184718572, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.5, "completions/max_terminated_length": 296.5, "completions/mean_length": 172.90625, "completions/mean_terminated_length": 172.90625, "completions/min_length": 111.125, "completions/min_terminated_length": 111.125, "epoch": 0.5010526315789474, "grad_norm": 0.01637883298099041, "learning_rate": 6.30676341464831e-07, "loss": -0.0003, "num_tokens": 112215734.0, "reward": 1.6509484648704529, "reward_std": 0.06736253991999908, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.012974242886534648, "rewards/waypoint_pred_accuracy/std": 0.01805627301899171, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.625, "completions/max_terminated_length": 289.625, "completions/mean_length": 169.388671875, "completions/mean_terminated_length": 169.388671875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5031578947368421, "grad_norm": 0.0, "learning_rate": 6.274170173301268e-07, "loss": 0.0001, "num_tokens": 112666173.0, "reward": 1.9938509166240692, "reward_std": 0.11555472994223237, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05942547577433288, "rewards/waypoint_pred_accuracy/std": 0.05777736520394683, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 172.37890625, "completions/mean_terminated_length": 172.37890625, "completions/min_length": 108.875, "completions/min_terminated_length": 108.875, "epoch": 0.5052631578947369, "grad_norm": 0.0, "learning_rate": 6.24153502573589e-07, "loss": -0.0001, "num_tokens": 113115903.0, "reward": 1.7577707767486572, "reward_std": 0.016411395743489265, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0038853867445141077, "rewards/waypoint_pred_accuracy/std": 0.008205699268728495, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 175.09375, "completions/mean_terminated_length": 175.09375, "completions/min_length": 116.375, "completions/min_terminated_length": 116.375, "epoch": 0.5073684210526316, "grad_norm": 0.03419042006134987, "learning_rate": 6.208859738509021e-07, "loss": 0.0007, "num_tokens": 113568495.0, "reward": 1.7237209975719452, "reward_std": 0.14016160182654858, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.049360513221472516, "rewards/waypoint_pred_accuracy/std": 0.07008079765364533, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.875, "completions/max_terminated_length": 289.875, "completions/mean_length": 173.462890625, "completions/mean_terminated_length": 173.462890625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5094736842105263, "grad_norm": 0.0, "learning_rate": 6.176146080350286e-07, "loss": 0.0, "num_tokens": 114018972.0, "reward": 1.8751783967018127, "reward_std": 0.00036079369601793587, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 8.91994423000142e-05, "rewards/waypoint_pred_accuracy/std": 0.00018039710994344205, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 173.265625, "completions/mean_terminated_length": 173.265625, "completions/min_length": 114.25, "completions/min_terminated_length": 114.25, "epoch": 0.511578947368421, "grad_norm": 0.013551232405006886, "learning_rate": 6.14339582206635e-07, "loss": 0.0001, "num_tokens": 114468132.0, "reward": 1.9178512692451477, "reward_std": 0.06416846066713333, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.021425632760024103, "rewards/waypoint_pred_accuracy/std": 0.03208423405926866, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.125, "completions/max_terminated_length": 312.125, "completions/mean_length": 175.140625, "completions/mean_terminated_length": 175.140625, "completions/min_length": 110.75, "completions/min_terminated_length": 110.75, "epoch": 0.5136842105263157, "grad_norm": 0.02614566497504711, "learning_rate": 6.110610736445058e-07, "loss": 0.0003, "num_tokens": 114913708.0, "reward": 1.8622263967990875, "reward_std": 0.18642316292971373, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.11861319048330188, "rewards/waypoint_pred_accuracy/std": 0.09321157418889925, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.125, "completions/max_terminated_length": 301.125, "completions/mean_length": 172.37890625, "completions/mean_terminated_length": 172.37890625, "completions/min_length": 110.75, "completions/min_terminated_length": 110.75, "epoch": 0.5157894736842106, "grad_norm": 0.013545077294111252, "learning_rate": 6.077792598159479e-07, "loss": -0.0, "num_tokens": 115361006.0, "reward": 1.9425796866416931, "reward_std": 0.10113994629730882, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.03378988173824164, "rewards/waypoint_pred_accuracy/std": 0.05056997878441809, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.5, "completions/max_terminated_length": 276.5, "completions/mean_length": 173.775390625, "completions/mean_terminated_length": 173.775390625, "completions/min_length": 116.5, "completions/min_terminated_length": 116.5, "epoch": 0.5178947368421053, "grad_norm": 0.02074316143989563, "learning_rate": 6.044943183671836e-07, "loss": 0.0004, "num_tokens": 115809723.0, "reward": 1.7704734951257706, "reward_std": 0.09013272261904604, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.07371331164245536, "rewards/waypoint_pred_accuracy/std": 0.03725385823372892, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.375, "completions/max_terminated_length": 258.375, "completions/mean_length": 170.69140625, "completions/mean_terminated_length": 170.69140625, "completions/min_length": 119.125, "completions/min_terminated_length": 119.125, "epoch": 0.52, "grad_norm": 0.021677250042557716, "learning_rate": 6.01206427113735e-07, "loss": -0.0001, "num_tokens": 116257053.0, "reward": 1.8927133083343506, "reward_std": 0.0853966644051809, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.07135666430194423, "rewards/waypoint_pred_accuracy/std": 0.04269833582034271, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.125, "completions/max_terminated_length": 272.125, "completions/mean_length": 169.240234375, "completions/mean_terminated_length": 169.240234375, "completions/min_length": 112.375, "completions/min_terminated_length": 112.375, "epoch": 0.5221052631578947, "grad_norm": 0.000306050234939903, "learning_rate": 5.97915764030799e-07, "loss": -0.0, "num_tokens": 116703576.0, "reward": 1.6304270327091217, "reward_std": 0.011630857972136255, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.002713506846724556, "rewards/waypoint_pred_accuracy/std": 0.005815430337122507, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.375, "completions/max_terminated_length": 282.375, "completions/mean_length": 166.033203125, "completions/mean_terminated_length": 166.033203125, "completions/min_length": 113.625, "completions/min_terminated_length": 113.625, "epoch": 0.5242105263157895, "grad_norm": 0.0, "learning_rate": 5.946225072436121e-07, "loss": 0.0001, "num_tokens": 117151145.0, "reward": 1.8943032920360565, "reward_std": 0.03212926587002585, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.009651642787417297, "rewards/waypoint_pred_accuracy/std": 0.016064628072191757, "step": 249 }, { "epoch": 0.5263157894736842, "grad_norm": 0.015220913104712963, "learning_rate": 5.913268350178101e-07, "loss": 0.0001, "step": 250 }, { "epoch": 0.5263157894736842, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.00015625, "eval_completions/max_length": 278.13, "eval_completions/max_terminated_length": 278.13, "eval_completions/mean_length": 169.34763885498046, "eval_completions/mean_terminated_length": 169.375066986084, "eval_completions/min_length": 110.76, "eval_completions/min_terminated_length": 111.92, "eval_loss": -2.504486656107474e-05, "eval_num_tokens": 117598301.0, "eval_reward": 1.865585025548935, "eval_reward_std": 0.0873453421616474, "eval_rewards/format_reward_embodied/mean": 0.9996875, "eval_rewards/format_reward_embodied/std": 0.0025, "eval_rewards/stop_prediction_reward/mean": 0.76984375, "eval_rewards/stop_prediction_reward/std": 0.008003681004047393, "eval_rewards/waypoint_pred_accuracy/mean": 0.048026895228197336, "eval_rewards/waypoint_pred_accuracy/std": 0.038523975913848735, "eval_runtime": 963.8699, "eval_samples_per_second": 0.104, "eval_steps_per_second": 0.002, "step": 250 } ], "logging_steps": 1, "max_steps": 475, "num_input_tokens_seen": 117598301, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }