diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6074 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5263157894736842, + "eval_steps": 50, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.125, + "completions/max_terminated_length": 451.125, + "completions/mean_length": 241.560546875, + "completions/mean_terminated_length": 241.560546875, + "completions/min_length": 111.75, + "completions/min_terminated_length": 111.75, + "epoch": 0.002105263157894737, + "grad_norm": 0.0383942686021328, + "learning_rate": 0.0, + "loss": -0.0025, + "num_tokens": 484639.0, + "reward": 0.9500823765993118, + "reward_std": 0.6353622525930405, + "rewards/format_reward_embodied/mean": 0.501953125, + "rewards/format_reward_embodied/std": 0.4904305227100849, + "rewards/stop_prediction_reward/mean": 0.439453125, + "rewards/stop_prediction_reward/std": 0.3919360339641571, + "rewards/waypoint_pred_accuracy/mean": 0.004338064874811504, + "rewards/waypoint_pred_accuracy/std": 0.012508220294698669, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 985.25, + "completions/max_terminated_length": 985.25, + "completions/mean_length": 266.818359375, + "completions/mean_terminated_length": 267.813928604126, + "completions/min_length": 94.0, + "completions/min_terminated_length": 119.375, + "epoch": 0.004210526315789474, + "grad_norm": 0.041216954588890076, + "learning_rate": 2.083333333333333e-08, + "loss": 0.0025, + "num_tokens": 982274.0, + "reward": 0.9463644102215767, + "reward_std": 0.6123590245842934, + "rewards/format_reward_embodied/mean": 0.50390625, + "rewards/format_reward_embodied/std": 0.49139947816729546, + "rewards/stop_prediction_reward/mean": 0.42578125, + "rewards/stop_prediction_reward/std": 0.3781757093966007, + "rewards/waypoint_pred_accuracy/mean": 0.008338454590998856, + "rewards/waypoint_pred_accuracy/std": 0.02023129865005227, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 247.1015625, + "completions/mean_terminated_length": 247.1015625, + "completions/min_length": 117.875, + "completions/min_terminated_length": 117.875, + "epoch": 0.00631578947368421, + "grad_norm": 0.035625893622636795, + "learning_rate": 4.166666666666666e-08, + "loss": -0.001, + "num_tokens": 1467318.0, + "reward": 0.9623514339327812, + "reward_std": 0.6189808771014214, + "rewards/format_reward_embodied/mean": 0.498046875, + "rewards/format_reward_embodied/std": 0.4842093959450722, + "rewards/stop_prediction_reward/mean": 0.443359375, + "rewards/stop_prediction_reward/std": 0.44101808220148087, + "rewards/waypoint_pred_accuracy/mean": 0.010472597823421942, + "rewards/waypoint_pred_accuracy/std": 0.03422736286900763, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.5, + "completions/max_terminated_length": 509.5, + "completions/mean_length": 237.654296875, + "completions/mean_terminated_length": 237.654296875, + "completions/min_length": 113.375, + "completions/min_terminated_length": 113.375, + "epoch": 0.008421052631578947, + "grad_norm": 0.03751353174448013, + "learning_rate": 6.25e-08, + "loss": -0.0035, + "num_tokens": 1950021.0, + "reward": 0.9695519432425499, + "reward_std": 0.6773256361484528, + "rewards/format_reward_embodied/mean": 0.4765625, + "rewards/format_reward_embodied/std": 0.4892418272793293, + "rewards/stop_prediction_reward/mean": 0.421875, + "rewards/stop_prediction_reward/std": 0.3986336216330528, + "rewards/waypoint_pred_accuracy/mean": 0.03555722400778907, + "rewards/waypoint_pred_accuracy/std": 0.08721220167353771, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.375, + "completions/max_terminated_length": 813.375, + "completions/mean_length": 265.58984375, + "completions/mean_terminated_length": 265.58984375, + "completions/min_length": 116.625, + "completions/min_terminated_length": 116.625, + "epoch": 0.010526315789473684, + "grad_norm": 0.0453697107732296, + "learning_rate": 8.333333333333333e-08, + "loss": 0.0003, + "num_tokens": 2446643.0, + "reward": 0.9188483878970146, + "reward_std": 0.610739640891552, + "rewards/format_reward_embodied/mean": 0.5625, + "rewards/format_reward_embodied/std": 0.4938563257455826, + "rewards/stop_prediction_reward/mean": 0.345703125, + "rewards/stop_prediction_reward/std": 0.4086693823337555, + "rewards/waypoint_pred_accuracy/mean": 0.005322630658819445, + "rewards/waypoint_pred_accuracy/std": 0.016403043182279513, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 812.0, + "completions/max_terminated_length": 812.0, + "completions/mean_length": 252.169921875, + "completions/mean_terminated_length": 252.66043663024902, + "completions/min_length": 102.375, + "completions/min_terminated_length": 118.25, + "epoch": 0.01263157894736842, + "grad_norm": 0.049299150705337524, + "learning_rate": 1.0416666666666667e-07, + "loss": 0.0011, + "num_tokens": 2935818.0, + "reward": 0.877131775021553, + "reward_std": 0.6145607680082321, + "rewards/format_reward_embodied/mean": 0.509765625, + "rewards/format_reward_embodied/std": 0.4843035563826561, + "rewards/stop_prediction_reward/mean": 0.3671875, + "rewards/stop_prediction_reward/std": 0.38137195259332657, + "rewards/waypoint_pred_accuracy/mean": 8.932340400540724e-05, + "rewards/waypoint_pred_accuracy/std": 0.0005101569792639827, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1163.75, + "completions/max_terminated_length": 1163.75, + "completions/mean_length": 255.529296875, + "completions/mean_terminated_length": 255.529296875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.014736842105263158, + "grad_norm": 0.036646511405706406, + "learning_rate": 1.25e-07, + "loss": 0.001, + "num_tokens": 3427097.0, + "reward": 0.6614178493618965, + "reward_std": 0.6105708554387093, + "rewards/format_reward_embodied/mean": 0.46484375, + "rewards/format_reward_embodied/std": 0.48722705617547035, + "rewards/stop_prediction_reward/mean": 0.173828125, + "rewards/stop_prediction_reward/std": 0.3640986457467079, + "rewards/waypoint_pred_accuracy/mean": 0.011372994726074323, + "rewards/waypoint_pred_accuracy/std": 0.04579899070052374, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.25, + "completions/max_terminated_length": 467.25, + "completions/mean_length": 246.98828125, + "completions/mean_terminated_length": 246.98828125, + "completions/min_length": 119.5, + "completions/min_terminated_length": 119.5, + "epoch": 0.016842105263157894, + "grad_norm": 0.047037359327077866, + "learning_rate": 1.4583333333333335e-07, + "loss": -0.0023, + "num_tokens": 3915411.0, + "reward": 0.9769175350666046, + "reward_std": 0.6314118355512619, + "rewards/format_reward_embodied/mean": 0.568359375, + "rewards/format_reward_embodied/std": 0.4869700260460377, + "rewards/stop_prediction_reward/mean": 0.376953125, + "rewards/stop_prediction_reward/std": 0.4093479886651039, + "rewards/waypoint_pred_accuracy/mean": 0.015802525533167768, + "rewards/waypoint_pred_accuracy/std": 0.032816135895782333, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.125, + "completions/max_terminated_length": 481.125, + "completions/mean_length": 241.935546875, + "completions/mean_terminated_length": 241.935546875, + "completions/min_length": 114.5, + "completions/min_terminated_length": 114.5, + "epoch": 0.018947368421052633, + "grad_norm": 0.04171831160783768, + "learning_rate": 1.6666666666666665e-07, + "loss": -0.0015, + "num_tokens": 4397042.0, + "reward": 0.9027341902256012, + "reward_std": 0.6853612437844276, + "rewards/format_reward_embodied/mean": 0.48828125, + "rewards/format_reward_embodied/std": 0.4951612576842308, + "rewards/stop_prediction_reward/mean": 0.322265625, + "rewards/stop_prediction_reward/std": 0.4115743637084961, + "rewards/waypoint_pred_accuracy/mean": 0.04609366483055356, + "rewards/waypoint_pred_accuracy/std": 0.09995413944127579, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 695.625, + "completions/max_terminated_length": 695.625, + "completions/mean_length": 261.609375, + "completions/mean_terminated_length": 261.609375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.021052631578947368, + "grad_norm": 0.03897935897111893, + "learning_rate": 1.875e-07, + "loss": 0.0002, + "num_tokens": 4893354.0, + "reward": 1.0423217862844467, + "reward_std": 0.6282145008444786, + "rewards/format_reward_embodied/mean": 0.548828125, + "rewards/format_reward_embodied/std": 0.48774589598178864, + "rewards/stop_prediction_reward/mean": 0.478515625, + "rewards/stop_prediction_reward/std": 0.4023555275052786, + "rewards/waypoint_pred_accuracy/mean": 0.007489029231998282, + "rewards/waypoint_pred_accuracy/std": 0.0251983865261218, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.375, + "completions/max_terminated_length": 470.375, + "completions/mean_length": 253.248046875, + "completions/mean_terminated_length": 253.248046875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.023157894736842106, + "grad_norm": 0.039429888129234314, + "learning_rate": 2.0833333333333333e-07, + "loss": -0.0022, + "num_tokens": 5384873.0, + "reward": 0.7660543769598007, + "reward_std": 0.5604145936667919, + "rewards/format_reward_embodied/mean": 0.607421875, + "rewards/format_reward_embodied/std": 0.479397177696228, + "rewards/stop_prediction_reward/mean": 0.15625, + "rewards/stop_prediction_reward/std": 0.3427934180945158, + "rewards/waypoint_pred_accuracy/mean": 0.0011912494257570604, + "rewards/waypoint_pred_accuracy/std": 0.00802605507872417, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 263.59375, + "completions/mean_terminated_length": 264.0594940185547, + "completions/min_length": 99.75, + "completions/min_terminated_length": 114.625, + "epoch": 0.02526315789473684, + "grad_norm": 0.030105428770184517, + "learning_rate": 2.2916666666666663e-07, + "loss": 0.0033, + "num_tokens": 5878681.0, + "reward": 1.0901148244738579, + "reward_std": 0.6814222931861877, + "rewards/format_reward_embodied/mean": 0.6171875, + "rewards/format_reward_embodied/std": 0.46372338756918907, + "rewards/stop_prediction_reward/mean": 0.400390625, + "rewards/stop_prediction_reward/std": 0.4396743141114712, + "rewards/waypoint_pred_accuracy/mean": 0.036268358699724924, + "rewards/waypoint_pred_accuracy/std": 0.07070713029423034, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.875, + "completions/max_terminated_length": 476.875, + "completions/mean_length": 257.3125, + "completions/mean_terminated_length": 257.3125, + "completions/min_length": 118.625, + "completions/min_terminated_length": 118.625, + "epoch": 0.02736842105263158, + "grad_norm": 0.03954648971557617, + "learning_rate": 2.5e-07, + "loss": -0.0013, + "num_tokens": 6370745.0, + "reward": 1.1011288091540337, + "reward_std": 0.6459922045469284, + "rewards/format_reward_embodied/mean": 0.634765625, + "rewards/format_reward_embodied/std": 0.47677353397011757, + "rewards/stop_prediction_reward/mean": 0.44140625, + "rewards/stop_prediction_reward/std": 0.42084217444062233, + "rewards/waypoint_pred_accuracy/mean": 0.012478479564244083, + "rewards/waypoint_pred_accuracy/std": 0.04567733465950141, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1179.5, + "completions/max_terminated_length": 1179.5, + "completions/mean_length": 272.15625, + "completions/mean_terminated_length": 272.15625, + "completions/min_length": 120.375, + "completions/min_terminated_length": 120.375, + "epoch": 0.029473684210526315, + "grad_norm": 0.03879372030496597, + "learning_rate": 2.708333333333333e-07, + "loss": 0.0013, + "num_tokens": 6872585.0, + "reward": 1.069977581501007, + "reward_std": 0.6529600322246552, + "rewards/format_reward_embodied/mean": 0.626953125, + "rewards/format_reward_embodied/std": 0.46956589445471764, + "rewards/stop_prediction_reward/mean": 0.412109375, + "rewards/stop_prediction_reward/std": 0.42556022480130196, + "rewards/waypoint_pred_accuracy/mean": 0.015457541714965616, + "rewards/waypoint_pred_accuracy/std": 0.05261327166544845, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.75, + "completions/max_terminated_length": 456.75, + "completions/mean_length": 247.5078125, + "completions/mean_terminated_length": 247.5078125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.031578947368421054, + "grad_norm": 0.03746689483523369, + "learning_rate": 2.916666666666667e-07, + "loss": -0.004, + "num_tokens": 7358669.0, + "reward": 0.816399596631527, + "reward_std": 0.6439896002411842, + "rewards/format_reward_embodied/mean": 0.5546875, + "rewards/format_reward_embodied/std": 0.49549105390906334, + "rewards/stop_prediction_reward/mean": 0.216796875, + "rewards/stop_prediction_reward/std": 0.3949273619800806, + "rewards/waypoint_pred_accuracy/mean": 0.022457610069225337, + "rewards/waypoint_pred_accuracy/std": 0.04548973154789149, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.25, + "completions/max_terminated_length": 501.25, + "completions/mean_length": 254.716796875, + "completions/mean_terminated_length": 254.716796875, + "completions/min_length": 120.375, + "completions/min_terminated_length": 120.375, + "epoch": 0.03368421052631579, + "grad_norm": 0.053112324327230453, + "learning_rate": 3.1249999999999997e-07, + "loss": 0.0002, + "num_tokens": 7852156.0, + "reward": 1.2096271365880966, + "reward_std": 0.6100753545761108, + "rewards/format_reward_embodied/mean": 0.76171875, + "rewards/format_reward_embodied/std": 0.4093044362962246, + "rewards/stop_prediction_reward/mean": 0.4453125, + "rewards/stop_prediction_reward/std": 0.4225916638970375, + "rewards/waypoint_pred_accuracy/mean": 0.0012979521083353873, + "rewards/waypoint_pred_accuracy/std": 0.009194809671299708, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.375, + "completions/max_terminated_length": 472.375, + "completions/mean_length": 247.376953125, + "completions/mean_terminated_length": 247.376953125, + "completions/min_length": 116.25, + "completions/min_terminated_length": 116.25, + "epoch": 0.035789473684210524, + "grad_norm": 0.03221385180950165, + "learning_rate": 3.333333333333333e-07, + "loss": -0.0006, + "num_tokens": 8339517.0, + "reward": 1.1803481727838516, + "reward_std": 0.5297410599887371, + "rewards/format_reward_embodied/mean": 0.814453125, + "rewards/format_reward_embodied/std": 0.3874172270298004, + "rewards/stop_prediction_reward/mean": 0.345703125, + "rewards/stop_prediction_reward/std": 0.3674583863466978, + "rewards/waypoint_pred_accuracy/mean": 0.010095963222276419, + "rewards/waypoint_pred_accuracy/std": 0.0323235778856652, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.875, + "completions/max_terminated_length": 811.875, + "completions/mean_length": 254.984375, + "completions/mean_terminated_length": 254.984375, + "completions/min_length": 112.625, + "completions/min_terminated_length": 112.625, + "epoch": 0.037894736842105266, + "grad_norm": 0.034303538501262665, + "learning_rate": 3.541666666666667e-07, + "loss": 0.0065, + "num_tokens": 8828533.0, + "reward": 1.0827482342720032, + "reward_std": 0.49626101925969124, + "rewards/format_reward_embodied/mean": 0.84765625, + "rewards/format_reward_embodied/std": 0.35438157618045807, + "rewards/stop_prediction_reward/mean": 0.234375, + "rewards/stop_prediction_reward/std": 0.360489659011364, + "rewards/waypoint_pred_accuracy/mean": 0.00035850519751079446, + "rewards/waypoint_pred_accuracy/std": 0.000990356254078506, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 469.5, + "completions/max_terminated_length": 469.5, + "completions/mean_length": 242.603515625, + "completions/mean_terminated_length": 243.05292129516602, + "completions/min_length": 99.5, + "completions/min_terminated_length": 107.75, + "epoch": 0.04, + "grad_norm": 0.03210434690117836, + "learning_rate": 3.75e-07, + "loss": 0.0002, + "num_tokens": 9313706.0, + "reward": 1.1656895354390144, + "reward_std": 0.5405256152153015, + "rewards/format_reward_embodied/mean": 0.841796875, + "rewards/format_reward_embodied/std": 0.35758682526648045, + "rewards/stop_prediction_reward/mean": 0.322265625, + "rewards/stop_prediction_reward/std": 0.4072440378367901, + "rewards/waypoint_pred_accuracy/mean": 0.0008135107927961789, + "rewards/waypoint_pred_accuracy/std": 0.004946927132555481, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.5, + "completions/max_terminated_length": 887.5, + "completions/mean_length": 257.341796875, + "completions/mean_terminated_length": 257.341796875, + "completions/min_length": 117.75, + "completions/min_terminated_length": 117.75, + "epoch": 0.042105263157894736, + "grad_norm": 0.023722035810351372, + "learning_rate": 3.958333333333333e-07, + "loss": 0.0063, + "num_tokens": 9807001.0, + "reward": 1.2326279431581497, + "reward_std": 0.5118205770850182, + "rewards/format_reward_embodied/mean": 0.880859375, + "rewards/format_reward_embodied/std": 0.31777896732091904, + "rewards/stop_prediction_reward/mean": 0.29296875, + "rewards/stop_prediction_reward/std": 0.3428589329123497, + "rewards/waypoint_pred_accuracy/mean": 0.029399914224090686, + "rewards/waypoint_pred_accuracy/std": 0.06371456215110564, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1251.25, + "completions/max_terminated_length": 1251.25, + "completions/mean_length": 258.982421875, + "completions/mean_terminated_length": 258.982421875, + "completions/min_length": 107.75, + "completions/min_terminated_length": 107.75, + "epoch": 0.04421052631578947, + "grad_norm": 0.027127819135785103, + "learning_rate": 4.1666666666666667e-07, + "loss": 0.0107, + "num_tokens": 10300304.0, + "reward": 1.3775597661733627, + "reward_std": 0.5649962350726128, + "rewards/format_reward_embodied/mean": 0.880859375, + "rewards/format_reward_embodied/std": 0.3204925637692213, + "rewards/stop_prediction_reward/mean": 0.451171875, + "rewards/stop_prediction_reward/std": 0.4135790057480335, + "rewards/waypoint_pred_accuracy/mean": 0.02276425497597198, + "rewards/waypoint_pred_accuracy/std": 0.053010769921375774, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.5, + "completions/max_terminated_length": 764.5, + "completions/mean_length": 247.626953125, + "completions/mean_terminated_length": 247.626953125, + "completions/min_length": 120.625, + "completions/min_terminated_length": 120.625, + "epoch": 0.04631578947368421, + "grad_norm": 0.026827372610569, + "learning_rate": 4.375e-07, + "loss": 0.0046, + "num_tokens": 10787473.0, + "reward": 1.2097989320755005, + "reward_std": 0.4106667507439852, + "rewards/format_reward_embodied/mean": 0.9609375, + "rewards/format_reward_embodied/std": 0.1782014612108469, + "rewards/stop_prediction_reward/mean": 0.248046875, + "rewards/stop_prediction_reward/std": 0.3576600421220064, + "rewards/waypoint_pred_accuracy/mean": 0.0004072752802812829, + "rewards/waypoint_pred_accuracy/std": 0.001209557721267629, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.125, + "completions/max_terminated_length": 699.125, + "completions/mean_length": 247.458984375, + "completions/mean_terminated_length": 247.458984375, + "completions/min_length": 121.5, + "completions/min_terminated_length": 121.5, + "epoch": 0.04842105263157895, + "grad_norm": 0.024113576859235764, + "learning_rate": 4.5833333333333327e-07, + "loss": 0.0032, + "num_tokens": 11273276.0, + "reward": 1.3881124705076218, + "reward_std": 0.5006838031113148, + "rewards/format_reward_embodied/mean": 0.943359375, + "rewards/format_reward_embodied/std": 0.22258390858769417, + "rewards/stop_prediction_reward/mean": 0.42578125, + "rewards/stop_prediction_reward/std": 0.4117406941950321, + "rewards/waypoint_pred_accuracy/mean": 0.00948592593158537, + "rewards/waypoint_pred_accuracy/std": 0.03859481842846435, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1426.0, + "completions/max_terminated_length": 1426.0, + "completions/mean_length": 270.671875, + "completions/mean_terminated_length": 271.2647590637207, + "completions/min_length": 100.0, + "completions/min_terminated_length": 115.5, + "epoch": 0.05052631578947368, + "grad_norm": 0.021521741524338722, + "learning_rate": 4.791666666666667e-07, + "loss": 0.0138, + "num_tokens": 11772628.0, + "reward": 1.412862166762352, + "reward_std": 0.4836365692317486, + "rewards/format_reward_embodied/mean": 0.94140625, + "rewards/format_reward_embodied/std": 0.22402114421129227, + "rewards/stop_prediction_reward/mean": 0.46875, + "rewards/stop_prediction_reward/std": 0.4050610587000847, + "rewards/waypoint_pred_accuracy/mean": 0.0013529594958612285, + "rewards/waypoint_pred_accuracy/std": 0.008442860726859186, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.375, + "completions/max_terminated_length": 503.375, + "completions/mean_length": 252.7265625, + "completions/mean_terminated_length": 252.7265625, + "completions/min_length": 115.75, + "completions/min_terminated_length": 115.75, + "epoch": 0.05263157894736842, + "grad_norm": 0.023303357884287834, + "learning_rate": 5e-07, + "loss": 0.0022, + "num_tokens": 12260936.0, + "reward": 1.3870358616113663, + "reward_std": 0.4634270928800106, + "rewards/format_reward_embodied/mean": 0.97265625, + "rewards/format_reward_embodied/std": 0.14094455912709236, + "rewards/stop_prediction_reward/mean": 0.404296875, + "rewards/stop_prediction_reward/std": 0.41460882127285004, + "rewards/waypoint_pred_accuracy/mean": 0.005041355174832356, + "rewards/waypoint_pred_accuracy/std": 0.030215839982022222, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.125, + "completions/max_terminated_length": 802.125, + "completions/mean_length": 252.96484375, + "completions/mean_terminated_length": 252.96484375, + "completions/min_length": 117.625, + "completions/min_terminated_length": 117.625, + "epoch": 0.05473684210526316, + "grad_norm": 0.022509992122650146, + "learning_rate": 5.208333333333334e-07, + "loss": 0.0072, + "num_tokens": 12752118.0, + "reward": 1.3777707070112228, + "reward_std": 0.4540855921804905, + "rewards/format_reward_embodied/mean": 0.962890625, + "rewards/format_reward_embodied/std": 0.1879090555012226, + "rewards/stop_prediction_reward/mean": 0.388671875, + "rewards/stop_prediction_reward/std": 0.3826281502842903, + "rewards/waypoint_pred_accuracy/mean": 0.013104110299219339, + "rewards/waypoint_pred_accuracy/std": 0.023219284697190704, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 1812.5, + "completions/max_terminated_length": 1812.5, + "completions/mean_length": 276.763671875, + "completions/mean_terminated_length": 277.41750717163086, + "completions/min_length": 101.75, + "completions/min_terminated_length": 118.5, + "epoch": 0.056842105263157895, + "grad_norm": 0.025979243218898773, + "learning_rate": 5.416666666666666e-07, + "loss": 0.0201, + "num_tokens": 13257853.0, + "reward": 1.195212036371231, + "reward_std": 0.4585261270403862, + "rewards/format_reward_embodied/mean": 0.9453125, + "rewards/format_reward_embodied/std": 0.21002393402159214, + "rewards/stop_prediction_reward/mean": 0.248046875, + "rewards/stop_prediction_reward/std": 0.3809744007885456, + "rewards/waypoint_pred_accuracy/mean": 0.0009263323242719891, + "rewards/waypoint_pred_accuracy/std": 0.0071097194065939074, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1423.875, + "completions/max_terminated_length": 1423.875, + "completions/mean_length": 266.958984375, + "completions/mean_terminated_length": 266.958984375, + "completions/min_length": 121.5, + "completions/min_terminated_length": 121.5, + "epoch": 0.05894736842105263, + "grad_norm": 0.02617989294230938, + "learning_rate": 5.625e-07, + "loss": 0.0123, + "num_tokens": 13756648.0, + "reward": 1.4108145833015442, + "reward_std": 0.4676571935415268, + "rewards/format_reward_embodied/mean": 0.96484375, + "rewards/format_reward_embodied/std": 0.1660303734242916, + "rewards/stop_prediction_reward/mean": 0.427734375, + "rewards/stop_prediction_reward/std": 0.40000360645353794, + "rewards/waypoint_pred_accuracy/mean": 0.009118233890500024, + "rewards/waypoint_pred_accuracy/std": 0.03189600147387662, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 790.375, + "completions/max_terminated_length": 790.375, + "completions/mean_length": 253.384765625, + "completions/mean_terminated_length": 253.384765625, + "completions/min_length": 113.125, + "completions/min_terminated_length": 113.125, + "epoch": 0.061052631578947365, + "grad_norm": 0.0284319706261158, + "learning_rate": 5.833333333333334e-07, + "loss": 0.004, + "num_tokens": 14246829.0, + "reward": 1.3669871091842651, + "reward_std": 0.476172287017107, + "rewards/format_reward_embodied/mean": 0.9609375, + "rewards/format_reward_embodied/std": 0.19018890894949436, + "rewards/stop_prediction_reward/mean": 0.376953125, + "rewards/stop_prediction_reward/std": 0.38546351715922356, + "rewards/waypoint_pred_accuracy/mean": 0.014548240964149528, + "rewards/waypoint_pred_accuracy/std": 0.048405178813538896, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.125, + "completions/max_terminated_length": 946.125, + "completions/mean_length": 267.513671875, + "completions/mean_terminated_length": 267.513671875, + "completions/min_length": 118.25, + "completions/min_terminated_length": 118.25, + "epoch": 0.06315789473684211, + "grad_norm": 0.02758762799203396, + "learning_rate": 6.041666666666666e-07, + "loss": 0.0087, + "num_tokens": 14742708.0, + "reward": 1.3311925828456879, + "reward_std": 0.5349335558712482, + "rewards/format_reward_embodied/mean": 0.94921875, + "rewards/format_reward_embodied/std": 0.21471346728503704, + "rewards/stop_prediction_reward/mean": 0.375, + "rewards/stop_prediction_reward/std": 0.4617812857031822, + "rewards/waypoint_pred_accuracy/mean": 0.003486919093547547, + "rewards/waypoint_pred_accuracy/std": 0.022435040602790033, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.875, + "completions/max_terminated_length": 468.875, + "completions/mean_length": 247.80859375, + "completions/mean_terminated_length": 247.80859375, + "completions/min_length": 116.125, + "completions/min_terminated_length": 116.125, + "epoch": 0.06526315789473684, + "grad_norm": 0.024238400161266327, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0006, + "num_tokens": 15230930.0, + "reward": 1.4348655045032501, + "reward_std": 0.4678279310464859, + "rewards/format_reward_embodied/mean": 0.982421875, + "rewards/format_reward_embodied/std": 0.09929289110004902, + "rewards/stop_prediction_reward/mean": 0.439453125, + "rewards/stop_prediction_reward/std": 0.4299692139029503, + "rewards/waypoint_pred_accuracy/mean": 0.006495264507384302, + "rewards/waypoint_pred_accuracy/std": 0.03605599632994938, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.5, + "completions/max_terminated_length": 701.5, + "completions/mean_length": 247.302734375, + "completions/mean_terminated_length": 247.302734375, + "completions/min_length": 102.625, + "completions/min_terminated_length": 102.625, + "epoch": 0.06736842105263158, + "grad_norm": 0.03117675893008709, + "learning_rate": 6.458333333333333e-07, + "loss": 0.0048, + "num_tokens": 15717165.0, + "reward": 1.276307299733162, + "reward_std": 0.4745783172547817, + "rewards/format_reward_embodied/mean": 0.9765625, + "rewards/format_reward_embodied/std": 0.11361248232424259, + "rewards/stop_prediction_reward/mean": 0.287109375, + "rewards/stop_prediction_reward/std": 0.4378196634352207, + "rewards/waypoint_pred_accuracy/mean": 0.006317712715827028, + "rewards/waypoint_pred_accuracy/std": 0.02758992835879357, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.625, + "completions/max_terminated_length": 736.625, + "completions/mean_length": 259.27734375, + "completions/mean_terminated_length": 259.27734375, + "completions/min_length": 132.125, + "completions/min_terminated_length": 132.125, + "epoch": 0.06947368421052631, + "grad_norm": 0.0244793351739645, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0053, + "num_tokens": 16211835.0, + "reward": 1.540584921836853, + "reward_std": 0.49051226675510406, + "rewards/format_reward_embodied/mean": 0.96484375, + "rewards/format_reward_embodied/std": 0.18235719576478004, + "rewards/stop_prediction_reward/mean": 0.57421875, + "rewards/stop_prediction_reward/std": 0.43413203582167625, + "rewards/waypoint_pred_accuracy/mean": 0.0007612065199357397, + "rewards/waypoint_pred_accuracy/std": 0.005327658200312494, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 881.375, + "completions/max_terminated_length": 881.375, + "completions/mean_length": 264.103515625, + "completions/mean_terminated_length": 264.58401679992676, + "completions/min_length": 101.375, + "completions/min_terminated_length": 113.25, + "epoch": 0.07157894736842105, + "grad_norm": 0.0207452904433012, + "learning_rate": 6.875e-07, + "loss": 0.0068, + "num_tokens": 16706928.0, + "reward": 1.4342780411243439, + "reward_std": 0.5214410163462162, + "rewards/format_reward_embodied/mean": 0.9765625, + "rewards/format_reward_embodied/std": 0.12835253402590752, + "rewards/stop_prediction_reward/mean": 0.453125, + "rewards/stop_prediction_reward/std": 0.47532549500465393, + "rewards/waypoint_pred_accuracy/mean": 0.0022952774760399775, + "rewards/waypoint_pred_accuracy/std": 0.016314118760955294, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.75, + "completions/max_terminated_length": 544.75, + "completions/mean_length": 247.744140625, + "completions/mean_terminated_length": 247.744140625, + "completions/min_length": 115.375, + "completions/min_terminated_length": 115.375, + "epoch": 0.07368421052631578, + "grad_norm": 0.02019825391471386, + "learning_rate": 7.083333333333334e-07, + "loss": 0.0026, + "num_tokens": 17196205.0, + "reward": 1.4062748402357101, + "reward_std": 0.5094473846256733, + "rewards/format_reward_embodied/mean": 0.97265625, + "rewards/format_reward_embodied/std": 0.15960253402590752, + "rewards/stop_prediction_reward/mean": 0.43359375, + "rewards/stop_prediction_reward/std": 0.4651285596191883, + "rewards/waypoint_pred_accuracy/mean": 1.2417779022436235e-05, + "rewards/waypoint_pred_accuracy/std": 2.711040125229227e-05, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1784.875, + "completions/max_terminated_length": 1784.875, + "completions/mean_length": 291.40625, + "completions/mean_terminated_length": 291.40625, + "completions/min_length": 128.75, + "completions/min_terminated_length": 128.75, + "epoch": 0.07578947368421053, + "grad_norm": 0.038221534341573715, + "learning_rate": 7.291666666666666e-07, + "loss": 0.0175, + "num_tokens": 17710461.0, + "reward": 1.4911664873361588, + "reward_std": 0.5095744393765926, + "rewards/format_reward_embodied/mean": 0.978515625, + "rewards/format_reward_embodied/std": 0.11029814556241035, + "rewards/stop_prediction_reward/mean": 0.498046875, + "rewards/stop_prediction_reward/std": 0.46525831148028374, + "rewards/waypoint_pred_accuracy/mean": 0.007301997149683504, + "rewards/waypoint_pred_accuracy/std": 0.024017843105910822, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.75, + "completions/max_terminated_length": 472.75, + "completions/mean_length": 248.03125, + "completions/mean_terminated_length": 248.03125, + "completions/min_length": 117.375, + "completions/min_terminated_length": 117.375, + "epoch": 0.07789473684210527, + "grad_norm": 0.02083674818277359, + "learning_rate": 7.5e-07, + "loss": 0.0023, + "num_tokens": 18197965.0, + "reward": 1.4861425906419754, + "reward_std": 0.5738211683928967, + "rewards/format_reward_embodied/mean": 0.9921875, + "rewards/format_reward_embodied/std": 0.05317101255059242, + "rewards/stop_prediction_reward/mean": 0.42578125, + "rewards/stop_prediction_reward/std": 0.4866231083869934, + "rewards/waypoint_pred_accuracy/mean": 0.03408691443473799, + "rewards/waypoint_pred_accuracy/std": 0.08652095211436972, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.625, + "completions/max_terminated_length": 884.625, + "completions/mean_length": 258.75390625, + "completions/mean_terminated_length": 258.75390625, + "completions/min_length": 125.5, + "completions/min_terminated_length": 125.5, + "epoch": 0.08, + "grad_norm": 0.021931249648332596, + "learning_rate": 7.708333333333333e-07, + "loss": 0.0048, + "num_tokens": 18692239.0, + "reward": 1.6138971894979477, + "reward_std": 0.5961987935006618, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.07509202510118484, + "rewards/stop_prediction_reward/mean": 0.513671875, + "rewards/stop_prediction_reward/std": 0.488056443631649, + "rewards/waypoint_pred_accuracy/mean": 0.05597203067737698, + "rewards/waypoint_pred_accuracy/std": 0.09776196270839843, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.875, + "completions/max_terminated_length": 755.875, + "completions/mean_length": 250.787109375, + "completions/mean_terminated_length": 250.787109375, + "completions/min_length": 119.125, + "completions/min_terminated_length": 119.125, + "epoch": 0.08210526315789474, + "grad_norm": 0.15428805351257324, + "learning_rate": 7.916666666666666e-07, + "loss": 0.0043, + "num_tokens": 19183074.0, + "reward": 1.4990187734365463, + "reward_std": 0.49899255111813545, + "rewards/format_reward_embodied/mean": 0.98046875, + "rewards/format_reward_embodied/std": 0.1064315214753151, + "rewards/stop_prediction_reward/mean": 0.517578125, + "rewards/stop_prediction_reward/std": 0.4714191108942032, + "rewards/waypoint_pred_accuracy/mean": 0.00048596067600919097, + "rewards/waypoint_pred_accuracy/std": 0.0015497554879284957, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 246.05078125, + "completions/mean_terminated_length": 246.05078125, + "completions/min_length": 117.125, + "completions/min_terminated_length": 117.125, + "epoch": 0.08421052631578947, + "grad_norm": 0.033689290285110474, + "learning_rate": 8.125e-07, + "loss": 0.0002, + "num_tokens": 19674556.0, + "reward": 1.52012699842453, + "reward_std": 0.5320924893021584, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.09375, + "rewards/stop_prediction_reward/mean": 0.494140625, + "rewards/stop_prediction_reward/std": 0.4897289089858532, + "rewards/waypoint_pred_accuracy/mean": 0.018852562323445468, + "rewards/waypoint_pred_accuracy/std": 0.04221876614610676, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.625, + "completions/max_terminated_length": 451.625, + "completions/mean_length": 233.865234375, + "completions/mean_terminated_length": 233.865234375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.0863157894736842, + "grad_norm": 0.025640549138188362, + "learning_rate": 8.333333333333333e-07, + "loss": 0.0011, + "num_tokens": 20156151.0, + "reward": 1.591551125049591, + "reward_std": 0.5175898559391499, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.08442101255059242, + "rewards/stop_prediction_reward/mean": 0.5703125, + "rewards/stop_prediction_reward/std": 0.4659374840557575, + "rewards/waypoint_pred_accuracy/mean": 0.016478684779628456, + "rewards/waypoint_pred_accuracy/std": 0.05339623770077609, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 242.478515625, + "completions/mean_terminated_length": 242.478515625, + "completions/min_length": 112.25, + "completions/min_terminated_length": 112.25, + "epoch": 0.08842105263157894, + "grad_norm": 0.026281312108039856, + "learning_rate": 8.541666666666666e-07, + "loss": -0.0, + "num_tokens": 20639788.0, + "reward": 1.5611481666564941, + "reward_std": 0.4784001186490059, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.560546875, + "rewards/stop_prediction_reward/std": 0.4726823903620243, + "rewards/waypoint_pred_accuracy/mean": 0.002253779932873278, + "rewards/waypoint_pred_accuracy/std": 0.015716358982028673, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 810.875, + "completions/max_terminated_length": 810.875, + "completions/mean_length": 249.939453125, + "completions/mean_terminated_length": 249.939453125, + "completions/min_length": 122.875, + "completions/min_terminated_length": 122.875, + "epoch": 0.09052631578947369, + "grad_norm": 0.021544892340898514, + "learning_rate": 8.75e-07, + "loss": 0.0029, + "num_tokens": 21128013.0, + "reward": 1.4784268736839294, + "reward_std": 0.48740382865071297, + "rewards/format_reward_embodied/mean": 0.990234375, + "rewards/format_reward_embodied/std": 0.06879601255059242, + "rewards/stop_prediction_reward/mean": 0.484375, + "rewards/stop_prediction_reward/std": 0.4655333496630192, + "rewards/waypoint_pred_accuracy/mean": 0.0019087545888396566, + "rewards/waypoint_pred_accuracy/std": 0.010897019660660593, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 643.625, + "completions/max_terminated_length": 643.625, + "completions/mean_length": 245.134765625, + "completions/mean_terminated_length": 245.66865158081055, + "completions/min_length": 101.25, + "completions/min_terminated_length": 117.75, + "epoch": 0.09263157894736843, + "grad_norm": 0.025143882259726524, + "learning_rate": 8.958333333333334e-07, + "loss": 0.0038, + "num_tokens": 21614866.0, + "reward": 1.6192794144153595, + "reward_std": 0.4844088666141033, + "rewards/format_reward_embodied/mean": 0.982421875, + "rewards/format_reward_embodied/std": 0.11263803765177727, + "rewards/stop_prediction_reward/mean": 0.63671875, + "rewards/stop_prediction_reward/std": 0.4543136991560459, + "rewards/waypoint_pred_accuracy/mean": 6.939472140954406e-05, + "rewards/waypoint_pred_accuracy/std": 0.00043998260686706566, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.375, + "completions/max_terminated_length": 479.375, + "completions/mean_length": 238.673828125, + "completions/mean_terminated_length": 238.673828125, + "completions/min_length": 115.625, + "completions/min_terminated_length": 115.625, + "epoch": 0.09473684210526316, + "grad_norm": 0.023476244881749153, + "learning_rate": 9.166666666666665e-07, + "loss": 0.0007, + "num_tokens": 22098731.0, + "reward": 1.512737661600113, + "reward_std": 0.5365940853953362, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.026630254462361336, + "rewards/stop_prediction_reward/mean": 0.45703125, + "rewards/stop_prediction_reward/std": 0.48733755201101303, + "rewards/waypoint_pred_accuracy/mean": 0.03078289819380149, + "rewards/waypoint_pred_accuracy/std": 0.043302708805491946, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 641.875, + "completions/max_terminated_length": 641.875, + "completions/mean_length": 240.724609375, + "completions/mean_terminated_length": 240.724609375, + "completions/min_length": 118.625, + "completions/min_terminated_length": 118.625, + "epoch": 0.0968421052631579, + "grad_norm": 0.020364033058285713, + "learning_rate": 9.374999999999999e-07, + "loss": 0.0029, + "num_tokens": 22583070.0, + "reward": 1.6677764654159546, + "reward_std": 0.5154935717582703, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.07350525446236134, + "rewards/stop_prediction_reward/mean": 0.626953125, + "rewards/stop_prediction_reward/std": 0.44941750913858414, + "rewards/waypoint_pred_accuracy/mean": 0.02627104918529134, + "rewards/waypoint_pred_accuracy/std": 0.0397095277659446, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.375, + "completions/max_terminated_length": 657.375, + "completions/mean_length": 239.5546875, + "completions/mean_terminated_length": 239.5546875, + "completions/min_length": 118.875, + "completions/min_terminated_length": 118.875, + "epoch": 0.09894736842105263, + "grad_norm": 0.023410305380821228, + "learning_rate": 9.583333333333334e-07, + "loss": 0.0039, + "num_tokens": 23067066.0, + "reward": 1.7648355215787888, + "reward_std": 0.5304676033556461, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.685546875, + "rewards/stop_prediction_reward/std": 0.45773619785904884, + "rewards/waypoint_pred_accuracy/mean": 0.042574012356195935, + "rewards/waypoint_pred_accuracy/std": 0.07306883804197034, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.625, + "completions/max_terminated_length": 423.625, + "completions/mean_length": 236.12890625, + "completions/mean_terminated_length": 236.12890625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.10105263157894737, + "grad_norm": 0.023688504472374916, + "learning_rate": 9.791666666666667e-07, + "loss": -0.0006, + "num_tokens": 23549436.0, + "reward": 1.70817232131958, + "reward_std": 0.5296522080898285, + "rewards/format_reward_embodied/mean": 0.990234375, + "rewards/format_reward_embodied/std": 0.06879601255059242, + "rewards/stop_prediction_reward/mean": 0.63671875, + "rewards/stop_prediction_reward/std": 0.45957546308636665, + "rewards/waypoint_pred_accuracy/mean": 0.04060960025526583, + "rewards/waypoint_pred_accuracy/std": 0.07721152482554317, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.5, + "completions/max_terminated_length": 439.5, + "completions/mean_length": 239.724609375, + "completions/mean_terminated_length": 239.724609375, + "completions/min_length": 112.625, + "completions/min_terminated_length": 112.625, + "epoch": 0.1031578947368421, + "grad_norm": 0.04075814038515091, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 24036719.0, + "reward": 1.5996688604354858, + "reward_std": 0.43832388520240784, + "rewards/format_reward_embodied/mean": 0.9921875, + "rewards/format_reward_embodied/std": 0.05317101255059242, + "rewards/stop_prediction_reward/mean": 0.607421875, + "rewards/stop_prediction_reward/std": 0.4225967414677143, + "rewards/waypoint_pred_accuracy/mean": 2.974685131453066e-05, + "rewards/waypoint_pred_accuracy/std": 0.00014841147029464143, + "step": 49 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.02866051159799099, + "learning_rate": 9.999878206375666e-07, + "loss": 0.0015, + "step": 50 + }, + { + "epoch": 0.10526315789473684, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.00015625, + "eval_completions/max_length": 683.96, + "eval_completions/max_terminated_length": 683.96, + "eval_completions/mean_length": 238.32913192749024, + "eval_completions/mean_terminated_length": 238.36774322509766, + "eval_completions/min_length": 112.41, + "eval_completions/min_terminated_length": 113.78, + "eval_loss": 0.003631497733294964, + "eval_num_tokens": 24520656.0, + "eval_reward": 1.6925481045246125, + "eval_reward_std": 0.4437328398227692, + "eval_rewards/format_reward_embodied/mean": 0.99328125, + "eval_rewards/format_reward_embodied/std": 0.04765250638127327, + "eval_rewards/stop_prediction_reward/mean": 0.66328125, + "eval_rewards/stop_prediction_reward/std": 0.40405205205082895, + "eval_rewards/waypoint_pred_accuracy/mean": 0.017992802756573712, + "eval_rewards/waypoint_pred_accuracy/std": 0.04001634344926445, + "eval_runtime": 1355.6164, + "eval_samples_per_second": 0.074, + "eval_steps_per_second": 0.001, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.375, + "completions/max_terminated_length": 450.375, + "completions/mean_length": 239.6259765625, + "completions/mean_terminated_length": 239.6259765625, + "completions/min_length": 116.5, + "completions/min_terminated_length": 116.5, + "epoch": 0.10736842105263159, + "grad_norm": 0.02182384580373764, + "learning_rate": 9.999512832095417e-07, + "loss": 0.0005, + "num_tokens": 25006064.0, + "reward": 1.68679628521204, + "reward_std": 0.4470259975641966, + "rewards/format_reward_embodied/mean": 0.9931640625, + "rewards/format_reward_embodied/std": 0.05002300627529621, + "rewards/stop_prediction_reward/mean": 0.65234375, + "rewards/stop_prediction_reward/std": 0.4022445324808359, + "rewards/waypoint_pred_accuracy/mean": 0.02064424328141934, + "rewards/waypoint_pred_accuracy/std": 0.05010251636930185, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 427.5, + "completions/max_terminated_length": 427.5, + "completions/mean_length": 230.73828125, + "completions/mean_terminated_length": 231.21041297912598, + "completions/min_length": 99.875, + "completions/min_terminated_length": 114.375, + "epoch": 0.10947368421052632, + "grad_norm": 0.02381049655377865, + "learning_rate": 9.998903896937148e-07, + "loss": 0.0012, + "num_tokens": 25485546.0, + "reward": 1.7381224930286407, + "reward_std": 0.4321938529610634, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.716796875, + "rewards/stop_prediction_reward/std": 0.3914393372833729, + "rewards/waypoint_pred_accuracy/mean": 0.01261594578698389, + "rewards/waypoint_pred_accuracy/std": 0.03588760504630355, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.5, + "completions/max_terminated_length": 450.5, + "completions/mean_length": 234.287109375, + "completions/mean_terminated_length": 234.287109375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.11157894736842106, + "grad_norm": 0.05357426404953003, + "learning_rate": 9.998051433862818e-07, + "loss": -0.0001, + "num_tokens": 25966589.0, + "reward": 1.6857829988002777, + "reward_std": 0.3670726828277111, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.69140625, + "rewards/stop_prediction_reward/std": 0.3526647798717022, + "rewards/waypoint_pred_accuracy/mean": 0.00011806207834793561, + "rewards/waypoint_pred_accuracy/std": 0.0009230129067357363, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.75, + "completions/max_terminated_length": 413.75, + "completions/mean_length": 226.93359375, + "completions/mean_terminated_length": 226.93359375, + "completions/min_length": 108.125, + "completions/min_terminated_length": 108.125, + "epoch": 0.11368421052631579, + "grad_norm": 0.024311864748597145, + "learning_rate": 9.996955489016681e-07, + "loss": 0.0006, + "num_tokens": 26444507.0, + "reward": 1.5553173422813416, + "reward_std": 0.38553351908922195, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5234375, + "rewards/stop_prediction_reward/std": 0.3467428870499134, + "rewards/waypoint_pred_accuracy/mean": 0.015939914255370412, + "rewards/waypoint_pred_accuracy/std": 0.039412272800977984, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 667.5, + "completions/max_terminated_length": 667.5, + "completions/mean_length": 246.060546875, + "completions/mean_terminated_length": 246.060546875, + "completions/min_length": 111.125, + "completions/min_terminated_length": 111.125, + "epoch": 0.11578947368421053, + "grad_norm": 0.020382562652230263, + "learning_rate": 9.995616121722783e-07, + "loss": 0.0024, + "num_tokens": 26931770.0, + "reward": 1.543064832687378, + "reward_std": 0.35424431413412094, + "rewards/format_reward_embodied/mean": 0.990234375, + "rewards/format_reward_embodied/std": 0.06879601255059242, + "rewards/stop_prediction_reward/mean": 0.548828125, + "rewards/stop_prediction_reward/std": 0.3284119311720133, + "rewards/waypoint_pred_accuracy/mean": 0.002001162469869708, + "rewards/waypoint_pred_accuracy/std": 0.01592240231514961, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.75, + "completions/max_terminated_length": 462.75, + "completions/mean_length": 225.984375, + "completions/mean_terminated_length": 225.984375, + "completions/min_length": 113.875, + "completions/min_terminated_length": 113.875, + "epoch": 0.11789473684210526, + "grad_norm": 0.020005574449896812, + "learning_rate": 9.994033404481736e-07, + "loss": 0.0004, + "num_tokens": 27406770.0, + "reward": 1.6594459414482117, + "reward_std": 0.3725608382374048, + "rewards/format_reward_embodied/mean": 0.986328125, + "rewards/format_reward_embodied/std": 0.07980126701295376, + "rewards/stop_prediction_reward/mean": 0.6640625, + "rewards/stop_prediction_reward/std": 0.33192422799766064, + "rewards/waypoint_pred_accuracy/mean": 0.004527658324403952, + "rewards/waypoint_pred_accuracy/std": 0.017977507960307844, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.5, + "completions/max_terminated_length": 450.5, + "completions/mean_length": 234.10546875, + "completions/mean_terminated_length": 234.10546875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.12, + "grad_norm": 0.02229822427034378, + "learning_rate": 9.992207422966824e-07, + "loss": -0.0001, + "num_tokens": 27889640.0, + "reward": 1.4391246140003204, + "reward_std": 0.35483869537711143, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.4375, + "rewards/stop_prediction_reward/std": 0.34185592643916607, + "rewards/waypoint_pred_accuracy/mean": 0.002765428128381, + "rewards/waypoint_pred_accuracy/std": 0.014523094108374813, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 747.625, + "completions/max_terminated_length": 747.625, + "completions/mean_length": 233.369140625, + "completions/mean_terminated_length": 233.90501022338867, + "completions/min_length": 99.75, + "completions/min_terminated_length": 113.5, + "epoch": 0.12210526315789473, + "grad_norm": 0.025426389649510384, + "learning_rate": 9.990138276019335e-07, + "loss": 0.0038, + "num_tokens": 28370213.0, + "reward": 1.6101858913898468, + "reward_std": 0.3749941308051348, + "rewards/format_reward_embodied/mean": 0.9921875, + "rewards/format_reward_embodied/std": 0.05317101255059242, + "rewards/stop_prediction_reward/mean": 0.59375, + "rewards/stop_prediction_reward/std": 0.32783540338277817, + "rewards/waypoint_pred_accuracy/mean": 0.012124196402905074, + "rewards/waypoint_pred_accuracy/std": 0.041881935120967384, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1494.75, + "completions/max_terminated_length": 1494.75, + "completions/mean_length": 264.875, + "completions/mean_terminated_length": 264.875, + "completions/min_length": 119.125, + "completions/min_terminated_length": 119.125, + "epoch": 0.12421052631578948, + "grad_norm": 0.029843533411622047, + "learning_rate": 9.987826075643228e-07, + "loss": 0.0225, + "num_tokens": 28866405.0, + "reward": 1.608642503619194, + "reward_std": 0.37335721030831337, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.07509202510118484, + "rewards/stop_prediction_reward/mean": 0.619140625, + "rewards/stop_prediction_reward/std": 0.33958676643669605, + "rewards/waypoint_pred_accuracy/mean": 0.0006103267239083258, + "rewards/waypoint_pred_accuracy/std": 0.003143552353714926, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.875, + "completions/max_terminated_length": 460.875, + "completions/mean_length": 250.77734375, + "completions/mean_terminated_length": 250.77734375, + "completions/min_length": 122.25, + "completions/min_terminated_length": 122.25, + "epoch": 0.12631578947368421, + "grad_norm": 0.02350872941315174, + "learning_rate": 9.985270946999066e-07, + "loss": -0.0006, + "num_tokens": 29357939.0, + "reward": 1.5302962958812714, + "reward_std": 0.39041563123464584, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.51171875, + "rewards/stop_prediction_reward/std": 0.3654658328741789, + "rewards/waypoint_pred_accuracy/mean": 0.01026534708216827, + "rewards/waypoint_pred_accuracy/std": 0.03301238967106072, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 254.34765625, + "completions/mean_terminated_length": 254.9265899658203, + "completions/min_length": 107.25, + "completions/min_terminated_length": 128.25, + "epoch": 0.12842105263157894, + "grad_norm": 0.028141073882579803, + "learning_rate": 9.982473028397236e-07, + "loss": 0.004, + "num_tokens": 29850341.0, + "reward": 1.908710554242134, + "reward_std": 0.4512513056397438, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.07509202510118484, + "rewards/stop_prediction_reward/mean": 0.8515625, + "rewards/stop_prediction_reward/std": 0.34670688211917877, + "rewards/waypoint_pred_accuracy/mean": 0.034433397710529334, + "rewards/waypoint_pred_accuracy/std": 0.06185820607151982, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 888.0, + "completions/max_terminated_length": 888.0, + "completions/mean_length": 255.419921875, + "completions/mean_terminated_length": 255.419921875, + "completions/min_length": 119.125, + "completions/min_terminated_length": 119.125, + "epoch": 0.13052631578947368, + "grad_norm": 0.0289030522108078, + "learning_rate": 9.979432471290472e-07, + "loss": 0.0055, + "num_tokens": 30342588.0, + "reward": 1.9491963237524033, + "reward_std": 0.4529041275382042, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.802734375, + "rewards/stop_prediction_reward/std": 0.3396564405411482, + "rewards/waypoint_pred_accuracy/mean": 0.07518408738549646, + "rewards/waypoint_pred_accuracy/std": 0.09874280986073203, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.625, + "completions/max_terminated_length": 485.625, + "completions/mean_length": 243.8125, + "completions/mean_terminated_length": 243.8125, + "completions/min_length": 104.25, + "completions/min_terminated_length": 104.25, + "epoch": 0.13263157894736843, + "grad_norm": 0.02499276027083397, + "learning_rate": 9.97614944026565e-07, + "loss": -0.0005, + "num_tokens": 30829532.0, + "reward": 1.6152346730232239, + "reward_std": 0.3560887239873409, + "rewards/format_reward_embodied/mean": 0.9921875, + "rewards/format_reward_embodied/std": 0.043842025101184845, + "rewards/stop_prediction_reward/mean": 0.623046875, + "rewards/stop_prediction_reward/std": 0.3392041679471731, + "rewards/waypoint_pred_accuracy/mean": 1.5068036063656496e-07, + "rewards/waypoint_pred_accuracy/std": 1.1848213716913133e-06, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.75, + "completions/max_terminated_length": 487.75, + "completions/mean_length": 242.8671875, + "completions/mean_terminated_length": 242.8671875, + "completions/min_length": 116.875, + "completions/min_terminated_length": 116.875, + "epoch": 0.13473684210526315, + "grad_norm": 0.08438508957624435, + "learning_rate": 9.97262411303488e-07, + "loss": -0.0008, + "num_tokens": 31316376.0, + "reward": 1.676452487707138, + "reward_std": 0.38882749900221825, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.654296875, + "rewards/stop_prediction_reward/std": 0.3573997803032398, + "rewards/waypoint_pred_accuracy/mean": 0.014007493944973248, + "rewards/waypoint_pred_accuracy/std": 0.033498459750262555, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.75, + "completions/max_terminated_length": 715.75, + "completions/mean_length": 239.49609375, + "completions/mean_terminated_length": 239.49609375, + "completions/min_length": 118.25, + "completions/min_terminated_length": 118.25, + "epoch": 0.1368421052631579, + "grad_norm": 0.027747681364417076, + "learning_rate": 9.968856680425886e-07, + "loss": 0.0053, + "num_tokens": 31798614.0, + "reward": 1.7896013855934143, + "reward_std": 0.33628559671342373, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.7890625, + "rewards/stop_prediction_reward/std": 0.3223333489149809, + "rewards/waypoint_pred_accuracy/mean": 0.0022225715887884694, + "rewards/waypoint_pred_accuracy/std": 0.00479067146991952, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.75, + "completions/max_terminated_length": 639.75, + "completions/mean_length": 251.96875, + "completions/mean_terminated_length": 251.96875, + "completions/min_length": 115.625, + "completions/min_terminated_length": 115.625, + "epoch": 0.13894736842105262, + "grad_norm": 0.028032371774315834, + "learning_rate": 9.964847346371676e-07, + "loss": 0.004, + "num_tokens": 32286790.0, + "reward": 1.8506833761930466, + "reward_std": 0.3642146345227957, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.830078125, + "rewards/stop_prediction_reward/std": 0.3009116370230913, + "rewards/waypoint_pred_accuracy/mean": 0.013232328761660028, + "rewards/waypoint_pred_accuracy/std": 0.03860976189025678, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 481.25, + "completions/max_terminated_length": 481.25, + "completions/mean_length": 247.291015625, + "completions/mean_terminated_length": 247.7331371307373, + "completions/min_length": 100.25, + "completions/min_terminated_length": 115.125, + "epoch": 0.14105263157894737, + "grad_norm": 0.021038714796304703, + "learning_rate": 9.96059632789951e-07, + "loss": -0.0006, + "num_tokens": 32774747.0, + "reward": 1.5779267400503159, + "reward_std": 0.42914118245244026, + "rewards/format_reward_embodied/mean": 0.9921875, + "rewards/format_reward_embodied/std": 0.05317101255059242, + "rewards/stop_prediction_reward/mean": 0.505859375, + "rewards/stop_prediction_reward/std": 0.3645100612193346, + "rewards/waypoint_pred_accuracy/mean": 0.039939936966421924, + "rewards/waypoint_pred_accuracy/std": 0.058027153559010024, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 748.875, + "completions/max_terminated_length": 748.875, + "completions/mean_length": 253.40234375, + "completions/mean_terminated_length": 253.40234375, + "completions/min_length": 119.875, + "completions/min_terminated_length": 119.875, + "epoch": 0.1431578947368421, + "grad_norm": 0.024472616612911224, + "learning_rate": 9.956103855119138e-07, + "loss": 0.0036, + "num_tokens": 33265961.0, + "reward": 1.6555908024311066, + "reward_std": 0.385839419439435, + "rewards/format_reward_embodied/mean": 0.990234375, + "rewards/format_reward_embodied/std": 0.078125, + "rewards/stop_prediction_reward/mean": 0.6640625, + "rewards/stop_prediction_reward/std": 0.36797660402953625, + "rewards/waypoint_pred_accuracy/mean": 0.0006469660570671656, + "rewards/waypoint_pred_accuracy/std": 0.003268853310080662, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.375, + "completions/max_terminated_length": 496.375, + "completions/mean_length": 253.37109375, + "completions/mean_terminated_length": 253.37109375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.14526315789473684, + "grad_norm": 0.038161348551511765, + "learning_rate": 9.951370171210359e-07, + "loss": 0.002, + "num_tokens": 33757543.0, + "reward": 1.755501314997673, + "reward_std": 0.34114088118076324, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.32307766377925873, + "rewards/waypoint_pred_accuracy/mean": 0.004703786010358479, + "rewards/waypoint_pred_accuracy/std": 0.016449308837348298, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 833.25, + "completions/max_terminated_length": 833.25, + "completions/mean_length": 262.56640625, + "completions/mean_terminated_length": 262.56640625, + "completions/min_length": 109.875, + "completions/min_terminated_length": 109.875, + "epoch": 0.14736842105263157, + "grad_norm": 0.024708310142159462, + "learning_rate": 9.946395532409847e-07, + "loss": 0.0085, + "num_tokens": 34253513.0, + "reward": 1.7724248170852661, + "reward_std": 0.354646734893322, + "rewards/format_reward_embodied/mean": 0.98828125, + "rewards/format_reward_embodied/std": 0.06417626701295376, + "rewards/stop_prediction_reward/mean": 0.7734375, + "rewards/stop_prediction_reward/std": 0.3255934212356806, + "rewards/waypoint_pred_accuracy/mean": 0.005353036525642015, + "rewards/waypoint_pred_accuracy/std": 0.013709596910865714, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.25, + "completions/max_terminated_length": 497.25, + "completions/mean_length": 261.169921875, + "completions/mean_terminated_length": 261.169921875, + "completions/min_length": 127.25, + "completions/min_terminated_length": 127.25, + "epoch": 0.14947368421052631, + "grad_norm": 0.02306721918284893, + "learning_rate": 9.941180207997288e-07, + "loss": -0.0001, + "num_tokens": 34747616.0, + "reward": 1.8229519128799438, + "reward_std": 0.33919697254896164, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.81640625, + "rewards/stop_prediction_reward/std": 0.3285626471042633, + "rewards/waypoint_pred_accuracy/mean": 0.003272829200625438, + "rewards/waypoint_pred_accuracy/std": 0.01800350467010503, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 244.701171875, + "completions/mean_terminated_length": 244.701171875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.15157894736842106, + "grad_norm": 0.03545621410012245, + "learning_rate": 9.935724480280795e-07, + "loss": 0.0, + "num_tokens": 35235719.0, + "reward": 1.7902133017778397, + "reward_std": 0.45963282138109207, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.6796875, + "rewards/stop_prediction_reward/std": 0.38505756109952927, + "rewards/waypoint_pred_accuracy/mean": 0.056239478069983306, + "rewards/waypoint_pred_accuracy/std": 0.08321574779984076, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.125, + "completions/max_terminated_length": 423.125, + "completions/mean_length": 241.056640625, + "completions/mean_terminated_length": 241.056640625, + "completions/min_length": 115.875, + "completions/min_terminated_length": 115.875, + "epoch": 0.15368421052631578, + "grad_norm": 0.025743646547198296, + "learning_rate": 9.93002864458164e-07, + "loss": 0.0017, + "num_tokens": 35717284.0, + "reward": 1.7117034643888474, + "reward_std": 0.34931979328393936, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.7109375, + "rewards/stop_prediction_reward/std": 0.3488571159541607, + "rewards/waypoint_pred_accuracy/mean": 0.00038298743026394556, + "rewards/waypoint_pred_accuracy/std": 0.0022788381146801144, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.375, + "completions/max_terminated_length": 484.375, + "completions/mean_length": 236.037109375, + "completions/mean_terminated_length": 236.037109375, + "completions/min_length": 110.375, + "completions/min_terminated_length": 110.375, + "epoch": 0.15578947368421053, + "grad_norm": 0.031387291848659515, + "learning_rate": 9.924093009218252e-07, + "loss": -0.0001, + "num_tokens": 36196791.0, + "reward": 1.53607939183712, + "reward_std": 0.3938233330845833, + "rewards/format_reward_embodied/mean": 0.990234375, + "rewards/format_reward_embodied/std": 0.04855126701295376, + "rewards/stop_prediction_reward/mean": 0.515625, + "rewards/stop_prediction_reward/std": 0.35425616055727005, + "rewards/waypoint_pred_accuracy/mean": 0.01511000881408156, + "rewards/waypoint_pred_accuracy/std": 0.024702310350773107, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 682.875, + "completions/max_terminated_length": 682.875, + "completions/mean_length": 246.58203125, + "completions/mean_terminated_length": 247.04151344299316, + "completions/min_length": 102.375, + "completions/min_terminated_length": 120.375, + "epoch": 0.15789473684210525, + "grad_norm": 0.026448730379343033, + "learning_rate": 9.917917895489542e-07, + "loss": 0.0036, + "num_tokens": 36683937.0, + "reward": 1.7273263335227966, + "reward_std": 0.3566751927137375, + "rewards/format_reward_embodied/mean": 0.98046875, + "rewards/format_reward_embodied/std": 0.12826303765177727, + "rewards/stop_prediction_reward/mean": 0.7109375, + "rewards/stop_prediction_reward/std": 0.2865038700401783, + "rewards/waypoint_pred_accuracy/mean": 0.01796005329746431, + "rewards/waypoint_pred_accuracy/std": 0.03300400403918848, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.5, + "completions/max_terminated_length": 466.5, + "completions/mean_length": 246.384765625, + "completions/mean_terminated_length": 246.384765625, + "completions/min_length": 122.625, + "completions/min_terminated_length": 122.625, + "epoch": 0.16, + "grad_norm": 0.026203418150544167, + "learning_rate": 9.9115036376575e-07, + "loss": 0.0007, + "num_tokens": 37170854.0, + "reward": 1.4314483553171158, + "reward_std": 0.3206907380372286, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.435546875, + "rewards/stop_prediction_reward/std": 0.31111637130379677, + "rewards/waypoint_pred_accuracy/mean": 0.0008804261656223389, + "rewards/waypoint_pred_accuracy/std": 0.007043409327252448, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 461.375, + "completions/max_terminated_length": 461.375, + "completions/mean_length": 245.017578125, + "completions/mean_terminated_length": 245.51959609985352, + "completions/min_length": 97.0, + "completions/min_terminated_length": 110.875, + "epoch": 0.16210526315789472, + "grad_norm": 0.021599190309643745, + "learning_rate": 9.904850582929109e-07, + "loss": -0.001, + "num_tokens": 37659375.0, + "reward": 1.7587501555681229, + "reward_std": 0.45513000525534153, + "rewards/format_reward_embodied/mean": 0.9765625, + "rewards/format_reward_embodied/std": 0.12835253402590752, + "rewards/stop_prediction_reward/mean": 0.6328125, + "rewards/stop_prediction_reward/std": 0.3041498549282551, + "rewards/waypoint_pred_accuracy/mean": 0.07468759280809832, + "rewards/waypoint_pred_accuracy/std": 0.08736424083447313, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.375, + "completions/max_terminated_length": 431.375, + "completions/mean_length": 251.69921875, + "completions/mean_terminated_length": 251.69921875, + "completions/min_length": 118.125, + "completions/min_terminated_length": 118.125, + "epoch": 0.16421052631578947, + "grad_norm": 0.023476749658584595, + "learning_rate": 9.897959091437545e-07, + "loss": -0.0017, + "num_tokens": 38147157.0, + "reward": 1.7629946172237396, + "reward_std": 0.3216873835772276, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.72265625, + "rewards/stop_prediction_reward/std": 0.2610730957239866, + "rewards/waypoint_pred_accuracy/mean": 0.02212230950329659, + "rewards/waypoint_pred_accuracy/std": 0.03588474957179061, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.875, + "completions/max_terminated_length": 464.875, + "completions/mean_length": 238.640625, + "completions/mean_terminated_length": 238.640625, + "completions/min_length": 118.375, + "completions/min_terminated_length": 118.375, + "epoch": 0.16631578947368422, + "grad_norm": 0.03286973387002945, + "learning_rate": 9.890829536222686e-07, + "loss": -0.0025, + "num_tokens": 38629981.0, + "reward": 1.7297946512699127, + "reward_std": 0.2887336425483227, + "rewards/format_reward_embodied/mean": 0.974609375, + "rewards/format_reward_embodied/std": 0.12744012847542763, + "rewards/stop_prediction_reward/mean": 0.74609375, + "rewards/stop_prediction_reward/std": 0.23198767006397247, + "rewards/waypoint_pred_accuracy/mean": 0.004545772711879333, + "rewards/waypoint_pred_accuracy/std": 0.008970102488097233, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.125, + "completions/max_terminated_length": 476.125, + "completions/mean_length": 233.671875, + "completions/mean_terminated_length": 233.671875, + "completions/min_length": 97.875, + "completions/min_terminated_length": 97.875, + "epoch": 0.16842105263157894, + "grad_norm": 0.022502202540636063, + "learning_rate": 9.88346230321092e-07, + "loss": 0.0003, + "num_tokens": 39111733.0, + "reward": 1.6976664066314697, + "reward_std": 0.36001696437597275, + "rewards/format_reward_embodied/mean": 0.986328125, + "rewards/format_reward_embodied/std": 0.09071702510118484, + "rewards/stop_prediction_reward/mean": 0.68359375, + "rewards/stop_prediction_reward/std": 0.3152890168130398, + "rewards/waypoint_pred_accuracy/mean": 0.013872268769774525, + "rewards/waypoint_pred_accuracy/std": 0.03006945856032351, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.875, + "completions/max_terminated_length": 428.875, + "completions/mean_length": 236.8515625, + "completions/mean_terminated_length": 236.8515625, + "completions/min_length": 110.125, + "completions/min_terminated_length": 110.125, + "epoch": 0.1705263157894737, + "grad_norm": 0.03461969271302223, + "learning_rate": 9.875857791194251e-07, + "loss": 0.0001, + "num_tokens": 39593449.0, + "reward": 1.8557344675064087, + "reward_std": 0.3654259257018566, + "rewards/format_reward_embodied/mean": 0.9921875, + "rewards/format_reward_embodied/std": 0.05317101255059242, + "rewards/stop_prediction_reward/mean": 0.798828125, + "rewards/stop_prediction_reward/std": 0.2945959325879812, + "rewards/waypoint_pred_accuracy/mean": 0.032359407392959616, + "rewards/waypoint_pred_accuracy/std": 0.051165802276623415, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 773.75, + "completions/max_terminated_length": 773.75, + "completions/mean_length": 242.849609375, + "completions/mean_terminated_length": 242.849609375, + "completions/min_length": 111.625, + "completions/min_terminated_length": 111.625, + "epoch": 0.1726315789473684, + "grad_norm": 0.025375094264745712, + "learning_rate": 9.868016411808711e-07, + "loss": 0.008, + "num_tokens": 40080732.0, + "reward": 1.8185276985168457, + "reward_std": 0.2778010666370392, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.822265625, + "rewards/stop_prediction_reward/std": 0.26144965551793575, + "rewards/waypoint_pred_accuracy/mean": 8.415357677159232e-05, + "rewards/waypoint_pred_accuracy/std": 0.0006628100762255675, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 239.771484375, + "completions/mean_terminated_length": 239.771484375, + "completions/min_length": 118.125, + "completions/min_terminated_length": 118.125, + "epoch": 0.17473684210526316, + "grad_norm": 0.02282722294330597, + "learning_rate": 9.85993858951209e-07, + "loss": -0.0, + "num_tokens": 40564327.0, + "reward": 1.8071236461400986, + "reward_std": 0.3150872718542814, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.7109375, + "rewards/stop_prediction_reward/std": 0.2674474287778139, + "rewards/waypoint_pred_accuracy/mean": 0.04809306015794699, + "rewards/waypoint_pred_accuracy/std": 0.0513845629028903, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.875, + "completions/max_terminated_length": 428.875, + "completions/mean_length": 237.19140625, + "completions/mean_terminated_length": 237.19140625, + "completions/min_length": 117.75, + "completions/min_terminated_length": 117.75, + "epoch": 0.17684210526315788, + "grad_norm": 0.024264369159936905, + "learning_rate": 9.851624761560941e-07, + "loss": 0.0003, + "num_tokens": 41047305.0, + "reward": 1.6365228593349457, + "reward_std": 0.2356659732758999, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.603515625, + "rewards/stop_prediction_reward/std": 0.21942270919680595, + "rewards/waypoint_pred_accuracy/mean": 0.016503638941257396, + "rewards/waypoint_pred_accuracy/std": 0.019282840457430492, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.125, + "completions/max_terminated_length": 443.125, + "completions/mean_length": 241.103515625, + "completions/mean_terminated_length": 241.103515625, + "completions/min_length": 119.125, + "completions/min_terminated_length": 119.125, + "epoch": 0.17894736842105263, + "grad_norm": 0.020238297060132027, + "learning_rate": 9.843075377986927e-07, + "loss": 0.0003, + "num_tokens": 41531134.0, + "reward": 1.74451445043087, + "reward_std": 0.27907660976052284, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.693359375, + "rewards/stop_prediction_reward/std": 0.22875236719846725, + "rewards/waypoint_pred_accuracy/mean": 0.025577549161396262, + "rewards/waypoint_pred_accuracy/std": 0.03110517306985683, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.5, + "completions/max_terminated_length": 419.5, + "completions/mean_length": 231.287109375, + "completions/mean_terminated_length": 231.287109375, + "completions/min_length": 118.875, + "completions/min_terminated_length": 118.875, + "epoch": 0.18105263157894738, + "grad_norm": 0.02773391455411911, + "learning_rate": 9.834290901572454e-07, + "loss": 0.0005, + "num_tokens": 42010385.0, + "reward": 1.8274007737636566, + "reward_std": 0.2518170941621065, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.822265625, + "rewards/stop_prediction_reward/std": 0.2328737936913967, + "rewards/waypoint_pred_accuracy/mean": 0.0035441384432527657, + "rewards/waypoint_pred_accuracy/std": 0.011852368814229316, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.25, + "completions/max_terminated_length": 486.25, + "completions/mean_length": 242.701171875, + "completions/mean_terminated_length": 242.701171875, + "completions/min_length": 112.125, + "completions/min_terminated_length": 112.125, + "epoch": 0.1831578947368421, + "grad_norm": 0.017116429284214973, + "learning_rate": 9.82527180782562e-07, + "loss": 0.0007, + "num_tokens": 42495544.0, + "reward": 1.7215514183044434, + "reward_std": 0.24272998422384262, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.71875, + "rewards/stop_prediction_reward/std": 0.22877886332571507, + "rewards/waypoint_pred_accuracy/mean": 0.0033538464465563867, + "rewards/waypoint_pred_accuracy/std": 0.008977477041369256, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.625, + "completions/max_terminated_length": 407.625, + "completions/mean_length": 227.666015625, + "completions/mean_terminated_length": 227.666015625, + "completions/min_length": 124.125, + "completions/min_terminated_length": 124.125, + "epoch": 0.18526315789473685, + "grad_norm": 0.025679390877485275, + "learning_rate": 9.816018584954474e-07, + "loss": -0.0001, + "num_tokens": 42972109.0, + "reward": 1.543754830956459, + "reward_std": 0.3515052441507578, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.423828125, + "rewards/stop_prediction_reward/std": 0.25312468968331814, + "rewards/waypoint_pred_accuracy/mean": 0.06093992558778956, + "rewards/waypoint_pred_accuracy/std": 0.090891926485829, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 229.5078125, + "completions/mean_terminated_length": 229.5078125, + "completions/min_length": 114.375, + "completions/min_terminated_length": 114.375, + "epoch": 0.18736842105263157, + "grad_norm": 0.025896085426211357, + "learning_rate": 9.806531733840594e-07, + "loss": 0.0009, + "num_tokens": 43453905.0, + "reward": 1.9336253255605698, + "reward_std": 0.3347325623035431, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.82421875, + "rewards/stop_prediction_reward/std": 0.24601666443049908, + "rewards/waypoint_pred_accuracy/mean": 0.05470328652882017, + "rewards/waypoint_pred_accuracy/std": 0.07181658712215722, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.375, + "completions/max_terminated_length": 432.375, + "completions/mean_length": 235.38671875, + "completions/mean_terminated_length": 235.38671875, + "completions/min_length": 115.875, + "completions/min_terminated_length": 115.875, + "epoch": 0.18947368421052632, + "grad_norm": 0.03356796130537987, + "learning_rate": 9.796811768011975e-07, + "loss": -0.0006, + "num_tokens": 43934935.0, + "reward": 1.7414738535881042, + "reward_std": 0.30686922930181026, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.607421875, + "rewards/stop_prediction_reward/std": 0.2297498807311058, + "rewards/waypoint_pred_accuracy/mean": 0.06702600460093983, + "rewards/waypoint_pred_accuracy/std": 0.05166525034704193, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.25, + "completions/max_terminated_length": 406.25, + "completions/mean_length": 222.359375, + "completions/mean_terminated_length": 222.359375, + "completions/min_length": 118.875, + "completions/min_terminated_length": 118.875, + "epoch": 0.19157894736842104, + "grad_norm": 0.01943252608180046, + "learning_rate": 9.78685921361522e-07, + "loss": -0.0005, + "num_tokens": 44409551.0, + "reward": 1.6487830728292465, + "reward_std": 0.30546887032687664, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.58203125, + "rewards/stop_prediction_reward/std": 0.2547878734767437, + "rewards/waypoint_pred_accuracy/mean": 0.03337590532140797, + "rewards/waypoint_pred_accuracy/std": 0.052433368229147945, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.375, + "completions/max_terminated_length": 385.375, + "completions/mean_length": 217.119140625, + "completions/mean_terminated_length": 217.119140625, + "completions/min_length": 113.5, + "completions/min_terminated_length": 113.5, + "epoch": 0.1936842105263158, + "grad_norm": 0.021023401990532875, + "learning_rate": 9.776674609387076e-07, + "loss": -0.0006, + "num_tokens": 44880844.0, + "reward": 1.756135731935501, + "reward_std": 0.264364130795002, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.7109375, + "rewards/stop_prediction_reward/std": 0.23147857002913952, + "rewards/waypoint_pred_accuracy/mean": 0.022599116048866108, + "rewards/waypoint_pred_accuracy/std": 0.03211659606313333, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.125, + "completions/max_terminated_length": 400.125, + "completions/mean_length": 218.37109375, + "completions/mean_terminated_length": 218.37109375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.1957894736842105, + "grad_norm": 0.020804792642593384, + "learning_rate": 9.766258506625257e-07, + "loss": 0.0009, + "num_tokens": 45354762.0, + "reward": 1.703179121017456, + "reward_std": 0.31406174413859844, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.623046875, + "rewards/stop_prediction_reward/std": 0.2401380892843008, + "rewards/waypoint_pred_accuracy/mean": 0.04104270155312406, + "rewards/waypoint_pred_accuracy/std": 0.047771165754966205, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.125, + "completions/max_terminated_length": 385.125, + "completions/mean_length": 220.767578125, + "completions/mean_terminated_length": 220.767578125, + "completions/min_length": 108.25, + "completions/min_terminated_length": 108.25, + "epoch": 0.19789473684210526, + "grad_norm": 0.05812095105648041, + "learning_rate": 9.75561146915861e-07, + "loss": 0.0007, + "num_tokens": 45828691.0, + "reward": 1.7804509848356247, + "reward_std": 0.24969635531306267, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.728515625, + "rewards/stop_prediction_reward/std": 0.2045249417424202, + "rewards/waypoint_pred_accuracy/mean": 0.02596768177553266, + "rewards/waypoint_pred_accuracy/std": 0.03161624588427525, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.125, + "completions/max_terminated_length": 406.125, + "completions/mean_length": 221.861328125, + "completions/mean_terminated_length": 221.861328125, + "completions/min_length": 118.75, + "completions/min_terminated_length": 118.75, + "epoch": 0.2, + "grad_norm": 0.019957855343818665, + "learning_rate": 9.744734073316595e-07, + "loss": -0.0003, + "num_tokens": 46303244.0, + "reward": 1.6937783360481262, + "reward_std": 0.18066157400608063, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.6171875, + "rewards/stop_prediction_reward/std": 0.12835253402590752, + "rewards/waypoint_pred_accuracy/mean": 0.03829541802406311, + "rewards/waypoint_pred_accuracy/std": 0.038537144660949735, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.5, + "completions/max_terminated_length": 407.5, + "completions/mean_length": 213.76953125, + "completions/mean_terminated_length": 213.76953125, + "completions/min_length": 101.625, + "completions/min_terminated_length": 101.625, + "epoch": 0.20210526315789473, + "grad_norm": 0.02102278172969818, + "learning_rate": 9.73362690789808e-07, + "loss": -0.001, + "num_tokens": 46770710.0, + "reward": 1.7461132854223251, + "reward_std": 0.28678105026483536, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.640625, + "rewards/stop_prediction_reward/std": 0.21690494194626808, + "rewards/waypoint_pred_accuracy/mean": 0.05372073073522188, + "rewards/waypoint_pred_accuracy/std": 0.054212798771914095, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 219.494140625, + "completions/mean_terminated_length": 219.494140625, + "completions/min_length": 107.75, + "completions/min_terminated_length": 107.75, + "epoch": 0.20421052631578948, + "grad_norm": 0.023958692327141762, + "learning_rate": 9.722290574139486e-07, + "loss": -0.0001, + "num_tokens": 47243155.0, + "reward": 1.747300535440445, + "reward_std": 0.17760824598371983, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.7421875, + "rewards/stop_prediction_reward/std": 0.15745450742542744, + "rewards/waypoint_pred_accuracy/mean": 0.0035330769751453772, + "rewards/waypoint_pred_accuracy/std": 0.017783919582143426, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.375, + "completions/max_terminated_length": 380.375, + "completions/mean_length": 215.083984375, + "completions/mean_terminated_length": 215.083984375, + "completions/min_length": 109.125, + "completions/min_terminated_length": 109.125, + "epoch": 0.2063157894736842, + "grad_norm": 0.016943305730819702, + "learning_rate": 9.71072568568222e-07, + "loss": -0.0005, + "num_tokens": 47713406.0, + "reward": 1.851816326379776, + "reward_std": 0.2104954868555069, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.849609375, + "rewards/stop_prediction_reward/std": 0.1879090555012226, + "rewards/waypoint_pred_accuracy/mean": 0.0020800431666430333, + "rewards/waypoint_pred_accuracy/std": 0.011219009378692333, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.625, + "completions/max_terminated_length": 392.625, + "completions/mean_length": 211.724609375, + "completions/mean_terminated_length": 211.724609375, + "completions/min_length": 104.125, + "completions/min_terminated_length": 104.125, + "epoch": 0.20842105263157895, + "grad_norm": 0.014324544928967953, + "learning_rate": 9.698932868539475e-07, + "loss": -0.0006, + "num_tokens": 48180849.0, + "reward": 1.6212971061468124, + "reward_std": 0.16710768891550742, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.498046875, + "rewards/stop_prediction_reward/std": 0.07980126701295376, + "rewards/waypoint_pred_accuracy/mean": 0.06260167788853366, + "rewards/waypoint_pred_accuracy/std": 0.04448781939231594, + "step": 99 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.016340678557753563, + "learning_rate": 9.686912761062337e-07, + "loss": -0.001, + "step": 100 + }, + { + "epoch": 0.21052631578947367, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.00015625, + "eval_completions/max_length": 457.89, + "eval_completions/max_terminated_length": 457.89, + "eval_completions/mean_length": 219.56675354003906, + "eval_completions/mean_terminated_length": 219.59981658935547, + "eval_completions/min_length": 112.23, + "eval_completions/min_terminated_length": 113.11, + "eval_loss": 0.0013147207209840417, + "eval_num_tokens": 48647110.0, + "eval_reward": 1.8446430933475495, + "eval_reward_std": 0.20415274247365858, + "eval_rewards/format_reward_embodied/mean": 0.99921875, + "eval_rewards/format_reward_embodied/std": 0.00625, + "eval_rewards/stop_prediction_reward/mean": 0.75734375, + "eval_rewards/stop_prediction_reward/std": 0.1349847713112831, + "eval_rewards/waypoint_pred_accuracy/mean": 0.04404030321765386, + "eval_rewards/waypoint_pred_accuracy/std": 0.04408785154897764, + "eval_runtime": 1143.7465, + "eval_samples_per_second": 0.087, + "eval_steps_per_second": 0.002, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.1875, + "completions/max_terminated_length": 387.1875, + "completions/mean_length": 211.564453125, + "completions/mean_terminated_length": 211.564453125, + "completions/min_length": 109.25, + "completions/min_terminated_length": 109.25, + "epoch": 0.21263157894736842, + "grad_norm": 7.573169568786398e-05, + "learning_rate": 9.674666013905223e-07, + "loss": 0.0, + "num_tokens": 49119859.0, + "reward": 1.8795468658208847, + "reward_std": 0.13644913337626896, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.8642578125, + "rewards/stop_prediction_reward/std": 0.10762263275682926, + "rewards/waypoint_pred_accuracy/mean": 0.007644527649440409, + "rewards/waypoint_pred_accuracy/std": 0.02138127497060599, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.625, + "completions/max_terminated_length": 423.625, + "completions/mean_length": 216.150390625, + "completions/mean_terminated_length": 216.150390625, + "completions/min_length": 112.875, + "completions/min_terminated_length": 112.875, + "epoch": 0.21473684210526317, + "grad_norm": 0.02024409919977188, + "learning_rate": 9.662193289990683e-07, + "loss": 0.0001, + "num_tokens": 49589696.0, + "reward": 1.9582752585411072, + "reward_std": 0.170343800484261, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.059467025101184845, + "rewards/waypoint_pred_accuracy/mean": 0.04359079086862039, + "rewards/waypoint_pred_accuracy/std": 0.05229037126991898, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.875, + "completions/max_terminated_length": 419.875, + "completions/mean_length": 220.193359375, + "completions/mean_terminated_length": 220.193359375, + "completions/min_length": 110.5, + "completions/min_terminated_length": 110.5, + "epoch": 0.2168421052631579, + "grad_norm": 0.020008524879813194, + "learning_rate": 9.649495264473496e-07, + "loss": -0.0008, + "num_tokens": 50062179.0, + "reward": 1.4985756427049637, + "reward_std": 0.15706685557961464, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.15027354657649994, + "rewards/waypoint_pred_accuracy/mean": 0.00026439113654092976, + "rewards/waypoint_pred_accuracy/std": 0.0021146056694039514, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.375, + "completions/max_terminated_length": 389.375, + "completions/mean_length": 224.572265625, + "completions/mean_terminated_length": 224.572265625, + "completions/min_length": 113.5, + "completions/min_terminated_length": 113.5, + "epoch": 0.21894736842105264, + "grad_norm": 0.010618757456541061, + "learning_rate": 9.636572624704126e-07, + "loss": 0.0008, + "num_tokens": 50537928.0, + "reward": 2.029994383454323, + "reward_std": 0.16062275879085064, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.98046875, + "rewards/stop_prediction_reward/std": 0.11734727956354618, + "rewards/waypoint_pred_accuracy/mean": 0.02476281741601838, + "rewards/waypoint_pred_accuracy/std": 0.035420115480633285, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.75, + "completions/max_terminated_length": 375.75, + "completions/mean_length": 208.478515625, + "completions/mean_terminated_length": 208.478515625, + "completions/min_length": 112.875, + "completions/min_terminated_length": 112.875, + "epoch": 0.22105263157894736, + "grad_norm": 0.0, + "learning_rate": 9.62342607019152e-07, + "loss": 0.0003, + "num_tokens": 51004477.0, + "reward": 1.7948767840862274, + "reward_std": 0.0999540267221164, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.734375, + "rewards/stop_prediction_reward/std": 0.0729166679084301, + "rewards/waypoint_pred_accuracy/mean": 0.03025089303362355, + "rewards/waypoint_pred_accuracy/std": 0.024821761748978988, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.25, + "completions/max_terminated_length": 391.25, + "completions/mean_length": 218.388671875, + "completions/mean_terminated_length": 218.388671875, + "completions/min_length": 113.375, + "completions/min_terminated_length": 113.375, + "epoch": 0.2231578947368421, + "grad_norm": 0.024067817255854607, + "learning_rate": 9.610056312565245e-07, + "loss": 0.0004, + "num_tokens": 51477636.0, + "reward": 2.026669681072235, + "reward_std": 0.1696237076412217, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.8671875, + "rewards/stop_prediction_reward/std": 0.09542626701295376, + "rewards/waypoint_pred_accuracy/mean": 0.07974108902908483, + "rewards/waypoint_pred_accuracy/std": 0.04533190939855558, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.75, + "completions/max_terminated_length": 420.75, + "completions/mean_length": 217.77734375, + "completions/mean_terminated_length": 217.77734375, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.22526315789473683, + "grad_norm": 0.013704587705433369, + "learning_rate": 9.596464075536963e-07, + "loss": 0.0003, + "num_tokens": 51951186.0, + "reward": 1.8652345538139343, + "reward_std": 0.09080793828513833, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.865234375, + "rewards/stop_prediction_reward/std": 0.0908065214753151, + "rewards/waypoint_pred_accuracy/mean": 9.960914946921718e-08, + "rewards/waypoint_pred_accuracy/std": 7.087302705995463e-07, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 403.875, + "completions/max_terminated_length": 403.875, + "completions/mean_length": 217.728515625, + "completions/mean_terminated_length": 218.15721321105957, + "completions/min_length": 100.25, + "completions/min_terminated_length": 116.625, + "epoch": 0.22736842105263158, + "grad_norm": 0.02499217353761196, + "learning_rate": 9.582650094861256e-07, + "loss": -0.0001, + "num_tokens": 52426055.0, + "reward": 1.8535159230232239, + "reward_std": 0.1513118724361675, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.857421875, + "rewards/stop_prediction_reward/std": 0.1338059287518263, + "rewards/waypoint_pred_accuracy/mean": 1.5656563736666032e-07, + "rewards/waypoint_pred_accuracy/std": 8.466679199507776e-07, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 217.58984375, + "completions/mean_terminated_length": 217.58984375, + "completions/min_length": 108.5, + "completions/min_terminated_length": 108.5, + "epoch": 0.2294736842105263, + "grad_norm": 0.018018925562500954, + "learning_rate": 9.568615118295798e-07, + "loss": 0.0002, + "num_tokens": 52898613.0, + "reward": 1.5074883997440338, + "reward_std": 0.08622953689905444, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.505859375, + "rewards/stop_prediction_reward/std": 0.07980126701295376, + "rewards/waypoint_pred_accuracy/mean": 0.0008145241825951426, + "rewards/waypoint_pred_accuracy/std": 0.003929519949304638, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.625, + "completions/max_terminated_length": 410.625, + "completions/mean_length": 220.09375, + "completions/mean_terminated_length": 220.09375, + "completions/min_length": 112.5, + "completions/min_terminated_length": 112.5, + "epoch": 0.23157894736842105, + "grad_norm": 0.026054121553897858, + "learning_rate": 9.554359905560885e-07, + "loss": 0.0009, + "num_tokens": 53371493.0, + "reward": 1.6272786408662796, + "reward_std": 0.1345605030655861, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.607421875, + "rewards/stop_prediction_reward/std": 0.10731646977365017, + "rewards/waypoint_pred_accuracy/mean": 0.009928377814540968, + "rewards/waypoint_pred_accuracy/std": 0.021277805810678764, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.5, + "completions/max_terminated_length": 451.5, + "completions/mean_length": 226.404296875, + "completions/mean_terminated_length": 226.404296875, + "completions/min_length": 104.75, + "completions/min_terminated_length": 104.75, + "epoch": 0.2336842105263158, + "grad_norm": 0.019862722605466843, + "learning_rate": 9.53988522829831e-07, + "loss": -0.0002, + "num_tokens": 53849844.0, + "reward": 1.7356369495391846, + "reward_std": 0.17475299397483468, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.6328125, + "rewards/stop_prediction_reward/std": 0.0929968785494566, + "rewards/waypoint_pred_accuracy/mean": 0.05238880167820055, + "rewards/waypoint_pred_accuracy/std": 0.03885504556402302, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.125, + "completions/max_terminated_length": 429.125, + "completions/mean_length": 228.794921875, + "completions/mean_terminated_length": 228.794921875, + "completions/min_length": 102.625, + "completions/min_terminated_length": 102.625, + "epoch": 0.23578947368421052, + "grad_norm": 0.013491553254425526, + "learning_rate": 9.52519187002958e-07, + "loss": -0.0013, + "num_tokens": 54326667.0, + "reward": 2.0176089107990265, + "reward_std": 0.09997917944565415, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.990234375, + "rewards/stop_prediction_reward/std": 0.06879601255059242, + "rewards/waypoint_pred_accuracy/mean": 0.013687264542047125, + "rewards/waypoint_pred_accuracy/std": 0.02499246856624292, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.25, + "completions/max_terminated_length": 419.25, + "completions/mean_length": 223.84765625, + "completions/mean_terminated_length": 223.84765625, + "completions/min_length": 116.125, + "completions/min_terminated_length": 116.125, + "epoch": 0.23789473684210527, + "grad_norm": 0.0, + "learning_rate": 9.510280626113524e-07, + "loss": 0.0002, + "num_tokens": 54804477.0, + "reward": 2.0416500568389893, + "reward_std": 0.15113097801804543, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.87109375, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.08527813665590386, + "rewards/waypoint_pred_accuracy/std": 0.06661079078944249, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 418.375, + "completions/max_terminated_length": 418.375, + "completions/mean_length": 230.408203125, + "completions/mean_terminated_length": 230.9030590057373, + "completions/min_length": 102.375, + "completions/min_terminated_length": 116.25, + "epoch": 0.24, + "grad_norm": 0.027175500988960266, + "learning_rate": 9.495152303703225e-07, + "loss": -0.0, + "num_tokens": 55282254.0, + "reward": 1.9152479320764542, + "reward_std": 0.15890819625928998, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.869140625, + "rewards/stop_prediction_reward/std": 0.08138803765177727, + "rewards/waypoint_pred_accuracy/mean": 0.024030234897509217, + "rewards/waypoint_pred_accuracy/std": 0.038782567949965596, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.5, + "completions/max_terminated_length": 379.5, + "completions/mean_length": 218.421875, + "completions/mean_terminated_length": 218.421875, + "completions/min_length": 108.625, + "completions/min_terminated_length": 108.625, + "epoch": 0.24210526315789474, + "grad_norm": 0.0, + "learning_rate": 9.479807721702337e-07, + "loss": 0.0009, + "num_tokens": 55753702.0, + "reward": 1.964790791273117, + "reward_std": 0.14700112864375114, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.869140625, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 0.04880167031660676, + "rewards/waypoint_pred_accuracy/std": 0.04630230367183685, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.75, + "completions/max_terminated_length": 418.75, + "completions/mean_length": 234.615234375, + "completions/mean_terminated_length": 234.615234375, + "completions/min_length": 107.25, + "completions/min_terminated_length": 107.25, + "epoch": 0.24421052631578946, + "grad_norm": 0.01868380233645439, + "learning_rate": 9.46424771072075e-07, + "loss": -0.0016, + "num_tokens": 56236001.0, + "reward": 1.7634397000074387, + "reward_std": 0.11784930247813463, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.744140625, + "rewards/stop_prediction_reward/std": 0.08138803765177727, + "rewards/waypoint_pred_accuracy/mean": 0.009649543033447117, + "rewards/waypoint_pred_accuracy/std": 0.01907464297255501, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.5, + "completions/max_terminated_length": 393.5, + "completions/mean_length": 218.193359375, + "completions/mean_terminated_length": 218.193359375, + "completions/min_length": 111.75, + "completions/min_terminated_length": 111.75, + "epoch": 0.2463157894736842, + "grad_norm": 0.013114881701767445, + "learning_rate": 9.448473113029633e-07, + "loss": 0.0009, + "num_tokens": 56707844.0, + "reward": 1.6348197907209396, + "reward_std": 0.15209808605868602, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.513671875, + "rewards/stop_prediction_reward/std": 0.08837713301181793, + "rewards/waypoint_pred_accuracy/mean": 0.0605739434017778, + "rewards/waypoint_pred_accuracy/std": 0.03603998847574985, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.625, + "completions/max_terminated_length": 385.625, + "completions/mean_length": 217.125, + "completions/mean_terminated_length": 217.125, + "completions/min_length": 115.125, + "completions/min_terminated_length": 115.125, + "epoch": 0.24842105263157896, + "grad_norm": 0.021751945838332176, + "learning_rate": 9.432484782515842e-07, + "loss": 0.0011, + "num_tokens": 57177540.0, + "reward": 1.8951680064201355, + "reward_std": 0.09731905919034034, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.87109375, + "rewards/stop_prediction_reward/std": 0.05317101255059242, + "rewards/waypoint_pred_accuracy/mean": 0.012037134467476562, + "rewards/waypoint_pred_accuracy/std": 0.022074746749979113, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 733.875, + "completions/max_terminated_length": 733.875, + "completions/mean_length": 230.4921875, + "completions/mean_terminated_length": 230.4921875, + "completions/min_length": 114.125, + "completions/min_terminated_length": 114.125, + "epoch": 0.2505263157894737, + "grad_norm": 0.017523573711514473, + "learning_rate": 9.416283584635699e-07, + "loss": 0.0073, + "num_tokens": 57653632.0, + "reward": 1.7819086909294128, + "reward_std": 0.13167815032647923, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.74609375, + "rewards/stop_prediction_reward/std": 0.0625, + "rewards/waypoint_pred_accuracy/mean": 0.018884045333834365, + "rewards/waypoint_pred_accuracy/std": 0.04270522284787148, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.75, + "completions/max_terminated_length": 383.75, + "completions/mean_length": 230.603515625, + "completions/mean_terminated_length": 230.603515625, + "completions/min_length": 109.25, + "completions/min_terminated_length": 109.25, + "epoch": 0.25263157894736843, + "grad_norm": 0.020878760144114494, + "learning_rate": 9.399870396368137e-07, + "loss": -0.0003, + "num_tokens": 58132533.0, + "reward": 1.75615593791008, + "reward_std": 0.08499195147633398, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.744140625, + "rewards/stop_prediction_reward/std": 0.03754601255059242, + "rewards/waypoint_pred_accuracy/mean": 0.006007667677934543, + "rewards/waypoint_pred_accuracy/std": 0.023722964530922208, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.75, + "completions/max_terminated_length": 393.75, + "completions/mean_length": 228.326171875, + "completions/mean_terminated_length": 228.326171875, + "completions/min_length": 110.875, + "completions/min_terminated_length": 110.875, + "epoch": 0.25473684210526315, + "grad_norm": 0.030050212517380714, + "learning_rate": 9.383246106167244e-07, + "loss": -0.0007, + "num_tokens": 58610652.0, + "reward": 2.005710780620575, + "reward_std": 0.1616469284220443, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.8671875, + "rewards/stop_prediction_reward/std": 0.05317101255059242, + "rewards/waypoint_pred_accuracy/mean": 0.06926164017690084, + "rewards/waypoint_pred_accuracy/std": 0.06174082592511354, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.75, + "completions/max_terminated_length": 411.75, + "completions/mean_length": 235.259765625, + "completions/mean_terminated_length": 235.259765625, + "completions/min_length": 119.75, + "completions/min_terminated_length": 119.75, + "epoch": 0.25684210526315787, + "grad_norm": 0.05214720964431763, + "learning_rate": 9.366411613914151e-07, + "loss": 0.0, + "num_tokens": 59091681.0, + "reward": 2.0128368139266968, + "reward_std": 0.041966003568632004, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.998046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.007394973285158812, + "rewards/waypoint_pred_accuracy/std": 0.013188560594699084, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.375, + "completions/max_terminated_length": 738.375, + "completions/mean_length": 229.73828125, + "completions/mean_terminated_length": 229.73828125, + "completions/min_length": 113.125, + "completions/min_terminated_length": 113.125, + "epoch": 0.25894736842105265, + "grad_norm": 0.020252572372555733, + "learning_rate": 9.349367830868338e-07, + "loss": 0.0065, + "num_tokens": 59571867.0, + "reward": 1.7482015490531921, + "reward_std": 0.07813655398786068, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.75390625, + "rewards/stop_prediction_reward/std": 0.042255254462361336, + "rewards/waypoint_pred_accuracy/mean": 7.734416431048885e-05, + "rewards/waypoint_pred_accuracy/std": 0.00014155825192574412, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.5, + "completions/max_terminated_length": 421.5, + "completions/mean_length": 232.515625, + "completions/mean_terminated_length": 232.515625, + "completions/min_length": 116.625, + "completions/min_terminated_length": 116.625, + "epoch": 0.26105263157894737, + "grad_norm": 0.012004735879600048, + "learning_rate": 9.332115679618299e-07, + "loss": -0.0002, + "num_tokens": 60051875.0, + "reward": 1.87109375, + "reward_std": 0.06475212238729, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.059467025101184845, + "rewards/waypoint_pred_accuracy/mean": 3.7687339174058405e-25, + "rewards/waypoint_pred_accuracy/std": 0.0, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.375, + "completions/max_terminated_length": 373.375, + "completions/mean_length": 221.771484375, + "completions/mean_terminated_length": 221.771484375, + "completions/min_length": 113.625, + "completions/min_terminated_length": 113.625, + "epoch": 0.2631578947368421, + "grad_norm": 0.01381534244865179, + "learning_rate": 9.3146560940316e-07, + "loss": -0.0001, + "num_tokens": 60525038.0, + "reward": 1.8031584918498993, + "reward_std": 0.08486801406252198, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.74609375, + "rewards/stop_prediction_reward/std": 0.021921012550592422, + "rewards/waypoint_pred_accuracy/mean": 0.02853236788253366, + "rewards/waypoint_pred_accuracy/std": 0.03149077049183591, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.125, + "completions/max_terminated_length": 379.125, + "completions/mean_length": 225.11328125, + "completions/mean_terminated_length": 225.11328125, + "completions/min_length": 106.375, + "completions/min_terminated_length": 106.375, + "epoch": 0.26526315789473687, + "grad_norm": 0.0, + "learning_rate": 9.296990019204335e-07, + "loss": 0.0003, + "num_tokens": 61001576.0, + "reward": 1.838148683309555, + "reward_std": 0.13138162437826395, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.1065743277722504, + "rewards/waypoint_pred_accuracy/std": 0.058355900342576206, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 596.5, + "completions/max_terminated_length": 596.5, + "completions/mean_length": 223.703125, + "completions/mean_terminated_length": 223.703125, + "completions/min_length": 122.375, + "completions/min_terminated_length": 122.375, + "epoch": 0.2673684210526316, + "grad_norm": 0.015371584333479404, + "learning_rate": 9.279118411409962e-07, + "loss": 0.0076, + "num_tokens": 61475152.0, + "reward": 2.0289103388786316, + "reward_std": 0.14714059105608612, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.986328125, + "rewards/stop_prediction_reward/std": 0.05234810337424278, + "rewards/waypoint_pred_accuracy/mean": 0.02226770008679324, + "rewards/waypoint_pred_accuracy/std": 0.03958419876477137, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 225.896484375, + "completions/mean_terminated_length": 225.896484375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.2694736842105263, + "grad_norm": 0.0, + "learning_rate": 9.261042238047539e-07, + "loss": 0.0003, + "num_tokens": 61948507.0, + "reward": 1.7672365009784698, + "reward_std": 0.08540481339514372, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07111826360536146, + "rewards/waypoint_pred_accuracy/std": 0.04270240558859675, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 229.087890625, + "completions/mean_terminated_length": 229.087890625, + "completions/min_length": 122.25, + "completions/min_terminated_length": 122.25, + "epoch": 0.27157894736842103, + "grad_norm": 0.0, + "learning_rate": 9.242762477589369e-07, + "loss": -0.0008, + "num_tokens": 62425864.0, + "reward": 1.7555950731039047, + "reward_std": 0.150370123796165, + "rewards/format_reward_embodied/mean": 0.99609375, + "rewards/format_reward_embodied/std": 0.03125, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.06725065042865234, + "rewards/waypoint_pred_accuracy/std": 0.060643474211165796, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 377.25, + "completions/max_terminated_length": 377.25, + "completions/mean_length": 218.873046875, + "completions/mean_terminated_length": 219.36554527282715, + "completions/min_length": 95.875, + "completions/min_terminated_length": 110.5, + "epoch": 0.2736842105263158, + "grad_norm": 0.015500541776418686, + "learning_rate": 9.224280119528013e-07, + "loss": -0.0006, + "num_tokens": 62900679.0, + "reward": 1.86328125, + "reward_std": 0.08442101255059242, + "rewards/format_reward_embodied/mean": 0.994140625, + "rewards/format_reward_embodied/std": 0.046875, + "rewards/stop_prediction_reward/mean": 0.869140625, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 5.195248799303594e-14, + "rewards/waypoint_pred_accuracy/std": 3.9519006556770764e-13, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.375, + "completions/max_terminated_length": 389.375, + "completions/mean_length": 223.375, + "completions/mean_terminated_length": 223.375, + "completions/min_length": 116.375, + "completions/min_terminated_length": 116.375, + "epoch": 0.27578947368421053, + "grad_norm": 0.0, + "learning_rate": 9.205596164322753e-07, + "loss": 0.0005, + "num_tokens": 63375047.0, + "reward": 1.8198472261428833, + "reward_std": 0.06600932776927948, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.74609375, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.03687673434615135, + "rewards/waypoint_pred_accuracy/std": 0.023463066667318344, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.875, + "completions/max_terminated_length": 386.875, + "completions/mean_length": 223.994140625, + "completions/mean_terminated_length": 223.994140625, + "completions/min_length": 106.125, + "completions/min_terminated_length": 106.125, + "epoch": 0.27789473684210525, + "grad_norm": 5.267659071250819e-05, + "learning_rate": 9.186711623345419e-07, + "loss": 0.0007, + "num_tokens": 63848068.0, + "reward": 1.8460404872894287, + "reward_std": 0.07753154253146377, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.04802026903570478, + "rewards/waypoint_pred_accuracy/std": 0.03876577728947339, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.625, + "completions/max_terminated_length": 370.625, + "completions/mean_length": 216.8359375, + "completions/mean_terminated_length": 216.8359375, + "completions/min_length": 113.25, + "completions/min_terminated_length": 113.25, + "epoch": 0.28, + "grad_norm": 0.019873064011335373, + "learning_rate": 9.167627518825651e-07, + "loss": -0.0003, + "num_tokens": 64319408.0, + "reward": 1.66434708237648, + "reward_std": 0.10717772238422185, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.619140625, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 0.022603242181673977, + "rewards/waypoint_pred_accuracy/std": 0.030156509747939708, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.75, + "completions/max_terminated_length": 394.75, + "completions/mean_length": 221.185546875, + "completions/mean_terminated_length": 221.185546875, + "completions/min_length": 105.75, + "completions/min_terminated_length": 105.75, + "epoch": 0.28210526315789475, + "grad_norm": 0.016748478636145592, + "learning_rate": 9.148344883795563e-07, + "loss": 0.0002, + "num_tokens": 64797519.0, + "reward": 1.6543543934822083, + "reward_std": 0.09143044333904982, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0625, + "rewards/waypoint_pred_accuracy/mean": 0.014677208887757254, + "rewards/waypoint_pred_accuracy/std": 0.02070214717129737, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.125, + "completions/max_terminated_length": 347.125, + "completions/mean_length": 207.6796875, + "completions/mean_terminated_length": 207.6796875, + "completions/min_length": 100.75, + "completions/min_terminated_length": 100.75, + "epoch": 0.28421052631578947, + "grad_norm": 0.018281355500221252, + "learning_rate": 9.128864762033824e-07, + "loss": 0.0009, + "num_tokens": 65264811.0, + "reward": 1.8661604225635529, + "reward_std": 0.13888043258339167, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.03754601255059242, + "rewards/waypoint_pred_accuracy/mean": 0.05905676480875123, + "rewards/waypoint_pred_accuracy/std": 0.053127349918608825, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.625, + "completions/max_terminated_length": 354.625, + "completions/mean_length": 213.609375, + "completions/mean_terminated_length": 213.609375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.2863157894736842, + "grad_norm": 0.01513614784926176, + "learning_rate": 9.10918820800916e-07, + "loss": -0.0, + "num_tokens": 65734563.0, + "reward": 1.7799546718597412, + "reward_std": 0.05648380851107504, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.751953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.014000790968564693, + "rewards/waypoint_pred_accuracy/std": 0.0204294033423741, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.5, + "completions/max_terminated_length": 359.5, + "completions/mean_length": 209.94140625, + "completions/mean_terminated_length": 209.94140625, + "completions/min_length": 112.625, + "completions/min_terminated_length": 112.625, + "epoch": 0.28842105263157897, + "grad_norm": 0.014532121829688549, + "learning_rate": 9.089316286823274e-07, + "loss": -0.0006, + "num_tokens": 66202821.0, + "reward": 1.8316712975502014, + "reward_std": 0.14127142806610715, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.042788804041257456, + "rewards/waypoint_pred_accuracy/std": 0.05501071766701504, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.75, + "completions/max_terminated_length": 384.75, + "completions/mean_length": 226.625, + "completions/mean_terminated_length": 226.625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2905263157894737, + "grad_norm": 0.013312633149325848, + "learning_rate": 9.069250074153191e-07, + "loss": -0.0001, + "num_tokens": 66681989.0, + "reward": 1.9901870042085648, + "reward_std": 0.08008617826271802, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.98828125, + "rewards/stop_prediction_reward/std": 0.07509202510118484, + "rewards/waypoint_pred_accuracy/mean": 0.0009528863083687822, + "rewards/waypoint_pred_accuracy/std": 0.007327620231080863, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.375, + "completions/max_terminated_length": 373.375, + "completions/mean_length": 206.775390625, + "completions/mean_terminated_length": 206.775390625, + "completions/min_length": 109.5, + "completions/min_terminated_length": 109.5, + "epoch": 0.2926315789473684, + "grad_norm": 0.0, + "learning_rate": 9.048990656193024e-07, + "loss": 0.0, + "num_tokens": 67151378.0, + "reward": 1.8710939586162567, + "reward_std": 0.03125098500925105, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.87109375, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 1.0124033455018937e-07, + "rewards/waypoint_pred_accuracy/std": 4.912120291946227e-07, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.25, + "completions/max_terminated_length": 365.25, + "completions/mean_length": 223.37109375, + "completions/mean_terminated_length": 223.37109375, + "completions/min_length": 115.875, + "completions/min_terminated_length": 115.875, + "epoch": 0.29473684210526313, + "grad_norm": 0.020622270181775093, + "learning_rate": 9.028539129595197e-07, + "loss": -0.0001, + "num_tokens": 67625936.0, + "reward": 1.9263398349285126, + "reward_std": 0.07740109786391258, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.025669913738965988, + "rewards/waypoint_pred_accuracy/std": 0.028465650044381622, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.75, + "completions/max_terminated_length": 378.75, + "completions/mean_length": 213.451171875, + "completions/mean_terminated_length": 213.451171875, + "completions/min_length": 107.875, + "completions/min_terminated_length": 107.875, + "epoch": 0.2968421052631579, + "grad_norm": 0.01621430739760399, + "learning_rate": 9.00789660141106e-07, + "loss": -0.0004, + "num_tokens": 68094391.0, + "reward": 1.8975248336791992, + "reward_std": 0.056784010463506895, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.012238975709265076, + "rewards/waypoint_pred_accuracy/std": 0.02289922521287524, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.125, + "completions/max_terminated_length": 366.125, + "completions/mean_length": 207.638671875, + "completions/mean_terminated_length": 207.638671875, + "completions/min_length": 111.25, + "completions/min_terminated_length": 111.25, + "epoch": 0.29894736842105263, + "grad_norm": 0.0012398953549563885, + "learning_rate": 8.987064189030983e-07, + "loss": -0.0, + "num_tokens": 68561854.0, + "reward": 1.809123456478119, + "reward_std": 0.05541337472914165, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.029561737023570345, + "rewards/waypoint_pred_accuracy/std": 0.027706685019552424, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.5, + "completions/max_terminated_length": 336.5, + "completions/mean_length": 201.529296875, + "completions/mean_terminated_length": 201.529296875, + "completions/min_length": 111.875, + "completions/min_terminated_length": 111.875, + "epoch": 0.30105263157894735, + "grad_norm": 0.0, + "learning_rate": 8.966043020123855e-07, + "loss": 0.0001, + "num_tokens": 69026509.0, + "reward": 2.1186273992061615, + "reward_std": 0.09686689289469541, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.994140625, + "rewards/stop_prediction_reward/std": 0.03754601255059242, + "rewards/waypoint_pred_accuracy/mean": 0.06224340945057809, + "rewards/waypoint_pred_accuracy/std": 0.029665624278586974, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.5, + "completions/max_terminated_length": 371.5, + "completions/mean_length": 212.501953125, + "completions/mean_terminated_length": 212.501953125, + "completions/min_length": 111.5, + "completions/min_terminated_length": 111.5, + "epoch": 0.3031578947368421, + "grad_norm": 0.0, + "learning_rate": 8.944834232576054e-07, + "loss": 0.0002, + "num_tokens": 69494414.0, + "reward": 2.0669292509555817, + "reward_std": 0.09019226813688874, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.99609375, + "rewards/stop_prediction_reward/std": 0.021921012550592422, + "rewards/waypoint_pred_accuracy/mean": 0.03541775744817477, + "rewards/waypoint_pred_accuracy/std": 0.03677688956680679, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.875, + "completions/max_terminated_length": 377.875, + "completions/mean_length": 214.53125, + "completions/mean_terminated_length": 214.53125, + "completions/min_length": 111.875, + "completions/min_terminated_length": 111.875, + "epoch": 0.30526315789473685, + "grad_norm": 0.0, + "learning_rate": 8.923438974429849e-07, + "loss": -0.0, + "num_tokens": 69966238.0, + "reward": 1.500933289527893, + "reward_std": 0.022255118004977703, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.498046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.0014432001626119018, + "rewards/waypoint_pred_accuracy/std": 0.0033150608651340008, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.375, + "completions/max_terminated_length": 341.375, + "completions/mean_length": 207.609375, + "completions/mean_terminated_length": 207.609375, + "completions/min_length": 108.625, + "completions/min_terminated_length": 108.625, + "epoch": 0.30736842105263157, + "grad_norm": 0.01606505736708641, + "learning_rate": 8.901858403821253e-07, + "loss": -0.0003, + "num_tokens": 70430934.0, + "reward": 1.9067281186580658, + "reward_std": 0.11152400076389313, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.03754601255059242, + "rewards/waypoint_pred_accuracy/mean": 0.08031718447636607, + "rewards/waypoint_pred_accuracy/std": 0.041061242358370054, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.125, + "completions/max_terminated_length": 339.125, + "completions/mean_length": 200.755859375, + "completions/mean_terminated_length": 200.755859375, + "completions/min_length": 108.125, + "completions/min_terminated_length": 108.125, + "epoch": 0.3094736842105263, + "grad_norm": 0.022619424387812614, + "learning_rate": 8.880093688917338e-07, + "loss": 0.0006, + "num_tokens": 70895897.0, + "reward": 1.979694738984108, + "reward_std": 0.10517071333015338, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.052347380133141996, + "rewards/waypoint_pred_accuracy/std": 0.05258536203473341, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.25, + "completions/max_terminated_length": 351.25, + "completions/mean_length": 209.3203125, + "completions/mean_terminated_length": 209.3203125, + "completions/min_length": 106.375, + "completions/min_terminated_length": 106.375, + "epoch": 0.31157894736842107, + "grad_norm": 0.015080302953720093, + "learning_rate": 8.858146007853e-07, + "loss": 0.0007, + "num_tokens": 71362301.0, + "reward": 1.7541356086730957, + "reward_std": 0.0744232046417892, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 0.003044351096450817, + "rewards/waypoint_pred_accuracy/std": 0.014287834603145175, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.25, + "completions/max_terminated_length": 339.25, + "completions/mean_length": 204.564453125, + "completions/mean_terminated_length": 204.564453125, + "completions/min_length": 109.5, + "completions/min_terminated_length": 109.5, + "epoch": 0.3136842105263158, + "grad_norm": 0.014518975280225277, + "learning_rate": 8.836016548667178e-07, + "loss": 0.0, + "num_tokens": 71831198.0, + "reward": 1.7602195739746094, + "reward_std": 0.04601290519349277, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.751953125, + "rewards/stop_prediction_reward/std": 0.03754601255059242, + "rewards/waypoint_pred_accuracy/mean": 0.004133254632145133, + "rewards/waypoint_pred_accuracy/std": 0.004251232765162753, + "step": 149 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.0, + "learning_rate": 8.813706509238558e-07, + "loss": -0.0002, + "step": 150 + }, + { + "epoch": 0.3157894736842105, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 410.41, + "eval_completions/max_terminated_length": 410.41, + "eval_completions/mean_length": 207.72815979003906, + "eval_completions/mean_terminated_length": 207.72815979003906, + "eval_completions/min_length": 111.67, + "eval_completions/min_terminated_length": 111.67, + "eval_loss": 0.0016332893865182996, + "eval_num_tokens": 72298062.0, + "eval_reward": 1.8651788556575775, + "eval_reward_std": 0.10482742591684201, + "eval_rewards/format_reward_embodied/mean": 0.9990625, + "eval_rewards/format_reward_embodied/std": 0.0075, + "eval_rewards/stop_prediction_reward/mean": 0.76859375, + "eval_rewards/stop_prediction_reward/std": 0.020503681004047394, + "eval_rewards/waypoint_pred_accuracy/mean": 0.04876130852479674, + "eval_rewards/waypoint_pred_accuracy/std": 0.0403332443083783, + "eval_runtime": 1073.7169, + "eval_samples_per_second": 0.093, + "eval_steps_per_second": 0.002, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.75, + "completions/max_terminated_length": 359.75, + "completions/mean_length": 208.6748046875, + "completions/mean_terminated_length": 208.6748046875, + "completions/min_length": 112.5, + "completions/min_terminated_length": 112.5, + "epoch": 0.3178947368421053, + "grad_norm": 0.013742033392190933, + "learning_rate": 8.791217097220724e-07, + "loss": -0.0001, + "num_tokens": 72761681.0, + "reward": 1.875585325062275, + "reward_std": 0.023173605810638875, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.8720703125, + "rewards/stop_prediction_reward/std": 0.01877300627529621, + "rewards/waypoint_pred_accuracy/mean": 0.0017575172029685837, + "rewards/waypoint_pred_accuracy/std": 0.0022003025424544072, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.375, + "completions/max_terminated_length": 350.375, + "completions/mean_length": 210.005859375, + "completions/mean_terminated_length": 210.005859375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.32, + "grad_norm": 0.010572736151516438, + "learning_rate": 8.768549529976783e-07, + "loss": 0.0006, + "num_tokens": 73228180.0, + "reward": 1.9192677438259125, + "reward_std": 0.049104438461654354, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.022133867223146808, + "rewards/waypoint_pred_accuracy/std": 0.02455222301614413, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 197.775390625, + "completions/mean_terminated_length": 197.775390625, + "completions/min_length": 107.625, + "completions/min_terminated_length": 107.625, + "epoch": 0.32210526315789473, + "grad_norm": 0.01287839561700821, + "learning_rate": 8.74570503451348e-07, + "loss": 0.0003, + "num_tokens": 73691105.0, + "reward": 1.9361660480499268, + "reward_std": 0.10891422609623902, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.87109375, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.03253614324701938, + "rewards/waypoint_pred_accuracy/std": 0.03883211300195525, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 202.033203125, + "completions/mean_terminated_length": 202.033203125, + "completions/min_length": 115.75, + "completions/min_terminated_length": 115.75, + "epoch": 0.32421052631578945, + "grad_norm": 0.018562331795692444, + "learning_rate": 8.72268484741477e-07, + "loss": -0.0005, + "num_tokens": 74156146.0, + "reward": 1.8251054883003235, + "reward_std": 0.1710243321698499, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.626953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.1000527671312958, + "rewards/waypoint_pred_accuracy/std": 0.07437628054339515, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 204.990234375, + "completions/mean_terminated_length": 204.990234375, + "completions/min_length": 115.75, + "completions/min_terminated_length": 115.75, + "epoch": 0.3263157894736842, + "grad_norm": 0.018523743376135826, + "learning_rate": 8.699490214774881e-07, + "loss": 0.0002, + "num_tokens": 74622701.0, + "reward": 1.7455661296844482, + "reward_std": 0.05205453363100787, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.744140625, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 0.0007127649293749982, + "rewards/waypoint_pred_accuracy/std": 0.0025897676093791233, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.75, + "completions/max_terminated_length": 333.75, + "completions/mean_length": 193.486328125, + "completions/mean_terminated_length": 193.486328125, + "completions/min_length": 102.625, + "completions/min_terminated_length": 102.625, + "epoch": 0.32842105263157895, + "grad_norm": 0.0, + "learning_rate": 8.676122392130872e-07, + "loss": 0.0, + "num_tokens": 75081254.0, + "reward": 1.623046875, + "reward_std": 0.015625, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.623046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 7.546804675536426e-25, + "rewards/waypoint_pred_accuracy/std": 4.67924008313652e-24, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.25, + "completions/max_terminated_length": 352.25, + "completions/mean_length": 199.650390625, + "completions/mean_terminated_length": 199.650390625, + "completions/min_length": 109.75, + "completions/min_terminated_length": 109.75, + "epoch": 0.33052631578947367, + "grad_norm": 0.0, + "learning_rate": 8.652582644394657e-07, + "loss": 0.001, + "num_tokens": 75543923.0, + "reward": 1.763193666934967, + "reward_std": 0.043326430561137386, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.007573404463160793, + "rewards/waypoint_pred_accuracy/std": 0.013850717227414266, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.375, + "completions/max_terminated_length": 351.375, + "completions/mean_length": 211.6484375, + "completions/mean_terminated_length": 211.6484375, + "completions/min_length": 105.125, + "completions/min_terminated_length": 105.125, + "epoch": 0.33263157894736844, + "grad_norm": 0.015552366152405739, + "learning_rate": 8.628872245784545e-07, + "loss": 0.0007, + "num_tokens": 76014975.0, + "reward": 1.7972655892372131, + "reward_std": 0.11691518849693239, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 0.025585949169908424, + "rewards/waypoint_pred_accuracy/std": 0.027767218511144182, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 200.451171875, + "completions/mean_terminated_length": 200.451171875, + "completions/min_length": 113.625, + "completions/min_terminated_length": 113.625, + "epoch": 0.33473684210526317, + "grad_norm": 0.012406791560351849, + "learning_rate": 8.60499247975626e-07, + "loss": 0.0001, + "num_tokens": 76479526.0, + "reward": 1.692464992403984, + "reward_std": 0.10753844678401947, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.626953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.0327559362485772, + "rewards/waypoint_pred_accuracy/std": 0.0465778008219786, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.875, + "completions/max_terminated_length": 320.875, + "completions/mean_length": 193.51171875, + "completions/mean_terminated_length": 193.51171875, + "completions/min_length": 102.875, + "completions/min_terminated_length": 102.875, + "epoch": 0.3368421052631579, + "grad_norm": 0.030549675226211548, + "learning_rate": 8.58094463893347e-07, + "loss": 0.0002, + "num_tokens": 76938732.0, + "reward": 1.9054777026176453, + "reward_std": 0.05647589443033718, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.016215412090739154, + "rewards/waypoint_pred_accuracy/std": 0.02042544638196753, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.125, + "completions/max_terminated_length": 334.125, + "completions/mean_length": 199.755859375, + "completions/mean_terminated_length": 199.755859375, + "completions/min_length": 113.875, + "completions/min_terminated_length": 113.875, + "epoch": 0.3389473684210526, + "grad_norm": 0.014368158765137196, + "learning_rate": 8.556730025037819e-07, + "loss": -0.0002, + "num_tokens": 77399855.0, + "reward": 1.9117814898490906, + "reward_std": 0.06853678584000633, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.020343898673879646, + "rewards/waypoint_pred_accuracy/std": 0.018643394690372794, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.125, + "completions/max_terminated_length": 311.125, + "completions/mean_length": 194.9609375, + "completions/mean_terminated_length": 194.9609375, + "completions/min_length": 111.875, + "completions/min_terminated_length": 111.875, + "epoch": 0.3410526315789474, + "grad_norm": 0.0, + "learning_rate": 8.532349948818453e-07, + "loss": 0.0001, + "num_tokens": 77858715.0, + "reward": 1.5617362409830093, + "reward_std": 0.09266455079254143, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.030868121356284917, + "rewards/waypoint_pred_accuracy/std": 0.04633227763988046, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.125, + "completions/max_terminated_length": 359.125, + "completions/mean_length": 195.875, + "completions/mean_terminated_length": 195.875, + "completions/min_length": 114.25, + "completions/min_terminated_length": 114.25, + "epoch": 0.3431578947368421, + "grad_norm": 0.0, + "learning_rate": 8.507805729981081e-07, + "loss": -0.0003, + "num_tokens": 78320411.0, + "reward": 1.5313882529735565, + "reward_std": 0.13795647164806724, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.375, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07819415128325216, + "rewards/waypoint_pred_accuracy/std": 0.06897824443884724, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.125, + "completions/max_terminated_length": 356.125, + "completions/mean_length": 200.798828125, + "completions/mean_terminated_length": 200.798828125, + "completions/min_length": 112.75, + "completions/min_terminated_length": 112.75, + "epoch": 0.3452631578947368, + "grad_norm": 0.0, + "learning_rate": 8.483098697116535e-07, + "loss": -0.0002, + "num_tokens": 78785204.0, + "reward": 1.8663674592971802, + "reward_std": 0.0740682063976692, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.05818372880702327, + "rewards/waypoint_pred_accuracy/std": 0.03703410336356683, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 184.767578125, + "completions/mean_terminated_length": 184.767578125, + "completions/min_length": 108.125, + "completions/min_terminated_length": 108.125, + "epoch": 0.3473684210526316, + "grad_norm": 0.014639639295637608, + "learning_rate": 8.45823018762885e-07, + "loss": 0.0004, + "num_tokens": 79241469.0, + "reward": 1.5147821009159088, + "reward_std": 0.028108830246765137, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.007391049890770773, + "rewards/waypoint_pred_accuracy/std": 0.01405441654196693, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.125, + "completions/max_terminated_length": 327.125, + "completions/mean_length": 188.45703125, + "completions/mean_terminated_length": 188.45703125, + "completions/min_length": 108.75, + "completions/min_terminated_length": 108.75, + "epoch": 0.3494736842105263, + "grad_norm": 0.01364043541252613, + "learning_rate": 8.43320154766287e-07, + "loss": 0.0, + "num_tokens": 79696807.0, + "reward": 1.9586426615715027, + "reward_std": 0.09136595235713685, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.04182136098216205, + "rewards/waypoint_pred_accuracy/std": 0.04568298065798615, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.375, + "completions/max_terminated_length": 332.375, + "completions/mean_length": 192.078125, + "completions/mean_terminated_length": 192.078125, + "completions/min_length": 112.125, + "completions/min_terminated_length": 112.125, + "epoch": 0.35157894736842105, + "grad_norm": 0.0016441630432382226, + "learning_rate": 8.408014132031385e-07, + "loss": -0.0002, + "num_tokens": 80154511.0, + "reward": 1.753662645816803, + "reward_std": 0.020446277248106215, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0018313333685417293, + "rewards/waypoint_pred_accuracy/std": 0.010223136260532173, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.625, + "completions/max_terminated_length": 350.625, + "completions/mean_length": 195.697265625, + "completions/mean_terminated_length": 195.697265625, + "completions/min_length": 111.25, + "completions/min_terminated_length": 111.25, + "epoch": 0.35368421052631577, + "grad_norm": 0.0191575326025486, + "learning_rate": 8.382669304141789e-07, + "loss": 0.0002, + "num_tokens": 80616180.0, + "reward": 1.7313858270645142, + "reward_std": 0.07452307981657214, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.05319291944942961, + "rewards/waypoint_pred_accuracy/std": 0.03726154523974404, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.125, + "completions/max_terminated_length": 339.125, + "completions/mean_length": 199.1796875, + "completions/mean_terminated_length": 199.1796875, + "completions/min_length": 113.125, + "completions/min_terminated_length": 113.125, + "epoch": 0.35578947368421054, + "grad_norm": 0.0011054413625970483, + "learning_rate": 8.35716843592228e-07, + "loss": -0.0002, + "num_tokens": 81079184.0, + "reward": 1.877403125166893, + "reward_std": 0.014616520323585291, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0012015849925077877, + "rewards/waypoint_pred_accuracy/std": 0.007308262612468781, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.625, + "completions/max_terminated_length": 345.625, + "completions/mean_length": 194.3984375, + "completions/mean_terminated_length": 194.3984375, + "completions/min_length": 118.875, + "completions/min_terminated_length": 118.875, + "epoch": 0.35789473684210527, + "grad_norm": 0.04289071634411812, + "learning_rate": 8.331512907747596e-07, + "loss": -0.0002, + "num_tokens": 81539356.0, + "reward": 1.8431425243616104, + "reward_std": 0.17017098766780236, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.10907126411490253, + "rewards/waypoint_pred_accuracy/std": 0.0850854907983205, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.125, + "completions/max_terminated_length": 312.125, + "completions/mean_length": 189.50390625, + "completions/mean_terminated_length": 189.50390625, + "completions/min_length": 114.375, + "completions/min_terminated_length": 114.375, + "epoch": 0.36, + "grad_norm": 0.0, + "learning_rate": 8.305704108364301e-07, + "loss": 0.0003, + "num_tokens": 81995934.0, + "reward": 1.8737435936927795, + "reward_std": 0.02003212797418996, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.0003483741428915721, + "rewards/waypoint_pred_accuracy/std": 0.0022035639689050868, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.625, + "completions/max_terminated_length": 353.625, + "completions/mean_length": 196.31640625, + "completions/mean_terminated_length": 196.31640625, + "completions/min_length": 111.5, + "completions/min_terminated_length": 111.5, + "epoch": 0.36210526315789476, + "grad_norm": 0.014004958793520927, + "learning_rate": 8.279743434815599e-07, + "loss": 0.0, + "num_tokens": 82457920.0, + "reward": 1.6893496811389923, + "reward_std": 0.07697248342446983, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.03217484994092959, + "rewards/waypoint_pred_accuracy/std": 0.038486242177896396, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.375, + "completions/max_terminated_length": 310.375, + "completions/mean_length": 184.642578125, + "completions/mean_terminated_length": 184.642578125, + "completions/min_length": 110.875, + "completions/min_terminated_length": 110.875, + "epoch": 0.3642105263157895, + "grad_norm": 0.013178675435483456, + "learning_rate": 8.253632292365726e-07, + "loss": 0.0, + "num_tokens": 82915145.0, + "reward": 2.0115868896245956, + "reward_std": 0.08305720053613186, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.06829345226288144, + "rewards/waypoint_pred_accuracy/std": 0.041528596542799806, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.125, + "completions/max_terminated_length": 318.125, + "completions/mean_length": 190.236328125, + "completions/mean_terminated_length": 190.236328125, + "completions/min_length": 112.125, + "completions/min_terminated_length": 112.125, + "epoch": 0.3663157894736842, + "grad_norm": 0.02363615669310093, + "learning_rate": 8.227372094423864e-07, + "loss": -0.0001, + "num_tokens": 83374914.0, + "reward": 2.0038606971502304, + "reward_std": 0.14384562149643898, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0644303746521473, + "rewards/waypoint_pred_accuracy/std": 0.07192281540483236, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.125, + "completions/max_terminated_length": 332.125, + "completions/mean_length": 180.150390625, + "completions/mean_terminated_length": 180.150390625, + "completions/min_length": 106.875, + "completions/min_terminated_length": 106.875, + "epoch": 0.3684210526315789, + "grad_norm": 0.0, + "learning_rate": 8.200964262467656e-07, + "loss": 0.0001, + "num_tokens": 83825615.0, + "reward": 1.4097924530506134, + "reward_std": 0.04181510955095291, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.375, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0173962339758873, + "rewards/waypoint_pred_accuracy/std": 0.020907556638121605, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.625, + "completions/max_terminated_length": 307.625, + "completions/mean_length": 184.662109375, + "completions/mean_terminated_length": 184.662109375, + "completions/min_length": 108.25, + "completions/min_terminated_length": 108.25, + "epoch": 0.3705263157894737, + "grad_norm": 0.01097350474447012, + "learning_rate": 8.174410225966239e-07, + "loss": 0.0001, + "num_tokens": 84280098.0, + "reward": 2.0639708340168, + "reward_std": 0.17662093978196225, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.09448541931482657, + "rewards/waypoint_pred_accuracy/std": 0.0883104762174689, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.375, + "completions/max_terminated_length": 311.375, + "completions/mean_length": 179.3046875, + "completions/mean_terminated_length": 179.3046875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.3726315789473684, + "grad_norm": 0.0, + "learning_rate": 8.147711422302881e-07, + "loss": 0.0, + "num_tokens": 84732926.0, + "reward": 1.5250985324382782, + "reward_std": 0.028973333232215737, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.01254925754006564, + "rewards/waypoint_pred_accuracy/std": 0.014486670532335214, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.625, + "completions/max_terminated_length": 315.625, + "completions/mean_length": 182.26171875, + "completions/mean_terminated_length": 182.26171875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.37473684210526315, + "grad_norm": 0.010854351334273815, + "learning_rate": 8.120869296697162e-07, + "loss": -0.0, + "num_tokens": 85187204.0, + "reward": 1.7528592348098755, + "reward_std": 0.01418565196615873, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0014296227405005812, + "rewards/waypoint_pred_accuracy/std": 0.007092827672981208, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 185.01953125, + "completions/mean_terminated_length": 185.01953125, + "completions/min_length": 105.25, + "completions/min_terminated_length": 105.25, + "epoch": 0.37684210526315787, + "grad_norm": 0.022924024611711502, + "learning_rate": 8.093885302126754e-07, + "loss": 0.0073, + "num_tokens": 85641038.0, + "reward": 1.7433076351881027, + "reward_std": 0.06334595192311099, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.744140625, + "rewards/stop_prediction_reward/std": 0.046875, + "rewards/waypoint_pred_accuracy/mean": 0.0005600605727522634, + "rewards/waypoint_pred_accuracy/std": 0.003230394551792415, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 179.8359375, + "completions/mean_terminated_length": 179.8359375, + "completions/min_length": 108.625, + "completions/min_terminated_length": 108.625, + "epoch": 0.37894736842105264, + "grad_norm": 0.0, + "learning_rate": 8.06676089924877e-07, + "loss": 0.0004, + "num_tokens": 86092666.0, + "reward": 1.8267612159252167, + "reward_std": 0.1466955652579145, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.10088061677343774, + "rewards/waypoint_pred_accuracy/std": 0.0733477777656617, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.75, + "completions/max_terminated_length": 298.75, + "completions/mean_length": 184.97265625, + "completions/mean_terminated_length": 184.97265625, + "completions/min_length": 111.625, + "completions/min_terminated_length": 111.625, + "epoch": 0.38105263157894737, + "grad_norm": 0.0, + "learning_rate": 8.03949755632069e-07, + "loss": -0.0004, + "num_tokens": 86546476.0, + "reward": 1.7814702987670898, + "reward_std": 0.08937373897060752, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07823515147902071, + "rewards/waypoint_pred_accuracy/std": 0.04468687262851745, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.625, + "completions/max_terminated_length": 309.625, + "completions/mean_length": 179.16015625, + "completions/mean_terminated_length": 179.16015625, + "completions/min_length": 105.625, + "completions/min_terminated_length": 105.625, + "epoch": 0.3831578947368421, + "grad_norm": 0.02093261480331421, + "learning_rate": 8.01209674912089e-07, + "loss": 0.0, + "num_tokens": 86998398.0, + "reward": 1.9096409678459167, + "reward_std": 0.0670090508647263, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07982050045393407, + "rewards/waypoint_pred_accuracy/std": 0.033504527527838945, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.375, + "completions/max_terminated_length": 286.375, + "completions/mean_length": 171.54296875, + "completions/mean_terminated_length": 171.54296875, + "completions/min_length": 102.875, + "completions/min_terminated_length": 102.875, + "epoch": 0.38526315789473686, + "grad_norm": 0.01432411465793848, + "learning_rate": 7.984559960868759e-07, + "loss": -0.0004, + "num_tokens": 87445908.0, + "reward": 1.6769181191921234, + "reward_std": 0.10013374220579863, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.02595905796625718, + "rewards/waypoint_pred_accuracy/std": 0.05006687459543879, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.375, + "completions/max_terminated_length": 317.375, + "completions/mean_length": 181.97265625, + "completions/mean_terminated_length": 181.97265625, + "completions/min_length": 106.875, + "completions/min_terminated_length": 106.875, + "epoch": 0.3873684210526316, + "grad_norm": 0.0, + "learning_rate": 7.956888682144403e-07, + "loss": 0.0001, + "num_tokens": 87901126.0, + "reward": 2.113648146390915, + "reward_std": 0.08789801027160138, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 1.0, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.05682408035500002, + "rewards/waypoint_pred_accuracy/std": 0.04394900894840466, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.375, + "completions/max_terminated_length": 297.375, + "completions/mean_length": 170.03125, + "completions/mean_terminated_length": 170.03125, + "completions/min_length": 107.25, + "completions/min_terminated_length": 107.25, + "epoch": 0.3894736842105263, + "grad_norm": 0.00041478071943856776, + "learning_rate": 7.929084410807964e-07, + "loss": -0.0, + "num_tokens": 88348630.0, + "reward": 1.8102026730775833, + "reward_std": 0.08258083421748097, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.032054444891858935, + "rewards/waypoint_pred_accuracy/std": 0.025665418093367975, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.125, + "completions/max_terminated_length": 285.125, + "completions/mean_length": 167.068359375, + "completions/mean_terminated_length": 167.068359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.391578947368421, + "grad_norm": 0.010830595158040524, + "learning_rate": 7.90114865191855e-07, + "loss": -0.0002, + "num_tokens": 88793017.0, + "reward": 1.899033010005951, + "reward_std": 0.05657581372270215, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.012016504072278167, + "rewards/waypoint_pred_accuracy/std": 0.02828790664943881, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.625, + "completions/max_terminated_length": 270.625, + "completions/mean_length": 170.416015625, + "completions/mean_terminated_length": 170.416015625, + "completions/min_length": 110.5, + "completions/min_terminated_length": 110.5, + "epoch": 0.3936842105263158, + "grad_norm": 0.02063567005097866, + "learning_rate": 7.873082917652743e-07, + "loss": -0.0001, + "num_tokens": 89241230.0, + "reward": 1.6706158965826035, + "reward_std": 0.083322549238801, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.023784521038484087, + "rewards/waypoint_pred_accuracy/std": 0.03384877370171229, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.25, + "completions/max_terminated_length": 305.25, + "completions/mean_length": 181.23046875, + "completions/mean_terminated_length": 181.23046875, + "completions/min_length": 114.5, + "completions/min_terminated_length": 114.5, + "epoch": 0.3957894736842105, + "grad_norm": 0.0, + "learning_rate": 7.844888727222768e-07, + "loss": 0.0004, + "num_tokens": 89692484.0, + "reward": 1.630469560623169, + "reward_std": 0.031483914237469435, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0037113359924784914, + "rewards/waypoint_pred_accuracy/std": 0.00797604240300253, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.875, + "completions/max_terminated_length": 331.875, + "completions/mean_length": 173.87109375, + "completions/mean_terminated_length": 173.87109375, + "completions/min_length": 113.75, + "completions/min_terminated_length": 113.75, + "epoch": 0.39789473684210525, + "grad_norm": 0.042730070650577545, + "learning_rate": 7.816567606794239e-07, + "loss": -0.0002, + "num_tokens": 90142082.0, + "reward": 1.7876133099198341, + "reward_std": 0.06541969033423811, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0197832117555663, + "rewards/waypoint_pred_accuracy/std": 0.02541783277411014, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.125, + "completions/max_terminated_length": 302.125, + "completions/mean_length": 182.103515625, + "completions/mean_terminated_length": 182.103515625, + "completions/min_length": 115.25, + "completions/min_terminated_length": 115.25, + "epoch": 0.4, + "grad_norm": 0.018826643005013466, + "learning_rate": 7.788121089403557e-07, + "loss": 0.0001, + "num_tokens": 90596087.0, + "reward": 1.805692046880722, + "reward_std": 0.06880141280907992, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.027846033899431428, + "rewards/waypoint_pred_accuracy/std": 0.03440070046775373, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.875, + "completions/max_terminated_length": 267.875, + "completions/mean_length": 175.751953125, + "completions/mean_terminated_length": 175.751953125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.40210526315789474, + "grad_norm": 0.013081498444080353, + "learning_rate": 7.759550714874924e-07, + "loss": -0.0004, + "num_tokens": 91046072.0, + "reward": 2.0215499103069305, + "reward_std": 0.13955920189619064, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.873046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.07522808883824439, + "rewards/waypoint_pred_accuracy/std": 0.05415468077226393, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.5, + "completions/max_terminated_length": 278.5, + "completions/mean_length": 169.3125, + "completions/mean_terminated_length": 169.3125, + "completions/min_length": 114.75, + "completions/min_terminated_length": 114.75, + "epoch": 0.40421052631578946, + "grad_norm": 0.0, + "learning_rate": 7.730858029736989e-07, + "loss": 0.0, + "num_tokens": 91491928.0, + "reward": 1.75, + "reward_std": 0.0, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 1.474326771777612e-10, + "rewards/waypoint_pred_accuracy/std": 1.1786529530155576e-09, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.875, + "completions/max_terminated_length": 306.875, + "completions/mean_length": 177.765625, + "completions/mean_terminated_length": 177.765625, + "completions/min_length": 114.875, + "completions/min_terminated_length": 114.875, + "epoch": 0.4063157894736842, + "grad_norm": 0.0, + "learning_rate": 7.702044587139137e-07, + "loss": 0.0002, + "num_tokens": 91941856.0, + "reward": 1.7487657219171524, + "reward_std": 0.01981517393141985, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.0003594244599299801, + "rewards/waypoint_pred_accuracy/std": 0.002140052256436602, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 303.5, + "completions/max_terminated_length": 303.5, + "completions/mean_length": 171.814453125, + "completions/mean_terminated_length": 172.19029235839844, + "completions/min_length": 92.375, + "completions/min_terminated_length": 106.75, + "epoch": 0.40842105263157896, + "grad_norm": 0.016274407505989075, + "learning_rate": 7.673111946767413e-07, + "loss": -0.0001, + "num_tokens": 92389185.0, + "reward": 2.0959380865097046, + "reward_std": 0.12629193731117994, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.11046904484828701, + "rewards/waypoint_pred_accuracy/std": 0.06314596923766658, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.125, + "completions/max_terminated_length": 290.125, + "completions/mean_length": 180.162109375, + "completions/mean_terminated_length": 180.162109375, + "completions/min_length": 107.5, + "completions/min_terminated_length": 107.5, + "epoch": 0.4105263157894737, + "grad_norm": 0.020258145406842232, + "learning_rate": 7.644061674760101e-07, + "loss": 0.0001, + "num_tokens": 92840532.0, + "reward": 2.294894278049469, + "reward_std": 0.23648597935971338, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.998046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.1494002838226665, + "rewards/waypoint_pred_accuracy/std": 0.11227664479537411, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.75, + "completions/max_terminated_length": 299.75, + "completions/mean_length": 169.263671875, + "completions/mean_terminated_length": 169.263671875, + "completions/min_length": 113.25, + "completions/min_terminated_length": 113.25, + "epoch": 0.4126315789473684, + "grad_norm": 0.04483957961201668, + "learning_rate": 7.61489534362294e-07, + "loss": -0.0002, + "num_tokens": 93288091.0, + "reward": 1.5267165899276733, + "reward_std": 0.05776224182045553, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.501953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.012381742581055732, + "rewards/waypoint_pred_accuracy/std": 0.02106862150685629, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.125, + "completions/max_terminated_length": 264.125, + "completions/mean_length": 165.439453125, + "completions/mean_terminated_length": 165.439453125, + "completions/min_length": 107.625, + "completions/min_terminated_length": 107.625, + "epoch": 0.4147368421052632, + "grad_norm": 0.0, + "learning_rate": 7.585614532144007e-07, + "loss": 0.0002, + "num_tokens": 93733372.0, + "reward": 1.80105559527874, + "reward_std": 0.17875106693827547, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.626953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.08705123437539442, + "rewards/waypoint_pred_accuracy/std": 0.08156303651776398, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.5, + "completions/max_terminated_length": 328.5, + "completions/mean_length": 170.4609375, + "completions/mean_terminated_length": 170.4609375, + "completions/min_length": 103.125, + "completions/min_terminated_length": 103.125, + "epoch": 0.4168421052631579, + "grad_norm": 0.009551014751195908, + "learning_rate": 7.556220825308261e-07, + "loss": 0.0002, + "num_tokens": 94180968.0, + "reward": 1.8914762139320374, + "reward_std": 0.05954795209981967, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.009214683991414209, + "rewards/waypoint_pred_accuracy/std": 0.021961479235898466, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.125, + "completions/max_terminated_length": 315.125, + "completions/mean_length": 176.51171875, + "completions/mean_terminated_length": 176.51171875, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, + "epoch": 0.4189473684210526, + "grad_norm": 0.0, + "learning_rate": 7.526715814211739e-07, + "loss": 0.0001, + "num_tokens": 94629294.0, + "reward": 2.0064243376255035, + "reward_std": 0.03297104453667998, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 1.0, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.004188739636447281, + "rewards/waypoint_pred_accuracy/std": 0.008869364886777475, + "step": 199 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.0, + "learning_rate": 7.49710109597544e-07, + "loss": 0.0002, + "step": 200 + }, + { + "epoch": 0.42105263157894735, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 290.91, + "eval_completions/max_terminated_length": 290.91, + "eval_completions/mean_length": 172.36274307250977, + "eval_completions/mean_terminated_length": 172.36274307250977, + "eval_completions/min_length": 109.78, + "eval_completions/min_terminated_length": 109.78, + "eval_loss": -4.892464494332671e-05, + "eval_num_tokens": 95080220.0, + "eval_reward": 1.8596287977695465, + "eval_reward_std": 0.08604613540126138, + "eval_rewards/format_reward_embodied/mean": 0.9996875, + "eval_rewards/format_reward_embodied/std": 0.0025, + "eval_rewards/stop_prediction_reward/mean": 0.7696875, + "eval_rewards/stop_prediction_reward/std": 0.005, + "eval_rewards/waypoint_pred_accuracy/mean": 0.045126906880960875, + "eval_rewards/waypoint_pred_accuracy/std": 0.039966236149646756, + "eval_runtime": 941.9095, + "eval_samples_per_second": 0.106, + "eval_steps_per_second": 0.002, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.9375, + "completions/max_terminated_length": 292.9375, + "completions/mean_length": 174.2548828125, + "completions/mean_terminated_length": 174.2548828125, + "completions/min_length": 110.5625, + "completions/min_terminated_length": 110.5625, + "epoch": 0.4231578947368421, + "grad_norm": 0.0, + "learning_rate": 7.467378273658856e-07, + "loss": 0.0005, + "num_tokens": 95528819.0, + "reward": 1.9443908333778381, + "reward_std": 0.04222029652737547, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.8740234375, + "rewards/stop_prediction_reward/std": 0.0078125, + "rewards/waypoint_pred_accuracy/mean": 0.03518370707206486, + "rewards/waypoint_pred_accuracy/std": 0.017203900250024166, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.5, + "completions/max_terminated_length": 305.5, + "completions/mean_length": 170.724609375, + "completions/mean_terminated_length": 170.724609375, + "completions/min_length": 112.375, + "completions/min_terminated_length": 112.375, + "epoch": 0.42526315789473684, + "grad_norm": 0.012682443484663963, + "learning_rate": 7.437548956173213e-07, + "loss": -0.0004, + "num_tokens": 95979302.0, + "reward": 1.9156423211097717, + "reward_std": 0.1263027695240453, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.751953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.0818446125079697, + "rewards/waypoint_pred_accuracy/std": 0.055338893387193444, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 293.875, + "completions/max_terminated_length": 293.875, + "completions/mean_length": 176.544921875, + "completions/mean_terminated_length": 176.8624153137207, + "completions/min_length": 101.625, + "completions/min_terminated_length": 111.625, + "epoch": 0.42736842105263156, + "grad_norm": 0.0, + "learning_rate": 7.407614758194373e-07, + "loss": -0.0006, + "num_tokens": 96431805.0, + "reward": 1.707748532295227, + "reward_std": 0.10709417768262597, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.623046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.04332738941334985, + "rewards/waypoint_pred_accuracy/std": 0.03793920004429459, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.5, + "completions/max_terminated_length": 321.5, + "completions/mean_length": 176.828125, + "completions/mean_terminated_length": 176.828125, + "completions/min_length": 115.625, + "completions/min_terminated_length": 115.625, + "epoch": 0.42947368421052634, + "grad_norm": 0.0, + "learning_rate": 7.377577300075431e-07, + "loss": 0.0, + "num_tokens": 96881189.0, + "reward": 1.760355144739151, + "reward_std": 0.02672452749078502, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.005177572603687428, + "rewards/waypoint_pred_accuracy/std": 0.013362265083738705, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.75, + "completions/max_terminated_length": 304.75, + "completions/mean_length": 175.419921875, + "completions/mean_terminated_length": 175.419921875, + "completions/min_length": 117.75, + "completions/min_terminated_length": 117.75, + "epoch": 0.43157894736842106, + "grad_norm": 0.015611842274665833, + "learning_rate": 7.347438207759002e-07, + "loss": -0.0002, + "num_tokens": 97333116.0, + "reward": 1.9850184619426727, + "reward_std": 0.06815559589631448, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.055009242740135744, + "rewards/waypoint_pred_accuracy/std": 0.0340777950465283, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.875, + "completions/max_terminated_length": 306.875, + "completions/mean_length": 174.68359375, + "completions/mean_terminated_length": 174.68359375, + "completions/min_length": 107.125, + "completions/min_terminated_length": 107.125, + "epoch": 0.4336842105263158, + "grad_norm": 0.0, + "learning_rate": 7.317199112689219e-07, + "loss": -0.0003, + "num_tokens": 97780314.0, + "reward": 1.6253042817115784, + "reward_std": 0.0009640372365589123, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0001521414744787989, + "rewards/waypoint_pred_accuracy/std": 0.00048202241833421994, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.625, + "completions/max_terminated_length": 294.625, + "completions/mean_length": 178.39453125, + "completions/mean_terminated_length": 178.39453125, + "completions/min_length": 117.125, + "completions/min_terminated_length": 117.125, + "epoch": 0.4357894736842105, + "grad_norm": 0.0, + "learning_rate": 7.286861651723403e-07, + "loss": 0.0, + "num_tokens": 98230564.0, + "reward": 1.8962776064872742, + "reward_std": 0.06431814459938323, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07313878076765164, + "rewards/waypoint_pred_accuracy/std": 0.03215907396928667, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.125, + "completions/max_terminated_length": 297.125, + "completions/mean_length": 170.755859375, + "completions/mean_terminated_length": 170.755859375, + "completions/min_length": 108.125, + "completions/min_terminated_length": 108.125, + "epoch": 0.4378947368421053, + "grad_norm": 0.02003273367881775, + "learning_rate": 7.256427467043479e-07, + "loss": 0.0001, + "num_tokens": 98680935.0, + "reward": 1.7773285955190659, + "reward_std": 0.045251342578694675, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.013664303890479346, + "rewards/waypoint_pred_accuracy/std": 0.022625670864954373, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.625, + "completions/max_terminated_length": 307.625, + "completions/mean_length": 175.458984375, + "completions/mean_terminated_length": 175.458984375, + "completions/min_length": 109.75, + "completions/min_terminated_length": 109.75, + "epoch": 0.44, + "grad_norm": 0.0, + "learning_rate": 7.225898206067071e-07, + "loss": 0.0, + "num_tokens": 99131986.0, + "reward": 2.000874102115631, + "reward_std": 0.07641031977254897, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.06293705673306249, + "rewards/waypoint_pred_accuracy/std": 0.038205162913072854, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.25, + "completions/max_terminated_length": 295.25, + "completions/mean_length": 177.837890625, + "completions/mean_terminated_length": 177.837890625, + "completions/min_length": 118.75, + "completions/min_terminated_length": 118.75, + "epoch": 0.4421052631578947, + "grad_norm": 0.01405387930572033, + "learning_rate": 7.195275521358332e-07, + "loss": -0.0003, + "num_tokens": 99580031.0, + "reward": 2.028854936361313, + "reward_std": 0.11646178726022072, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07692747169583214, + "rewards/waypoint_pred_accuracy/std": 0.05823090146415666, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.5, + "completions/max_terminated_length": 307.5, + "completions/mean_length": 177.76171875, + "completions/mean_terminated_length": 177.76171875, + "completions/min_length": 115.5, + "completions/min_terminated_length": 115.5, + "epoch": 0.4442105263157895, + "grad_norm": 0.0, + "learning_rate": 7.164561070538488e-07, + "loss": 0.0003, + "num_tokens": 100033989.0, + "reward": 1.7999018132686615, + "reward_std": 0.21613861247897148, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.14995091408491135, + "rewards/waypoint_pred_accuracy/std": 0.10806930996477604, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.875, + "completions/max_terminated_length": 431.875, + "completions/mean_length": 174.076171875, + "completions/mean_terminated_length": 174.076171875, + "completions/min_length": 112.375, + "completions/min_terminated_length": 112.375, + "epoch": 0.4463157894736842, + "grad_norm": 0.04999241605401039, + "learning_rate": 7.133756516196107e-07, + "loss": -0.0003, + "num_tokens": 100487340.0, + "reward": 1.7891514897346497, + "reward_std": 0.0568979331983428, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.01957578670394547, + "rewards/waypoint_pred_accuracy/std": 0.028448972130328657, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 171.359375, + "completions/mean_terminated_length": 171.359375, + "completions/min_length": 109.375, + "completions/min_terminated_length": 109.375, + "epoch": 0.44842105263157894, + "grad_norm": 0.0, + "learning_rate": 7.102863525797112e-07, + "loss": -0.0001, + "num_tokens": 100937124.0, + "reward": 1.9840654134750366, + "reward_std": 0.17244431003928185, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.17953270860016346, + "rewards/waypoint_pred_accuracy/std": 0.08622215129435062, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.375, + "completions/max_terminated_length": 291.375, + "completions/mean_length": 175.158203125, + "completions/mean_terminated_length": 175.158203125, + "completions/min_length": 113.25, + "completions/min_terminated_length": 113.25, + "epoch": 0.45052631578947366, + "grad_norm": 0.01138628926128149, + "learning_rate": 7.071883771594509e-07, + "loss": 0.0, + "num_tokens": 101387957.0, + "reward": 1.8491481095552444, + "reward_std": 0.1147658722824616, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.049574058981537675, + "rewards/waypoint_pred_accuracy/std": 0.05738293592632728, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.125, + "completions/max_terminated_length": 327.125, + "completions/mean_length": 187.966796875, + "completions/mean_terminated_length": 187.966796875, + "completions/min_length": 114.625, + "completions/min_terminated_length": 114.625, + "epoch": 0.45263157894736844, + "grad_norm": 0.011563337408006191, + "learning_rate": 7.040818930537874e-07, + "loss": -0.0003, + "num_tokens": 101845412.0, + "reward": 1.6611975878477097, + "reward_std": 0.0494131935941482, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.018098798598909838, + "rewards/waypoint_pred_accuracy/std": 0.02470659996008351, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.25, + "completions/max_terminated_length": 287.25, + "completions/mean_length": 171.841796875, + "completions/mean_terminated_length": 171.841796875, + "completions/min_length": 107.125, + "completions/min_terminated_length": 107.125, + "epoch": 0.45473684210526316, + "grad_norm": 2.7086246063845465e-06, + "learning_rate": 7.009670684182576e-07, + "loss": -0.0, + "num_tokens": 102293587.0, + "reward": 1.5, + "reward_std": 3.251700020356907e-09, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 4.380867890674267e-10, + "rewards/waypoint_pred_accuracy/std": 1.9065319947775272e-09, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.75, + "completions/max_terminated_length": 286.75, + "completions/mean_length": 176.490234375, + "completions/mean_terminated_length": 176.490234375, + "completions/min_length": 114.5, + "completions/min_terminated_length": 114.5, + "epoch": 0.4568421052631579, + "grad_norm": 0.0, + "learning_rate": 6.978440718598756e-07, + "loss": 0.0002, + "num_tokens": 102744270.0, + "reward": 1.7497325837612152, + "reward_std": 0.06904890944133513, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.06236628795954857, + "rewards/waypoint_pred_accuracy/std": 0.03452445384209568, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 182.732421875, + "completions/mean_terminated_length": 182.732421875, + "completions/min_length": 111.375, + "completions/min_terminated_length": 111.375, + "epoch": 0.4589473684210526, + "grad_norm": 0.029011964797973633, + "learning_rate": 6.947130724280057e-07, + "loss": 0.0, + "num_tokens": 103198789.0, + "reward": 1.7562294006347656, + "reward_std": 0.013579967227997258, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0031146957662715976, + "rewards/waypoint_pred_accuracy/std": 0.006789985702200646, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.375, + "completions/max_terminated_length": 270.375, + "completions/mean_length": 172.20703125, + "completions/mean_terminated_length": 172.20703125, + "completions/min_length": 111.875, + "completions/min_terminated_length": 111.875, + "epoch": 0.4610526315789474, + "grad_norm": 0.013365295715630054, + "learning_rate": 6.915742396052115e-07, + "loss": -0.0001, + "num_tokens": 103649519.0, + "reward": 1.5070368647575378, + "reward_std": 0.023722524622826313, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.501953125, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.002541873606073473, + "rewards/waypoint_pred_accuracy/std": 0.004048763120422751, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.375, + "completions/max_terminated_length": 291.375, + "completions/mean_length": 176.5546875, + "completions/mean_terminated_length": 176.5546875, + "completions/min_length": 110.875, + "completions/min_terminated_length": 110.875, + "epoch": 0.4631578947368421, + "grad_norm": 0.0, + "learning_rate": 6.884277432980825e-07, + "loss": 0.0001, + "num_tokens": 104099915.0, + "reward": 1.8983599245548248, + "reward_std": 0.022330745094222948, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.011679970655677607, + "rewards/waypoint_pred_accuracy/std": 0.01116537469351897, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.875, + "completions/max_terminated_length": 272.875, + "completions/mean_length": 176.126953125, + "completions/mean_terminated_length": 176.126953125, + "completions/min_length": 118.875, + "completions/min_terminated_length": 118.875, + "epoch": 0.4652631578947368, + "grad_norm": 0.017878547310829163, + "learning_rate": 6.852737538280359e-07, + "loss": -0.0001, + "num_tokens": 104550732.0, + "reward": 1.7993512451648712, + "reward_std": 0.045344060357820126, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.024675624659721507, + "rewards/waypoint_pred_accuracy/std": 0.022672027718726895, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.5, + "completions/max_terminated_length": 305.5, + "completions/mean_length": 181.193359375, + "completions/mean_terminated_length": 181.193359375, + "completions/min_length": 113.875, + "completions/min_terminated_length": 113.875, + "epoch": 0.4673684210526316, + "grad_norm": 0.01165629643946886, + "learning_rate": 6.821124419220978e-07, + "loss": 0.0002, + "num_tokens": 105002991.0, + "reward": 1.8998810648918152, + "reward_std": 0.040346091078029334, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.012440536318531065, + "rewards/waypoint_pred_accuracy/std": 0.020173047916574705, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.375, + "completions/max_terminated_length": 288.375, + "completions/mean_length": 175.021484375, + "completions/mean_terminated_length": 175.021484375, + "completions/min_length": 109.75, + "completions/min_terminated_length": 109.75, + "epoch": 0.4694736842105263, + "grad_norm": 0.0, + "learning_rate": 6.789439787036614e-07, + "loss": -0.0001, + "num_tokens": 105453626.0, + "reward": 1.9986501336097717, + "reward_std": 0.019883400294929743, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.998046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.0003016188566107303, + "rewards/waypoint_pred_accuracy/std": 0.0021291994489729404, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.375, + "completions/max_terminated_length": 309.375, + "completions/mean_length": 179.6640625, + "completions/mean_terminated_length": 179.6640625, + "completions/min_length": 112.75, + "completions/min_terminated_length": 112.75, + "epoch": 0.47157894736842104, + "grad_norm": 0.015561181120574474, + "learning_rate": 6.757685356832242e-07, + "loss": -0.0, + "num_tokens": 105904398.0, + "reward": 2.101376533508301, + "reward_std": 0.06816481053829193, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 1.0, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.05068826675858098, + "rewards/waypoint_pred_accuracy/std": 0.0340824015760622, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 175.58203125, + "completions/mean_terminated_length": 175.58203125, + "completions/min_length": 109.625, + "completions/min_terminated_length": 109.625, + "epoch": 0.47368421052631576, + "grad_norm": 0.013554830104112625, + "learning_rate": 6.725862847491034e-07, + "loss": 0.0002, + "num_tokens": 106353592.0, + "reward": 1.7516418248414993, + "reward_std": 0.0034361608559265733, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0008209160487240297, + "rewards/waypoint_pred_accuracy/std": 0.0017180802678922191, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.5, + "completions/max_terminated_length": 306.5, + "completions/mean_length": 183.095703125, + "completions/mean_terminated_length": 183.095703125, + "completions/min_length": 108.125, + "completions/min_terminated_length": 108.125, + "epoch": 0.47578947368421054, + "grad_norm": 0.00029330080724321306, + "learning_rate": 6.693973981581324e-07, + "loss": 0.0, + "num_tokens": 106808553.0, + "reward": 1.6250907480716705, + "reward_std": 0.00024353076181782285, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 4.53742529797907e-05, + "rewards/waypoint_pred_accuracy/std": 0.00012176844195366245, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.5, + "completions/max_terminated_length": 292.5, + "completions/mean_length": 181.337890625, + "completions/mean_terminated_length": 181.337890625, + "completions/min_length": 119.625, + "completions/min_terminated_length": 119.625, + "epoch": 0.47789473684210526, + "grad_norm": 0.0, + "learning_rate": 6.662020485263358e-07, + "loss": -0.0001, + "num_tokens": 107258774.0, + "reward": 1.625608280301094, + "reward_std": 0.0014176478143781424, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.00030413969053576185, + "rewards/waypoint_pred_accuracy/std": 0.00070882499138026, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.25, + "completions/max_terminated_length": 291.25, + "completions/mean_length": 173.318359375, + "completions/mean_terminated_length": 173.318359375, + "completions/min_length": 107.875, + "completions/min_terminated_length": 107.875, + "epoch": 0.48, + "grad_norm": 0.0, + "learning_rate": 6.630004088195858e-07, + "loss": 0.0002, + "num_tokens": 107708793.0, + "reward": 1.876689851284027, + "reward_std": 0.0026773642748594284, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0008449186572583362, + "rewards/waypoint_pred_accuracy/std": 0.0013386833028957005, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.5, + "completions/max_terminated_length": 308.5, + "completions/mean_length": 176.880859375, + "completions/mean_terminated_length": 176.880859375, + "completions/min_length": 117.25, + "completions/min_terminated_length": 117.25, + "epoch": 0.48210526315789476, + "grad_norm": 0.0, + "learning_rate": 6.597926523442398e-07, + "loss": 0.0, + "num_tokens": 108161148.0, + "reward": 1.509762555360794, + "reward_std": 0.021483093870038772, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0048812878156252685, + "rewards/waypoint_pred_accuracy/std": 0.01074154757443697, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.125, + "completions/max_terminated_length": 312.125, + "completions/mean_length": 173.00390625, + "completions/mean_terminated_length": 173.00390625, + "completions/min_length": 110.25, + "completions/min_terminated_length": 110.25, + "epoch": 0.4842105263157895, + "grad_norm": 0.013146106153726578, + "learning_rate": 6.565789527377587e-07, + "loss": -0.0005, + "num_tokens": 108611454.0, + "reward": 1.7406707108020782, + "reward_std": 0.09716045763343573, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.05783534119836986, + "rewards/waypoint_pred_accuracy/std": 0.048580223228782415, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.625, + "completions/max_terminated_length": 286.625, + "completions/mean_length": 172.7578125, + "completions/mean_terminated_length": 172.7578125, + "completions/min_length": 111.75, + "completions/min_terminated_length": 111.75, + "epoch": 0.4863157894736842, + "grad_norm": 0.00952562689781189, + "learning_rate": 6.533594839593081e-07, + "loss": 0.0001, + "num_tokens": 109059522.0, + "reward": 1.7838719189167023, + "reward_std": 0.09348210319876671, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07943596761145057, + "rewards/waypoint_pred_accuracy/std": 0.04674104697497585, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.25, + "completions/max_terminated_length": 293.25, + "completions/mean_length": 175.3828125, + "completions/mean_terminated_length": 175.3828125, + "completions/min_length": 114.75, + "completions/min_terminated_length": 114.75, + "epoch": 0.4884210526315789, + "grad_norm": 0.0, + "learning_rate": 6.501344202803414e-07, + "loss": 0.0001, + "num_tokens": 109511942.0, + "reward": 1.5068429559469223, + "reward_std": 0.023580931854667142, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.5, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.003421477313622745, + "rewards/waypoint_pred_accuracy/std": 0.011790467111495673, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 178.611328125, + "completions/mean_terminated_length": 178.9438877105713, + "completions/min_length": 97.375, + "completions/min_terminated_length": 110.125, + "epoch": 0.4905263157894737, + "grad_norm": 0.010548449121415615, + "learning_rate": 6.469039362751677e-07, + "loss": -0.0002, + "num_tokens": 109963455.0, + "reward": 1.7827188670635223, + "reward_std": 0.06010658299783245, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.018312573189177783, + "rewards/waypoint_pred_accuracy/std": 0.024115337153489236, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 301.125, + "completions/max_terminated_length": 301.125, + "completions/mean_length": 179.953125, + "completions/mean_terminated_length": 180.33076095581055, + "completions/min_length": 101.375, + "completions/min_terminated_length": 115.75, + "epoch": 0.4926315789473684, + "grad_norm": 0.0, + "learning_rate": 6.436682068115002e-07, + "loss": -0.0007, + "num_tokens": 110416615.0, + "reward": 1.99609375, + "reward_std": 0.03125, + "rewards/format_reward_embodied/mean": 0.998046875, + "rewards/format_reward_embodied/std": 0.015625, + "rewards/stop_prediction_reward/mean": 0.998046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 2.0173116242793235e-14, + "rewards/waypoint_pred_accuracy/std": 1.1195781985957193e-13, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 173.30859375, + "completions/mean_terminated_length": 173.30859375, + "completions/min_length": 112.75, + "completions/min_terminated_length": 112.75, + "epoch": 0.49473684210526314, + "grad_norm": 0.023477498441934586, + "learning_rate": 6.404274070409915e-07, + "loss": 0.0002, + "num_tokens": 110865797.0, + "reward": 1.8803559094667435, + "reward_std": 0.11074852512797406, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.06517796947535714, + "rewards/waypoint_pred_accuracy/std": 0.05537425884915592, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.875, + "completions/max_terminated_length": 299.875, + "completions/mean_length": 176.0390625, + "completions/mean_terminated_length": 176.0390625, + "completions/min_length": 114.375, + "completions/min_terminated_length": 114.375, + "epoch": 0.4968421052631579, + "grad_norm": 0.008372652344405651, + "learning_rate": 6.371817123897528e-07, + "loss": 0.0006, + "num_tokens": 111319513.0, + "reward": 1.8294343054294586, + "reward_std": 0.08038223959738389, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.748046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.040693737070411296, + "rewards/waypoint_pred_accuracy/std": 0.03237862857612228, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 251.25, + "completions/max_terminated_length": 251.25, + "completions/mean_length": 166.775390625, + "completions/mean_terminated_length": 166.775390625, + "completions/min_length": 109.5, + "completions/min_terminated_length": 109.5, + "epoch": 0.49894736842105264, + "grad_norm": 0.0, + "learning_rate": 6.339312985488576e-07, + "loss": -0.0001, + "num_tokens": 111764710.0, + "reward": 1.8851255774497986, + "reward_std": 0.04906696546822786, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.005062782554890241, + "rewards/waypoint_pred_accuracy/std": 0.024533490184718572, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.5, + "completions/max_terminated_length": 296.5, + "completions/mean_length": 172.90625, + "completions/mean_terminated_length": 172.90625, + "completions/min_length": 111.125, + "completions/min_terminated_length": 111.125, + "epoch": 0.5010526315789474, + "grad_norm": 0.01637883298099041, + "learning_rate": 6.30676341464831e-07, + "loss": -0.0003, + "num_tokens": 112215734.0, + "reward": 1.6509484648704529, + "reward_std": 0.06736253991999908, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.03125, + "rewards/waypoint_pred_accuracy/mean": 0.012974242886534648, + "rewards/waypoint_pred_accuracy/std": 0.01805627301899171, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.625, + "completions/max_terminated_length": 289.625, + "completions/mean_length": 169.388671875, + "completions/mean_terminated_length": 169.388671875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.5031578947368421, + "grad_norm": 0.0, + "learning_rate": 6.274170173301268e-07, + "loss": 0.0001, + "num_tokens": 112666173.0, + "reward": 1.9938509166240692, + "reward_std": 0.11555472994223237, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.05942547577433288, + "rewards/waypoint_pred_accuracy/std": 0.05777736520394683, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.5, + "completions/max_terminated_length": 289.5, + "completions/mean_length": 172.37890625, + "completions/mean_terminated_length": 172.37890625, + "completions/min_length": 108.875, + "completions/min_terminated_length": 108.875, + "epoch": 0.5052631578947369, + "grad_norm": 0.0, + "learning_rate": 6.24153502573589e-07, + "loss": -0.0001, + "num_tokens": 113115903.0, + "reward": 1.7577707767486572, + "reward_std": 0.016411395743489265, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.0038853867445141077, + "rewards/waypoint_pred_accuracy/std": 0.008205699268728495, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 175.09375, + "completions/mean_terminated_length": 175.09375, + "completions/min_length": 116.375, + "completions/min_terminated_length": 116.375, + "epoch": 0.5073684210526316, + "grad_norm": 0.03419042006134987, + "learning_rate": 6.208859738509021e-07, + "loss": 0.0007, + "num_tokens": 113568495.0, + "reward": 1.7237209975719452, + "reward_std": 0.14016160182654858, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.049360513221472516, + "rewards/waypoint_pred_accuracy/std": 0.07008079765364533, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.875, + "completions/max_terminated_length": 289.875, + "completions/mean_length": 173.462890625, + "completions/mean_terminated_length": 173.462890625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.5094736842105263, + "grad_norm": 0.0, + "learning_rate": 6.176146080350286e-07, + "loss": 0.0, + "num_tokens": 114018972.0, + "reward": 1.8751783967018127, + "reward_std": 0.00036079369601793587, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 8.91994423000142e-05, + "rewards/waypoint_pred_accuracy/std": 0.00018039710994344205, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 173.265625, + "completions/mean_terminated_length": 173.265625, + "completions/min_length": 114.25, + "completions/min_terminated_length": 114.25, + "epoch": 0.511578947368421, + "grad_norm": 0.013551232405006886, + "learning_rate": 6.14339582206635e-07, + "loss": 0.0001, + "num_tokens": 114468132.0, + "reward": 1.9178512692451477, + "reward_std": 0.06416846066713333, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.021425632760024103, + "rewards/waypoint_pred_accuracy/std": 0.03208423405926866, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.125, + "completions/max_terminated_length": 312.125, + "completions/mean_length": 175.140625, + "completions/mean_terminated_length": 175.140625, + "completions/min_length": 110.75, + "completions/min_terminated_length": 110.75, + "epoch": 0.5136842105263157, + "grad_norm": 0.02614566497504711, + "learning_rate": 6.110610736445058e-07, + "loss": 0.0003, + "num_tokens": 114913708.0, + "reward": 1.8622263967990875, + "reward_std": 0.18642316292971373, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.11861319048330188, + "rewards/waypoint_pred_accuracy/std": 0.09321157418889925, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.125, + "completions/max_terminated_length": 301.125, + "completions/mean_length": 172.37890625, + "completions/mean_terminated_length": 172.37890625, + "completions/min_length": 110.75, + "completions/min_terminated_length": 110.75, + "epoch": 0.5157894736842106, + "grad_norm": 0.013545077294111252, + "learning_rate": 6.077792598159479e-07, + "loss": -0.0, + "num_tokens": 115361006.0, + "reward": 1.9425796866416931, + "reward_std": 0.10113994629730882, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.03378988173824164, + "rewards/waypoint_pred_accuracy/std": 0.05056997878441809, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.5, + "completions/max_terminated_length": 276.5, + "completions/mean_length": 173.775390625, + "completions/mean_terminated_length": 173.775390625, + "completions/min_length": 116.5, + "completions/min_terminated_length": 116.5, + "epoch": 0.5178947368421053, + "grad_norm": 0.02074316143989563, + "learning_rate": 6.044943183671836e-07, + "loss": 0.0004, + "num_tokens": 115809723.0, + "reward": 1.7704734951257706, + "reward_std": 0.09013272261904604, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.623046875, + "rewards/stop_prediction_reward/std": 0.015625, + "rewards/waypoint_pred_accuracy/mean": 0.07371331164245536, + "rewards/waypoint_pred_accuracy/std": 0.03725385823372892, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.375, + "completions/max_terminated_length": 258.375, + "completions/mean_length": 170.69140625, + "completions/mean_terminated_length": 170.69140625, + "completions/min_length": 119.125, + "completions/min_terminated_length": 119.125, + "epoch": 0.52, + "grad_norm": 0.021677250042557716, + "learning_rate": 6.01206427113735e-07, + "loss": -0.0001, + "num_tokens": 116257053.0, + "reward": 1.8927133083343506, + "reward_std": 0.0853966644051809, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.75, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.07135666430194423, + "rewards/waypoint_pred_accuracy/std": 0.04269833582034271, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.125, + "completions/max_terminated_length": 272.125, + "completions/mean_length": 169.240234375, + "completions/mean_terminated_length": 169.240234375, + "completions/min_length": 112.375, + "completions/min_terminated_length": 112.375, + "epoch": 0.5221052631578947, + "grad_norm": 0.000306050234939903, + "learning_rate": 5.97915764030799e-07, + "loss": -0.0, + "num_tokens": 116703576.0, + "reward": 1.6304270327091217, + "reward_std": 0.011630857972136255, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.625, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.002713506846724556, + "rewards/waypoint_pred_accuracy/std": 0.005815430337122507, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.375, + "completions/max_terminated_length": 282.375, + "completions/mean_length": 166.033203125, + "completions/mean_terminated_length": 166.033203125, + "completions/min_length": 113.625, + "completions/min_terminated_length": 113.625, + "epoch": 0.5242105263157895, + "grad_norm": 0.0, + "learning_rate": 5.946225072436121e-07, + "loss": 0.0001, + "num_tokens": 117151145.0, + "reward": 1.8943032920360565, + "reward_std": 0.03212926587002585, + "rewards/format_reward_embodied/mean": 1.0, + "rewards/format_reward_embodied/std": 0.0, + "rewards/stop_prediction_reward/mean": 0.875, + "rewards/stop_prediction_reward/std": 0.0, + "rewards/waypoint_pred_accuracy/mean": 0.009651642787417297, + "rewards/waypoint_pred_accuracy/std": 0.016064628072191757, + "step": 249 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.015220913104712963, + "learning_rate": 5.913268350178101e-07, + "loss": 0.0001, + "step": 250 + }, + { + "epoch": 0.5263157894736842, + "eval_clip_ratio": 0.0, + "eval_completions/clipped_ratio": 0.00015625, + "eval_completions/max_length": 278.13, + "eval_completions/max_terminated_length": 278.13, + "eval_completions/mean_length": 169.34763885498046, + "eval_completions/mean_terminated_length": 169.375066986084, + "eval_completions/min_length": 110.76, + "eval_completions/min_terminated_length": 111.92, + "eval_loss": -2.504486656107474e-05, + "eval_num_tokens": 117598301.0, + "eval_reward": 1.865585025548935, + "eval_reward_std": 0.0873453421616474, + "eval_rewards/format_reward_embodied/mean": 0.9996875, + "eval_rewards/format_reward_embodied/std": 0.0025, + "eval_rewards/stop_prediction_reward/mean": 0.76984375, + "eval_rewards/stop_prediction_reward/std": 0.008003681004047393, + "eval_rewards/waypoint_pred_accuracy/mean": 0.048026895228197336, + "eval_rewards/waypoint_pred_accuracy/std": 0.038523975913848735, + "eval_runtime": 963.8699, + "eval_samples_per_second": 0.104, + "eval_steps_per_second": 0.002, + "step": 250 + } + ], + "logging_steps": 1, + "max_steps": 475, + "num_input_tokens_seen": 117598301, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}