{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.42105263157894735, "eval_steps": 200, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.375, "completions/max_terminated_length": 1129.375, "completions/mean_length": 217.826171875, "completions/mean_terminated_length": 217.826171875, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.002105263157894737, "grad_norm": 0.03688327968120575, "learning_rate": 0.0, "loss": -0.0036, "num_tokens": 501159.0, "reward": 0.6852182224392891, "reward_std": 0.5407347865402699, "rewards/format_reward_embodied/mean": 0.2578125, "rewards/format_reward_embodied/std": 0.4250035658478737, "rewards/stop_prediction_reward/mean": 0.40625, "rewards/stop_prediction_reward/std": 0.2911768723279238, "rewards/waypoint_pred_accuracy/mean": 0.010577870212728508, "rewards/waypoint_pred_accuracy/std": 0.04360266821459155, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.5, "completions/max_terminated_length": 1230.5, "completions/mean_length": 218.732421875, "completions/mean_terminated_length": 218.732421875, "completions/min_length": 92.125, "completions/min_terminated_length": 92.125, "epoch": 0.004210526315789474, "grad_norm": 0.03129596635699272, "learning_rate": 2.083333333333333e-08, "loss": -0.0014, "num_tokens": 1002846.0, "reward": 0.7121060490608215, "reward_std": 0.5467510148882866, "rewards/format_reward_embodied/mean": 0.27734375, "rewards/format_reward_embodied/std": 0.43743864819407463, "rewards/stop_prediction_reward/mean": 0.392578125, "rewards/stop_prediction_reward/std": 0.2523916997015476, "rewards/waypoint_pred_accuracy/mean": 0.02109208470572326, "rewards/waypoint_pred_accuracy/std": 0.049278605758283106, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1473.75, "completions/max_terminated_length": 1473.75, "completions/mean_length": 236.810546875, "completions/mean_terminated_length": 236.810546875, "completions/min_length": 105.5, "completions/min_terminated_length": 105.5, "epoch": 0.00631578947368421, "grad_norm": 0.11485113203525543, "learning_rate": 4.166666666666666e-08, "loss": -0.001, "num_tokens": 1511293.0, "reward": 0.6580178588628769, "reward_std": 0.5665898956358433, "rewards/format_reward_embodied/mean": 0.27734375, "rewards/format_reward_embodied/std": 0.44385743886232376, "rewards/stop_prediction_reward/mean": 0.365234375, "rewards/stop_prediction_reward/std": 0.3355066943913698, "rewards/waypoint_pred_accuracy/mean": 0.007719868453698335, "rewards/waypoint_pred_accuracy/std": 0.027026359874671183, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.25, "completions/max_terminated_length": 695.25, "completions/mean_length": 194.90234375, "completions/mean_terminated_length": 194.90234375, "completions/min_length": 90.75, "completions/min_terminated_length": 90.75, "epoch": 0.008421052631578947, "grad_norm": 0.03228563070297241, "learning_rate": 6.25e-08, "loss": -0.006, "num_tokens": 2000779.0, "reward": 0.6896363161504269, "reward_std": 0.5512153543531895, "rewards/format_reward_embodied/mean": 0.255859375, "rewards/format_reward_embodied/std": 0.4282885938882828, "rewards/stop_prediction_reward/mean": 0.39453125, "rewards/stop_prediction_reward/std": 0.3049718104302883, "rewards/waypoint_pred_accuracy/mean": 0.019622846261881932, "rewards/waypoint_pred_accuracy/std": 0.05433488787366514, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 222.998046875, "completions/mean_terminated_length": 223.74107551574707, "completions/min_length": 73.625, "completions/min_terminated_length": 95.75, "epoch": 0.010526315789473684, "grad_norm": 0.04120698943734169, "learning_rate": 8.333333333333333e-08, "loss": -0.004, "num_tokens": 2504266.0, "reward": 0.6145266555249691, "reward_std": 0.5875402726233006, "rewards/format_reward_embodied/mean": 0.30859375, "rewards/format_reward_embodied/std": 0.45738402009010315, "rewards/stop_prediction_reward/mean": 0.2890625, "rewards/stop_prediction_reward/std": 0.3146662712097168, "rewards/waypoint_pred_accuracy/mean": 0.00843520529876629, "rewards/waypoint_pred_accuracy/std": 0.03367704733402643, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.875, "completions/max_terminated_length": 1009.875, "completions/mean_length": 207.990234375, "completions/mean_terminated_length": 207.990234375, "completions/min_length": 95.75, "completions/min_terminated_length": 95.75, "epoch": 0.01263157894736842, "grad_norm": 0.026955831795930862, "learning_rate": 1.0416666666666667e-07, "loss": -0.0046, "num_tokens": 2999493.0, "reward": 0.5977649390697479, "reward_std": 0.5568973757326603, "rewards/format_reward_embodied/mean": 0.275390625, "rewards/format_reward_embodied/std": 0.44332288950681686, "rewards/stop_prediction_reward/mean": 0.322265625, "rewards/stop_prediction_reward/std": 0.3272685557603836, "rewards/waypoint_pred_accuracy/mean": 5.434432013211645e-05, "rewards/waypoint_pred_accuracy/std": 0.0004276758230305201, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1457.375, "completions/max_terminated_length": 1457.375, "completions/mean_length": 225.505859375, "completions/mean_terminated_length": 226.0093650817871, "completions/min_length": 73.125, "completions/min_terminated_length": 86.375, "epoch": 0.014736842105263158, "grad_norm": 0.03217363357543945, "learning_rate": 1.25e-07, "loss": -0.0045, "num_tokens": 3504072.0, "reward": 0.3724593073129654, "reward_std": 0.5260734632611275, "rewards/format_reward_embodied/mean": 0.275390625, "rewards/format_reward_embodied/std": 0.44422636553645134, "rewards/stop_prediction_reward/mean": 0.091796875, "rewards/stop_prediction_reward/std": 0.28769673593342304, "rewards/waypoint_pred_accuracy/mean": 0.0026359074025874563, "rewards/waypoint_pred_accuracy/std": 0.006386257246882465, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1779.625, "completions/max_terminated_length": 1779.625, "completions/mean_length": 223.65625, "completions/mean_terminated_length": 224.61331367492676, "completions/min_length": 69.625, "completions/min_terminated_length": 92.0, "epoch": 0.016842105263157894, "grad_norm": 0.03023587167263031, "learning_rate": 1.4583333333333335e-07, "loss": -0.0018, "num_tokens": 4009112.0, "reward": 0.7184133045375347, "reward_std": 0.6360493190586567, "rewards/format_reward_embodied/mean": 0.287109375, "rewards/format_reward_embodied/std": 0.4407898560166359, "rewards/stop_prediction_reward/mean": 0.33984375, "rewards/stop_prediction_reward/std": 0.3496157228946686, "rewards/waypoint_pred_accuracy/mean": 0.045730086048161776, "rewards/waypoint_pred_accuracy/std": 0.06390689127976774, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 209.583984375, "completions/mean_terminated_length": 209.583984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.018947368421052633, "grad_norm": 0.032111555337905884, "learning_rate": 1.6666666666666665e-07, "loss": -0.0049, "num_tokens": 4502851.0, "reward": 0.7770869806408882, "reward_std": 0.652216799557209, "rewards/format_reward_embodied/mean": 0.33203125, "rewards/format_reward_embodied/std": 0.46690355241298676, "rewards/stop_prediction_reward/mean": 0.291015625, "rewards/stop_prediction_reward/std": 0.28971834294497967, "rewards/waypoint_pred_accuracy/mean": 0.07702004234306514, "rewards/waypoint_pred_accuracy/std": 0.11429450009018183, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.25, "completions/max_terminated_length": 1344.25, "completions/mean_length": 220.0390625, "completions/mean_terminated_length": 220.0390625, "completions/min_length": 95.375, "completions/min_terminated_length": 95.375, "epoch": 0.021052631578947368, "grad_norm": 0.036742229014635086, "learning_rate": 1.875e-07, "loss": -0.0031, "num_tokens": 5006551.0, "reward": 0.8515274524688721, "reward_std": 0.5746714510023594, "rewards/format_reward_embodied/mean": 0.330078125, "rewards/format_reward_embodied/std": 0.46579784527421, "rewards/stop_prediction_reward/mean": 0.49609375, "rewards/stop_prediction_reward/std": 0.29504277743399143, "rewards/waypoint_pred_accuracy/mean": 0.012677800143137578, "rewards/waypoint_pred_accuracy/std": 0.04874318046495321, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 698.375, "completions/max_terminated_length": 698.375, "completions/mean_length": 222.986328125, "completions/mean_terminated_length": 223.3989658355713, "completions/min_length": 84.75, "completions/min_terminated_length": 96.125, "epoch": 0.023157894736842106, "grad_norm": 0.04964126646518707, "learning_rate": 2.0833333333333333e-07, "loss": -0.0055, "num_tokens": 5511248.0, "reward": 0.4871162623167038, "reward_std": 0.5761008001863956, "rewards/format_reward_embodied/mean": 0.33984375, "rewards/format_reward_embodied/std": 0.4706413373351097, "rewards/stop_prediction_reward/mean": 0.140625, "rewards/stop_prediction_reward/std": 0.34470293670892715, "rewards/waypoint_pred_accuracy/mean": 0.003323758961364831, "rewards/waypoint_pred_accuracy/std": 0.019421547695571917, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1410.375, "completions/max_terminated_length": 1410.375, "completions/mean_length": 221.177734375, "completions/mean_terminated_length": 221.177734375, "completions/min_length": 93.125, "completions/min_terminated_length": 93.125, "epoch": 0.02526315789473684, "grad_norm": 0.0342765711247921, "learning_rate": 2.2916666666666663e-07, "loss": -0.0036, "num_tokens": 6012011.0, "reward": 0.726501177996397, "reward_std": 0.6375694684684277, "rewards/format_reward_embodied/mean": 0.34375, "rewards/format_reward_embodied/std": 0.47322528064250946, "rewards/stop_prediction_reward/mean": 0.30859375, "rewards/stop_prediction_reward/std": 0.3438059203326702, "rewards/waypoint_pred_accuracy/mean": 0.037078722628304604, "rewards/waypoint_pred_accuracy/std": 0.0714277882893814, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.25, "completions/max_terminated_length": 1263.25, "completions/mean_length": 225.150390625, "completions/mean_terminated_length": 225.150390625, "completions/min_length": 100.125, "completions/min_terminated_length": 100.125, "epoch": 0.02736842105263158, "grad_norm": 0.030924461781978607, "learning_rate": 2.5e-07, "loss": -0.0023, "num_tokens": 6516280.0, "reward": 0.816199004650116, "reward_std": 0.5992049053311348, "rewards/format_reward_embodied/mean": 0.3984375, "rewards/format_reward_embodied/std": 0.4887525513768196, "rewards/stop_prediction_reward/mean": 0.400390625, "rewards/stop_prediction_reward/std": 0.3476070836186409, "rewards/waypoint_pred_accuracy/mean": 0.008685434097221462, "rewards/waypoint_pred_accuracy/std": 0.031141442110235985, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1929.25, "completions/max_terminated_length": 1929.25, "completions/mean_length": 231.19921875, "completions/mean_terminated_length": 232.3006935119629, "completions/min_length": 64.75, "completions/min_terminated_length": 90.875, "epoch": 0.029473684210526315, "grad_norm": 0.031240319833159447, "learning_rate": 2.708333333333333e-07, "loss": 0.0011, "num_tokens": 7025822.0, "reward": 0.7418127804994583, "reward_std": 0.6001337319612503, "rewards/format_reward_embodied/mean": 0.3359375, "rewards/format_reward_embodied/std": 0.46518728509545326, "rewards/stop_prediction_reward/mean": 0.369140625, "rewards/stop_prediction_reward/std": 0.33801633305847645, "rewards/waypoint_pred_accuracy/mean": 0.018367327513715306, "rewards/waypoint_pred_accuracy/std": 0.0351917422394765, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1099.875, "completions/max_terminated_length": 1099.875, "completions/mean_length": 224.60546875, "completions/mean_terminated_length": 225.1283187866211, "completions/min_length": 85.625, "completions/min_terminated_length": 95.75, "epoch": 0.031578947368421054, "grad_norm": 0.030209699645638466, "learning_rate": 2.916666666666667e-07, "loss": -0.006, "num_tokens": 7528852.0, "reward": 0.5229232423007488, "reward_std": 0.5791459046304226, "rewards/format_reward_embodied/mean": 0.3828125, "rewards/format_reward_embodied/std": 0.4857485108077526, "rewards/stop_prediction_reward/mean": 0.10546875, "rewards/stop_prediction_reward/std": 0.29261469282209873, "rewards/waypoint_pred_accuracy/mean": 0.01732099140614508, "rewards/waypoint_pred_accuracy/std": 0.04053366294584711, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1367.375, "completions/max_terminated_length": 1367.375, "completions/mean_length": 235.68359375, "completions/mean_terminated_length": 236.2605438232422, "completions/min_length": 91.125, "completions/min_terminated_length": 106.875, "epoch": 0.03368421052631579, "grad_norm": 0.03397149220108986, "learning_rate": 3.1249999999999997e-07, "loss": -0.0009, "num_tokens": 8041266.0, "reward": 0.9018646515905857, "reward_std": 0.5724878907203674, "rewards/format_reward_embodied/mean": 0.46875, "rewards/format_reward_embodied/std": 0.4908192716538906, "rewards/stop_prediction_reward/mean": 0.419921875, "rewards/stop_prediction_reward/std": 0.2832304909825325, "rewards/waypoint_pred_accuracy/mean": 0.006596383985863152, "rewards/waypoint_pred_accuracy/std": 0.030292918029928728, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.125, "completions/max_terminated_length": 453.125, "completions/mean_length": 204.66015625, "completions/mean_terminated_length": 204.66015625, "completions/min_length": 92.375, "completions/min_terminated_length": 92.375, "epoch": 0.035789473684210524, "grad_norm": 0.02802187018096447, "learning_rate": 3.333333333333333e-07, "loss": -0.0084, "num_tokens": 8535428.0, "reward": 0.7916244938969612, "reward_std": 0.5850509852170944, "rewards/format_reward_embodied/mean": 0.48046875, "rewards/format_reward_embodied/std": 0.4996633492410183, "rewards/stop_prediction_reward/mean": 0.302734375, "rewards/stop_prediction_reward/std": 0.31226812675595284, "rewards/waypoint_pred_accuracy/mean": 0.004210685843629879, "rewards/waypoint_pred_accuracy/std": 0.00963768846033057, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 865.625, "completions/max_terminated_length": 865.625, "completions/mean_length": 210.908203125, "completions/mean_terminated_length": 211.34145736694336, "completions/min_length": 86.0, "completions/min_terminated_length": 98.75, "epoch": 0.037894736842105266, "grad_norm": 0.03985866159200668, "learning_rate": 3.541666666666667e-07, "loss": -0.0064, "num_tokens": 9030549.0, "reward": 0.7308155745267868, "reward_std": 0.5641426295042038, "rewards/format_reward_embodied/mean": 0.53125, "rewards/format_reward_embodied/std": 0.4955369792878628, "rewards/stop_prediction_reward/mean": 0.19921875, "rewards/stop_prediction_reward/std": 0.29108020290732384, "rewards/waypoint_pred_accuracy/mean": 0.00017341448449656295, "rewards/waypoint_pred_accuracy/std": 0.0007318578800792436, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.625, "completions/max_terminated_length": 1043.625, "completions/mean_length": 225.025390625, "completions/mean_terminated_length": 225.025390625, "completions/min_length": 95.25, "completions/min_terminated_length": 95.25, "epoch": 0.04, "grad_norm": 0.030999917536973953, "learning_rate": 3.75e-07, "loss": -0.0037, "num_tokens": 9535394.0, "reward": 0.8065008148550987, "reward_std": 0.5943347848951817, "rewards/format_reward_embodied/mean": 0.53125, "rewards/format_reward_embodied/std": 0.4729270227253437, "rewards/stop_prediction_reward/mean": 0.271484375, "rewards/stop_prediction_reward/std": 0.3523057959973812, "rewards/waypoint_pred_accuracy/mean": 0.0018832212136416936, "rewards/waypoint_pred_accuracy/std": 0.009695871519704979, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1323.375, "completions/max_terminated_length": 1323.375, "completions/mean_length": 231.146484375, "completions/mean_terminated_length": 231.146484375, "completions/min_length": 96.375, "completions/min_terminated_length": 96.375, "epoch": 0.042105263157894736, "grad_norm": 0.032023850828409195, "learning_rate": 3.958333333333333e-07, "loss": -0.0019, "num_tokens": 10043949.0, "reward": 0.9295599162578583, "reward_std": 0.5884210243821144, "rewards/format_reward_embodied/mean": 0.5625, "rewards/format_reward_embodied/std": 0.4947963282465935, "rewards/stop_prediction_reward/mean": 0.29296875, "rewards/stop_prediction_reward/std": 0.2782147694379091, "rewards/waypoint_pred_accuracy/mean": 0.03704559023572074, "rewards/waypoint_pred_accuracy/std": 0.06106339632424351, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.125, "completions/max_terminated_length": 867.125, "completions/mean_length": 221.240234375, "completions/mean_terminated_length": 221.240234375, "completions/min_length": 95.375, "completions/min_terminated_length": 95.375, "epoch": 0.04421052631578947, "grad_norm": 0.033257901668548584, "learning_rate": 4.1666666666666667e-07, "loss": -0.002, "num_tokens": 10546600.0, "reward": 1.1667077615857124, "reward_std": 0.5849204882979393, "rewards/format_reward_embodied/mean": 0.6875, "rewards/format_reward_embodied/std": 0.46290775388479233, "rewards/stop_prediction_reward/mean": 0.44140625, "rewards/stop_prediction_reward/std": 0.3193382862955332, "rewards/waypoint_pred_accuracy/mean": 0.01890076152959421, "rewards/waypoint_pred_accuracy/std": 0.04711238816242192, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1204.375, "completions/max_terminated_length": 1204.375, "completions/mean_length": 242.275390625, "completions/mean_terminated_length": 242.7787094116211, "completions/min_length": 85.625, "completions/min_terminated_length": 101.0, "epoch": 0.04631578947368421, "grad_norm": 0.027390193194150925, "learning_rate": 4.375e-07, "loss": 0.0012, "num_tokens": 11059701.0, "reward": 0.9610797762870789, "reward_std": 0.516326267272234, "rewards/format_reward_embodied/mean": 0.796875, "rewards/format_reward_embodied/std": 0.39213394187390804, "rewards/stop_prediction_reward/mean": 0.154296875, "rewards/stop_prediction_reward/std": 0.2987161073833704, "rewards/waypoint_pred_accuracy/mean": 0.0049539450064901135, "rewards/waypoint_pred_accuracy/std": 0.02740523732789636, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.75, "completions/max_terminated_length": 475.75, "completions/mean_length": 229.240234375, "completions/mean_terminated_length": 229.240234375, "completions/min_length": 106.375, "completions/min_terminated_length": 106.375, "epoch": 0.04842105263157895, "grad_norm": 0.029475348070263863, "learning_rate": 4.5833333333333327e-07, "loss": 0.0004, "num_tokens": 11564848.0, "reward": 1.310406669974327, "reward_std": 0.47197042033076286, "rewards/format_reward_embodied/mean": 0.875, "rewards/format_reward_embodied/std": 0.32729423232376575, "rewards/stop_prediction_reward/mean": 0.40234375, "rewards/stop_prediction_reward/std": 0.32928374595940113, "rewards/waypoint_pred_accuracy/mean": 0.016531457972611463, "rewards/waypoint_pred_accuracy/std": 0.049992657648260774, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1241.875, "completions/max_terminated_length": 1241.875, "completions/mean_length": 249.404296875, "completions/mean_terminated_length": 249.8599338531494, "completions/min_length": 89.125, "completions/min_terminated_length": 102.5, "epoch": 0.05052631578947368, "grad_norm": 0.05331696942448616, "learning_rate": 4.791666666666667e-07, "loss": 0.0126, "num_tokens": 12081983.0, "reward": 1.369938537478447, "reward_std": 0.5135768614709377, "rewards/format_reward_embodied/mean": 0.8359375, "rewards/format_reward_embodied/std": 0.37031440809369087, "rewards/stop_prediction_reward/mean": 0.529296875, "rewards/stop_prediction_reward/std": 0.34028442576527596, "rewards/waypoint_pred_accuracy/mean": 0.002352085092750447, "rewards/waypoint_pred_accuracy/std": 0.015591990617297411, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.125, "completions/max_terminated_length": 1152.125, "completions/mean_length": 258.888671875, "completions/mean_terminated_length": 258.888671875, "completions/min_length": 97.875, "completions/min_terminated_length": 97.875, "epoch": 0.05263157894736842, "grad_norm": 0.024480490013957024, "learning_rate": 5e-07, "loss": 0.0108, "num_tokens": 12602118.0, "reward": 1.2254773080348969, "reward_std": 0.4796493649482727, "rewards/format_reward_embodied/mean": 0.888671875, "rewards/format_reward_embodied/std": 0.30017153173685074, "rewards/stop_prediction_reward/mean": 0.330078125, "rewards/stop_prediction_reward/std": 0.33193706534802914, "rewards/waypoint_pred_accuracy/mean": 0.0033636507843118604, "rewards/waypoint_pred_accuracy/std": 0.026622640219126532, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 240.220703125, "completions/mean_terminated_length": 240.7258529663086, "completions/min_length": 87.875, "completions/min_terminated_length": 104.0, "epoch": 0.05473684210526316, "grad_norm": 0.028204545378684998, "learning_rate": 5.208333333333334e-07, "loss": 0.0014, "num_tokens": 13115447.0, "reward": 1.255406454205513, "reward_std": 0.4552546963095665, "rewards/format_reward_embodied/mean": 0.892578125, "rewards/format_reward_embodied/std": 0.30313970148563385, "rewards/stop_prediction_reward/mean": 0.345703125, "rewards/stop_prediction_reward/std": 0.3145990837365389, "rewards/waypoint_pred_accuracy/mean": 0.008562608887709433, "rewards/waypoint_pred_accuracy/std": 0.015426839128514835, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2274.125, "completions/max_terminated_length": 2274.125, "completions/mean_length": 263.94921875, "completions/mean_terminated_length": 263.94921875, "completions/min_length": 98.625, "completions/min_terminated_length": 98.625, "epoch": 0.056842105263157895, "grad_norm": 0.027318229898810387, "learning_rate": 5.416666666666666e-07, "loss": 0.02, "num_tokens": 13643293.0, "reward": 1.1268005520105362, "reward_std": 0.4487200677394867, "rewards/format_reward_embodied/mean": 0.900390625, "rewards/format_reward_embodied/std": 0.29798127338290215, "rewards/stop_prediction_reward/mean": 0.224609375, "rewards/stop_prediction_reward/std": 0.31709046475589275, "rewards/waypoint_pred_accuracy/mean": 0.0009002835363531858, "rewards/waypoint_pred_accuracy/std": 0.006629681602248638, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1472.25, "completions/max_terminated_length": 1472.25, "completions/mean_length": 250.123046875, "completions/mean_terminated_length": 250.6703872680664, "completions/min_length": 90.5, "completions/min_terminated_length": 102.5, "epoch": 0.05894736842105263, "grad_norm": 0.02800668776035309, "learning_rate": 5.625e-07, "loss": 0.0081, "num_tokens": 14162140.0, "reward": 1.4252514392137527, "reward_std": 0.5070786997675896, "rewards/format_reward_embodied/mean": 0.912109375, "rewards/format_reward_embodied/std": 0.28026143461465836, "rewards/stop_prediction_reward/mean": 0.4453125, "rewards/stop_prediction_reward/std": 0.3642293494194746, "rewards/waypoint_pred_accuracy/mean": 0.03391479922983473, "rewards/waypoint_pred_accuracy/std": 0.05375720616380722, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.125, "completions/max_terminated_length": 1779.125, "completions/mean_length": 270.09375, "completions/mean_terminated_length": 270.09375, "completions/min_length": 105.375, "completions/min_terminated_length": 105.375, "epoch": 0.061052631578947365, "grad_norm": 0.023560991510748863, "learning_rate": 5.833333333333334e-07, "loss": 0.0175, "num_tokens": 14689548.0, "reward": 1.3020398318767548, "reward_std": 0.4841705746948719, "rewards/format_reward_embodied/mean": 0.927734375, "rewards/format_reward_embodied/std": 0.2531428262591362, "rewards/stop_prediction_reward/mean": 0.318359375, "rewards/stop_prediction_reward/std": 0.33780971355736256, "rewards/waypoint_pred_accuracy/mean": 0.027973046589977874, "rewards/waypoint_pred_accuracy/std": 0.05706198215375809, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1295.75, "completions/max_terminated_length": 1295.75, "completions/mean_length": 262.677734375, "completions/mean_terminated_length": 263.1325969696045, "completions/min_length": 87.0, "completions/min_terminated_length": 99.5, "epoch": 0.06315789473684211, "grad_norm": 0.02862788550555706, "learning_rate": 6.041666666666666e-07, "loss": 0.0153, "num_tokens": 15211623.0, "reward": 1.2348038852214813, "reward_std": 0.47002242133021355, "rewards/format_reward_embodied/mean": 0.9453125, "rewards/format_reward_embodied/std": 0.21914233826100826, "rewards/stop_prediction_reward/mean": 0.2734375, "rewards/stop_prediction_reward/std": 0.37936339154839516, "rewards/waypoint_pred_accuracy/mean": 0.008026950919884267, "rewards/waypoint_pred_accuracy/std": 0.03125202885712506, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2074.5, "completions/max_terminated_length": 2074.5, "completions/mean_length": 274.806640625, "completions/mean_terminated_length": 275.9952926635742, "completions/min_length": 69.125, "completions/min_terminated_length": 96.125, "epoch": 0.06526315789473684, "grad_norm": 0.024989839643239975, "learning_rate": 6.249999999999999e-07, "loss": 0.0194, "num_tokens": 15742340.0, "reward": 1.3568423390388489, "reward_std": 0.47142963856458664, "rewards/format_reward_embodied/mean": 0.947265625, "rewards/format_reward_embodied/std": 0.21802780404686928, "rewards/stop_prediction_reward/mean": 0.388671875, "rewards/stop_prediction_reward/std": 0.3706877864897251, "rewards/waypoint_pred_accuracy/mean": 0.010452428944169867, "rewards/waypoint_pred_accuracy/std": 0.04323142563315496, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2043.625, "completions/max_terminated_length": 2043.625, "completions/mean_length": 281.189453125, "completions/mean_terminated_length": 281.7291069030762, "completions/min_length": 93.0, "completions/min_terminated_length": 105.75, "epoch": 0.06736842105263158, "grad_norm": 0.02739080600440502, "learning_rate": 6.458333333333333e-07, "loss": 0.0224, "num_tokens": 16274597.0, "reward": 1.139307051897049, "reward_std": 0.46579157933592796, "rewards/format_reward_embodied/mean": 0.9453125, "rewards/format_reward_embodied/std": 0.2243238165974617, "rewards/stop_prediction_reward/mean": 0.189453125, "rewards/stop_prediction_reward/std": 0.39086980000138283, "rewards/waypoint_pred_accuracy/mean": 0.0022707147145411, "rewards/waypoint_pred_accuracy/std": 0.013931056426372379, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 753.75, "completions/max_terminated_length": 753.75, "completions/mean_length": 243.2890625, "completions/mean_terminated_length": 243.79483032226562, "completions/min_length": 93.125, "completions/min_terminated_length": 106.0, "epoch": 0.06947368421052631, "grad_norm": 0.025134088471531868, "learning_rate": 6.666666666666666e-07, "loss": 0.0046, "num_tokens": 16789753.0, "reward": 1.5594421923160553, "reward_std": 0.4511796310544014, "rewards/format_reward_embodied/mean": 0.97265625, "rewards/format_reward_embodied/std": 0.14868677593767643, "rewards/stop_prediction_reward/mean": 0.5859375, "rewards/stop_prediction_reward/std": 0.4054722301661968, "rewards/waypoint_pred_accuracy/mean": 0.00042422344501604533, "rewards/waypoint_pred_accuracy/std": 0.0026550625844949066, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2006.625, "completions/max_terminated_length": 2006.625, "completions/mean_length": 290.732421875, "completions/mean_terminated_length": 291.87425994873047, "completions/min_length": 76.25, "completions/min_terminated_length": 109.125, "epoch": 0.07157894736842105, "grad_norm": 0.02658732235431671, "learning_rate": 6.875e-07, "loss": 0.0223, "num_tokens": 17327152.0, "reward": 1.3722179979085922, "reward_std": 0.49062684178352356, "rewards/format_reward_embodied/mean": 0.962890625, "rewards/format_reward_embodied/std": 0.17316900379955769, "rewards/stop_prediction_reward/mean": 0.392578125, "rewards/stop_prediction_reward/std": 0.43504052981734276, "rewards/waypoint_pred_accuracy/mean": 0.008374628147294061, "rewards/waypoint_pred_accuracy/std": 0.02515958553613018, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 1412.75, "completions/max_terminated_length": 1412.75, "completions/mean_length": 272.447265625, "completions/mean_terminated_length": 274.3158702850342, "completions/min_length": 69.375, "completions/min_terminated_length": 110.75, "epoch": 0.07368421052631578, "grad_norm": 0.025512199848890305, "learning_rate": 7.083333333333334e-07, "loss": 0.0185, "num_tokens": 17857749.0, "reward": 1.378344789147377, "reward_std": 0.5315527282655239, "rewards/format_reward_embodied/mean": 0.953125, "rewards/format_reward_embodied/std": 0.20892741158604622, "rewards/stop_prediction_reward/mean": 0.419921875, "rewards/stop_prediction_reward/std": 0.46176647394895554, "rewards/waypoint_pred_accuracy/mean": 0.0026489564064510387, "rewards/waypoint_pred_accuracy/std": 0.0209440600764698, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1762.25, "completions/max_terminated_length": 1762.25, "completions/mean_length": 300.3046875, "completions/mean_terminated_length": 302.0868148803711, "completions/min_length": 72.0, "completions/min_terminated_length": 110.625, "epoch": 0.07578947368421053, "grad_norm": 0.030284173786640167, "learning_rate": 7.291666666666666e-07, "loss": 0.0228, "num_tokens": 18405233.0, "reward": 1.5107819437980652, "reward_std": 0.5866557471454144, "rewards/format_reward_embodied/mean": 0.935546875, "rewards/format_reward_embodied/std": 0.23654338158667088, "rewards/stop_prediction_reward/mean": 0.529296875, "rewards/stop_prediction_reward/std": 0.44718513265252113, "rewards/waypoint_pred_accuracy/mean": 0.022969108418879985, "rewards/waypoint_pred_accuracy/std": 0.04822167293027549, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 824.125, "completions/max_terminated_length": 824.125, "completions/mean_length": 268.046875, "completions/mean_terminated_length": 268.57704162597656, "completions/min_length": 97.25, "completions/min_terminated_length": 112.625, "epoch": 0.07789473684210527, "grad_norm": 0.022956551983952522, "learning_rate": 7.5e-07, "loss": 0.0041, "num_tokens": 18931657.0, "reward": 1.4823229908943176, "reward_std": 0.5730771608650684, "rewards/format_reward_embodied/mean": 0.96484375, "rewards/format_reward_embodied/std": 0.15371971018612385, "rewards/stop_prediction_reward/mean": 0.39453125, "rewards/stop_prediction_reward/std": 0.4760393425822258, "rewards/waypoint_pred_accuracy/mean": 0.06147399850306101, "rewards/waypoint_pred_accuracy/std": 0.0798248762730509, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1849.875, "completions/max_terminated_length": 1849.875, "completions/mean_length": 277.376953125, "completions/mean_terminated_length": 277.82732009887695, "completions/min_length": 93.5, "completions/min_terminated_length": 107.25, "epoch": 0.08, "grad_norm": 0.03779571130871773, "learning_rate": 7.708333333333333e-07, "loss": 0.0167, "num_tokens": 19464138.0, "reward": 1.6509281545877457, "reward_std": 0.6500044874846935, "rewards/format_reward_embodied/mean": 0.947265625, "rewards/format_reward_embodied/std": 0.22269471548497677, "rewards/stop_prediction_reward/mean": 0.5703125, "rewards/stop_prediction_reward/std": 0.46316269040107727, "rewards/waypoint_pred_accuracy/mean": 0.0666750292330382, "rewards/waypoint_pred_accuracy/std": 0.13379900066650008, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1376.25, "completions/max_terminated_length": 1376.25, "completions/mean_length": 274.71875, "completions/mean_terminated_length": 275.6487522125244, "completions/min_length": 96.0, "completions/min_terminated_length": 109.25, "epoch": 0.08210526315789474, "grad_norm": 0.03399224206805229, "learning_rate": 7.916666666666666e-07, "loss": 0.0112, "num_tokens": 19995898.0, "reward": 1.4387513846158981, "reward_std": 0.526082057505846, "rewards/format_reward_embodied/mean": 0.955078125, "rewards/format_reward_embodied/std": 0.20178878121078014, "rewards/stop_prediction_reward/mean": 0.482421875, "rewards/stop_prediction_reward/std": 0.4750334359705448, "rewards/waypoint_pred_accuracy/mean": 0.0006257013071992472, "rewards/waypoint_pred_accuracy/std": 0.0034726200243562974, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.5, "completions/max_terminated_length": 1890.5, "completions/mean_length": 289.318359375, "completions/mean_terminated_length": 289.318359375, "completions/min_length": 108.125, "completions/min_terminated_length": 108.125, "epoch": 0.08421052631578947, "grad_norm": 0.05279357358813286, "learning_rate": 8.125e-07, "loss": 0.021, "num_tokens": 20538205.0, "reward": 1.4992299228906631, "reward_std": 0.5350005514919758, "rewards/format_reward_embodied/mean": 0.96875, "rewards/format_reward_embodied/std": 0.14104433543980122, "rewards/stop_prediction_reward/mean": 0.474609375, "rewards/stop_prediction_reward/std": 0.48077963665127754, "rewards/waypoint_pred_accuracy/mean": 0.02793527340469384, "rewards/waypoint_pred_accuracy/std": 0.04508495466211591, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1191.75, "completions/max_terminated_length": 1191.75, "completions/mean_length": 257.849609375, "completions/mean_terminated_length": 258.3867835998535, "completions/min_length": 93.0, "completions/min_terminated_length": 107.0, "epoch": 0.0863157894736842, "grad_norm": 0.0242131520062685, "learning_rate": 8.333333333333333e-07, "loss": 0.0081, "num_tokens": 21060752.0, "reward": 1.451779067516327, "reward_std": 0.5482931584119797, "rewards/format_reward_embodied/mean": 0.97265625, "rewards/format_reward_embodied/std": 0.16118930466473103, "rewards/stop_prediction_reward/mean": 0.455078125, "rewards/stop_prediction_reward/std": 0.4890851750969887, "rewards/waypoint_pred_accuracy/mean": 0.012022334193716233, "rewards/waypoint_pred_accuracy/std": 0.05236113088176353, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1946.75, "completions/max_terminated_length": 1946.75, "completions/mean_length": 277.97265625, "completions/mean_terminated_length": 278.5504722595215, "completions/min_length": 98.375, "completions/min_terminated_length": 113.25, "epoch": 0.08842105263157894, "grad_norm": 0.02777104079723358, "learning_rate": 8.541666666666666e-07, "loss": 0.0179, "num_tokens": 21591234.0, "reward": 1.4967730790376663, "reward_std": 0.5411636978387833, "rewards/format_reward_embodied/mean": 0.95703125, "rewards/format_reward_embodied/std": 0.20193831622600555, "rewards/stop_prediction_reward/mean": 0.529296875, "rewards/stop_prediction_reward/std": 0.48042263463139534, "rewards/waypoint_pred_accuracy/mean": 0.005222481389856159, "rewards/waypoint_pred_accuracy/std": 0.01637772589662894, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1507.875, "completions/max_terminated_length": 1507.875, "completions/mean_length": 284.642578125, "completions/mean_terminated_length": 285.1566848754883, "completions/min_length": 100.25, "completions/min_terminated_length": 112.625, "epoch": 0.09052631578947369, "grad_norm": 0.023667719215154648, "learning_rate": 8.75e-07, "loss": 0.0097, "num_tokens": 22125899.0, "reward": 1.4648699462413788, "reward_std": 0.5257757790386677, "rewards/format_reward_embodied/mean": 0.978515625, "rewards/format_reward_embodied/std": 0.13138550892472267, "rewards/stop_prediction_reward/mean": 0.484375, "rewards/stop_prediction_reward/std": 0.49612294882535934, "rewards/waypoint_pred_accuracy/mean": 0.0009896594916386676, "rewards/waypoint_pred_accuracy/std": 0.00240541340038158, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1049.625, "completions/max_terminated_length": 1049.625, "completions/mean_length": 262.294921875, "completions/mean_terminated_length": 263.2274341583252, "completions/min_length": 90.625, "completions/min_terminated_length": 120.5, "epoch": 0.09263157894736843, "grad_norm": 0.02323044091463089, "learning_rate": 8.958333333333334e-07, "loss": 0.009, "num_tokens": 22650210.0, "reward": 1.5325981974601746, "reward_std": 0.5178802609443665, "rewards/format_reward_embodied/mean": 0.9765625, "rewards/format_reward_embodied/std": 0.1501840502023697, "rewards/stop_prediction_reward/mean": 0.55078125, "rewards/stop_prediction_reward/std": 0.47454017773270607, "rewards/waypoint_pred_accuracy/mean": 0.002627218333572107, "rewards/waypoint_pred_accuracy/std": 0.0178613198091855, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1067.625, "completions/max_terminated_length": 1067.625, "completions/mean_length": 261.609375, "completions/mean_terminated_length": 262.7379150390625, "completions/min_length": 80.75, "completions/min_terminated_length": 107.125, "epoch": 0.09473684210526316, "grad_norm": 0.025584101676940918, "learning_rate": 9.166666666666665e-07, "loss": 0.0049, "num_tokens": 23174490.0, "reward": 1.5120439231395721, "reward_std": 0.5698215290904045, "rewards/format_reward_embodied/mean": 0.970703125, "rewards/format_reward_embodied/std": 0.1432244125753641, "rewards/stop_prediction_reward/mean": 0.48046875, "rewards/stop_prediction_reward/std": 0.49686969071626663, "rewards/waypoint_pred_accuracy/mean": 0.030436025275711542, "rewards/waypoint_pred_accuracy/std": 0.04808906307854464, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 1311.875, "completions/max_terminated_length": 1311.875, "completions/mean_length": 260.18359375, "completions/mean_terminated_length": 261.09471702575684, "completions/min_length": 80.875, "completions/min_terminated_length": 107.125, "epoch": 0.0968421052631579, "grad_norm": 0.02259247750043869, "learning_rate": 9.374999999999999e-07, "loss": 0.009, "num_tokens": 23697464.0, "reward": 1.5787278413772583, "reward_std": 0.5821966156363487, "rewards/format_reward_embodied/mean": 0.962890625, "rewards/format_reward_embodied/std": 0.17129190266132355, "rewards/stop_prediction_reward/mean": 0.560546875, "rewards/stop_prediction_reward/std": 0.4862819127738476, "rewards/waypoint_pred_accuracy/mean": 0.027645181453438732, "rewards/waypoint_pred_accuracy/std": 0.052043586321354925, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 1217.375, "completions/max_terminated_length": 1217.375, "completions/mean_length": 258.544921875, "completions/mean_terminated_length": 261.1987762451172, "completions/min_length": 60.375, "completions/min_terminated_length": 106.25, "epoch": 0.09894736842105263, "grad_norm": 0.025578564032912254, "learning_rate": 9.583333333333334e-07, "loss": 0.0071, "num_tokens": 24219855.0, "reward": 1.7763752341270447, "reward_std": 0.558862715959549, "rewards/format_reward_embodied/mean": 0.9765625, "rewards/format_reward_embodied/std": 0.13683890365064144, "rewards/stop_prediction_reward/mean": 0.6796875, "rewards/stop_prediction_reward/std": 0.4396868348121643, "rewards/waypoint_pred_accuracy/mean": 0.060062614770686196, "rewards/waypoint_pred_accuracy/std": 0.09472785080470203, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 789.75, "completions/max_terminated_length": 789.75, "completions/mean_length": 250.767578125, "completions/mean_terminated_length": 251.30236434936523, "completions/min_length": 100.5, "completions/min_terminated_length": 114.25, "epoch": 0.10105263157894737, "grad_norm": 0.04857382923364639, "learning_rate": 9.791666666666667e-07, "loss": 0.005, "num_tokens": 24738392.0, "reward": 1.6715045720338821, "reward_std": 0.5651321746408939, "rewards/format_reward_embodied/mean": 0.986328125, "rewards/format_reward_embodied/std": 0.08913025446236134, "rewards/stop_prediction_reward/mean": 0.59375, "rewards/stop_prediction_reward/std": 0.475710678845644, "rewards/waypoint_pred_accuracy/mean": 0.04571322910890221, "rewards/waypoint_pred_accuracy/std": 0.09239658995958085, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1712.25, "completions/max_terminated_length": 1712.25, "completions/mean_length": 264.029296875, "completions/mean_terminated_length": 264.6627616882324, "completions/min_length": 96.375, "completions/min_terminated_length": 112.25, "epoch": 0.1031578947368421, "grad_norm": 0.02499283291399479, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 25266791.0, "reward": 1.6089884787797928, "reward_std": 0.4395485632121563, "rewards/format_reward_embodied/mean": 0.984375, "rewards/format_reward_embodied/std": 0.09701303765177727, "rewards/stop_prediction_reward/mean": 0.62109375, "rewards/stop_prediction_reward/std": 0.4166187085211277, "rewards/waypoint_pred_accuracy/mean": 0.0017598633744550646, "rewards/waypoint_pred_accuracy/std": 0.014041802557660181, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 782.25, "completions/max_terminated_length": 782.25, "completions/mean_length": 241.578125, "completions/mean_terminated_length": 242.62267875671387, "completions/min_length": 81.25, "completions/min_terminated_length": 110.375, "epoch": 0.10526315789473684, "grad_norm": 0.026004238054156303, "learning_rate": 9.999878206375666e-07, "loss": 0.0042, "num_tokens": 25780943.0, "reward": 1.752969428896904, "reward_std": 0.5163040142506361, "rewards/format_reward_embodied/mean": 0.98046875, "rewards/format_reward_embodied/std": 0.11734727956354618, "rewards/stop_prediction_reward/mean": 0.619140625, "rewards/stop_prediction_reward/std": 0.3831787258386612, "rewards/waypoint_pred_accuracy/mean": 0.07668001217677028, "rewards/waypoint_pred_accuracy/std": 0.09573968059240912, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 1246.5, "completions/max_terminated_length": 1246.5, "completions/mean_length": 256.455078125, "completions/mean_terminated_length": 258.2095832824707, "completions/min_length": 84.375, "completions/min_terminated_length": 116.0, "epoch": 0.10736842105263159, "grad_norm": 0.022528601810336113, "learning_rate": 9.999512832095417e-07, "loss": 0.0083, "num_tokens": 26303096.0, "reward": 1.688169151544571, "reward_std": 0.4684153348207474, "rewards/format_reward_embodied/mean": 0.97265625, "rewards/format_reward_embodied/std": 0.14541476964950562, "rewards/stop_prediction_reward/mean": 0.693359375, "rewards/stop_prediction_reward/std": 0.4177774414420128, "rewards/waypoint_pred_accuracy/mean": 0.01107676944522796, "rewards/waypoint_pred_accuracy/std": 0.01949074265544499, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 780.625, "completions/max_terminated_length": 780.625, "completions/mean_length": 238.931640625, "completions/mean_terminated_length": 239.8865966796875, "completions/min_length": 82.25, "completions/min_terminated_length": 110.25, "epoch": 0.10947368421052632, "grad_norm": 0.023264339193701744, "learning_rate": 9.998903896937148e-07, "loss": 0.0055, "num_tokens": 26815445.0, "reward": 1.7764329463243484, "reward_std": 0.4156668558716774, "rewards/format_reward_embodied/mean": 0.98046875, "rewards/format_reward_embodied/std": 0.1080182921141386, "rewards/stop_prediction_reward/mean": 0.765625, "rewards/stop_prediction_reward/std": 0.3569933623075485, "rewards/waypoint_pred_accuracy/mean": 0.015169612161131564, "rewards/waypoint_pred_accuracy/std": 0.038644970976866665, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 708.375, "completions/max_terminated_length": 708.375, "completions/mean_length": 240.755859375, "completions/mean_terminated_length": 243.21048164367676, "completions/min_length": 44.0, "completions/min_terminated_length": 111.75, "epoch": 0.11157894736842106, "grad_norm": 0.021222949028015137, "learning_rate": 9.998051433862818e-07, "loss": 0.0005, "num_tokens": 27328472.0, "reward": 1.6367684453725815, "reward_std": 0.4527711495757103, "rewards/format_reward_embodied/mean": 0.98046875, "rewards/format_reward_embodied/std": 0.12667626701295376, "rewards/stop_prediction_reward/mean": 0.654296875, "rewards/stop_prediction_reward/std": 0.4042268693447113, "rewards/waypoint_pred_accuracy/mean": 0.001001412993597487, "rewards/waypoint_pred_accuracy/std": 0.006711796085221954, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 1021.625, "completions/max_terminated_length": 1021.625, "completions/mean_length": 235.0859375, "completions/mean_terminated_length": 235.52288055419922, "completions/min_length": 88.375, "completions/min_terminated_length": 102.25, "epoch": 0.11368421052631579, "grad_norm": 0.08888416737318039, "learning_rate": 9.996955489016681e-07, "loss": 0.0061, "num_tokens": 27839236.0, "reward": 1.6098358184099197, "reward_std": 0.4157105013728142, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.537109375, "rewards/stop_prediction_reward/std": 0.32417640648782253, "rewards/waypoint_pred_accuracy/mean": 0.04026949138372071, "rewards/waypoint_pred_accuracy/std": 0.08703815758703826, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 645.875, "completions/max_terminated_length": 645.875, "completions/mean_length": 232.65625, "completions/mean_terminated_length": 233.95863342285156, "completions/min_length": 82.625, "completions/min_terminated_length": 108.5, "epoch": 0.11578947368421053, "grad_norm": 0.022888371720910072, "learning_rate": 9.995616121722783e-07, "loss": 0.0047, "num_tokens": 28348308.0, "reward": 1.4843996465206146, "reward_std": 0.3909701108932495, "rewards/format_reward_embodied/mean": 0.9765625, "rewards/format_reward_embodied/std": 0.13525213301181793, "rewards/stop_prediction_reward/mean": 0.5078125, "rewards/stop_prediction_reward/std": 0.3503939900547266, "rewards/waypoint_pred_accuracy/mean": 1.2323282025034152e-05, "rewards/waypoint_pred_accuracy/std": 8.679057491278136e-05, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 230.517578125, "completions/mean_terminated_length": 230.517578125, "completions/min_length": 109.625, "completions/min_terminated_length": 109.625, "epoch": 0.11789473684210526, "grad_norm": 0.02565157040953636, "learning_rate": 9.994033404481736e-07, "loss": 0.0049, "num_tokens": 28854301.0, "reward": 1.718205749988556, "reward_std": 0.3056327272206545, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.07509202510118484, "rewards/stop_prediction_reward/mean": 0.69140625, "rewards/stop_prediction_reward/std": 0.2528537716716528, "rewards/waypoint_pred_accuracy/mean": 0.019259145094103945, "rewards/waypoint_pred_accuracy/std": 0.033180712862469104, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 874.625, "completions/max_terminated_length": 874.625, "completions/mean_length": 225.9609375, "completions/mean_terminated_length": 228.2757396697998, "completions/min_length": 34.875, "completions/min_terminated_length": 100.25, "epoch": 0.12, "grad_norm": 0.04665112867951393, "learning_rate": 9.992207422966824e-07, "loss": 0.0022, "num_tokens": 29361673.0, "reward": 1.416211023926735, "reward_std": 0.36958669498562813, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.08442101255059242, "rewards/stop_prediction_reward/mean": 0.423828125, "rewards/stop_prediction_reward/std": 0.34239685721695423, "rewards/waypoint_pred_accuracy/mean": 0.0020508230609024296, "rewards/waypoint_pred_accuracy/std": 0.00960279977375578, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 730.625, "completions/max_terminated_length": 730.625, "completions/mean_length": 217.24609375, "completions/mean_terminated_length": 217.7291374206543, "completions/min_length": 84.625, "completions/min_terminated_length": 100.0, "epoch": 0.12210526315789473, "grad_norm": 0.021573517471551895, "learning_rate": 9.990138276019335e-07, "loss": 0.0033, "num_tokens": 29862663.0, "reward": 1.6188426464796066, "reward_std": 0.361522875726223, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.06879601255059242, "rewards/stop_prediction_reward/mean": 0.599609375, "rewards/stop_prediction_reward/std": 0.30805824138224125, "rewards/waypoint_pred_accuracy/mean": 0.014499449071699644, "rewards/waypoint_pred_accuracy/std": 0.03686596512818596, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 219.20703125, "completions/mean_terminated_length": 219.6153907775879, "completions/min_length": 90.0, "completions/min_terminated_length": 102.25, "epoch": 0.12421052631578948, "grad_norm": 0.030738674104213715, "learning_rate": 9.987826075643228e-07, "loss": 0.0005, "num_tokens": 30364145.0, "reward": 1.6070083975791931, "reward_std": 0.3145763948559761, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.607421875, "rewards/stop_prediction_reward/std": 0.3078697919845581, "rewards/waypoint_pred_accuracy/mean": 0.0007698313043051179, "rewards/waypoint_pred_accuracy/std": 0.003213268306107381, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 430.25, "completions/max_terminated_length": 430.25, "completions/mean_length": 227.884765625, "completions/mean_terminated_length": 228.35980987548828, "completions/min_length": 91.625, "completions/min_terminated_length": 106.25, "epoch": 0.12631578947368421, "grad_norm": 0.0346749946475029, "learning_rate": 9.985270946999066e-07, "loss": -0.0, "num_tokens": 30872630.0, "reward": 1.5473660081624985, "reward_std": 0.3273146338760853, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.51171875, "rewards/stop_prediction_reward/std": 0.2723542097955942, "rewards/waypoint_pred_accuracy/mean": 0.020753327865904253, "rewards/waypoint_pred_accuracy/std": 0.04440359811712844, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 215.94140625, "completions/mean_terminated_length": 216.40008163452148, "completions/min_length": 93.625, "completions/min_terminated_length": 107.625, "epoch": 0.12842105263157894, "grad_norm": 0.02391301840543747, "learning_rate": 9.982473028397236e-07, "loss": -0.0007, "num_tokens": 31374040.0, "reward": 1.980911210179329, "reward_std": 0.36170641146600246, "rewards/format_reward_embodied/mean": 0.98828125, "rewards/format_reward_embodied/std": 0.08442101255059242, "rewards/stop_prediction_reward/mean": 0.92578125, "rewards/stop_prediction_reward/std": 0.25123921409249306, "rewards/waypoint_pred_accuracy/mean": 0.033424367617953976, "rewards/waypoint_pred_accuracy/std": 0.0665561274972788, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 230.134765625, "completions/mean_terminated_length": 230.134765625, "completions/min_length": 101.875, "completions/min_terminated_length": 101.875, "epoch": 0.13052631578947368, "grad_norm": 0.028065558522939682, "learning_rate": 9.979432471290472e-07, "loss": 0.0077, "num_tokens": 31882013.0, "reward": 1.9677928388118744, "reward_std": 0.3956281915307045, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.841796875, "rewards/stop_prediction_reward/std": 0.2814692761749029, "rewards/waypoint_pred_accuracy/mean": 0.06592767706888156, "rewards/waypoint_pred_accuracy/std": 0.08086618531427625, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 577.75, "completions/max_terminated_length": 577.75, "completions/mean_length": 206.138671875, "completions/mean_terminated_length": 206.54464530944824, "completions/min_length": 89.25, "completions/min_terminated_length": 102.375, "epoch": 0.13263157894736843, "grad_norm": 0.034105412662029266, "learning_rate": 9.97614944026565e-07, "loss": 0.0031, "num_tokens": 32378340.0, "reward": 1.6257891356945038, "reward_std": 0.2507903631776571, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.62890625, "rewards/stop_prediction_reward/std": 0.24271228723227978, "rewards/waypoint_pred_accuracy/mean": 0.00039456591977750935, "rewards/waypoint_pred_accuracy/std": 0.0031565275905959434, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 423.125, "completions/max_terminated_length": 423.125, "completions/mean_length": 208.775390625, "completions/mean_terminated_length": 209.57437705993652, "completions/min_length": 77.75, "completions/min_terminated_length": 100.25, "epoch": 0.13473684210526315, "grad_norm": 0.02212904579937458, "learning_rate": 9.97262411303488e-07, "loss": -0.0014, "num_tokens": 32876401.0, "reward": 1.668052390217781, "reward_std": 0.29572467878460884, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.043842025101184845, "rewards/stop_prediction_reward/mean": 0.646484375, "rewards/stop_prediction_reward/std": 0.25251164473593235, "rewards/waypoint_pred_accuracy/mean": 0.014690272541894046, "rewards/waypoint_pred_accuracy/std": 0.03458334467003421, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 930.875, "completions/max_terminated_length": 930.875, "completions/mean_length": 216.072265625, "completions/mean_terminated_length": 216.53704833984375, "completions/min_length": 89.625, "completions/min_terminated_length": 101.625, "epoch": 0.1368421052631579, "grad_norm": 0.03168540447950363, "learning_rate": 9.968856680425886e-07, "loss": 0.0082, "num_tokens": 33375318.0, "reward": 1.8482034504413605, "reward_std": 0.26073482632637024, "rewards/format_reward_embodied/mean": 0.990234375, "rewards/format_reward_embodied/std": 0.06879601255059242, "rewards/stop_prediction_reward/mean": 0.8359375, "rewards/stop_prediction_reward/std": 0.21882940270006657, "rewards/waypoint_pred_accuracy/mean": 0.01101580195712165, "rewards/waypoint_pred_accuracy/std": 0.027152883159087076, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 211.0625, "completions/mean_terminated_length": 211.53580856323242, "completions/min_length": 89.25, "completions/min_terminated_length": 102.625, "epoch": 0.13894736842105262, "grad_norm": 0.01921886019408703, "learning_rate": 9.964847346371676e-07, "loss": 0.0044, "num_tokens": 33871222.0, "reward": 1.8809855580329895, "reward_std": 0.27521774359047413, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.042255254462361336, "rewards/stop_prediction_reward/mean": 0.84765625, "rewards/stop_prediction_reward/std": 0.20578788220882416, "rewards/waypoint_pred_accuracy/mean": 0.020570909882129118, "rewards/waypoint_pred_accuracy/std": 0.04058388972286174, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.875, "completions/max_terminated_length": 378.875, "completions/mean_length": 203.333984375, "completions/mean_terminated_length": 203.333984375, "completions/min_length": 104.375, "completions/min_terminated_length": 104.375, "epoch": 0.14105263157894737, "grad_norm": 0.0201885849237442, "learning_rate": 9.96059632789951e-07, "loss": -0.0001, "num_tokens": 34365345.0, "reward": 1.6326926052570343, "reward_std": 0.3391464538872242, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.53515625, "rewards/stop_prediction_reward/std": 0.26009298861026764, "rewards/waypoint_pred_accuracy/mean": 0.04974474538876031, "rewards/waypoint_pred_accuracy/std": 0.05710710766667135, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.5, "completions/max_terminated_length": 838.5, "completions/mean_length": 204.509765625, "completions/mean_terminated_length": 204.509765625, "completions/min_length": 101.25, "completions/min_terminated_length": 101.25, "epoch": 0.1431578947368421, "grad_norm": 0.027328945696353912, "learning_rate": 9.956103855119138e-07, "loss": 0.0118, "num_tokens": 34860198.0, "reward": 1.7429087907075882, "reward_std": 0.3206034153699875, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.73828125, "rewards/stop_prediction_reward/std": 0.2987744938582182, "rewards/waypoint_pred_accuracy/mean": 0.0032903402545485416, "rewards/waypoint_pred_accuracy/std": 0.015942630753571323, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 424.5, "completions/max_terminated_length": 424.5, "completions/mean_length": 205.94140625, "completions/mean_terminated_length": 206.31795692443848, "completions/min_length": 88.125, "completions/min_terminated_length": 101.75, "epoch": 0.14526315789473684, "grad_norm": 0.018343588337302208, "learning_rate": 9.951370171210359e-07, "loss": 0.0006, "num_tokens": 35356168.0, "reward": 1.7270590364933014, "reward_std": 0.26503213308751583, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.728515625, "rewards/stop_prediction_reward/std": 0.24761163257062435, "rewards/waypoint_pred_accuracy/mean": 0.0012248406576317328, "rewards/waypoint_pred_accuracy/std": 0.004542221148686524, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.875, "completions/max_terminated_length": 461.875, "completions/mean_length": 211.755859375, "completions/mean_terminated_length": 211.755859375, "completions/min_length": 105.625, "completions/min_terminated_length": 105.625, "epoch": 0.14736842105263157, "grad_norm": 0.0311113428324461, "learning_rate": 9.946395532409847e-07, "loss": 0.003, "num_tokens": 35854795.0, "reward": 1.8440569341182709, "reward_std": 0.2056777123361826, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.833984375, "rewards/stop_prediction_reward/std": 0.18128339014947414, "rewards/waypoint_pred_accuracy/mean": 0.0060128392641496066, "rewards/waypoint_pred_accuracy/std": 0.014838929497395702, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 383.625, "completions/max_terminated_length": 383.625, "completions/mean_length": 197.767578125, "completions/mean_terminated_length": 198.14220809936523, "completions/min_length": 88.0, "completions/min_terminated_length": 100.0, "epoch": 0.14947368421052631, "grad_norm": 0.03061249665915966, "learning_rate": 9.941180207997288e-07, "loss": 0.0012, "num_tokens": 36345108.0, "reward": 1.8805700987577438, "reward_std": 0.21998493559658527, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.16972095146775246, "rewards/waypoint_pred_accuracy/mean": 0.006691301724458754, "rewards/waypoint_pred_accuracy/std": 0.03666904416240868, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 351.75, "completions/max_terminated_length": 351.75, "completions/mean_length": 185.158203125, "completions/mean_terminated_length": 185.51720809936523, "completions/min_length": 81.875, "completions/min_terminated_length": 92.125, "epoch": 0.15157894736842106, "grad_norm": 0.02365231141448021, "learning_rate": 9.935724480280795e-07, "loss": -0.0009, "num_tokens": 36831397.0, "reward": 1.8188596963882446, "reward_std": 0.3578371275216341, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.720703125, "rewards/stop_prediction_reward/std": 0.2518173884600401, "rewards/waypoint_pred_accuracy/mean": 0.05103141432846315, "rewards/waypoint_pred_accuracy/std": 0.07095948743517516, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.625, "completions/max_terminated_length": 358.625, "completions/mean_length": 188.61328125, "completions/mean_terminated_length": 188.61328125, "completions/min_length": 101.625, "completions/min_terminated_length": 101.625, "epoch": 0.15368421052631578, "grad_norm": 0.025174804031848907, "learning_rate": 9.93002864458164e-07, "loss": -0.0004, "num_tokens": 37314783.0, "reward": 1.753688931465149, "reward_std": 0.20815912820398808, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.75390625, "rewards/stop_prediction_reward/std": 0.19735873863101006, "rewards/waypoint_pred_accuracy/mean": 0.0008679154927271392, "rewards/waypoint_pred_accuracy/std": 0.006068133080992454, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 356.75, "completions/max_terminated_length": 356.75, "completions/mean_length": 192.439453125, "completions/mean_terminated_length": 192.8411464691162, "completions/min_length": 82.625, "completions/min_terminated_length": 93.5, "epoch": 0.15578947368421053, "grad_norm": 0.017899736762046814, "learning_rate": 9.924093009218252e-07, "loss": -0.0003, "num_tokens": 37800640.0, "reward": 1.5418598055839539, "reward_std": 0.31721452437341213, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.5234375, "rewards/stop_prediction_reward/std": 0.29032695107162, "rewards/waypoint_pred_accuracy/mean": 0.012140836188299201, "rewards/waypoint_pred_accuracy/std": 0.017781355033130546, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.375, "completions/max_terminated_length": 373.375, "completions/mean_length": 193.908203125, "completions/mean_terminated_length": 193.908203125, "completions/min_length": 91.75, "completions/min_terminated_length": 91.75, "epoch": 0.15789473684210525, "grad_norm": 0.09963640570640564, "learning_rate": 9.917917895489542e-07, "loss": 0.0009, "num_tokens": 38289489.0, "reward": 1.7774494141340256, "reward_std": 0.22092761541716754, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.7265625, "rewards/stop_prediction_reward/std": 0.19099510088562965, "rewards/waypoint_pred_accuracy/mean": 0.026420030107956904, "rewards/waypoint_pred_accuracy/std": 0.02624278316289974, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 185.908203125, "completions/mean_terminated_length": 186.30822372436523, "completions/min_length": 81.0, "completions/min_terminated_length": 96.125, "epoch": 0.16, "grad_norm": 0.02129918336868286, "learning_rate": 9.9115036376575e-07, "loss": -0.0006, "num_tokens": 38774114.0, "reward": 1.41980442404747, "reward_std": 0.24889344349503517, "rewards/format_reward_embodied/mean": 0.994140625, "rewards/format_reward_embodied/std": 0.046875, "rewards/stop_prediction_reward/mean": 0.423828125, "rewards/stop_prediction_reward/std": 0.22381583414971828, "rewards/waypoint_pred_accuracy/mean": 0.0009178373624600936, "rewards/waypoint_pred_accuracy/std": 0.0070393299319626125, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.125, "completions/max_terminated_length": 348.125, "completions/mean_length": 193.267578125, "completions/mean_terminated_length": 193.267578125, "completions/min_length": 96.25, "completions/min_terminated_length": 96.25, "epoch": 0.16210526315789472, "grad_norm": 0.01985691301524639, "learning_rate": 9.904850582929109e-07, "loss": 0.0008, "num_tokens": 39264811.0, "reward": 1.7604794949293137, "reward_std": 0.3248732276260853, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.634765625, "rewards/stop_prediction_reward/std": 0.24500796012580395, "rewards/waypoint_pred_accuracy/mean": 0.06285692949041849, "rewards/waypoint_pred_accuracy/std": 0.06050711135693543, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.625, "completions/max_terminated_length": 398.625, "completions/mean_length": 198.037109375, "completions/mean_terminated_length": 198.037109375, "completions/min_length": 98.125, "completions/min_terminated_length": 98.125, "epoch": 0.16421052631578947, "grad_norm": 0.0205206498503685, "learning_rate": 9.897959091437545e-07, "loss": 0.0007, "num_tokens": 39753790.0, "reward": 1.78813037276268, "reward_std": 0.2303650790709071, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.1705988198518753, "rewards/waypoint_pred_accuracy/mean": 0.021994864640540634, "rewards/waypoint_pred_accuracy/std": 0.032827818689523065, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.875, "completions/max_terminated_length": 370.875, "completions/mean_length": 182.419921875, "completions/mean_terminated_length": 182.419921875, "completions/min_length": 97.25, "completions/min_terminated_length": 97.25, "epoch": 0.16631578947368422, "grad_norm": 0.02171274460852146, "learning_rate": 9.890829536222686e-07, "loss": 0.0006, "num_tokens": 40236501.0, "reward": 1.7775225639343262, "reward_std": 0.2283242531120777, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.755859375, "rewards/stop_prediction_reward/std": 0.18596058152616024, "rewards/waypoint_pred_accuracy/mean": 0.010831597135309401, "rewards/waypoint_pred_accuracy/std": 0.030910690631162474, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 356.125, "completions/max_terminated_length": 356.125, "completions/mean_length": 177.482421875, "completions/mean_terminated_length": 177.7895278930664, "completions/min_length": 85.0, "completions/min_terminated_length": 97.5, "epoch": 0.16842105263157894, "grad_norm": 0.01832726038992405, "learning_rate": 9.88346230321092e-07, "loss": -0.0013, "num_tokens": 40718156.0, "reward": 1.7886408120393753, "reward_std": 0.2419158397242427, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.18826745636761189, "rewards/waypoint_pred_accuracy/mean": 0.022250097567066918, "rewards/waypoint_pred_accuracy/std": 0.04195201856602874, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 176.24609375, "completions/mean_terminated_length": 176.65014457702637, "completions/min_length": 79.0, "completions/min_terminated_length": 93.25, "epoch": 0.1705263157894737, "grad_norm": 0.058526039123535156, "learning_rate": 9.875857791194251e-07, "loss": -0.0005, "num_tokens": 41197514.0, "reward": 1.8776662796735764, "reward_std": 0.3414868116378784, "rewards/format_reward_embodied/mean": 0.9921875, "rewards/format_reward_embodied/std": 0.05317101255059242, "rewards/stop_prediction_reward/mean": 0.828125, "rewards/stop_prediction_reward/std": 0.24201190657913685, "rewards/waypoint_pred_accuracy/mean": 0.028676905061158466, "rewards/waypoint_pred_accuracy/std": 0.05707082395648245, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.75, "completions/max_terminated_length": 339.75, "completions/mean_length": 184.318359375, "completions/mean_terminated_length": 184.318359375, "completions/min_length": 100.5, "completions/min_terminated_length": 100.5, "epoch": 0.1726315789473684, "grad_norm": 0.04847518354654312, "learning_rate": 9.868016411808711e-07, "loss": -0.0001, "num_tokens": 41683501.0, "reward": 1.8535273373126984, "reward_std": 0.17614830657839775, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.853515625, "rewards/stop_prediction_reward/std": 0.17615067958831787, "rewards/waypoint_pred_accuracy/mean": 5.858862551570454e-06, "rewards/waypoint_pred_accuracy/std": 2.9580631640167827e-05, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.25, "completions/max_terminated_length": 362.25, "completions/mean_length": 191.689453125, "completions/mean_terminated_length": 191.689453125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.17473684210526316, "grad_norm": 0.18235114216804504, "learning_rate": 9.85993858951209e-07, "loss": -0.0, "num_tokens": 42171150.0, "reward": 1.8688680231571198, "reward_std": 0.2486374917862122, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.732421875, "rewards/stop_prediction_reward/std": 0.1559995636343956, "rewards/waypoint_pred_accuracy/mean": 0.06822307023441451, "rewards/waypoint_pred_accuracy/std": 0.061132666667712415, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 183.05859375, "completions/mean_terminated_length": 183.41474533081055, "completions/min_length": 83.25, "completions/min_terminated_length": 96.0, "epoch": 0.17684210526315788, "grad_norm": 0.019062340259552002, "learning_rate": 9.851624761560941e-07, "loss": -0.0009, "num_tokens": 42655084.0, "reward": 1.6268093585968018, "reward_std": 0.17163129267282784, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.611328125, "rewards/stop_prediction_reward/std": 0.11577124893665314, "rewards/waypoint_pred_accuracy/mean": 0.009693755733415545, "rewards/waypoint_pred_accuracy/std": 0.0210638802138276, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 366.25, "completions/max_terminated_length": 366.25, "completions/mean_length": 179.791015625, "completions/mean_terminated_length": 180.09539413452148, "completions/min_length": 82.125, "completions/min_terminated_length": 93.25, "epoch": 0.17894736842105263, "grad_norm": 0.01828734204173088, "learning_rate": 9.843075377986927e-07, "loss": -0.0007, "num_tokens": 43136193.0, "reward": 1.7669616490602493, "reward_std": 0.21106806583702564, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.71875, "rewards/stop_prediction_reward/std": 0.15764378011226654, "rewards/waypoint_pred_accuracy/mean": 0.025082381325773895, "rewards/waypoint_pred_accuracy/std": 0.028875190793769434, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.5, "completions/max_terminated_length": 357.5, "completions/mean_length": 188.0703125, "completions/mean_terminated_length": 188.0703125, "completions/min_length": 92.75, "completions/min_terminated_length": 92.75, "epoch": 0.18105263157894738, "grad_norm": 0.02086515724658966, "learning_rate": 9.834290901572454e-07, "loss": -0.0004, "num_tokens": 43621989.0, "reward": 1.8521649837493896, "reward_std": 0.22175267152488232, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.837890625, "rewards/stop_prediction_reward/std": 0.1837922688573599, "rewards/waypoint_pred_accuracy/mean": 0.007137191339450078, "rewards/waypoint_pred_accuracy/std": 0.028705519106249404, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.875, "completions/max_terminated_length": 348.875, "completions/mean_length": 181.27734375, "completions/mean_terminated_length": 181.27734375, "completions/min_length": 96.375, "completions/min_terminated_length": 96.375, "epoch": 0.1831578947368421, "grad_norm": 0.046753548085689545, "learning_rate": 9.82527180782562e-07, "loss": 0.0012, "num_tokens": 44104371.0, "reward": 1.7520140707492828, "reward_std": 0.22030256874859333, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.736328125, "rewards/stop_prediction_reward/std": 0.1816553734242916, "rewards/waypoint_pred_accuracy/mean": 0.007842985792270555, "rewards/waypoint_pred_accuracy/std": 0.03329349086866181, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.125, "completions/max_terminated_length": 324.125, "completions/mean_length": 173.541015625, "completions/mean_terminated_length": 173.541015625, "completions/min_length": 97.5, "completions/min_terminated_length": 97.5, "epoch": 0.18526315789473685, "grad_norm": 0.024735594168305397, "learning_rate": 9.816018584954474e-07, "loss": 0.0008, "num_tokens": 44581896.0, "reward": 1.5429035127162933, "reward_std": 0.3246102202683687, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.41796875, "rewards/stop_prediction_reward/std": 0.19491917081177235, "rewards/waypoint_pred_accuracy/mean": 0.062467404420990205, "rewards/waypoint_pred_accuracy/std": 0.10502552830491003, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.125, "completions/max_terminated_length": 315.125, "completions/mean_length": 177.5859375, "completions/mean_terminated_length": 177.5859375, "completions/min_length": 98.75, "completions/min_terminated_length": 98.75, "epoch": 0.18736842105263157, "grad_norm": 0.18669086694717407, "learning_rate": 9.806531733840594e-07, "loss": -0.0001, "num_tokens": 45065780.0, "reward": 1.9531521201133728, "reward_std": 0.1684918818064034, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.861328125, "rewards/stop_prediction_reward/std": 0.0773718785494566, "rewards/waypoint_pred_accuracy/mean": 0.045912011210408575, "rewards/waypoint_pred_accuracy/std": 0.06239442274401004, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.625, "completions/max_terminated_length": 332.625, "completions/mean_length": 178.658203125, "completions/mean_terminated_length": 178.658203125, "completions/min_length": 100.375, "completions/min_terminated_length": 100.375, "epoch": 0.18947368421052632, "grad_norm": 0.0202130526304245, "learning_rate": 9.796811768011975e-07, "loss": 0.0001, "num_tokens": 45546437.0, "reward": 1.7593656927347183, "reward_std": 0.2186888437718153, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.634765625, "rewards/stop_prediction_reward/std": 0.1627318561077118, "rewards/waypoint_pred_accuracy/mean": 0.062300028515860306, "rewards/waypoint_pred_accuracy/std": 0.046674464582067664, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.375, "completions/max_terminated_length": 354.375, "completions/mean_length": 161.93359375, "completions/mean_terminated_length": 161.93359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.19157894736842104, "grad_norm": 0.01931101270020008, "learning_rate": 9.78685921361522e-07, "loss": -0.0004, "num_tokens": 46018787.0, "reward": 1.735956072807312, "reward_std": 0.2895608698017895, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.1331936027854681, "rewards/waypoint_pred_accuracy/mean": 0.055478046621841415, "rewards/waypoint_pred_accuracy/std": 0.08614378009180079, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.75, "completions/max_terminated_length": 342.75, "completions/mean_length": 165.138671875, "completions/mean_terminated_length": 165.138671875, "completions/min_length": 93.375, "completions/min_terminated_length": 93.375, "epoch": 0.1936842105263158, "grad_norm": 0.011576792225241661, "learning_rate": 9.776674609387076e-07, "loss": 0.0004, "num_tokens": 46492138.0, "reward": 1.800860732793808, "reward_std": 0.19335580058395863, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.1392682921141386, "rewards/waypoint_pred_accuracy/mean": 0.0273834979573945, "rewards/waypoint_pred_accuracy/std": 0.03366286576419705, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.625, "completions/max_terminated_length": 293.625, "completions/mean_length": 162.947265625, "completions/mean_terminated_length": 162.947265625, "completions/min_length": 92.125, "completions/min_terminated_length": 92.125, "epoch": 0.1957894736842105, "grad_norm": 0.01784905605018139, "learning_rate": 9.766258506625257e-07, "loss": -0.0001, "num_tokens": 46966351.0, "reward": 1.7491846829652786, "reward_std": 0.24123720126226544, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.638671875, "rewards/stop_prediction_reward/std": 0.1292374823242426, "rewards/waypoint_pred_accuracy/mean": 0.05525641991156202, "rewards/waypoint_pred_accuracy/std": 0.0679310456182847, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.875, "completions/max_terminated_length": 328.875, "completions/mean_length": 169.697265625, "completions/mean_terminated_length": 169.697265625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.19789473684210526, "grad_norm": 0.02496343106031418, "learning_rate": 9.75561146915861e-07, "loss": 0.0001, "num_tokens": 47442804.0, "reward": 1.7829404175281525, "reward_std": 0.22052480925231066, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.72265625, "rewards/stop_prediction_reward/std": 0.16125134378671646, "rewards/waypoint_pred_accuracy/mean": 0.03014207717228104, "rewards/waypoint_pred_accuracy/std": 0.03959382537319134, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.375, "completions/max_terminated_length": 304.375, "completions/mean_length": 164.10546875, "completions/mean_terminated_length": 164.10546875, "completions/min_length": 93.5, "completions/min_terminated_length": 93.5, "epoch": 0.2, "grad_norm": 0.028125545009970665, "learning_rate": 9.744734073316595e-07, "loss": 0.0013, "num_tokens": 47916458.0, "reward": 1.7026303112506866, "reward_std": 0.21261605620384216, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.16691532172262669, "rewards/waypoint_pred_accuracy/mean": 0.0388151630759339, "rewards/waypoint_pred_accuracy/std": 0.035128295421680174, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.75, "completions/max_terminated_length": 325.75, "completions/mean_length": 160.841796875, "completions/mean_terminated_length": 160.841796875, "completions/min_length": 83.75, "completions/min_terminated_length": 83.75, "epoch": 0.20210526315789473, "grad_norm": 0.009644444100558758, "learning_rate": 9.73362690789808e-07, "loss": 0.0003, "num_tokens": 48385497.0, "reward": 1.747939333319664, "reward_std": 0.223597856999163, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.634765625, "rewards/stop_prediction_reward/std": 0.12978976964950562, "rewards/waypoint_pred_accuracy/mean": 0.05658684538012437, "rewards/waypoint_pred_accuracy/std": 0.06814135060892568, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.875, "completions/max_terminated_length": 310.875, "completions/mean_length": 162.240234375, "completions/mean_terminated_length": 162.240234375, "completions/min_length": 94.625, "completions/min_terminated_length": 94.625, "epoch": 0.20421052631578948, "grad_norm": 0.018140247091650963, "learning_rate": 9.722290574139486e-07, "loss": -0.0003, "num_tokens": 48857300.0, "reward": 1.7528142184019089, "reward_std": 0.16555408760905266, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.13214937038719654, "rewards/waypoint_pred_accuracy/mean": 0.004336794838309695, "rewards/waypoint_pred_accuracy/std": 0.015406182501468337, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.25, "completions/max_terminated_length": 455.25, "completions/mean_length": 156.06640625, "completions/mean_terminated_length": 156.06640625, "completions/min_length": 89.625, "completions/min_terminated_length": 89.625, "epoch": 0.2063157894736842, "grad_norm": 0.015433871187269688, "learning_rate": 9.71072568568222e-07, "loss": 0.0048, "num_tokens": 49326006.0, "reward": 1.848653882741928, "reward_std": 0.14563900604844093, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.849609375, "rewards/stop_prediction_reward/std": 0.1308242529630661, "rewards/waypoint_pred_accuracy/mean": 0.0004988121282371114, "rewards/waypoint_pred_accuracy/std": 0.0009192883252055577, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.375, "completions/max_terminated_length": 564.375, "completions/mean_length": 160.87109375, "completions/mean_terminated_length": 160.87109375, "completions/min_length": 86.375, "completions/min_terminated_length": 86.375, "epoch": 0.20842105263157895, "grad_norm": 0.02826160565018654, "learning_rate": 9.698932868539475e-07, "loss": 0.0066, "num_tokens": 49796084.0, "reward": 1.6241666972637177, "reward_std": 0.24621143471449614, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.49609375, "rewards/stop_prediction_reward/std": 0.11416476964950562, "rewards/waypoint_pred_accuracy/mean": 0.06501307094004005, "rewards/waypoint_pred_accuracy/std": 0.07491768430918455, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 346.875, "completions/max_terminated_length": 346.875, "completions/mean_length": 166.466796875, "completions/mean_terminated_length": 166.755952835083, "completions/min_length": 82.375, "completions/min_terminated_length": 92.875, "epoch": 0.21052631578947367, "grad_norm": 0.00846907589584589, "learning_rate": 9.686912761062337e-07, "loss": -0.0007, "num_tokens": 50269923.0, "reward": 1.7262057214975357, "reward_std": 0.14077163115143776, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.724609375, "rewards/stop_prediction_reward/std": 0.12929067946970463, "rewards/waypoint_pred_accuracy/mean": 0.001774733640439002, "rewards/waypoint_pred_accuracy/std": 0.004268001195070267, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.5, "completions/max_terminated_length": 330.5, "completions/mean_length": 155.73828125, "completions/mean_terminated_length": 155.73828125, "completions/min_length": 91.625, "completions/min_terminated_length": 91.625, "epoch": 0.21263157894736842, "grad_norm": 0.0018760801758617163, "learning_rate": 9.674666013905223e-07, "loss": 0.0012, "num_tokens": 50740765.0, "reward": 2.0045997500419617, "reward_std": 0.12817948791450817, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.9765625, "rewards/stop_prediction_reward/std": 0.09483768045902252, "rewards/waypoint_pred_accuracy/mean": 0.014018624695630422, "rewards/waypoint_pred_accuracy/std": 0.017155832063963317, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.75, "completions/max_terminated_length": 408.75, "completions/mean_length": 160.771484375, "completions/mean_terminated_length": 160.771484375, "completions/min_length": 91.125, "completions/min_terminated_length": 91.125, "epoch": 0.21473684210526317, "grad_norm": 0.025941183790564537, "learning_rate": 9.662193289990683e-07, "loss": 0.0018, "num_tokens": 51210920.0, "reward": 1.963687315583229, "reward_std": 0.22355148011411075, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.865234375, "rewards/stop_prediction_reward/std": 0.1220565214753151, "rewards/waypoint_pred_accuracy/mean": 0.050203046113892924, "rewards/waypoint_pred_accuracy/std": 0.06606378212018171, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.25, "completions/max_terminated_length": 322.25, "completions/mean_length": 166.92578125, "completions/mean_terminated_length": 166.92578125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2168421052631579, "grad_norm": 0.025028549134731293, "learning_rate": 9.649495264473496e-07, "loss": -0.0001, "num_tokens": 51684802.0, "reward": 1.500137910246849, "reward_std": 0.1536319953884231, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.501953125, "rewards/stop_prediction_reward/std": 0.14937089011073112, "rewards/waypoint_pred_accuracy/mean": 6.89590217128333e-05, "rewards/waypoint_pred_accuracy/std": 0.0004515758951737503, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 303.125, "completions/max_terminated_length": 303.125, "completions/mean_length": 165.875, "completions/mean_terminated_length": 166.2123966217041, "completions/min_length": 82.125, "completions/min_terminated_length": 93.375, "epoch": 0.21894736842105264, "grad_norm": 0.021551745012402534, "learning_rate": 9.636572624704126e-07, "loss": -0.0012, "num_tokens": 52159170.0, "reward": 2.046603038907051, "reward_std": 0.17094448019634, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.978515625, "rewards/stop_prediction_reward/std": 0.09938238747417927, "rewards/waypoint_pred_accuracy/mean": 0.035020323445451154, "rewards/waypoint_pred_accuracy/std": 0.03831136192093263, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.625, "completions/max_terminated_length": 292.625, "completions/mean_length": 149.169921875, "completions/mean_terminated_length": 149.169921875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.22105263157894736, "grad_norm": 0.017386160790920258, "learning_rate": 9.62342607019152e-07, "loss": 0.0002, "num_tokens": 52624025.0, "reward": 1.7423816174268723, "reward_std": 0.2114694133537114, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.701171875, "rewards/stop_prediction_reward/std": 0.18037440441548824, "rewards/waypoint_pred_accuracy/mean": 0.021581426198892473, "rewards/waypoint_pred_accuracy/std": 0.03059530609976946, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.125, "completions/max_terminated_length": 284.125, "completions/mean_length": 155.060546875, "completions/mean_terminated_length": 155.060546875, "completions/min_length": 95.5, "completions/min_terminated_length": 95.5, "epoch": 0.2231578947368421, "grad_norm": 0.048246338963508606, "learning_rate": 9.610056312565245e-07, "loss": -0.0004, "num_tokens": 53093432.0, "reward": 2.0104575753211975, "reward_std": 0.13160700640946743, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.87109375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.06968190488207829, "rewards/waypoint_pred_accuracy/std": 0.050178504459300124, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.875, "completions/max_terminated_length": 632.875, "completions/mean_length": 163.904296875, "completions/mean_terminated_length": 163.904296875, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.22526315789473683, "grad_norm": 0.0, "learning_rate": 9.596464075536963e-07, "loss": 0.0091, "num_tokens": 53568071.0, "reward": 1.8554696440696716, "reward_std": 0.14293749257922173, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.859375, "rewards/stop_prediction_reward/std": 0.11734727956354618, "rewards/waypoint_pred_accuracy/mean": 4.472400104072698e-07, "rewards/waypoint_pred_accuracy/std": 1.442633011543798e-06, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.25, "completions/max_terminated_length": 273.25, "completions/mean_length": 154.779296875, "completions/mean_terminated_length": 154.779296875, "completions/min_length": 85.125, "completions/min_terminated_length": 85.125, "epoch": 0.22736842105263158, "grad_norm": 0.005716841667890549, "learning_rate": 9.582650094861256e-07, "loss": 0.0001, "num_tokens": 54039382.0, "reward": 1.873047947883606, "reward_std": 0.03755049656763276, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 5.439363948693785e-07, "rewards/waypoint_pred_accuracy/std": 2.2413606779063174e-06, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.75, "completions/max_terminated_length": 299.75, "completions/mean_length": 159.359375, "completions/mean_terminated_length": 159.359375, "completions/min_length": 89.875, "completions/min_terminated_length": 89.875, "epoch": 0.2294736842105263, "grad_norm": 0.010698024183511734, "learning_rate": 9.568615118295798e-07, "loss": -0.0007, "num_tokens": 54510798.0, "reward": 1.5139821916818619, "reward_std": 0.10784149792743847, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5078125, "rewards/stop_prediction_reward/std": 0.09701303765177727, "rewards/waypoint_pred_accuracy/mean": 0.0030848284332449794, "rewards/waypoint_pred_accuracy/std": 0.008302096623808097, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.625, "completions/max_terminated_length": 288.625, "completions/mean_length": 149.833984375, "completions/mean_terminated_length": 149.833984375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.23157894736842105, "grad_norm": 0.019380802288651466, "learning_rate": 9.554359905560885e-07, "loss": -0.0004, "num_tokens": 54976377.0, "reward": 1.6111545264720917, "reward_std": 0.14732337184250355, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.599609375, "rewards/stop_prediction_reward/std": 0.12605497241020203, "rewards/waypoint_pred_accuracy/mean": 0.005772589463781978, "rewards/waypoint_pred_accuracy/std": 0.015806115148408278, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.25, "completions/max_terminated_length": 310.25, "completions/mean_length": 160.642578125, "completions/mean_terminated_length": 160.642578125, "completions/min_length": 93.625, "completions/min_terminated_length": 93.625, "epoch": 0.2336842105263158, "grad_norm": 0.010887962765991688, "learning_rate": 9.53988522829831e-07, "loss": 0.0006, "num_tokens": 55449730.0, "reward": 1.7607887834310532, "reward_std": 0.176338329911232, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.11029814556241035, "rewards/waypoint_pred_accuracy/mean": 0.06691783313807408, "rewards/waypoint_pred_accuracy/std": 0.05048349817446263, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.625, "completions/max_terminated_length": 284.625, "completions/mean_length": 164.515625, "completions/mean_terminated_length": 164.515625, "completions/min_length": 94.125, "completions/min_terminated_length": 94.125, "epoch": 0.23578947368421052, "grad_norm": 0.016550879925489426, "learning_rate": 9.52519187002958e-07, "loss": 0.0005, "num_tokens": 55922314.0, "reward": 2.0147946178913116, "reward_std": 0.08335691201500595, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.994140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.010327004097856944, "rewards/waypoint_pred_accuracy/std": 0.021345943544904804, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.5, "completions/max_terminated_length": 607.5, "completions/mean_length": 168.51171875, "completions/mean_terminated_length": 168.51171875, "completions/min_length": 90.5, "completions/min_terminated_length": 90.5, "epoch": 0.23789473684210527, "grad_norm": 0.0, "learning_rate": 9.510280626113524e-07, "loss": 0.0077, "num_tokens": 56400464.0, "reward": 2.001875877380371, "reward_std": 0.20845668017864227, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.84765625, "rewards/stop_prediction_reward/std": 0.12359907291829586, "rewards/waypoint_pred_accuracy/mean": 0.07808637249919906, "rewards/waypoint_pred_accuracy/std": 0.052403363499503426, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.375, "completions/max_terminated_length": 302.375, "completions/mean_length": 155.916015625, "completions/mean_terminated_length": 155.916015625, "completions/min_length": 93.625, "completions/min_terminated_length": 93.625, "epoch": 0.24, "grad_norm": 0.021923840045928955, "learning_rate": 9.495152303703225e-07, "loss": 0.0, "num_tokens": 56868773.0, "reward": 1.9231128692626953, "reward_std": 0.12709264224395156, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.05317101255059242, "rewards/waypoint_pred_accuracy/mean": 0.02405643684323879, "rewards/waypoint_pred_accuracy/std": 0.04448868869803859, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.25, "completions/max_terminated_length": 314.25, "completions/mean_length": 160.0390625, "completions/mean_terminated_length": 160.0390625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.24210526315789474, "grad_norm": 0.01805245131254196, "learning_rate": 9.479807721702337e-07, "loss": -0.0003, "num_tokens": 57339001.0, "reward": 2.048165649175644, "reward_std": 0.1738033015280962, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.8671875, "rewards/stop_prediction_reward/std": 0.043842025101184845, "rewards/waypoint_pred_accuracy/mean": 0.09048910532146692, "rewards/waypoint_pred_accuracy/std": 0.07687698677182198, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.25, "completions/max_terminated_length": 306.25, "completions/mean_length": 163.044921875, "completions/mean_terminated_length": 163.044921875, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.24421052631578946, "grad_norm": 0.0, "learning_rate": 9.46424771072075e-07, "loss": 0.0005, "num_tokens": 57813328.0, "reward": 1.7839026153087616, "reward_std": 0.11150169186294079, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.755859375, "rewards/stop_prediction_reward/std": 0.046121878549456596, "rewards/waypoint_pred_accuracy/mean": 0.0140216209110804, "rewards/waypoint_pred_accuracy/std": 0.044068384915590376, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.75, "completions/max_terminated_length": 285.75, "completions/mean_length": 159.0703125, "completions/mean_terminated_length": 159.0703125, "completions/min_length": 93.125, "completions/min_terminated_length": 93.125, "epoch": 0.2463157894736842, "grad_norm": 0.011590894311666489, "learning_rate": 9.448473113029633e-07, "loss": -0.0003, "num_tokens": 58283572.0, "reward": 1.6105285286903381, "reward_std": 0.19277687231078744, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.515625, "rewards/stop_prediction_reward/std": 0.10811806842684746, "rewards/waypoint_pred_accuracy/mean": 0.04842832078634274, "rewards/waypoint_pred_accuracy/std": 0.044830967724465154, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.375, "completions/max_terminated_length": 355.375, "completions/mean_length": 153.669921875, "completions/mean_terminated_length": 153.669921875, "completions/min_length": 88.625, "completions/min_terminated_length": 88.625, "epoch": 0.24842105263157896, "grad_norm": 0.018902691081166267, "learning_rate": 9.432484782515842e-07, "loss": 0.001, "num_tokens": 58749451.0, "reward": 1.8925653398036957, "reward_std": 0.10405641289980849, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.865234375, "rewards/stop_prediction_reward/std": 0.06879601255059242, "rewards/waypoint_pred_accuracy/mean": 0.014642067655131541, "rewards/waypoint_pred_accuracy/std": 0.020200557806674624, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.875, "completions/max_terminated_length": 480.875, "completions/mean_length": 154.16015625, "completions/mean_terminated_length": 154.16015625, "completions/min_length": 89.625, "completions/min_terminated_length": 89.625, "epoch": 0.2505263157894737, "grad_norm": 0.02062033675611019, "learning_rate": 9.416283584635699e-07, "loss": 0.0045, "num_tokens": 59215133.0, "reward": 1.7936918139457703, "reward_std": 0.16157833766192198, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.07509202510118484, "rewards/waypoint_pred_accuracy/mean": 0.022822471810224426, "rewards/waypoint_pred_accuracy/std": 0.03745843005840822, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.25, "completions/max_terminated_length": 495.25, "completions/mean_length": 163.1953125, "completions/mean_terminated_length": 163.1953125, "completions/min_length": 92.125, "completions/min_terminated_length": 92.125, "epoch": 0.25263157894736843, "grad_norm": 0.019115004688501358, "learning_rate": 9.399870396368137e-07, "loss": 0.0062, "num_tokens": 59688193.0, "reward": 1.752405360341072, "reward_std": 0.06196481070946902, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.004132360947551206, "rewards/waypoint_pred_accuracy/std": 0.012019388610497117, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.625, "completions/max_terminated_length": 314.625, "completions/mean_length": 161.328125, "completions/mean_terminated_length": 161.328125, "completions/min_length": 98.5, "completions/min_terminated_length": 98.5, "epoch": 0.25473684210526315, "grad_norm": 0.019789932295680046, "learning_rate": 9.383246106167244e-07, "loss": -0.0002, "num_tokens": 60160681.0, "reward": 1.9855428040027618, "reward_std": 0.23401551693677902, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.85546875, "rewards/stop_prediction_reward/std": 0.11576050892472267, "rewards/waypoint_pred_accuracy/mean": 0.0650370218045282, "rewards/waypoint_pred_accuracy/std": 0.06812710995291127, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.5, "completions/max_terminated_length": 327.5, "completions/mean_length": 168.681640625, "completions/mean_terminated_length": 168.681640625, "completions/min_length": 90.375, "completions/min_terminated_length": 90.375, "epoch": 0.25684210526315787, "grad_norm": 0.021785695105791092, "learning_rate": 9.366411613914151e-07, "loss": 0.0001, "num_tokens": 60636294.0, "reward": 2.019701272249222, "reward_std": 0.05673945321541396, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.998046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.01082722095088684, "rewards/waypoint_pred_accuracy/std": 0.020641872481064638, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.75, "completions/max_terminated_length": 296.75, "completions/mean_length": 151.181640625, "completions/mean_terminated_length": 151.181640625, "completions/min_length": 92.75, "completions/min_terminated_length": 92.75, "epoch": 0.25894736842105265, "grad_norm": 0.0, "learning_rate": 9.349367830868338e-07, "loss": 0.0003, "num_tokens": 61104931.0, "reward": 1.756501317024231, "reward_std": 0.06210230151191354, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.755859375, "rewards/stop_prediction_reward/std": 0.057880254462361336, "rewards/waypoint_pred_accuracy/mean": 0.00032096245558932424, "rewards/waypoint_pred_accuracy/std": 0.0021110251545906067, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.25, "completions/max_terminated_length": 325.25, "completions/mean_length": 171.123046875, "completions/mean_terminated_length": 171.123046875, "completions/min_length": 93.25, "completions/min_terminated_length": 93.25, "epoch": 0.26105263157894737, "grad_norm": 0.011362242512404919, "learning_rate": 9.332115679618299e-07, "loss": -0.0001, "num_tokens": 61582178.0, "reward": 1.87109375, "reward_std": 0.0535217709839344, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 8.560674261081592e-33, "rewards/waypoint_pred_accuracy/std": 0.0, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 162.998046875, "completions/mean_terminated_length": 162.998046875, "completions/min_length": 90.375, "completions/min_terminated_length": 90.375, "epoch": 0.2631578947368421, "grad_norm": 0.01136239804327488, "learning_rate": 9.3146560940316e-07, "loss": -0.0001, "num_tokens": 62053921.0, "reward": 1.8074305802583694, "reward_std": 0.07321994344238192, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.751953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.02773874229941244, "rewards/waypoint_pred_accuracy/std": 0.03534268934024435, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.375, "completions/max_terminated_length": 300.375, "completions/mean_length": 161.583984375, "completions/mean_terminated_length": 161.583984375, "completions/min_length": 87.875, "completions/min_terminated_length": 87.875, "epoch": 0.26526315789473687, "grad_norm": 0.0, "learning_rate": 9.296990019204335e-07, "loss": 0.0005, "num_tokens": 62526604.0, "reward": 1.7796232551336288, "reward_std": 0.14883975870907307, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.6171875, "rewards/stop_prediction_reward/std": 0.030496878549456596, "rewards/waypoint_pred_accuracy/mean": 0.08121787941490766, "rewards/waypoint_pred_accuracy/std": 0.059465451777214184, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 158.2578125, "completions/mean_terminated_length": 158.2578125, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.2673684210526316, "grad_norm": 0.009571162052452564, "learning_rate": 9.279118411409962e-07, "loss": -0.0, "num_tokens": 62995344.0, "reward": 2.0410386621952057, "reward_std": 0.13146163042983972, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.98828125, "rewards/stop_prediction_reward/std": 0.07509202510118484, "rewards/waypoint_pred_accuracy/mean": 0.026378706125527174, "rewards/waypoint_pred_accuracy/std": 0.028185460301244802, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.875, "completions/max_terminated_length": 306.875, "completions/mean_length": 163.37890625, "completions/mean_terminated_length": 163.37890625, "completions/min_length": 94.25, "completions/min_terminated_length": 94.25, "epoch": 0.2694736842105263, "grad_norm": 0.0, "learning_rate": 9.261042238047539e-07, "loss": -0.0001, "num_tokens": 63465362.0, "reward": 1.7530074417591095, "reward_std": 0.0875884689448867, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.626953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.06302716654918482, "rewards/waypoint_pred_accuracy/std": 0.04162663857716874, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.25, "completions/max_terminated_length": 291.25, "completions/mean_length": 165.87109375, "completions/mean_terminated_length": 165.87109375, "completions/min_length": 93.875, "completions/min_terminated_length": 93.875, "epoch": 0.27157894736842103, "grad_norm": 0.013052922673523426, "learning_rate": 9.242762477589369e-07, "loss": 0.0001, "num_tokens": 63939024.0, "reward": 1.7669493854045868, "reward_std": 0.18367847800254822, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.0729278109720326, "rewards/waypoint_pred_accuracy/std": 0.06537189942901023, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.125, "completions/max_terminated_length": 290.125, "completions/mean_length": 159.1796875, "completions/mean_terminated_length": 159.1796875, "completions/min_length": 92.25, "completions/min_terminated_length": 92.25, "epoch": 0.2736842105263158, "grad_norm": 0.0, "learning_rate": 9.224280119528013e-07, "loss": 0.0, "num_tokens": 64411948.0, "reward": 1.869140625, "reward_std": 0.03754601255059242, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.03754601255059242, "rewards/waypoint_pred_accuracy/mean": 6.390997080804871e-14, "rewards/waypoint_pred_accuracy/std": 4.016090304301343e-13, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.25, "completions/max_terminated_length": 302.25, "completions/mean_length": 160.634765625, "completions/mean_terminated_length": 160.634765625, "completions/min_length": 93.75, "completions/min_terminated_length": 93.75, "epoch": 0.27578947368421053, "grad_norm": 0.0, "learning_rate": 9.205596164322753e-07, "loss": 0.0001, "num_tokens": 64882865.0, "reward": 1.8263644874095917, "reward_std": 0.089549720287323, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.751953125, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.03720568120479584, "rewards/waypoint_pred_accuracy/std": 0.026742003858089447, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.375, "completions/max_terminated_length": 299.375, "completions/mean_length": 160.9375, "completions/mean_terminated_length": 160.9375, "completions/min_length": 86.875, "completions/min_terminated_length": 86.875, "epoch": 0.27789473684210525, "grad_norm": 0.0, "learning_rate": 9.186711623345419e-07, "loss": 0.0007, "num_tokens": 65352273.0, "reward": 1.8552441000938416, "reward_std": 0.09687134113391949, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.05262205236723276, "rewards/waypoint_pred_accuracy/std": 0.04328961128608891, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.875, "completions/max_terminated_length": 313.875, "completions/mean_length": 164.759765625, "completions/mean_terminated_length": 164.759765625, "completions/min_length": 94.25, "completions/min_terminated_length": 94.25, "epoch": 0.28, "grad_norm": 0.12175282090902328, "learning_rate": 9.167627518825651e-07, "loss": 0.0001, "num_tokens": 65825622.0, "reward": 1.674107313156128, "reward_std": 0.10450775223216624, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.02553023178981706, "rewards/waypoint_pred_accuracy/std": 0.03363654209747424, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.375, "completions/max_terminated_length": 323.375, "completions/mean_length": 168.17578125, "completions/mean_terminated_length": 168.17578125, "completions/min_length": 97.125, "completions/min_terminated_length": 97.125, "epoch": 0.28210526315789475, "grad_norm": 0.035799406468868256, "learning_rate": 9.148344883795563e-07, "loss": 0.0002, "num_tokens": 66305264.0, "reward": 1.6507416665554047, "reward_std": 0.14080907637253404, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.08138803765177727, "rewards/waypoint_pred_accuracy/mean": 0.014823972041076217, "rewards/waypoint_pred_accuracy/std": 0.02189802705651367, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.5, "completions/max_terminated_length": 276.5, "completions/mean_length": 155.49609375, "completions/mean_terminated_length": 155.49609375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.28421052631578947, "grad_norm": 0.014099229127168655, "learning_rate": 9.128864762033824e-07, "loss": 0.0002, "num_tokens": 66774510.0, "reward": 1.8715248703956604, "reward_std": 0.12338122609071434, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.060762443249078105, "rewards/waypoint_pred_accuracy/std": 0.06169061613346334, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.75, "completions/max_terminated_length": 311.75, "completions/mean_length": 164.794921875, "completions/mean_terminated_length": 164.794921875, "completions/min_length": 94.875, "completions/min_terminated_length": 94.875, "epoch": 0.2863157894736842, "grad_norm": 0.017550233751535416, "learning_rate": 9.10918820800916e-07, "loss": 0.0001, "num_tokens": 67247941.0, "reward": 1.7576846480369568, "reward_std": 0.059877947336644866, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.006772012609022298, "rewards/waypoint_pred_accuracy/std": 0.01116596694236254, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.875, "completions/max_terminated_length": 333.875, "completions/mean_length": 161.1640625, "completions/mean_terminated_length": 161.1640625, "completions/min_length": 93.75, "completions/min_terminated_length": 93.75, "epoch": 0.28842105263157897, "grad_norm": 0.01980479061603546, "learning_rate": 9.089316286823274e-07, "loss": 0.0012, "num_tokens": 67719897.0, "reward": 1.8230868130922318, "reward_std": 0.1315819690862554, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.0625, "rewards/waypoint_pred_accuracy/mean": 0.03849652719228994, "rewards/waypoint_pred_accuracy/std": 0.04325572500814767, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 174.234375, "completions/mean_terminated_length": 174.234375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.2905263157894737, "grad_norm": 0.024728739634156227, "learning_rate": 9.069250074153191e-07, "loss": 0.0005, "num_tokens": 68200913.0, "reward": 1.992507129907608, "reward_std": 0.054616279961919645, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.9921875, "rewards/stop_prediction_reward/std": 0.05317101255059242, "rewards/waypoint_pred_accuracy/mean": 0.0001598159879208083, "rewards/waypoint_pred_accuracy/std": 0.0007227737407333734, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 162.240234375, "completions/mean_terminated_length": 162.240234375, "completions/min_length": 89.375, "completions/min_terminated_length": 89.375, "epoch": 0.2926315789473684, "grad_norm": 0.0, "learning_rate": 9.048990656193024e-07, "loss": -0.0005, "num_tokens": 68676172.0, "reward": 1.875022366642952, "reward_std": 0.03142248300719075, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 1.118002853672806e-05, "rewards/waypoint_pred_accuracy/std": 8.741845476833987e-05, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 167.47265625, "completions/mean_terminated_length": 167.47265625, "completions/min_length": 92.75, "completions/min_terminated_length": 92.75, "epoch": 0.29473684210526313, "grad_norm": 0.028531944379210472, "learning_rate": 9.028539129595197e-07, "loss": -0.0008, "num_tokens": 69150782.0, "reward": 1.9313835203647614, "reward_std": 0.13018206134438515, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.031121453504955626, "rewards/waypoint_pred_accuracy/std": 0.04165353068666345, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.625, "completions/max_terminated_length": 312.625, "completions/mean_length": 167.0859375, "completions/mean_terminated_length": 167.0859375, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.2968421052631579, "grad_norm": 0.015785250812768936, "learning_rate": 9.00789660141106e-07, "loss": -0.0003, "num_tokens": 69624170.0, "reward": 1.9134919345378876, "reward_std": 0.11400369919920195, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.87109375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.02217568371906964, "rewards/waypoint_pred_accuracy/std": 0.04750002907394446, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.375, "completions/max_terminated_length": 306.375, "completions/mean_length": 167.19140625, "completions/mean_terminated_length": 167.19140625, "completions/min_length": 98.75, "completions/min_terminated_length": 98.75, "epoch": 0.29894736842105263, "grad_norm": 0.023555859923362732, "learning_rate": 8.987064189030983e-07, "loss": 0.0001, "num_tokens": 70099596.0, "reward": 1.818978101015091, "reward_std": 0.0910695298812243, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.035465632668339866, "rewards/waypoint_pred_accuracy/std": 0.03772226279037594, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.375, "completions/max_terminated_length": 316.375, "completions/mean_length": 155.17578125, "completions/mean_terminated_length": 155.17578125, "completions/min_length": 92.125, "completions/min_terminated_length": 92.125, "epoch": 0.30105263157894735, "grad_norm": 0.0, "learning_rate": 8.966043020123855e-07, "loss": -0.0006, "num_tokens": 70569190.0, "reward": 2.1103286147117615, "reward_std": 0.0724575242602441, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05516431625073892, "rewards/waypoint_pred_accuracy/std": 0.036228755789124996, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 162.4140625, "completions/mean_terminated_length": 162.4140625, "completions/min_length": 95.875, "completions/min_terminated_length": 95.875, "epoch": 0.3031578947368421, "grad_norm": 0.010524507611989975, "learning_rate": 8.944834232576054e-07, "loss": -0.001, "num_tokens": 71040122.0, "reward": 2.065228596329689, "reward_std": 0.1154269075486809, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.990234375, "rewards/stop_prediction_reward/std": 0.04855126701295376, "rewards/waypoint_pred_accuracy/mean": 0.0374971410976741, "rewards/waypoint_pred_accuracy/std": 0.03343789520468832, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 513.25, "completions/max_terminated_length": 513.25, "completions/mean_length": 163.306640625, "completions/mean_terminated_length": 163.64570999145508, "completions/min_length": 77.75, "completions/min_terminated_length": 89.375, "epoch": 0.30526315789473685, "grad_norm": 0.01892881467938423, "learning_rate": 8.923438974429849e-07, "loss": 0.0062, "num_tokens": 71514391.0, "reward": 1.5013127624988556, "reward_std": 0.05349951934939168, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.501953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0016329490543148754, "rewards/waypoint_pred_accuracy/std": 0.00331225974358014, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.125, "completions/max_terminated_length": 293.125, "completions/mean_length": 156.130859375, "completions/mean_terminated_length": 156.130859375, "completions/min_length": 97.5, "completions/min_terminated_length": 97.5, "epoch": 0.30736842105263157, "grad_norm": 0.016309263184666634, "learning_rate": 8.901858403821253e-07, "loss": 0.0002, "num_tokens": 71981402.0, "reward": 1.896241456270218, "reward_std": 0.12619078904390335, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.07409729063510895, "rewards/waypoint_pred_accuracy/std": 0.03965789079666138, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.125, "completions/max_terminated_length": 285.125, "completions/mean_length": 153.650390625, "completions/mean_terminated_length": 153.650390625, "completions/min_length": 94.75, "completions/min_terminated_length": 94.75, "epoch": 0.3094736842105263, "grad_norm": 0.017356984317302704, "learning_rate": 8.880093688917338e-07, "loss": -0.0001, "num_tokens": 72450919.0, "reward": 1.9465051144361496, "reward_std": 0.14818904345179362, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.869140625, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.03965881569784813, "rewards/waypoint_pred_accuracy/std": 0.05162953603291953, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.5, "completions/max_terminated_length": 289.5, "completions/mean_length": 156.361328125, "completions/mean_terminated_length": 156.361328125, "completions/min_length": 85.875, "completions/min_terminated_length": 85.875, "epoch": 0.31157894736842107, "grad_norm": 0.016940809786319733, "learning_rate": 8.858146007853e-07, "loss": 0.0001, "num_tokens": 72918880.0, "reward": 1.7521924823522568, "reward_std": 0.03509542074514993, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.0010962454687926675, "rewards/waypoint_pred_accuracy/std": 0.005062559436581103, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.875, "completions/max_terminated_length": 261.875, "completions/mean_length": 150.791015625, "completions/mean_terminated_length": 150.791015625, "completions/min_length": 92.5, "completions/min_terminated_length": 92.5, "epoch": 0.3136842105263158, "grad_norm": 0.008656186982989311, "learning_rate": 8.836016548667178e-07, "loss": -0.0002, "num_tokens": 73388917.0, "reward": 1.7601932436227798, "reward_std": 0.03978594159707427, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.006073190983845654, "rewards/waypoint_pred_accuracy/std": 0.012080476215070388, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.125, "completions/max_terminated_length": 262.125, "completions/mean_length": 150.279296875, "completions/mean_terminated_length": 150.279296875, "completions/min_length": 91.875, "completions/min_terminated_length": 91.875, "epoch": 0.3157894736842105, "grad_norm": 0.015179594978690147, "learning_rate": 8.813706509238558e-07, "loss": -0.0004, "num_tokens": 73853444.0, "reward": 1.7564743012189865, "reward_std": 0.06601627822965384, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.005190281644558099, "rewards/waypoint_pred_accuracy/std": 0.01738314354451939, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.75, "completions/max_terminated_length": 290.75, "completions/mean_length": 148.55859375, "completions/mean_terminated_length": 148.55859375, "completions/min_length": 86.25, "completions/min_terminated_length": 86.25, "epoch": 0.3178947368421053, "grad_norm": 0.011068091727793217, "learning_rate": 8.791217097220724e-07, "loss": 0.0001, "num_tokens": 74316066.0, "reward": 2.0001233518123627, "reward_std": 0.0008530195158940046, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 6.167972693649215e-05, "rewards/waypoint_pred_accuracy/std": 0.0004265111779173658, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 146.140625, "completions/mean_terminated_length": 146.140625, "completions/min_length": 88.625, "completions/min_terminated_length": 88.625, "epoch": 0.32, "grad_norm": 0.013286644592881203, "learning_rate": 8.768549529976783e-07, "loss": -0.0001, "num_tokens": 74778538.0, "reward": 1.9455503523349762, "reward_std": 0.09065551357343793, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0352751867758343, "rewards/waypoint_pred_accuracy/std": 0.045327756204642355, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.125, "completions/max_terminated_length": 272.125, "completions/mean_length": 149.201171875, "completions/mean_terminated_length": 149.201171875, "completions/min_length": 85.5, "completions/min_terminated_length": 85.5, "epoch": 0.32210526315789473, "grad_norm": 0.027283476665616035, "learning_rate": 8.74570503451348e-07, "loss": 0.0, "num_tokens": 75245265.0, "reward": 1.9154797792434692, "reward_std": 0.07870338168504531, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.021216471865045605, "rewards/waypoint_pred_accuracy/std": 0.03153918843054271, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.125, "completions/max_terminated_length": 294.125, "completions/mean_length": 148.419921875, "completions/mean_terminated_length": 148.419921875, "completions/min_length": 92.375, "completions/min_terminated_length": 92.375, "epoch": 0.32421052631578945, "grad_norm": 0.017137156799435616, "learning_rate": 8.72268484741477e-07, "loss": 0.0002, "num_tokens": 75711528.0, "reward": 1.8267784118652344, "reward_std": 0.15307198226219043, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.10088922739669257, "rewards/waypoint_pred_accuracy/std": 0.07653599904734049, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.75, "completions/max_terminated_length": 274.75, "completions/mean_length": 149.67578125, "completions/mean_terminated_length": 149.67578125, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.3263157894736842, "grad_norm": 0.0, "learning_rate": 8.699490214774881e-07, "loss": 0.0001, "num_tokens": 76178434.0, "reward": 1.754124492406845, "reward_std": 0.0151796446363619, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0020622443745356234, "rewards/waypoint_pred_accuracy/std": 0.007589819752975188, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.25, "completions/max_terminated_length": 268.25, "completions/mean_length": 144.4921875, "completions/mean_terminated_length": 144.4921875, "completions/min_length": 84.5, "completions/min_terminated_length": 84.5, "epoch": 0.32842105263157895, "grad_norm": 0.0, "learning_rate": 8.676122392130872e-07, "loss": -0.0001, "num_tokens": 76640574.0, "reward": 1.62890625, "reward_std": 0.021921012550592422, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.62890625, "rewards/stop_prediction_reward/std": 0.021921012550592422, "rewards/waypoint_pred_accuracy/mean": 7.589869124147713e-25, "rewards/waypoint_pred_accuracy/std": 4.67924008313652e-24, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.125, "completions/max_terminated_length": 245.125, "completions/mean_length": 137.328125, "completions/mean_terminated_length": 137.328125, "completions/min_length": 82.625, "completions/min_terminated_length": 82.625, "epoch": 0.33052631578947367, "grad_norm": 0.013590642251074314, "learning_rate": 8.652582644394657e-07, "loss": 0.0003, "num_tokens": 77100006.0, "reward": 1.7640494406223297, "reward_std": 0.06568732671439648, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.046875, "rewards/waypoint_pred_accuracy/mean": 0.008001290596129801, "rewards/waypoint_pred_accuracy/std": 0.011422272541230698, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.25, "completions/max_terminated_length": 300.25, "completions/mean_length": 159.173828125, "completions/mean_terminated_length": 159.173828125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.33263157894736844, "grad_norm": 0.027779361233115196, "learning_rate": 8.628872245784545e-07, "loss": 0.0002, "num_tokens": 77572863.0, "reward": 1.8006756454706192, "reward_std": 0.10399453737277398, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.025337843918715696, "rewards/waypoint_pred_accuracy/std": 0.051997271799539925, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.25, "completions/max_terminated_length": 268.25, "completions/mean_length": 145.630859375, "completions/mean_terminated_length": 145.630859375, "completions/min_length": 89.375, "completions/min_terminated_length": 89.375, "epoch": 0.33473684210526317, "grad_norm": 0.010887346230447292, "learning_rate": 8.60499247975626e-07, "loss": 0.0002, "num_tokens": 78038018.0, "reward": 1.707542285323143, "reward_std": 0.15247048148279418, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.04127114170480439, "rewards/waypoint_pred_accuracy/std": 0.06334648191131009, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.375, "completions/max_terminated_length": 280.375, "completions/mean_length": 149.53125, "completions/mean_terminated_length": 149.53125, "completions/min_length": 91.875, "completions/min_terminated_length": 91.875, "epoch": 0.3368421052631579, "grad_norm": 0.025546882301568985, "learning_rate": 8.58094463893347e-07, "loss": -0.0, "num_tokens": 78503378.0, "reward": 1.9160315990447998, "reward_std": 0.04672868221945237, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.020515798863947058, "rewards/waypoint_pred_accuracy/std": 0.023364338963972414, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.5, "completions/max_terminated_length": 260.5, "completions/mean_length": 145.369140625, "completions/mean_terminated_length": 145.369140625, "completions/min_length": 88.25, "completions/min_terminated_length": 88.25, "epoch": 0.3389473684210526, "grad_norm": 0.010935882106423378, "learning_rate": 8.556730025037819e-07, "loss": 0.0004, "num_tokens": 78965327.0, "reward": 1.9171094596385956, "reward_std": 0.06519777842363439, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.02203131265337177, "rewards/waypoint_pred_accuracy/std": 0.024786388293865258, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.625, "completions/max_terminated_length": 259.625, "completions/mean_length": 145.685546875, "completions/mean_terminated_length": 145.685546875, "completions/min_length": 92.375, "completions/min_terminated_length": 92.375, "epoch": 0.3410526315789474, "grad_norm": 0.0, "learning_rate": 8.532349948818453e-07, "loss": 0.0001, "num_tokens": 79427630.0, "reward": 1.5663428604602814, "reward_std": 0.11875003115164873, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.033171423198496086, "rewards/waypoint_pred_accuracy/std": 0.05937501807238732, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.125, "completions/max_terminated_length": 263.125, "completions/mean_length": 146.3671875, "completions/mean_terminated_length": 146.3671875, "completions/min_length": 88.5, "completions/min_terminated_length": 88.5, "epoch": 0.3431578947368421, "grad_norm": 0.0, "learning_rate": 8.507805729981081e-07, "loss": -0.0002, "num_tokens": 79892650.0, "reward": 1.5351474583148956, "reward_std": 0.14698171289637685, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.375, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.08007374802274361, "rewards/waypoint_pred_accuracy/std": 0.07349086113989284, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.125, "completions/max_terminated_length": 244.125, "completions/mean_length": 145.857421875, "completions/mean_terminated_length": 145.857421875, "completions/min_length": 91.125, "completions/min_terminated_length": 91.125, "epoch": 0.3452631578947368, "grad_norm": 0.0, "learning_rate": 8.483098697116535e-07, "loss": -0.0, "num_tokens": 80357985.0, "reward": 1.8520799279212952, "reward_std": 0.07666328936466016, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.05103997113747369, "rewards/waypoint_pred_accuracy/std": 0.03833164387472608, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.125, "completions/max_terminated_length": 244.125, "completions/mean_length": 143.919921875, "completions/mean_terminated_length": 143.919921875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.3473684210526316, "grad_norm": 0.014973700977861881, "learning_rate": 8.45823018762885e-07, "loss": 0.0001, "num_tokens": 80822008.0, "reward": 1.516570344567299, "reward_std": 0.03593069076305255, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.008285194722702727, "rewards/waypoint_pred_accuracy/std": 0.01796534500317648, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.625, "completions/max_terminated_length": 240.625, "completions/mean_length": 138.451171875, "completions/mean_terminated_length": 138.451171875, "completions/min_length": 86.875, "completions/min_terminated_length": 86.875, "epoch": 0.3494736842105263, "grad_norm": 0.012644310481846333, "learning_rate": 8.43320154766287e-07, "loss": -0.0, "num_tokens": 81280415.0, "reward": 1.950978472828865, "reward_std": 0.10077390982041834, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.03798924066802556, "rewards/waypoint_pred_accuracy/std": 0.05038695407711202, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.25, "completions/max_terminated_length": 261.25, "completions/mean_length": 146.03125, "completions/mean_terminated_length": 146.03125, "completions/min_length": 85.625, "completions/min_terminated_length": 85.625, "epoch": 0.35157894736842105, "grad_norm": 0.0, "learning_rate": 8.408014132031385e-07, "loss": 0.0001, "num_tokens": 81743215.0, "reward": 1.7540984004735947, "reward_std": 0.03939264878863469, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.003025781332005908, "rewards/waypoint_pred_accuracy/std": 0.012455621152136035, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.5, "completions/max_terminated_length": 251.5, "completions/mean_length": 144.814453125, "completions/mean_terminated_length": 144.814453125, "completions/min_length": 94.5, "completions/min_terminated_length": 94.5, "epoch": 0.35368421052631577, "grad_norm": 0.021090500056743622, "learning_rate": 8.382669304141789e-07, "loss": -0.0001, "num_tokens": 82207504.0, "reward": 1.7465842962265015, "reward_std": 0.11820045917193056, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06079213417120413, "rewards/waypoint_pred_accuracy/std": 0.059100230197541466, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.375, "completions/max_terminated_length": 265.375, "completions/mean_length": 147.98046875, "completions/mean_terminated_length": 147.98046875, "completions/min_length": 85.625, "completions/min_terminated_length": 85.625, "epoch": 0.35578947368421054, "grad_norm": 2.866966042347485e-06, "learning_rate": 8.35716843592228e-07, "loss": 0.0002, "num_tokens": 82672966.0, "reward": 1.8769371807575226, "reward_std": 0.014908303018460689, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0009685879016880089, "rewards/waypoint_pred_accuracy/std": 0.0074541514106900575, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.875, "completions/max_terminated_length": 250.875, "completions/mean_length": 144.619140625, "completions/mean_terminated_length": 144.619140625, "completions/min_length": 89.75, "completions/min_terminated_length": 89.75, "epoch": 0.35789473684210527, "grad_norm": 0.019311359152197838, "learning_rate": 8.331512907747596e-07, "loss": -0.0, "num_tokens": 83136323.0, "reward": 1.8065136820077896, "reward_std": 0.2033659865359141, "rewards/format_reward_embodied/mean": 0.99609375, "rewards/format_reward_embodied/std": 0.03125, "rewards/stop_prediction_reward/mean": 0.62109375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 0.09466311723218013, "rewards/waypoint_pred_accuracy/std": 0.08210119066467314, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.75, "completions/max_terminated_length": 224.75, "completions/mean_length": 138.552734375, "completions/mean_terminated_length": 138.552734375, "completions/min_length": 83.5, "completions/min_terminated_length": 83.5, "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 8.305704108364301e-07, "loss": 0.0, "num_tokens": 83595486.0, "reward": 1.8751020431518555, "reward_std": 0.0004879390965015773, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 5.101329828358249e-05, "rewards/waypoint_pred_accuracy/std": 0.00024397094620981193, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.75, "completions/max_terminated_length": 263.75, "completions/mean_length": 144.60546875, "completions/mean_terminated_length": 144.60546875, "completions/min_length": 90.25, "completions/min_terminated_length": 90.25, "epoch": 0.36210526315789476, "grad_norm": 0.014394046738743782, "learning_rate": 8.279743434815599e-07, "loss": -0.0004, "num_tokens": 84059668.0, "reward": 1.683645486831665, "reward_std": 0.10552045330405235, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.030299316626042128, "rewards/waypoint_pred_accuracy/std": 0.05022337753325701, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.25, "completions/max_terminated_length": 243.25, "completions/mean_length": 135.642578125, "completions/mean_terminated_length": 135.642578125, "completions/min_length": 80.875, "completions/min_terminated_length": 80.875, "epoch": 0.3642105263157895, "grad_norm": 0.012850490398705006, "learning_rate": 8.253632292365726e-07, "loss": -0.0004, "num_tokens": 84520477.0, "reward": 1.9978020787239075, "reward_std": 0.14740227215224877, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.062377601401009786, "rewards/waypoint_pred_accuracy/std": 0.06588864292415944, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.125, "completions/max_terminated_length": 247.125, "completions/mean_length": 143.791015625, "completions/mean_terminated_length": 143.791015625, "completions/min_length": 88.5, "completions/min_terminated_length": 88.5, "epoch": 0.3663157894736842, "grad_norm": 0.01804622821509838, "learning_rate": 8.227372094423864e-07, "loss": -0.0001, "num_tokens": 84985138.0, "reward": 1.9833861291408539, "reward_std": 0.19392894953489304, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.05614619702100754, "rewards/waypoint_pred_accuracy/std": 0.08133947290480137, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.375, "completions/max_terminated_length": 265.375, "completions/mean_length": 141.953125, "completions/mean_terminated_length": 141.953125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.3684210526315789, "grad_norm": 9.899375072563998e-06, "learning_rate": 8.200964262467656e-07, "loss": -0.0004, "num_tokens": 85444954.0, "reward": 1.394999474287033, "reward_std": 0.05159659555202989, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.376953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.00902318308910971, "rewards/waypoint_pred_accuracy/std": 0.01798579893491592, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.75, "completions/max_terminated_length": 282.75, "completions/mean_length": 143.80078125, "completions/mean_terminated_length": 143.80078125, "completions/min_length": 86.25, "completions/min_terminated_length": 86.25, "epoch": 0.3705263157894737, "grad_norm": 0.013024689629673958, "learning_rate": 8.174410225966239e-07, "loss": -0.0004, "num_tokens": 85907188.0, "reward": 2.009105235338211, "reward_std": 0.1475043022655882, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06802919746895669, "rewards/waypoint_pred_accuracy/std": 0.0659396512298075, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.5, "completions/max_terminated_length": 233.5, "completions/mean_length": 139.0703125, "completions/mean_terminated_length": 139.0703125, "completions/min_length": 87.5, "completions/min_terminated_length": 87.5, "epoch": 0.3726315789473684, "grad_norm": 0.0012392106000334024, "learning_rate": 8.147711422302881e-07, "loss": -0.0, "num_tokens": 86368088.0, "reward": 1.5241148173809052, "reward_std": 0.03582000017365772, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.5, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.012057423907151588, "rewards/waypoint_pred_accuracy/std": 0.017910004946543268, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.75, "completions/max_terminated_length": 238.75, "completions/mean_length": 141.525390625, "completions/mean_terminated_length": 141.525390625, "completions/min_length": 82.875, "completions/min_terminated_length": 82.875, "epoch": 0.37473684210526315, "grad_norm": 0.01618722826242447, "learning_rate": 8.120869296697162e-07, "loss": 0.0001, "num_tokens": 86830181.0, "reward": 1.7583584785461426, "reward_std": 0.02889735234600721, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0041792236801776, "rewards/waypoint_pred_accuracy/std": 0.014448676936513825, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 273.25, "completions/max_terminated_length": 273.25, "completions/mean_length": 141.87890625, "completions/mean_terminated_length": 142.17975044250488, "completions/min_length": 74.875, "completions/min_terminated_length": 87.5, "epoch": 0.37684210526315787, "grad_norm": 0.014032267965376377, "learning_rate": 8.093885302126754e-07, "loss": -0.0004, "num_tokens": 87290599.0, "reward": 1.7515175342559814, "reward_std": 0.040817584842443466, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.001735331054078415, "rewards/waypoint_pred_accuracy/std": 0.012596295913681388, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.5, "completions/max_terminated_length": 246.5, "completions/mean_length": 139.904296875, "completions/mean_terminated_length": 139.904296875, "completions/min_length": 88.375, "completions/min_terminated_length": 88.375, "epoch": 0.37894736842105264, "grad_norm": 0.0, "learning_rate": 8.06676089924877e-07, "loss": -0.0002, "num_tokens": 87750454.0, "reward": 1.8144625127315521, "reward_std": 0.16868100641295314, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.623046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.09570782238733955, "rewards/waypoint_pred_accuracy/std": 0.08269866928458214, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.875, "completions/max_terminated_length": 301.875, "completions/mean_length": 152.48828125, "completions/mean_terminated_length": 152.48828125, "completions/min_length": 94.75, "completions/min_terminated_length": 94.75, "epoch": 0.38105263157894737, "grad_norm": 0.0, "learning_rate": 8.03949755632069e-07, "loss": -0.0, "num_tokens": 88216304.0, "reward": 1.7629780769348145, "reward_std": 0.11227092240005732, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.06898905546404421, "rewards/waypoint_pred_accuracy/std": 0.05613546399399638, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.75, "completions/max_terminated_length": 284.75, "completions/mean_length": 148.99609375, "completions/mean_terminated_length": 148.99609375, "completions/min_length": 93.625, "completions/min_terminated_length": 93.625, "epoch": 0.3831578947368421, "grad_norm": 0.02158622071146965, "learning_rate": 8.01209674912089e-07, "loss": 0.0, "num_tokens": 88681454.0, "reward": 1.9078221917152405, "reward_std": 0.07242656219750643, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0789111019403208, "rewards/waypoint_pred_accuracy/std": 0.036213275976479053, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 280.625, "completions/max_terminated_length": 280.625, "completions/mean_length": 141.5078125, "completions/mean_terminated_length": 141.7366075515747, "completions/min_length": 78.125, "completions/min_terminated_length": 88.0, "epoch": 0.38526315789473686, "grad_norm": 0.017062678933143616, "learning_rate": 7.984559960868759e-07, "loss": -0.0002, "num_tokens": 89142258.0, "reward": 1.696620061993599, "reward_std": 0.11242801253683865, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.036786592652788386, "rewards/waypoint_pred_accuracy/std": 0.0484015071997419, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.25, "completions/max_terminated_length": 276.25, "completions/mean_length": 155.98046875, "completions/mean_terminated_length": 155.98046875, "completions/min_length": 92.625, "completions/min_terminated_length": 92.625, "epoch": 0.3873684210526316, "grad_norm": 0.0, "learning_rate": 7.956888682144403e-07, "loss": 0.0002, "num_tokens": 89612840.0, "reward": 2.0942837595939636, "reward_std": 0.09505448490381241, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.998046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.04811845096992329, "rewards/waypoint_pred_accuracy/std": 0.04697974817827344, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 147.583984375, "completions/mean_terminated_length": 147.583984375, "completions/min_length": 89.875, "completions/min_terminated_length": 89.875, "epoch": 0.3894736842105263, "grad_norm": 0.0, "learning_rate": 7.929084410807964e-07, "loss": -0.0001, "num_tokens": 90077523.0, "reward": 1.80657397210598, "reward_std": 0.04950862978614623, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.75, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.028286980984023265, "rewards/waypoint_pred_accuracy/std": 0.02475431633633779, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.5, "completions/max_terminated_length": 265.5, "completions/mean_length": 142.900390625, "completions/mean_terminated_length": 142.900390625, "completions/min_length": 85.625, "completions/min_terminated_length": 85.625, "epoch": 0.391578947368421, "grad_norm": 1.3056469470029697e-05, "learning_rate": 7.90114865191855e-07, "loss": -0.0, "num_tokens": 90538208.0, "reward": 1.9060467183589935, "reward_std": 0.08156437341584333, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.015523360966641753, "rewards/waypoint_pred_accuracy/std": 0.040782188392672225, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.75, "completions/max_terminated_length": 290.75, "completions/mean_length": 152.669921875, "completions/mean_terminated_length": 152.669921875, "completions/min_length": 87.75, "completions/min_terminated_length": 87.75, "epoch": 0.3936842105263158, "grad_norm": 0.024091186001896858, "learning_rate": 7.873082917652743e-07, "loss": 0.0003, "num_tokens": 91006007.0, "reward": 1.6670926809310913, "reward_std": 0.05647587693238165, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.021046354011408017, "rewards/waypoint_pred_accuracy/std": 0.028237939174382873, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.5, "completions/max_terminated_length": 278.5, "completions/mean_length": 156.775390625, "completions/mean_terminated_length": 156.775390625, "completions/min_length": 96.75, "completions/min_terminated_length": 96.75, "epoch": 0.3957894736842105, "grad_norm": 0.012568764388561249, "learning_rate": 7.844888727222768e-07, "loss": -0.0005, "num_tokens": 91473412.0, "reward": 1.6338473558425903, "reward_std": 0.022411950994865038, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.004423671395670681, "rewards/waypoint_pred_accuracy/std": 0.011205977587451343, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 157.619140625, "completions/mean_terminated_length": 157.619140625, "completions/min_length": 94.125, "completions/min_terminated_length": 94.125, "epoch": 0.39789473684210525, "grad_norm": 0.01582680270075798, "learning_rate": 7.816567606794239e-07, "loss": 0.0011, "num_tokens": 91943361.0, "reward": 1.7881784588098526, "reward_std": 0.08320084138540551, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.02104235623846762, "rewards/waypoint_pred_accuracy/std": 0.02681039142771624, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.25, "completions/max_terminated_length": 299.25, "completions/mean_length": 159.732421875, "completions/mean_terminated_length": 159.732421875, "completions/min_length": 94.125, "completions/min_terminated_length": 94.125, "epoch": 0.4, "grad_norm": 0.020354358479380608, "learning_rate": 7.788121089403557e-07, "loss": 0.0002, "num_tokens": 92414584.0, "reward": 1.804437667131424, "reward_std": 0.0880438498343139, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.748046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.028195410479389338, "rewards/waypoint_pred_accuracy/std": 0.036214622034558275, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.5, "completions/max_terminated_length": 291.5, "completions/mean_length": 157.779296875, "completions/mean_terminated_length": 157.779296875, "completions/min_length": 89.5, "completions/min_terminated_length": 89.5, "epoch": 0.40210526315789474, "grad_norm": 0.01398465782403946, "learning_rate": 7.759550714874924e-07, "loss": 0.0002, "num_tokens": 92884039.0, "reward": 2.01309472322464, "reward_std": 0.11034491898243459, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.0690473628026318, "rewards/waypoint_pred_accuracy/std": 0.0551724555417934, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.5, "completions/max_terminated_length": 267.5, "completions/mean_length": 148.189453125, "completions/mean_terminated_length": 148.189453125, "completions/min_length": 87.5, "completions/min_terminated_length": 87.5, "epoch": 0.40421052631578946, "grad_norm": 0.010616026818752289, "learning_rate": 7.730858029736989e-07, "loss": -0.0003, "num_tokens": 93347752.0, "reward": 1.744140625, "reward_std": 0.03754602471734003, "rewards/format_reward_embodied/mean": 0.998046875, "rewards/format_reward_embodied/std": 0.015625, "rewards/stop_prediction_reward/mean": 0.74609375, "rewards/stop_prediction_reward/std": 0.03125, "rewards/waypoint_pred_accuracy/mean": 7.299939541738366e-10, "rewards/waypoint_pred_accuracy/std": 5.8379368006456025e-09, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.5, "completions/max_terminated_length": 277.5, "completions/mean_length": 153.03515625, "completions/mean_terminated_length": 153.03515625, "completions/min_length": 91.375, "completions/min_terminated_length": 91.375, "epoch": 0.4063157894736842, "grad_norm": 0.0, "learning_rate": 7.702044587139137e-07, "loss": 0.0002, "num_tokens": 93813690.0, "reward": 1.752562627196312, "reward_std": 0.019909796021238435, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.751953125, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.0003047559775951213, "rewards/waypoint_pred_accuracy/std": 0.0021423957914523876, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.75, "completions/max_terminated_length": 269.75, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 93.375, "completions/min_terminated_length": 93.375, "epoch": 0.40842105263157896, "grad_norm": 0.016295522451400757, "learning_rate": 7.673111946767413e-07, "loss": 0.0003, "num_tokens": 94280122.0, "reward": 2.099191278219223, "reward_std": 0.13045725226402283, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.875, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.11209563538432243, "rewards/waypoint_pred_accuracy/std": 0.06522862054408278, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.75, "completions/max_terminated_length": 285.75, "completions/mean_length": 153.009765625, "completions/mean_terminated_length": 153.009765625, "completions/min_length": 88.75, "completions/min_terminated_length": 88.75, "epoch": 0.4105263157894737, "grad_norm": 0.018163178116083145, "learning_rate": 7.644061674760101e-07, "loss": -0.0001, "num_tokens": 94746239.0, "reward": 2.2828119099140167, "reward_std": 0.21475151367485523, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.14140595760660155, "rewards/waypoint_pred_accuracy/std": 0.10737574858109461, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.625, "completions/max_terminated_length": 289.625, "completions/mean_length": 150.8828125, "completions/mean_terminated_length": 150.8828125, "completions/min_length": 90.625, "completions/min_terminated_length": 90.625, "epoch": 0.4126315789473684, "grad_norm": 0.022137505933642387, "learning_rate": 7.61489534362294e-07, "loss": 0.0002, "num_tokens": 95213059.0, "reward": 1.5212143957614899, "reward_std": 0.03324593836441636, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.498046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.01158377673709765, "rewards/waypoint_pred_accuracy/std": 0.019431066000834107, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 141.884765625, "completions/mean_terminated_length": 141.884765625, "completions/min_length": 87.25, "completions/min_terminated_length": 87.25, "epoch": 0.4147368421052632, "grad_norm": 0.0, "learning_rate": 7.585614532144007e-07, "loss": 0.0001, "num_tokens": 95674952.0, "reward": 1.8150294870138168, "reward_std": 0.1609834808987216, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.625, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.09501473996715504, "rewards/waypoint_pred_accuracy/std": 0.08049174577718077, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.375, "completions/max_terminated_length": 255.375, "completions/mean_length": 144.384765625, "completions/mean_terminated_length": 144.384765625, "completions/min_length": 85.625, "completions/min_terminated_length": 85.625, "epoch": 0.4168421052631579, "grad_norm": 0.0, "learning_rate": 7.556220825308261e-07, "loss": -0.0002, "num_tokens": 96137869.0, "reward": 1.887547492980957, "reward_std": 0.04732609516941011, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 0.873046875, "rewards/stop_prediction_reward/std": 0.015625, "rewards/waypoint_pred_accuracy/mean": 0.007250291717035162, "rewards/waypoint_pred_accuracy/std": 0.020060429181877233, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.875, "completions/max_terminated_length": 287.875, "completions/mean_length": 149.78125, "completions/mean_terminated_length": 149.78125, "completions/min_length": 93.625, "completions/min_terminated_length": 93.625, "epoch": 0.4189473684210526, "grad_norm": 0.0, "learning_rate": 7.526715814211739e-07, "loss": -0.0004, "num_tokens": 96601181.0, "reward": 2.0124606788158417, "reward_std": 0.05255796667188406, "rewards/format_reward_embodied/mean": 1.0, "rewards/format_reward_embodied/std": 0.0, "rewards/stop_prediction_reward/mean": 1.0, "rewards/stop_prediction_reward/std": 0.0, "rewards/waypoint_pred_accuracy/mean": 0.006230339058674872, "rewards/waypoint_pred_accuracy/std": 0.02627899032086134, "step": 199 }, { "epoch": 0.42105263157894735, "grad_norm": 0.0, "learning_rate": 7.49710109597544e-07, "loss": -0.0, "step": 200 }, { "epoch": 0.42105263157894735, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 273.93, "eval_completions/max_terminated_length": 273.93, "eval_completions/mean_length": 149.2509895324707, "eval_completions/mean_terminated_length": 149.2509895324707, "eval_completions/min_length": 89.12, "eval_completions/min_terminated_length": 89.12, "eval_loss": -0.00012273716856725514, "eval_num_tokens": 97069106.0, "eval_reward": 1.8643140089511872, "eval_reward_std": 0.08826646614328094, "eval_rewards/format_reward_embodied/mean": 1.0, "eval_rewards/format_reward_embodied/std": 0.0, "eval_rewards/stop_prediction_reward/mean": 0.7696875, "eval_rewards/stop_prediction_reward/std": 0.005, "eval_rewards/waypoint_pred_accuracy/mean": 0.04731326040690851, "eval_rewards/waypoint_pred_accuracy/std": 0.041636976439897125, "eval_runtime": 934.4513, "eval_samples_per_second": 0.107, "eval_steps_per_second": 0.002, "step": 200 } ], "logging_steps": 1, "max_steps": 475, "num_input_tokens_seen": 97069106, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }