IliaLarchenko committed on
Commit
2355d61
·
verified ·
1 Parent(s): 3e85f05

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +32 -6
  2. config.json +120 -46
  3. model.safetensors +1 -1
  4. train_config.json +280 -0
README.md CHANGED
@@ -15,24 +15,50 @@ pipeline_tag: robotics
15
 
16
  Read more about the model and implementation details in the [DOT Policy repository](https://github.com/IliaLarchenko/dot_policy).
17
 
18
- This model is trained using the [LeRobot library](https://huggingface.co/lerobot) and achieves state-of-the-art results on behavior cloning on the PushT keypoints dataset. It achieves 84.5% success rate (and 0.964 average max reward) vs. ~78% for the previous state-of-the-art model or 69% that I managed to reproduce using VQ-BET implementation in LeRobot.
19
 
20
  This result is achieved without the checkpoint selection. If you are interested in an even better model with a success rate of ~94% (but harder to reproduce as it requires some parameters tuning and checkpoint selection), please refer to [this model](https://huggingface.co/IliaLarchenko/dot_pusht_keypoints_best)
21
 
22
- You can use this model by installing LeRobot from [this branch](https://github.com/IliaLarchenko/lerobot/tree/dot)
23
 
24
  To train the model:
25
 
26
  ```bash
27
- python lerobot/scripts/train.py policy=dot_pusht_keypoints env=pusht env.gym.obs_type=environment_state_agent_pos
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  ```
29
 
30
  To evaluate the model:
31
 
32
  ```bash
33
- python lerobot/scripts/eval.py -p IliaLarchenko/dot_pusht_keypoints eval.n_episodes=1000 eval.batch_size=100 seed=1000000
 
 
 
 
 
 
 
34
  ```
35
 
36
  Model size:
37
- - Total parameters: 2.1m
38
- - Trainable parameters: 2.1m
 
 
 
 
15
 
16
  Read more about the model and implementation details in the [DOT Policy repository](https://github.com/IliaLarchenko/dot_policy).
17
 
18
+ This model is trained using the [LeRobot library](https://huggingface.co/lerobot) and achieves state-of-the-art results on behavior cloning on the PushT keypoints dataset. It achieves 88.1% success rate (and 0.969 average max reward) vs. ~78% for the previous state-of-the-art model or 69% that I managed to reproduce using VQ-BET implementation in LeRobot.
19
 
20
  This result is achieved without the checkpoint selection. If you are interested in an even better model with a success rate of ~94% (but harder to reproduce as it requires some parameters tuning and checkpoint selection), please refer to [this model](https://huggingface.co/IliaLarchenko/dot_pusht_keypoints_best)
21
 
22
+ You can use this model by installing LeRobot from [this branch](https://github.com/IliaLarchenko/lerobot/tree/dot_new_config)
23
 
24
  To train the model:
25
 
26
  ```bash
27
+ python lerobot/scripts/train.py \
28
+ --policy.type=dot \
29
+ --dataset.repo_id=lerobot/pusht_keypoints \
30
+ --env.type=pusht \
31
+ --env.task=PushT-v0 \
32
+ --output_dir=outputs/train/pusht_keypoints \
33
+ --batch_size=24 \
34
+ --log_freq=1000 \
35
+ --eval_freq=10000 \
36
+ --save_freq=50000 \
37
+ --offline.steps=1000000 \
38
+ --seed=100000 \
39
+ --wandb.enable=true \
40
+ --num_workers=24 \
41
+ --use_amp=true \
42
+ --device=cuda \
43
+ --policy.return_every_n=2
44
  ```
45
 
46
  To evaluate the model:
47
 
48
  ```bash
49
+ python lerobot/scripts/eval.py \
50
+ --policy.path=IliaLarchenko/dot_pusht_keypoints \
51
+ --env.type=pusht \
52
+ --env.task=PushT-v0 \
53
+ --eval.n_episodes=1000 \
54
+ --eval.batch_size=100 \
55
+ --env.obs_type=environment_state_agent_pos \
56
+ --seed=1000000
57
  ```
58
 
59
  Model size:
60
+ - Total parameters: 14.1m
61
+ - Trainable parameters: 2.9m
62
+
63
+
64
+ Note: the results are even slightly better than the ones reported in the repository. There was a big update in the LeRobot library; I did all the original training and evaluation using the older version of LeRobot. This model was trained using the new version of the library with the same parameters. The older version of the library that was used for the original experiments is available in [this branch](https://github.com/IliaLarchenko/lerobot/tree/dot)
config.json CHANGED
@@ -1,48 +1,122 @@
1
  {
2
- "alpha": 0.75,
3
- "crop_scale": 1.0,
4
- "dim_feedforward": 512,
5
- "dim_model": 128,
6
- "dropout": 0.1,
7
- "inference_horizon": 20,
8
- "input_normalization_modes": {
9
- "observation.environment_state": "min_max",
10
- "observation.state": "min_max"
11
- },
12
- "input_shapes": {
13
- "observation.environment_state": [
14
- 16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  ],
16
- "observation.state": [
17
- 2
18
- ]
19
- },
20
- "lookback_aug": 5,
21
- "lookback_obs_steps": 10,
22
- "lora_rank": 20,
23
- "merge_lora": true,
24
- "n_decoder_layers": 8,
25
- "n_heads": 8,
26
- "n_obs_steps": 3,
27
- "noise_decay": 0.999995,
28
- "output_normalization_modes": {
29
- "action": "min_max"
30
- },
31
- "output_shapes": {
32
- "action": [
33
- 2
34
- ]
35
- },
36
- "pre_norm": true,
37
- "predict_every_n": 1,
38
- "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
39
- "rescale_shape": [
40
- 96,
41
- 96
42
- ],
43
- "return_every_n": 2,
44
- "state_noise": 0.01,
45
- "train_alpha": 0.9,
46
- "train_horizon": 20,
47
- "vision_backbone": "resnet18"
48
- }
 
1
  {
2
+ "type": "dot",
3
+ "n_obs_steps": 3,
4
+ "normalization_mapping": {
5
+ "VISUAL": "MEAN_STD",
6
+ "STATE": "MIN_MAX",
7
+ "ENV": "MIN_MAX",
8
+ "ACTION": "MIN_MAX"
9
+ },
10
+ "input_features": {
11
+ "observation.state": {
12
+ "type": "STATE",
13
+ "shape": [
14
+ 2
15
+ ]
16
+ },
17
+ "observation.environment_state": {
18
+ "type": "ENV",
19
+ "shape": [
20
+ 16
21
+ ]
22
+ }
23
+ },
24
+ "output_features": {
25
+ "action": {
26
+ "type": "ACTION",
27
+ "shape": [
28
+ 2
29
+ ]
30
+ }
31
+ },
32
+ "train_horizon": 20,
33
+ "inference_horizon": 20,
34
+ "lookback_obs_steps": 10,
35
+ "lookback_aug": 5,
36
+ "override_dataset_stats": false,
37
+ "new_dataset_stats": {
38
+ "action": {
39
+ "max": [
40
+ 512.0,
41
+ 512.0
42
+ ],
43
+ "min": [
44
+ 0.0,
45
+ 0.0
46
+ ]
47
+ },
48
+ "observation.environment_state": {
49
+ "max": [
50
+ 512.0,
51
+ 512.0,
52
+ 512.0,
53
+ 512.0,
54
+ 512.0,
55
+ 512.0,
56
+ 512.0,
57
+ 512.0,
58
+ 512.0,
59
+ 512.0,
60
+ 512.0,
61
+ 512.0,
62
+ 512.0,
63
+ 512.0,
64
+ 512.0,
65
+ 512.0
66
+ ],
67
+ "min": [
68
+ 0.0,
69
+ 0.0,
70
+ 0.0,
71
+ 0.0,
72
+ 0.0,
73
+ 0.0,
74
+ 0.0,
75
+ 0.0,
76
+ 0.0,
77
+ 0.0,
78
+ 0.0,
79
+ 0.0,
80
+ 0.0,
81
+ 0.0,
82
+ 0.0,
83
+ 0.0
84
+ ]
85
+ },
86
+ "observation.state": {
87
+ "max": [
88
+ 512.0,
89
+ 512.0
90
+ ],
91
+ "min": [
92
+ 0.0,
93
+ 0.0
94
+ ]
95
+ }
96
+ },
97
+ "vision_backbone": "resnet18",
98
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
99
+ "pre_norm": true,
100
+ "lora_rank": 20,
101
+ "merge_lora": false,
102
+ "dim_model": 128,
103
+ "n_heads": 8,
104
+ "dim_feedforward": 512,
105
+ "n_decoder_layers": 8,
106
+ "rescale_shape": [
107
+ 96,
108
+ 96
109
  ],
110
+ "crop_scale": 1.0,
111
+ "state_noise": 0.01,
112
+ "noise_decay": 0.999995,
113
+ "dropout": 0.1,
114
+ "alpha": 0.75,
115
+ "train_alpha": 0.9,
116
+ "predict_every_n": 1,
117
+ "return_every_n": 2,
118
+ "optimizer_lr": 0.0001,
119
+ "optimizer_min_lr": 0.0001,
120
+ "optimizer_lr_cycle_steps": 300000,
121
+ "optimizer_weight_decay": 1e-05
122
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b45aaac6d363fb26f405462dd901b45d1b436b686c163c9b4d2b71085bdc1aa5
3
  size 8523444
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7a88ed5e7a822b3501cbca499af46a156cf4bd52b7ee6bb60249b697a28c652
3
  size 8523444
train_config.json ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dataset": {
3
+ "repo_id": "lerobot/pusht_keypoints",
4
+ "episodes": null,
5
+ "image_transforms": {
6
+ "enable": false,
7
+ "max_num_transforms": 3,
8
+ "random_order": false,
9
+ "tfs": {
10
+ "brightness": {
11
+ "weight": 1.0,
12
+ "type": "ColorJitter",
13
+ "kwargs": {
14
+ "brightness": [
15
+ 0.8,
16
+ 1.2
17
+ ]
18
+ }
19
+ },
20
+ "contrast": {
21
+ "weight": 1.0,
22
+ "type": "ColorJitter",
23
+ "kwargs": {
24
+ "contrast": [
25
+ 0.8,
26
+ 1.2
27
+ ]
28
+ }
29
+ },
30
+ "saturation": {
31
+ "weight": 1.0,
32
+ "type": "ColorJitter",
33
+ "kwargs": {
34
+ "saturation": [
35
+ 0.5,
36
+ 1.5
37
+ ]
38
+ }
39
+ },
40
+ "hue": {
41
+ "weight": 1.0,
42
+ "type": "ColorJitter",
43
+ "kwargs": {
44
+ "hue": [
45
+ -0.05,
46
+ 0.05
47
+ ]
48
+ }
49
+ },
50
+ "sharpness": {
51
+ "weight": 1.0,
52
+ "type": "SharpnessJitter",
53
+ "kwargs": {
54
+ "sharpness": [
55
+ 0.5,
56
+ 1.5
57
+ ]
58
+ }
59
+ }
60
+ }
61
+ },
62
+ "local_files_only": false,
63
+ "use_imagenet_stats": true,
64
+ "video_backend": "pyav"
65
+ },
66
+ "env": {
67
+ "type": "pusht",
68
+ "task": "PushT-v0",
69
+ "fps": 10,
70
+ "features": {
71
+ "action": {
72
+ "type": "ACTION",
73
+ "shape": [
74
+ 2
75
+ ]
76
+ },
77
+ "agent_pos": {
78
+ "type": "STATE",
79
+ "shape": [
80
+ 2
81
+ ]
82
+ },
83
+ "environment_state": {
84
+ "type": "ENV",
85
+ "shape": [
86
+ 16
87
+ ]
88
+ }
89
+ },
90
+ "features_map": {
91
+ "action": "action",
92
+ "agent_pos": "observation.state",
93
+ "environment_state": "observation.environment_state",
94
+ "pixels": "observation.image"
95
+ },
96
+ "episode_length": 300,
97
+ "obs_type": "environment_state_agent_pos",
98
+ "render_mode": "rgb_array",
99
+ "visualization_width": 384,
100
+ "visualization_height": 384
101
+ },
102
+ "policy": {
103
+ "type": "dot",
104
+ "n_obs_steps": 3,
105
+ "normalization_mapping": {
106
+ "VISUAL": "MEAN_STD",
107
+ "STATE": "MIN_MAX",
108
+ "ENV": "MIN_MAX",
109
+ "ACTION": "MIN_MAX"
110
+ },
111
+ "input_features": {
112
+ "observation.state": {
113
+ "type": "STATE",
114
+ "shape": [
115
+ 2
116
+ ]
117
+ },
118
+ "observation.environment_state": {
119
+ "type": "ENV",
120
+ "shape": [
121
+ 16
122
+ ]
123
+ }
124
+ },
125
+ "output_features": {
126
+ "action": {
127
+ "type": "ACTION",
128
+ "shape": [
129
+ 2
130
+ ]
131
+ }
132
+ },
133
+ "train_horizon": 20,
134
+ "inference_horizon": 20,
135
+ "lookback_obs_steps": 10,
136
+ "lookback_aug": 5,
137
+ "override_dataset_stats": false,
138
+ "new_dataset_stats": {
139
+ "action": {
140
+ "max": [
141
+ 512.0,
142
+ 512.0
143
+ ],
144
+ "min": [
145
+ 0.0,
146
+ 0.0
147
+ ]
148
+ },
149
+ "observation.environment_state": {
150
+ "max": [
151
+ 512.0,
152
+ 512.0,
153
+ 512.0,
154
+ 512.0,
155
+ 512.0,
156
+ 512.0,
157
+ 512.0,
158
+ 512.0,
159
+ 512.0,
160
+ 512.0,
161
+ 512.0,
162
+ 512.0,
163
+ 512.0,
164
+ 512.0,
165
+ 512.0,
166
+ 512.0
167
+ ],
168
+ "min": [
169
+ 0.0,
170
+ 0.0,
171
+ 0.0,
172
+ 0.0,
173
+ 0.0,
174
+ 0.0,
175
+ 0.0,
176
+ 0.0,
177
+ 0.0,
178
+ 0.0,
179
+ 0.0,
180
+ 0.0,
181
+ 0.0,
182
+ 0.0,
183
+ 0.0,
184
+ 0.0
185
+ ]
186
+ },
187
+ "observation.state": {
188
+ "max": [
189
+ 512.0,
190
+ 512.0
191
+ ],
192
+ "min": [
193
+ 0.0,
194
+ 0.0
195
+ ]
196
+ }
197
+ },
198
+ "vision_backbone": "resnet18",
199
+ "pretrained_backbone_weights": "ResNet18_Weights.IMAGENET1K_V1",
200
+ "pre_norm": true,
201
+ "lora_rank": 20,
202
+ "merge_lora": false,
203
+ "dim_model": 128,
204
+ "n_heads": 8,
205
+ "dim_feedforward": 512,
206
+ "n_decoder_layers": 8,
207
+ "rescale_shape": [
208
+ 96,
209
+ 96
210
+ ],
211
+ "crop_scale": 1.0,
212
+ "state_noise": 0.01,
213
+ "noise_decay": 0.999995,
214
+ "dropout": 0.1,
215
+ "alpha": 0.75,
216
+ "train_alpha": 0.9,
217
+ "predict_every_n": 1,
218
+ "return_every_n": 2,
219
+ "optimizer_lr": 0.0001,
220
+ "optimizer_min_lr": 0.0001,
221
+ "optimizer_lr_cycle_steps": 300000,
222
+ "optimizer_weight_decay": 1e-05
223
+ },
224
+ "output_dir": "outputs/train/pusht_keypoints",
225
+ "job_name": "pusht_dot",
226
+ "resume": false,
227
+ "device": "cuda",
228
+ "use_amp": true,
229
+ "seed": 100000,
230
+ "num_workers": 24,
231
+ "batch_size": 24,
232
+ "eval_freq": 10000,
233
+ "log_freq": 1000,
234
+ "save_checkpoint": true,
235
+ "save_freq": 50000,
236
+ "offline": {
237
+ "steps": 1000000
238
+ },
239
+ "online": {
240
+ "steps": 0,
241
+ "rollout_n_episodes": 1,
242
+ "rollout_batch_size": 1,
243
+ "steps_between_rollouts": null,
244
+ "sampling_ratio": 0.5,
245
+ "env_seed": null,
246
+ "buffer_capacity": null,
247
+ "buffer_seed_size": 0,
248
+ "do_rollout_async": false
249
+ },
250
+ "use_policy_training_preset": true,
251
+ "optimizer": {
252
+ "type": "adamw",
253
+ "lr": 0.0001,
254
+ "weight_decay": 1e-05,
255
+ "grad_clip_norm": 10.0,
256
+ "betas": [
257
+ 0.9,
258
+ 0.999
259
+ ],
260
+ "eps": 1e-08
261
+ },
262
+ "scheduler": {
263
+ "type": "cosine_annealing",
264
+ "num_warmup_steps": 0,
265
+ "min_lr": 0.0001,
266
+ "T_max": 300000
267
+ },
268
+ "eval": {
269
+ "n_episodes": 50,
270
+ "batch_size": 50,
271
+ "use_async_envs": false
272
+ },
273
+ "wandb": {
274
+ "enable": true,
275
+ "disable_artifact": false,
276
+ "project": "pusht",
277
+ "entity": null,
278
+ "notes": null
279
+ }
280
+ }