xizaoqu committed
Commit ae8fd03 · 1 Parent(s): faeb2a7
algorithms/common/base_algo.py CHANGED
@@ -12,7 +12,6 @@ class BaseAlgo(ABC):
     def __init__(self, cfg: DictConfig):
         super().__init__()
         self.cfg = cfg
-        self.debug = self.cfg.debug
 
     @abstractmethod
     def run(*args: Any, **kwargs: Any) -> Any:
algorithms/common/base_pytorch_algo.py CHANGED
@@ -21,7 +21,6 @@ class BasePytorchAlgo(pl.LightningModule, ABC):
     def __init__(self, cfg: DictConfig):
         super().__init__()
         self.cfg = cfg
-        self.debug = self.cfg.debug
         self._build_model()
 
     @abstractmethod
algorithms/worldmem/df_video.py CHANGED
@@ -379,7 +379,7 @@ class WorldMemMinecraft(DiffusionForcingBase):
             ref_mode=self.ref_mode
         )
 
-        self.register_data_mean_std(self.cfg.data_mean, self.cfg.data_std)
+        # self.register_data_mean_std(self.cfg.data_mean, self.cfg.data_std)
         self.validation_lpips_model = LearnedPerceptualImagePatchSimilarity()
 
         vae = VAE_models["vit-l-20-shallow-encoder"]()
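Disabling the register_data_mean_std call means the flat demo config does not have to provide data_mean / data_std. As a hedged sketch only (the repository's actual helper may differ), such a method typically registers the statistics as module buffers used for input normalization:

    import torch
    import torch.nn as nn

    class NormalizingModule(nn.Module):
        def register_data_mean_std(self, mean, std):
            # Assumed behavior: keep normalization statistics as non-trainable
            # buffers so they follow the module across devices and checkpoints.
            self.register_buffer("data_mean", torch.as_tensor(mean, dtype=torch.float32))
            self.register_buffer("data_std", torch.as_tensor(std, dtype=torch.float32))

        def normalize(self, x):
            return (x - self.data_mean) / self.data_std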
app.py CHANGED
@@ -23,6 +23,8 @@ import subprocess
 from PIL import Image
 from datetime import datetime
 import spaces
+from algorithms.worldmem import WorldMemMinecraft
+from huggingface_hub import hf_hub_download
 
 ACTION_KEYS = [
     "inventory",
@@ -65,6 +67,16 @@ KEY_TO_ACTION = {
     "1": ("hotbar.1", 1),
 }
 
+def load_custom_checkpoint(algo, checkpoint_path):
+    hf_ckpt = str(checkpoint_path).split('/')
+    repo_id = '/'.join(hf_ckpt[:2])
+    file_name = '/'.join(hf_ckpt[2:])
+    model_path = hf_hub_download(repo_id=repo_id,
+                                 filename=file_name)
+    ckpt = torch.load(model_path, map_location=torch.device('cpu'))
+    algo.load_state_dict(ckpt['state_dict'], strict=False)
+
+
 def parse_input_to_tensor(input_str):
     """
     Convert an input string into a (sequence_length, 25) tensor, where each row is a one-hot representation
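The added load_custom_checkpoint helper above interprets a configured checkpoint path as `<user>/<repo>/<file inside repo>` on the Hugging Face Hub. A small illustration of that split, using one of the paths from configurations/huggingface.yaml; the download and state-dict loading steps are only indicated in comments:

    checkpoint_path = "yslan/worldmem_checkpoints/diffusion_only.ckpt"

    parts = str(checkpoint_path).split('/')
    repo_id = '/'.join(parts[:2])    # "yslan/worldmem_checkpoints"
    file_name = '/'.join(parts[2:])  # "diffusion_only.ckpt"

    # hf_hub_download(repo_id=repo_id, filename=file_name) would then fetch the
    # file, and torch.load(..., map_location="cpu") + load_state_dict(strict=False)
    # would load it into the target submodule.
    print(repo_id, file_name)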
@@ -157,265 +169,223 @@ def save_video(frames, path="output.mp4", fps=10):
     subprocess.run(ffmpeg_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
     return path
 
-class InteractiveRunner:
-    def __init__(self, algo):
-        self.algo = algo
 
-    @spaces.GPU()
-    @torch.autocast("cuda")
-    def run(self, first_frame, action, first_pose, curr_frame, device):
-        return self.algo.interactive(first_frame, action, first_pose, curr_frame, device=device)
 
 
-@hydra.main(
-    version_base=None,
-    config_path="configurations",
-    config_name="huggingface",
-)
-def run(cfg: DictConfig):
 
-    algo = run_local(cfg)
-    algo.to(device)
 
-    algodevice = next(algo.parameters()).device
-    print("algo:", algodevice)
 
-    actions = torch.zeros((1, 25))
-    poses = torch.zeros((1, 5))
-
     memory_frames.append(load_image_as_tensor(DEFAULT_IMAGE))
 
-    runner = InteractiveRunner(algo)
-
-    algodevice = next(runner.algo.parameters()).device
-    print("runner.algo:", algodevice)
-
-    # @spaces.GPU()
-    # def run_interactive(first_frame, action, first_pose, curr_frame, device):
-    #     global algo
-    #     new_frame = algo.interactive(first_frame,
-    #                                  action,
-    #                                  first_pose,
-    #                                  curr_frame,
-    #                                  device=device)
-    #     return new_frame
-
-    def set_denoising_steps(denoising_steps, sampling_timesteps_state):
-        runner.algo.sampling_timesteps = denoising_steps
-        runner.algo.diffusion_model.sampling_timesteps = denoising_steps
-        sampling_timesteps_state = denoising_steps
-        print("set denoising steps to", runner.algo.sampling_timesteps)
-        return sampling_timesteps_state
-
-    def update_image_and_log(keys):
-        actions = parse_input_to_tensor(keys)
-        global input_history
-        global memory_curr_frame
-
-        print("algo frame:", len(runner.algo.frames))
-
-        for i in range(len(actions)):
-            memory_curr_frame += 1
-
-            # new_frame = run_interactive(memory_frames[0],
-            #                             actions[i],
-            #                             None,
-            #                             memory_curr_frame,
-            #                             device=device)
-
-            new_frame = runner.run(
-                memory_frames[0],
-                actions[i],
-                None,
-                memory_curr_frame,
-                device
-            )
-
-            print("algo frame:", len(runner.algo.frames))
-
-            memory_frames.append(new_frame)
-
-        out_video = torch.stack(memory_frames)
-        out_video = out_video.permute(0,2,3,1).numpy()
-        out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
-        out_video = (out_video * 255).astype(np.uint8)
-
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        os.makedirs("outputs_gradio", exist_ok=True)
-        filename = f"outputs_gradio/{timestamp}.mp4"
-        save_video(out_video, filename)
-
-        input_history += keys
-        return out_video[-1], filename, input_history
-
-    def reset():
-        global memory_curr_frame
-        global input_history
-        global memory_frames
-
-        # runner.algo.to(device)
-        algodevice = next(runner.algo.parameters()).device
-        print(algodevice)
-        runner.algo.reset()
-        memory_frames = []
-        memory_frames.append(load_image_as_tensor(DEFAULT_IMAGE))
-        memory_curr_frame = 0
-        input_history = ""
-
-        # _ = run_interactive(memory_frames[0],
-        #                     actions[0],
-        #                     poses[0],
-        #                     memory_curr_frame,
-        #                     device=device)
-        #
-
-        new_frame = runner.run(
-            memory_frames[0],
-            actions[0],
-            poses[0],
-            memory_curr_frame,
-            device
-        )
-
-        return input_history, DEFAULT_IMAGE
-
-    def on_image_click(SELECTED_IMAGE):
         global DEFAULT_IMAGE
         DEFAULT_IMAGE = SELECTED_IMAGE
         reset()
         return SELECTED_IMAGE
 
-    # new_frame = runner.run(
-    #     memory_frames[0],
-    #     actions[0],
-    #     poses[0],
-    #     memory_curr_frame,
-    #     device
-    # )
-
-    # print("first algo frame:", len(algo.frames))
 
-    css = """
-    h1 {
-    text-align: center;
-    display:block;
-    }
-    """
 
-    with gr.Blocks(css=css) as demo:
-        gr.Markdown(
-            """
-            # WORLDMEM: Long-term Consistent World Generation with Memory
-            """
-        )
-
-        # <div style="text-align: center;">
-        #     <!-- Public Website -->
-        #     <a style="display:inline-block" href="https://nirvanalan.github.io/projects/GA/">
-        #         <img src="https://img.shields.io/badge/public_website-8A2BE2">
-        #     </a>
-
-        #     <!-- GitHub Stars -->
-        #     <a style="display:inline-block; margin-left: .5em" href="https://github.com/NIRVANALAN/GaussianAnything">
-        #         <img src="https://img.shields.io/github/stars/NIRVANALAN/GaussianAnything?style=social">
-        #     </a>
-
-        #     <!-- Project Page -->
-        #     <a style="display:inline-block; margin-left: .5em" href="https://nirvanalan.github.io/projects/GA/">
-        #         <img src="https://img.shields.io/badge/project_page-blue">
-        #     </a>
-
-        #     <!-- arXiv Paper -->
-        #     <a style="display:inline-block; margin-left: .5em" href="https://arxiv.org/abs/XXXX.XXXXX">
-        #         <img src="https://img.shields.io/badge/arXiv-paper-red">
-        #     </a>
-        # </div>
-
-        with gr.Row(variant="panel"):
-            video_display = gr.Video(autoplay=True, loop=True)
-            image_display = gr.Image(value=DEFAULT_IMAGE, interactive=False, label="Last Frame")
-
-        with gr.Row(variant="panel"):
-            with gr.Column(scale=2):
-                input_box = gr.Textbox(label="Action Sequence", placeholder="Enter action sequence here...", lines=1, max_lines=1)
-                log_output = gr.Textbox(label="History Log", interactive=False)
-            with gr.Column(scale=1):
-                slider = gr.Slider(minimum=10, maximum=50, value=runner.algo.sampling_timesteps, step=1, label="Denoising Steps")
-                submit_button = gr.Button("Generate")
-                reset_btn = gr.Button("Reset")
-
-        sampling_timesteps_state = gr.State(runner.algo.sampling_timesteps)
-
-        example_actions = ["DDDDDDDDEEEEEEEEEESSSAAAAAAAAWWW", "DDDDDDDDDDDDQQQQQQQQQQQQQQQDDDDDDDDDDDD",
-                           "DDDDWWWDDDDDDDDDDDDDDDDDDDDSSSAAAAAAAAAAAAAAAAAAAAAAAA", "SSUNNWWEEEEEEEEEAAA1NNNNNNNNNSSUNNWW"]
-
-        def set_action(action):
-            return action
-
-        gr.Markdown("### Action sequence examples.")
-        with gr.Row():
-            buttons = []
-            for action in example_actions[:2]:
-                with gr.Column(scale=len(action)):
-                    buttons.append(gr.Button(action))
-        with gr.Row():
-            for action in example_actions[2:4]:
-                with gr.Column(scale=len(action)):
-                    buttons.append(gr.Button(action))
-        with gr.Row():
-            for action in example_actions[4:5]:
-                with gr.Column(scale=len(action)):
-                    buttons.append(gr.Button(action))
-
-        for button, action in zip(buttons, example_actions):
-            button.click(set_action, inputs=[gr.State(value=action)], outputs=input_box)
-
-
-        gr.Markdown("### Click on the images below to reset the sequence and generate from the new image.")
-
-        with gr.Row():
-            image_display_1 = gr.Image(value=SUNFLOWERS_IMAGE, interactive=False, label="Sunflower Plains")
-            image_display_2 = gr.Image(value=DESERT_IMAGE, interactive=False, label="Desert")
-            image_display_3 = gr.Image(value=SAVANNA_IMAGE, interactive=False, label="Savanna")
-            image_display_4 = gr.Image(value=ICE_PLAINS_IMAGE, interactive=False, label="Ice Plains")
-            image_display_5 = gr.Image(value=SUNFLOWERS_RAIN_IMAGE, interactive=False, label="Rainy Sunflower Plains")
-            image_display_6 = gr.Image(value=PLACE_IMAGE, interactive=False, label="Place")
-
-        gr.Markdown(
-            """
-            ## Instructions & Notes:
-
-            1. Enter an action sequence in the **"Action Sequence"** text box and click **"Generate"** to begin.
-            2. You can continue generation by clicking **"Generation"** again and again. Previous sequences are logged in the history panel.
-            3. Click **"Reset"** to clear the current sequence and start fresh.
-            4. Action sequences can be composed using the following keys:
-            - W: turn up
-            - S: turn down
-            - A: turn left
-            - D: turn right
-            - Q: move forward
-            - E: move backward
-            - N: no-op (do nothing)
-            - 1: switch to hotbar 1
-            - U: use item
-            5. Higher denoising steps produce more detailed results but take longer. **20 steps** is a good balance between quality and speed.
-            6. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
-            7. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **[email protected]**.
-            """
         )
-        # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
-        submit_button.click(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
-        reset_btn.click(reset, outputs=[log_output, image_display])
-        image_display_1.select(lambda: on_image_click(SUNFLOWERS_IMAGE), outputs=image_display)
-        image_display_2.select(lambda: on_image_click(DESERT_IMAGE), outputs=image_display)
-        image_display_3.select(lambda: on_image_click(SAVANNA_IMAGE), outputs=image_display)
-        image_display_4.select(lambda: on_image_click(ICE_PLAINS_IMAGE), outputs=image_display)
-        image_display_5.select(lambda: on_image_click(SUNFLOWERS_RAIN_IMAGE), outputs=image_display)
-        image_display_6.select(lambda: on_image_click(PLACE_IMAGE), outputs=image_display)
-
-        slider.change(fn=set_denoising_steps, inputs=[slider, sampling_timesteps_state], outputs=sampling_timesteps_state)
-
-    demo.launch()
-
-if __name__ == "__main__":
-    run()  # pylint: disable=no-value-for-parameter
 
+cfg = OmegaConf.load("configurations/huggingface.yaml")
+worldmem = WorldMemMinecraft(cfg)
+load_custom_checkpoint(algo=worldmem.diffusion_model, checkpoint_path=cfg.diffusion_path)
+load_custom_checkpoint(algo=worldmem.vae, checkpoint_path=cfg.vae_path)
+load_custom_checkpoint(algo=worldmem.pose_prediction_model, checkpoint_path=cfg.pose_predictor_path)
+worldmem.to("cuda").eval()
+
+
+actions = torch.zeros((1, 25))
+poses = torch.zeros((1, 5))
+
+memory_frames.append(load_image_as_tensor(DEFAULT_IMAGE))
+
+@spaces.GPU()
+def run_interactive(first_frame, action, first_pose, curr_frame, device):
+    new_frame = worldmem.interactive(first_frame,
+                                     action,
+                                     first_pose,
+                                     curr_frame,
+                                     device=device)
+    return new_frame
+
+def set_denoising_steps(denoising_steps, sampling_timesteps_state):
+    worldmem.sampling_timesteps = denoising_steps
+    worldmem.diffusion_model.sampling_timesteps = denoising_steps
+    sampling_timesteps_state = denoising_steps
+    print("set denoising steps to", worldmem.sampling_timesteps)
+    return sampling_timesteps_state
+
+def update_image_and_log(keys):
+    actions = parse_input_to_tensor(keys)
+    global input_history
+    global memory_curr_frame
+
+    print("algo frame:", len(worldmem.frames))
+
+    for i in range(len(actions)):
+        memory_curr_frame += 1
+
+        new_frame = run_interactive(memory_frames[0],
+                                    actions[i],
+                                    None,
+                                    memory_curr_frame,
+                                    device=device)
+
+        # print("algo frame:", len(runner.algo.frames))
 
+        memory_frames.append(new_frame)
 
+    out_video = torch.stack(memory_frames)
+    out_video = out_video.permute(0,2,3,1).numpy()
+    out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
+    out_video = (out_video * 255).astype(np.uint8)
 
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    os.makedirs("outputs_gradio", exist_ok=True)
+    filename = f"outputs_gradio/{timestamp}.mp4"
+    save_video(out_video, filename)
 
+    input_history += keys
+    return out_video[-1], filename, input_history
 
+def reset():
+    global memory_curr_frame
+    global input_history
+    global memory_frames
 
+    worldmem.reset()
+    memory_frames = []
     memory_frames.append(load_image_as_tensor(DEFAULT_IMAGE))
+    memory_curr_frame = 0
+    input_history = ""
+
+    _ = run_interactive(memory_frames[0],
+                        actions[0],
+                        poses[0],
+                        memory_curr_frame,
+                        device=device)
+
 
+
+    return input_history, DEFAULT_IMAGE
+
+def on_image_click(SELECTED_IMAGE):
     global DEFAULT_IMAGE
     DEFAULT_IMAGE = SELECTED_IMAGE
     reset()
     return SELECTED_IMAGE
 
+# new_frame = runner.run(
+#     memory_frames[0],
+#     actions[0],
+#     poses[0],
+#     memory_curr_frame,
+#     device
+# )
 
+# print("first algo frame:", len(algo.frames))
 
+css = """
+h1 {
+text-align: center;
+display:block;
+}
+"""
+
+with gr.Blocks(css=css) as demo:
+    gr.Markdown(
+        """
+        # WORLDMEM: Long-term Consistent World Generation with Memory
+        """
     )
+
+    # <div style="text-align: center;">
+    #     <!-- Public Website -->
+    #     <a style="display:inline-block" href="https://nirvanalan.github.io/projects/GA/">
+    #         <img src="https://img.shields.io/badge/public_website-8A2BE2">
+    #     </a>
+
+    #     <!-- GitHub Stars -->
+    #     <a style="display:inline-block; margin-left: .5em" href="https://github.com/NIRVANALAN/GaussianAnything">
+    #         <img src="https://img.shields.io/github/stars/NIRVANALAN/GaussianAnything?style=social">
+    #     </a>
+
+    #     <!-- Project Page -->
+    #     <a style="display:inline-block; margin-left: .5em" href="https://nirvanalan.github.io/projects/GA/">
+    #         <img src="https://img.shields.io/badge/project_page-blue">
+    #     </a>
+
+    #     <!-- arXiv Paper -->
+    #     <a style="display:inline-block; margin-left: .5em" href="https://arxiv.org/abs/XXXX.XXXXX">
+    #         <img src="https://img.shields.io/badge/arXiv-paper-red">
+    #     </a>
+    # </div>
+
+    with gr.Row(variant="panel"):
+        video_display = gr.Video(autoplay=True, loop=True)
+        image_display = gr.Image(value=DEFAULT_IMAGE, interactive=False, label="Last Frame")
+
+    with gr.Row(variant="panel"):
+        with gr.Column(scale=2):
+            input_box = gr.Textbox(label="Action Sequence", placeholder="Enter action sequence here...", lines=1, max_lines=1)
+            log_output = gr.Textbox(label="History Log", interactive=False)
+        with gr.Column(scale=1):
+            slider = gr.Slider(minimum=10, maximum=50, value=worldmem.sampling_timesteps, step=1, label="Denoising Steps")
+            submit_button = gr.Button("Generate")
+            reset_btn = gr.Button("Reset")
+
+    sampling_timesteps_state = gr.State(worldmem.sampling_timesteps)
+
+    example_actions = ["DDDDDDDDEEEEEEEEEESSSAAAAAAAAWWW", "DDDDDDDDDDDDQQQQQQQQQQQQQQQDDDDDDDDDDDD",
+                       "DDDDWWWDDDDDDDDDDDDDDDDDDDDSSSAAAAAAAAAAAAAAAAAAAAAAAA", "SSUNNWWEEEEEEEEEAAA1NNNNNNNNNSSUNNWW"]
+
+    def set_action(action):
+        return action
+
+    gr.Markdown("### Action sequence examples.")
+    with gr.Row():
+        buttons = []
+        for action in example_actions[:2]:
+            with gr.Column(scale=len(action)):
+                buttons.append(gr.Button(action))
+    with gr.Row():
+        for action in example_actions[2:4]:
+            with gr.Column(scale=len(action)):
+                buttons.append(gr.Button(action))
+    with gr.Row():
+        for action in example_actions[4:5]:
+            with gr.Column(scale=len(action)):
+                buttons.append(gr.Button(action))
+
+    for button, action in zip(buttons, example_actions):
+        button.click(set_action, inputs=[gr.State(value=action)], outputs=input_box)
+
+
+    gr.Markdown("### Click on the images below to reset the sequence and generate from the new image.")
+
+    with gr.Row():
+        image_display_1 = gr.Image(value=SUNFLOWERS_IMAGE, interactive=False, label="Sunflower Plains")
+        image_display_2 = gr.Image(value=DESERT_IMAGE, interactive=False, label="Desert")
+        image_display_3 = gr.Image(value=SAVANNA_IMAGE, interactive=False, label="Savanna")
+        image_display_4 = gr.Image(value=ICE_PLAINS_IMAGE, interactive=False, label="Ice Plains")
+        image_display_5 = gr.Image(value=SUNFLOWERS_RAIN_IMAGE, interactive=False, label="Rainy Sunflower Plains")
+        image_display_6 = gr.Image(value=PLACE_IMAGE, interactive=False, label="Place")
+
+    gr.Markdown(
+        """
+        ## Instructions & Notes:
+
+        1. Enter an action sequence in the **"Action Sequence"** text box and click **"Generate"** to begin.
+        2. You can continue generation by clicking **"Generate"** again and again. Previous sequences are logged in the history panel.
+        3. Click **"Reset"** to clear the current sequence and start fresh.
+        4. Action sequences can be composed using the following keys:
+        - W: turn up
+        - S: turn down
+        - A: turn left
+        - D: turn right
+        - Q: move forward
+        - E: move backward
+        - N: no-op (do nothing)
+        - 1: switch to hotbar 1
+        - U: use item
+        5. Higher denoising steps produce more detailed results but take longer. **20 steps** is a good balance between quality and speed.
+        6. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
+        7. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **[email protected]**.
+        """
+    )
+    # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
+    submit_button.click(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
+    reset_btn.click(reset, outputs=[log_output, image_display])
+    image_display_1.select(lambda: on_image_click(SUNFLOWERS_IMAGE), outputs=image_display)
+    image_display_2.select(lambda: on_image_click(DESERT_IMAGE), outputs=image_display)
+    image_display_3.select(lambda: on_image_click(SAVANNA_IMAGE), outputs=image_display)
+    image_display_4.select(lambda: on_image_click(ICE_PLAINS_IMAGE), outputs=image_display)
+    image_display_5.select(lambda: on_image_click(SUNFLOWERS_RAIN_IMAGE), outputs=image_display)
+    image_display_6.select(lambda: on_image_click(PLACE_IMAGE), outputs=image_display)
+
+    slider.change(fn=set_denoising_steps, inputs=[slider, sampling_timesteps_state], outputs=sampling_timesteps_state)
+
+demo.launch()
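parse_input_to_tensor (whose docstring appears in an earlier hunk) turns a typed key string into a (sequence_length, 25) one-hot action tensor via the KEY_TO_ACTION and ACTION_KEYS tables. The sketch below shows the general pattern with a toy two-entry table; it is an illustrative assumption, not the app's actual implementation:

    import torch

    # Toy stand-ins for the app's ACTION_KEYS / KEY_TO_ACTION tables (assumptions).
    ACTION_KEYS = ["inventory", "hotbar.1"]
    KEY_TO_ACTION = {"1": ("hotbar.1", 1)}

    def parse_input_to_tensor(input_str: str) -> torch.Tensor:
        # One row per typed key, one column per action channel.
        seq = torch.zeros((len(input_str), len(ACTION_KEYS)))
        for row, key in enumerate(input_str):
            if key in KEY_TO_ACTION:
                action_name, value = KEY_TO_ACTION[key]
                seq[row, ACTION_KEYS.index(action_name)] = value
        return seq

    print(parse_input_to_tensor("1N").shape)  # torch.Size([2, 2]) in this toy setup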
configurations/huggingface.yaml CHANGED
@@ -1,57 +1,58 @@
-defaults:
-  - algorithm: df_video_worldmemminecraft
-  - experiment: exp_video
-  - dataset: video_minecraft
 
-dataset:
-  n_frames_valid: 100
-  validation_multiplier: 1
-  use_plucker: true
-  customized_validation: true
-  condition_similar_length: 8
-  padding_pool: 10
-  focal_length: 0.35
-  save_dir: data/test_pumpkin
-  add_frame_timestep_embedder: true
-  pos_range: 0.5
-  angle_range: 30
-
-experiment:
-  tasks: [interactive]
-  training:
-    data:
-      num_workers: 4
-  validation:
-    batch_size: 1
-    limit_batch: 1
-    data:
-      num_workers: 4
-  load_vae: false
-  load_t_to_r: false
-  zero_init_gate: false
-  only_tune_refer: false
-  diffusion_path: yslan/worldmem_checkpoints/diffusion_only.ckpt
-  vae_path: yslan/worldmem_checkpoints/vae_only.ckpt
-  pose_predictor_path: yslan/worldmem_checkpoints/pose_prediction_model_only.ckpt
-  customized_load: true
-
-algorithm:
-  n_tokens: 8
-  context_frames: 90
-  pose_cond_dim: 5
-  use_plucker: true
-  focal_length: 0.35
-  customized_validation: true
-  condition_similar_length: 8
-  log_video: true
-  relative_embedding: true
-  cond_only_on_qk: true
-  add_pose_embed: false
-  use_domain_adapter: false
-  use_reference_attention: true
-  add_frame_timestep_embedder: true
-  is_interactive: true
-  diffusion:
-    sampling_timesteps: 20
-
-debug: false
+n_tokens: 8
+pose_cond_dim: 5
+use_plucker: true
+focal_length: 0.35
+customized_validation: true
+condition_similar_length: 8
+log_video: true
+relative_embedding: true
+cond_only_on_qk: true
+add_pose_embed: false
+use_domain_adapter: false
+use_reference_attention: true
+add_frame_timestep_embedder: true
+is_interactive: true
+diffusion:
+  sampling_timesteps: 20
+  beta_schedule: sigmoid
+  objective: pred_v
+  use_fused_snr: True
+  cum_snr_decay: 0.96
+  clip_noise: 20.
+  ddim_sampling_eta: 0.0
+  stabilization_level: 15
+  schedule_fn_kwargs: {}
+  use_snr: False
+  use_cum_snr: False
+  snr_clip: 5.0
+  timesteps: 1000
+  # architecture
+  architecture:
+    network_size: 64
+    attn_heads: 4
+    attn_dim_head: 64
+    dim_mults: [1, 2, 4, 8]
+    resolution: ${dataset.resolution}
+    attn_resolutions: [16, 32, 64, 128]
+    use_init_temporal_attn: True
+    use_linear_attn: True
+    time_emb_type: rotary
 
+weight_decay: 2e-3
+warmup_steps: 10000
+optimizer_beta: [0.9, 0.99]
+action_cond_dim: 25
+n_frames: 8
+frame_skip: 1
+frame_stack: 1
+uncertainty_scale: 1
+guidance_scale: 0.0
+chunk_size: 1 # -1 for full trajectory diffusion, number to specify diffusion chunk size
+scheduling_matrix: autoregressive
+noise_level: random_all
+causal: True
+x_shape: [3, 360, 640]
+context_frames: 1
+diffusion_path: yslan/worldmem_checkpoints/diffusion_only.ckpt
+vae_path: yslan/worldmem_checkpoints/vae_only.ckpt
+pose_predictor_path: yslan/worldmem_checkpoints/pose_prediction_model_only.ckpt
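Because the new huggingface.yaml is flat and self-contained (no Hydra defaults list), app.py can load it directly with OmegaConf and read keys by attribute access, as the new module-level code does. A small sketch of that access pattern; the values in the comments are the ones configured above:

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("configurations/huggingface.yaml")

    # Top-level keys are attributes of the loaded DictConfig ...
    print(cfg.n_tokens)                      # 8
    print(cfg.diffusion_path)                # yslan/worldmem_checkpoints/diffusion_only.ckpt
    # ... and nested sections resolve the same way.
    print(cfg.diffusion.sampling_timesteps)  # 20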