Commit c7542a3 by xizaoqu
Parent(s): a7ea928

update

Files changed:
- algorithms/worldmem/df_video.py (+2 -5)
- app.py (+142 -59)
algorithms/worldmem/df_video.py

@@ -615,8 +615,6 @@ class WorldMemMinecraft(DiffusionForcingBase):
         for _ in range(condition_similar_length):
             overlap_ratio = ((in_fov1.bool() & in_fov_list).sum(1)) / in_fov1.sum()
 
-            # if curr_frame == 54:
-            # import pdb;pdb.set_trace()
             confidence = overlap_ratio + (curr_frame - frame_idx[:curr_frame]) / curr_frame * (-0.2)
 
             if len(random_idx) > 0:
@@ -624,10 +622,11 @@ class WorldMemMinecraft(DiffusionForcingBase):
             _, r_idx = torch.topk(confidence, k=1, dim=0)
             random_idx.append(r_idx[0])
 
+            # choice 1: directly remove overlapping region
             occupied_mask = in_fov_list[r_idx[0, range(in_fov1.shape[-1])], :, range(in_fov1.shape[-1])].permute(1,0)
-
             in_fov1 = in_fov1 & ~occupied_mask
 
+            # choice 2: apply similarity filter
             # cos_sim = F.cosine_similarity(xs_pred.to(r_idx.device)[r_idx[:, range(in_fov1.shape[1])],
             #                               range(in_fov1.shape[1])], xs_pred.to(r_idx.device)[:curr_frame], dim=2)
             # cos_sim = cos_sim.mean((-2,-1))
@@ -637,8 +636,6 @@ class WorldMemMinecraft(DiffusionForcingBase):
 
         random_idx = torch.stack(random_idx).cpu()
 
-        print(random_idx)
-
         return random_idx
 
     def _prepare_conditions(self,
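Note on the df_video.py hunks: the commit deletes leftover debugging (a pdb breadcrumb and a stray print) and labels the two candidate filters in the memory-retrieval loop. For readers skimming the diff, here is a minimal, self-contained sketch of what that loop computes, simplified to a single batch element; the names, shapes, and the select_memory_frames helper are illustrative, not the module's actual API:

import torch

def select_memory_frames(in_fov1, in_fov_list, frame_idx, curr_frame, k):
    """Greedily pick k memory frames that best cover the current view.

    in_fov1:     (P,) bool   points visible from the current pose
    in_fov_list: (M, P) bool points visible from each of M memory frames
    frame_idx:   (M,) float  time index of each memory frame
    """
    selected = []
    in_fov1 = in_fov1.clone()
    for _ in range(k):
        # Fraction of the still-uncovered current view each memory frame covers.
        overlap_ratio = (in_fov1 & in_fov_list).sum(-1).float() / in_fov1.sum().clamp(min=1)
        # Recency penalty, mirroring the source: older frames lose up to 0.2.
        confidence = overlap_ratio + (curr_frame - frame_idx) / curr_frame * (-0.2)
        if selected:  # avoid re-picking an already selected frame
            confidence[torch.stack(selected)] = float("-inf")
        best = torch.argmax(confidence)
        selected.append(best)
        # "choice 1: directly remove overlapping region": mask out what the
        # chosen frame already covers so each later pick adds new coverage.
        in_fov1 = in_fov1 & ~in_fov_list[best]
    return torch.stack(selected)

# Example: 5 memory frames, 64 FOV sample points, pick 3.
idx = select_memory_frames(torch.rand(64) > 0.5, torch.rand(5, 64) > 0.5,
                           torch.arange(5.0), curr_frame=5, k=3)

The commented-out "choice 2" in the diff would instead rank candidates by cosine similarity between predicted latents; the committed code keeps choice 1.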
app.py

@@ -70,6 +70,13 @@ KEY_TO_ACTION = {
     "1": ("hotbar.1", 1),
 }
 
+example_images = [
+    ["1", "assets/ice_plains.png", "turn right+go backward+look up+turn left+look down+turn right+go forward+turn left", 20, 3, 8],
+    ["2", "assets/place.png", "put item+go backward+put item+go backward+go around", 20, 3, 8],
+    ["3", "assets/rain_sunflower_plains.png", "turn right+look up+turn right+look down+turn left+go backward+turn left", 20, 3, 8],
+    ["4", "assets/desert.png", "turn 360 degree+turn right+go forward+turn left", 20, 3, 8],
+]
+
 def load_custom_checkpoint(algo, checkpoint_path):
     hf_ckpt = str(checkpoint_path).split('/')
     repo_id = '/'.join(hf_ckpt[:2])
@@ -156,7 +163,6 @@ def enable_amp(model, precision="16-mixed"):
     return model
 
 memory_frames = []
-memory_curr_frame = 0
 input_history = ""
 ICE_PLAINS_IMAGE = "assets/ice_plains.png"
 DESERT_IMAGE = "assets/desert.png"
@@ -166,7 +172,6 @@ PLACE_IMAGE = "assets/place.png"
 SUNFLOWERS_IMAGE = "assets/sunflower_plains.png"
 SUNFLOWERS_RAIN_IMAGE = "assets/rain_sunflower_plains.png"
 
-DEFAULT_IMAGE = ICE_PLAINS_IMAGE
 device = torch.device('cuda')
 
 def save_video(frames, path="output.mp4", fps=10):
@@ -193,13 +198,6 @@ worldmem = enable_amp(worldmem, precision="16-mixed")
 actions = np.zeros((1, 25), dtype=np.float32)
 poses = np.zeros((1, 5), dtype=np.float32)
 
-memory_frames = load_image_as_tensor(DEFAULT_IMAGE)[None].numpy()
-
-self_frames = None
-self_actions = None
-self_poses = None
-self_memory_c2w = None
-self_frame_idx = None
 
 
 def get_duration_single_image_to_long_video(first_frame, action, first_pose, device, self_frames, self_actions,
@@ -240,17 +238,8 @@ def set_memory_length(memory_length, sampling_memory_length_state):
     print("set memory length to", worldmem.condition_similar_length)
     return sampling_memory_length_state
 
-def generate(keys):
-    # print("algo frame:", len(worldmem.frames))
+def generate(keys, input_history, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx):
     input_actions = parse_input_to_tensor(keys)
-    global input_history
-    global memory_frames
-    global memory_curr_frame
-    global self_frames
-    global self_actions
-    global self_poses
-    global self_memory_c2w
-    global self_frame_idx
 
     if self_frames is None:
         new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
@@ -282,25 +271,34 @@ def generate(keys):
     temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
     save_video(out_video, temporal_video_path)
 
+
+    now = datetime.now()
+    folder_name = now.strftime("%Y-%m-%d_%H-%M-%S")
+    folder_path = os.path.join("/mnt/xiaozeqi/worldmem/output_material", folder_name)
+    os.makedirs(folder_path, exist_ok=True)
     input_history += keys
-    return out_video[-1], temporal_video_path, input_history
-
-def reset():
-    global memory_curr_frame
-    global input_history
-    global memory_frames
-    global self_frames
-    global self_actions
-    global self_poses
-    global self_memory_c2w
-    global self_frame_idx
 
+    data_dict = {
+        "input_history": input_history,
+        "memory_frames": memory_frames,
+        "self_frames": self_frames,
+        "self_actions": self_actions,
+        "self_poses": self_poses,
+        "self_memory_c2w": self_memory_c2w,
+        "self_frame_idx": self_frame_idx,
+    }
+
+    np.savez(os.path.join(folder_path, "data_bundle.npz"), **data_dict)
+
+    return out_video[-1], temporal_video_path, input_history, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
+
+def reset(selected_image):
     self_frames = None
     self_poses = None
     self_actions = None
     self_memory_c2w = None
     self_frame_idx = None
-    memory_frames = load_image_as_tensor(
+    memory_frames = load_image_as_tensor(selected_image).numpy()[None]
     input_history = ""
 
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
@@ -313,14 +311,58 @@ def reset():
         self_memory_c2w=self_memory_c2w,
         self_frame_idx=self_frame_idx)
 
-    return input_history,
+    return input_history, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
+
+def on_image_click(selected_image):
+    input_history, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = reset(selected_image)
+    return input_history, selected_image, selected_image, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
+
+def set_memory(examples_case, image_display, log_output, slider_denoising_step, slider_context_length, slider_memory_length):
+    if examples_case == '1':
+        data_bundle = np.load("/mnt/xiaozeqi/worldmem/output_material/2025-04-11_16-01-49/data_bundle.npz")
+        input_history = data_bundle['input_history'].item()
+        memory_frames = data_bundle['memory_frames']
+        self_frames = data_bundle['self_frames']
+        self_actions = data_bundle['self_actions']
+        self_poses = data_bundle['self_poses']
+        self_memory_c2w = data_bundle['self_memory_c2w']
+        self_frame_idx = data_bundle['self_frame_idx']
+    elif examples_case == '2':
+        data_bundle = np.load("/mnt/xiaozeqi/worldmem/output_material/2025-04-12_10-42-04/data_bundle.npz")
+        input_history = data_bundle['input_history'].item()
+        memory_frames = data_bundle['memory_frames']
+        self_frames = data_bundle['self_frames']
+        self_actions = data_bundle['self_actions']
+        self_poses = data_bundle['self_poses']
+        self_memory_c2w = data_bundle['self_memory_c2w']
+        self_frame_idx = data_bundle['self_frame_idx']
+    elif examples_case == '3':
+        data_bundle = np.load("/mnt/xiaozeqi/worldmem/output_material/2025-04-12_10-56-57/data_bundle.npz")
+        input_history = data_bundle['input_history'].item()
+        memory_frames = data_bundle['memory_frames']
+        self_frames = data_bundle['self_frames']
+        self_actions = data_bundle['self_actions']
+        self_poses = data_bundle['self_poses']
+        self_memory_c2w = data_bundle['self_memory_c2w']
+        self_frame_idx = data_bundle['self_frame_idx']
+    elif examples_case == '4':
+        data_bundle = np.load("/mnt/xiaozeqi/worldmem/output_material/2025-04-11_16-07-19/data_bundle.npz")
+        input_history = data_bundle['input_history'].item()
+        memory_frames = data_bundle['memory_frames']
+        self_frames = data_bundle['self_frames']
+        self_actions = data_bundle['self_actions']
+        self_poses = data_bundle['self_poses']
+        self_memory_c2w = data_bundle['self_memory_c2w']
+        self_frame_idx = data_bundle['self_frame_idx']
 
-
-
-
-    reset()
-    return SELECTED_IMAGE
+    out_video = memory_frames.transpose(0,2,3,1)
+    out_video = np.clip(out_video, a_min=0.0, a_max=1.0)
+    out_video = (out_video * 255).astype(np.uint8)
 
+    temporal_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name
+    save_video(out_video, temporal_video_path)
+
+    return input_history, out_video[-1], temporal_video_path, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx
 
 css = """
 h1 {
@@ -329,6 +371,10 @@
 }
 """
 
+def on_select(evt: gr.SelectData):
+    selected_index = evt.index
+    return examples[selected_index]
+
 with gr.Blocks(css=css) as demo:
     gr.Markdown(
         """
@@ -358,13 +404,18 @@
     #     </a>
     #     </div>
 
-    example_actions =
-
+    example_actions = {"turn left + turn right": "AAAAAAAAAAAADDDDDDDDDDDD",
+                       "turn 360 degree": "AAAAAAAAAAAAAAAAAAAAAAAA",
+                       "turn right+go backward+look up+turn left+look down": "DDDDDDDDEEEEEEEEEESSSAAAAAAAAWWW",
+                       "turn right+go forward+turn left": "DDDDDDDDDDDDQQQQQQQQQQQQQQQDDDDDDDDDDDD",
+                       "turn right+look up+turn right+look down": "DDDDWWWDDDDDDDDDDDDDDDDDDDDSSS",
+                       "put item+go backward+put item+go backward":"SSUNNWWEEEEEEEEEAAASSUNNWWEEEEEEEEE"}
 
+    selected_image = gr.State(ICE_PLAINS_IMAGE)
 
     with gr.Row(variant="panel"):
         video_display = gr.Video(autoplay=True, loop=True)
-        image_display = gr.Image(value=
+        image_display = gr.Image(value=selected_image.value, interactive=False, label="Current Frame")
 
 
     with gr.Row(variant="panel"):
@@ -374,17 +425,17 @@
     gr.Markdown("### Action sequence examples.")
     with gr.Row():
        buttons = []
-        for
-            with gr.Column(scale=len(
-                buttons.append(gr.Button(
+        for action_key in list(example_actions.keys())[:2]:
+            with gr.Column(scale=len(action_key)):
+                buttons.append(gr.Button(action_key))
     with gr.Row():
-        for
-            with gr.Column(scale=len(
-                buttons.append(gr.Button(
+        for action_key in list(example_actions.keys())[2:4]:
+            with gr.Column(scale=len(action_key)):
+                buttons.append(gr.Button(action_key))
    with gr.Row():
-        for
-            with gr.Column(scale=len(
-                buttons.append(gr.Button(
+        for action_key in list(example_actions.keys())[4:6]:
+            with gr.Column(scale=len(action_key)):
+                buttons.append(gr.Button(action_key))
 
     with gr.Column(scale=1):
         slider_denoising_step = gr.Slider(minimum=10, maximum=50, value=worldmem.sampling_timesteps, step=1, label="Denoising Steps")
@@ -397,6 +448,12 @@
     sampling_context_length_state = gr.State(worldmem.n_tokens)
     sampling_memory_length_state = gr.State(worldmem.condition_similar_length)
 
+    memory_frames = gr.State(load_image_as_tensor(selected_image.value)[None].numpy())
+    self_frames = gr.State()
+    self_actions = gr.State()
+    self_poses = gr.State()
+    self_memory_c2w = gr.State()
+    self_frame_idx = gr.State()
 
     def set_action(action):
         return action
@@ -404,8 +461,8 @@
     # gr.Markdown("### Action sequence examples.")
 
 
-    for button,
-        button.click(set_action, inputs=[gr.State(value=
+    for button, action_key in zip(buttons, list(example_actions.keys())):
+        button.click(set_action, inputs=[gr.State(value=example_actions[action_key])], outputs=input_box)
 
 
     gr.Markdown("### Click on the images below to reset the sequence and generate from the new image.")
@@ -418,6 +475,32 @@
     image_display_5 = gr.Image(value=SUNFLOWERS_RAIN_IMAGE, interactive=False, label="Rainy Sunflower Plains")
     image_display_6 = gr.Image(value=PLACE_IMAGE, interactive=False, label="Place")
 
+    gr.Markdown("### Click the examples below for a quick review, and continue generating based on them.")
+
+    example_case = gr.Textbox(label="Case", visible=False)
+    image_output = gr.Image(visible=False)
+
+    # gr.Examples(examples=example_images,
+    #             inputs=[example_case, image_output, log_output, slider_denoising_step, slider_context_length, slider_memory_length],
+    #             fn=set_memory,
+    #             outputs=[log_output, image_display, video_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx],
+    #             cache_examples=True
+    #             )
+
+    examples = gr.Examples(
+        examples=example_images,
+        inputs=[example_case, image_output, log_output, slider_denoising_step, slider_context_length, slider_memory_length],
+        cache_examples=False
+    )
+
+    example_case.change(
+        fn=set_memory,
+        inputs=[example_case, image_output, log_output, slider_denoising_step, slider_context_length, slider_memory_length],
+        outputs=[log_output, image_display, video_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx]
+    )
+
+
+
     gr.Markdown(
         """
         ## Instructions & Notes:
@@ -441,14 +524,14 @@
     """
     )
     # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
-    submit_button.click(generate, inputs=[input_box], outputs=[image_display, video_display, log_output])
-    reset_btn.click(reset, outputs=[log_output,
-    image_display_1.select(lambda: on_image_click(SUNFLOWERS_IMAGE), outputs=image_display)
-    image_display_2.select(lambda: on_image_click(DESERT_IMAGE), outputs=image_display)
-    image_display_3.select(lambda: on_image_click(SAVANNA_IMAGE), outputs=image_display)
-    image_display_4.select(lambda: on_image_click(ICE_PLAINS_IMAGE), outputs=image_display)
-    image_display_5.select(lambda: on_image_click(SUNFLOWERS_RAIN_IMAGE), outputs=image_display)
-    image_display_6.select(lambda: on_image_click(PLACE_IMAGE), outputs=image_display)
+    submit_button.click(generate, inputs=[input_box, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx], outputs=[image_display, video_display, log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    reset_btn.click(reset, inputs=[selected_image], outputs=[log_output, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    image_display_1.select(lambda: on_image_click(SUNFLOWERS_IMAGE), outputs=[log_output, selected_image, image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    image_display_2.select(lambda: on_image_click(DESERT_IMAGE), outputs=[log_output, selected_image, image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    image_display_3.select(lambda: on_image_click(SAVANNA_IMAGE), outputs=[log_output, selected_image, image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    image_display_4.select(lambda: on_image_click(ICE_PLAINS_IMAGE), outputs=[log_output, selected_image, image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    image_display_5.select(lambda: on_image_click(SUNFLOWERS_RAIN_IMAGE), outputs=[log_output, selected_image, image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
+    image_display_6.select(lambda: on_image_click(PLACE_IMAGE), outputs=[log_output, selected_image,image_display, memory_frames, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx])
 
     slider_denoising_step.change(fn=set_denoising_steps, inputs=[slider_denoising_step, sampling_timesteps_state], outputs=sampling_timesteps_state)
     slider_context_length.change(fn=set_context_length, inputs=[slider_context_length, sampling_context_length_state], outputs=sampling_context_length_state)
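The bulk of the app.py change is a refactor from module-level globals (with `global` declarations inside generate, reset, and on_image_click) to gr.State values threaded through each event's inputs and outputs. A stripped-down sketch of that pattern, with illustrative component names:

import gradio as gr

def step(keys, history):
    # Pure function of its inputs: returns the updated state and the display text.
    history += keys
    return history, history

with gr.Blocks() as demo:
    history_state = gr.State("")   # replaces `global input_history`
    keys_box = gr.Textbox(label="Keys")
    log_box = gr.Textbox(label="History")
    # The state is passed in and written back out on every event.
    keys_box.submit(step, inputs=[keys_box, history_state],
                    outputs=[history_state, log_box])

# demo.launch()

gr.State is kept per browser session, so unlike globals, concurrent visitors to the demo no longer overwrite each other's frame history.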
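The new generate() also snapshots the whole session into an .npz bundle, which set_memory() later reloads for the four canned example cases. A minimal sketch of that save/load round trip; the folder, keys, and array contents below are placeholders, not the repository's real bundles:

import os
import tempfile
import numpy as np

folder_path = tempfile.mkdtemp()  # the app uses a timestamped output folder
data_dict = {
    "input_history": "DDDDWWW",                             # str is stored as a 0-d array
    "memory_frames": np.zeros((4, 3, 64, 64), np.float32),  # (T, C, H, W) placeholder
}
np.savez(os.path.join(folder_path, "data_bundle.npz"), **data_dict)

bundle = np.load(os.path.join(folder_path, "data_bundle.npz"))
assert bundle["input_history"].item() == "DDDDWWW"  # .item() recovers the string
assert bundle["memory_frames"].shape == (4, 3, 64, 64)

Strings survive the round trip as 0-d arrays, which is why set_memory() calls .item() on input_history but indexes the other entries directly.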
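Finally, the example loading relies on a hidden Textbox as the gr.Examples target: clicking an example row writes its case id into the textbox, and the textbox's .change event then runs the loader. A stripped-down sketch of that trigger pattern, with illustrative names:

import gradio as gr

def load_case(case_id):
    # Look up whatever state the chosen example should restore.
    return f"loaded case {case_id}"

with gr.Blocks() as demo:
    example_case = gr.Textbox(label="Case", visible=False)  # hidden Examples target
    log_box = gr.Textbox(label="Log")
    gr.Examples(examples=[["1"], ["2"]], inputs=[example_case])
    # Selecting an example writes into example_case, which fires .change.
    example_case.change(fn=load_case, inputs=[example_case], outputs=[log_box])

# demo.launch()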