Upload folder using huggingface_hub
modeling_llavanext_for_embedding.py
CHANGED
@@ -257,3 +257,74 @@ class LLaVANextForEmbedding(LlavaNextForConditionalGeneration):
 
         return outputs
 
+    def set_processor(self, model_name):
+        self.processor = LlavaNextProcessor.from_pretrained(model_name)
+    def prepare_text_input(self, image=None, text=None, q_or_c=None, task_instruction=None):
+        task_instruction_example_cir = "Retrieve the target image that best meets the combined criteria by using both the provided image and the image retrieval instructions: "
+
+        assert q_or_c in ["query", "candidate", "q", "c"]
+
+        if "q" in q_or_c:
+            if task_instruction is None:
+                text_input = "[INST] \n <instruct> <query>"
+                print(f"""
+                    Warning: For optimal performance, MMRet-MLLM requires the task instruction to be specified in the query.
+                    For example, for the composed image retrieval task, you might use a specific instruction like: {task_instruction_example_cir}.
+                    Instructions for other tasks can be referenced in the MMEB benchmark.
+                """)
+            elif task_instruction is not None:
+                text_input = f"[INST] \n <instruct> {task_instruction} <query> "
+
+            if text is not None:
+                text_input = f"{text_input} {text} \n"
+            if image is not None:
+                text_input = f"{text_input} <image>"
+
+            text_input = f"{text_input} [/INST]"
+        else:
+            text_input = "[INST] "
+            if text is not None:
+                text_input = f"{text_input} {text} \n"
+            if image is not None:
+                text_input = f"{text_input} <image>"
+            text_input = f"{text_input} [/INST]"
+
+        return text_input
+
+    def data_process(self, images=None, text=None, q_or_c=None, task_instruction=None):
+        if images is not None:
+            _is_list = isinstance(images, list)
+        elif text is not None:
+            _is_list = isinstance(text, list)
+        else:
+            raise ValueError("images and text cannot be both None.")
+
+        assert q_or_c in ["query", "candidate", "q", "c"]
+
+        if not _is_list:
+            text_input = self.prepare_text_input(images, text, q_or_c, task_instruction)
+            text_input = [text_input]
+
+            print(text_input)
+
+            if images is not None:
+                images = Image.open(images).resize((512, 512)).convert("RGB")
+                images = [images]
+                inputs = self.processor(images=images, text=text_input, return_tensors="pt", padding=True)
+            else:
+                inputs = self.processor(text=text_input, return_tensors="pt", padding=True)
+
+        else:
+            text_input = [self.prepare_text_input(_image, _text, q_or_c, task_instruction) for _image, _text in zip(images, text)]
+
+            print(text_input)
+
+            if images is not None:
+                images = [Image.open(_image).resize((512, 512)).convert("RGB") for _image in images]
+                inputs = self.processor(images=images, text=text_input, return_tensors="pt", padding=True)
+            else:
+                inputs = self.processor(text=text_input, return_tensors="pt", padding=True)
+
+        inputs = inputs.to(self.device)
+
+        return inputs
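For reference, a minimal usage sketch of the helpers added in this commit might look like the following. The checkpoint id and image path are placeholders, and the final forward call is an assumption based on the class inheriting from LlavaNextForConditionalGeneration; none of that is shown in this diff.

import torch
from transformers import AutoModel

# Placeholder repository id; replace with the actual MMRet-MLLM checkpoint.
MODEL_NAME = "<mmret-mllm-checkpoint>"

model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.set_processor(MODEL_NAME)   # helper added in this commit
model.eval()

with torch.no_grad():
    # Build a query batch from one local image plus a retrieval instruction.
    inputs = model.data_process(
        images="./query_image.jpg",   # placeholder path
        text="Make the background a snowy street.",
        q_or_c="query",
        task_instruction=(
            "Retrieve the target image that best meets the combined criteria by "
            "using both the provided image and the image retrieval instructions: "
        ),
    )
    # data_process already moves the batch to model.device; it can then
    # (presumably) be passed to the model's forward pass to get embeddings.
    outputs = model(**inputs)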