Fix paper link
This PR updates the model card by fixing the paper link and pointing it to arXiv.
README.md
CHANGED
@@ -1,17 +1,17 @@
 ---
-license: mit
+base_model:
+- microsoft/Phi-3.5-vision-instruct
 datasets:
 - TIGER-Lab/MMEB-train
 language:
 - en
-base_model:
-- microsoft/Phi-3.5-vision-instruct
 library_name: transformers
+license: mit
+pipeline_tag: image-text-to-text
 tags:
 - Retrieval
 - Multimodal
 - Embedding
-pipeline_tag: image-text-to-text
 ---
 
 # Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs
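For reference, the card's full YAML front matter after this PR, assembled from the `+` and context lines of the hunk above (the keys are reordered; no values change besides the reordering):

```yaml
---
base_model:
- microsoft/Phi-3.5-vision-instruct
datasets:
- TIGER-Lab/MMEB-train
language:
- en
library_name: transformers
license: mit
pipeline_tag: image-text-to-text
tags:
- Retrieval
- Multimodal
- Embedding
---
```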
@@ -25,7 +25,7 @@ Yingda Chen,</span>
 <a href="https://weidong-tom-cai.github.io/">Weidong Cai</a>,</span>
 <a href="https://jiankangdeng.github.io">Jiankang Deng</a></span>
 
-[🏡 Project Page](https://garygutc.github.io/UniME) | [📄 Paper](https://arxiv.org/
+[🏡 Project Page](https://garygutc.github.io/UniME) | [📄 Paper](https://arxiv.org/abs/2504.17432) | [💻 Github](https://github.com/deepglint/UniME)
 
 
 <p align="center">
@@ -62,8 +62,16 @@ from torch.nn import functional as F
 from transformers import AutoProcessor, AutoModelForCausalLM
 
 base_model_path="DeepGlint-AI/UniME-Phi3.5-V-4.2B"
-img_prompt = '<|user|>\n<|image_1|>\nSummary above image in one word: <|end|>\n<|assistant|>\n'
-text_prompt = '<|user|>\n<sent>\nSummary above sentence in one word: <|end|>\n<|assistant|>\n'
+img_prompt = '<|user|>
+<|image_1|>
+Summary above image in one word: <|end|>
+<|assistant|>
+'
+text_prompt = '<|user|>
+<sent>
+Summary above sentence in one word: <|end|>
+<|assistant|>
+'
 
 text = "A man is crossing the street with a red car parked nearby."
 image_path = "figures/demo.png"
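The hunk above only reformats the two prompt strings; the surrounding README snippet (its context lines show `from torch.nn import functional as F`, `AutoProcessor`, `AutoModelForCausalLM`, and a final `print("Score: ", Score)`) embeds one image and one caption and scores their similarity. Below is a minimal sketch of that flow, assuming `trust_remote_code=True` loading, eager attention, and the final token's last-layer hidden state as the embedding; none of these choices is confirmed by the visible hunks:

```python
import torch
from PIL import Image
from torch.nn import functional as F
from transformers import AutoProcessor, AutoModelForCausalLM

base_model_path = "DeepGlint-AI/UniME-Phi3.5-V-4.2B"

# Prompt templates exactly as defined in the diff above.
img_prompt = '<|user|>\n<|image_1|>\nSummary above image in one word: <|end|>\n<|assistant|>\n'
text_prompt = '<|user|>\n<sent>\nSummary above sentence in one word: <|end|>\n<|assistant|>\n'

text = "A man is crossing the street with a red car parked nearby."
image_path = "figures/demo.png"

# Assumption: Phi-3.5-vision ships custom modeling code, hence trust_remote_code;
# eager attention avoids a hard dependency on flash-attn.
processor = AutoProcessor.from_pretrained(base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    _attn_implementation="eager",
).eval().to("cuda")

# One image+prompt input, one text-only input (<sent> is the caption placeholder).
image_inputs = processor(img_prompt, [Image.open(image_path)], return_tensors="pt").to("cuda")
text_inputs = processor(text_prompt.replace("<sent>", text), return_tensors="pt").to("cuda")

with torch.no_grad():
    # Assumption: the embedding is the last-layer hidden state of the final token.
    img_emb = model(**image_inputs, output_hidden_states=True).hidden_states[-1][:, -1, :]
    txt_emb = model(**text_inputs, output_hidden_states=True).hidden_states[-1][:, -1, :]

img_emb = F.normalize(img_emb, dim=-1)
txt_emb = F.normalize(txt_emb, dim=-1)
Score = img_emb @ txt_emb.T  # cosine similarity, shape (1, 1)
print("Score: ", Score)
```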
@@ -108,6 +116,8 @@ print("Score: ", Score)
 
 ## 📖 Citation
 If you find this repository useful, please use the following BibTeX entry for citation.
+
+[📄 Paper](https://arxiv.org/abs/2504.17432)
 ```latex
 @misc{gu2025breakingmodalitybarrieruniversal,
 title={Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs},