Update README.md
Browse files
README.md
CHANGED
@@ -112,37 +112,45 @@ import requests
|
|
112 |
from io import BytesIO
|
113 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
114 |
|
115 |
-
|
116 |
# Load model directly
|
117 |
-
processor = AutoProcessor.from_pretrained("
|
118 |
-
model = AutoModelForImageTextToText.from_pretrained("
|
119 |
-
|
120 |
model.eval()
|
121 |
|
122 |
# Prepare image input
|
123 |
-
image_url = "https://
|
124 |
|
125 |
# Prepare text input
|
126 |
-
question = "
|
127 |
prompt = f"A conversation between User and Assistant. The user asks a question about the image, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.\nUser: {question} \nAssistant: Let me solve this step by step.\n<think>"
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
# Process input
|
130 |
response = requests.get(image_url)
|
131 |
image = Image.open(BytesIO(response.content))
|
132 |
-
text = processor.apply_chat_template(
|
133 |
input = processor(
|
134 |
text=text,
|
135 |
-
|
136 |
padding=True,
|
137 |
return_tensors="pt",
|
138 |
)
|
139 |
-
input =
|
140 |
|
141 |
# Generation of the output
|
142 |
-
|
143 |
-
generated_ids = model.module.generate(**input, use_cache=True, max_new_tokens=1024, do_sample=True)
|
144 |
generated_ids_trimmed = [
|
145 |
-
out_ids[len(in_ids):] for in_ids, out_ids in zip(
|
146 |
]
|
147 |
batch_output_text = processor.batch_decode(
|
148 |
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
@@ -150,6 +158,7 @@ batch_output_text = processor.batch_decode(
|
|
150 |
|
151 |
# Get output
|
152 |
output_text = batch_output_text[0]
|
|
|
153 |
```
|
154 |
|
155 |
<!-- ## 📰 Evaluation Results
|
|
|
112 |
from io import BytesIO
|
113 |
from transformers import AutoProcessor, AutoModelForImageTextToText
|
114 |
|
|
|
115 |
# Load model directly
|
116 |
+
processor = AutoProcessor.from_pretrained("AIcell/reproduce-1200")
|
117 |
+
model = AutoModelForImageTextToText.from_pretrained("AIcell/reproduce-1200"
|
118 |
+
,torch_dtype="auto", device_map="auto")
|
119 |
model.eval()
|
120 |
|
121 |
# Prepare image input
|
122 |
+
image_url = "https://multimodal-r1.s3.us-west-1.amazonaws.com/demo_image.jpg"
|
123 |
|
124 |
# Prepare text input
|
125 |
+
question = "Considering the relative positions of the sofa and the picture in the image provided, where is the sofa located with respect to the picture? Select from the following choices.\n(A) above or \n(B) below"
|
126 |
prompt = f"A conversation between User and Assistant. The user asks a question about the image, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.\nUser: {question} \nAssistant: Let me solve this step by step.\n<think>"
|
127 |
|
128 |
+
# Create Message
|
129 |
+
message = [
|
130 |
+
|
131 |
+
{
|
132 |
+
"type": "image",
|
133 |
+
"image": image_url,
|
134 |
+
},
|
135 |
+
{"type": "text", "text": "<image>" + prompt},
|
136 |
+
]
|
137 |
+
|
138 |
# Process input
|
139 |
response = requests.get(image_url)
|
140 |
image = Image.open(BytesIO(response.content))
|
141 |
+
text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
|
142 |
input = processor(
|
143 |
text=text,
|
144 |
+
images=image,
|
145 |
padding=True,
|
146 |
return_tensors="pt",
|
147 |
)
|
148 |
+
input = input.to(model.device)
|
149 |
|
150 |
# Generation of the output
|
151 |
+
generated_ids = model.generate(**input, use_cache=True, max_new_tokens=1024, do_sample=True)
|
|
|
152 |
generated_ids_trimmed = [
|
153 |
+
out_ids[len(in_ids):] for in_ids, out_ids in zip(input.input_ids, generated_ids)
|
154 |
]
|
155 |
batch_output_text = processor.batch_decode(
|
156 |
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
|
|
158 |
|
159 |
# Get output
|
160 |
output_text = batch_output_text[0]
|
161 |
+
print(output_text)
|
162 |
```
|
163 |
|
164 |
<!-- ## 📰 Evaluation Results
|