AIcell committed
Commit 68ca8e9 · verified · 1 Parent(s): 64bf3c4

Update README.md

Files changed (1):
  1. README.md +21 -12
README.md CHANGED
@@ -112,37 +112,45 @@ import requests
 from io import BytesIO
 from transformers import AutoProcessor, AutoModelForImageTextToText
 
-
 # Load model directly
-processor = AutoProcessor.from_pretrained("turningpoint-ai/VisualThinker-R1-Zero")
-model = AutoModelForImageTextToText.from_pretrained("turningpoint-ai/VisualThinker-R1-Zero",
-, torch_dtype="auto", device_map="auto")
+processor = AutoProcessor.from_pretrained("AIcell/reproduce-1200")
+model = AutoModelForImageTextToText.from_pretrained("AIcell/reproduce-1200"
+,torch_dtype="auto", device_map="auto")
 model.eval()
 
 # Prepare image input
-image_url = "https://huggingface.co/datasets/array/SAT/viewer/default/validation?row=2&image-viewer=1FECF8A4A7380558FF5C3E659A8D54DB721032AF"
+image_url = "https://multimodal-r1.s3.us-west-1.amazonaws.com/demo_image.jpg"
 
 # Prepare text input
-question = "Answer in natural language. I need to go to Chair (near the mark 7 in the image). Which direction should I turn to face the object? look straight or left by 40 degrees."
+question = "Considering the relative positions of the sofa and the picture in the image provided, where is the sofa located with respect to the picture? Select from the following choices.\n(A) above or \n(B) below"
 prompt = f"A conversation between User and Assistant. The user asks a question about the image, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.\nUser: {question} \nAssistant: Let me solve this step by step.\n<think>"
 
+# Create Message
+message = [
+
+    {
+        "type": "image",
+        "image": image_url,
+    },
+    {"type": "text", "text": "<image>" + prompt},
+]
+
 # Process input
 response = requests.get(image_url)
 image = Image.open(BytesIO(response.content))
-text = processor.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
+text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
 input = processor(
     text=text,
-    images=image,
+    image=image,
     padding=True,
     return_tensors="pt",
 )
-input = inputsto("cuda")
+input = input.to("cuda")
 
 # Generation of the output
-with torch.no_grad():
-    generated_ids = model.module.generate(**input, use_cache=True, max_new_tokens=1024, do_sample=True)
+generated_ids = model.generate(**input, use_cache=True, max_new_tokens=1024, do_sample=True)
 generated_ids_trimmed = [
-    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    out_ids[len(in_ids):] for in_ids, out_ids in zip(input.input_ids, generated_ids)
 ]
 batch_output_text = processor.batch_decode(
     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
@@ -150,6 +158,7 @@ batch_output_text = processor.batch_decode(
 
 # Get output
 output_text = batch_output_text[0]
+print(output_text)
 ```
 
 <!-- ## 📰 Evaluation Results
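
For reference, here is a self-contained sketch of the updated snippet as it should run end to end. This is an editor's sketch, not part of the commit: it adds the imports the excerpt relies on (`torch` and `PIL.Image`; `requests` is already in the hunk context), wraps the content list in a role-tagged "user" message, which is the shape `processor.apply_chat_template` expects, passes the image via the processor's `images=` keyword (assuming the diff's `image=` is a typo), lets the chat template insert the image placeholder token rather than prepending a literal `<image>`, and renames `input` to `inputs` to avoid shadowing the builtin.

```python
# Editor's sketch of the updated README snippet, with hedged fixes beyond the
# diff: extra imports, a role-tagged chat message, the `images=` keyword, and
# `inputs` instead of the `input` builtin.
import requests
import torch
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load model directly
model_id = "AIcell/reproduce-1200"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
model.eval()

# Prepare image and text inputs
image_url = "https://multimodal-r1.s3.us-west-1.amazonaws.com/demo_image.jpg"
question = (
    "Considering the relative positions of the sofa and the picture in the image "
    "provided, where is the sofa located with respect to the picture? "
    "Select from the following choices.\n(A) above or \n(B) below"
)
prompt = (
    "A conversation between User and Assistant. The user asks a question about the "
    "image, and the Assistant solves it. The assistant first thinks about the "
    "reasoning process in the mind and then provides the user with the answer.\n"
    f"User: {question} \nAssistant: Let me solve this step by step.\n<think>"
)

# apply_chat_template expects role-tagged messages; the chat template inserts
# the image placeholder token itself, so no literal "<image>" is prepended.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_url},
            {"type": "text", "text": prompt},
        ],
    }
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Process inputs and move them to the model's device
image = Image.open(BytesIO(requests.get(image_url).content))
inputs = processor(text=text, images=image, padding=True, return_tensors="pt")
inputs = inputs.to(model.device)

# Generate, then strip the prompt tokens so only the new answer is decoded
with torch.no_grad():
    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=True)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text)
```

Because `do_sample=True` is kept from the commit, the decoded answer will vary from run to run.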