AIcell committed
Commit 68ca8e9 · verified · 1 Parent(s): 64bf3c4

Update README.md

Files changed (1):
  1. README.md +21 -12
README.md CHANGED
@@ -112,37 +112,45 @@ import requests
 from io import BytesIO
 from transformers import AutoProcessor, AutoModelForImageTextToText
 
-
 # Load model directly
-processor = AutoProcessor.from_pretrained("turningpoint-ai/VisualThinker-R1-Zero")
-model = AutoModelForImageTextToText.from_pretrained("turningpoint-ai/VisualThinker-R1-Zero",
-, torch_dtype="auto", device_map="auto")
+processor = AutoProcessor.from_pretrained("AIcell/reproduce-1200")
+model = AutoModelForImageTextToText.from_pretrained("AIcell/reproduce-1200"
+,torch_dtype="auto", device_map="auto")
 model.eval()
 
 # Prepare image input
-image_url = "https://huggingface.co/datasets/array/SAT/viewer/default/validation?row=2&image-viewer=1FECF8A4A7380558FF5C3E659A8D54DB721032AF"
+image_url = "https://multimodal-r1.s3.us-west-1.amazonaws.com/demo_image.jpg"
 
 # Prepare text input
-question = "Answer in natural language. I need to go to Chair (near the mark 7 in the image). Which direction should I turn to face the object? look straight or left by 40 degrees."
+question = "Considering the relative positions of the sofa and the picture in the image provided, where is the sofa located with respect to the picture? Select from the following choices.\n(A) above or \n(B) below"
 prompt = f"A conversation between User and Assistant. The user asks a question about the image, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer.\nUser: {question} \nAssistant: Let me solve this step by step.\n<think>"
 
+# Create Message
+message = [
+
+    {
+        "type": "image",
+        "image": image_url,
+    },
+    {"type": "text", "text": "<image>" + prompt},
+]
+
 # Process input
 response = requests.get(image_url)
 image = Image.open(BytesIO(response.content))
-text = processor.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
+text = processor.apply_chat_template(message, tokenize=False, add_generation_prompt=True)
 input = processor(
     text=text,
-    images=image,
+    image=image,
     padding=True,
     return_tensors="pt",
 )
-input = inputsto("cuda")
+input = input.to("cuda")
 
 # Generation of the output
-with torch.no_grad():
-    generated_ids = model.module.generate(**input, use_cache=True, max_new_tokens=1024, do_sample=True)
+generated_ids = model.generate(**input, use_cache=True, max_new_tokens=1024, do_sample=True)
 generated_ids_trimmed = [
-    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    out_ids[len(in_ids):] for in_ids, out_ids in zip(input.input_ids, generated_ids)
 ]
 batch_output_text = processor.batch_decode(
     generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
@@ -150,6 +158,7 @@ batch_output_text = processor.batch_decode(
 
 # Get output
 output_text = batch_output_text[0]
+print(output_text)
 ```
 
 <!-- ## 📰 Evaluation Results
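
For reference, here is a self-contained sketch of the updated snippet as it should run end to end. This is an editor's sketch, not part of the commit: it adds the imports the excerpt relies on (`torch` and `PIL.Image`; `requests` is already in the hunk context), wraps the content list in a role-tagged "user" message, which is the shape `processor.apply_chat_template` expects, passes the image via the processor's `images=` keyword (assuming the diff's `image=` is a typo), lets the chat template insert the image placeholder token rather than prepending a literal `<image>`, and renames `input` to `inputs` to avoid shadowing the builtin.

```python
# Editor's sketch of the updated README snippet, with hedged fixes beyond the
# diff: extra imports, a role-tagged chat message, the `images=` keyword, and
# `inputs` instead of the `input` builtin.
import requests
import torch
from io import BytesIO
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText

# Load model directly
model_id = "AIcell/reproduce-1200"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
    model_id, torch_dtype="auto", device_map="auto"
)
model.eval()

# Prepare image and text inputs
image_url = "https://multimodal-r1.s3.us-west-1.amazonaws.com/demo_image.jpg"
question = (
    "Considering the relative positions of the sofa and the picture in the image "
    "provided, where is the sofa located with respect to the picture? "
    "Select from the following choices.\n(A) above or \n(B) below"
)
prompt = (
    "A conversation between User and Assistant. The user asks a question about the "
    "image, and the Assistant solves it. The assistant first thinks about the "
    "reasoning process in the mind and then provides the user with the answer.\n"
    f"User: {question} \nAssistant: Let me solve this step by step.\n<think>"
)

# apply_chat_template expects role-tagged messages; the chat template inserts
# the image placeholder token itself, so no literal "<image>" is prepended.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_url},
            {"type": "text", "text": prompt},
        ],
    }
]
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Process inputs and move them to the model's device
image = Image.open(BytesIO(requests.get(image_url).content))
inputs = processor(text=text, images=image, padding=True, return_tensors="pt")
inputs = inputs.to(model.device)

# Generate, then strip the prompt tokens so only the new answer is decoded
with torch.no_grad():
    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=1024, do_sample=True)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(output_text)
```

Because `do_sample=True` is kept from the commit, the decoded answer will vary from run to run.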