richardaecn committed on Commit 90e0439 · verified · 1 Parent(s): 97e0ce9

Update README.md

Files changed (1):
  1. README.md +144 -0

README.md CHANGED
@@ -16,6 +16,150 @@ pipeline_tag: image-text-to-text
[[Paper](https://arxiv.org/abs/2504.16072)] | [[Code](https://github.com/NVlabs/describe-anything)] | [[Project Page](https://describe-anything.github.io/)] | [[Video](https://describe-anything.github.io/#video)] | [[HuggingFace Demo](https://huggingface.co/spaces/nvidia/describe-anything-model-demo)] | [[Model/Benchmark/Datasets](https://huggingface.co/collections/nvidia/describe-anything-680825bb8f5e41ff0785834c)] | [[Citation](#citation)]

Below is example inference code using this self-contained model.
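The script expects `torch`, `transformers`, `opencv-python`, `pillow`, `numpy`, and `requests` to be installed. On first run it downloads the sample image from GitHub plus the `facebook/sam-vit-huge` and `nvidia/DAM-3B-Self-Contained` checkpoints from the Hugging Face Hub, so an internet connection is required: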
```python
# Copyright 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# SPDX-License-Identifier: Apache-2.0

import torch
import numpy as np
from PIL import Image
from transformers import SamModel, SamProcessor, AutoModel
import cv2
import requests
from io import BytesIO


def apply_sam(image, input_points=None, input_boxes=None, input_labels=None):
    """Run SAM with point or box prompts and return the highest-scoring mask."""
    inputs = sam_processor(image, input_points=input_points, input_boxes=input_boxes,
                           input_labels=input_labels, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = sam_model(**inputs)

    masks = sam_processor.image_processor.post_process_masks(
        outputs.pred_masks.cpu(),
        inputs["original_sizes"].cpu(),
        inputs["reshaped_input_sizes"].cpu()
    )[0][0]
    scores = outputs.iou_scores[0, 0]

    # SAM proposes several candidate masks; keep the one with the best predicted IoU.
    mask_selection_index = scores.argmax()
    mask_np = masks[mask_selection_index].numpy()
    return mask_np


def add_contour(img, mask, input_points=None, input_boxes=None):
    """Draw the mask outline and any point/box prompts on a float [0, 1] image."""
    img = img.copy()
    mask = mask.astype(np.uint8) * 255
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cv2.drawContours(img, contours, -1, (1.0, 1.0, 1.0), thickness=6)

    if input_points is not None:
        for points in input_points:
            for x, y in points:
                cv2.circle(img, (int(x), int(y)), radius=10, color=(1.0, 0.0, 0.0), thickness=-1)
                cv2.circle(img, (int(x), int(y)), radius=10, color=(1.0, 1.0, 1.0), thickness=2)

    if input_boxes is not None:
        for box_batch in input_boxes:
            for box in box_batch:
                x1, y1, x2, y2 = map(int, box)
                cv2.rectangle(img, (x1, y1), (x2, y2), color=(1.0, 1.0, 1.0), thickness=4)
                cv2.rectangle(img, (x1, y1), (x2, y2), color=(1.0, 0.0, 0.0), thickness=2)

    return img


def print_streaming(text):
    print(text, end="", flush=True)


if __name__ == '__main__':
    # Download the sample image via HTTP
    image_url = 'https://github.com/NVlabs/describe-anything/blob/main/images/1.jpg?raw=true'
    response = requests.get(image_url)
    img = Image.open(BytesIO(response.content)).convert('RGB')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
    sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
    image_size = img.size  # (width, height)

    # Initialize the DAM model once and reuse it for every run
    model = AutoModel.from_pretrained(
        'nvidia/DAM-3B-Self-Contained',
        trust_remote_code=True,
        torch_dtype=torch.float16
    ).to(device)
    dam = model.init_dam(conv_mode='v1', prompt_mode='full+focal_crop')

    # Define two runs: one prompted with points, one with a box
    runs = [
        {
            'use_box': False,
            'points': [[1172, 812], [1572, 800]],
            'output_image_path': 'output_visualization_points.png'
        },
        {
            'use_box': True,
            'box': [800, 500, 1800, 1000],
            'output_image_path': 'output_visualization_box.png'
        }
    ]

    for run in runs:
        if run['use_box']:
            # Prepare box input
            coords = run['box']
            input_boxes = [[coords]]
            print(f"Running inference with input_boxes: {input_boxes}")
            mask_np = apply_sam(img, input_boxes=input_boxes)
            vis_points = None
            vis_boxes = input_boxes
        else:
            # Prepare point input
            pts = run['points']
            input_points = [pts]
            input_labels = [[1] * len(pts)]
            print(f"Running inference with input_points: {input_points}")
            mask_np = apply_sam(img, input_points=input_points, input_labels=input_labels)
            vis_points = input_points
            vis_boxes = None

        # Convert the mask to a PIL image and stream the description
        mask = Image.fromarray((mask_np * 255).astype(np.uint8))
        print("Description:")
        for token in dam.get_description(
            img,
            mask,
            '<image>\nDescribe the masked region in detail.',
            streaming=True,
            temperature=0.2,
            top_p=0.5,
            num_beams=1,
            max_new_tokens=512
        ):
            print_streaming(token)
        print()  # newline

        # Save a visualization with the mask contour overlaid
        img_np = np.asarray(img).astype(float) / 255.0
        img_with_contour_np = add_contour(img_np, mask_np,
                                          input_points=vis_points,
                                          input_boxes=vis_boxes)
        img_with_contour_pil = Image.fromarray((img_with_contour_np * 255.0).astype(np.uint8))
        img_with_contour_pil.save(run['output_image_path'])
        print(f"Output image with contour saved as {run['output_image_path']}")
```
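
If you already have a binary mask for the region of interest (for example, from an annotation tool), the SAM step can be skipped and the mask passed to `dam.get_description` directly. Below is a minimal sketch under that assumption; it reuses `img` and `dam` from the script above, and `my_mask.png` is a hypothetical mask file at the same resolution as the image:

```python
from PIL import Image

# Hypothetical precomputed mask: a single-channel (L-mode) image the same
# size as `img`, with white pixels (255) marking the region to describe.
mask = Image.open("my_mask.png").convert("L")

# Same generation call as in the full example, just without SAM in the loop.
description = "".join(
    dam.get_description(
        img,
        mask,
        "<image>\nDescribe the masked region in detail.",
        streaming=True,
        temperature=0.2,
        top_p=0.5,
        num_beams=1,
        max_new_tokens=512,
    )
)
print(description)
```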

# Model Card for DAM-3B

## Description