Fancy-MLLM committed on
Commit a5d2387 · verified · 1 Parent(s): 5762ea1

Update app.py

Files changed (1)
  1. app.py +47 -283
app.py CHANGED
@@ -1,240 +1,19 @@
- # import gradio as gr
- # from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
- # from threading import Thread
- # from qwen_vl_utils import process_vision_info
- # import torch
- # import time
-
- # # Check if a GPU is available
- # device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # local_path = "Fancy-MLLM/R1-OneVision-7B"
-
- # # Load the model on the appropriate device (GPU if available, otherwise CPU)
- # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- #     local_path, torch_dtype="auto", device_map=device
- # )
- # processor = AutoProcessor.from_pretrained(local_path)
-
- # def generate_output(image, text, button_click):
- #     # Prepare input data
- #     messages = [
- #         {
- #             "role": "user",
- #             "content": [
- #                 {"type": "image", "image": image, 'min_pixels': 1003520, 'max_pixels': 12845056},
- #                 {"type": "text", "text": text},
- #             ],
- #         }
- #     ]
-
- #     # Prepare inputs for the model
- #     text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- #     image_inputs, video_inputs = process_vision_info(messages)
- #     inputs = processor(
- #         text=[text_input],
- #         images=image_inputs,
- #         videos=video_inputs,
- #         padding=True,
- #         return_tensors="pt",
- #     )
-
- #     # Move inputs to the same device as the model
- #     inputs = inputs.to(model.device)
-
- #     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
- #     generation_kwargs = dict(
- #         **inputs,
- #         streamer=streamer,
- #         max_new_tokens=4096,
- #         top_p=0.001,
- #         top_k=1,
- #         temperature=0.01,
- #         repetition_penalty=1.0,
- #     )
-
- #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
- #     thread.start()
- #     generated_text = ''
-
- #     try:
- #         for new_text in streamer:
- #             generated_text += new_text
- #             yield f"‎{generated_text}"
- #     except Exception as e:
- #         print(f"Error: {e}")
- #         yield f"Error occurred: {str(e)}"
-
- # Css = """
- # #output-markdown {
- #     overflow-y: auto;
- #     white-space: pre-wrap;
- #     word-wrap: break-word;
- # }
- # #output-markdown .math {
- #     overflow-x: auto;
- #     max-width: 100%;
- # }
- # .markdown-text {
- #     white-space: pre-wrap;
- #     word-wrap: break-word;
- # }
- # .markdown-output {
- #     min-height: 20vh;
- #     max-width: 100%;
- #     overflow-y: auto;
- # }
- # #qwen-md .katex-display { display: inline; }
- # #qwen-md .katex-display>.katex { display: inline; }
- # #qwen-md .katex-display>.katex>.katex-html { display: inline; }
- # """
-
- # with gr.Blocks(css=Css) as demo:
- #     gr.HTML("""<center><font size=8>🦖 R1-OneVision Demo</center>""")
-
- #     with gr.Row():
- #         with gr.Column():
- #             input_image = gr.Image(type="pil", label="Upload")  # **switch back to PIL processing**
- #             input_text = gr.Textbox(label="Input your question")
- #             with gr.Row():
- #                 clear_btn = gr.ClearButton([input_image, input_text])
- #                 submit_btn = gr.Button("Submit", variant="primary")
-
- #         with gr.Column():
- #             output_text = gr.Markdown(elem_id="qwen-md", container=True, elem_classes="markdown-output")
-
- #     submit_btn.click(fn=generate_output, inputs=[input_image, input_text], outputs=output_text)
-
- # demo.launch(share=False)
-
-
- # import gradio as gr
- # from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
- # from transformers.image_utils import load_image
- # from threading import Thread
- # import time
- # import torch
- # import spaces
-
- # MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
- # processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
- # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- #     MODEL_ID,
- #     trust_remote_code=True,
- #     torch_dtype=torch.bfloat16
- # ).to("cuda").eval()
-
- # @spaces.GPU(duration=200)
- # def model_inference(input_dict, history):
- #     text = input_dict["text"]
- #     files = input_dict["files"]
-
- #     # Load images if provided
- #     if len(files) > 1:
- #         images = [load_image(image) for image in files]
- #     elif len(files) == 1:
- #         images = [load_image(files[0])]
- #     else:
- #         images = []
-
- #     # Validate input
- #     if text == "" and not images:
- #         gr.Error("Please input a query and optionally image(s).")
- #         return
- #     if text == "" and images:
- #         gr.Error("Please input a text query along with the image(s).")
- #         return
-
- #     # Prepare messages for the model
- #     messages = [
- #         {
- #             "role": "user",
- #             "content": [
- #                 *[{"type": "image", "image": image} for image in images],
- #                 {"type": "text", "text": text},
- #             ],
- #         }
- #     ]
-
- #     # Apply chat template and process inputs
- #     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- #     inputs = processor(
- #         text=[prompt],
- #         images=images if images else None,
- #         return_tensors="pt",
- #         padding=True,
- #     ).to("cuda")
-
- #     # # Set up streamer for real-time output
- #     # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
- #     # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-
- #     # # Start generation in a separate thread
- #     # thread = Thread(target=model.generate, kwargs=generation_kwargs)
- #     # thread.start()
-
- #     # # Stream the output
- #     # buffer = ""
- #     # yield "Thinking..."
- #     # for new_text in streamer:
- #     #     buffer += new_text
- #     #     time.sleep(0.01)
- #     #     yield buffer
- #     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
- #     generation_kwargs = dict(
- #         **inputs,
- #         streamer=streamer,
- #         max_new_tokens=2048,
- #         top_p=0.001,
- #         top_k=1,
- #         temperature=0.01,
- #         repetition_penalty=1.0,
- #     )
-
- #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
- #     thread.start()
- #     generated_text = ''
-
- #     try:
- #         for new_text in streamer:
- #             generated_text += new_text
- #             yield generated_text
- #     except Exception as e:
- #         print(f"Error: {e}")
- #         yield f"Error occurred: {str(e)}"
-
- # examples = [
- #     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
- # ]
-
- # demo = gr.ChatInterface(
- #     fn=model_inference,
- #     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
- #     examples=examples,
- #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
- #     stop_btn="Stop Generation",
- #     multimodal=True,
- #     cache_examples=False,
- # )
-
- # demo.launch(debug=True)
-
-
import os
from datetime import datetime
+ import subprocess
import time
- from threading import Thread

# Third-party imports
import numpy as np
import torch
from PIL import Image
+ import accelerate
import gradio as gr
import spaces
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
-     AutoProcessor,
-     TextIteratorStreamer
+     AutoTokenizer,
+     AutoProcessor
)

# Local imports
@@ -250,6 +29,7 @@ else:

print(f"[INFO] Using device: {device}")

+
def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
@@ -267,19 +47,19 @@ def array_to_image_path(image_array):
    full_path = os.path.abspath(filename)

    return full_path
-
+
models = {
-     "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
+     "Fancy-MLLM/R1-Onevision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-Onevision-7B",
                                                                                       trust_remote_code=True,
                                                                                       torch_dtype="auto",
                                                                                       device_map="auto").eval(),
}

processors = {
-     "Fancy-MLLM/R1-OneVision-7B": AutoProcessor.from_pretrained("Fancy-MLLM/R1-OneVision-7B", trust_remote_code=True),
+     "Fancy-MLLM/R1-Onevision-7B": AutoProcessor.from_pretrained("Fancy-MLLM/R1-Onevision-7B", trust_remote_code=True),
}

- DESCRIPTION = "[🦖 Fancy-MLLM/R1-OneVision-7B Demo]"
+ DESCRIPTION = "[🦖 Fancy-MLLM/R1-Onevision-7B Demo]"

kwargs = {}
kwargs['torch_dtype'] = torch.bfloat16
@@ -289,70 +69,55 @@ assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"

@spaces.GPU
- def model_inference(input_dict, history):
-     text = input_dict["text"]
-     files = input_dict["files"]
-
-     # Load images if provided
-     images = []
-     if len(files) > 0:
-         images = [array_to_image_path(image) for image in files]
+ def run_example(image, text_input=None, model_id=None):
+     start_time = time.time()
+     image_path = array_to_image_path(image)

-     # Validate input
-     if text == "" and not images:
-         yield "Error: Please input a query and optionally image(s)."
-         return
-     if text == "" and images:
-         yield "Error: Please input a text query along with the image(s)."
-         return
+     print(image_path)
+     model = models[model_id]
+     processor = processors[model_id]

-     # Prepare messages for the model
+     image = Image.fromarray(image).convert("RGB")
    messages = [
-         {
+         {
            "role": "user",
            "content": [
-                 *[{"type": "image", "image": image} for image in images],
-                 {"type": "text", "text": text},
+                 {
+                     "type": "image",
+                     "image": image_path,
+                 },
+                 {"type": "text", "text": text_input},
            ],
        }
    ]

-     # Apply chat template and process inputs
-     prompt = processors["Fancy-MLLM/R1-OneVision-7B"].apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
    image_inputs, video_inputs = process_vision_info(messages)
-     inputs = processors["Fancy-MLLM/R1-OneVision-7B"](
-         text=[prompt],
+     inputs = processor(
+         text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
-     ).to(device)
-
-     # Set up streamer for real-time output
-     streamer = TextIteratorStreamer(processors["Fancy-MLLM/R1-OneVision-7B"], skip_prompt=True, skip_special_tokens=True)
-
-     # Define the generation parameters
-     generation_kwargs = dict(
-         **inputs,
-         streamer=streamer,
-         max_new_tokens=2048,
-         top_p=0.001,
-         top_k=1,
-         temperature=0.01,
-         repetition_penalty=1.0,
+     )
+     inputs = inputs.to(device)
+
+     # Inference: Generation of the output
+     generated_ids = model.generate(**inputs, max_new_tokens=2048)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

-     # Start generation in a separate thread
-     thread = Thread(target=models["Fancy-MLLM/R1-OneVision-7B"].generate, kwargs=generation_kwargs)
-     thread.start()
-
-     # Stream the output
-     buffer = ""
-     yield "Thinking..."
-     for new_text in streamer:
-         buffer += new_text
-         time.sleep(0.01)
-         yield buffer
+     end_time = time.time()
+     total_time = round(end_time - start_time, 2)
+
+     return output_text[0], total_time

css = """
#output {
@@ -364,21 +129,20 @@ css = """

with gr.Blocks(css=css) as demo:
    gr.Markdown(DESCRIPTION)
-     with gr.Tab(label="R1-OneVision-7B Input"):
+     with gr.Tab(label="R1-Onevision-7B Input"):
        with gr.Row():
            with gr.Column():
-                 input_img = gr.Image(label="Input Picture", type="numpy", elem_id="image_input")
+                 input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(choices=list(models.keys()),
                                             label="Model",
-                                              value="Fancy-MLLM/R1-OneVision-7B")
+                                              value="Fancy-MLLM/R1-Onevision-7B")
                text_input = gr.Textbox(label="Text Prompt")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
-                 output_text = gr.Textbox(label="Output Text", elem_id="output_text", lines=10)
+                 output_text = gr.Textbox(label="Output Text")
+                 time_taken = gr.Textbox(label="Time taken for processing + inference")

-     submit_btn.click(model_inference, [input_img, text_input, model_selector], [output_text])
+     submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text, time_taken])

demo.queue(api_open=False)
demo.launch(debug=True)
-
-
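
For reference, below is a minimal standalone sketch (not part of the commit) of the non-streaming inference path that the updated app.py now follows: load the model and processor once, build a single-image chat message, run one plain generate() call, and decode the trimmed completion. It assumes qwen_vl_utils provides process_vision_info (the "Local imports" section elided in the diff); the image path and prompt are placeholders.

# sketch_inference.py — illustrative only; mirrors the new run_example() flow without Gradio
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info  # assumed local/helper import, as in app.py

model_id = "Fancy-MLLM/R1-Onevision-7B"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype="auto", device_map="auto"
).eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "example.jpg"},  # placeholder image path
            {"type": "text", "text": "Describe this image."},  # placeholder prompt
        ],
    }
]

# Build the chat prompt and the pixel inputs exactly as the app does
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(model.device)

# Single generate() call; the commit drops the TextIteratorStreamer/Thread
# streaming setup in favour of decoding the full completion at once.
generated_ids = model.generate(**inputs, max_new_tokens=2048)
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])

The trade-off shown in the diff is simplicity over interactivity: the app no longer streams partial tokens to the UI, but it returns the full answer together with the measured processing plus inference time.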