Fancy-MLLM committed on
Commit 5762ea1 · verified · 1 Parent(s): 6bcd1bb

Update app.py

Files changed (1)
  1. app.py +56 -43
app.py CHANGED
@@ -222,20 +222,19 @@
 
 import os
 from datetime import datetime
-import subprocess
 import time
+from threading import Thread
 
 # Third-party imports
 import numpy as np
 import torch
 from PIL import Image
-import accelerate
 import gradio as gr
 import spaces
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-    AutoTokenizer,
-    AutoProcessor
+    AutoProcessor,
+    TextIteratorStreamer
 )
 
 # Local imports
@@ -251,7 +250,6 @@ else:
 
 print(f"[INFO] Using device: {device}")
 
-
 def array_to_image_path(image_array):
     if image_array is None:
         raise ValueError("No image provided. Please upload an image before submitting.")
@@ -269,7 +267,7 @@ def array_to_image_path(image_array):
     full_path = os.path.abspath(filename)
 
     return full_path
-
+
 models = {
     "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
                                                                                      trust_remote_code=True,
@@ -291,55 +289,70 @@ assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 
 @spaces.GPU
-def run_example(image, text_input=None, model_id=None):
-    start_time = time.time()
-    image_path = array_to_image_path(image)
+def model_inference(input_dict, history):
+    text = input_dict["text"]
+    files = input_dict["files"]
+
+    # Load images if provided
+    images = []
+    if len(files) > 0:
+        images = [array_to_image_path(image) for image in files]
 
-    print(image_path)
-    model = models[model_id]
-    processor = processors[model_id]
-
-    image = Image.fromarray(image).convert("RGB")
+    # Validate input
+    if text == "" and not images:
+        yield "Error: Please input a query and optionally image(s)."
+        return
+    if text == "" and images:
+        yield "Error: Please input a text query along with the image(s)."
+        return
+
+    # Prepare messages for the model
     messages = [
-    {
+        {
             "role": "user",
             "content": [
-                {
-                    "type": "image",
-                    "image": image_path,
-                },
-                {"type": "text", "text": text_input},
+                *[{"type": "image", "image": image} for image in images],
+                {"type": "text", "text": text},
             ],
         }
     ]
 
-    # Preparation for inference
-    text = processor.apply_chat_template(
-        messages, tokenize=False, add_generation_prompt=True
-    )
+    # Apply chat template and process inputs
+    prompt = processors["Fancy-MLLM/R1-OneVision-7B"].apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processor(
-        text=[text],
+    inputs = processors["Fancy-MLLM/R1-OneVision-7B"](
+        text=[prompt],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    )
-    inputs = inputs.to(device)
+    ).to(device)
+
+    # Set up streamer for real-time output
+    streamer = TextIteratorStreamer(processors["Fancy-MLLM/R1-OneVision-7B"], skip_prompt=True, skip_special_tokens=True)
 
-    # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, max_new_tokens=2048)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    # Define the generation parameters
+    generation_kwargs = dict(
+        **inputs,
+        streamer=streamer,
+        max_new_tokens=2048,
+        top_p=0.001,
+        top_k=1,
+        temperature=0.01,
+        repetition_penalty=1.0,
     )
 
-    end_time = time.time()
-    total_time = round(end_time - start_time, 2)
-
-    return output_text[0], total_time
+    # Start generation in a separate thread
+    thread = Thread(target=models["Fancy-MLLM/R1-OneVision-7B"].generate, kwargs=generation_kwargs)
+    thread.start()
+
+    # Stream the output
+    buffer = ""
+    yield "Thinking..."
+    for new_text in streamer:
+        buffer += new_text
+        time.sleep(0.01)
+        yield buffer
 
 css = """
 #output {
@@ -354,18 +367,18 @@ with gr.Blocks(css=css) as demo:
     with gr.Tab(label="R1-OneVision-7B Input"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(label="Input Picture")
+                input_img = gr.Image(label="Input Picture", type="numpy", elem_id="image_input")
                 model_selector = gr.Dropdown(choices=list(models.keys()),
                                              label="Model",
                                              value="Fancy-MLLM/R1-OneVision-7B")
                 text_input = gr.Textbox(label="Text Prompt")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
-                time_taken = gr.Textbox(label="Time taken for processing + inference")
+                output_text = gr.Textbox(label="Output Text", elem_id="output_text", lines=10)
 
-        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text, time_taken])
+        submit_btn.click(model_inference, [input_img, text_input, model_selector], [output_text])
 
 demo.queue(api_open=False)
 demo.launch(debug=True)
 
+
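Note: the core of this commit is the switch from the blocking run_example call to a streaming model_inference generator built on transformers' TextIteratorStreamer and a worker thread. The snippet below is a minimal sketch of that pattern only, not the Space's actual code: it substitutes the small text-only gpt2 checkpoint so it runs without a GPU, and the print loop stands in for the `yield buffer` that feeds Gradio in app.py.

# Minimal sketch of the threaded streaming pattern (illustrative stand-ins, not the Space's code).
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")      # stand-in for the R1-OneVision processor
model = AutoModelForCausalLM.from_pretrained("gpt2")   # stand-in for the R1-OneVision model

inputs = tokenizer("Streaming generation works by", return_tensors="pt")

# skip_prompt drops the echoed prompt; skip_special_tokens strips markers such as end-of-text tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until decoding finishes, so it runs on a worker thread while the
# main thread consumes decoded chunks from the streamer as they become available.
thread = Thread(target=model.generate,
                kwargs=dict(**inputs, streamer=streamer, max_new_tokens=50))
thread.start()

buffer = ""
for new_text in streamer:                  # blocks until the next decoded chunk arrives
    buffer += new_text
    print(new_text, end="", flush=True)    # in the Space this step is `yield buffer` back to Gradio
thread.join()
print()

Running generate() on a separate thread is what lets the Gradio callback stay a generator: each chunk pulled from the streamer can be yielded immediately, so the output box updates while the model is still decoding instead of only after the full 2048-token budget is exhausted.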