Spaces:

Fancy-MLLM
/

R1-Onevision

Running on Zero

App Files Files Community

Fancy-MLLM committed on Feb 12

Commit

6bcd1bb

verified ·

1 Parent(s): 3a69964

Update app.py

Browse files

Files changed (1) hide show

app.py +243 -93

app.py CHANGED Viewed

@@ -108,114 +108,264 @@
 # demo.launch(share=False)
-import gradio as gr
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
-from transformers.image_utils import load_image
-from threading import Thread
 import time
 import torch
 import spaces
-MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
-processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16
-).to("cuda").eval()
-@spaces.GPU(duration=200)
-def model_inference(input_dict, history):
-    text = input_dict["text"]
-    files = input_dict["files"]
-    # Load images if provided
-    if len(files) > 1:
-        images = [load_image(image) for image in files]
-    elif len(files) == 1:
-        images = [load_image(files[0])]
-    else:
-        images = []
-    # Validate input
-    if text == "" and not images:
-        gr.Error("Please input a query and optionally image(s).")
-        return
-    if text == "" and images:
-        gr.Error("Please input a text query along with the image(s).")
-        return
-    # Prepare messages for the model
     messages = [
-        {
             "role": "user",
             "content": [
-                *[{"type": "image", "image": image} for image in images],
-                {"type": "text", "text": text},
             ],
         }
     ]
-    # Apply chat template and process inputs
-    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
     inputs = processor(
-        text=[prompt],
-        images=images if images else None,
-        return_tensors="pt",
         padding=True,
-    ).to("cuda")
-    # # Set up streamer for real-time output
-    # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-    # # Start generation in a separate thread
-    # thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    # thread.start()
-    # # Stream the output
-    # buffer = ""
-    # yield "Thinking..."
-    # for new_text in streamer:
-    #     buffer += new_text
-    #     time.sleep(0.01)
-    #     yield buffer
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(
-        **inputs,
-        streamer=streamer,
-        max_new_tokens=2048,
-        top_p=0.001,
-        top_k=1,
-        temperature=0.01,
-        repetition_penalty=1.0,
     )
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    generated_text = ''
-    try:
-        for new_text in streamer:
-            generated_text += new_text
-            yield generated_text
-    except Exception as e:
-        print(f"Error: {e}")
-        yield f"Error occurred: {str(e)}"
-examples = [
-    [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
-]
-demo = gr.ChatInterface(
-    fn=model_inference,
-    description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
-    examples=examples,
-    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
-    stop_btn="Stop Generation",
-    multimodal=True,
-    cache_examples=False,
-)
 demo.launch(debug=True)

 # demo.launch(share=False)
+# import gradio as gr
+# from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
+# from transformers.image_utils import load_image
+# from threading import Thread
+# import time
+# import torch
+# import spaces
+# MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
+# processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     MODEL_ID,
+#     trust_remote_code=True,
+#     torch_dtype=torch.bfloat16
+# ).to("cuda").eval()
+# @spaces.GPU(duration=200)
+# def model_inference(input_dict, history):
+#     text = input_dict["text"]
+#     files = input_dict["files"]
+#     # Load images if provided
+#     if len(files) > 1:
+#         images = [load_image(image) for image in files]
+#     elif len(files) == 1:
+#         images = [load_image(files[0])]
+#     else:
+#         images = []
+#     # Validate input
+#     if text == "" and not images:
+#         gr.Error("Please input a query and optionally image(s).")
+#         return
+#     if text == "" and images:
+#         gr.Error("Please input a text query along with the image(s).")
+#         return
+#     # Prepare messages for the model
+#     messages = [
+#         {
+#             "role": "user",
+#             "content": [
+#                 *[{"type": "image", "image": image} for image in images],
+#                 {"type": "text", "text": text},
+#             ],
+#         }
+#     ]
+#     # Apply chat template and process inputs
+#     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+#     inputs = processor(
+#         text=[prompt],
+#         images=images if images else None,
+#         return_tensors="pt",
+#         padding=True,
+#     ).to("cuda")
+#     # # Set up streamer for real-time output
+#     # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+#     # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
+#     # # Start generation in a separate thread
+#     # thread = Thread(target=model.generate, kwargs=generation_kwargs)
+#     # thread.start()
+#     # # Stream the output
+#     # buffer = ""
+#     # yield "Thinking..."
+#     # for new_text in streamer:
+#     #     buffer += new_text
+#     #     time.sleep(0.01)
+#     #     yield buffer
+#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+#     generation_kwargs = dict(
+#         **inputs,
+#         streamer=streamer,
+#         max_new_tokens=2048,
+#         top_p=0.001,
+#         top_k=1,
+#         temperature=0.01,
+#         repetition_penalty=1.0,
+#     )
+#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+#     thread.start()
+#     generated_text = ''
+#     try:
+#         for new_text in streamer:
+#             generated_text += new_text
+#             yield generated_text
+#     except Exception as e:
+#         print(f"Error: {e}")
+#         yield f"Error occurred: {str(e)}"
+# examples = [
+#     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
+# ]
+# demo = gr.ChatInterface(
+#     fn=model_inference,
+#     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
+#     examples=examples,
+#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+#     stop_btn="Stop Generation",
+#     multimodal=True,
+#     cache_examples=False,
+# )
+# demo.launch(debug=True)
+import os
+from datetime import datetime
+import subprocess
 import time
+# Third-party imports
+import numpy as np
 import torch
+from PIL import Image
+import accelerate
+import gradio as gr
 import spaces
+from transformers import (
+    Qwen2_5_VLForConditionalGeneration,
+    AutoTokenizer,
+    AutoProcessor
+)
+# Local imports
+from qwen_vl_utils import process_vision_info
+# Select the compute device: CUDA when available, else MPS (when available and built), else CPU
+if torch.cuda.is_available():
+    device = "cuda"
+elif (torch.backends.mps.is_available()) and (torch.backends.mps.is_built()):
+    device = "mps"
+else:
+    device = "cpu"
+print(f"[INFO] Using device: {device}")  # module-level statement, so it runs when the file is loaded
+def array_to_image_path(image_array):  # save the given image array as a PNG and return the file's absolute path
+    if image_array is None:
+        raise ValueError("No image provided. Please upload an image before submitting.")
+    # Cast the array to uint8 and wrap it in a PIL Image
+    img = Image.fromarray(np.uint8(image_array))
+    # Build the filename from the current local time, resolved to the second
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # NOTE(review): two calls within the same second yield the same name
+    filename = f"image_{timestamp}.png"  # relative name, so the file lands in the current working directory
+    # Write the PNG to disk; nothing in this function removes it afterwards
+    img.save(filename)
+    # Resolve the relative name against the working directory for the caller
+    full_path = os.path.abspath(filename)
+    return full_path
+models = {  # model id -> model instance, loaded at module level and switched to eval mode
+    "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
+                                                                                      trust_remote_code=True,
+                                                                                      torch_dtype="auto",
+                                                                                      device_map="auto").eval(),
+}
+processors = {  # model id -> processor, keyed the same way as `models`
+    "Fancy-MLLM/R1-OneVision-7B": AutoProcessor.from_pretrained("Fancy-MLLM/R1-OneVision-7B", trust_remote_code=True),
+}
+DESCRIPTION = "[🦖 Fancy-MLLM/R1-OneVision-7B Demo]"  # passed to gr.Markdown at the top of the UI
+kwargs = {}  # NOTE(review): not referenced anywhere else in the visible code
+kwargs['torch_dtype'] = torch.bfloat16
+user_prompt = '<|user|>\n'  # NOTE(review): these three prompt markers are not referenced in the visible code
+assistant_prompt = '<|assistant|>\n'
+prompt_suffix = "<|end|>\n"
+@spaces.GPU
+def run_example(image, text_input=None, model_id=None):  # run one image + prompt through the chosen model; returns (text, seconds)
+    start_time = time.time()  # the timer spans image saving, preprocessing, generation and decoding
+    image_path = array_to_image_path(image)  # raises ValueError when image is None
+    print(image_path)
+    model = models[model_id]  # KeyError when model_id is not a key of `models`, including the default None
+    processor = processors[model_id]
+    image = Image.fromarray(image).convert("RGB")  # NOTE(review): this RGB copy is not read again below; the message uses image_path
     messages = [
+    {
             "role": "user",
             "content": [
+                {
+                    "type": "image",
+                    "image": image_path,
+                },
+                {"type": "text", "text": text_input},
             ],
         }
     ]
+    # Render the chat messages to prompt text, then extract the image/video inputs from them
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
     inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
         padding=True,
+        return_tensors="pt",
     )
+    inputs = inputs.to(device)  # `device` is the module-level string chosen when the file is loaded
+    # Generate with a cap of 2048 new tokens
+    generated_ids = model.generate(**inputs, max_new_tokens=2048)
+    generated_ids_trimmed = [  # slice off the first len(in_ids) ids of each sequence (presumably the echoed prompt — confirm)
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    end_time = time.time()
+    total_time = round(end_time - start_time, 2)  # elapsed seconds, rounded to two decimals
+    return output_text[0], total_time  # only the first decoded string; a single prompt is passed to the processor above
+css = """
+  #output {
+    height: 500px;
+    overflow: auto;
+    border: 1px solid #ccc;
+  }
+"""
+with gr.Blocks(css=css) as demo:  # NOTE(review): css styles "#output", but no component below sets elem_id="output"
+    gr.Markdown(DESCRIPTION)
+    with gr.Tab(label="R1-OneVision-7B Input"):
+        with gr.Row():
+            with gr.Column():  # first column: the inputs and the submit button
+                input_img = gr.Image(label="Input Picture")
+                model_selector = gr.Dropdown(choices=list(models.keys()),
+                                             label="Model",
+                                             value="Fancy-MLLM/R1-OneVision-7B")
+                text_input = gr.Textbox(label="Text Prompt")
+                submit_btn = gr.Button(value="Submit")
+            with gr.Column():  # second column: the two values returned by run_example
+                output_text = gr.Textbox(label="Output Text")
+                time_taken = gr.Textbox(label="Time taken for processing + inference")
+        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text, time_taken])  # input order matches run_example(image, text_input, model_id)
+demo.queue(api_open=False)
 demo.launch(debug=True)