# ------------------------------------------------------------------
# Previous version 1 (kept for reference): gr.Blocks demo with a PIL
# image input and threaded streaming generation.
# ------------------------------------------------------------------
# import gradio as gr
# from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
# from threading import Thread
# from qwen_vl_utils import process_vision_info
# import torch
# import time
#
# # Check if a GPU is available
# device = "cuda" if torch.cuda.is_available() else "cpu"
# local_path = "Fancy-MLLM/R1-OneVision-7B"
#
# # Load the model on the appropriate device (GPU if available, otherwise CPU)
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     local_path, torch_dtype="auto", device_map=device
# )
# processor = AutoProcessor.from_pretrained(local_path)
#
# def generate_output(image, text, button_click):
#     # Prepare input data
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 {"type": "image", "image": image, "min_pixels": 1003520, "max_pixels": 12845056},
#                 {"type": "text", "text": text},
#             ],
#         }
#     ]
#
#     # Prepare inputs for the model
#     text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     image_inputs, video_inputs = process_vision_info(messages)
#     inputs = processor(
#         text=[text_input],
#         images=image_inputs,
#         videos=video_inputs,
#         padding=True,
#         return_tensors="pt",
#     )
#     # Move inputs to the same device as the model
#     inputs = inputs.to(model.device)
#
#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs,
#         streamer=streamer,
#         max_new_tokens=4096,
#         top_p=0.001,
#         top_k=1,
#         temperature=0.01,
#         repetition_penalty=1.0,
#     )
#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()
#
#     generated_text = ""
#     try:
#         for new_text in streamer:
#             generated_text += new_text
#             yield f"{generated_text}"
#     except Exception as e:
#         print(f"Error: {e}")
#         yield f"Error occurred: {str(e)}"
#
# Css = """
# #output-markdown {
#     overflow-y: auto;
#     white-space: pre-wrap;
#     word-wrap: break-word;
# }
# #output-markdown .math {
#     overflow-x: auto;
#     max-width: 100%;
# }
# .markdown-text {
#     white-space: pre-wrap;
#     word-wrap: break-word;
# }
# .markdown-output {
#     min-height: 20vh;
#     max-width: 100%;
#     overflow-y: auto;
# }
# #qwen-md .katex-display { display: inline; }
# #qwen-md .katex-display > .katex { display: inline; }
# #qwen-md .katex-display > .katex > .katex-html { display: inline; }
# """
#
# with gr.Blocks(css=Css) as demo:
#     gr.HTML("""<center><font size=8>🦖 R1-OneVision Demo</center>""")
#     with gr.Row():
#         with gr.Column():
#             input_image = gr.Image(type="pil", label="Upload")  # switched back to PIL handling
#             input_text = gr.Textbox(label="Input your question")
#             with gr.Row():
#                 clear_btn = gr.ClearButton([input_image, input_text])
#                 submit_btn = gr.Button("Submit", variant="primary")
#         with gr.Column():
#             output_text = gr.Markdown(elem_id="qwen-md", container=True, elem_classes="markdown-output")
#     submit_btn.click(fn=generate_output, inputs=[input_image, input_text], outputs=output_text)
#
# demo.launch(share=False)
# ------------------------------------------------------------------
# Previous version 2 (kept for reference): gr.ChatInterface demo
# running on ZeroGPU via the @spaces.GPU decorator.
# ------------------------------------------------------------------
# import gradio as gr
# from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
# from transformers.image_utils import load_image
# from threading import Thread
# import time
# import torch
# import spaces
#
# MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
# processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     MODEL_ID,
#     trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
# ).to("cuda").eval()
#
# @spaces.GPU(duration=200)
# def model_inference(input_dict, history):
#     text = input_dict["text"]
#     files = input_dict["files"]
#
#     # Load images if provided
#     if len(files) > 1:
#         images = [load_image(image) for image in files]
#     elif len(files) == 1:
#         images = [load_image(files[0])]
#     else:
#         images = []
#
#     # Validate input
#     if text == "" and not images:
#         gr.Error("Please input a query and optionally image(s).")
#         return
#     if text == "" and images:
#         gr.Error("Please input a text query along with the image(s).")
#         return
#
#     # Prepare messages for the model
#     messages = [
#         {
#             "role": "user",
#             "content": [
#                 *[{"type": "image", "image": image} for image in images],
#                 {"type": "text", "text": text},
#             ],
#         }
#     ]
#
#     # Apply chat template and process inputs
#     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = processor(
#         text=[prompt],
#         images=images if images else None,
#         return_tensors="pt",
#         padding=True,
#     ).to("cuda")
#
#     # Set up streamer for real-time output
#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
#     generation_kwargs = dict(
#         **inputs,
#         streamer=streamer,
#         max_new_tokens=2048,
#         top_p=0.001,
#         top_k=1,
#         temperature=0.01,
#         repetition_penalty=1.0,
#     )
#
#     # Start generation in a separate thread and stream the output
#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
#     thread.start()
#
#     generated_text = ""
#     try:
#         for new_text in streamer:
#             generated_text += new_text
#             yield generated_text
#     except Exception as e:
#         print(f"Error: {e}")
#         yield f"Error occurred: {str(e)}"
#
# examples = [
#     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
# ]
#
# demo = gr.ChatInterface(
#     fn=model_inference,
#     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
#     examples=examples,
#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
#     stop_btn="Stop Generation",
#     multimodal=True,
#     cache_examples=False,
# )
# demo.launch(debug=True)
import os
from datetime import datetime
import time
from threading import Thread

# Third-party imports
import numpy as np
import torch
from PIL import Image
import gradio as gr
import spaces
from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer,
)

# Local imports
from qwen_vl_utils import process_vision_info

# Set device-agnostic code
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
else:
    device = "cpu"
print(f"[INFO] Using device: {device}")


def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")

    # Convert numpy array to PIL Image
    img = Image.fromarray(np.uint8(image_array))

    # Generate a unique filename using a timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"image_{timestamp}.png"

    # Save the image and return its absolute path
    img.save(filename)
    return os.path.abspath(filename)


models = {
    "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained(
        "Fancy-MLLM/R1-OneVision-7B",
        trust_remote_code=True,
        torch_dtype="auto",
        device_map="auto",
    ).eval(),
}

processors = {
    "Fancy-MLLM/R1-OneVision-7B": AutoProcessor.from_pretrained(
        "Fancy-MLLM/R1-OneVision-7B", trust_remote_code=True
    ),
}

DESCRIPTION = "[🦖 Fancy-MLLM/R1-OneVision-7B Demo]"

# Unused legacy prompt settings, kept for reference; the processor's chat
# template below handles prompt formatting.
# kwargs = {"torch_dtype": torch.bfloat16}
# user_prompt = "<|user|>\n"
# assistant_prompt = "<|assistant|>\n"
# prompt_suffix = "<|end|>\n"


# Request a ZeroGPU slot for the duration of the call (the Space runs on Zero
# and `spaces` is imported above).
@spaces.GPU
def model_inference(image, text, model_id):
    # The Blocks UI below passes a numpy image, the text prompt, and the selected model id.
    # Validate input
    if not text and image is None:
        yield "Error: Please input a query and optionally an image."
        return
    if not text and image is not None:
        yield "Error: Please input a text query along with the image."
        return

    model = models[model_id]
    processor = processors[model_id]

    # Save the uploaded image to disk so it can be referenced by path in the chat message
    images = []
    if image is not None:
        images = [array_to_image_path(image)]

    # Prepare messages for the model
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": img_path} for img_path in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    # Apply chat template and process inputs
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)

    # Set up streamer for real-time output
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    # Define the generation parameters (near-greedy decoding)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=2048,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
    )

    # Start generation in a separate thread so the streamer can be consumed here
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Stream the output
    buffer = ""
    yield "Thinking..."
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer
css = """ | |
#output { | |
height: 500px; | |
overflow: auto; | |
border: 1px solid #ccc; | |
} | |
""" | |
with gr.Blocks(css=css) as demo: | |
gr.Markdown(DESCRIPTION) | |
with gr.Tab(label="R1-OneVision-7B Input"): | |
with gr.Row(): | |
with gr.Column(): | |
input_img = gr.Image(label="Input Picture", type="numpy", elem_id="image_input") | |
model_selector = gr.Dropdown(choices=list(models.keys()), | |
label="Model", | |
value="Fancy-MLLM/R1-OneVision-7B") | |
text_input = gr.Textbox(label="Text Prompt") | |
submit_btn = gr.Button(value="Submit") | |
with gr.Column(): | |
output_text = gr.Textbox(label="Output Text", elem_id="output_text", lines=10) | |
submit_btn.click(model_inference, [input_img, text_input, model_selector], [output_text]) | |
demo.queue(api_open=False) | |
demo.launch(debug=True) | |
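
# ------------------------------------------------------------------
# Optional local smoke test: a minimal sketch, kept commented out so it
# never runs in the Space, showing how the streaming pipeline above could
# be exercised without the Gradio UI. The image path "example.jpg" and the
# RUN_SMOKE_TEST environment variable are placeholders, not part of the
# original demo.
# ------------------------------------------------------------------
# if os.environ.get("RUN_SMOKE_TEST"):
#     test_image = np.array(Image.open("example.jpg").convert("RGB"))
#     for partial in model_inference(test_image, "Describe this image.", "Fancy-MLLM/R1-OneVision-7B"):
#         print(partial)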