Fancy-MLLM committed on
Commit 53c6808 · verified · 1 Parent(s): 4c6ecb5

Update app.py

Files changed (1)
  1. app.py +50 -125
app.py CHANGED
@@ -108,104 +108,15 @@
  # demo.launch(share=False)
 
 
- # import gradio as gr
- # from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
- # from transformers.image_utils import load_image
- # from threading import Thread
- # import time
- # import torch
- # import spaces
-
- # MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
- # processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
- # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
- #     MODEL_ID,
- #     trust_remote_code=True,
- #     torch_dtype=torch.bfloat16
- # ).to("cuda").eval()
-
- # @spaces.GPU(duration=200)
- # def model_inference(input_dict, history):
- #     text = input_dict["text"]
- #     files = input_dict["files"]
-
- #     # Load images if provided
- #     if len(files) > 1:
- #         images = [load_image(image) for image in files]
- #     elif len(files) == 1:
- #         images = [load_image(files[0])]
- #     else:
- #         images = []
-
- #     # Validate input
- #     if text == "" and not images:
- #         gr.Error("Please input a query and optionally image(s).")
- #         return
- #     if text == "" and images:
- #         gr.Error("Please input a text query along with the image(s).")
- #         return
-
- #     # Prepare messages for the model
- #     messages = [
- #         {
- #             "role": "user",
- #             "content": [
- #                 *[{"type": "image", "image": image} for image in images],
- #                 {"type": "text", "text": text},
- #             ],
- #         }
- #     ]
-
- #     # Apply chat template and process inputs
- #     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
- #     inputs = processor(
- #         text=[prompt],
- #         images=images if images else None,
- #         return_tensors="pt",
- #         padding=True,
- #     ).to("cuda")
-
- #     # Set up streamer for real-time output
- #     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
- #     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-
- #     # Start generation in a separate thread
- #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
- #     thread.start()
-
- #     # Stream the output
- #     buffer = ""
- #     yield "Thinking..."
- #     for new_text in streamer:
- #         buffer += new_text
- #         time.sleep(0.01)
- #         yield buffer
-
- # examples = [
- #     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
- # ]
-
- # demo = gr.ChatInterface(
- #     fn=model_inference,
- #     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
- #     examples=examples,
- #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
- #     stop_btn="Stop Generation",
- #     multimodal=True,
- #     cache_examples=False,
- # )
-
- # demo.launch(debug=True)
-
  import gradio as gr
  from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
  from transformers.image_utils import load_image
  from threading import Thread
  import time
  import torch
+ import spaces
 
- # 加载模型和处理器
- MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
+ MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
      MODEL_ID,
@@ -213,11 +124,12 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
      torch_dtype=torch.bfloat16
  ).to("cuda").eval()
 
+ @spaces.GPU(duration=200)
  def model_inference(input_dict, history):
      text = input_dict["text"]
      files = input_dict["files"]
 
-     # 加载图片(如果提供)
+     # Load images if provided
      if len(files) > 1:
          images = [load_image(image) for image in files]
      elif len(files) == 1:
@@ -225,13 +137,15 @@ def model_inference(input_dict, history):
      else:
          images = []
 
-     # 输入验证
+     # Validate input
      if text == "" and not images:
-         return gr.Error("Please input a query and optionally image(s).")
+         gr.Error("Please input a query and optionally image(s).")
+         return
      if text == "" and images:
-         return gr.Error("Please input a text query along with the image(s).")
+         gr.Error("Please input a text query along with the image(s).")
+         return
 
-     # 准备输入消息
+     # Prepare messages for the model
      messages = [
          {
              "role": "user",
@@ -242,7 +156,7 @@ def model_inference(input_dict, history):
          }
      ]
 
-     # 使用处理器准备输入
+     # Apply chat template and process inputs
      prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
      inputs = processor(
          text=[prompt],
@@ -251,44 +165,55 @@ def model_inference(input_dict, history):
          padding=True,
      ).to("cuda")
 
-     # 设置最大输出token数以控制推理时间
-     max_new_tokens = 1024 # 可以根据实际需要调整
-
-     # 创建流式输出
+     # # Set up streamer for real-time output
+     # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
+
+     # # Start generation in a separate thread
+     # thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     # thread.start()
+
+     # # Stream the output
+     # buffer = ""
+     # yield "Thinking..."
+     # for new_text in streamer:
+     #     buffer += new_text
+     #     time.sleep(0.01)
+     #     yield buffer
      streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)
-
-     # 使用后台线程执行推理
-     def run_inference():
-         model.generate(**generation_kwargs)
-
-     thread = Thread(target=run_inference)
+     generation_kwargs = dict(
+         **inputs,
+         streamer=streamer,
+         max_new_tokens=2048,
+         top_p=0.001,
+         top_k=1,
+         temperature=0.01,
+         repetition_penalty=1.0,
+     )
+
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
+     generated_text = ''
+
+     try:
+         for new_text in streamer:
+             generated_text += new_text
+             yield generated_text
+     except Exception as e:
+         print(f"Error: {e}")
+         yield f"Error occurred: {str(e)}"
 
-     # 生成过程中更新UI
-     buffer = ""
-     yield "Processing your request, please wait..."
-     for new_text in streamer:
-         buffer += new_text
-         time.sleep(0.01) # 给UI流畅更新的时间
-         yield buffer
-
- # 示例输入
  examples = [
      [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
  ]
 
- # 创建Gradio界面
- demo = gr.Interface(
+ demo = gr.ChatInterface(
      fn=model_inference,
      description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
      examples=examples,
-     inputs=gr.Chatbox(),
-     outputs=gr.Textbox(),
-     live=True,
-     allow_flagging="never",
-     layout="vertical",
-     title="Multimodal Inference with Fancy-MLLM",
+     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+     stop_btn="Stop Generation",
+     multimodal=True,
      cache_examples=False,
  )
 
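
The updated handler streams tokens by running model.generate on a background thread and reading partial text from a TextIteratorStreamer. The snippet below is a minimal, self-contained sketch of that pattern only; it uses a small text-only placeholder checkpoint (Qwen/Qwen2.5-0.5B-Instruct) rather than this Space's multimodal model, and it omits the image processor and gr.ChatInterface wiring from app.py.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder checkpoint for illustration only (not the Space's model).
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

def stream_reply(prompt: str):
    inputs = tokenizer(prompt, return_tensors="pt")
    # skip_prompt drops the echoed input; skip_special_tokens hides EOS markers.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=128)
    # generate() blocks until finished, so it runs on a background thread
    # while this generator reads partial text from the streamer.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    text = ""
    for chunk in streamer:
        text += chunk
        yield text  # yield the growing string, as a Gradio chat callback expects

for partial in stream_reply("Say hello in one short sentence."):
    print(partial)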