Fancy-MLLM committed
Commit 6bcd1bb · verified · 1 Parent(s): 3a69964

Update app.py

Files changed (1)
  1. app.py +243 -93
app.py CHANGED
@@ -108,114 +108,264 @@
  # demo.launch(share=False)


- import gradio as gr
- from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
- from transformers.image_utils import load_image
- from threading import Thread
  import time
  import torch
  import spaces

- MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16
- ).to("cuda").eval()
-
- @spaces.GPU(duration=200)
- def model_inference(input_dict, history):
-     text = input_dict["text"]
-     files = input_dict["files"]
-
-     # Load images if provided
-     if len(files) > 1:
-         images = [load_image(image) for image in files]
-     elif len(files) == 1:
-         images = [load_image(files[0])]
-     else:
-         images = []
-
-     # Validate input
-     if text == "" and not images:
-         gr.Error("Please input a query and optionally image(s).")
-         return
-     if text == "" and images:
-         gr.Error("Please input a text query along with the image(s).")
-         return
-
-     # Prepare messages for the model
      messages = [
-         {
              "role": "user",
              "content": [
-                 *[{"type": "image", "image": image} for image in images],
-                 {"type": "text", "text": text},
              ],
          }
      ]
-
-     # Apply chat template and process inputs
-     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
      inputs = processor(
-         text=[prompt],
-         images=images if images else None,
-         return_tensors="pt",
          padding=True,
-     ).to("cuda")
-
-     # # Set up streamer for real-time output
-     # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-
-     # # Start generation in a separate thread
-     # thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     # thread.start()
-
-     # # Stream the output
-     # buffer = ""
-     # yield "Thinking..."
-     # for new_text in streamer:
-     #     buffer += new_text
-     #     time.sleep(0.01)
-     #     yield buffer
-     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(
-         **inputs,
-         streamer=streamer,
-         max_new_tokens=2048,
-         top_p=0.001,
-         top_k=1,
-         temperature=0.01,
-         repetition_penalty=1.0,
      )

-     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-     thread.start()
-     generated_text = ''

-     try:
-         for new_text in streamer:
-             generated_text += new_text
-             yield generated_text
-     except Exception as e:
-         print(f"Error: {e}")
-         yield f"Error occurred: {str(e)}"
-
- examples = [
-     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
- ]
-
- demo = gr.ChatInterface(
-     fn=model_inference,
-     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
-     examples=examples,
-     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
-     stop_btn="Stop Generation",
-     multimodal=True,
-     cache_examples=False,
- )

  demo.launch(debug=True)
 
 
  # demo.launch(share=False)


+ # import gradio as gr
+ # from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
+ # from transformers.image_utils import load_image
+ # from threading import Thread
+ # import time
+ # import torch
+ # import spaces
+
+ # MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
+ # processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+ # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+ #     MODEL_ID,
+ #     trust_remote_code=True,
+ #     torch_dtype=torch.bfloat16
+ # ).to("cuda").eval()
+
+ # @spaces.GPU(duration=200)
+ # def model_inference(input_dict, history):
+ #     text = input_dict["text"]
+ #     files = input_dict["files"]
+
+ #     # Load images if provided
+ #     if len(files) > 1:
+ #         images = [load_image(image) for image in files]
+ #     elif len(files) == 1:
+ #         images = [load_image(files[0])]
+ #     else:
+ #         images = []
+
+ #     # Validate input
+ #     if text == "" and not images:
+ #         gr.Error("Please input a query and optionally image(s).")
+ #         return
+ #     if text == "" and images:
+ #         gr.Error("Please input a text query along with the image(s).")
+ #         return
+
+ #     # Prepare messages for the model
+ #     messages = [
+ #         {
+ #             "role": "user",
+ #             "content": [
+ #                 *[{"type": "image", "image": image} for image in images],
+ #                 {"type": "text", "text": text},
+ #             ],
+ #         }
+ #     ]
+
+ #     # Apply chat template and process inputs
+ #     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+ #     inputs = processor(
+ #         text=[prompt],
+ #         images=images if images else None,
+ #         return_tensors="pt",
+ #         padding=True,
+ #     ).to("cuda")
+
+ #     # # Set up streamer for real-time output
+ #     # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+ #     # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
+
+ #     # # Start generation in a separate thread
+ #     # thread = Thread(target=model.generate, kwargs=generation_kwargs)
+ #     # thread.start()
+
+ #     # # Stream the output
+ #     # buffer = ""
+ #     # yield "Thinking..."
+ #     # for new_text in streamer:
+ #     #     buffer += new_text
+ #     #     time.sleep(0.01)
+ #     #     yield buffer
+ #     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+ #     generation_kwargs = dict(
+ #         **inputs,
+ #         streamer=streamer,
+ #         max_new_tokens=2048,
+ #         top_p=0.001,
+ #         top_k=1,
+ #         temperature=0.01,
+ #         repetition_penalty=1.0,
+ #     )
+
+ #     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+ #     thread.start()
+ #     generated_text = ''
+
+ #     try:
+ #         for new_text in streamer:
+ #             generated_text += new_text
+ #             yield generated_text
+ #     except Exception as e:
+ #         print(f"Error: {e}")
+ #         yield f"Error occurred: {str(e)}"
+
+ # examples = [
+ #     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
+ # ]
+
+ # demo = gr.ChatInterface(
+ #     fn=model_inference,
+ #     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
+ #     examples=examples,
+ #     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
+ #     stop_btn="Stop Generation",
+ #     multimodal=True,
+ #     cache_examples=False,
+ # )
+
+ # demo.launch(debug=True)
+
+
+ import os
+ from datetime import datetime
+ import subprocess
  import time
+
+ # Third-party imports
+ import numpy as np
  import torch
+ from PIL import Image
+ import accelerate
+ import gradio as gr
  import spaces
+ from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
+     AutoTokenizer,
+     AutoProcessor
+ )
+
+ # Local imports
+ from qwen_vl_utils import process_vision_info
+
+ # Set device agnostic code
+ if torch.cuda.is_available():
+     device = "cuda"
+ elif (torch.backends.mps.is_available()) and (torch.backends.mps.is_built()):
+     device = "mps"
+ else:
+     device = "cpu"
+
+ print(f"[INFO] Using device: {device}")

+
+ def array_to_image_path(image_array):
+     if image_array is None:
+         raise ValueError("No image provided. Please upload an image before submitting.")
+     # Convert numpy array to PIL Image
+     img = Image.fromarray(np.uint8(image_array))
+
+     # Generate a unique filename using timestamp
+     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+     filename = f"image_{timestamp}.png"
+
+     # Save the image
+     img.save(filename)
+
+     # Get the full path of the saved image
+     full_path = os.path.abspath(filename)
+
+     return full_path
+
+ models = {
+     "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
+                                                                                      trust_remote_code=True,
+                                                                                      torch_dtype="auto",
+                                                                                      device_map="auto").eval(),
+ }
+
+ processors = {
+     "Fancy-MLLM/R1-OneVision-7B": AutoProcessor.from_pretrained("Fancy-MLLM/R1-OneVision-7B", trust_remote_code=True),
+ }
+
+ DESCRIPTION = "[🦖 Fancy-MLLM/R1-OneVision-7B Demo]"
+
+ kwargs = {}
+ kwargs['torch_dtype'] = torch.bfloat16
+
+ user_prompt = '<|user|>\n'
+ assistant_prompt = '<|assistant|>\n'
+ prompt_suffix = "<|end|>\n"
+
+ @spaces.GPU
+ def run_example(image, text_input=None, model_id=None):
+     start_time = time.time()
+     image_path = array_to_image_path(image)
+
+     print(image_path)
+     model = models[model_id]
+     processor = processors[model_id]
+
+     image = Image.fromarray(image).convert("RGB")
      messages = [
+         {
              "role": "user",
              "content": [
+                 {
+                     "type": "image",
+                     "image": image_path,
+                 },
+                 {"type": "text", "text": text_input},
              ],
          }
      ]
+
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
      inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
          padding=True,
+         return_tensors="pt",
      )
+     inputs = inputs.to(device)

+     # Inference: Generation of the output
+     generated_ids = model.generate(**inputs, max_new_tokens=2048)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+
+     end_time = time.time()
+     total_time = round(end_time - start_time, 2)

+     return output_text[0], total_time
+
+ css = """
+ #output {
+     height: 500px;
+     overflow: auto;
+     border: 1px solid #ccc;
+ }
+ """
+
+ with gr.Blocks(css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     with gr.Tab(label="R1-OneVision-7B Input"):
+         with gr.Row():
+             with gr.Column():
+                 input_img = gr.Image(label="Input Picture")
+                 model_selector = gr.Dropdown(choices=list(models.keys()),
+                                              label="Model",
+                                              value="Fancy-MLLM/R1-OneVision-7B")
+                 text_input = gr.Textbox(label="Text Prompt")
+                 submit_btn = gr.Button(value="Submit")
+             with gr.Column():
+                 output_text = gr.Textbox(label="Output Text")
+                 time_taken = gr.Textbox(label="Time taken for processing + inference")
+
+         submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text, time_taken])

+ demo.queue(api_open=False)
  demo.launch(debug=True)
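
For reference, a minimal way to exercise the new run_example path outside the Gradio UI might look like the sketch below. This is a hypothetical snippet, not part of the commit: it assumes the definitions from the diff above (models, processors, run_example) are already in scope in the current Python session, and that the example image 5.jpg referenced in the old examples list is on disk.

# Hypothetical smoke test for run_example(); assumes the definitions from the diff above are in scope.
import numpy as np
from PIL import Image

# gr.Image passes the uploaded picture to run_example as a NumPy array, so mimic that here.
image_array = np.array(Image.open("5.jpg").convert("RGB"))

answer, seconds = run_example(
    image_array,
    text_input="Which number do you have to write in the last daisy?",
    model_id="Fancy-MLLM/R1-OneVision-7B",
)
print(answer)
print(f"Time taken: {seconds}s")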