mjavaid committed on
Commit
ed11a3f
·
1 Parent(s): 2567c58

first commit

Browse files
Files changed (1) hide show
  1. app.py +66 -80
app.py CHANGED
@@ -4,37 +4,35 @@ import torch
4
  from transformers import AutoProcessor, AutoModelForImageTextToText
5
  from PIL import Image
6
  import io
7
- import requests
8
  import os
9
 
10
  hf_token = os.environ.get("HF_TOKEN")
11
- # Initialize model and processor globally for caching
12
  model_id = "CohereForAI/aya-vision-8b"
13
- processor = None
14
- model = None
15
 
16
- @spaces.GPU
17
- def load_model():
18
- global processor, model
19
- if processor is None or model is None:
20
- try:
21
- processor = AutoProcessor.from_pretrained(model_id)
22
- model = AutoModelForImageTextToText.from_pretrained(
23
- model_id, device_map="auto", torch_dtype=torch.float16, use_auth_token=hf_token
24
- )
25
- return "Model loaded successfully!"
26
- except Exception as e:
27
- return f"Error loading model: {e}\nMake sure to install the correct version of transformers with: pip install 'git+https://github.com/huggingface/transformers.git@v4.49.0-AyaVision'"
28
- return "Model already loaded!"
 
29
 
30
  @spaces.GPU
31
  def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_tokens=300):
32
  global processor, model
33
-
34
- # Ensure model is loaded
35
  if processor is None or model is None:
36
- return "Please load the model first using the 'Load Model' button."
37
-
38
  # Process image input (either uploaded or from URL)
39
  if image is not None:
40
  img = Image.fromarray(image)
@@ -46,48 +44,50 @@ def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_toke
46
  return f"Error loading image from URL: {e}"
47
  else:
48
  return "Please provide either an image or an image URL."
49
-
50
- # Format message with the aya-vision chat template
51
  messages = [
52
- {"role": "user",
53
- "content": [
54
- {"type": "image", "source": img},
55
- {"type": "text", "text": prompt},
56
- ]},
 
 
57
  ]
58
 
59
- # Process input
60
  try:
61
  inputs = processor.apply_chat_template(
62
- messages,
63
- padding=True,
64
- add_generation_prompt=True,
65
- tokenize=True,
66
- return_dict=True,
67
  return_tensors="pt"
68
  ).to(model.device)
69
-
70
- # Generate response
71
  gen_tokens = model.generate(
72
- **inputs,
73
- max_new_tokens=int(max_tokens),
74
- do_sample=True,
75
  temperature=float(temperature),
76
  )
77
 
78
- response = processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
 
79
  return response
80
  except Exception as e:
81
  return f"Error generating response: {e}"
82
 
83
- # Define example inputs
84
  examples = [
85
  [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
86
  [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
87
  [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
88
  ]
89
 
90
- # Create Gradio application
91
  with gr.Blocks(title="Aya Vision 8B Demo") as demo:
92
  gr.Markdown("# Aya Vision 8B Model Demo")
93
  gr.Markdown("""
@@ -97,36 +97,32 @@ with gr.Blocks(title="Aya Vision 8B Demo") as demo:
97
  - Visual reasoning
98
  - Question answering
99
  - Support for 23 languages
100
-
101
  Upload an image or provide a URL, and enter a prompt to get started!
102
  """)
103
 
104
- with gr.Row():
105
- with gr.Column():
106
- load_button = gr.Button("Load Model", variant="primary")
107
- status = gr.Textbox(label="Model Status", placeholder="Model not loaded yet. Click 'Load Model' to start.")
108
-
109
- gr.Markdown("### Upload an image or provide an image URL:")
110
- with gr.Tab("Upload Image"):
111
- image_input = gr.Image(label="Upload Image", type="numpy")
112
- image_url_input = gr.Textbox(label="Image URL", placeholder="Leave blank if uploading an image", visible=False)
113
-
114
- with gr.Tab("Image URL"):
115
- image_url_visible = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
116
- image_input_url = gr.Image(label="Upload Image", type="numpy", visible=False)
117
-
118
- prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
119
-
120
- with gr.Accordion("Generation Settings", open=False):
121
- temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
122
- max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
123
-
124
- generate_button = gr.Button("Generate Response", variant="primary")
125
-
126
- with gr.Column():
127
- output = gr.Textbox(label="Model Response", lines=10)
128
-
129
- # Add examples section
130
  gr.Markdown("### Examples")
131
  gr.Examples(
132
  examples=examples,
@@ -135,20 +131,10 @@ with gr.Blocks(title="Aya Vision 8B Demo") as demo:
135
  fn=process_image_and_prompt
136
  )
137
 
138
- # Set up tab switching logic - hide appropriate inputs depending on tab
139
- def update_image_tab():
140
- return {image_url_input: gr.update(visible=False), image_input: gr.update(visible=True)}
141
-
142
- def update_url_tab():
143
- return {image_url_visible: gr.update(visible=True), image_input_url: gr.update(visible=False)}
144
-
145
- # Define button click behavior
146
- load_button.click(load_model, inputs=None, outputs=status)
147
-
148
  # Handle generation from either image or URL
149
  def generate_response(image, image_url_visible, prompt, temperature, max_tokens):
150
  return process_image_and_prompt(image, image_url_visible, prompt, temperature, max_tokens)
151
-
152
  generate_button.click(
153
  generate_response,
154
  inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],
 
4
  from transformers import AutoProcessor, AutoModelForImageTextToText
5
  from PIL import Image
6
  import io
7
+ import requests
8
  import os
9
 
10
  hf_token = os.environ.get("HF_TOKEN")
 
11
  model_id = "CohereForAI/aya-vision-8b"
 
 
12
 
13
+ # Load model and processor immediately on startup.
14
+ try:
15
+ processor = AutoProcessor.from_pretrained(model_id)
16
+ model = AutoModelForImageTextToText.from_pretrained(
17
+ model_id, device_map="auto", torch_dtype=torch.float16, use_auth_token=hf_token
18
+ )
19
+ model_status = "Model loaded successfully!"
20
+ except Exception as e:
21
+ processor = None
22
+ model = None
23
+ model_status = (
24
+ f"Error loading model: {e}\nMake sure to install the correct version of transformers with: "
25
+ "pip install 'git+https://github.com/huggingface/transformers.git@v4.49.0-AyaVision'"
26
+ )
27
 
28
  @spaces.GPU
29
  def process_image_and_prompt(image, image_url, prompt, temperature=0.3, max_tokens=300):
30
  global processor, model
31
+
32
+ # Check if the model is loaded
33
  if processor is None or model is None:
34
+ return "Model failed to load. Please check the logs."
35
+
36
  # Process image input (either uploaded or from URL)
37
  if image is not None:
38
  img = Image.fromarray(image)
 
44
  return f"Error loading image from URL: {e}"
45
  else:
46
  return "Please provide either an image or an image URL."
47
+
48
+ # Format the message using the Aya Vision chat template
49
  messages = [
50
+ {
51
+ "role": "user",
52
+ "content": [
53
+ {"type": "image", "source": img},
54
+ {"type": "text", "text": prompt},
55
+ ],
56
+ },
57
  ]
58
 
 
59
  try:
60
  inputs = processor.apply_chat_template(
61
+ messages,
62
+ padding=True,
63
+ add_generation_prompt=True,
64
+ tokenize=True,
65
+ return_dict=True,
66
  return_tensors="pt"
67
  ).to(model.device)
68
+
 
69
  gen_tokens = model.generate(
70
+ **inputs,
71
+ max_new_tokens=int(max_tokens),
72
+ do_sample=True,
73
  temperature=float(temperature),
74
  )
75
 
76
+ response = processor.tokenizer.decode(
77
+ gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
78
+ )
79
  return response
80
  except Exception as e:
81
  return f"Error generating response: {e}"
82
 
83
+ # Example inputs for testing
84
  examples = [
85
  [None, "https://media.istockphoto.com/id/458012057/photo/istanbul-turkey.jpg?s=612x612&w=0&k=20&c=qogAOVvkpfUyqLUMr_XJQyq-HkACXyYUSZbKhBlPrxo=", "What landmark is shown in this image?", 0.3, 300],
86
  [None, "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", "What does the text in this image say?", 0.3, 300],
87
  [None, "https://upload.wikimedia.org/wikipedia/commons/d/da/The_Parthenon_in_Athens.jpg", "Describe esta imagen en español", 0.3, 300]
88
  ]
89
 
90
+ # Create the Gradio application
91
  with gr.Blocks(title="Aya Vision 8B Demo") as demo:
92
  gr.Markdown("# Aya Vision 8B Model Demo")
93
  gr.Markdown("""
 
97
  - Visual reasoning
98
  - Question answering
99
  - Support for 23 languages
100
+
101
  Upload an image or provide a URL, and enter a prompt to get started!
102
  """)
103
 
104
+ # Display model loading status
105
+ gr.Markdown(f"**Model Status:** {model_status}")
106
+
107
+ gr.Markdown("### Upload an image or provide an image URL:")
108
+ with gr.Tab("Upload Image"):
109
+ image_input = gr.Image(label="Upload Image", type="numpy")
110
+ image_url_input = gr.Textbox(label="Image URL", placeholder="Leave blank if uploading an image", visible=False)
111
+ with gr.Tab("Image URL"):
112
+ image_url_visible = gr.Textbox(label="Image URL", placeholder="Enter a URL to an image")
113
+ image_input_url = gr.Image(label="Upload Image", type="numpy", visible=False)
114
+
115
+ prompt = gr.Textbox(label="Prompt", placeholder="Enter your prompt to the model", lines=3)
116
+
117
+ with gr.Accordion("Generation Settings", open=False):
118
+ temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.3, label="Temperature")
119
+ max_tokens = gr.Slider(minimum=50, maximum=1000, step=50, value=300, label="Max Tokens")
120
+
121
+ generate_button = gr.Button("Generate Response", variant="primary")
122
+
123
+ with gr.Column():
124
+ output = gr.Textbox(label="Model Response", lines=10)
125
+
 
 
 
 
126
  gr.Markdown("### Examples")
127
  gr.Examples(
128
  examples=examples,
 
131
  fn=process_image_and_prompt
132
  )
133
 
 
 
 
 
 
 
 
 
 
 
134
  # Handle generation from either image or URL
135
  def generate_response(image, image_url_visible, prompt, temperature, max_tokens):
136
  return process_image_and_prompt(image, image_url_visible, prompt, temperature, max_tokens)
137
+
138
  generate_button.click(
139
  generate_response,
140
  inputs=[image_input, image_url_visible, prompt, temperature, max_tokens],