falcon-vision / app.py
Tonic's picture
Update app.py
8e769ae verified
raw
history blame contribute delete
4.45 kB
import io

import gradio as gr
import requests
import spaces
import torch
from PIL import Image
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
# Markdown heading rendered at the top of the page.
title = """ # 🙋🏻‍♂️Welcome to Tonic's🦅Falcon Vision👁️Language Model !
"""

# Markdown body rendered under the heading: a short model summary followed by
# community links.
description = """
Falcon2-11B-vlm is an 11B parameters causal decoder-only model built by TII and trained on over 5,000B tokens of RefinedWeb enhanced with curated corpora. To bring vision capabilities, we integrate the pretrained CLIP ViT-L/14 vision encoder with our Falcon2-11B chat-finetuned model and train with image-text data. For enhancing the VLM's perception of fine-grained details w.r.t small objects in images, we employ a dynamic encoding mechanism at high-resolution for image inputs.
### Join us :
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) Math 🔍 [introspector](https://huggingface.co/introspector) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [MultiTonic](https://github.com/multitonic/)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
# Hugging Face Hub repository that supplies both the processor and the weights.
MODEL_ID = "tiiuae/falcon-11B-vlm"

# Prepares the text prompt and image as model inputs and decodes generated ids.
processor = LlavaNextProcessor.from_pretrained(
    MODEL_ID,
    tokenizer_class="PreTrainedTokenizerFast",
)

# Load the weights in bfloat16, then move the model to the first CUDA device.
model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)
model = model.to("cuda:0")
# Upper bound, in seconds, on how long the image download may take.
_IMAGE_DOWNLOAD_TIMEOUT_S = 30


@spaces.GPU
def generate_paragraph(image_url):
    """Download the image at *image_url* and generate a paragraph about it.

    Args:
        image_url: HTTP(S) address of the image to describe.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed and surrounding whitespace stripped.

    Raises:
        gr.Error: If no URL is given, the download fails or times out, the
            server answers with an HTTP error status, or the payload cannot
            be decoded as an image.
    """
    image_url = (image_url or "").strip()
    if not image_url:
        raise gr.Error("Please provide an image URL.")

    # NOTE(security): the URL is user-supplied and fetched server-side, so it
    # can point at internal hosts. Restrict schemes/hosts if that is a concern
    # for this deployment.
    try:
        # A timeout keeps a stalled host from blocking the worker forever.
        response = requests.get(image_url, timeout=_IMAGE_DOWNLOAD_TIMEOUT_S)
        # Fail on 4xx/5xx instead of handing an error page to PIL.
        response.raise_for_status()
        # `response.content` has content-encoding already decoded; convert()
        # forces the full decode here so corrupt files are reported below.
        cats_image = Image.open(io.BytesIO(response.content)).convert("RGB")
    except (requests.RequestException, OSError) as exc:
        raise gr.Error(
            f"Could not load an image from {image_url!r}: {exc}"
        ) from exc

    instruction = 'Write a long paragraph about this picture.'
    prompt = f"User:<image>\n{instruction} Falcon:"
    inputs = processor(
        prompt, images=cats_image, return_tensors="pt", padding=True
    ).to('cuda:0')
    output = model.generate(**inputs, max_new_tokens=256)
    generated_captions = processor.decode(
        output[0], skip_special_tokens=True
    ).strip()
    return generated_captions
def set_and_generate(url):
    """Describe the image at *url* and hand the URL back alongside the text.

    Args:
        url: Address of the image to describe.

    Returns:
        A ``(url, paragraph)`` tuple, where ``paragraph`` is whatever
        ``generate_paragraph(url)`` returns.
    """
    return url, generate_paragraph(url)
# Example images offered as one-click shortcuts: (button label, elem_id, URL).
EXAMPLE_IMAGES = (
    (
        "Types of Falcons",
        "example_1",
        "https://www.animalspot.net/wp-content/uploads/2020/01/Types-of-Falcons.jpg",
    ),
    (
        "Camel Racing - Saudi Arabia",
        "example_2",
        "https://www.leaders-mena.com/leaders/uploads/2023/01/The-Traditional-Camel-Racing-In-Saudi-Arabia-Unique-Sport-Activity-1024x576.jpg",
    ),
    (
        "Urban Scene - India",
        "example_3",
        "http://embed.robertharding.com/embed/1161-4342.jpg",
    ),
)


def _example_handler(url):
    """Return a zero-argument click callback that runs set_and_generate on *url*."""

    def _run():
        return set_and_generate(url)

    return _run


# Create the Gradio Blocks interface.
# NOTE(review): no component visible here sets elem_classes="thumbnail", so
# the `.thumbnail` rule below does not appear to be applied - confirm intent.
with gr.Blocks(css=".thumbnail { width: 150px; height: 150px; object-fit: cover; }") as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Row():
        with gr.Column():
            image_url_input = gr.Textbox(label="Image URL")
            generate_button = gr.Button("Generate Paragraph")
            # Example buttons; their elem_id is what per-button CSS targets.
            example_1, example_2, example_3 = [
                gr.Button(label, elem_id=elem_id)
                for label, elem_id, _ in EXAMPLE_IMAGES
            ]
        with gr.Column():
            generated_paragraph_output = gr.Textbox(label="🦅Falcon Vision👁️")

    # Free-form URL: describe whatever address the user typed.
    generate_button.click(
        generate_paragraph,
        inputs=image_url_input,
        outputs=generated_paragraph_output,
    )

    # Example buttons: fill the URL box and show the generated paragraph.
    for example_button, (_, _, example_url) in zip(
        (example_1, example_2, example_3), EXAMPLE_IMAGES
    ):
        example_button.click(
            _example_handler(example_url),
            outputs=[image_url_input, generated_paragraph_output],
        )
# Background-image rules for the example buttons, selected by their elem_id.
EXAMPLE_BUTTON_CSS = """
#example_1 {
background: url("https://www.animalspot.net/wp-content/uploads/2020/01/Types-of-Falcons.jpg") no-repeat center center;
background-size: cover;
}
#example_2 {
background: url("https://www.leaders-mena.com/leaders/uploads/2023/01/The-Traditional-Camel-Racing-In-Saudi-Arabia-Unique-Sport-Activity-1024x576.jpg") no-repeat center center;
background-size: cover;
}
#example_3 {
background: url("http://embed.robertharding.com/embed/1161-4342.jpg") no-repeat center center;
background-size: cover;
}
"""

# Append the rules to the stylesheet already attached to the interface.
demo.css = demo.css + EXAMPLE_BUTTON_CSS

# Launch the Gradio interface.
demo.launch()