Spaces:

5m4ck3r
/

meta-llama-Llama-3.2-11B-Vision-Instruct

Running on Zero

meta-llama-Llama-3.2-11B-Vision-Instruct

File size: 1,658 Bytes

11bdc52
bd1bb19
 
 
bd6df67
11bdc52
79cf728
bd1bb19
 
 
 
 
 
 
 
bd6df67
bd1bb19
79cf728
bd1bb19
 
 
 
 
 
 
 
 
 
 
79cf728
bd1bb19
 
 
 
79cf728
bd1bb19
 
 
79cf728
 
bd1bb19
79cf728
bd1bb19

import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
import spaces

# Load the processor and model once at import time so every request reuses them.
model_id = "llava-hf/llava-v1.6-mistral-7B-hf"

# Pick the device first so the weight precision can be chosen to match it.
device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 halves GPU memory use; on CPU half precision is slow or unsupported
# for some ops, so fall back to float32 when no GPU is available.
model_dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id, torch_dtype=model_dtype, low_cpu_mem_usage=True
)
model.to(device)

@spaces.GPU()
def llava_inference(image: Image.Image, prompt: str) -> str:
    """Generate a text reply to ``prompt`` about ``image`` with the loaded model.

    Args:
        image: The picture to show the model. Gradio passes ``None`` when the
            form is submitted without an uploaded image.
        prompt: The user's question or instruction about the image.

    Returns:
        The newly generated reply only (the echoed prompt is stripped),
        limited to 100 new tokens.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Fail with a readable UI message instead of an opaque processor error.
        raise gr.Error("Please upload an image before submitting.")

    # Format the input as a single-turn conversation: one image plus the text.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    formatted_prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True
    )

    # Keyword arguments: the positional order of (images, text) has differed
    # between transformers releases, so do not rely on it.
    inputs = processor(
        images=image, text=formatted_prompt, return_tensors="pt"
    ).to(device)

    # Generate response with a max token limit.
    output_ids = model.generate(**inputs, max_new_tokens=100)

    # generate() returns the prompt tokens followed by the continuation;
    # drop the prompt so only the model's answer is shown to the user.
    prompt_length = inputs["input_ids"].shape[1]
    reply_ids = output_ids[0][prompt_length:]
    return processor.decode(reply_ids, skip_special_tokens=True).strip()

# Gradio UI: an image upload and a prompt box feed llava_inference, and the
# returned text is shown in a single output field.
_image_input = gr.Image(type="pil", label="Input Image")
_prompt_input = gr.Textbox(
    lines=2,
    placeholder="Enter your prompt here...",
    label="Prompt",
)
_response_output = gr.Text(label="Output Response")

demo = gr.Interface(
    fn=llava_inference,
    inputs=[_image_input, _prompt_input],
    outputs=_response_output,
    title="LLaVA-1.6 Gradio Demo",
    description=(
        "Upload an image and enter a prompt. "
        "The model will generate a response using LLaVA-1.6."
    ),
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()