import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaNextForConditionalGeneration
import spaces

# Load the processor and model
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


@spaces.GPU()
def llava_inference(image: Image.Image, prompt: str) -> str:
    # Format the input as a single-turn conversation containing one image and the text prompt
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    formatted_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=image, text=formatted_prompt, return_tensors="pt").to(device)

    # Generate a response with a max token limit
    output_ids = model.generate(**inputs, max_new_tokens=100)
    # Decode only the newly generated tokens, not the echoed prompt
    generated_ids = output_ids[0][inputs["input_ids"].shape[1]:]
    output_text = processor.decode(generated_ids, skip_special_tokens=True)
    return output_text


# Gradio interface using the current component syntax
demo = gr.Interface(
    fn=llava_inference,
    inputs=[
        gr.Image(type="pil", label="Input Image"),
        gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt"),
    ],
    outputs=gr.Text(label="Output Response"),
    title="LLaVA-1.6 Gradio Demo",
    description="Upload an image and enter a prompt. The model will generate a response using LLaVA-1.6.",
)

if __name__ == "__main__":
    demo.launch()
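
# A minimal sanity-check sketch (hypothetical): calls llava_inference directly,
# bypassing the Gradio UI. It assumes an image file named "example.jpg" sits next
# to this script — adjust the path and prompt to your setup.
#
#     from PIL import Image
#     result = llava_inference(Image.open("example.jpg"), "Describe this image.")
#     print(result)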