Spaces:

mknolan
/

internvl2-llama3-demo

Build error

App Files Files Community

internvl2-llama3-demo / app.py

mknolan

Upload app.py with huggingface_hub

1cd032a verified 4 days ago

raw

history blame contribute delete

6.69 kB

	import torch
	from PIL import Image
	import requests
	from io import BytesIO
	import gradio as gr
	import os
	import sys
	import time
	import warnings

	# Silence every warning category so the Space logs only show our own output.
	warnings.filterwarnings("ignore")

	# Startup banner: one print call, one line per item, same text as before.
	print(
	    "Starting InternVL2 with Llama3-76B initialization...",
	    f"Python version: {sys.version}",
	    f"PyTorch version: {torch.__version__}",
	    f"CUDA available: {torch.cuda.is_available()}",
	    sep="\n",
	)

	# Configure the CUDA caching allocator before any CUDA tensor is created.
	os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128"})

	# Check GPU availability
	def check_gpu():
	    """Report whether a usable CUDA device is present.

	    Returns:
	        bool: True when CUDA is reported available and a small tensor
	        addition on the device succeeds; False otherwise. A diagnostic
	        line is printed in every case.
	    """
	    if torch.cuda.is_available():
	        try:
	            # Smoke test: allocate on the device and run one operation.
	            probe = torch.rand(10, device="cuda")
	            _ = probe + probe
	            print(f"GPU is available: {torch.cuda.get_device_name(0)}")
	        except Exception as gpu_error:
	            print(f"Error initializing GPU: {str(gpu_error)}")
	            return False
	        else:
	            return True

	    print("CUDA is not available. This application requires GPU acceleration.")
	    return False

	# Probe the GPU once at import time; everything below keys off this flag.
	USE_GPU = check_gpu()

	# transformers is optional at import time: record whether it is usable
	# instead of crashing the whole app when it is missing.
	try:
	    from transformers import AutoModel, AutoProcessor
	except ImportError as import_error:
	    print(f"Error importing transformers: {str(import_error)}")
	    HAS_TRANSFORMERS = False
	else:
	    HAS_TRANSFORMERS = True
	    print("Successfully imported transformers")

	# Module-level handles, filled in later by load_models().
	internvit_model, llama_model, processor = None, None, None

	def load_models():
	    """Load the InternViT-6B vision encoder and its processor.

	    On success the module-level ``processor`` and ``internvit_model`` are
	    populated. ``llama_model`` is left as-is because the language model is
	    not loaded in this demonstration.

	    Returns:
	        bool: True when both objects were loaded and the model was moved
	        to CUDA; False when no GPU is usable, transformers could not be
	        imported, or loading raised an exception.
	    """
	    global internvit_model, processor

	    if not USE_GPU:
	        print("Cannot load models without GPU")
	        return False

	    # Without this guard a failed import only shows up as a NameError below.
	    if not HAS_TRANSFORMERS:
	        print("Cannot load models: transformers could not be imported")
	        return False

	    model_id = "OpenGVLab/InternViT-6B-224px"

	    try:
	        print("Loading InternViT-6B model for visual feature extraction...")

	        # The model card loads this checkpoint with trust_remote_code=True
	        # because the repository ships its own modeling code.
	        # NOTE(security): this executes Python downloaded from the model
	        # repository; pin a revision if that is a concern.
	        loaded_processor = AutoProcessor.from_pretrained(
	            model_id, trust_remote_code=True
	        )
	        loaded_model = AutoModel.from_pretrained(
	            model_id, trust_remote_code=True
	        )

	        # USE_GPU is known to be True here (see the early return above).
	        loaded_model = loaded_model.to("cuda")
	    except Exception as e:
	        print(f"Error loading models: {str(e)}")
	        return False

	    # Publish to the globals only once everything succeeded, so a partial
	    # failure never leaves a processor without a matching model.
	    processor = loaded_processor
	    internvit_model = loaded_model
	    print("InternViT-6B model loaded successfully!")

	    # Only visual features are extracted for now; a full implementation
	    # would load Llama3-76B here.
	    print("Note: Llama3-76B model loading is commented out for this demonstration")

	    return True

	# Load models once at import time. load_models() returns True on success and
	# False otherwise; process_image() and create_interface() read this flag.
	MODELS_LOADED = load_models()

	def process_image(image_path, sample_url=None):
	    """Run an image through InternViT-6B and describe the extracted features.

	    Args:
	        image_path: The uploaded image, either a filesystem path (str) or an
	            already-decoded PIL image. May be None.
	        sample_url: Optional URL that is downloaded only when no image was
	            uploaded.

	    Returns:
	        str: A report with the feature tensor shapes and the elapsed time,
	        or a human-readable error message. Failures are reported in the
	        returned string rather than raised, so the UI always has text to
	        display.
	    """
	    # Obtain the image. Network and decoding failures are turned into a
	    # message here instead of escaping to the caller.
	    try:
	        if sample_url and not image_path:
	            # A timeout keeps a stalled download from hanging the request,
	            # and raise_for_status stops an HTTP error page from being
	            # handed to the image decoder.
	            response = requests.get(sample_url, timeout=30)
	            response.raise_for_status()
	            image = Image.open(BytesIO(response.content))
	            print(f"Loaded sample image from URL: {sample_url}")
	        elif isinstance(image_path, str):
	            image = Image.open(image_path)
	        else:
	            image = image_path
	    except Exception as e:
	        return f"Error loading image: {str(e)}"

	    if image is None:
	        return "No image provided"

	    if not MODELS_LOADED:
	        return "Models failed to load. Please check the logs."

	    try:
	        # Start timing
	        start_time = time.time()

	        # Normalise palette, grayscale and RGBA inputs to three channels,
	        # as the model card does before preprocessing.
	        if isinstance(image, Image.Image):
	            image = image.convert("RGB")

	        # Process image through the visual encoder
	        print("Processing image through InternViT-6B...")
	        inputs = processor(images=image, return_tensors="pt")
	        if USE_GPU:
	            inputs = {k: v.to("cuda") for k, v in inputs.items()}

	        with torch.no_grad():
	            outputs = internvit_model(**inputs)

	        # Extract image features
	        image_features = outputs.last_hidden_state
	        pooled_output = outputs.pooler_output

	        # A full implementation would pass these features to Llama3-76B;
	        # for now only their shapes are reported.
	        feature_info = f"""
	Image successfully processed through InternViT-6B:
	- Last hidden state shape: {image_features.shape}
	- Pooled output shape: {pooled_output.shape}

	In a complete implementation, these visual features would be passed to Llama3-76B
	for generating text responses about the image.

	Note: This is a demonstration of visual feature extraction only.
	"""

	        # Calculate elapsed time
	        elapsed = time.time() - start_time

	        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."

	    except Exception as e:
	        return f"Error processing image: {str(e)}"

	# Set up Gradio interface
	def create_interface():
	    """Assemble the Gradio Blocks UI for the feature-extraction demo.

	    The layout has a title, a status line derived from MODELS_LOADED, an
	    image upload next to a results box, one button that runs process_image
	    on the upload and one that runs it on a fixed sample URL.

	    Returns:
	        The constructed gr.Blocks object (not yet launched).
	    """
	    # Values that do not create UI components are prepared up front.
	    sample_image_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"

	    def use_sample():
	        return process_image(None, sample_image_url)

	    if MODELS_LOADED:
	        status = "✅ Ready"
	    else:
	        status = "❌ Models failed to load"

	    with gr.Blocks(title="InternVL2 with Llama3-76B") as app:
	        # Header and system status
	        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
	        gr.Markdown("## Using InternViT-6B for visual feature extraction")
	        gr.Markdown(f"### System Status: {status}")

	        # Two columns: inputs on the left, results on the right
	        with gr.Row():
	            with gr.Column():
	                image_input = gr.Image(type="pil", label="Upload Image")
	                sample_button = gr.Button("Use Sample Image")

	            with gr.Column():
	                results_box = gr.Textbox(label="Results", lines=10)

	        # Run the encoder on the uploaded image
	        extract_button = gr.Button("Extract Visual Features")
	        extract_button.click(
	            fn=process_image,
	            inputs=[image_input],
	            outputs=results_box,
	        )

	        # Run the encoder on the bundled sample URL
	        sample_button.click(
	            fn=use_sample,
	            inputs=[],
	            outputs=results_box,
	        )

	        # Explanatory footer
	        gr.Markdown("""
	## About This Demo

	This demonstration shows how to use InternViT-6B for visual feature extraction,
	following the instructions from the OpenGVLab/InternVL GitHub repository.

	The application extracts visual features from the input image that would typically
	be passed to a language model like Llama3-76B. In a complete implementation,
	these features would be used to generate text responses about the image.
	""")

	    return app

	# Script entry point: build the UI and serve it.
	if __name__ == "__main__":
	    demo = create_interface()
	    # Bind to 0.0.0.0 so the server is reachable from outside the container;
	    # no public share link is requested.
	    demo.launch(server_name="0.0.0.0", share=False)