Spaces:
Build error
Build error
import torch | |
from PIL import Image | |
import requests | |
from io import BytesIO | |
import gradio as gr | |
import os | |
import sys | |
import time | |
import warnings | |
# Silence all Python warnings so the startup log below stays readable.
warnings.filterwarnings("ignore")

# Startup banner: runtime facts that help when reading deployment logs.
for banner_line in (
    "Starting InternVL2 with Llama3-76B initialization...",
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
):
    print(banner_line)

# Configure PyTorch's CUDA caching allocator through its environment variable.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128")
# Check GPU availability | |
def check_gpu():
    """Report whether a CUDA device is present and actually usable.

    Besides asking PyTorch whether CUDA is available, a tiny tensor is
    allocated on the device and added to itself, so a driver/runtime problem
    is detected here rather than at model-load time.

    Returns:
        bool: True when the probe operation succeeds, False when CUDA is
        unavailable or the probe raises.
    """
    gpu_ok = False
    if torch.cuda.is_available():
        try:
            # Smoke test: one allocation and one kernel launch on the device.
            probe = torch.rand(10, device="cuda")
            probe + probe
            print(f"GPU is available: {torch.cuda.get_device_name(0)}")
            gpu_ok = True
        except Exception as err:
            print(f"Error initializing GPU: {err}")
    else:
        print("CUDA is not available. This application requires GPU acceleration.")
    return gpu_ok
# Probe the GPU once at import time; load_models() refuses to run without it.
USE_GPU = check_gpu()

# transformers is imported defensively: a missing package is recorded in
# HAS_TRANSFORMERS and logged instead of aborting the whole app at import.
try:
    from transformers import AutoModel, AutoProcessor
except ImportError as e:
    print(f"Error importing transformers: {str(e)}")
    HAS_TRANSFORMERS = False
else:
    HAS_TRANSFORMERS = True
    print("Successfully imported transformers")

# Module-level model handles. load_models() fills in the vision encoder and
# its processor; llama_model stays None in this demonstration.
internvit_model = llama_model = processor = None
def load_models():
    """Load the InternViT-6B vision encoder and its image processor onto the GPU.

    On success the module-level ``processor`` and ``internvit_model`` globals
    are set. ``llama_model`` is declared global but intentionally left
    untouched: the language model is not loaded in this demonstration.

    Returns:
        bool: True when both the processor and the model were loaded and the
        model was moved to CUDA; False when no GPU is usable, transformers is
        not importable, or loading raised. Failures are printed, not raised.
    """
    global internvit_model, llama_model, processor

    if not USE_GPU:
        print("Cannot load models without GPU")
        return False
    if not HAS_TRANSFORMERS:
        # Without this guard the failure would surface below as a confusing
        # "name 'AutoProcessor' is not defined" message.
        print("Cannot load models: the transformers package is not available")
        return False

    model_id = "OpenGVLab/InternViT-6B-224px"
    try:
        print("Loading InternViT-6B model for visual feature extraction...")
        # The checkpoint ships its own modeling/configuration code, so the
        # Auto* classes can only resolve it with trust_remote_code=True.
        # NOTE(review): this executes code from the model repository; pin a
        # revision if the deployment needs to guard against upstream changes.
        loaded_processor = AutoProcessor.from_pretrained(
            model_id, trust_remote_code=True
        )
        loaded_model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
        # USE_GPU is already known to be true here (see the guard above).
        loaded_model = loaded_model.to("cuda")

        # Publish the globals only once everything succeeded, so a failed
        # load never leaves a processor without a matching model.
        processor = loaded_processor
        internvit_model = loaded_model
        print("InternViT-6B model loaded successfully!")

        # For demonstration purposes only visual features are extracted;
        # a real implementation would load Llama3-76B here.
        print("Note: Llama3-76B model loading is commented out for this demonstration")
        return True
    except Exception as e:
        # Top-level boundary: report the failure and let the UI show the
        # "models failed to load" status instead of crashing at import time.
        print(f"Error loading models: {str(e)}")
        return False
# Attempt the model load once, at import time; the resulting flag is what the
# request handler and the UI status line consult.
MODELS_LOADED = load_models()
def process_image(image_path, sample_url=None):
    """Run an image through InternViT-6B and describe the extracted features.

    Args:
        image_path: The uploaded image, either a filesystem path (``str``) or
            an already-decoded PIL image. May be ``None``.
        sample_url: Optional URL of a sample image. It is only fetched when
            ``image_path`` is falsy.

    Returns:
        str: A report with the shapes of the encoder outputs and the elapsed
        time on success, otherwise a short status/error message. Download,
        decode and inference failures are reported in the returned text
        rather than raised, so the Gradio textbox can display them.
    """
    # Load the image. Failures here are turned into a message, the same way
    # inference failures are handled further down.
    try:
        if sample_url and not image_path:
            # A timeout keeps a stalled download from hanging the request,
            # and raise_for_status stops an HTTP error page from being
            # handed to the image decoder.
            response = requests.get(sample_url, timeout=30)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            print(f"Loaded sample image from URL: {sample_url}")
        elif isinstance(image_path, str):
            image = Image.open(image_path)
        else:
            image = image_path
    except Exception as e:
        return f"Error loading image: {str(e)}"

    if image is None:
        return "No image provided"
    if not MODELS_LOADED:
        return "Models failed to load. Please check the logs."

    try:
        start_time = time.perf_counter()

        print("Processing image through InternViT-6B...")
        # Uploads can be RGBA, palette or grayscale; normalise to RGB so the
        # encoder always receives three channels.
        inputs = processor(images=image.convert("RGB"), return_tensors="pt")
        if USE_GPU:
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.no_grad():
            outputs = internvit_model(**inputs)

        image_features = outputs.last_hidden_state
        pooled_output = outputs.pooler_output

        # A real implementation would pass these features to Llama3-76B;
        # for now only their shapes are reported.
        feature_info = (
            "\n"
            "Image successfully processed through InternViT-6B:\n"
            f"- Last hidden state shape: {image_features.shape}\n"
            f"- Pooled output shape: {pooled_output.shape}\n"
            "In a complete implementation, these visual features would be passed to Llama3-76B\n"
            "for generating text responses about the image.\n"
            "Note: This is a demonstration of visual feature extraction only.\n"
        )

        elapsed = time.perf_counter() - start_time
        return f"{feature_info}\n\nProcessing completed in {elapsed:.2f} seconds."
    except Exception as e:
        # UI boundary: surface the failure in the results box.
        return f"Error processing image: {str(e)}"
# Set up Gradio interface | |
def create_interface():
    """Assemble the Gradio Blocks UI for the feature-extraction demo.

    The layout is a two-column row (image upload plus sample button on the
    left, results textbox on the right), an extraction button, and an
    explanatory Markdown footer. Both buttons write the text returned by
    process_image into the results box.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    sample_image_url = "https://huggingface.co/OpenGVLab/InternVL2/resolve/main/assets/demo.jpg"

    # Status line reflects whether the import-time model load succeeded.
    if MODELS_LOADED:
        status = "✅ Ready"
    else:
        status = "❌ Models failed to load"

    about_markdown = (
        "\n"
        "## About This Demo\n"
        "This demonstration shows how to use InternViT-6B for visual feature extraction,\n"
        "following the instructions from the OpenGVLab/InternVL GitHub repository.\n"
        "The application extracts visual features from the input image that would typically\n"
        "be passed to a language model like Llama3-76B. In a complete implementation,\n"
        "these features would be used to generate text responses about the image.\n"
    )

    def use_sample():
        # No upload involved: process_image fetches the sample URL itself.
        return process_image(None, sample_image_url)

    with gr.Blocks(title="InternVL2 with Llama3-76B") as demo:
        gr.Markdown("# InternVL2 Visual Feature Extraction Demo")
        gr.Markdown("## Using InternViT-6B for visual feature extraction")
        gr.Markdown(f"### System Status: {status}")

        with gr.Row():
            with gr.Column():
                input_image = gr.Image(type="pil", label="Upload Image")
                sample_btn = gr.Button("Use Sample Image")
            with gr.Column():
                output_text = gr.Textbox(label="Results", lines=10)

        process_btn = gr.Button("Extract Visual Features")

        # Uploaded image -> process_image -> results box.
        process_btn.click(fn=process_image, inputs=[input_image], outputs=output_text)
        # Sample button takes no inputs and writes to the same results box.
        sample_btn.click(fn=use_sample, inputs=[], outputs=output_text)

        gr.Markdown(about_markdown)

    return demo
# Main function | |
if __name__ == "__main__":
    # Listen on all interfaces; no public share link is requested.
    launch_options = {"share": False, "server_name": "0.0.0.0"}
    demo = create_interface()
    demo.launch(**launch_options)