update model and requirements
Changed files:
- llava_inference.py (+82 -7)
- requirements.txt (+5 -0)
llava_inference.py
CHANGED
from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
from transformers import AutoTokenizer
import torch
import requests
from PIL import Image
from io import BytesIO


class LLaVAHelper:
    def __init__(self, model_name="llava-hf/llava-1.5-7b-hf"):
        # Use cache_dir to avoid issues with the default cache location
        # and disable force_download to use cached versions when available
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            cache_dir="./model_cache",
            force_download=False,
            trust_remote_code=True,
        )

        # Load model with the same cache directory. load_pretrained_model
        # returns (tokenizer, model, image_processor, context_len); the
        # tokenizer was already loaded above, so it is discarded here.
        _, self.model, self.image_processor, _ = load_pretrained_model(
            model_name,
            None,  # model_base: None because this is not a LoRA/delta checkpoint
            get_model_name_from_path(model_name),
            cache_dir="./model_cache",
        )
        self.model.eval()

        # Move model to appropriate device
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def generate_answer(self, image, question):
        """
        Generate a response to a question about an image.

        Args:
            image: PIL Image, path to an image file, or image URL
            question: string question about the image

        Returns:
            String response from the model
        """
        # Handle image input (PIL Image, local path, or URL)
        if isinstance(image, str):
            if image.startswith(("http://", "https://")):
                response = requests.get(image)
                image = Image.open(BytesIO(response.content))
            else:
                image = Image.open(image)

        # Preprocess image; cast to the model's dtype (float16 when on GPU)
        image_tensor = (
            process_images([image], self.image_processor, self.model.config)[0]
            .unsqueeze(0)
            .to(self.device, dtype=self.model.dtype)
        )

        # Format prompt with question
        prompt = f"###Human: <image>\n{question}\n###Assistant:"

        # Tokenize prompt; tokenizer_image_token returns a 1-D tensor,
        # so add a batch dimension before moving it to the device
        input_ids = (
            tokenizer_image_token(prompt, self.tokenizer, return_tensors="pt")
            .unsqueeze(0)
            .to(self.device)
        )

        # Generate response (input_ids is passed positionally, matching
        # the LLaVA repo's own eval scripts)
        with torch.no_grad():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        # Decode and extract response
        output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return output.split("###Assistant:")[-1].strip()


# Example usage
if __name__ == "__main__":
    try:
        # Initialize model
        llava = LLaVAHelper()

        # Example with a local file
        # response = llava.generate_answer("path/to/your/image.jpg", "What's in this image?")

        # Example with a URL
        # image_url = "https://example.com/image.jpg"
        # response = llava.generate_answer(image_url, "Describe this image in detail.")

        # print(response)
        print("LLaVA model initialized successfully. Ready to process images.")
    except Exception as e:
        print(f"Error initializing LLaVA: {e}")
requirements.txt
CHANGED
torch>=2.0.0
transformers>=4.30.0
accelerate>=0.20.0
gradio>=3.35.0
pillow>=9.0.0
requests>=2.28.0
tqdm>=4.65.0
timm>=0.6.13
sentencepiece>=0.1.97
git+https://github.com/haotian-liu/LLaVA.git
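As a quick smoke test after installing the pinned dependencies, something like the following exercises the helper end to end (a sketch; the image path is a placeholder):

from llava_inference import LLaVAHelper

helper = LLaVAHelper()
print(helper.generate_answer("tests/sample.jpg", "What is shown in this image?"))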