Spaces:

ErenalpCet
/

AI-Persona-Simulator

Running on Zero

App Files Files Community

ErenalpCet committed 13 days ago

Commit

19fe477

verified ·

1 Parent(s): acc34b2

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -10

app.py CHANGED Viewed

@@ -27,30 +27,39 @@ MAX_GPU_MEMORY = "40GiB"  # A100 memory allocation
 # --- Model Loading ---
 # Pre-commit side of the diff: load_model as it stood before this change.
 # Lines prefixed with "-" were removed by the commit; this version builds the
 # text-generation pipeline with an 8-bit BitsAndBytesConfig.
 @GPU(memory=40) # ****** THIS DECORATOR IS ESSENTIAL FOR SPACES STARTUP ******
 def load_model():
-    """Load the LLM model optimized for A100 GPU."""
-    print(f"Attempting to load model: {MODEL_ID}")
     try:
-        # Configure quantization
-        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
         # Build the pipeline; the quantization config is forwarded through model_kwargs.
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
             torch_dtype=torch.bfloat16,
-            device_map="auto", # Relies on accelerate
             model_kwargs={
                 "quantization_config": quantization_config,
                 "use_cache": True,
-                # "max_memory": {0: MAX_GPU_MEMORY} # Often handled by device_map="auto"
             }
         )
-        print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device}")
         return pipe
     except Exception as e:
-        print(f"FATAL Error loading model '{MODEL_ID}': {e}")
-        # Raise the error to potentially get more detailed logs in Spaces
         # The failure is propagated to the caller rather than swallowed.
         raise e
-        # return None # Returning None might hide the root cause in Spaces logs
 # --- Web Search ---
 # (Keep search_person, create_synthetic_profile, extract_text_from_search_results as before)

 # --- Model Loading ---
 @GPU(memory=40) # ****** THIS DECORATOR IS ESSENTIAL FOR SPACES STARTUP ******
 def load_model():
     """Load the text-generation pipeline for MODEL_ID using 4-bit quantization.

     The quantization config requests NF4 weights, bfloat16 compute and double
     quantization, and is forwarded to the model through ``model_kwargs``.
     Layer placement is delegated to ``device_map="auto"``.

     Returns:
         The object returned by ``pipeline("text-generation", ...)``.

     Raises:
         Exception: Any error raised while building the quantization config or
             the pipeline is printed and then re-raised unchanged.
     """
     print(f"Attempting to load model: {MODEL_ID} with 4-bit quantization")
     try:
         # Configure quantization for 4-bit
         quantization_config = BitsAndBytesConfig(
             load_in_4bit=True,
             bnb_4bit_quant_type="nf4",             # NF4 is often recommended
             bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for compute
             bnb_4bit_use_double_quant=True,        # Double quantization saves more memory
         )
         # Device map will handle placing layers, relying on accelerate.
         # No need to explicitly set max_memory when using device_map="auto" typically.
         pipe = pipeline(
             "text-generation",
             model=MODEL_ID,
             # Note: torch_dtype is sometimes ignored when quantization_config is used,
             # but specifying compute_dtype in BitsAndBytesConfig is key.
             # Keep torch_dtype=torch.bfloat16 here for consistency if needed by other parts.
             torch_dtype=torch.bfloat16,
             device_map="auto",  # Let accelerate handle layer placement
             # If the model card requires custom code, add trust_remote_code=True
             # here as a pipeline() keyword argument (not as a string in model_kwargs).
             model_kwargs={
                 "quantization_config": quantization_config,
                 "use_cache": True,
             },
         )
         print(f"Model {MODEL_ID} loaded successfully on device: {pipe.device} (using 4-bit quantization)")
         return pipe
     except Exception as e:
         print(f"FATAL Error loading model '{MODEL_ID}' (check memory/config): {e}")
         # Bare raise re-raises the active exception so it stays visible in Spaces logs.
         raise
 # --- Web Search ---
 # (Keep search_person, create_synthetic_profile, extract_text_from_search_results as before)