---
license: llama3.3
base_model:
- meta-llama/Llama-3.3-70B-Instruct
language:
- en
- hi
- it
- de
- fr
- th
- es
- pt
library_name: transformers
tags:
- meta
- pytorch
- llama
---

# MODEL DESCRIPTION

A simple 4-bit (GEMM) compression of meta-llama/Llama-3.3-70B-Instruct using the AWQ method.

## Loading the model with AutoModelForCausalLM

```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# device_map="auto" spreads the 4-bit weights across the available GPUs
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
print(model)
```

## Loading this model with vLLM via Docker

```
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 \
    --swap-space 0 \
    --max-seq-len-to-capture 512 \
    --max-num-seqs 1 \
    --api-key "token-abc123" \
    --max-model-len 8000 \
    --trust-remote-code \
    --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
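Once the container is running, vLLM exposes an OpenAI-compatible API on port 8000. Below is a minimal sketch of a request using the `openai` Python client (the client package is an extra dependency, not part of this repo); the model name and API key must match the values passed to the container above.

```python
from openai import OpenAI

# Point the client at the local vLLM server started by the docker command above
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Explain AWQ quantization in one sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```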
## A method to merge adapter weights into the base model and quantize

```python
import gc
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from awq import AutoAWQForCausalLM


def clear_gpu_memory():
    """Clear GPU memory and cache."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "cuda"):
    """Merge a PEFT adapter into the base model and save the result."""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device
    )

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    # Save the tokenizer next to the merged weights so the quantization step
    # can load everything from one directory.
    AutoTokenizer.from_pretrained(base_model_path).save_pretrained(merged_path)

    # Clear the models from GPU memory
    del base_model, adapter_model, merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")


def quantize_model(merged_path: str, quantized_path: str):
    """Quantize the merged model to 4-bit AWQ (GEMM kernels)."""
    print("Starting quantization...")
    # AWQ quantizes only the linear layers; norms, embeddings and rotary
    # embeddings stay in higher precision, and after merge_and_unload there
    # are no separate adapter/LoRA modules left to exclude.
    quant_config = {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM"
    }

    print("Loading merged model for quantization...")
    # AutoAWQ handles device placement itself, moving layers to the GPU
    # one at a time while it calibrates and quantizes them.
    model = AutoAWQForCausalLM.from_pretrained(merged_path)
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Runs calibration on AutoAWQ's default calibration dataset
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")


def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Merge the adapter, then quantize the merged model."""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)

        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)

        print("Process completed successfully!")
        return True
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False


if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR
    )
```
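## Verifying the quantized model

After the script above finishes, the quantized folder can be reloaded with AutoAWQ for a quick smoke test. This is a minimal sketch, not part of the original pipeline; it assumes the output path produced by the configuration above and a GPU with enough free memory to hold the 4-bit weights.

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Path follows from OUTPUT_DIR and the "quantized_model" subdirectory used above
quantized_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

# Reload the 4-bit weights and the tokenizer saved next to them
model = AutoAWQForCausalLM.from_quantized(quantized_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quantized_path)

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```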