---
license: llama3.3
base_model:
- meta-llama/Llama-3.3-70B-Instruct
language:
- en
- hi
- it
- de
- fr
- th
- es
- pt
library_name: transformers
tags:
- meta
- pytorch
- llama
---

# MODEL DESCRIPTION

A simple 4-bit (GEMM) compression of meta-llama/Llama-3.3-70B-Instruct using the AWQ method.

## Loading the model with AutoModelForCausalLM

```python
from transformers import AutoModelForCausalLM

model_name = "uyiosa/Llama-3.3-70b-Instruct-AWQ-4bit-GEMM"

# device_map="auto" spreads the 4-bit weights across the available GPUs
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
print(model)
```

## Loading this model with vLLM via Docker

```
docker run --runtime nvidia --gpus all \
    --env "HUGGING_FACE_HUB_TOKEN=.........." \
    -p 8000:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM \
    --gpu-memory-utilization 0.9 \
    --swap-space 0 \
    --max-seq-len-to-capture 512 \
    --max-num-seqs 1 \
    --api-key "token-abc123" \
    --max-model-len 8000 \
    --trust-remote-code \
    --enable-chunked-prefill \
    --max-num-batched-tokens 1024
```
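Once the container is running, vLLM exposes an OpenAI-compatible API on port 8000. Below is a minimal sketch of a request using the `openai` Python client (the client package is an extra dependency, not part of this repo); the model name and API key must match the values passed to the container above.

```python
from openai import OpenAI

# Point the client at the local vLLM server started by the docker command above
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="jsbaicenter/Llama-3.3-70b-Instruct-AWQ-4BIT-GEMM",
    messages=[{"role": "user", "content": "Explain AWQ quantization in one sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)
```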
## A method to merge adapter weights into the base model and quantize

```python
import gc
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from awq import AutoAWQForCausalLM


def clear_gpu_memory():
    """Clear GPU memory and cache."""
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()


def merge_model(base_model_path: str, adapter_path: str, merged_path: str, device: str = "cuda"):
    """Merge a PEFT adapter into the base model and save the result."""
    print("Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        torch_dtype=torch.float16,
        device_map=device
    )

    print("Loading adapter...")
    adapter_model = PeftModel.from_pretrained(
        base_model,
        adapter_path,
        device_map=device
    )

    print("Merging adapter with base model...")
    merged_model = adapter_model.merge_and_unload()

    print("Saving merged model...")
    merged_model.save_pretrained(merged_path)
    # Save the tokenizer next to the merged weights so the quantization step
    # can load everything from one directory.
    AutoTokenizer.from_pretrained(base_model_path).save_pretrained(merged_path)

    # Clear the models from GPU memory
    del base_model, adapter_model, merged_model
    clear_gpu_memory()
    print("Cleared GPU memory after merge")


def quantize_model(merged_path: str, quantized_path: str):
    """Quantize the merged model to 4-bit AWQ (GEMM kernels)."""
    print("Starting quantization...")
    # AWQ quantizes only the linear layers; norms, embeddings and rotary
    # embeddings stay in higher precision, and after merge_and_unload there
    # are no separate adapter/LoRA modules left to exclude.
    quant_config = {
        "w_bit": 4,
        "q_group_size": 128,
        "zero_point": True,
        "version": "GEMM"
    }

    print("Loading merged model for quantization...")
    # AutoAWQ handles device placement itself, moving layers to the GPU
    # one at a time while it calibrates and quantizes them.
    model = AutoAWQForCausalLM.from_pretrained(merged_path)
    tokenizer = AutoTokenizer.from_pretrained(merged_path)

    # Runs calibration on AutoAWQ's default calibration dataset
    model.quantize(tokenizer, quant_config=quant_config)

    print("Saving quantized model...")
    model.save_quantized(quantized_path)
    tokenizer.save_pretrained(quantized_path)

    # Clear GPU memory again
    del model
    clear_gpu_memory()
    print("Cleared GPU memory after quantization")


def process_model(base_model_path: str, adapter_path: str, output_dir: str):
    """Merge the adapter, then quantize the merged model."""
    os.makedirs(output_dir, exist_ok=True)
    merged_path = os.path.join(output_dir, "merged_model")
    quantized_path = os.path.join(output_dir, "quantized_model")

    try:
        # Step 1: Merge
        merge_model(base_model_path, adapter_path, merged_path)

        # Step 2: Quantize
        quantize_model(merged_path, quantized_path)

        print("Process completed successfully!")
        return True
    except Exception as e:
        print(f"Error during processing: {str(e)}")
        clear_gpu_memory()  # Clear memory if there's an error
        return False


if __name__ == "__main__":
    # Configuration
    BASE_MODEL_PATH = "meta-llama/Llama-3.3-70B-Instruct"
    ADAPTER_PATH = "./checkpoint-781"  # Directory with adapter_config.json
    OUTPUT_DIR = "llama-3.3-70b-FT781-AWQ-GEMM"

    # Run the process
    success = process_model(
        base_model_path=BASE_MODEL_PATH,
        adapter_path=ADAPTER_PATH,
        output_dir=OUTPUT_DIR
    )
```
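## Verifying the quantized model

After the script above finishes, the quantized folder can be reloaded with AutoAWQ for a quick smoke test. This is a minimal sketch, not part of the original pipeline; it assumes the output path produced by the configuration above and a GPU with enough free memory to hold the 4-bit weights.

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Path follows from OUTPUT_DIR and the "quantized_model" subdirectory used above
quantized_path = "llama-3.3-70b-FT781-AWQ-GEMM/quantized_model"

# Reload the 4-bit weights and the tokenizer saved next to them
model = AutoAWQForCausalLM.from_quantized(quantized_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quantized_path)

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
output_ids = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```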