#!/usr/bin/env python3
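"""Convert a clip_g text encoder checkpoint (ComfyUI key layout) to GGUF.

Weights are written as F16, except 1-D tensors, biases, layer norms,
embeddings, and tensors with at most QUANTIZATION_THRESHOLD elements,
which are kept in F32.
"""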
import os
import torch
import gguf  # llama.cpp's specific gguf python module
import argparse
from tqdm import tqdm
from safetensors.torch import load_file

# Configuration constants
QUANTIZATION_THRESHOLD = 1024
MAX_TENSOR_NAME_LENGTH = 127

# Base model template class
class ModelTemplate:
    arch = "invalid"
    shape_fix = False
    keys_detect = []
    keys_banned = []

# Specific template for clip_g using ComfyUI standard keys
class ModelClipG(ModelTemplate):
    arch = "clip_g"
    shape_fix = False  # No rearrangement for text encoder models
    keys_detect = [
        ("logit_scale",),
        ("text_model.embeddings.position_embedding.weight",),
        ("text_model.encoder.layers.0.self_attn.in_proj_weight",),
    ]
    keys_banned = []

# Only clip_g in this conversion script
arch_list = [ModelClipG]

def is_model_arch(model, state_dict):
    for key_tuple in model.keys_detect:
        if all(key in state_dict for key in key_tuple):
            # Optionally check for banned keys
            if any(key in state_dict for key in model.keys_banned):
                raise ValueError("Model architecture not allowed for conversion!")
            return True
    return False

def detect_arch(state_dict):
    for model in arch_list:
        if is_model_arch(model, state_dict):
            return model
    raise ValueError("Unknown model architecture!")

def parse_args():
    parser = argparse.ArgumentParser(description="Convert clip_g model (ComfyUI standard) to GGUF")
    parser.add_argument("--src", required=True, help="Source model file (.safetensors, .pt, etc)")
    parser.add_argument("--dst", help="Output GGUF file")
    return parser.parse_args()

def load_state_dict(path):
    if any(path.endswith(ext) for ext in [".ckpt", ".pt", ".bin", ".pth"]):
        state_dict = torch.load(path, map_location="cpu", weights_only=True)
        state_dict = state_dict.get("model", state_dict)
    else:
        state_dict = load_file(path)
    
    # Remove unwanted prefixes if they exist.
    prefix = None
    for pfx in ["model.diffusion_model.", "model."]:
        if any(k.startswith(pfx) for k in state_dict.keys()):
            prefix = pfx
            break
    new_state = {}
    for k, v in state_dict.items():
        if prefix:
            if not k.startswith(prefix):
                continue
            # Strip only the leading prefix, not other occurrences of the substring.
            k = k[len(prefix):]
        new_state[k] = v
    return new_state

def load_model(path):
    state_dict = load_state_dict(path)
    model_arch = detect_arch(state_dict)
    print(f"Detected architecture: {model_arch.arch}")
    writer = gguf.GGUFWriter(path=None, arch=model_arch.arch)
    return writer, state_dict, model_arch

def handle_tensors(writer, state_dict, model_arch):
    # Check that all tensor names are within allowed length.
    for key in state_dict.keys():
        if len(key) > MAX_TENSOR_NAME_LENGTH:
            raise ValueError(f"Tensor name {key} exceeds maximum length {MAX_TENSOR_NAME_LENGTH}")

    for key, tensor in tqdm(state_dict.items(), desc="Processing tensors"):
        if isinstance(tensor, torch.Tensor):
            # numpy has no bfloat16 dtype, so upcast those tensors before conversion.
            if tensor.dtype == torch.bfloat16:
                tensor = tensor.to(torch.float32)
            data = tensor.detach().cpu().numpy()
        else:
            data = tensor

        # Determine quantization based on key name
        key_lower = key.lower()
        if data.ndim == 1 or "bias" in key_lower or "layer_norm" in key_lower or "ln_" in key_lower:
            data_qtype = gguf.GGMLQuantizationType.F32
        elif "embeddings" in key_lower:
            data_qtype = gguf.GGMLQuantizationType.F32
        else:
            data_qtype = gguf.GGMLQuantizationType.F16

        if data.size <= QUANTIZATION_THRESHOLD:
            data_qtype = gguf.GGMLQuantizationType.F32

        try:
            quantized = gguf.quants.quantize(data, data_qtype)
        except Exception as e:
            tqdm.write(f"Quantization failed for {key} with error {e}; falling back to F16")
            data_qtype = gguf.GGMLQuantizationType.F16
            quantized = gguf.quants.quantize(data, data_qtype)

        writer.add_tensor(key, quantized, raw_dtype=data_qtype)
        tqdm.write(f"Processed {key}: {data.dtype} -> {data_qtype.name}, shape = {data.shape}")

def main():
    args = parse_args()
    writer, state_dict, model_arch = load_model(args.src)

    # Determine file type based on first tensor's dtype.
    first_tensor = next(iter(state_dict.values()))
    if first_tensor.dtype == torch.bfloat16:
        out_path = args.dst or os.path.splitext(args.src)[0] + "-BF16.gguf"
        writer.add_file_type(gguf.LlamaFileType.MOSTLY_BF16)
    else:
        out_path = args.dst or os.path.splitext(args.src)[0] + "-F16.gguf"
        writer.add_file_type(gguf.LlamaFileType.MOSTLY_F16)

    if os.path.isfile(out_path):
        input("Output exists. Press enter to continue or Ctrl+C to abort")

    handle_tensors(writer, state_dict, model_arch)
    writer.write_header_to_file(path=out_path)
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file(progress=True)
    writer.close()

if __name__ == "__main__":
    main()
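
# Example usage (file names are illustrative):
#   python convert_clip_g_to_gguf.py --src clip_g.safetensors
#   python convert_clip_g_to_gguf.py --src clip_g.safetensors --dst clip_g-F16.gguf
# When --dst is omitted, the output name is derived from --src with an
# -F16.gguf or -BF16.gguf suffix, depending on the source tensor dtype.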