OzzyGT/diffusers-image-fill · Adopted the same model but I had poor results

3 days ago

This is my local result:

This is the result from your online demo:

The following is my local implementation. I can't find the problem myself; could you help if it's convenient for you?

import os
import torch
from PIL import Image
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from controlnet_union import ControlNetModel_Union
from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
import spaces

# Settings shared by every model loaded below.
_DEVICE = "cuda"
_DTYPE = torch.float16
_CONTROLNET_DIR = "model/xinsir/controlnet-union-sdxl-1.0"

# Make sure the output directory exists before any result is written.
os.makedirs("pic/result", exist_ok=True)

# Instantiate the ControlNet Union model from its "promax" config file;
# the weights are loaded separately right after.
config = ControlNetModel_Union.load_config(f"{_CONTROLNET_DIR}/config_promax.json")
controlnet_model = ControlNetModel_Union.from_config(config)

# Load the "promax" weights into that model, then move it to the GPU in fp16.
model_file = f"{_CONTROLNET_DIR}/diffusion_pytorch_model_promax.safetensors"
state_dict = load_state_dict(model_file)
model, _, _, _, _ = ControlNetModel_Union._load_pretrained_model(
    controlnet_model,
    state_dict,
    model_file,
    _CONTROLNET_DIR,
)
model.to(device=_DEVICE, dtype=_DTYPE)

# Load the VAE in fp16.
vae = AutoencoderKL.from_pretrained(
    "model/madebyollin/sdxl-vae-fp16-fix",
    torch_dtype=_DTYPE,
)
vae = vae.to(_DEVICE)

# Assemble the fill pipeline around the base checkpoint, the VAE and the
# ControlNet loaded above.
pipe = StableDiffusionXLFillPipeline.from_pretrained(
    "model/SG161222/RealVisXL_V5.0_Lightning",
    torch_dtype=_DTYPE,
    vae=vae,
    controlnet=model,
    variant="fp16",
)
pipe = pipe.to(_DEVICE)

# Replace the pipeline's scheduler with a TCD scheduler built from the same config.
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

# Encode the (fixed) prompt once so every image reuses the same embeddings.
# NOTE(review): the third positional argument is presumably
# do_classifier_free_guidance - confirm against pipeline_fill_sd_xl.
prompt = "high quality"
_encoded_prompt = pipe.encode_prompt(prompt, _DEVICE, True)
(
    prompt_embeds,
    negative_prompt_embeds,
    pooled_prompt_embeds,
    negative_pooled_prompt_embeds,
) = _encoded_prompt

def _output_name(source_path, suffix):
    """Return ``<stem><suffix>.png`` for the file named by *source_path*.

    The extension is always ``.png`` because the saved images are RGBA, which
    JPEG cannot store, and so that the name always differs from the source
    name regardless of the source extension or its letter case.
    """
    stem = os.path.splitext(os.path.basename(source_path))[0]
    return f"{stem}{suffix}.png"


@spaces.GPU(duration=16)
def process_image(source_path, mask_path, output_dir):
    """Inpaint the masked region of one image and save the result plus a 2x2 overview.

    Args:
        source_path: Path of the image to repair.
        mask_path: Path of the mask image; every non-zero pixel (after
            conversion to greyscale) marks an area to be regenerated.
        output_dir: Existing directory that receives ``<stem>_result.png``
            and ``<stem>_grid.png``.

    Raises:
        RuntimeError: If the pipeline yields no image at all.
    """
    size = (1024, 1024)

    # Load the source and the mask, then bring both to the working size.
    source = Image.open(source_path).convert("RGBA").resize(size, Image.LANCZOS)
    mask = Image.open(mask_path).convert("L").resize(size, Image.LANCZOS)

    # Binarise the mask: any non-zero pixel counts as "to be filled".
    # The mask is used exactly as given; no dilation is applied here.
    binary_mask = mask.point(lambda p: 255 if p > 0 else 0)
    inverted_mask = Image.eval(binary_mask, lambda p: 255 - p)

    # ControlNet input: the source with the masked area made fully transparent.
    alpha_image = Image.new("RGBA", source.size, (0, 0, 0, 0))
    cnet_image = Image.composite(source, alpha_image, inverted_mask)

    # The pipeline is iterated like a generator; keep the last image it
    # yields, i.e. the final output.
    result_image = None
    for result_image in pipe(
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        image=cnet_image,
    ):
        pass
    if result_image is None:
        raise RuntimeError(f"pipeline produced no image for {source_path}")

    # Paste the generated pixels back onto the source, only inside the mask.
    result_image = result_image.convert("RGBA")
    final_image = source.copy()
    final_image.paste(result_image, (0, 0), binary_mask)
    final_image.save(os.path.join(output_dir, _output_name(source_path, "_result")))

    # 2x2 overview: source | mask on the top row, ControlNet input | result
    # on the bottom row. Offsets equal the tile size so the tiles do not overlap.
    tile_w, tile_h = source.size
    grid_image = Image.new("RGBA", (2 * tile_w, 2 * tile_h), (255, 255, 255, 255))
    grid_image.paste(source, (0, 0))
    grid_image.paste(mask.convert("RGBA"), (tile_w, 0))
    grid_image.paste(cnet_image, (0, tile_h))
    grid_image.paste(final_image, (tile_w, tile_h))
    grid_image.save(os.path.join(output_dir, _output_name(source_path, "_grid")))


# Batch-process every source/mask pair found on disk.
source_dir = "pic/source"
mask_dir = "pic/mask"
output_dir = "pic/result"


def _list_images(directory):
    """Return the sorted names of the .png / .jpg files inside *directory*."""
    names = []
    for entry in os.listdir(directory):
        if entry.lower().endswith(('.png', '.jpg')):
            names.append(entry)
    names.sort()
    return names


source_files = _list_images(source_dir)
mask_files = _list_images(mask_dir)

# Sources and masks are paired purely by sorted position, so the two
# directories must at least hold the same number of images.
if len(source_files) != len(mask_files):
    raise ValueError("原图和 mask 图数量不匹配！")

print(mask_files)
print(source_files)
for source_file, mask_file in zip(source_files, mask_files):
    source_path = os.path.join(source_dir, source_file)
    mask_path = os.path.join(mask_dir, mask_file)
    print(source_path)
    print(mask_path)
    process_image(source_path, mask_path, output_dir)

print("批量处理完成！")

OzzyGT

Owner 1 day ago

Hi! Can you try it with the code here? This is a PoC of that guide. If it doesn't work, maybe you can share the image and mask you're using locally. Sadly, I can't help you with the custom parts or local models if I can't access them. Also, ControlNet Union was added to diffusers after this guide was written, so the custom code is no longer needed to make it work.

AntonioZheng

about 4 hours ago

This comment has been hidden (marked as Resolved)

AntonioZheng

about 4 hours ago

Hi! Can you try it with the code here? This is a PoC of that guide. If it doesn't work, maybe you can share the image and mask you're using locally. Sadly, I can't help you with the custom parts or local models if I can't access them. Also, ControlNet Union was added to diffusers after this guide was written, so the custom code is no longer needed to make it work.

Thank you for taking the time to answer my question. I checked the code and found the cause: when the mask area is drawn by hand, it ends up slightly larger than the region that is actually modified, whereas my code used a mask that matches the region exactly, which can lead to problems. After I expanded the mask area by 15%, the results are very good. Your project is very valuable, thank you!