szyezhu commited on
Commit
019d164
·
1 Parent(s): 2066db5

Upload 46 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/main.png filter=lfs diff=lfs merge=lfs -text
37
+ assets/non_cherry_picky.png filter=lfs diff=lfs merge=lfs -text
38
+ assets/strength_space.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/teaser.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/unconditional.png filter=lfs diff=lfs merge=lfs -text
assets/geo.png ADDED
assets/geo_white.png ADDED
assets/main.png ADDED

Git LFS Details

  • SHA256: 15eb1da3306b6965c62fb13f978e8074eb2b52bd94ee523b1a4262995b702da0
  • Pointer size: 132 Bytes
  • Size of remote file: 3.92 MB
assets/mixing_traj.png ADDED
assets/mixing_traj_white.png ADDED
assets/non_cherry_picky.png ADDED

Git LFS Details

  • SHA256: 9717a8a7412b4da6a76f13993d9422db7bb76afc24ca2f87c82956da4c177da8
  • Pointer size: 132 Bytes
  • Size of remote file: 9.45 MB
assets/strength_space.png ADDED

Git LFS Details

  • SHA256: 6be7e9920de0585cd4e246c9246e1e71b94b673b23d7b86dfe436f84bbf66ad7
  • Pointer size: 132 Bytes
  • Size of remote file: 2.27 MB
assets/teaser.png ADDED

Git LFS Details

  • SHA256: e8fc6904459ed380844bb2d00f8a9b0a6c34c15b575ed0c939eaeb9fe7d47692
  • Pointer size: 132 Bytes
  • Size of remote file: 1.32 MB
assets/unconditional.png ADDED

Git LFS Details

  • SHA256: d3cc39beb6510e1b0316b3f93cb9e4880c1a4c2b1cfe12e1d6707470663943f7
  • Pointer size: 132 Bytes
  • Size of remote file: 4.79 MB
boundarydiffusion.py ADDED
@@ -0,0 +1,713 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from glob import glob
3
+ from tqdm import tqdm
4
+ import os
5
+ import numpy as np
6
+ import cv2
7
+ from PIL import Image
8
+ import torch
9
+ from torch import nn
10
+ import torchvision.utils as tvu
11
+ from sklearn import svm
12
+ import pickle
13
+ import torch.optim as optim
14
+
15
+ from models.ddpm.diffusion import DDPM
16
+ from models.improved_ddpm.script_util import i_DDPM
17
+ from utils.text_dic import SRC_TRG_TXT_DIC
18
+ from utils.diffusion_utils import get_beta_schedule, denoising_step
19
+ from datasets.data_utils import get_dataset, get_dataloader
20
+ from configs.paths_config import DATASET_PATHS, MODEL_PATHS, HYBRID_MODEL_PATHS, HYBRID_CONFIG
21
+ from datasets.imagenet_dic import IMAGENET_DIC
22
+ from utils.align_utils import run_alignment
23
+ from utils.distance_utils import euclidean_distance, cosine_similarity
24
+
25
+
26
+
27
+ def compute_radius(x):
28
+ x = torch.pow(x, 2)
29
+ r = torch.sum(x)
30
+ r = torch.sqrt(r)
31
+ return r
32
+
33
+
34
+
35
+
36
+ class BoundaryDiffusion(object):
37
+ def __init__(self, args, config, device=None):
38
+ self.args = args
39
+ self.config = config
40
+ if device is None:
41
+ device = torch.device(
42
+ "cuda") if torch.cuda.is_available() else torch.device("cpu")
43
+ self.device = device
44
+
45
+ self.model_var_type = config.model.var_type
46
+ betas = get_beta_schedule(
47
+ beta_start=config.diffusion.beta_start,
48
+ beta_end=config.diffusion.beta_end,
49
+ num_diffusion_timesteps=config.diffusion.num_diffusion_timesteps
50
+ )
51
+ self.betas = torch.from_numpy(betas).float().to(self.device)
52
+ self.num_timesteps = betas.shape[0]
53
+
54
+ alphas = 1.0 - betas
55
+ alphas_cumprod = np.cumprod(alphas, axis=0)
56
+ alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
57
+ posterior_variance = betas * \
58
+ (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
59
+ if self.model_var_type == "fixedlarge":
60
+ self.logvar = np.log(np.append(posterior_variance[1], betas[1:]))
61
+
62
+ elif self.model_var_type == 'fixedsmall':
63
+ self.logvar = np.log(np.maximum(posterior_variance, 1e-20))
64
+
65
+ if self.args.edit_attr is None:
66
+ self.src_txts = self.args.src_txts
67
+ self.trg_txts = self.args.trg_txts
68
+ else:
69
+ self.src_txts = SRC_TRG_TXT_DIC[self.args.edit_attr][0]
70
+ self.trg_txts = SRC_TRG_TXT_DIC[self.args.edit_attr][1]
71
+
72
+
73
+ def unconditional(self):
74
+ print(self.args.exp)
75
+
76
+ # ----------- Model -----------#
77
+ if self.config.data.dataset == "LSUN":
78
+ if self.config.data.category == "bedroom":
79
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/bedroom.ckpt"
80
+ elif self.config.data.category == "church_outdoor":
81
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/church_outdoor.ckpt"
82
+ elif self.config.data.dataset == "CelebA_HQ":
83
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/celeba_hq.ckpt"
84
+ elif self.config.data.dataset == "AFHQ":
85
+ pass
86
+ else:
87
+ raise ValueError
88
+
89
+ if self.config.data.dataset in ["CelebA_HQ", "LSUN"]:
90
+ model = DDPM(self.config)
91
+ if self.args.model_path:
92
+ init_ckpt = torch.load(self.args.model_path)
93
+ else:
94
+ init_ckpt = torch.hub.load_state_dict_from_url(url, map_location=self.device)
95
+ learn_sigma = False
96
+ print("Original diffusion Model loaded.")
97
+ elif self.config.data.dataset in ["FFHQ", "AFHQ"]:
98
+ model = i_DDPM(self.config.data.dataset)
99
+ if self.args.model_path:
100
+ init_ckpt = torch.load(self.args.model_path)
101
+ else:
102
+ init_ckpt = torch.load(MODEL_PATHS[self.config.data.dataset])
103
+ learn_sigma = True
104
+ print("Improved diffusion Model loaded.")
105
+ else:
106
+ print('Not implemented dataset')
107
+ raise ValueError
108
+ model.load_state_dict(init_ckpt)
109
+ model.to(self.device)
110
+ model = torch.nn.DataParallel(model)
111
+ model.eval()
112
+
113
+ # ----------- Precompute Latents -----------#
114
+ seq_inv = np.linspace(0, 1, 999) * 999
115
+ seq_inv = [int(s) for s in list(seq_inv)]
116
+ seq_inv_next = [-1] + list(seq_inv[:-1])
117
+
118
+ ###---- boundaries---####
119
+ # ---------- Load boundary ----------#
120
+ classifier = pickle.load(open('./boundary/smile_boundary_h.sav', 'rb'))
121
+ a = classifier.coef_.reshape(1, 512*8*8).astype(np.float32)
122
+ # a = a / np.linalg.norm(a)
123
+
124
+ z_classifier = pickle.load(open('./boundary/smile_boundary_z.sav', 'rb'))
125
+ z_a = z_classifier.coef_.reshape(1, 3*256*256).astype(np.float32)
126
+ z_a = z_a / np.linalg.norm(z_a) # normalized boundary
127
+
128
+ x_lat = torch.randn(1, 3, 256, 256, device=self.device)
129
+ n = 1
130
+ print("get the sampled latent encodings x_T!")
131
+
132
+ with torch.no_grad():
133
+ with tqdm(total=len(seq_inv), desc=f"Generative process") as progress_bar:
134
+ for it, (i, j) in enumerate(zip(reversed((seq_inv)), reversed((seq_inv_next)))):
135
+ t = (torch.ones(n) * i).to(self.device)
136
+ t_next = (torch.ones(n) * j).to(self.device)
137
+ # print("check t and t_next:", t, t_next)
138
+ if t == self.args.t_0:
139
+ break
140
+ x_lat, h_lat = denoising_step(x_lat, t=t, t_next=t_next, models=model,
141
+ logvars=self.logvar,
142
+ # sampling_type=self.args.sample_type,
143
+ sampling_type='ddim',
144
+ b=self.betas,
145
+ eta=0.0,
146
+ learn_sigma=learn_sigma,
147
+ )
148
+
149
+ progress_bar.update(1)
150
+
151
+
152
+
153
+
154
+ # ----- Editing space ------ #
155
+ start_distance = self.args.start_distance
156
+ end_distance = self.args.end_distance
157
+ edit_img_number = self.args.edit_img_number
158
+ linspace = np.linspace(start_distance, end_distance, edit_img_number)
159
+ latent_code = h_lat.cpu().view(1,-1).numpy()
160
+ linspace = linspace - latent_code.dot(a.T)
161
+ linspace = linspace.reshape(-1, 1).astype(np.float32)
162
+ edit_h_seq = latent_code + linspace * a
163
+
164
+
165
+ z_linspace = np.linspace(start_distance, end_distance, edit_img_number)
166
+ z_latent_code = x_lat.cpu().view(1,-1).numpy()
167
+ z_linspace = z_linspace - z_latent_code.dot(z_a.T)
168
+ z_linspace = z_linspace.reshape(-1, 1).astype(np.float32)
169
+ edit_z_seq = z_latent_code + z_linspace * z_a
170
+
171
+
172
+ for k in range(edit_img_number):
173
+ time_in_start = time.time()
174
+ seq_inv = np.linspace(0, 1, self.args.n_inv_step) * self.args.t_0
175
+ seq_inv = [int(s) for s in list(seq_inv)]
176
+ seq_inv_next = [-1] + list(seq_inv[:-1])
177
+
178
+ with tqdm(total=len(seq_inv), desc="Generative process {}".format(it)) as progress_bar:
179
+ edit_h = torch.from_numpy(edit_h_seq[k]).to(self.device).view(-1, 512, 8, 8)
180
+ edit_z = torch.from_numpy(edit_z_seq[k]).to(self.device).view(-1, 3, 256, 256)
181
+ for i, j in zip(reversed(seq_inv), reversed(seq_inv_next)):
182
+ t = (torch.ones(n) * i).to(self.device)
183
+ t_next = (torch.ones(n) * j).to(self.device)
184
+ edit_z, edit_h = denoising_step(edit_z, t=t, t_next=t_next, models=model,
185
+ logvars=self.logvar,
186
+ sampling_type=self.args.sample_type,
187
+ b=self.betas,
188
+ eta = 1.0,
189
+ learn_sigma=learn_sigma,
190
+ ratio=self.args.model_ratio,
191
+ hybrid=self.args.hybrid_noise,
192
+ hybrid_config=HYBRID_CONFIG,
193
+ edit_h=edit_h,
194
+ )
195
+
196
+ save_edit = "unconditioned_smile_"+str(k)+".png"
197
+ tvu.save_image((edit_z + 1) * 0.5, os.path.join("edit_output",save_edit))
198
+ time_in_end = time.time()
199
+ print(f"Editing for 1 image takes {time_in_end - time_in_start:.4f}s")
200
+ return
201
+
202
+
203
+ def radius(self):
204
+ print(self.args.exp)
205
+
206
+ # ----------- Model -----------#
207
+ if self.config.data.dataset == "LSUN":
208
+ if self.config.data.category == "bedroom":
209
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/bedroom.ckpt"
210
+ elif self.config.data.category == "church_outdoor":
211
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/church_outdoor.ckpt"
212
+ elif self.config.data.dataset == "CelebA_HQ":
213
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/celeba_hq.ckpt"
214
+ elif self.config.data.dataset == "AFHQ":
215
+ pass
216
+ else:
217
+ raise ValueError
218
+
219
+ if self.config.data.dataset in ["CelebA_HQ", "LSUN"]:
220
+ model = DDPM(self.config)
221
+ if self.args.model_path:
222
+ init_ckpt = torch.load(self.args.model_path)
223
+ else:
224
+ init_ckpt = torch.hub.load_state_dict_from_url(url, map_location=self.device)
225
+ learn_sigma = False
226
+ print("Original diffusion Model loaded.")
227
+ elif self.config.data.dataset in ["FFHQ", "AFHQ"]:
228
+ model = i_DDPM(self.config.data.dataset)
229
+ if self.args.model_path:
230
+ init_ckpt = torch.load(self.args.model_path)
231
+ else:
232
+ init_ckpt = torch.load(MODEL_PATHS[self.config.data.dataset])
233
+ learn_sigma = True
234
+ print("Improved diffusion Model loaded.")
235
+ else:
236
+ print('Not implemented dataset')
237
+ raise ValueError
238
+ model.load_state_dict(init_ckpt)
239
+ model.to(self.device)
240
+ model = torch.nn.DataParallel(model)
241
+ model.eval()
242
+
243
+
244
+ # ---------- Prepare the seq --------- #
245
+
246
+ # seq_inv = np.linspace(0, 1, self.args.n_inv_step) * self.args.t_0
247
+ seq_inv = np.linspace(0, 1, 999) * 999
248
+ seq_inv = [int(s) for s in list(seq_inv)]
249
+ seq_inv_next = [-1] + list(seq_inv[:-1])
250
+
251
+ n = 1
252
+ with torch.no_grad():
253
+ er = 0
254
+ x_rand = torch.randn(100, 3, 256, 256, device=self.device)
255
+ for idx in range(100):
256
+ x = x_rand[idx, :, :, :].unsqueeze(0)
257
+
258
+ with tqdm(total=len(seq_inv), desc=f"Generative process") as progress_bar:
259
+ for it, (i, j) in enumerate(zip(reversed((seq_inv)), reversed((seq_inv_next)))):
260
+ t = (torch.ones(n) * i).to(self.device)
261
+ t_next = (torch.ones(n) * j).to(self.device)
262
+ if t == 500:
263
+ break
264
+ x, _ = denoising_step(x, t=t, t_next=t_next, models=model,
265
+ logvars=self.logvar,
266
+ # sampling_type=self.args.sample_type,
267
+ sampling_type='ddim',
268
+ b=self.betas,
269
+ eta=0.0,
270
+ learn_sigma=learn_sigma,
271
+ )
272
+
273
+ progress_bar.update(1)
274
+ r_x = compute_radius(x)
275
+
276
+ er += r_x
277
+ print("Check radius at step :", er/100)
278
+
279
+
280
+ return
281
+
282
+
283
+
284
+
285
+
286
+
287
+ def boundary_search(self):
288
+ print(self.args.exp)
289
+
290
+ # ----------- Model -----------#
291
+ if self.config.data.dataset == "LSUN":
292
+ if self.config.data.category == "bedroom":
293
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/bedroom.ckpt"
294
+ elif self.config.data.category == "church_outdoor":
295
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/church_outdoor.ckpt"
296
+ elif self.config.data.dataset == "CelebA_HQ":
297
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/celeba_hq.ckpt"
298
+ elif self.config.data.dataset == "AFHQ":
299
+ pass
300
+ else:
301
+ raise ValueError
302
+
303
+ if self.config.data.dataset in ["CelebA_HQ", "LSUN"]:
304
+ model = DDPM(self.config)
305
+ if self.args.model_path:
306
+ init_ckpt = torch.load(self.args.model_path)
307
+ else:
308
+ init_ckpt = torch.hub.load_state_dict_from_url(url, map_location=self.device)
309
+ learn_sigma = False
310
+ print("Original diffusion Model loaded.")
311
+ elif self.config.data.dataset in ["FFHQ", "AFHQ"]:
312
+ model = i_DDPM(self.config.data.dataset)
313
+ if self.args.model_path:
314
+ init_ckpt = torch.load(self.args.model_path)
315
+ else:
316
+ init_ckpt = torch.load(MODEL_PATHS[self.config.data.dataset])
317
+ learn_sigma = True
318
+ print("Improved diffusion Model loaded.")
319
+ else:
320
+ print('Not implemented dataset')
321
+ raise ValueError
322
+ model.load_state_dict(init_ckpt)
323
+ model.to(self.device)
324
+ model = torch.nn.DataParallel(model)
325
+ model.eval()
326
+
327
+
328
+ # ----------- Precompute Latents -----------#
329
+ print("Prepare identity latent")
330
+ seq_inv = np.linspace(0, 1, self.args.n_inv_step) * self.args.t_0
331
+ seq_inv = [int(s) for s in list(seq_inv)]
332
+ seq_inv_next = [-1] + list(seq_inv[:-1])
333
+
334
+
335
+ n = self.args.bs_train
336
+ img_lat_pairs_dic = {}
337
+ for mode in ['train', 'test']:
338
+ img_lat_pairs = []
339
+ pairs_path = os.path.join('precomputed/',
340
+ f'{self.config.data.category}_{mode}_t{self.args.t_0}_nim{self.args.n_precomp_img}_ninv{self.args.n_inv_step}_pairs.pth')
341
+ print(pairs_path)
342
+ if os.path.exists(pairs_path):
343
+ print(f'{mode} pairs exists')
344
+ img_lat_pairs_dic[mode] = torch.load(pairs_path)
345
+ for step, (x0, x_id, x_lat, mid_h, label) in enumerate(img_lat_pairs_dic[mode]):
346
+ tvu.save_image((x0 + 1) * 0.5, os.path.join(self.args.image_folder, f'{mode}_{step}_0_orig.png'))
347
+ tvu.save_image((x_id + 1) * 0.5, os.path.join(self.args.image_folder,
348
+ f'{mode}_{step}_1_rec_ninv{self.args.n_inv_step}.png'))
349
+ if step == self.args.n_precomp_img - 1:
350
+ break
351
+ continue
352
+ else:
353
+ train_dataset, test_dataset = get_dataset(self.config.data.dataset, DATASET_PATHS, self.config)
354
+ loader_dic = get_dataloader(train_dataset, test_dataset, bs_train=self.args.bs_train,
355
+ num_workers=self.config.data.num_workers)
356
+ loader = loader_dic[mode]
357
+
358
+ for step, (img, label) in enumerate(loader):
359
+ # for step, img in enumerate(loader):
360
+
361
+ x0 = img.to(self.config.device)
362
+ tvu.save_image((x0 + 1) * 0.5, os.path.join(self.args.image_folder, f'{mode}_{step}_0_orig.png'))
363
+
364
+ x = x0.clone()
365
+ model.eval()
366
+ label = label.to(self.config.device)
367
+
368
+ # print("check x and label:", x.size(), label)
369
+
370
+
371
+
372
+ with torch.no_grad():
373
+ with tqdm(total=len(seq_inv), desc=f"Inversion process {mode} {step}") as progress_bar:
374
+ for it, (i, j) in enumerate(zip((seq_inv_next[1:]), (seq_inv[1:]))):
375
+ t = (torch.ones(n) * i).to(self.device)
376
+ t_prev = (torch.ones(n) * j).to(self.device)
377
+
378
+ x, mid_h_g = denoising_step(x, t=t, t_next=t_prev, models=model,
379
+ logvars=self.logvar,
380
+ sampling_type='ddim',
381
+ b=self.betas,
382
+ eta=0,
383
+ learn_sigma=learn_sigma)
384
+
385
+ progress_bar.update(1)
386
+ x_lat = x.clone()
387
+ tvu.save_image((x_lat + 1) * 0.5, os.path.join(self.args.image_folder,
388
+ f'{mode}_{step}_1_lat_ninv{self.args.n_inv_step}.png'))
389
+
390
+ with tqdm(total=len(seq_inv), desc=f"Generative process {mode} {step}") as progress_bar:
391
+ for it, (i, j) in enumerate(zip(reversed((seq_inv)), reversed((seq_inv_next)))):
392
+ t = (torch.ones(n) * i).to(self.device)
393
+ t_next = (torch.ones(n) * j).to(self.device)
394
+
395
+ x, _ = denoising_step(x, t=t, t_next=t_next, models=model,
396
+ logvars=self.logvar,
397
+ sampling_type=self.args.sample_type,
398
+ b=self.betas,
399
+ learn_sigma=learn_sigma,
400
+ # edit_h = mid_h,
401
+ )
402
+
403
+ progress_bar.update(1)
404
+
405
+ img_lat_pairs.append([x0, x.detach().clone(), x_lat.detach().clone(), mid_h_g.detach().clone(), label])
406
+ # img_lat_pairs.append([x0, x.detach().clone(), x_lat.detach().clone(), mid_h_g.detach().clone()])
407
+ tvu.save_image((x + 1) * 0.5, os.path.join(self.args.image_folder,
408
+ f'{mode}_{step}_1_rec_ninv{self.args.n_inv_step}.png'))
409
+ if step == self.args.n_precomp_img - 1:
410
+ break
411
+
412
+ img_lat_pairs_dic[mode] = img_lat_pairs
413
+ pairs_path = os.path.join('precomputed/',
414
+ f'{self.config.data.category}_{mode}_t{self.args.t_0}_nim{self.args.n_precomp_img}_ninv{self.args.n_inv_step}_pairs.pth')
415
+ torch.save(img_lat_pairs, pairs_path)
416
+
417
+ # ----------- Training boundaries -----------#
418
+ print("Start boundary search")
419
+ print(f"Sampling type: {self.args.sample_type.upper()} with eta {self.args.eta}")
420
+ if self.args.n_train_step != 0:
421
+ seq_train = np.linspace(0, 1, self.args.n_train_step) * self.args.t_0
422
+ seq_train = [int(s) for s in list(seq_train)]
423
+ print('Uniform skip type')
424
+ else:
425
+ seq_train = list(range(self.args.t_0))
426
+ print('No skip')
427
+ seq_train_next = [-1] + list(seq_train[:-1])
428
+
429
+ seq_test = np.linspace(0, 1, self.args.n_test_step) * self.args.t_0
430
+ seq_test = [int(s) for s in list(seq_test)]
431
+ seq_test_next = [-1] + list(seq_test[:-1])
432
+
433
+
434
+ for src_txt, trg_txt in zip(self.src_txts, self.trg_txts):
435
+ print(f"CHANGE {src_txt} TO {trg_txt}")
436
+ time_in_start = time.time()
437
+
438
+ clf_h = svm.SVC(kernel='linear')
439
+ clf_z = svm.SVC(kernel='linear')
440
+ # print("clf model:",clf)
441
+
442
+ exp_id = os.path.split(self.args.exp)[-1]
443
+ save_name_h = f'boundary/{exp_id}_{trg_txt.replace(" ", "_")}_h.sav'
444
+ save_name_z = f'boundary/{exp_id}_{trg_txt.replace(" ", "_")}_z.sav'
445
+ n_train = len(img_lat_pairs_dic['train'])
446
+
447
+ train_data_z = np.empty([n_train, 3*256*256])
448
+ train_data_h = np.empty([n_train, 512*8*8])
449
+ train_label = np.empty([n_train,], dtype=int)
450
+
451
+
452
+ for step, (x0, x_id, x_lat, mid_h, label) in enumerate(img_lat_pairs_dic['train']):
453
+ train_data_h[step, :] = mid_h.view(1,-1).cpu().numpy()
454
+ train_data_z[step, :] = x_lat.view(1,-1).cpu().numpy()
455
+ train_label[step] = label.cpu().numpy()
456
+
457
+
458
+ classifier_h = clf_h.fit(train_data_h, train_label)
459
+ classifier_z = clf_z.fit(train_data_z, train_label)
460
+ print(np.shape(train_data_h), np.shape(train_data_z), np.shape(train_label))
461
+ # a = classifier.coef_.reshape(1, 512*8*8).astype(np.float32)
462
+ # a = classifier.coef_.reshape(1, 3*256*256).astype(np.float32)
463
+ # a = a / np.linalg.norm(a)
464
+ time_in_end = time.time()
465
+ print(f"Finding boundary takes {time_in_end - time_in_start:.4f}s")
466
+ print("Finishing boudary seperation!")
467
+
468
+ # boudary_save_h = 'smiling_boundary_h.sav'
469
+ # boudary_save_z = 'smiling_boundary_z.sav'
470
+ pickle.dump(classifier_h, open(save_name_h, 'wb'))
471
+ pickle.dump(classifier_z, open(save_name_z, 'wb'))
472
+
473
+ # test the accuracy ##
474
+ n_test = len(img_lat_pairs_dic['test'])
475
+ test_data_h = np.empty([n_test, 512*8*8])
476
+ test_data_z = np.empty([n_test, 3*256*256])
477
+ test_lable = np.empty([n_test,], dtype=int)
478
+ for step, (x0, x_id, x_lat, mid_h, label) in enumerate(img_lat_pairs_dic['test']):
479
+ test_data_h[step, :] = mid_h.view(1,-1).cpu().numpy()
480
+ test_data_z[step, :] = x_lat.view(1,-1).cpu().numpy()
481
+ test_lable[step] = label.cpu().numpy()
482
+ classifier_h = pickle.load(open(save_name_h, 'rb'))
483
+ classifier_z = pickle.load(open(save_name_z, 'rb'))
484
+ print("Boundary loaded!")
485
+ val_prediction_h = classifier_h.predict(test_data_h)
486
+ val_prediction_z = classifier_z.predict(test_data_z)
487
+ correct_num_h = np.sum(test_lable == val_prediction_h)
488
+ correct_num_z = np.sum(test_lable == val_prediction_z)
489
+ # print(val_prediction_h, test_lable)
490
+ print("Validation accuracy on h and z spaces:", correct_num_h/n_test, correct_num_z/n_test)
491
+ print("total training and testing", n_train, n_test)
492
+
493
+
494
+ return None
495
+
496
+
497
+
498
+
499
+ def edit_image_boundary(self):
500
+ # ----------- Data -----------#
501
+ n = self.args.bs_test
502
+
503
+
504
+ if self.args.align_face and self.config.data.dataset in ["FFHQ", "CelebA_HQ"]:
505
+ try:
506
+ img = run_alignment(self.args.img_path, output_size=self.config.data.image_size)
507
+ except:
508
+ img = Image.open(self.args.img_path).convert("RGB")
509
+ else:
510
+ img = Image.open(self.args.img_path).convert("RGB")
511
+ img = img.resize((self.config.data.image_size, self.config.data.image_size), Image.ANTIALIAS)
512
+ img = np.array(img)/255
513
+ img = torch.from_numpy(img).type(torch.FloatTensor).permute(2, 0, 1).unsqueeze(dim=0).repeat(n, 1, 1, 1)
514
+ img = img.to(self.config.device)
515
+ tvu.save_image(img, os.path.join(self.args.image_folder, f'0_orig.png'))
516
+ x0 = (img - 0.5) * 2.
517
+
518
+ # ----------- Models -----------#
519
+ if self.config.data.dataset == "LSUN":
520
+ if self.config.data.category == "bedroom":
521
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/bedroom.ckpt"
522
+ elif self.config.data.category == "church_outdoor":
523
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/church_outdoor.ckpt"
524
+ elif self.config.data.dataset == "CelebA_HQ":
525
+ url = "https://image-editing-test-12345.s3-us-west-2.amazonaws.com/checkpoints/celeba_hq.ckpt"
526
+ elif self.config.data.dataset in ["FFHQ", "AFHQ", "IMAGENET"]:
527
+ pass
528
+ else:
529
+ raise ValueError
530
+
531
+ if self.config.data.dataset in ["CelebA_HQ", "LSUN"]:
532
+ model = DDPM(self.config)
533
+ if self.args.model_path:
534
+ init_ckpt = torch.load(self.args.model_path)
535
+ else:
536
+ init_ckpt = torch.hub.load_state_dict_from_url(url, map_location=self.device)
537
+ learn_sigma = False
538
+ print("Original diffusion Model loaded.")
539
+ elif self.config.data.dataset in ["FFHQ", "AFHQ"]:
540
+ model = i_DDPM(self.config.data.dataset)
541
+ if self.args.model_path:
542
+ init_ckpt = torch.load(self.args.model_path)
543
+ else:
544
+ init_ckpt = torch.load(MODEL_PATHS[self.config.data.dataset])
545
+ learn_sigma = True
546
+ print("Improved diffusion Model loaded.")
547
+ else:
548
+ print('Not implemented dataset')
549
+ raise ValueError
550
+ model.load_state_dict(init_ckpt)
551
+ model.to(self.device)
552
+ model = torch.nn.DataParallel(model)
553
+ model.eval()
554
+
555
+ # ---------- Load boundary ----------#
556
+
557
+ boundary_h = pickle.load(open('./boundary/smile_boundary_h.sav', 'rb'))
558
+ a = boundary_h.coef_.reshape(1, 512*8*8).astype(np.float32)
559
+ a = a / np.linalg.norm(a)
560
+
561
+ boundary_z = pickle.load(open('./boundary/smile_boundary_z.sav', 'rb'))
562
+ z_a = boundary_z.coef_.reshape(1, 3*256*256).astype(np.float32)
563
+ z_a = z_a / np.linalg.norm(z_a) # normalized boundary
564
+
565
+
566
+ print("Boundary loaded! In shape:", np.shape(a), np.shape(z_a))
567
+
568
+
569
+ with torch.no_grad():
570
+ #---------------- Invert Image to Latent in case of Deterministic Inversion process -------------------#
571
+ if self.args.deterministic_inv:
572
+ x_lat_path = os.path.join(self.args.image_folder, f'x_lat_t{self.args.t_0}_ninv{self.args.n_inv_step}.pth')
573
+ h_lat_path = os.path.join(self.args.image_folder, f'h_lat_t{self.args.t_0}_ninv{self.args.n_inv_step}.pth')
574
+ if not os.path.exists(x_lat_path):
575
+ seq_inv = np.linspace(0, 1, self.args.n_inv_step) * self.args.t_0
576
+ seq_inv = [int(s) for s in list(seq_inv)]
577
+ seq_inv_next = [-1] + list(seq_inv[:-1])
578
+
579
+ x = x0.clone()
580
+ with tqdm(total=len(seq_inv), desc=f"Inversion process ") as progress_bar:
581
+ for it, (i, j) in enumerate(zip((seq_inv_next[1:]), (seq_inv[1:]))):
582
+ t = (torch.ones(n) * i).to(self.device)
583
+ t_prev = (torch.ones(n) * j).to(self.device)
584
+
585
+ x, mid_h_g = denoising_step(x, t=t, t_next=t_prev, models=model,
586
+ logvars=self.logvar,
587
+ sampling_type='ddim',
588
+ b=self.betas,
589
+ eta=0,
590
+ learn_sigma=learn_sigma,
591
+ ratio=0,
592
+ )
593
+
594
+
595
+ progress_bar.update(1)
596
+ x_lat = x.clone()
597
+ h_lat = mid_h_g.clone()
598
+ torch.save(x_lat, x_lat_path)
599
+ torch.save(h_lat, h_lat_path)
600
+
601
+ else:
602
+ print('Latent exists.')
603
+ x_lat = torch.load(x_lat_path)
604
+ h_lat = torch.load(h_lat_path)
605
+ print("Finish inversion for the given image!", h_lat.size())
606
+
607
+
608
+ # ----------- Generative Process -----------#
609
+ print(f"Sampling type: {self.args.sample_type.upper()} with eta {self.args.eta}, "
610
+ f" Steps: {self.args.n_test_step}/{self.args.t_0}")
611
+
612
+
613
+ # ----- Editing space ------ #
614
+ start_distance = self.args.start_distance
615
+ end_distance = self.args.end_distance
616
+ edit_img_number = self.args.edit_img_number
617
+ # [-100, 100]
618
+ linspace = np.linspace(start_distance, end_distance, edit_img_number)
619
+ latent_code = h_lat.cpu().view(1,-1).numpy()
620
+ linspace = linspace - latent_code.dot(a.T)
621
+ linspace = linspace.reshape(-1, 1).astype(np.float32)
622
+ edit_h_seq = latent_code + linspace * a
623
+
624
+
625
+ z_linspace = np.linspace(start_distance, end_distance, edit_img_number)
626
+ z_latent_code = x_lat.cpu().view(1,-1).numpy()
627
+ z_linspace = z_linspace - z_latent_code.dot(z_a.T)
628
+ z_linspace = z_linspace.reshape(-1, 1).astype(np.float32)
629
+ edit_z_seq = z_latent_code + z_linspace * z_a
630
+
631
+
632
+ if self.args.n_test_step != 0:
633
+ seq_test = np.linspace(0, 1, self.args.n_test_step) * self.args.t_0
634
+ seq_test = [int(s) for s in list(seq_test)]
635
+ print('Uniform skip type')
636
+ else:
637
+ seq_test = list(range(self.args.t_0))
638
+ print('No skip')
639
+ seq_test_next = [-1] + list(seq_test[:-1])
640
+
641
+ for it in range(self.args.n_iter):
642
+ if self.args.deterministic_inv:
643
+ x = x_lat.clone()
644
+ else:
645
+ e = torch.randn_like(x0)
646
+ a = (1 - self.betas).cumprod(dim=0)
647
+ x = x0 * a[self.args.t_0 - 1].sqrt() + e * (1.0 - a[self.args.t_0 - 1]).sqrt()
648
+ tvu.save_image((x + 1) * 0.5, os.path.join(self.args.image_folder,
649
+ f'1_lat_ninv{self.args.n_inv_step}.png'))
650
+
651
+
652
+ for k in range(edit_img_number):
653
+ time_in_start = time.time()
654
+
655
+ with tqdm(total=len(seq_test), desc="Generative process {}".format(it)) as progress_bar:
656
+ edit_h = torch.from_numpy(edit_h_seq[k]).to(self.device).view(-1, 512, 8, 8)
657
+ edit_z = torch.from_numpy(edit_z_seq[k]).to(self.device).view(-1, 3, 256, 256)
658
+ for i, j in zip(reversed(seq_test), reversed(seq_test_next)):
659
+ t = (torch.ones(n) * i).to(self.device)
660
+ t_next = (torch.ones(n) * j).to(self.device)
661
+
662
+ edit_z, edit_h = denoising_step(edit_z, t=t, t_next=t_next, models=model,
663
+ logvars=self.logvar,
664
+ sampling_type=self.args.sample_type,
665
+ b=self.betas,
666
+ eta = 1.0,
667
+ learn_sigma=learn_sigma,
668
+ ratio=self.args.model_ratio,
669
+ hybrid=self.args.hybrid_noise,
670
+ hybrid_config=HYBRID_CONFIG,
671
+ edit_h=edit_h,
672
+ )
673
+
674
+
675
+ x0 = x.clone()
676
+ save_edit = "edited_"+str(k)+".png"
677
+ tvu.save_image((edit_z + 1) * 0.5, os.path.join("edit_output",save_edit))
678
+ time_in_end = time.time()
679
+ print(f"Editing for 1 image takes {time_in_end - time_in_start:.4f}s")
680
+
681
+
682
+ # this is for recons
683
+ with tqdm(total=len(seq_test), desc="Generative process {}".format(it)) as progress_bar:
684
+ for i, j in zip(reversed(seq_test), reversed(seq_test_next)):
685
+ t = (torch.ones(n) * i).to(self.device)
686
+ t_next = (torch.ones(n) * j).to(self.device)
687
+ x_lat, _ = denoising_step(x_lat, t=t, t_next=t_next, models=model,
688
+ logvars=self.logvar,
689
+ sampling_type=self.args.sample_type,
690
+ b=self.betas,
691
+ # eta=self.args.eta,
692
+ eta = 0.0,
693
+ learn_sigma=learn_sigma,
694
+ ratio=self.args.model_ratio,
695
+ hybrid=self.args.hybrid_noise,
696
+ hybrid_config=HYBRID_CONFIG,
697
+ edit_h=None,
698
+ )
699
+
700
+ # added intermediate step vis
701
+ if (i - 99) % 100 == 0:
702
+ tvu.save_image((x + 1) * 0.5, os.path.join(self.args.image_folder,
703
+ f'2_lat_t{self.args.t_0}_ninv{self.args.n_inv_step}_ngen{self.args.n_test_step}_{i}_it{it}.png'))
704
+ progress_bar.update(1)
705
+
706
+ x0 = x.clone()
707
+ save_edit = "recons.png"
708
+ tvu.save_image((x_lat + 1) * 0.5, os.path.join("edit_output",save_edit))
709
+
710
+ return None
711
+
712
+
713
+
configs/afhq.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ dataset: "AFHQ"
3
+ category: "dog"
4
+ image_size: 256
5
+ channels: 3
6
+ logit_transform: false
7
+ uniform_dequantization: false
8
+ gaussian_dequantization: false
9
+ random_flip: true
10
+ rescaled: true
11
+ num_workers: 0
12
+
13
+ model:
14
+ type: "simple"
15
+ in_channels: 3
16
+ out_ch: 3
17
+ ch: 128
18
+ ch_mult: [1, 1, 2, 2, 4, 4]
19
+ num_res_blocks: 2
20
+ attn_resolutions: [16, ]
21
+ dropout: 0.0
22
+ var_type: fixedsmall
23
+ ema_rate: 0.999
24
+ ema: True
25
+ resamp_with_conv: True
26
+
27
+ diffusion:
28
+ beta_schedule: linear
29
+ beta_start: 0.0001
30
+ beta_end: 0.02
31
+ num_diffusion_timesteps: 1000
32
+
33
+ sampling:
34
+ batch_size: 4
35
+ last_only: True
configs/bedroom.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ dataset: "LSUN"
3
+ category: "bedroom"
4
+ image_size: 256
5
+ channels: 3
6
+ logit_transform: false
7
+ uniform_dequantization: false
8
+ gaussian_dequantization: false
9
+ random_flip: true
10
+ rescaled: true
11
+ num_workers: 0
12
+
13
+ model:
14
+ type: "simple"
15
+ in_channels: 3
16
+ out_ch: 3
17
+ ch: 128
18
+ ch_mult: [1, 1, 2, 2, 4, 4]
19
+ num_res_blocks: 2
20
+ attn_resolutions: [16, ]
21
+ dropout: 0.0
22
+ var_type: fixedsmall
23
+ ema_rate: 0.999
24
+ ema: True
25
+ resamp_with_conv: True
26
+
27
+ diffusion:
28
+ beta_schedule: linear
29
+ beta_start: 0.0001
30
+ beta_end: 0.02
31
+ num_diffusion_timesteps: 1000
32
+
33
+ sampling:
34
+ batch_size: 4
35
+ last_only: True
configs/celeba.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ dataset: "CelebA_HQ"
3
+ category: "CelebA_HQ"
4
+ image_size: 256
5
+ channels: 3
6
+ logit_transform: false
7
+ uniform_dequantization: false
8
+ gaussian_dequantization: false
9
+ random_flip: true
10
+ rescaled: true
11
+ num_workers: 0
12
+
13
+ model:
14
+ type: "simple"
15
+ in_channels: 3
16
+ out_ch: 3
17
+ ch: 128
18
+ ch_mult: [1, 1, 2, 2, 4, 4]
19
+ num_res_blocks: 2
20
+ attn_resolutions: [16, ]
21
+ dropout: 0.0
22
+ var_type: fixedsmall
23
+ ema_rate: 0.999
24
+ ema: True
25
+ resamp_with_conv: True
26
+
27
+ diffusion:
28
+ beta_schedule: linear
29
+ beta_start: 0.0001
30
+ beta_end: 0.02
31
+ num_diffusion_timesteps: 1000
32
+
33
+ sampling:
34
+ batch_size: 4
35
+ last_only: True
configs/church.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ dataset: "LSUN"
3
+ category: "church_outdoor"
4
+ image_size: 256
5
+ channels: 3
6
+ logit_transform: false
7
+ uniform_dequantization: false
8
+ gaussian_dequantization: false
9
+ random_flip: true
10
+ rescaled: true
11
+ num_workers: 0
12
+
13
+ model:
14
+ type: "simple"
15
+ in_channels: 3
16
+ out_ch: 3
17
+ ch: 128
18
+ ch_mult: [1, 1, 2, 2, 4, 4]
19
+ num_res_blocks: 2
20
+ attn_resolutions: [16, ]
21
+ dropout: 0.0
22
+ var_type: fixedsmall
23
+ ema_rate: 0.999
24
+ ema: True
25
+ resamp_with_conv: True
26
+
27
+ diffusion:
28
+ beta_schedule: linear
29
+ beta_start: 0.0001
30
+ beta_end: 0.02
31
+ num_diffusion_timesteps: 1000
32
+
33
+ sampling:
34
+ batch_size: 4
35
+ last_only: True
configs/imagenet.yml ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ dataset: "IMAGENET"
3
+ category: "IMAGENET"
4
+ image_size: 512
5
+ channels: 3
6
+ logit_transform: false
7
+ uniform_dequantization: false
8
+ gaussian_dequantization: false
9
+ random_flip: true
10
+ rescaled: true
11
+ num_workers: 0
12
+
13
+ model:
14
+ type: "simple"
15
+ in_channels: 3
16
+ out_ch: 3
17
+ ch: 128
18
+ ch_mult: [1, 1, 2, 2, 4, 4]
19
+ num_res_blocks: 2
20
+ attn_resolutions: [16, ]
21
+ dropout: 0.0
22
+ var_type: fixedsmall
23
+ ema_rate: 0.999
24
+ ema: True
25
+ resamp_with_conv: True
26
+
27
+ diffusion:
28
+ beta_schedule: linear
29
+ beta_start: 0.0001
30
+ beta_end: 0.02
31
+ num_diffusion_timesteps: 1000
32
+
33
+ sampling:
34
+ batch_size: 4
35
+ last_only: True
configs/paths_config.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ DATASET_PATHS = {
2
+ 'FFHQ': '/n/fs/visualai-scr/Data/CelebA-HQ/',
3
+ 'CelebA_HQ': '/n/fs/visualai-scr/Data/CelebA-HQ/',
4
+ 'AFHQ': '/n/fs/visualai-scr/Data/AFHQ-Dog/',
5
+ 'LSUN': '/n/fs/yz-diff/dataset/',
6
+ 'IMAGENET': 'data/imagenet/',
7
+ }
8
+
9
+ MODEL_PATHS = {
10
+ 'AFHQ': "pretrained/afhqdog_p2.pt",
11
+ 'FFHQ': "pretrained/ffhq_10m.pt",
12
+ 'ir_se50': 'pretrained/model_ir_se50.pth',
13
+ 'IMAGENET': "pretrained/512x512_diffusion.pt",
14
+ 'shape_predictor': "pretrained/shape_predictor_68_face_landmarks.dat.bz2",
15
+ }
16
+
17
+
18
+ HYBRID_MODEL_PATHS = [
19
+ './checkpoint/human_face/curly_hair_t401.pth',
20
+ './checkpoint/human_face/with_makeup_t401.pth',
21
+ ]
22
+
23
+ HYBRID_CONFIG = \
24
+ { 300: [0.4, 0.6, 0],
25
+ 0: [0.15, 0.15, 0.7]}
data_download.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Modified version of download.sh in https://github.com/naver-ai/StyleMapGAN
3
+ """
4
+
5
+ DATASET=$1
6
+ BASE_DIR=$2
7
+
8
+ if [ $DATASET == "celeba_hq" ]; then
9
+ URL="https://docs.google.com/uc?export=download&id=1R72NB79CX0MpnmWSli2SMu-Wp-M0xI-o"
10
+ DATASET_FOLDER="/n/fs/visualai-scr/Data/CelebA-HQ"
11
+ ZIP_FILE=$DATASET_FOLDER/celeba_hq_raw.zip
12
+ elif [ $DATASET == "afhq" ]; then
13
+ URL="https://docs.google.com/uc?export=download&id=1Pf4f6Y27lQX9y9vjeSQnoOQntw_ln7il"
14
+ DATASET_FOLDER="./data/afhq"
15
+ ZIP_FILE=$DATASET_FOLDER/afhq_raw.zip
16
+ else
17
+ echo "Unknown DATASET"
18
+ exit 1
19
+ fi
20
+ mkdir -p $DATASET_FOLDER
21
+
22
+ # wget --no-check-certificate -r $URL -O $ZIP_FILE
23
+
24
+ # wget --load-cookies ~/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies ~/cookies.txt --keep-session-cookies --no-check-certificate $URL -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1R72NB79CX0MpnmWSli2SMu-Wp-M0xI-o" -O $ZIP_FILE && rm -rf ~/cookies.txt
25
+ # unzip $ZIP_FILE -d $DATASET_FOLDER
26
+ # rm $ZIP_FILE
27
+
28
+ # raw images to LMDB format
29
+ TARGET_SIZE=256,1024
30
+ for DATASET_TYPE in "train" "test" "val"; do
31
+ python utils/prepare_lmdb_data.py --out $DATASET_FOLDER/LMDB_$DATASET_TYPE --size $TARGET_SIZE $DATASET_FOLDER/raw_images/$DATASET_TYPE --attr gender
32
+ done
33
+
34
+
35
+
36
+
37
+ wget --load-cookies ~/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies ~/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1R72NB79CX0MpnmWSli2SMu-Wp-M0xI-o' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1R72NB79CX0MpnmWSli2SMu-Wp-M0xI-o" -O a.zip && rm -rf ~/cookies.txt
38
+
datasets/AFHQ_dataset.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from glob import glob
3
+ import os
4
+ from torch.utils.data import Dataset
5
+ import torchvision.transforms as tfs
6
+
7
+ class AFHQ_dataset(Dataset):
8
+ def __init__(self, image_root, transform=None, mode='train', animal_class='dog', img_size=256):
9
+ super().__init__()
10
+ self.image_paths = glob(os.path.join(image_root, mode, animal_class, '*.jpg'))
11
+ self.transform = transform
12
+ self.img_size = img_size
13
+
14
+ def __getitem__(self, index):
15
+ image_path = self.image_paths[index]
16
+ x = Image.open(image_path)
17
+ x = x.resize((self.img_size, self.img_size))
18
+ if self.transform is not None:
19
+ x = self.transform(x)
20
+ return x
21
+
22
+ def __len__(self):
23
+ return len(self.image_paths)
24
+
25
+
26
+ ################################################################################
27
+
28
+ def get_afhq_dataset(data_root, config):
29
+ train_transform = tfs.Compose([tfs.ToTensor(),
30
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
31
+ inplace=True)])
32
+
33
+ test_transform = tfs.Compose([tfs.ToTensor(),
34
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
35
+ inplace=True)])
36
+
37
+ train_dataset = AFHQ_dataset(data_root, transform=train_transform, mode='train', animal_class='dog',
38
+ img_size=config.data.image_size)
39
+ test_dataset = AFHQ_dataset(data_root, transform=test_transform, mode='val', animal_class='dog',
40
+ img_size=config.data.image_size)
41
+
42
+ return train_dataset, test_dataset
datasets/CelebA_HQ_dataset.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import lmdb
3
+ from io import BytesIO
4
+ from PIL import Image
5
+ import torchvision.transforms as tfs
6
+ import os
7
+
8
+ class MultiResolutionDataset(Dataset):
9
+ def __init__(self, path, transform, resolution=256):
10
+ self.env = lmdb.open(
11
+ path,
12
+ max_readers=32,
13
+ readonly=True,
14
+ lock=False,
15
+ readahead=False,
16
+ meminit=False,
17
+ # attribute=,
18
+ )
19
+
20
+ if not self.env:
21
+ raise IOError("Cannot open lmdb dataset", path)
22
+
23
+ with self.env.begin(write=False) as txn:
24
+ self.length = int(txn.get("length".encode("utf-8")).decode("utf-8"))
25
+
26
+ self.resolution = resolution
27
+ self.transform = transform
28
+
29
+ attr_file_path = '/n/fs/yz-diff/inversion/list_attr_celeba.txt'
30
+ self.labels = file_to_list(attr_file_path)
31
+
32
+
33
+ def __len__(self):
34
+ return self.length
35
+
36
+ def __getitem__(self, index):
37
+ with self.env.begin(write=False) as txn:
38
+ key = f"{self.resolution}-{str(index).zfill(5)}".encode("utf-8")
39
+ key_label = f"{str(index).zfill(5)}".encode("utf-8")
40
+ print("check key:", key, key_label)
41
+ img_bytes = txn.get(key)
42
+ img_id = int(txn.get(key_label).decode("utf-8"))
43
+
44
+ buffer = BytesIO(img_bytes)
45
+ img = Image.open(buffer)
46
+ img = self.transform(img)
47
+
48
+ attr_label = self.labels[img_id-1].split()
49
+ # map the attr to the index position
50
+ label = int(attr_label[32])
51
+ print("check img_id and label:", img_id, label)
52
+
53
+
54
+ return img, label
55
+
56
+
57
+ ################################################################################
58
+
59
+ def get_celeba_dataset(data_root, config):
60
+ train_transform = tfs.Compose([tfs.ToTensor(),
61
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
62
+ inplace=True)])
63
+
64
+ test_transform = tfs.Compose([tfs.ToTensor(),
65
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
66
+ inplace=True)])
67
+
68
+ train_dataset = MultiResolutionDataset(os.path.join(data_root, 'LMDB_train'),
69
+ train_transform, config.data.image_size)
70
+ test_dataset = MultiResolutionDataset(os.path.join(data_root, 'LMDB_test'),
71
+ test_transform, config.data.image_size)
72
+
73
+
74
+ return train_dataset, test_dataset
75
+
76
+
77
+
78
+ def file_to_list(filename):
79
+ with open(filename, encoding='utf-8') as f:
80
+ files = f.readlines()
81
+ files = [f.rstrip() for f in files]
82
+ return files
83
+
datasets/CelebA_HQ_dataset_with_label.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch.utils.data import Dataset
2
+ import lmdb
3
+ from io import BytesIO
4
+ from PIL import Image
5
+ import torchvision.transforms as tfs
6
+ import os
7
+
8
+ class MultiResolutionDataset(Dataset):
9
+ def __init__(self, path, transform, resolution=256):
10
+ self.env = lmdb.open(
11
+ path,
12
+ max_readers=32,
13
+ readonly=True,
14
+ lock=False,
15
+ readahead=False,
16
+ meminit=False,
17
+ )
18
+
19
+ if not self.env:
20
+ raise IOError("Cannot open lmdb dataset", path)
21
+
22
+ with self.env.begin(write=False) as txn:
23
+ self.length = int(txn.get("length".encode("utf-8")).decode("utf-8"))
24
+
25
+ self.resolution = resolution
26
+ self.transform = transform
27
+
28
+ def __len__(self):
29
+ return self.length
30
+
31
+ def __getitem__(self, index):
32
+ with self.env.begin(write=False) as txn:
33
+ key = f"{self.resolution}-{str(index).zfill(5)}".encode("utf-8")
34
+ img_bytes = txn.get(key)
35
+
36
+ buffer = BytesIO(img_bytes)
37
+ img = Image.open(buffer)
38
+ img = self.transform(img)
39
+
40
+ return img
41
+
42
+
43
+ ################################################################################
44
+
45
+ def get_celeba_dataset(data_root, config):
46
+ train_transform = tfs.Compose([tfs.ToTensor(),
47
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
48
+ inplace=True)])
49
+
50
+ test_transform = tfs.Compose([tfs.ToTensor(),
51
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
52
+ inplace=True)])
53
+
54
+ train_dataset = MultiResolutionDataset(os.path.join(data_root, 'LMDB_train'),
55
+ train_transform, config.data.image_size)
56
+ test_dataset = MultiResolutionDataset(os.path.join(data_root, 'LMDB_test'),
57
+ test_transform, config.data.image_size)
58
+
59
+
60
+ return train_dataset, test_dataset
61
+
62
+
63
+
datasets/IMAGENET_dataset.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from glob import glob
3
+ import os
4
+ from torch.utils.data import Dataset
5
+ import math
6
+ import numpy as np
7
+ import random
8
+ from .imagenet_dic import IMAGENET_DIC
9
+
10
+ def get_imagenet_dataset(data_root, config, class_num=None, random_crop=True, random_flip=False):
11
+ train_dataset = IMAGENET_dataset(data_root, mode='train', class_num=class_num, img_size=config.data.image_size,
12
+ random_crop=random_crop, random_flip=random_flip)
13
+ test_dataset = IMAGENET_dataset(data_root, mode='val', class_num=class_num, img_size=config.data.image_size,
14
+ random_crop=random_crop, random_flip=random_flip)
15
+
16
+ return train_dataset, test_dataset
17
+
18
+
19
+ ###################################################################
20
+
21
+
22
+ class IMAGENET_dataset(Dataset):
23
+ def __init__(self, image_root, mode='val', class_num=None, img_size=512, random_crop=True, random_flip=False):
24
+ super().__init__()
25
+ if class_num is not None:
26
+ self.data_dir = os.path.join(image_root, mode, IMAGENET_DIC[str(class_num)][0], '*.JPEG')
27
+ self.image_paths = sorted(glob(self.data_dir))
28
+ else:
29
+ self.data_dir = os.path.join(image_root, mode, '*', '*.JPEG')
30
+ self.image_paths = sorted(glob(self.data_dir))
31
+ self.img_size = img_size
32
+ self.random_crop = random_crop
33
+ self.random_flip = random_flip
34
+ self.class_num = class_num
35
+
36
+ def __getitem__(self, index):
37
+ f = self.image_paths[index]
38
+ pil_image = Image.open(f)
39
+ pil_image.load()
40
+ pil_image = pil_image.convert("RGB")
41
+
42
+ if self.random_crop:
43
+ arr = random_crop_arr(pil_image, self.img_size)
44
+ else:
45
+ arr = center_crop_arr(pil_image, self.img_size)
46
+
47
+ if self.random_flip and random.random() < 0.5:
48
+ arr = arr[:, ::-1]
49
+
50
+ arr = arr.astype(np.float32) / 127.5 - 1
51
+
52
+ # y = [self.class_num, IMAGENET_DIC[str(self.class_num)][0], IMAGENET_DIC[str(self.class_num)][1]]
53
+ # y = self.class_num
54
+
55
+ return np.transpose(arr, [2, 0, 1])#, y
56
+
57
+ def __len__(self):
58
+ return len(self.image_paths)
59
+
60
+
61
+ def center_crop_arr(pil_image, image_size):
62
+ # We are not on a new enough PIL to support the `reducing_gap`
63
+ # argument, which uses BOX downsampling at powers of two first.
64
+ # Thus, we do it by hand to improve downsample quality.
65
+ while min(*pil_image.size) >= 2 * image_size:
66
+ pil_image = pil_image.resize(
67
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
68
+ )
69
+
70
+ scale = image_size / min(*pil_image.size)
71
+ pil_image = pil_image.resize(
72
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
73
+ )
74
+
75
+ arr = np.array(pil_image)
76
+ crop_y = (arr.shape[0] - image_size) // 2
77
+ crop_x = (arr.shape[1] - image_size) // 2
78
+ return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
79
+
80
+
81
+ def random_crop_arr(pil_image, image_size, min_crop_frac=0.8, max_crop_frac=1.0):
82
+ min_smaller_dim_size = math.ceil(image_size / max_crop_frac)
83
+ max_smaller_dim_size = math.ceil(image_size / min_crop_frac)
84
+ smaller_dim_size = random.randrange(min_smaller_dim_size, max_smaller_dim_size + 1)
85
+
86
+ # We are not on a new enough PIL to support the `reducing_gap`
87
+ # argument, which uses BOX downsampling at powers of two first.
88
+ # Thus, we do it by hand to improve downsample quality.
89
+ while min(*pil_image.size) >= 2 * smaller_dim_size:
90
+ pil_image = pil_image.resize(
91
+ tuple(x // 2 for x in pil_image.size), resample=Image.BOX
92
+ )
93
+
94
+ scale = smaller_dim_size / min(*pil_image.size)
95
+ pil_image = pil_image.resize(
96
+ tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
97
+ )
98
+
99
+ arr = np.array(pil_image)
100
+ crop_y = random.randrange(arr.shape[0] - image_size + 1)
101
+ crop_x = random.randrange(arr.shape[1] - image_size + 1)
102
+ return arr[crop_y: crop_y + image_size, crop_x: crop_x + image_size]
datasets/LSUN_dataset.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os.path
2
+ from collections.abc import Iterable
3
+ from torchvision.datasets.utils import verify_str_arg, iterable_to_str
4
+
5
+
6
+ from PIL import Image
7
+ import io
8
+ import pickle
9
+ import os
10
+ import torch
11
+ import torch.utils.data as data
12
+ import torchvision.transforms as tfs
13
+
14
+ class VisionDataset(data.Dataset):
15
+ _repr_indent = 4
16
+
17
+ def __init__(self, root, transforms=None, transform=None, target_transform=None):
18
+ if isinstance(root, torch._six.string_classes):
19
+ root = os.path.expanduser(root)
20
+ self.root = root
21
+
22
+ has_transforms = transforms is not None
23
+ has_separate_transform = transform is not None or target_transform is not None
24
+ if has_transforms and has_separate_transform:
25
+ raise ValueError("Only transforms or transform/target_transform can "
26
+ "be passed as argument")
27
+
28
+ # for backwards-compatibility
29
+ self.transform = transform
30
+ self.target_transform = target_transform
31
+
32
+ if has_separate_transform:
33
+ transforms = StandardTransform(transform, target_transform)
34
+ self.transforms = transforms
35
+
36
+ def __getitem__(self, index):
37
+ raise NotImplementedError
38
+
39
+ def __len__(self):
40
+ raise NotImplementedError
41
+
42
+ def __repr__(self):
43
+ head = "Dataset " + self.__class__.__name__
44
+ body = ["Number of datapoints: {}".format(self.__len__())]
45
+ if self.root is not None:
46
+ body.append("Root location: {}".format(self.root))
47
+ body += self.extra_repr().splitlines()
48
+ if hasattr(self, 'transform') and self.transform is not None:
49
+ body += self._format_transform_repr(self.transform,
50
+ "Transforms: ")
51
+ if hasattr(self, 'target_transform') and self.target_transform is not None:
52
+ body += self._format_transform_repr(self.target_transform,
53
+ "Target transforms: ")
54
+ lines = [head] + [" " * self._repr_indent + line for line in body]
55
+ return '\n'.join(lines)
56
+
57
+ def _format_transform_repr(self, transform, head):
58
+ lines = transform.__repr__().splitlines()
59
+ return (["{}{}".format(head, lines[0])] +
60
+ ["{}{}".format(" " * len(head), line) for line in lines[1:]])
61
+
62
+ def extra_repr(self):
63
+ return ""
64
+
65
+
66
+ class StandardTransform(object):
67
+ def __init__(self, transform=None, target_transform=None):
68
+ self.transform = transform
69
+ self.target_transform = target_transform
70
+
71
+ def __call__(self, input, target):
72
+ if self.transform is not None:
73
+ input = self.transform(input)
74
+ if self.target_transform is not None:
75
+ target = self.target_transform(target)
76
+ return input, target
77
+
78
+ def _format_transform_repr(self, transform, head):
79
+ lines = transform.__repr__().splitlines()
80
+ return (["{}{}".format(head, lines[0])] +
81
+ ["{}{}".format(" " * len(head), line) for line in lines[1:]])
82
+
83
+ def __repr__(self):
84
+ body = [self.__class__.__name__]
85
+ if self.transform is not None:
86
+ body += self._format_transform_repr(self.transform,
87
+ "Transform: ")
88
+ if self.target_transform is not None:
89
+ body += self._format_transform_repr(self.target_transform,
90
+ "Target transform: ")
91
+
92
+ return '\n'.join(body)
93
+
94
+ ################################################################
95
+
96
+ class LSUNClass(VisionDataset):
97
+ def __init__(self, root, transform=None, target_transform=None):
98
+ import lmdb
99
+
100
+ super(LSUNClass, self).__init__(
101
+ root, transform=transform, target_transform=target_transform
102
+ )
103
+
104
+ self.env = lmdb.open(
105
+ root,
106
+ max_readers=1,
107
+ readonly=True,
108
+ lock=False,
109
+ readahead=False,
110
+ meminit=False,
111
+ )
112
+ with self.env.begin(write=False) as txn:
113
+ self.length = txn.stat()["entries"]
114
+ root_split = root.split("/")
115
+ cache_file = os.path.join("/".join(root_split[:-1]), f"_cache_{root_split[-1]}")
116
+ if os.path.isfile(cache_file):
117
+ self.keys = pickle.load(open(cache_file, "rb"))
118
+ else:
119
+ with self.env.begin(write=False) as txn:
120
+ self.keys = [key for key, _ in txn.cursor()]
121
+ pickle.dump(self.keys, open(cache_file, "wb"))
122
+
123
+ def __getitem__(self, index):
124
+ img, target = None, None
125
+ env = self.env
126
+ with env.begin(write=False) as txn:
127
+ imgbuf = txn.get(self.keys[index])
128
+
129
+ buf = io.BytesIO()
130
+ buf.write(imgbuf)
131
+ buf.seek(0)
132
+ img = Image.open(buf).convert("RGB")
133
+
134
+ if self.transform is not None:
135
+ img = self.transform(img)
136
+
137
+ if self.target_transform is not None:
138
+ target = self.target_transform(target)
139
+
140
+ return img, target
141
+
142
+ def __len__(self):
143
+ return self.length
144
+
145
+
146
+
147
+ class LSUN(VisionDataset):
148
+ """
149
+ `LSUN <https://www.yf.io/p/lsun>`_ dataset.
150
+
151
+ Args:
152
+ root (string): Root directory for the database files.
153
+ classes (string or list): One of {'train', 'val', 'test'} or a list of
154
+ categories to load. e,g. ['bedroom_train', 'church_outdoor_train'].
155
+ transform (callable, optional): A function/transform that takes in an PIL image
156
+ and returns a transformed version. E.g, ``transforms.RandomCrop``
157
+ target_transform (callable, optional): A function/transform that takes in the
158
+ target and transforms it.
159
+ """
160
+
161
+ def __init__(self, root, classes="train", transform=None, target_transform=None):
162
+ super(LSUN, self).__init__(
163
+ root, transform=transform, target_transform=target_transform
164
+ )
165
+ self.classes = self._verify_classes(classes)
166
+
167
+ # for each class, create an LSUNClassDataset
168
+ self.dbs = []
169
+ for c in self.classes:
170
+ self.dbs.append(
171
+ LSUNClass(root=root + "/" + c + "_lmdb", transform=transform)
172
+ )
173
+
174
+ self.indices = []
175
+ count = 0
176
+ for db in self.dbs:
177
+ count += len(db)
178
+ self.indices.append(count)
179
+
180
+ self.length = count
181
+
182
+ def _verify_classes(self, classes):
183
+ categories = [
184
+ "bedroom",
185
+ "bridge",
186
+ "church_outdoor",
187
+ "classroom",
188
+ "conference_room",
189
+ "dining_room",
190
+ "kitchen",
191
+ "living_room",
192
+ "restaurant",
193
+ "tower",
194
+ ]
195
+ dset_opts = ["train", "val", "test"]
196
+
197
+ try:
198
+ verify_str_arg(classes, "classes", dset_opts)
199
+ if classes == "test":
200
+ classes = [classes]
201
+ else:
202
+ classes = [c + "_" + classes for c in categories]
203
+ except ValueError:
204
+ if not isinstance(classes, Iterable):
205
+ msg = (
206
+ "Expected type str or Iterable for argument classes, "
207
+ "but got type {}."
208
+ )
209
+ raise ValueError(msg.format(type(classes)))
210
+
211
+ classes = list(classes)
212
+ msg_fmtstr = (
213
+ "Expected type str for elements in argument classes, "
214
+ "but got type {}."
215
+ )
216
+ for c in classes:
217
+ verify_str_arg(c, custom_msg=msg_fmtstr.format(type(c)))
218
+ c_short = c.split("_")
219
+ category, dset_opt = "_".join(c_short[:-1]), c_short[-1]
220
+
221
+ msg_fmtstr = "Unknown value '{}' for {}. Valid values are {{{}}}."
222
+ msg = msg_fmtstr.format(
223
+ category, "LSUN class", iterable_to_str(categories)
224
+ )
225
+ verify_str_arg(category, valid_values=categories, custom_msg=msg)
226
+
227
+ msg = msg_fmtstr.format(dset_opt, "postfix", iterable_to_str(dset_opts))
228
+ verify_str_arg(dset_opt, valid_values=dset_opts, custom_msg=msg)
229
+
230
+ return classes
231
+
232
+ def __getitem__(self, index):
233
+ """
234
+ Args:
235
+ index (int): Index
236
+
237
+ Returns:
238
+ tuple: Tuple (image, target) where target is the index of the target category.
239
+ """
240
+ target = 0
241
+ sub = 0
242
+ for ind in self.indices:
243
+ if index < ind:
244
+ break
245
+ target += 1
246
+ sub = ind
247
+
248
+ db = self.dbs[target]
249
+ index = index - sub
250
+
251
+ if self.target_transform is not None:
252
+ target = self.target_transform(target)
253
+
254
+ img, _ = db[index]
255
+ return img  # , target
256
+
257
+ def __len__(self):
258
+ return self.length
259
+
260
+ def extra_repr(self):
261
+ return "Classes: {classes}".format(**self.__dict__)
262
+
263
+
264
+
265
+
266
+
267
+ ################################################################
268
+
269
+ def get_lsun_dataset(data_root, config):
270
+
271
+ train_folder = "{}_train".format(config.data.category)
272
+ val_folder = "{}_val".format(config.data.category)
273
+
274
+ train_dataset = LSUN(
275
+ root=os.path.join(data_root),
276
+ classes=[train_folder],
277
+ transform=tfs.Compose(
278
+ [
279
+ tfs.Resize(config.data.image_size),
280
+ tfs.CenterCrop(config.data.image_size),
281
+ tfs.ToTensor(),
282
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
283
+ inplace=True)
284
+ ]
285
+ ),
286
+ )
287
+
288
+ test_dataset = LSUN(
289
+ root=os.path.join(data_root),
290
+ classes=[val_folder],
291
+ transform=tfs.Compose(
292
+ [
293
+ tfs.Resize(config.data.image_size),
294
+ tfs.CenterCrop(config.data.image_size),
295
+ tfs.ToTensor(),
296
+ tfs.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5),
297
+ inplace=True)
298
+ ,
299
+ ]
300
+ ),
301
+ )
302
+
303
+
304
+ return train_dataset, test_dataset
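For reference, a minimal usage sketch of the loader above (illustrative, not part of the uploaded files). The data-root path is a placeholder, and the stand-in config only mimics the two fields this function reads (config.data.category and config.data.image_size); the LMDB folders <category>_train_lmdb and <category>_val_lmdb must already exist under the root.

from types import SimpleNamespace

# Stand-in config with only the fields get_lsun_dataset touches (assumption).
config = SimpleNamespace(data=SimpleNamespace(category="church_outdoor", image_size=256))
train_ds, val_ds = get_lsun_dataset("/path/to/lsun", config)   # placeholder path
img = train_ds[0]               # a 3 x 256 x 256 tensor normalized to [-1, 1]
print(len(train_ds), img.shape)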
datasets/celeba_attr.txt ADDED
@@ -0,0 +1,40 @@
1
+ 5_o_Clock_Shadow
2
+ Arched_Eyebrows
3
+ Attractive
4
+ Bags_Under_Eyes
5
+ Bald
6
+ Bangs
7
+ Big_Lips
8
+ Big_Nose
9
+ Black_Hair
10
+ Blond_Hair
11
+ Blurry
12
+ Brown_Hair
13
+ Bushy_Eyebrows
14
+ Chubby
15
+ Double_Chin
16
+ Eyeglasses
17
+ Goatee
18
+ Gray_Hair
19
+ Heavy_Makeup
20
+ High_Cheekbones
21
+ Male
22
+ Mouth_Slightly_Open
23
+ Mustache
24
+ Narrow_Eyes
25
+ No_Beard
26
+ Oval_Face
27
+ Pale_Skin
28
+ Pointy_Nose
29
+ Receding_Hairline
30
+ Rosy_Cheeks
31
+ Sideburns
32
+ Smiling
33
+ Straight_Hair
34
+ Wavy_Hair
35
+ Wearing_Earrings
36
+ Wearing_Hat
37
+ Wearing_Lipstick
38
+ Wearing_Necklace
39
+ Wearing_Necktie
40
+ Young
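The file above lists the 40 standard CelebA attribute names, one per line in canonical order. A hypothetical reader (how the repository consumes this file is not shown in this commit):

# Hypothetical: 0-based attribute index -> name, e.g. index 31 -> "Smiling".
with open("datasets/celeba_attr.txt") as f:
    CELEBA_ATTRS = [line.strip() for line in f if line.strip()]
assert len(CELEBA_ATTRS) == 40 and CELEBA_ATTRS[31] == "Smiling"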
datasets/data_utils.py ADDED
@@ -0,0 +1,44 @@
1
+ from .AFHQ_dataset import get_afhq_dataset
2
+ from .CelebA_HQ_dataset import get_celeba_dataset
3
+ from .LSUN_dataset import get_lsun_dataset
4
+ from torch.utils.data import DataLoader
5
+ from .IMAGENET_dataset import get_imagenet_dataset
6
+
7
+ def get_dataset(dataset_type, dataset_paths, config, target_class_num=None, gender=None):
8
+ if dataset_type == 'AFHQ':
9
+ train_dataset, test_dataset = get_afhq_dataset(dataset_paths['AFHQ'], config)
10
+ elif dataset_type == "LSUN":
11
+ train_dataset, test_dataset = get_lsun_dataset(dataset_paths['LSUN'], config)
12
+ elif dataset_type == "CelebA_HQ":
13
+ train_dataset, test_dataset = get_celeba_dataset(dataset_paths['CelebA_HQ'], config)
14
+ elif dataset_type == "IMAGENET":
15
+ train_dataset, test_dataset = get_imagenet_dataset(dataset_paths['IMAGENET'], config, class_num=target_class_num)
16
+ else:
17
+ raise ValueError
18
+
19
+ return train_dataset, test_dataset
20
+
21
+
22
+ def get_dataloader(train_dataset, test_dataset, bs_train=1, num_workers=0):
23
+ train_loader = DataLoader(
24
+ train_dataset,
25
+ batch_size=bs_train,
26
+ drop_last=True,
27
+ shuffle=True,
28
+ sampler=None,
29
+ num_workers=num_workers,
30
+ pin_memory=True,
31
+ )
32
+ test_loader = DataLoader(
33
+ test_dataset,
34
+ batch_size=1,
35
+ drop_last=True,
36
+ sampler=None,
37
+ shuffle=True,
38
+ num_workers=num_workers,
39
+ pin_memory=True,
40
+ )
41
+
42
+ return {'train': train_loader, 'test': test_loader}
43
+
44
+
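A short, hypothetical example of wiring the two helpers together; the dataset_paths entry and the config object are stand-ins for the project's real configuration files, which are defined elsewhere in the repository.

dataset_paths = {'CelebA_HQ': '/path/to/celeba_hq'}           # placeholder path
train_ds, test_ds = get_dataset('CelebA_HQ', dataset_paths, config)
loaders = get_dataloader(train_ds, test_ds, bs_train=4, num_workers=2)
batch = next(iter(loaders['train']))                          # one shuffled training batch of size 4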
datasets/imagenet_dic.py ADDED
@@ -0,0 +1,408 @@
1
+ IMAGENET_DIC = {"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"],
2
+ "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"],
3
+ "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"],
4
+ "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"],
5
+ "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"],
6
+ "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"],
7
+ "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"],
8
+ "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"],
9
+ "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"],
10
+ "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"],
11
+ "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"],
12
+ "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"],
13
+ "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"],
14
+ "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"],
15
+ "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"],
16
+ "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"],
17
+ "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"],
18
+ "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"],
19
+ "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"],
20
+ "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"],
21
+ "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"],
22
+ "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"],
23
+ "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"],
24
+ "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"],
25
+ "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"],
26
+ "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"],
27
+ "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"],
28
+ "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"],
29
+ "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"],
30
+ "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"],
31
+ "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"],
32
+ "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"],
33
+ "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"],
34
+ "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"],
35
+ "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"],
36
+ "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"],
37
+ "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"],
38
+ "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"],
39
+ "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"],
40
+ "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"],
41
+ "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"],
42
+ "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"],
43
+ "99": ["n01855672", "goose"], "100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"],
44
+ "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"],
45
+ "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"],
46
+ "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"],
47
+ "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"],
48
+ "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"],
49
+ "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"],
50
+ "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"],
51
+ "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"],
52
+ "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"],
53
+ "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"],
54
+ "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"],
55
+ "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"],
56
+ "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"],
57
+ "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"],
58
+ "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"],
59
+ "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"],
60
+ "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"],
61
+ "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"],
62
+ "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"],
63
+ "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"],
64
+ "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"],
65
+ "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"],
66
+ "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"],
67
+ "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"],
68
+ "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"],
69
+ "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"],
70
+ "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"],
71
+ "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"],
72
+ "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"],
73
+ "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"],
74
+ "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"],
75
+ "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"],
76
+ "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"],
77
+ "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"],
78
+ "179": ["n02093256", "Staffordshire_bullterrier"],
79
+ "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"],
80
+ "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"],
81
+ "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"],
82
+ "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"],
83
+ "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"],
84
+ "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"],
85
+ "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"],
86
+ "194": ["n02096437", "Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"],
87
+ "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"],
88
+ "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"],
89
+ "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"],
90
+ "202": ["n02098105", "soft-coated_wheaten_terrier"],
91
+ "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"],
92
+ "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"],
93
+ "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"],
94
+ "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"],
95
+ "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"],
96
+ "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"],
97
+ "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"],
98
+ "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"],
99
+ "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"],
100
+ "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"],
101
+ "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"],
102
+ "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"],
103
+ "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"],
104
+ "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"],
105
+ "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"],
106
+ "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"],
107
+ "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"],
108
+ "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"],
109
+ "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"],
110
+ "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"],
111
+ "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"],
112
+ "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"],
113
+ "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"],
114
+ "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"],
115
+ "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"],
116
+ "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"],
117
+ "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"],
118
+ "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"],
119
+ "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"],
120
+ "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"],
121
+ "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"],
122
+ "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"],
123
+ "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"],
124
+ "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"],
125
+ "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"],
126
+ "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"],
127
+ "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"],
128
+ "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": ["n02127052", "lynx"],
129
+ "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"],
130
+ "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"],
131
+ "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"],
132
+ "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"],
133
+ "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"],
134
+ "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"],
135
+ "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"],
136
+ "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"],
137
+ "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"],
138
+ "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"],
139
+ "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"],
140
+ "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"],
141
+ "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"],
142
+ "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"],
143
+ "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"],
144
+ "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"],
145
+ "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"],
146
+ "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"],
147
+ "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"],
148
+ "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"],
149
+ "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"],
150
+ "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"],
151
+ "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"],
152
+ "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"],
153
+ "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"],
154
+ "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"],
155
+ "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"],
156
+ "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"],
157
+ "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"],
158
+ "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"],
159
+ "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"],
160
+ "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"],
161
+ "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"],
162
+ "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"],
163
+ "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"],
164
+ "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"],
165
+ "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"],
166
+ "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"],
167
+ "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"],
168
+ "389": ["n02514041", "barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"],
169
+ "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"],
170
+ "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"],
171
+ "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"],
172
+ "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"],
173
+ "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"],
174
+ "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"],
175
+ "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"],
176
+ "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"],
177
+ "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"],
178
+ "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"],
179
+ "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"],
180
+ "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"],
181
+ "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"],
182
+ "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"],
183
+ "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"],
184
+ "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"],
185
+ "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"],
186
+ "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"],
187
+ "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"],
188
+ "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"],
189
+ "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"],
190
+ "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"],
191
+ "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"],
192
+ "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"],
193
+ "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"],
194
+ "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"],
195
+ "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"],
196
+ "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"],
197
+ "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"],
198
+ "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"],
199
+ "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"],
200
+ "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"],
201
+ "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"],
202
+ "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"],
203
+ "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"],
204
+ "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"],
205
+ "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"],
206
+ "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"],
207
+ "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", "chest"],
208
+ "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"],
209
+ "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"],
210
+ "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"],
211
+ "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"],
212
+ "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"],
213
+ "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"],
214
+ "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"],
215
+ "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"],
216
+ "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"],
217
+ "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"],
218
+ "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"],
219
+ "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"],
220
+ "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"],
221
+ "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"],
222
+ "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"],
223
+ "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"],
224
+ "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"],
225
+ "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"],
226
+ "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"],
227
+ "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"],
228
+ "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"],
229
+ "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"],
230
+ "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"],
231
+ "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"],
232
+ "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"],
233
+ "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"],
234
+ "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"],
235
+ "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"],
236
+ "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"],
237
+ "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"],
238
+ "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"],
239
+ "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"],
240
+ "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"],
241
+ "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"],
242
+ "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"],
243
+ "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"],
244
+ "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"],
245
+ "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"],
246
+ "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"],
247
+ "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"],
248
+ "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"],
249
+ "591": ["n03485794", "handkerchief"], "592": ["n03492542", "hard_disc"],
250
+ "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"],
251
+ "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"],
252
+ "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"],
253
+ "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"],
254
+ "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"],
255
+ "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"],
256
+ "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"],
257
+ "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"],
258
+ "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"],
259
+ "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"],
260
+ "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"],
261
+ "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"],
262
+ "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"],
263
+ "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"],
264
+ "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"],
265
+ "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"],
266
+ "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"],
267
+ "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"],
268
+ "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"],
269
+ "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"],
270
+ "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"],
271
+ "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"],
272
+ "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"],
273
+ "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"],
274
+ "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"],
275
+ "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"],
276
+ "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"],
277
+ "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"],
278
+ "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"],
279
+ "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"],
280
+ "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"],
281
+ "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"],
282
+ "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"],
283
+ "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"],
284
+ "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"],
285
+ "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"],
286
+ "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"],
287
+ "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"],
288
+ "696": ["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"],
289
+ "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"],
290
+ "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"],
291
+ "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"],
292
+ "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"],
293
+ "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"],
294
+ "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"],
295
+ "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"],
296
+ "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"],
297
+ "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"],
298
+ "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"],
299
+ "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"],
300
+ "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"],
301
+ "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"],
302
+ "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"],
303
+ "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"],
304
+ "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"],
305
+ "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"],
306
+ "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"],
307
+ "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"],
308
+ "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"],
309
+ "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"],
310
+ "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"],
311
+ "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"],
312
+ "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"],
313
+ "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"],
314
+ "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"],
315
+ "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"],
316
+ "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"],
317
+ "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"],
318
+ "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"],
319
+ "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"],
320
+ "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"],
321
+ "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"],
322
+ "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"],
323
+ "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"],
324
+ "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"],
325
+ "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"],
326
+ "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"],
327
+ "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"],
328
+ "797": ["n04235860", "sleeping_bag"], "798": ["n04238763", "slide_rule"],
329
+ "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"],
330
+ "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"],
331
+ "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"],
332
+ "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"],
333
+ "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"],
334
+ "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"],
335
+ "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"],
336
+ "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"],
337
+ "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"],
338
+ "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"],
339
+ "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"],
340
+ "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"],
341
+ "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"],
342
+ "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"],
343
+ "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"],
344
+ "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"],
345
+ "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"],
346
+ "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"],
347
+ "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"],
348
+ "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"],
349
+ "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"],
350
+ "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"],
351
+ "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"],
352
+ "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"],
353
+ "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"],
354
+ "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"],
355
+ "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"],
356
+ "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"],
357
+ "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"],
358
+ "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"],
359
+ "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"],
360
+ "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"],
361
+ "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"],
362
+ "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"],
363
+ "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"],
364
+ "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"],
365
+ "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"],
366
+ "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"],
367
+ "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"],
368
+ "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"],
369
+ "900": ["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"],
370
+ "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"],
371
+ "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"],
372
+ "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"],
373
+ "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"],
374
+ "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"],
375
+ "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"],
376
+ "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"],
377
+ "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"],
378
+ "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"],
379
+ "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"],
380
+ "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"],
381
+ "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"],
382
+ "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"],
383
+ "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"],
384
+ "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"],
385
+ "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"],
386
+ "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"],
387
+ "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"],
388
+ "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"],
389
+ "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"],
390
+ "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"],
391
+ "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"],
392
+ "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"],
393
+ "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"],
394
+ "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"],
395
+ "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"],
396
+ "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"],
397
+ "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"],
398
+ "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"],
399
+ "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"],
400
+ "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"],
401
+ "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"],
402
+ "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"],
403
+ "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"],
404
+ "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"],
405
+ "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"],
406
+ "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"],
407
+ "999": ["n15075141", "toilet_tissue"]}
408
+
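IMAGENET_DIC maps each class index (stored as a string) to a [synset_id, human_readable_name] pair. Two illustrative lookups, not taken from the repo:

synset, name = IMAGENET_DIC["207"]                                   # ("n02099601", "golden_retriever")
idx = next(k for k, v in IMAGENET_DIC.items() if v[1] == "tiger")    # "292"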
imgs/img1.jpg ADDED
losses/clip_loss.py ADDED
@@ -0,0 +1,299 @@
1
+ import torch
2
+ import torchvision.transforms as transforms
3
+ import numpy as np
4
+
5
+ import clip
6
+ from PIL import Image
7
+
8
+ from utils.text_templates import imagenet_templates, part_templates, imagenet_templates_small
9
+
10
+
11
+ class DirectionLoss(torch.nn.Module):
12
+
13
+ def __init__(self, loss_type='mse'):
14
+ super(DirectionLoss, self).__init__()
15
+
16
+ self.loss_type = loss_type
17
+
18
+ self.loss_func = {
19
+ 'mse': torch.nn.MSELoss,
20
+ 'cosine': torch.nn.CosineSimilarity,
21
+ 'mae': torch.nn.L1Loss
22
+ }[loss_type]()
23
+
24
+ def forward(self, x, y):
25
+ if self.loss_type == "cosine":
26
+ return 1. - self.loss_func(x, y)
27
+
28
+ return self.loss_func(x, y)
29
+
30
+ class CLIPLoss(torch.nn.Module):
31
+ def __init__(self, device, lambda_direction=1., lambda_patch=0., lambda_global=0., lambda_manifold=0., lambda_texture=0., patch_loss_type='mae', direction_loss_type='cosine', clip_model='ViT-B/16'):
32
+ super(CLIPLoss, self).__init__()
33
+
34
+ self.device = device
35
+ self.model, clip_preprocess = clip.load(clip_model, device=self.device)
36
+
37
+ self.clip_preprocess = clip_preprocess
38
+
39
+ self.preprocess = transforms.Compose([transforms.Normalize(mean=[-1.0, -1.0, -1.0], std=[2.0, 2.0, 2.0])] + # Un-normalize from [-1.0, 1.0] (GAN output) to [0, 1].
40
+ clip_preprocess.transforms[:2] + # to match CLIP input scale assumptions
41
+ clip_preprocess.transforms[4:]) # + skip convert PIL to tensor
42
+
43
+ self.target_direction = None
44
+ self.patch_text_directions = None
45
+
46
+ self.patch_loss = DirectionLoss(patch_loss_type)
47
+ self.direction_loss = DirectionLoss(direction_loss_type)
48
+ self.patch_direction_loss = torch.nn.CosineSimilarity(dim=2)
49
+
50
+ self.lambda_global = lambda_global
51
+ self.lambda_patch = lambda_patch
52
+ self.lambda_direction = lambda_direction
53
+ self.lambda_manifold = lambda_manifold
54
+ self.lambda_texture = lambda_texture
55
+
56
+ self.src_text_features = None
57
+ self.target_text_features = None
58
+ self.angle_loss = torch.nn.L1Loss()
59
+
60
+ self.model_cnn, preprocess_cnn = clip.load("RN50", device=self.device)
61
+ self.preprocess_cnn = transforms.Compose([transforms.Normalize(mean=[-1.0, -1.0, -1.0], std=[2.0, 2.0, 2.0])] + # Un-normalize from [-1.0, 1.0] (GAN output) to [0, 1].
62
+ preprocess_cnn.transforms[:2] + # to match CLIP input scale assumptions
63
+ preprocess_cnn.transforms[4:]) # + skip convert PIL to tensor
64
+
65
+ self.texture_loss = torch.nn.MSELoss()
66
+
67
+ def tokenize(self, strings: list):
68
+ return clip.tokenize(strings).to(self.device)
69
+
70
+ def encode_text(self, tokens: list) -> torch.Tensor:
71
+ return self.model.encode_text(tokens)
72
+
73
+ def encode_images(self, images: torch.Tensor) -> torch.Tensor:
74
+ images = self.preprocess(images).to(self.device)
75
+ return self.model.encode_image(images)
76
+
77
+ def encode_images_with_cnn(self, images: torch.Tensor) -> torch.Tensor:
78
+ images = self.preprocess_cnn(images).to(self.device)
79
+ return self.model_cnn.encode_image(images)
80
+
81
+ def distance_with_templates(self, img: torch.Tensor, class_str: str, templates=imagenet_templates) -> torch.Tensor:
82
+
83
+ text_features = self.get_text_features(class_str, templates)
84
+ image_features = self.get_image_features(img)
85
+
86
+ similarity = image_features @ text_features.T
87
+
88
+ return 1. - similarity
89
+
90
+ def get_text_features(self, class_str: str, templates=imagenet_templates, norm: bool = True) -> torch.Tensor:
91
+ template_text = self.compose_text_with_templates(class_str, templates)
92
+
93
+ tokens = clip.tokenize(template_text).to(self.device)
94
+
95
+ text_features = self.encode_text(tokens).detach()
96
+
97
+ if norm:
98
+ text_features /= text_features.norm(dim=-1, keepdim=True)
99
+
100
+ return text_features
101
+
102
+ def get_image_features(self, img: torch.Tensor, norm: bool = True) -> torch.Tensor:
103
+ image_features = self.encode_images(img)
104
+
105
+ if norm:
106
+ image_features /= image_features.clone().norm(dim=-1, keepdim=True)
107
+
108
+ return image_features
109
+
110
+ def compute_text_direction(self, source_class: str, target_class: str) -> torch.Tensor:
111
+ source_features = self.get_text_features(source_class)
112
+ target_features = self.get_text_features(target_class)
113
+
114
+ text_direction = (target_features - source_features).mean(axis=0, keepdim=True)
115
+ text_direction /= text_direction.norm(dim=-1, keepdim=True)
116
+
117
+ return text_direction
118
+
119
+ def compute_img2img_direction(self, source_images: torch.Tensor, target_images: list) -> torch.Tensor:
120
+ with torch.no_grad():
121
+
122
+ src_encoding = self.get_image_features(source_images)
123
+ src_encoding = src_encoding.mean(dim=0, keepdim=True)
124
+
125
+ target_encodings = []
126
+ for target_img in target_images:
127
+ preprocessed = self.clip_preprocess(Image.open(target_img)).unsqueeze(0).to(self.device)
128
+
129
+ encoding = self.model.encode_image(preprocessed)
130
+ encoding /= encoding.norm(dim=-1, keepdim=True)
131
+
132
+ target_encodings.append(encoding)
133
+
134
+ target_encoding = torch.cat(target_encodings, axis=0)
135
+ target_encoding = target_encoding.mean(dim=0, keepdim=True)
136
+
137
+ direction = target_encoding - src_encoding
138
+ direction /= direction.norm(dim=-1, keepdim=True)
139
+
140
+ return direction
141
+
142
+ def set_text_features(self, source_class: str, target_class: str) -> None:
143
+ source_features = self.get_text_features(source_class).mean(axis=0, keepdim=True)
144
+ self.src_text_features = source_features / source_features.norm(dim=-1, keepdim=True)
145
+
146
+ target_features = self.get_text_features(target_class).mean(axis=0, keepdim=True)
147
+ self.target_text_features = target_features / target_features.norm(dim=-1, keepdim=True)
148
+
149
+ def clip_angle_loss(self, src_img: torch.Tensor, source_class: str, target_img: torch.Tensor, target_class: str) -> torch.Tensor:
150
+ if self.src_text_features is None:
151
+ self.set_text_features(source_class, target_class)
152
+
153
+ cos_text_angle = self.target_text_features @ self.src_text_features.T
154
+ text_angle = torch.acos(cos_text_angle)
155
+
156
+ src_img_features = self.get_image_features(src_img).unsqueeze(2)
157
+ target_img_features = self.get_image_features(target_img).unsqueeze(1)
158
+
159
+ cos_img_angle = torch.clamp(target_img_features @ src_img_features, min=-1.0, max=1.0)
160
+ img_angle = torch.acos(cos_img_angle)
161
+
162
+ text_angle = text_angle.unsqueeze(0).repeat(img_angle.size()[0], 1, 1)
163
+ cos_text_angle = cos_text_angle.unsqueeze(0).repeat(img_angle.size()[0], 1, 1)
164
+
165
+ return self.angle_loss(cos_img_angle, cos_text_angle)
166
+
167
+ def compose_text_with_templates(self, text: str, templates=imagenet_templates) -> list:
168
+ return [template.format(text) for template in templates]
169
+
170
+ def clip_directional_loss(self, src_img: torch.Tensor, source_class: str, target_img: torch.Tensor, target_class: str) -> torch.Tensor:
171
+
172
+ if self.target_direction is None:
173
+ self.target_direction = self.compute_text_direction(source_class, target_class)
174
+
175
+ src_encoding = self.get_image_features(src_img)
176
+ target_encoding = self.get_image_features(target_img)
177
+
178
+ edit_direction = (target_encoding - src_encoding)
179
+ edit_direction /= (edit_direction.clone().norm(dim=-1, keepdim=True) + 1e-7)
180
+ return self.direction_loss(edit_direction, self.target_direction).mean()
181
+
182
+ def global_clip_loss(self, img: torch.Tensor, text) -> torch.Tensor:
183
+ if not isinstance(text, list):
184
+ text = [text]
185
+
186
+ tokens = clip.tokenize(text).to(self.device)
187
+ image = self.preprocess(img)
188
+
189
+ logits_per_image, _ = self.model(image, tokens)
190
+
191
+ return (1. - logits_per_image / 100).mean()
192
+
193
+ def random_patch_centers(self, img_shape, num_patches, size):
194
+ batch_size, channels, height, width = img_shape
195
+
196
+ half_size = size // 2
197
+ patch_centers = np.concatenate([np.random.randint(half_size, width - half_size, size=(batch_size * num_patches, 1)),
198
+ np.random.randint(half_size, height - half_size, size=(batch_size * num_patches, 1))], axis=1)
199
+
200
+ return patch_centers
201
+
202
+ def generate_patches(self, img: torch.Tensor, patch_centers, size):
203
+ batch_size = img.shape[0]
204
+ num_patches = len(patch_centers) // batch_size
205
+ half_size = size // 2
206
+
207
+ patches = []
208
+
209
+ for batch_idx in range(batch_size):
210
+ for patch_idx in range(num_patches):
211
+
212
+ center_x = patch_centers[batch_idx * num_patches + patch_idx][0]
213
+ center_y = patch_centers[batch_idx * num_patches + patch_idx][1]
214
+
215
+ patch = img[batch_idx:batch_idx+1, :, center_y - half_size:center_y + half_size, center_x - half_size:center_x + half_size]
216
+
217
+ patches.append(patch)
218
+
219
+ patches = torch.cat(patches, axis=0)
220
+
221
+ return patches
222
+
223
+ def patch_scores(self, img: torch.Tensor, class_str: str, patch_centers, patch_size: int) -> torch.Tensor:
224
+
225
+ parts = self.compose_text_with_templates(class_str, part_templates)
226
+ tokens = clip.tokenize(parts).to(self.device)
227
+ text_features = self.encode_text(tokens).detach()
228
+
229
+ patches = self.generate_patches(img, patch_centers, patch_size)
230
+ image_features = self.get_image_features(patches)
231
+
232
+ similarity = image_features @ text_features.T
233
+
234
+ return similarity
235
+
236
+ def clip_patch_similarity(self, src_img: torch.Tensor, source_class: str, target_img: torch.Tensor, target_class: str) -> torch.Tensor:
237
+ patch_size = 196 #TODO remove magic number
238
+
239
+ patch_centers = self.random_patch_centers(src_img.shape, 4, patch_size) #TODO remove magic number
240
+
241
+ src_scores = self.patch_scores(src_img, source_class, patch_centers, patch_size)
242
+ target_scores = self.patch_scores(target_img, target_class, patch_centers, patch_size)
243
+
244
+ return self.patch_loss(src_scores, target_scores)
245
+
246
+ def patch_directional_loss(self, src_img: torch.Tensor, source_class: str, target_img: torch.Tensor, target_class: str) -> torch.Tensor:
247
+
248
+ if self.patch_text_directions is None:
249
+ src_part_classes = self.compose_text_with_templates(source_class, part_templates)
250
+ target_part_classes = self.compose_text_with_templates(target_class, part_templates)
251
+
252
+ parts_classes = list(zip(src_part_classes, target_part_classes))
253
+
254
+ self.patch_text_directions = torch.cat([self.compute_text_direction(pair[0], pair[1]) for pair in parts_classes], dim=0)
255
+
256
+ patch_size = 510 # TODO remove magic numbers
257
+
258
+ patch_centers = self.random_patch_centers(src_img.shape, 1, patch_size)
259
+
260
+ patches = self.generate_patches(src_img, patch_centers, patch_size)
261
+ src_features = self.get_image_features(patches)
262
+
263
+ patches = self.generate_patches(target_img, patch_centers, patch_size)
264
+ target_features = self.get_image_features(patches)
265
+
266
+ edit_direction = (target_features - src_features)
267
+ edit_direction /= edit_direction.clone().norm(dim=-1, keepdim=True)
268
+
269
+ cosine_dists = 1. - self.patch_direction_loss(edit_direction.unsqueeze(1), self.patch_text_directions.unsqueeze(0))
270
+
271
+ patch_class_scores = cosine_dists * (edit_direction @ self.patch_text_directions.T).softmax(dim=-1)
272
+
273
+ return patch_class_scores.mean()
274
+
275
+ def cnn_feature_loss(self, src_img: torch.Tensor, target_img: torch.Tensor) -> torch.Tensor:
276
+ src_features = self.encode_images_with_cnn(src_img)
277
+ target_features = self.encode_images_with_cnn(target_img)
278
+
279
+ return self.texture_loss(src_features, target_features)
280
+
281
+ def forward(self, src_img: torch.Tensor, source_class: str, target_img: torch.Tensor, target_class: str, texture_image: torch.Tensor = None):
282
+ clip_loss = 0.0
283
+
284
+ if self.lambda_global:
285
+ clip_loss += self.lambda_global * self.global_clip_loss(target_img, [f"a {target_class}"])
286
+
287
+ if self.lambda_patch:
288
+ clip_loss += self.lambda_patch * self.patch_directional_loss(src_img, source_class, target_img, target_class)
289
+
290
+ if self.lambda_direction:
291
+ clip_loss += self.lambda_direction * self.clip_directional_loss(src_img, source_class, target_img, target_class)
292
+
293
+ if self.lambda_manifold:
294
+ clip_loss += self.lambda_manifold * self.clip_angle_loss(src_img, source_class, target_img, target_class)
295
+
296
+ if self.lambda_texture and (texture_image is not None):
297
+ clip_loss += self.lambda_texture * self.cnn_feature_loss(texture_image, target_img)
298
+
299
+ return clip_loss
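A minimal sketch of how CLIPLoss might be driven (an assumption about the calling code, not taken from this commit). Images are expected in [-1, 1] with shape (N, 3, H, W) on the same device as the loss module; with the default weights only the directional term is active, and constructing the loss loads both the ViT-B/16 and RN50 CLIP checkpoints.

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_loss_fn = CLIPLoss(device, lambda_direction=1.0, clip_model="ViT-B/16")

src = torch.rand(1, 3, 256, 256, device=device) * 2 - 1                             # source image, placeholder
edited = (torch.rand(1, 3, 256, 256, device=device) * 2 - 1).requires_grad_(True)   # edited image, placeholder

loss = clip_loss_fn(src, "face", edited, "smiling face")   # directional CLIP loss only
loss.backward()                                            # gradients flow into `edited`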
losses/id_loss.py ADDED
@@ -0,0 +1,35 @@
1
+ import torch
2
+ from torch import nn
3
+ from configs.paths_config import MODEL_PATHS
4
+ from models.insight_face.model_irse import Backbone, MobileFaceNet
5
+
6
+
7
+ class IDLoss(nn.Module):
8
+ def __init__(self, use_mobile_id=False):
9
+ super(IDLoss, self).__init__()
10
+ print('Loading ResNet ArcFace')
11
+ self.facenet = Backbone(input_size=112, num_layers=50, drop_ratio=0.6, mode='ir_se')
12
+ self.facenet.load_state_dict(torch.load(MODEL_PATHS['ir_se50']))
13
+
14
+ self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112))
15
+ self.facenet.eval()
16
+
17
+ def extract_feats(self, x):
18
+ x = x[:, :, 35:223, 32:220] # Crop interesting region
19
+ x = self.face_pool(x)
20
+ x_feats = self.facenet(x)
21
+ return x_feats
22
+
23
+ def forward(self, x, x_hat):
24
+ n_samples = x.shape[0]
25
+ x_feats = self.extract_feats(x)
26
+ x_feats = x_feats.detach()
27
+
28
+ x_hat_feats = self.extract_feats(x_hat)
29
+ losses = []
30
+ for i in range(n_samples):
31
+ loss_sample = 1 - x_hat_feats[i].dot(x_feats[i])
32
+ losses.append(loss_sample.unsqueeze(0))
33
+
34
+ losses = torch.cat(losses, dim=0)
35
+ return losses
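IDLoss returns one value per sample, 1 minus the cosine similarity of ArcFace embeddings, so identity preservation is encouraged by minimizing the mean. A minimal sketch, assuming the ir_se50 checkpoint referenced by MODEL_PATHS is available locally:

    # Sketch; requires the ArcFace weights at MODEL_PATHS['ir_se50'].
    import torch

    id_loss_fn = IDLoss()
    x = torch.randn(2, 3, 256, 256)        # original faces
    x_hat = torch.randn(2, 3, 256, 256)    # edited faces
    per_sample = id_loss_fn(x, x_hat)      # shape [2]
    loss = per_sample.mean()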
main.py ADDED
@@ -0,0 +1,275 @@
1
+ import argparse
2
+ import traceback
3
+ import logging
4
+ import yaml
5
+ import sys
6
+ import os
7
+ import torch
8
+ import numpy as np
9
+
10
+ from boundarydiffusion import BoundaryDiffusion
11
+ from configs.paths_config import HYBRID_MODEL_PATHS
12
+
13
+ def parse_args_and_config():
14
+ parser = argparse.ArgumentParser(description=globals()['__doc__'])
15
+
16
+ # Mode
17
+ parser.add_argument('--radius', action='store_true')
18
+ parser.add_argument('--unconditional', action='store_true')
19
+ parser.add_argument('--boundary_search', action='store_true')
20
+ parser.add_argument('--diffusion_hyperplane', action='store_true')
21
+ parser.add_argument('--clip_finetune', action='store_true')
22
+ parser.add_argument('--clip_latent_optim', action='store_true')
23
+ parser.add_argument('--edit_images_from_dataset', action='store_true')
24
+ parser.add_argument('--edit_one_image', action='store_true')
25
+ parser.add_argument('--unseen2unseen', action='store_true')
26
+ parser.add_argument('--clip_finetune_eff', action='store_true')
27
+ parser.add_argument('--edit_one_image_eff', action='store_true')
28
+ parser.add_argument('--edit_image_boundary', action='store_true')
29
+
30
+ # Default
31
+ parser.add_argument('--config', type=str, required=True, help='Path to the config file')
32
+ parser.add_argument('--seed', type=int, default=1006, help='Random seed')
33
+ parser.add_argument('--exp', type=str, default='./runs/', help='Path for saving running related data.')
34
+ parser.add_argument('--comment', type=str, default='', help='A string for experiment comment')
35
+ parser.add_argument('--verbose', type=str, default='info', help='Verbose level: info | debug | warning | critical')
36
+ parser.add_argument('--ni', type=int, default=1, help="No interaction. Suitable for Slurm Job launcher")
37
+ parser.add_argument('--align_face', type=int, default=1, help='align face or not')
38
+
39
+ # Text
40
+ parser.add_argument('--edit_attr', type=str, default=None, help='Attribute to edit defined in ./utils/text_dic.py')
41
+ parser.add_argument('--src_txts', type=str, action='append', help='Source text e.g. Face')
42
+ parser.add_argument('--trg_txts', type=str, action='append', help='Target text e.g. Angry Face')
43
+ parser.add_argument('--target_class_num', type=str, default=None)
44
+
45
+ # Sampling
46
+ parser.add_argument('--t_0', type=int, default=400, help='Return step in [0, 1000)')
47
+ parser.add_argument('--n_inv_step', type=int, default=40, help='# of steps in the generative process for inversion')
48
+ parser.add_argument('--n_train_step', type=int, default=6, help='# of steps in the generative process for training')
49
+ parser.add_argument('--n_test_step', type=int, default=40, help='# of steps in the generative process for testing')
50
+ parser.add_argument('--sample_type', type=str, default='ddim', help='ddpm for Markovian sampling, ddim for non-Markovian sampling')
51
+ parser.add_argument('--eta', type=float, default=0.0, help='Controls the variance of the generative process')
52
+ parser.add_argument('--start_distance', type=float, default=-150.0, help='Starting distance of the editing space')
53
+ parser.add_argument('--end_distance', type=float, default=150.0, help='Ending distance of the editing space')
54
+ parser.add_argument('--edit_img_number', type=int, default=20, help='Number of editing images')
55
+
56
+ # Train & Test
57
+ parser.add_argument('--do_train', type=int, default=1, help='Whether to train or not during CLIP finetuning')
58
+ parser.add_argument('--do_test', type=int, default=1, help='Whether to test or not during CLIP finetuning')
59
+ parser.add_argument('--save_train_image', type=int, default=1, help='Whether to save training results during CLIP finetuning')
60
+ parser.add_argument('--bs_train', type=int, default=1, help='Training batch size during CLIP finetuning')
61
+ parser.add_argument('--bs_test', type=int, default=1, help='Test batch size during CLIP finetuning')
62
+ parser.add_argument('--n_precomp_img', type=int, default=100, help='# of images to precompute latents')
63
+ parser.add_argument('--n_train_img', type=int, default=50, help='# of training images')
64
+ parser.add_argument('--n_test_img', type=int, default=10, help='# of test images')
65
+ parser.add_argument('--model_path', type=str, default=None, help='Test model path')
66
+ parser.add_argument('--img_path', type=str, default=None, help='Image path to test')
67
+ parser.add_argument('--deterministic_inv', type=int, default=1, help='Whether to use deterministic inversion during inference')
68
+ parser.add_argument('--hybrid_noise', type=int, default=0, help='Whether to change multiple attributes by mixing multiple models')
69
+ parser.add_argument('--model_ratio', type=float, default=1, help='Degree of change, noise ratio from original and finetuned model.')
70
+
71
+
72
+ # Loss & Optimization
73
+ parser.add_argument('--clip_loss_w', type=int, default=0, help='Weights of CLIP loss')
74
+ parser.add_argument('--l1_loss_w', type=float, default=0, help='Weights of L1 loss')
75
+ parser.add_argument('--id_loss_w', type=float, default=0, help='Weights of ID loss')
76
+ parser.add_argument('--clip_model_name', type=str, default='ViT-B/16', help='ViT-B/16, ViT-B/32, RN50x16 etc')
77
+ parser.add_argument('--lr_clip_finetune', type=float, default=2e-6, help='Initial learning rate for finetuning')
78
+ parser.add_argument('--lr_clip_lat_opt', type=float, default=2e-2, help='Initial learning rate for latent optim')
79
+ parser.add_argument('--n_iter', type=int, default=1, help='# of iterations of a generative process with `n_train_img` images')
80
+ parser.add_argument('--scheduler', type=int, default=1, help='Whether to increase the learning rate')
81
+ parser.add_argument('--sch_gamma', type=float, default=1.3, help='Scheduler gamma')
82
+
83
+ args = parser.parse_args()
84
+
85
+ # parse config file
86
+ with open(os.path.join('configs', args.config), 'r') as f:
87
+ config = yaml.safe_load(f)
88
+ new_config = dict2namespace(config)
89
+
90
+ if args.diffusion_hyperplane:
91
+ if args.edit_attr is not None:
92
+ args.exp = args.exp + f'_SP_{new_config.data.category}_{args.edit_attr}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
93
+ else:
94
+ args.exp = args.exp + f'_SP_{new_config.data.category}_{args.trg_txts}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
95
+ elif args.radius:
96
+ if args.edit_attr is not None:
97
+ args.exp = args.exp + f'_R_{new_config.data.category}_{args.edit_attr}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
98
+ else:
99
+ args.exp = args.exp + f'_R_{new_config.data.category}_{args.trg_txts}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
100
+ elif args.unconditional:
101
+ if args.edit_attr is not None:
102
+ args.exp = args.exp + f'_UN_{new_config.data.category}_{args.edit_attr}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
103
+ else:
104
+ args.exp = args.exp + f'_UN_{new_config.data.category}_{args.trg_txts}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
105
+ elif args.boundary_search:
106
+ if args.edit_attr is not None:
107
+ args.exp = args.exp + f'_BCLIP_{new_config.data.category}_{args.edit_attr}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
108
+ else:
109
+ args.exp = args.exp + f'_BCLIP_{new_config.data.category}_{args.trg_txts}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
110
+ elif args.clip_finetune or args.clip_finetune_eff:
111
+ if args.edit_attr is not None:
112
+ args.exp = args.exp + f'_FT_{new_config.data.category}_{args.edit_attr}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
113
+ else:
114
+ args.exp = args.exp + f'_FT_{new_config.data.category}_{args.trg_txts}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_finetune}'
115
+ elif args.clip_latent_optim:
116
+ if args.edit_attr is not None:
117
+ args.exp = args.exp + f'_LO_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_{args.edit_attr}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_lat_opt}'
118
+ else:
119
+ args.exp = args.exp + f'_LO_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_{args.trg_txts}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_id{args.id_loss_w}_l1{args.l1_loss_w}_lr{args.lr_clip_lat_opt}'
120
+ elif args.edit_images_from_dataset:
121
+ if args.model_path:
122
+ args.exp = args.exp + f'_ED_{new_config.data.category}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_{os.path.split(args.model_path)[-1].replace(".pth","")}'
123
+ elif args.hybrid_noise:
124
+ hb_str = '_'
125
+ for i, model_name in enumerate(HYBRID_MODEL_PATHS):
126
+ hb_str = hb_str + model_name.split('_')[1]
127
+ if i != len(HYBRID_MODEL_PATHS) - 1:
128
+ hb_str = hb_str + '_'
129
+ args.exp = args.exp + f'_ED_{new_config.data.category}_t{args.t_0}_ninv{args.n_train_step}_ngen{args.n_train_step}' + hb_str
130
+ else:
131
+ args.exp = args.exp + f'_ED_{new_config.data.category}_t{args.t_0}_ninv{args.n_train_step}_ngen{args.n_train_step}_orig'
132
+
133
+ elif args.edit_image_boundary:
134
+ if args.model_path:
135
+ args.exp = args.exp + f'_E1_t{args.t_0}_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_inv_step}_{os.path.split(args.model_path)[-1].replace(".pth", "")}'
136
+ elif args.hybrid_noise:
137
+ hb_str = '_'
138
+ for i, model_name in enumerate(HYBRID_MODEL_PATHS):
139
+ hb_str = hb_str + model_name.split('_')[1]
140
+ if i != len(HYBRID_MODEL_PATHS) - 1:
141
+ hb_str = hb_str + '_'
142
+ args.exp = args.exp + f'_E1_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}' + hb_str
143
+ else:
144
+ args.exp = args.exp + f'_E1_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}_orig'
145
+
146
+
147
+ elif args.edit_one_image:
148
+ if args.model_path:
149
+ args.exp = args.exp + f'_E1_t{args.t_0}_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_inv_step}_{os.path.split(args.model_path)[-1].replace(".pth", "")}'
150
+ elif args.hybrid_noise:
151
+ hb_str = '_'
152
+ for i, model_name in enumerate(HYBRID_MODEL_PATHS):
153
+ hb_str = hb_str + model_name.split('_')[1]
154
+ if i != len(HYBRID_MODEL_PATHS) - 1:
155
+ hb_str = hb_str + '_'
156
+ args.exp = args.exp + f'_E1_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}' + hb_str
157
+ else:
158
+ args.exp = args.exp + f'_E1_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}_orig'
159
+
160
+ elif args.unseen2unseen:
161
+ if args.model_path:
162
+ args.exp = args.exp + f'_U2U_t{args.t_0}_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_inv_step}_ngen{args.n_train_step}_{os.path.split(args.model_path)[-1].replace(".pth", "")}'
163
+ elif args.hybrid_noise:
164
+ hb_str = '_'
165
+ for i, model_name in enumerate(HYBRID_MODEL_PATHS):
166
+ hb_str = hb_str + model_name.split('_')[1]
167
+ if i != len(HYBRID_MODEL_PATHS) - 1:
168
+ hb_str = hb_str + '_'
169
+ args.exp = args.exp + f'_U2U_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}_ngen{args.n_train_step}' + hb_str
170
+ else:
171
+ args.exp = args.exp + f'_U2U_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}_ngen{args.n_train_step}_orig'
172
+
173
+ elif getattr(args, 'recon_exp', False):  # 'recon_exp' is not declared as an argparse flag above; guard to avoid AttributeError
174
+ args.exp = args.exp + f'_REC_{new_config.data.category}_{args.img_path.split("/")[-1].split(".")[0]}_t{args.t_0}_ninv{args.n_train_step}'
175
+ elif getattr(args, 'find_best_image', False):  # likewise not declared above
176
+ args.exp = args.exp + f'_FOpt_{new_config.data.category}_{args.trg_txts[0]}_t{args.t_0}_ninv{args.n_train_step}'
177
+
178
+
179
+ level = getattr(logging, args.verbose.upper(), None)
180
+ if not isinstance(level, int):
181
+ raise ValueError('level {} not supported'.format(args.verbose))
182
+
183
+ handler1 = logging.StreamHandler()
184
+ formatter = logging.Formatter('%(levelname)s - %(filename)s - %(asctime)s - %(message)s')
185
+ handler1.setFormatter(formatter)
186
+ logger = logging.getLogger()
187
+ logger.addHandler(handler1)
188
+ logger.setLevel(level)
189
+
190
+ os.makedirs(args.exp, exist_ok=True)
191
+ os.makedirs('checkpoint', exist_ok=True)
192
+ os.makedirs('precomputed', exist_ok=True)
193
+ os.makedirs('runs', exist_ok=True)
194
+ os.makedirs(args.exp, exist_ok=True)
195
+ args.image_folder = os.path.join(args.exp, 'image_samples')
196
+ if not os.path.exists(args.image_folder):
197
+ os.makedirs(args.image_folder)
198
+ else:
199
+ overwrite = False
200
+ if args.ni:
201
+ overwrite = True
202
+ else:
203
+ response = input("Image folder already exists. Overwrite? (Y/N)")
204
+ if response.upper() == 'Y':
205
+ overwrite = True
206
+
207
+ if overwrite:
208
+ # shutil.rmtree(args.image_folder)
209
+ os.makedirs(args.image_folder, exist_ok=True)
210
+ else:
211
+ print("Output image folder exists. Program halted.")
212
+ sys.exit(0)
213
+
214
+ # add device
215
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
216
+ logging.info("Using device: {}".format(device))
217
+ new_config.device = device
218
+
219
+ # set random seed
220
+ torch.manual_seed(args.seed)
221
+ np.random.seed(args.seed)
222
+ if torch.cuda.is_available():
223
+ torch.cuda.manual_seed_all(args.seed)
224
+
225
+ torch.backends.cudnn.benchmark = True
226
+
227
+ return args, new_config
228
+
229
+
230
+ def dict2namespace(config):
231
+ namespace = argparse.Namespace()
232
+ for key, value in config.items():
233
+ if isinstance(value, dict):
234
+ new_value = dict2namespace(value)
235
+ else:
236
+ new_value = value
237
+ setattr(namespace, key, new_value)
238
+ return namespace
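# Usage sketch: dict2namespace converts the nested dict parsed from the YAML config
# into nested argparse.Namespace objects so values can be read by attribute access:
#   cfg = dict2namespace({'data': {'image_size': 256}})
#   cfg.data.image_size  # -> 256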
239
+
240
+
241
+ def main():
242
+ args, config = parse_args_and_config()
243
+ print(">" * 80)
244
+ logging.info("Exp instance id = {}".format(os.getpid()))
245
+ logging.info("Exp comment = {}".format(args.comment))
246
+ logging.info("Config =")
247
+ print("<" * 80)
248
+
249
+
250
+ runner = BoundaryDiffusion(args, config)
251
+ try:
252
+ if args.clip_finetune:
253
+ runner.clip_finetune()
254
+ elif args.radius:
255
+ runner.radius()
256
+ elif args.unconditional:
257
+ runner.unconditional()
258
+ elif args.diffusion_hyperplane:
259
+ runner.diffusion_hyperplane()
260
+ elif args.boundary_search:
261
+ runner.boundary_search()
262
+ elif args.edit_image_boundary:
263
+ runner.edit_image_boundary()
264
+ else:
265
+ print('Choose one mode!')
266
+ raise ValueError
267
+ except Exception:
268
+ logging.error(traceback.format_exc())
269
+
270
+
271
+ return 0
272
+
273
+
274
+ if __name__ == '__main__':
275
+ sys.exit(main())
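A typical invocation selects exactly one mode flag plus a config file; the sketch below uses only flags declared in parse_args_and_config(), while the config name and image path are placeholders:

    # Sketch; 'celeba.yml' and 'imgs/example.png' are placeholder paths.
    import sys
    sys.argv = ['main.py', '--edit_image_boundary',
                '--config', 'celeba.yml',
                '--exp', './runs/demo',
                '--t_0', '500', '--n_inv_step', '40', '--n_test_step', '40',
                '--img_path', 'imgs/example.png']
    args, config = parse_args_and_config()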
models/ddpm/diffusion.py ADDED
@@ -0,0 +1,348 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+
6
+ def get_timestep_embedding(timesteps, embedding_dim):
7
+ """
8
+ This matches the implementation in Denoising Diffusion Probabilistic Models:
9
+ From Fairseq.
10
+ Build sinusoidal embeddings.
11
+ This matches the implementation in tensor2tensor, but differs slightly
12
+ from the description in Section 3.5 of "Attention Is All You Need".
13
+ """
14
+ assert len(timesteps.shape) == 1
15
+
16
+ half_dim = embedding_dim // 2
17
+ emb = math.log(10000) / (half_dim - 1)
18
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
19
+ emb = emb.to(device=timesteps.device)
20
+ emb = timesteps.float()[:, None] * emb[None, :]
21
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
22
+ if embedding_dim % 2 == 1: # zero pad
23
+ emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
24
+ return emb
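# Shape note: for N timesteps this returns an [N, embedding_dim] tensor of concatenated
# sin/cos features, e.g. get_timestep_embedding(torch.arange(4), 128) has shape (4, 128).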
25
+
26
+
27
+ def nonlinearity(x):
28
+ # swish
29
+ return x * torch.sigmoid(x)
30
+
31
+
32
+ def Normalize(in_channels):
33
+ return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
34
+
35
+
36
+ class Upsample(nn.Module):
37
+ def __init__(self, in_channels, with_conv):
38
+ super().__init__()
39
+ self.with_conv = with_conv
40
+ if self.with_conv:
41
+ self.conv = torch.nn.Conv2d(in_channels,
42
+ in_channels,
43
+ kernel_size=3,
44
+ stride=1,
45
+ padding=1)
46
+
47
+ def forward(self, x):
48
+ x = torch.nn.functional.interpolate(
49
+ x, scale_factor=2.0, mode="nearest")
50
+ if self.with_conv:
51
+ x = self.conv(x)
52
+ return x
53
+
54
+
55
+ class Downsample(nn.Module):
56
+ def __init__(self, in_channels, with_conv):
57
+ super().__init__()
58
+ self.with_conv = with_conv
59
+ if self.with_conv:
60
+ # no asymmetric padding in torch conv, must do it ourselves
61
+ self.conv = torch.nn.Conv2d(in_channels,
62
+ in_channels,
63
+ kernel_size=3,
64
+ stride=2,
65
+ padding=0)
66
+
67
+ def forward(self, x):
68
+ if self.with_conv:
69
+ pad = (0, 1, 0, 1)
70
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
71
+ x = self.conv(x)
72
+ else:
73
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
74
+ return x
75
+
76
+
77
+ class ResnetBlock(nn.Module):
78
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
79
+ dropout, temb_channels=512):
80
+ super().__init__()
81
+ self.in_channels = in_channels
82
+ out_channels = in_channels if out_channels is None else out_channels
83
+ self.out_channels = out_channels
84
+ self.use_conv_shortcut = conv_shortcut
85
+
86
+ self.norm1 = Normalize(in_channels)
87
+ self.conv1 = torch.nn.Conv2d(in_channels,
88
+ out_channels,
89
+ kernel_size=3,
90
+ stride=1,
91
+ padding=1)
92
+ self.temb_proj = torch.nn.Linear(temb_channels,
93
+ out_channels)
94
+ self.norm2 = Normalize(out_channels)
95
+ self.dropout = torch.nn.Dropout(dropout)
96
+ self.conv2 = torch.nn.Conv2d(out_channels,
97
+ out_channels,
98
+ kernel_size=3,
99
+ stride=1,
100
+ padding=1)
101
+ if self.in_channels != self.out_channels:
102
+ if self.use_conv_shortcut:
103
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
104
+ out_channels,
105
+ kernel_size=3,
106
+ stride=1,
107
+ padding=1)
108
+ else:
109
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
110
+ out_channels,
111
+ kernel_size=1,
112
+ stride=1,
113
+ padding=0)
114
+
115
+ def forward(self, x, temb):
116
+ h = x
117
+ h = self.norm1(h)
118
+ h = nonlinearity(h)
119
+ h = self.conv1(h)
120
+
121
+ h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
122
+
123
+ h = self.norm2(h)
124
+ h = nonlinearity(h)
125
+ h = self.dropout(h)
126
+ h = self.conv2(h)
127
+
128
+ if self.in_channels != self.out_channels:
129
+ if self.use_conv_shortcut:
130
+ x = self.conv_shortcut(x)
131
+ else:
132
+ x = self.nin_shortcut(x)
133
+
134
+ return x + h
135
+
136
+
137
+ class AttnBlock(nn.Module):
138
+ def __init__(self, in_channels):
139
+ super().__init__()
140
+ self.in_channels = in_channels
141
+
142
+ self.norm = Normalize(in_channels)
143
+ self.q = torch.nn.Conv2d(in_channels,
144
+ in_channels,
145
+ kernel_size=1,
146
+ stride=1,
147
+ padding=0)
148
+ self.k = torch.nn.Conv2d(in_channels,
149
+ in_channels,
150
+ kernel_size=1,
151
+ stride=1,
152
+ padding=0)
153
+ self.v = torch.nn.Conv2d(in_channels,
154
+ in_channels,
155
+ kernel_size=1,
156
+ stride=1,
157
+ padding=0)
158
+ self.proj_out = torch.nn.Conv2d(in_channels,
159
+ in_channels,
160
+ kernel_size=1,
161
+ stride=1,
162
+ padding=0)
163
+
164
+ def forward(self, x):
165
+ h_ = x
166
+ h_ = self.norm(h_)
167
+ q = self.q(h_)
168
+ k = self.k(h_)
169
+ v = self.v(h_)
170
+
171
+ # compute attention
172
+ b, c, h, w = q.shape
173
+ q = q.reshape(b, c, h * w)
174
+ q = q.permute(0, 2, 1) # b,hw,c
175
+ k = k.reshape(b, c, h * w) # b,c,hw
176
+ w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
177
+ w_ = w_ * (int(c) ** (-0.5))
178
+ w_ = torch.nn.functional.softmax(w_, dim=2)
179
+
180
+ # attend to values
181
+ v = v.reshape(b, c, h * w)
182
+ w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q)
183
+ # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
184
+ h_ = torch.bmm(v, w_)
185
+ h_ = h_.reshape(b, c, h, w)
186
+
187
+ h_ = self.proj_out(h_)
188
+
189
+ return x + h_
190
+
191
+
192
+ class DDPM(nn.Module):
193
+ def __init__(self, config):
194
+ super().__init__()
195
+ self.config = config
196
+ ch, out_ch, ch_mult = config.model.ch, config.model.out_ch, tuple(config.model.ch_mult)
197
+ num_res_blocks = config.model.num_res_blocks
198
+ attn_resolutions = config.model.attn_resolutions
199
+ dropout = config.model.dropout
200
+ in_channels = config.model.in_channels
201
+ resolution = config.data.image_size
202
+ resamp_with_conv = config.model.resamp_with_conv
203
+
204
+ self.ch = ch
205
+ self.temb_ch = self.ch * 4
206
+ self.num_resolutions = len(ch_mult)
207
+ self.num_res_blocks = num_res_blocks
208
+ self.resolution = resolution
209
+ self.in_channels = in_channels
210
+
211
+ # timestep embedding
212
+ self.temb = nn.Module()
213
+ self.temb.dense = nn.ModuleList([
214
+ torch.nn.Linear(self.ch,
215
+ self.temb_ch),
216
+ torch.nn.Linear(self.temb_ch,
217
+ self.temb_ch),
218
+ ])
219
+
220
+ # downsampling
221
+ self.conv_in = torch.nn.Conv2d(in_channels,
222
+ self.ch,
223
+ kernel_size=3,
224
+ stride=1,
225
+ padding=1)
226
+
227
+ curr_res = resolution
228
+ in_ch_mult = (1,) + ch_mult
229
+ self.down = nn.ModuleList()
230
+ block_in = None
231
+ for i_level in range(self.num_resolutions):
232
+ block = nn.ModuleList()
233
+ attn = nn.ModuleList()
234
+ block_in = ch * in_ch_mult[i_level]
235
+ block_out = ch * ch_mult[i_level]
236
+ for i_block in range(self.num_res_blocks):
237
+ block.append(ResnetBlock(in_channels=block_in,
238
+ out_channels=block_out,
239
+ temb_channels=self.temb_ch,
240
+ dropout=dropout))
241
+ block_in = block_out
242
+ if curr_res in attn_resolutions:
243
+ attn.append(AttnBlock(block_in))
244
+ down = nn.Module()
245
+ down.block = block
246
+ down.attn = attn
247
+ if i_level != self.num_resolutions - 1:
248
+ down.downsample = Downsample(block_in, resamp_with_conv)
249
+ curr_res = curr_res // 2
250
+ self.down.append(down)
251
+
252
+ # middle
253
+ self.mid = nn.Module()
254
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
255
+ out_channels=block_in,
256
+ temb_channels=self.temb_ch,
257
+ dropout=dropout)
258
+ self.mid.attn_1 = AttnBlock(block_in)
259
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
260
+ out_channels=block_in,
261
+ temb_channels=self.temb_ch,
262
+ dropout=dropout)
263
+
264
+ # upsampling
265
+ self.up = nn.ModuleList()
266
+ for i_level in reversed(range(self.num_resolutions)):
267
+ block = nn.ModuleList()
268
+ attn = nn.ModuleList()
269
+ block_out = ch * ch_mult[i_level]
270
+ skip_in = ch * ch_mult[i_level]
271
+ for i_block in range(self.num_res_blocks + 1):
272
+ if i_block == self.num_res_blocks:
273
+ skip_in = ch * in_ch_mult[i_level]
274
+ block.append(ResnetBlock(in_channels=block_in + skip_in,
275
+ out_channels=block_out,
276
+ temb_channels=self.temb_ch,
277
+ dropout=dropout))
278
+ block_in = block_out
279
+ if curr_res in attn_resolutions:
280
+ attn.append(AttnBlock(block_in))
281
+ up = nn.Module()
282
+ up.block = block
283
+ up.attn = attn
284
+ if i_level != 0:
285
+ up.upsample = Upsample(block_in, resamp_with_conv)
286
+ curr_res = curr_res * 2
287
+ self.up.insert(0, up) # prepend to get consistent order
288
+
289
+ # end
290
+ self.norm_out = Normalize(block_in)
291
+ self.conv_out = torch.nn.Conv2d(block_in,
292
+ out_ch,
293
+ kernel_size=3,
294
+ stride=1,
295
+ padding=1)
296
+
297
+ def forward(self, x, t, edit_h=None):
298
+ assert x.shape[2] == x.shape[3] == self.resolution
299
+
300
+ # print("check input in U-NET:", x.size()) # [1,3,256,256]
301
+
302
+ # timestep embedding
303
+ temb = get_timestep_embedding(t, self.ch)
304
+ temb = self.temb.dense[0](temb)
305
+ temb = nonlinearity(temb)
306
+ temb = self.temb.dense[1](temb)
307
+
308
+ # downsampling
309
+ hs = [self.conv_in(x)]
310
+ for i_level in range(self.num_resolutions):
311
+ for i_block in range(self.num_res_blocks):
312
+ h = self.down[i_level].block[i_block](hs[-1], temb)
313
+ if len(self.down[i_level].attn) > 0:
314
+ h = self.down[i_level].attn[i_block](h)
315
+ hs.append(h)
316
+ if i_level != self.num_resolutions - 1:
317
+ hs.append(self.down[i_level].downsample(hs[-1]))
318
+
319
+ # middle
320
+ h = hs[-1]
321
+ h = self.mid.block_1(h, temb)
322
+ h = self.mid.attn_1(h)
323
+ h = self.mid.block_2(h, temb)
324
+ mid_h = h.detach().clone() # get the bottleneck h space embedding
325
+ # print("check Unet:", mid_h.size()) # [1, 512, 8, 8]
326
+ # exit()
327
+ if edit_h is not None:
328
+ h = edit_h
329
+
330
+ # upsampling
331
+ for i_level in reversed(range(self.num_resolutions)):
332
+ for i_block in range(self.num_res_blocks + 1):
333
+ h = self.up[i_level].block[i_block](
334
+ torch.cat([h, hs.pop()], dim=1), temb)
335
+ if len(self.up[i_level].attn) > 0:
336
+ h = self.up[i_level].attn[i_block](h)
337
+ if i_level != 0:
338
+ h = self.up[i_level].upsample(h)
339
+ # print("check UNET upsampled:", h.size()) # [1, 128, 256, 256]
340
+
341
+ # end
342
+ h = self.norm_out(h)
343
+ h = nonlinearity(h)
344
+ h = self.conv_out(h)
345
+ # print("check U-NET output:", h.size(), mid_h.size()) # [1,3,256,256]
346
+ # exit()
347
+
348
+ return mid_h, h
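forward() returns both the bottleneck feature map mid_h (the h-space used for semantic editing) and the usual noise prediction, and passing edit_h swaps in an edited bottleneck before upsampling. A minimal sketch, where config is a placeholder carrying the model.* and data.* fields read in __init__:

    # Sketch; `config` is a placeholder with the fields read in DDPM.__init__.
    import torch

    model = DDPM(config).eval()
    x_t = torch.randn(1, config.model.in_channels, config.data.image_size, config.data.image_size)
    t = torch.tensor([400])
    mid_h, eps = model(x_t, t)                         # h-space feature and noise prediction
    shifted_h = mid_h + 0.1 * torch.randn_like(mid_h)  # e.g. move along an editing direction
    _, eps_edit = model(x_t, t, edit_h=shifted_h)      # regenerate with the edited bottleneck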
models/improved_ddpm/fp16_util.py ADDED
@@ -0,0 +1,236 @@
1
+ """
2
+ Helpers to train with 16-bit precision.
3
+ """
4
+
5
+ import numpy as np
6
+ import torch as th
7
+ import torch.nn as nn
8
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
9
+
10
+ from . import logger
11
+
12
+ INITIAL_LOG_LOSS_SCALE = 20.0
13
+
14
+
15
+ def convert_module_to_f16(l):
16
+ """
17
+ Convert primitive modules to float16.
18
+ """
19
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
20
+ l.weight.data = l.weight.data.half()
21
+ if l.bias is not None:
22
+ l.bias.data = l.bias.data.half()
23
+
24
+
25
+ def convert_module_to_f32(l):
26
+ """
27
+ Convert primitive modules to float32, undoing convert_module_to_f16().
28
+ """
29
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
30
+ l.weight.data = l.weight.data.float()
31
+ if l.bias is not None:
32
+ l.bias.data = l.bias.data.float()
33
+
34
+
35
+ def make_master_params(param_groups_and_shapes):
36
+ """
37
+ Copy model parameters into a (differently-shaped) list of full-precision
38
+ parameters.
39
+ """
40
+ master_params = []
41
+ for param_group, shape in param_groups_and_shapes:
42
+ master_param = nn.Parameter(
43
+ _flatten_dense_tensors(
44
+ [param.detach().float() for (_, param) in param_group]
45
+ ).view(shape)
46
+ )
47
+ master_param.requires_grad = True
48
+ master_params.append(master_param)
49
+ return master_params
50
+
51
+
52
+ def model_grads_to_master_grads(param_groups_and_shapes, master_params):
53
+ """
54
+ Copy the gradients from the model parameters into the master parameters
55
+ from make_master_params().
56
+ """
57
+ for master_param, (param_group, shape) in zip(
58
+ master_params, param_groups_and_shapes
59
+ ):
60
+ master_param.grad = _flatten_dense_tensors(
61
+ [param_grad_or_zeros(param) for (_, param) in param_group]
62
+ ).view(shape)
63
+
64
+
65
+ def master_params_to_model_params(param_groups_and_shapes, master_params):
66
+ """
67
+ Copy the master parameter data back into the model parameters.
68
+ """
69
+ # Without copying to a list, if a generator is passed, this will
70
+ # silently not copy any parameters.
71
+ for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes):
72
+ for (_, param), unflat_master_param in zip(
73
+ param_group, unflatten_master_params(param_group, master_param.view(-1))
74
+ ):
75
+ param.detach().copy_(unflat_master_param)
76
+
77
+
78
+ def unflatten_master_params(param_group, master_param):
79
+ return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group])
80
+
81
+
82
+ def get_param_groups_and_shapes(named_model_params):
83
+ named_model_params = list(named_model_params)
84
+ scalar_vector_named_params = (
85
+ [(n, p) for (n, p) in named_model_params if p.ndim <= 1],
86
+ (-1),
87
+ )
88
+ matrix_named_params = (
89
+ [(n, p) for (n, p) in named_model_params if p.ndim > 1],
90
+ (1, -1),
91
+ )
92
+ return [scalar_vector_named_params, matrix_named_params]
93
+
94
+
95
+ def master_params_to_state_dict(
96
+ model, param_groups_and_shapes, master_params, use_fp16
97
+ ):
98
+ if use_fp16:
99
+ state_dict = model.state_dict()
100
+ for master_param, (param_group, _) in zip(
101
+ master_params, param_groups_and_shapes
102
+ ):
103
+ for (name, _), unflat_master_param in zip(
104
+ param_group, unflatten_master_params(param_group, master_param.view(-1))
105
+ ):
106
+ assert name in state_dict
107
+ state_dict[name] = unflat_master_param
108
+ else:
109
+ state_dict = model.state_dict()
110
+ for i, (name, _value) in enumerate(model.named_parameters()):
111
+ assert name in state_dict
112
+ state_dict[name] = master_params[i]
113
+ return state_dict
114
+
115
+
116
+ def state_dict_to_master_params(model, state_dict, use_fp16):
117
+ if use_fp16:
118
+ named_model_params = [
119
+ (name, state_dict[name]) for name, _ in model.named_parameters()
120
+ ]
121
+ param_groups_and_shapes = get_param_groups_and_shapes(named_model_params)
122
+ master_params = make_master_params(param_groups_and_shapes)
123
+ else:
124
+ master_params = [state_dict[name] for name, _ in model.named_parameters()]
125
+ return master_params
126
+
127
+
128
+ def zero_master_grads(master_params):
129
+ for param in master_params:
130
+ param.grad = None
131
+
132
+
133
+ def zero_grad(model_params):
134
+ for param in model_params:
135
+ # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
136
+ if param.grad is not None:
137
+ param.grad.detach_()
138
+ param.grad.zero_()
139
+
140
+
141
+ def param_grad_or_zeros(param):
142
+ if param.grad is not None:
143
+ return param.grad.data.detach()
144
+ else:
145
+ return th.zeros_like(param)
146
+
147
+
148
+ class MixedPrecisionTrainer:
149
+ def __init__(
150
+ self,
151
+ *,
152
+ model,
153
+ use_fp16=False,
154
+ fp16_scale_growth=1e-3,
155
+ initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
156
+ ):
157
+ self.model = model
158
+ self.use_fp16 = use_fp16
159
+ self.fp16_scale_growth = fp16_scale_growth
160
+
161
+ self.model_params = list(self.model.parameters())
162
+ self.master_params = self.model_params
163
+ self.param_groups_and_shapes = None
164
+ self.lg_loss_scale = initial_lg_loss_scale
165
+
166
+ if self.use_fp16:
167
+ self.param_groups_and_shapes = get_param_groups_and_shapes(
168
+ self.model.named_parameters()
169
+ )
170
+ self.master_params = make_master_params(self.param_groups_and_shapes)
171
+ self.model.convert_to_fp16()
172
+
173
+ def zero_grad(self):
174
+ zero_grad(self.model_params)
175
+
176
+ def backward(self, loss: th.Tensor):
177
+ if self.use_fp16:
178
+ loss_scale = 2 ** self.lg_loss_scale
179
+ (loss * loss_scale).backward()
180
+ else:
181
+ loss.backward()
182
+
183
+ def optimize(self, opt: th.optim.Optimizer):
184
+ if self.use_fp16:
185
+ return self._optimize_fp16(opt)
186
+ else:
187
+ return self._optimize_normal(opt)
188
+
189
+ def _optimize_fp16(self, opt: th.optim.Optimizer):
190
+ logger.logkv_mean("lg_loss_scale", self.lg_loss_scale)
191
+ model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
192
+ grad_norm, param_norm = self._compute_norms(grad_scale=2 ** self.lg_loss_scale)
193
+ if check_overflow(grad_norm):
194
+ self.lg_loss_scale -= 1
195
+ logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
196
+ zero_master_grads(self.master_params)
197
+ return False
198
+
199
+ logger.logkv_mean("grad_norm", grad_norm)
200
+ logger.logkv_mean("param_norm", param_norm)
201
+
202
+ self.master_params[0].grad.mul_(1.0 / (2 ** self.lg_loss_scale))
203
+ opt.step()
204
+ zero_master_grads(self.master_params)
205
+ master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
206
+ self.lg_loss_scale += self.fp16_scale_growth
207
+ return True
208
+
209
+ def _optimize_normal(self, opt: th.optim.Optimizer):
210
+ grad_norm, param_norm = self._compute_norms()
211
+ logger.logkv_mean("grad_norm", grad_norm)
212
+ logger.logkv_mean("param_norm", param_norm)
213
+ opt.step()
214
+ return True
215
+
216
+ def _compute_norms(self, grad_scale=1.0):
217
+ grad_norm = 0.0
218
+ param_norm = 0.0
219
+ for p in self.master_params:
220
+ with th.no_grad():
221
+ param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
222
+ if p.grad is not None:
223
+ grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
224
+ return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)
225
+
226
+ def master_params_to_state_dict(self, master_params):
227
+ return master_params_to_state_dict(
228
+ self.model, self.param_groups_and_shapes, master_params, self.use_fp16
229
+ )
230
+
231
+ def state_dict_to_master_params(self, state_dict):
232
+ return state_dict_to_master_params(self.model, state_dict, self.use_fp16)
233
+
234
+
235
+ def check_overflow(value):
236
+ return (value == float("inf")) or (value == -float("inf")) or (value != value)
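The intended training-step pattern for MixedPrecisionTrainer looks roughly like the sketch below; model, batch and compute_loss are placeholders:

    # Sketch; `model`, `batch` and `compute_loss` are placeholders.
    import torch as th

    trainer = MixedPrecisionTrainer(model=model, use_fp16=False)
    opt = th.optim.AdamW(trainer.master_params, lr=1e-4)

    trainer.zero_grad()
    loss = compute_loss(model, batch)   # any scalar loss
    trainer.backward(loss)
    took_step = trainer.optimize(opt)   # False when an fp16 step overflows and is skipped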
models/improved_ddpm/logger.py ADDED
@@ -0,0 +1,451 @@
1
+ """
2
+ Logger based on OpenAI baselines to avoid extra RL-based dependencies:
3
+ https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import os.path as osp
9
+ import json
10
+ import time
11
+ import datetime
12
+ import tempfile
13
+ import warnings
14
+ from collections import defaultdict
15
+ from contextlib import contextmanager
16
+
17
+ DEBUG = 10
18
+ INFO = 20
19
+ WARN = 30
20
+ ERROR = 40
21
+
22
+ DISABLED = 50
23
+
24
+
25
+ class KVWriter(object):
26
+ def writekvs(self, kvs):
27
+ raise NotImplementedError
28
+
29
+
30
+ class SeqWriter(object):
31
+ def writeseq(self, seq):
32
+ raise NotImplementedError
33
+
34
+
35
+ class HumanOutputFormat(KVWriter, SeqWriter):
36
+ def __init__(self, filename_or_file):
37
+ if isinstance(filename_or_file, str):
38
+ self.file = open(filename_or_file, "wt")
39
+ self.own_file = True
40
+ else:
41
+ assert hasattr(filename_or_file, "read"), (
42
+ "expected file or str, got %s" % filename_or_file
43
+ )
44
+ self.file = filename_or_file
45
+ self.own_file = False
46
+
47
+ def writekvs(self, kvs):
48
+ # Create strings for printing
49
+ key2str = {}
50
+ for (key, val) in sorted(kvs.items()):
51
+ if hasattr(val, "__float__"):
52
+ valstr = "%-8.3g" % val
53
+ else:
54
+ valstr = str(val)
55
+ key2str[self._truncate(key)] = self._truncate(valstr)
56
+
57
+ # Find max widths
58
+ if len(key2str) == 0:
59
+ print("WARNING: tried to write empty key-value dict")
60
+ return
61
+ else:
62
+ keywidth = max(map(len, key2str.keys()))
63
+ valwidth = max(map(len, key2str.values()))
64
+
65
+ # Write out the data
66
+ dashes = "-" * (keywidth + valwidth + 7)
67
+ lines = [dashes]
68
+ for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
69
+ lines.append(
70
+ "| %s%s | %s%s |"
71
+ % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val)))
72
+ )
73
+ lines.append(dashes)
74
+ self.file.write("\n".join(lines) + "\n")
75
+
76
+ # Flush the output to the file
77
+ self.file.flush()
78
+
79
+ def _truncate(self, s):
80
+ maxlen = 30
81
+ return s[: maxlen - 3] + "..." if len(s) > maxlen else s
82
+
83
+ def writeseq(self, seq):
84
+ seq = list(seq)
85
+ for (i, elem) in enumerate(seq):
86
+ self.file.write(elem)
87
+ if i < len(seq) - 1: # add space unless this is the last one
88
+ self.file.write(" ")
89
+ self.file.write("\n")
90
+ self.file.flush()
91
+
92
+ def close(self):
93
+ if self.own_file:
94
+ self.file.close()
95
+
96
+
97
+ class JSONOutputFormat(KVWriter):
98
+ def __init__(self, filename):
99
+ self.file = open(filename, "wt")
100
+
101
+ def writekvs(self, kvs):
102
+ for k, v in sorted(kvs.items()):
103
+ if hasattr(v, "dtype"):
104
+ kvs[k] = float(v)
105
+ self.file.write(json.dumps(kvs) + "\n")
106
+ self.file.flush()
107
+
108
+ def close(self):
109
+ self.file.close()
110
+
111
+
112
+ class CSVOutputFormat(KVWriter):
113
+ def __init__(self, filename):
114
+ self.file = open(filename, "w+t")
115
+ self.keys = []
116
+ self.sep = ","
117
+
118
+ def writekvs(self, kvs):
119
+ # Add our current row to the history
120
+ extra_keys = list(kvs.keys() - self.keys)
121
+ extra_keys.sort()
122
+ if extra_keys:
123
+ self.keys.extend(extra_keys)
124
+ self.file.seek(0)
125
+ lines = self.file.readlines()
126
+ self.file.seek(0)
127
+ for (i, k) in enumerate(self.keys):
128
+ if i > 0:
129
+ self.file.write(",")
130
+ self.file.write(k)
131
+ self.file.write("\n")
132
+ for line in lines[1:]:
133
+ self.file.write(line[:-1])
134
+ self.file.write(self.sep * len(extra_keys))
135
+ self.file.write("\n")
136
+ for (i, k) in enumerate(self.keys):
137
+ if i > 0:
138
+ self.file.write(",")
139
+ v = kvs.get(k)
140
+ if v is not None:
141
+ self.file.write(str(v))
142
+ self.file.write("\n")
143
+ self.file.flush()
144
+
145
+ def close(self):
146
+ self.file.close()
147
+
148
+
149
+ def make_output_format(format, ev_dir, log_suffix=""):
150
+ os.makedirs(ev_dir, exist_ok=True)
151
+ if format == "stdout":
152
+ return HumanOutputFormat(sys.stdout)
153
+ elif format == "log":
154
+ return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix))
155
+ elif format == "json":
156
+ return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix))
157
+ elif format == "csv":
158
+ return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix))
159
+ else:
160
+ raise ValueError("Unknown format specified: %s" % (format,))
161
+
162
+
163
+ # ================================================================
164
+ # API
165
+ # ================================================================
166
+
167
+
168
+ def logkv(key, val):
169
+ """
170
+ Log a value of some diagnostic
171
+ Call this once for each diagnostic quantity, each iteration
172
+ If called many times, last value will be used.
173
+ """
174
+ get_current().logkv(key, val)
175
+
176
+
177
+ def logkv_mean(key, val):
178
+ """
179
+ The same as logkv(), but if called many times, values averaged.
180
+ """
181
+ get_current().logkv_mean(key, val)
182
+
183
+
184
+ def logkvs(d):
185
+ """
186
+ Log a dictionary of key-value pairs
187
+ """
188
+ for (k, v) in d.items():
189
+ logkv(k, v)
190
+
191
+
192
+ def dumpkvs():
193
+ """
194
+ Write all of the diagnostics from the current iteration
195
+ """
196
+ return get_current().dumpkvs()
197
+
198
+
199
+ def getkvs():
200
+ return get_current().name2val
201
+
202
+
203
+ def log(*args, level=INFO):
204
+ """
205
+ Write the sequence of args, with no separators, to the console and output files (if you've configured an output file).
206
+ """
207
+ get_current().log(*args, level=level)
208
+
209
+
210
+ def debug(*args):
211
+ log(*args, level=DEBUG)
212
+
213
+
214
+ def info(*args):
215
+ log(*args, level=INFO)
216
+
217
+
218
+ def warn(*args):
219
+ log(*args, level=WARN)
220
+
221
+
222
+ def error(*args):
223
+ log(*args, level=ERROR)
224
+
225
+
226
+ def set_level(level):
227
+ """
228
+ Set logging threshold on current logger.
229
+ """
230
+ get_current().set_level(level)
231
+
232
+
233
+ def set_comm(comm):
234
+ get_current().set_comm(comm)
235
+
236
+
237
+ def get_dir():
238
+ """
239
+ Get directory that log files are being written to.
240
+ will be None if there is no output directory (i.e., if you didn't call start)
241
+ """
242
+ return get_current().get_dir()
243
+
244
+
245
+ record_tabular = logkv
246
+ dump_tabular = dumpkvs
247
+
248
+
249
+ @contextmanager
250
+ def profile_kv(scopename):
251
+ logkey = "wait_" + scopename
252
+ tstart = time.time()
253
+ try:
254
+ yield
255
+ finally:
256
+ get_current().name2val[logkey] += time.time() - tstart
257
+
258
+
259
+ def profile(n):
260
+ """
261
+ Usage:
262
+ @profile("my_func")
263
+ def my_func(): code
264
+ """
265
+
266
+ def decorator_with_name(func):
267
+ def func_wrapper(*args, **kwargs):
268
+ with profile_kv(n):
269
+ return func(*args, **kwargs)
270
+
271
+ return func_wrapper
272
+
273
+ return decorator_with_name
274
+
275
+
276
+ # ================================================================
277
+ # Backend
278
+ # ================================================================
279
+
280
+
281
+ def get_current():
282
+ if Logger.CURRENT is None:
283
+ _configure_default_logger()
284
+
285
+ return Logger.CURRENT
286
+
287
+
288
+ class Logger(object):
289
+ DEFAULT = None # A logger with no output files. (See right below class definition)
290
+ # So that you can still log to the terminal without setting up any output files
291
+ CURRENT = None # Current logger being used by the free functions above
292
+
293
+ def __init__(self, dir, output_formats, comm=None):
294
+ self.name2val = defaultdict(float) # values this iteration
295
+ self.name2cnt = defaultdict(int)
296
+ self.level = INFO
297
+ self.dir = dir
298
+ self.output_formats = output_formats
299
+ self.comm = comm
300
+
301
+ # Logging API, forwarded
302
+ # ----------------------------------------
303
+ def logkv(self, key, val):
304
+ self.name2val[key] = val
305
+
306
+ def logkv_mean(self, key, val):
307
+ oldval, cnt = self.name2val[key], self.name2cnt[key]
308
+ self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1)
309
+ self.name2cnt[key] = cnt + 1
310
+
311
+ def dumpkvs(self):
312
+ if self.comm is None:
313
+ d = self.name2val
314
+ else:
315
+ d = mpi_weighted_mean(
316
+ self.comm,
317
+ {
318
+ name: (val, self.name2cnt.get(name, 1))
319
+ for (name, val) in self.name2val.items()
320
+ },
321
+ )
322
+ if self.comm.rank != 0:
323
+ d["dummy"] = 1 # so we don't get a warning about empty dict
324
+ out = d.copy() # Return the dict for unit testing purposes
325
+ for fmt in self.output_formats:
326
+ if isinstance(fmt, KVWriter):
327
+ fmt.writekvs(d)
328
+ self.name2val.clear()
329
+ self.name2cnt.clear()
330
+ return out
331
+
332
+ def log(self, *args, level=INFO):
333
+ if self.level <= level:
334
+ self._do_log(args)
335
+
336
+ # Configuration
337
+ # ----------------------------------------
338
+ def set_level(self, level):
339
+ self.level = level
340
+
341
+ def set_comm(self, comm):
342
+ self.comm = comm
343
+
344
+ def get_dir(self):
345
+ return self.dir
346
+
347
+ def close(self):
348
+ for fmt in self.output_formats:
349
+ fmt.close()
350
+
351
+ # Misc
352
+ # ----------------------------------------
353
+ def _do_log(self, args):
354
+ for fmt in self.output_formats:
355
+ if isinstance(fmt, SeqWriter):
356
+ fmt.writeseq(map(str, args))
357
+
358
+
359
+ def get_rank_without_mpi_import():
360
+ # check environment variables here instead of importing mpi4py
361
+ # to avoid calling MPI_Init() when this module is imported
362
+ for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]:
363
+ if varname in os.environ:
364
+ return int(os.environ[varname])
365
+ return 0
366
+
367
+
368
+ def mpi_weighted_mean(comm, local_name2valcount):
369
+ """
370
+ Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110
371
+ Perform a weighted average over dicts that are each on a different node
372
+ Input: local_name2valcount: dict mapping key -> (value, count)
373
+ Returns: key -> mean
374
+ """
375
+ all_name2valcount = comm.gather(local_name2valcount)
376
+ if comm.rank == 0:
377
+ name2sum = defaultdict(float)
378
+ name2count = defaultdict(float)
379
+ for n2vc in all_name2valcount:
380
+ for (name, (val, count)) in n2vc.items():
381
+ try:
382
+ val = float(val)
383
+ except ValueError:
384
+ if comm.rank == 0:
385
+ warnings.warn(
386
+ "WARNING: tried to compute mean on non-float {}={}".format(
387
+ name, val
388
+ )
389
+ )
390
+ else:
391
+ name2sum[name] += val * count
392
+ name2count[name] += count
393
+ return {name: name2sum[name] / name2count[name] for name in name2sum}
394
+ else:
395
+ return {}
396
+
397
+
398
+ def configure(dir=None, format_strs=None, comm=None, log_suffix=""):
399
+ """
400
+ If comm is provided, average all numerical stats across that comm
401
+ """
402
+ if dir is None:
403
+ dir = os.getenv("OPENAI_LOGDIR")
404
+ if dir is None:
405
+ dir = osp.join(
406
+ tempfile.gettempdir(),
407
+ datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"),
408
+ )
409
+ assert isinstance(dir, str)
410
+ dir = os.path.expanduser(dir)
411
+ os.makedirs(os.path.expanduser(dir), exist_ok=True)
412
+
413
+ rank = get_rank_without_mpi_import()
414
+ if rank > 0:
415
+ log_suffix = log_suffix + "-rank%03i" % rank
416
+
417
+ if format_strs is None:
418
+ if rank == 0:
419
+ format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv").split(",")
420
+ else:
421
+ format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",")
422
+ format_strs = filter(None, format_strs)
423
+ output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
424
+
425
+ Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
426
+ if output_formats:
427
+ log("Logging to %s" % dir)
428
+
429
+
430
+ def _configure_default_logger():
431
+ configure()
432
+ Logger.DEFAULT = Logger.CURRENT
433
+
434
+
435
+ def reset():
436
+ if Logger.CURRENT is not Logger.DEFAULT:
437
+ Logger.CURRENT.close()
438
+ Logger.CURRENT = Logger.DEFAULT
439
+ log("Reset logger")
440
+
441
+
442
+ @contextmanager
443
+ def scoped_configure(dir=None, format_strs=None, comm=None):
444
+ prevlogger = Logger.CURRENT
445
+ configure(dir=dir, format_strs=format_strs, comm=comm)
446
+ try:
447
+ yield
448
+ finally:
449
+ Logger.CURRENT.close()
450
+ Logger.CURRENT = prevlogger
451
+
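A minimal sketch of the module-level API above (the output directory is a placeholder; OPENAI_LOGDIR / OPENAI_LOG_FORMAT can be used instead of explicit arguments):

    from models.improved_ddpm import logger   # import path follows the repo layout

    logger.configure(dir='./runs/log_demo', format_strs=['stdout', 'csv'])
    for step in range(3):
        logger.logkv('step', step)
        logger.logkv_mean('loss', 0.1 * step)
        logger.dumpkvs()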
models/improved_ddpm/nn.py ADDED
@@ -0,0 +1,170 @@
1
+ """
2
+ Various utilities for neural networks.
3
+ """
4
+
5
+ import math
6
+
7
+ import torch as th
8
+ import torch.nn as nn
9
+
10
+
11
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
12
+ class SiLU(nn.Module):
13
+ def forward(self, x):
14
+ return x * th.sigmoid(x)
15
+
16
+
17
+ class GroupNorm32(nn.GroupNorm):
18
+ def forward(self, x):
19
+ return super().forward(x.float()).type(x.dtype)
20
+
21
+
22
+ def conv_nd(dims, *args, **kwargs):
23
+ """
24
+ Create a 1D, 2D, or 3D convolution module.
25
+ """
26
+ if dims == 1:
27
+ return nn.Conv1d(*args, **kwargs)
28
+ elif dims == 2:
29
+ return nn.Conv2d(*args, **kwargs)
30
+ elif dims == 3:
31
+ return nn.Conv3d(*args, **kwargs)
32
+ raise ValueError(f"unsupported dimensions: {dims}")
33
+
34
+
35
+ def linear(*args, **kwargs):
36
+ """
37
+ Create a linear module.
38
+ """
39
+ return nn.Linear(*args, **kwargs)
40
+
41
+
42
+ def avg_pool_nd(dims, *args, **kwargs):
43
+ """
44
+ Create a 1D, 2D, or 3D average pooling module.
45
+ """
46
+ if dims == 1:
47
+ return nn.AvgPool1d(*args, **kwargs)
48
+ elif dims == 2:
49
+ return nn.AvgPool2d(*args, **kwargs)
50
+ elif dims == 3:
51
+ return nn.AvgPool3d(*args, **kwargs)
52
+ raise ValueError(f"unsupported dimensions: {dims}")
53
+
54
+
55
+ def update_ema(target_params, source_params, rate=0.99):
56
+ """
57
+ Update target parameters to be closer to those of source parameters using
58
+ an exponential moving average.
59
+
60
+ :param target_params: the target parameter sequence.
61
+ :param source_params: the source parameter sequence.
62
+ :param rate: the EMA rate (closer to 1 means slower).
63
+ """
64
+ for targ, src in zip(target_params, source_params):
65
+ targ.detach().mul_(rate).add_(src, alpha=1 - rate)
66
+
67
+
68
+ def zero_module(module):
69
+ """
70
+ Zero out the parameters of a module and return it.
71
+ """
72
+ for p in module.parameters():
73
+ p.detach().zero_()
74
+ return module
75
+
76
+
77
+ def scale_module(module, scale):
78
+ """
79
+ Scale the parameters of a module and return it.
80
+ """
81
+ for p in module.parameters():
82
+ p.detach().mul_(scale)
83
+ return module
84
+
85
+
86
+ def mean_flat(tensor):
87
+ """
88
+ Take the mean over all non-batch dimensions.
89
+ """
90
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
91
+
92
+
93
+ def normalization(channels):
94
+ """
95
+ Make a standard normalization layer.
96
+
97
+ :param channels: number of input channels.
98
+ :return: an nn.Module for normalization.
99
+ """
100
+ return GroupNorm32(32, channels)
101
+
102
+
103
+ def timestep_embedding(timesteps, dim, max_period=10000):
104
+ """
105
+ Create sinusoidal timestep embeddings.
106
+
107
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
108
+ These may be fractional.
109
+ :param dim: the dimension of the output.
110
+ :param max_period: controls the minimum frequency of the embeddings.
111
+ :return: an [N x dim] Tensor of positional embeddings.
112
+ """
113
+ half = dim // 2
114
+ freqs = th.exp(
115
+ -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
116
+ ).to(device=timesteps.device)
117
+ args = timesteps[:, None].float() * freqs[None]
118
+ embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
119
+ if dim % 2:
120
+ embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
121
+ return embedding
122
+
123
+
124
+ def checkpoint(func, inputs, params, flag):
125
+ """
126
+ Evaluate a function without caching intermediate activations, allowing for
127
+ reduced memory at the expense of extra compute in the backward pass.
128
+
129
+ :param func: the function to evaluate.
130
+ :param inputs: the argument sequence to pass to `func`.
131
+ :param params: a sequence of parameters `func` depends on but does not
132
+ explicitly take as arguments.
133
+ :param flag: if False, disable gradient checkpointing.
134
+ """
135
+ if flag:
136
+ args = tuple(inputs) + tuple(params)
137
+ return CheckpointFunction.apply(func, len(inputs), *args)
138
+ else:
139
+ return func(*inputs)
140
+
141
+
142
+ class CheckpointFunction(th.autograd.Function):
143
+ @staticmethod
144
+ def forward(ctx, run_function, length, *args):
145
+ ctx.run_function = run_function
146
+ ctx.input_tensors = list(args[:length])
147
+ ctx.input_params = list(args[length:])
148
+ with th.no_grad():
149
+ output_tensors = ctx.run_function(*ctx.input_tensors)
150
+ return output_tensors
151
+
152
+ @staticmethod
153
+ def backward(ctx, *output_grads):
154
+ ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
155
+ with th.enable_grad():
156
+ # Fixes a bug where the first op in run_function modifies the
157
+ # Tensor storage in place, which is not allowed for detach()'d
158
+ # Tensors.
159
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
160
+ output_tensors = ctx.run_function(*shallow_copies)
161
+ input_grads = th.autograd.grad(
162
+ output_tensors,
163
+ ctx.input_tensors + ctx.input_params,
164
+ output_grads,
165
+ allow_unused=True,
166
+ )
167
+ del ctx.input_tensors
168
+ del ctx.input_params
169
+ del output_tensors
170
+ return (None, None) + input_grads
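checkpoint() trades compute for memory by re-running the wrapped function during the backward pass when flag is True; a minimal sketch:

    import torch as th
    import torch.nn as nn

    layer = nn.Linear(16, 16)
    x = th.randn(2, 16, requires_grad=True)

    def run(inp):
        return layer(inp).relu()

    out = checkpoint(run, (x,), list(layer.parameters()), True)
    out.sum().backward()   # gradients reach both x and the layer's parameters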
models/improved_ddpm/script_util.py ADDED
@@ -0,0 +1,109 @@
1
+ from .unet import UNetModel
2
+
3
+ NUM_CLASSES = 1000
4
+
5
+ AFHQ_DICT = dict(
6
+ attention_resolutions="16",
7
+ class_cond=False,
8
+ dropout=0.0,
9
+ image_size=256,
10
+ learn_sigma=True,
11
+ num_channels=128,
12
+ num_head_channels=64,
13
+ num_res_blocks=1,
14
+ resblock_updown=True,
15
+ use_fp16=False,
16
+ use_scale_shift_norm=True,
17
+ num_heads=4,
18
+ num_heads_upsample=-1,
19
+ channel_mult="",
20
+ use_checkpoint=False,
21
+ use_new_attention_order=False,
22
+ )
23
+
24
+
25
+ IMAGENET_DICT = dict(
26
+ attention_resolutions="32,16,8",
27
+ class_cond=True,
28
+ image_size=512,
29
+ learn_sigma=True,
30
+ num_channels=256,
31
+ num_head_channels=64,
32
+ num_res_blocks=2,
33
+ resblock_updown=True,
34
+ use_fp16=False,
35
+ use_scale_shift_norm=True,
36
+ dropout=0.0,
37
+ num_heads=4,
38
+ num_heads_upsample=-1,
39
+ channel_mult="",
40
+ use_checkpoint=False,
41
+ use_new_attention_order=False,
42
+ )
43
+
44
+
45
+ def create_model(
46
+ image_size,
47
+ num_channels,
48
+ num_res_blocks,
49
+ channel_mult="",
50
+ learn_sigma=False,
51
+ class_cond=False,
52
+ use_checkpoint=False,
53
+ attention_resolutions="16",
54
+ num_heads=1,
55
+ num_head_channels=-1,
56
+ num_heads_upsample=-1,
57
+ use_scale_shift_norm=False,
58
+ dropout=0,
59
+ resblock_updown=False,
60
+ use_fp16=False,
61
+ use_new_attention_order=False,
62
+ ):
63
+ if channel_mult == "":
64
+ if image_size == 512:
65
+ channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
66
+ elif image_size == 256:
67
+ channel_mult = (1, 1, 2, 2, 4, 4)
68
+ elif image_size == 128:
69
+ channel_mult = (1, 1, 2, 3, 4)
70
+ elif image_size == 64:
71
+ channel_mult = (1, 2, 3, 4)
72
+ else:
73
+ raise ValueError(f"unsupported image size: {image_size}")
74
+ else:
75
+ channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(","))
76
+
77
+ attention_ds = []
78
+ for res in attention_resolutions.split(","):
79
+ attention_ds.append(image_size // int(res))
80
+
81
+ return UNetModel(
82
+ image_size=image_size,
83
+ in_channels=3,
84
+ model_channels=num_channels,
85
+ out_channels=(3 if not learn_sigma else 6),
86
+ num_res_blocks=num_res_blocks,
87
+ attention_resolutions=tuple(attention_ds),
88
+ dropout=dropout,
89
+ channel_mult=channel_mult,
90
+ num_classes=(NUM_CLASSES if class_cond else None),
91
+ use_checkpoint=use_checkpoint,
92
+ use_fp16=use_fp16,
93
+ num_heads=num_heads,
94
+ num_head_channels=num_head_channels,
95
+ num_heads_upsample=num_heads_upsample,
96
+ use_scale_shift_norm=use_scale_shift_norm,
97
+ resblock_updown=resblock_updown,
98
+ use_new_attention_order=use_new_attention_order,
99
+ )
100
+
101
+
102
+ def i_DDPM(dataset_name = 'AFHQ'):
103
+ if dataset_name in ['AFHQ', 'FFHQ']:
104
+ return create_model(**AFHQ_DICT)
105
+ elif dataset_name == 'IMAGENET':
106
+ return create_model(**IMAGENET_DICT)
107
+ else:
108
+ print('Not implemented.')
109
+ exit()
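A short instantiation sketch for the factory above (the checkpoint path and loading step are illustrative assumptions, not part of this file):

```python
import torch
from models.improved_ddpm.script_util import i_DDPM

model = i_DDPM('FFHQ')   # builds create_model(**AFHQ_DICT): 256x256, learn_sigma=True -> 6 output channels
# state = torch.load('pretrained/ffhq_weights.pt', map_location='cpu')  # hypothetical checkpoint path
# model.load_state_dict(state)
model.eval()
```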
models/improved_ddpm/unet.py ADDED
@@ -0,0 +1,677 @@
1
+ """
2
+ Codebase for "Improved Denoising Diffusion Probabilistic Models".
3
+ """
4
+
5
+
6
+ from abc import abstractmethod
7
+
8
+ import math
9
+
10
+ import numpy as np
11
+ import torch as th
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+
15
+ from .fp16_util import convert_module_to_f16, convert_module_to_f32
16
+ from .nn import (
17
+ checkpoint,
18
+ conv_nd,
19
+ linear,
20
+ avg_pool_nd,
21
+ zero_module,
22
+ normalization,
23
+ timestep_embedding,
24
+ )
25
+
26
+
27
+ class AttentionPool2d(nn.Module):
28
+ """
29
+ Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ spacial_dim: int,
35
+ embed_dim: int,
36
+ num_heads_channels: int,
37
+ output_dim: int = None,
38
+ ):
39
+ super().__init__()
40
+ self.positional_embedding = nn.Parameter(
41
+ th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5
42
+ )
43
+ self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
44
+ self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
45
+ self.num_heads = embed_dim // num_heads_channels
46
+ self.attention = QKVAttention(self.num_heads)
47
+
48
+ def forward(self, x):
49
+ b, c, *_spatial = x.shape
50
+ x = x.reshape(b, c, -1) # NC(HW)
51
+ x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1)
52
+ x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1)
53
+ x = self.qkv_proj(x)
54
+ x = self.attention(x)
55
+ x = self.c_proj(x)
56
+ return x[:, :, 0]
57
+
58
+
59
+ class TimestepBlock(nn.Module):
60
+ """
61
+ Any module where forward() takes timestep embeddings as a second argument.
62
+ """
63
+
64
+ @abstractmethod
65
+ def forward(self, x, emb):
66
+ """
67
+ Apply the module to `x` given `emb` timestep embeddings.
68
+ """
69
+
70
+
71
+ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
72
+ """
73
+ A sequential module that passes timestep embeddings to the children that
74
+ support it as an extra input.
75
+ """
76
+
77
+ def forward(self, x, emb):
78
+ for layer in self:
79
+ if isinstance(layer, TimestepBlock):
80
+ x = layer(x, emb)
81
+ else:
82
+ x = layer(x)
83
+ return x
84
+
85
+
86
+ class Upsample(nn.Module):
87
+ """
88
+ An upsampling layer with an optional convolution.
89
+
90
+ :param channels: channels in the inputs and outputs.
91
+ :param use_conv: a bool determining if a convolution is applied.
92
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
93
+ upsampling occurs in the inner-two dimensions.
94
+ """
95
+
96
+ def __init__(self, channels, use_conv, dims=2, out_channels=None):
97
+ super().__init__()
98
+ self.channels = channels
99
+ self.out_channels = out_channels or channels
100
+ self.use_conv = use_conv
101
+ self.dims = dims
102
+ if use_conv:
103
+ self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
104
+
105
+ def forward(self, x):
106
+ assert x.shape[1] == self.channels
107
+ if self.dims == 3:
108
+ x = F.interpolate(
109
+ x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
110
+ )
111
+ else:
112
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
113
+ if self.use_conv:
114
+ x = self.conv(x)
115
+ return x
116
+
117
+
118
+ class Downsample(nn.Module):
119
+ """
120
+ A downsampling layer with an optional convolution.
121
+
122
+ :param channels: channels in the inputs and outputs.
123
+ :param use_conv: a bool determining if a convolution is applied.
124
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
125
+ downsampling occurs in the inner-two dimensions.
126
+ """
127
+
128
+ def __init__(self, channels, use_conv, dims=2, out_channels=None):
129
+ super().__init__()
130
+ self.channels = channels
131
+ self.out_channels = out_channels or channels
132
+ self.use_conv = use_conv
133
+ self.dims = dims
134
+ stride = 2 if dims != 3 else (1, 2, 2)
135
+ if use_conv:
136
+ self.op = conv_nd(
137
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=1
138
+ )
139
+ else:
140
+ assert self.channels == self.out_channels
141
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
142
+
143
+ def forward(self, x):
144
+ assert x.shape[1] == self.channels
145
+ return self.op(x)
146
+
147
+
148
+ class ResBlock(TimestepBlock):
149
+ """
150
+ A residual block that can optionally change the number of channels.
151
+
152
+ :param channels: the number of input channels.
153
+ :param emb_channels: the number of timestep embedding channels.
154
+ :param dropout: the rate of dropout.
155
+ :param out_channels: if specified, the number of out channels.
156
+ :param use_conv: if True and out_channels is specified, use a spatial
157
+ convolution instead of a smaller 1x1 convolution to change the
158
+ channels in the skip connection.
159
+ :param dims: determines if the signal is 1D, 2D, or 3D.
160
+ :param use_checkpoint: if True, use gradient checkpointing on this module.
161
+ :param up: if True, use this block for upsampling.
162
+ :param down: if True, use this block for downsampling.
163
+ """
164
+
165
+ def __init__(
166
+ self,
167
+ channels,
168
+ emb_channels,
169
+ dropout,
170
+ out_channels=None,
171
+ use_conv=False,
172
+ use_scale_shift_norm=False,
173
+ dims=2,
174
+ use_checkpoint=False,
175
+ up=False,
176
+ down=False,
177
+ ):
178
+ super().__init__()
179
+ self.channels = channels
180
+ self.emb_channels = emb_channels
181
+ self.dropout = dropout
182
+ self.out_channels = out_channels or channels
183
+ self.use_conv = use_conv
184
+ self.use_checkpoint = use_checkpoint
185
+ self.use_scale_shift_norm = use_scale_shift_norm
186
+
187
+ self.in_layers = nn.Sequential(
188
+ normalization(channels),
189
+ nn.SiLU(),
190
+ conv_nd(dims, channels, self.out_channels, 3, padding=1),
191
+ )
192
+
193
+ self.updown = up or down
194
+
195
+ if up:
196
+ self.h_upd = Upsample(channels, False, dims)
197
+ self.x_upd = Upsample(channels, False, dims)
198
+ elif down:
199
+ self.h_upd = Downsample(channels, False, dims)
200
+ self.x_upd = Downsample(channels, False, dims)
201
+ else:
202
+ self.h_upd = self.x_upd = nn.Identity()
203
+
204
+ self.emb_layers = nn.Sequential(
205
+ nn.SiLU(),
206
+ linear(
207
+ emb_channels,
208
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
209
+ ),
210
+ )
211
+ self.out_layers = nn.Sequential(
212
+ normalization(self.out_channels),
213
+ nn.SiLU(),
214
+ nn.Dropout(p=dropout),
215
+ zero_module(
216
+ conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
217
+ ),
218
+ )
219
+
220
+ if self.out_channels == channels:
221
+ self.skip_connection = nn.Identity()
222
+ elif use_conv:
223
+ self.skip_connection = conv_nd(
224
+ dims, channels, self.out_channels, 3, padding=1
225
+ )
226
+ else:
227
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
228
+
229
+ def forward(self, x, emb):
230
+ """
231
+ Apply the block to a Tensor, conditioned on a timestep embedding.
232
+
233
+ :param x: an [N x C x ...] Tensor of features.
234
+ :param emb: an [N x emb_channels] Tensor of timestep embeddings.
235
+ :return: an [N x C x ...] Tensor of outputs.
236
+ """
237
+ return checkpoint(
238
+ self._forward, (x, emb), self.parameters(), self.use_checkpoint
239
+ )
240
+
241
+ def _forward(self, x, emb):
242
+ if self.updown:
243
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
244
+ h = in_rest(x)
245
+ h = self.h_upd(h)
246
+ x = self.x_upd(x)
247
+ h = in_conv(h)
248
+ else:
249
+ h = self.in_layers(x)
250
+ emb_out = self.emb_layers(emb).type(h.dtype)
251
+ while len(emb_out.shape) < len(h.shape):
252
+ emb_out = emb_out[..., None]
253
+ if self.use_scale_shift_norm:
254
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
255
+ scale, shift = th.chunk(emb_out, 2, dim=1)
256
+ h = out_norm(h) * (1 + scale) + shift
257
+ h = out_rest(h)
258
+ else:
259
+ h = h + emb_out
260
+ h = self.out_layers(h)
261
+ return self.skip_connection(x) + h
262
+
263
+
264
+ class AttentionBlock(nn.Module):
265
+ """
266
+ An attention block that allows spatial positions to attend to each other.
267
+
268
+ Originally ported from here, but adapted to the N-d case.
269
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
270
+ """
271
+
272
+ def __init__(
273
+ self,
274
+ channels,
275
+ num_heads=1,
276
+ num_head_channels=-1,
277
+ use_checkpoint=False,
278
+ use_new_attention_order=False,
279
+ ):
280
+ super().__init__()
281
+ self.channels = channels
282
+ if num_head_channels == -1:
283
+ self.num_heads = num_heads
284
+ else:
285
+ assert (
286
+ channels % num_head_channels == 0
287
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
288
+ self.num_heads = channels // num_head_channels
289
+ self.use_checkpoint = use_checkpoint
290
+ self.norm = normalization(channels)
291
+ self.qkv = conv_nd(1, channels, channels * 3, 1)
292
+ if use_new_attention_order:
293
+ # split qkv before split heads
294
+ self.attention = QKVAttention(self.num_heads)
295
+ else:
296
+ # split heads before split qkv
297
+ self.attention = QKVAttentionLegacy(self.num_heads)
298
+
299
+ self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
300
+
301
+ def forward(self, x):
302
+ return checkpoint(self._forward, (x,), self.parameters(), True)
303
+
304
+ def _forward(self, x):
305
+ b, c, *spatial = x.shape
306
+ x = x.reshape(b, c, -1)
307
+ qkv = self.qkv(self.norm(x))
308
+ h = self.attention(qkv)
309
+ h = self.proj_out(h)
310
+ return (x + h).reshape(b, c, *spatial)
311
+
312
+
313
+ def count_flops_attn(model, _x, y):
314
+ """
315
+ A counter for the `thop` package to count the operations in an
316
+ attention operation.
317
+ Meant to be used like:
318
+ macs, params = thop.profile(
319
+ model,
320
+ inputs=(inputs, timestamps),
321
+ custom_ops={QKVAttention: QKVAttention.count_flops},
322
+ )
323
+ """
324
+ b, c, *spatial = y[0].shape
325
+ num_spatial = int(np.prod(spatial))
326
+ # We perform two matmuls with the same number of ops.
327
+ # The first computes the weight matrix, the second computes
328
+ # the combination of the value vectors.
329
+ matmul_ops = 2 * b * (num_spatial ** 2) * c
330
+ model.total_ops += th.DoubleTensor([matmul_ops])
331
+
332
+
333
+ class QKVAttentionLegacy(nn.Module):
334
+ """
335
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
336
+ """
337
+
338
+ def __init__(self, n_heads):
339
+ super().__init__()
340
+ self.n_heads = n_heads
341
+
342
+ def forward(self, qkv):
343
+ """
344
+ Apply QKV attention.
345
+
346
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
347
+ :return: an [N x (H * C) x T] tensor after attention.
348
+ """
349
+ bs, width, length = qkv.shape
350
+ assert width % (3 * self.n_heads) == 0
351
+ ch = width // (3 * self.n_heads)
352
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
353
+ scale = 1 / math.sqrt(math.sqrt(ch))
354
+ weight = th.einsum(
355
+ "bct,bcs->bts", q * scale, k * scale
356
+ ) # More stable with f16 than dividing afterwards
357
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
358
+ a = th.einsum("bts,bcs->bct", weight, v)
359
+ return a.reshape(bs, -1, length)
360
+
361
+ @staticmethod
362
+ def count_flops(model, _x, y):
363
+ return count_flops_attn(model, _x, y)
364
+
365
+
366
+ class QKVAttention(nn.Module):
367
+ """
368
+ A module which performs QKV attention and splits in a different order.
369
+ """
370
+
371
+ def __init__(self, n_heads):
372
+ super().__init__()
373
+ self.n_heads = n_heads
374
+
375
+ def forward(self, qkv):
376
+ """
377
+ Apply QKV attention.
378
+
379
+ :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
380
+ :return: an [N x (H * C) x T] tensor after attention.
381
+ """
382
+ bs, width, length = qkv.shape
383
+ assert width % (3 * self.n_heads) == 0
384
+ ch = width // (3 * self.n_heads)
385
+ q, k, v = qkv.chunk(3, dim=1)
386
+ scale = 1 / math.sqrt(math.sqrt(ch))
387
+ weight = th.einsum(
388
+ "bct,bcs->bts",
389
+ (q * scale).view(bs * self.n_heads, ch, length),
390
+ (k * scale).view(bs * self.n_heads, ch, length),
391
+ ) # More stable with f16 than dividing afterwards
392
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
393
+ a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
394
+ return a.reshape(bs, -1, length)
395
+
396
+ @staticmethod
397
+ def count_flops(model, _x, y):
398
+ return count_flops_attn(model, _x, y)
399
+
400
+
401
+ class UNetModel(nn.Module):
402
+ """
403
+ The full UNet model with attention and timestep embedding.
404
+
405
+ :param in_channels: channels in the input Tensor.
406
+ :param model_channels: base channel count for the model.
407
+ :param out_channels: channels in the output Tensor.
408
+ :param num_res_blocks: number of residual blocks per downsample.
409
+ :param attention_resolutions: a collection of downsample rates at which
410
+ attention will take place. May be a set, list, or tuple.
411
+ For example, if this contains 4, then at 4x downsampling, attention
412
+ will be used.
413
+ :param dropout: the dropout probability.
414
+ :param channel_mult: channel multiplier for each level of the UNet.
415
+ :param conv_resample: if True, use learned convolutions for upsampling and
416
+ downsampling.
417
+ :param dims: determines if the signal is 1D, 2D, or 3D.
418
+ :param num_classes: if specified (as an int), then this model will be
419
+ class-conditional with `num_classes` classes.
420
+ :param use_checkpoint: use gradient checkpointing to reduce memory usage.
421
+ :param num_heads: the number of attention heads in each attention layer.
422
+ :param num_heads_channels: if specified, ignore num_heads and instead use
423
+ a fixed channel width per attention head.
424
+ :param num_heads_upsample: works with num_heads to set a different number
425
+ of heads for upsampling. Deprecated.
426
+ :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
427
+ :param resblock_updown: use residual blocks for up/downsampling.
428
+ :param use_new_attention_order: use a different attention pattern for potentially
429
+ increased efficiency.
430
+ """
431
+
432
+ def __init__(
433
+ self,
434
+ image_size,
435
+ in_channels,
436
+ model_channels,
437
+ out_channels,
438
+ num_res_blocks,
439
+ attention_resolutions,
440
+ dropout=0,
441
+ channel_mult=(1, 2, 4, 8),
442
+ conv_resample=True,
443
+ dims=2,
444
+ num_classes=None,
445
+ use_checkpoint=False,
446
+ use_fp16=False,
447
+ num_heads=1,
448
+ num_head_channels=-1,
449
+ num_heads_upsample=-1,
450
+ use_scale_shift_norm=False,
451
+ resblock_updown=False,
452
+ use_new_attention_order=False,
453
+ ):
454
+ super().__init__()
455
+
456
+ if num_heads_upsample == -1:
457
+ num_heads_upsample = num_heads
458
+
459
+ self.image_size = image_size
460
+ self.in_channels = in_channels
461
+ self.model_channels = model_channels
462
+ self.out_channels = out_channels
463
+ self.num_res_blocks = num_res_blocks
464
+ self.attention_resolutions = attention_resolutions
465
+ self.dropout = dropout
466
+ self.channel_mult = channel_mult
467
+ self.conv_resample = conv_resample
468
+ self.num_classes = num_classes
469
+ self.use_checkpoint = use_checkpoint
470
+ self.dtype = th.float16 if use_fp16 else th.float32
471
+ self.num_heads = num_heads
472
+ self.num_head_channels = num_head_channels
473
+ self.num_heads_upsample = num_heads_upsample
474
+
475
+ time_embed_dim = model_channels * 4
476
+ self.time_embed = nn.Sequential(
477
+ linear(model_channels, time_embed_dim),
478
+ nn.SiLU(),
479
+ linear(time_embed_dim, time_embed_dim),
480
+ )
481
+
482
+ if self.num_classes is not None:
483
+ self.label_emb = nn.Embedding(num_classes, time_embed_dim)
484
+
485
+ ch = input_ch = int(channel_mult[0] * model_channels)
486
+ self.input_blocks = nn.ModuleList(
487
+ [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
488
+ )
489
+ self._feature_size = ch
490
+ input_block_chans = [ch]
491
+ ds = 1
492
+ for level, mult in enumerate(channel_mult):
493
+ for _ in range(num_res_blocks):
494
+ layers = [
495
+ ResBlock(
496
+ ch,
497
+ time_embed_dim,
498
+ dropout,
499
+ out_channels=int(mult * model_channels),
500
+ dims=dims,
501
+ use_checkpoint=use_checkpoint,
502
+ use_scale_shift_norm=use_scale_shift_norm,
503
+ )
504
+ ]
505
+ ch = int(mult * model_channels)
506
+ if ds in attention_resolutions:
507
+ layers.append(
508
+ AttentionBlock(
509
+ ch,
510
+ use_checkpoint=use_checkpoint,
511
+ num_heads=num_heads,
512
+ num_head_channels=num_head_channels,
513
+ use_new_attention_order=use_new_attention_order,
514
+ )
515
+ )
516
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
517
+ self._feature_size += ch
518
+ input_block_chans.append(ch)
519
+ if level != len(channel_mult) - 1:
520
+ out_ch = ch
521
+ self.input_blocks.append(
522
+ TimestepEmbedSequential(
523
+ ResBlock(
524
+ ch,
525
+ time_embed_dim,
526
+ dropout,
527
+ out_channels=out_ch,
528
+ dims=dims,
529
+ use_checkpoint=use_checkpoint,
530
+ use_scale_shift_norm=use_scale_shift_norm,
531
+ down=True,
532
+ )
533
+ if resblock_updown
534
+ else Downsample(
535
+ ch, conv_resample, dims=dims, out_channels=out_ch
536
+ )
537
+ )
538
+ )
539
+ ch = out_ch
540
+ input_block_chans.append(ch)
541
+ ds *= 2
542
+ self._feature_size += ch
543
+
544
+ self.middle_block = TimestepEmbedSequential(
545
+ ResBlock(
546
+ ch,
547
+ time_embed_dim,
548
+ dropout,
549
+ dims=dims,
550
+ use_checkpoint=use_checkpoint,
551
+ use_scale_shift_norm=use_scale_shift_norm,
552
+ ),
553
+ AttentionBlock(
554
+ ch,
555
+ use_checkpoint=use_checkpoint,
556
+ num_heads=num_heads,
557
+ num_head_channels=num_head_channels,
558
+ use_new_attention_order=use_new_attention_order,
559
+ ),
560
+ ResBlock(
561
+ ch,
562
+ time_embed_dim,
563
+ dropout,
564
+ dims=dims,
565
+ use_checkpoint=use_checkpoint,
566
+ use_scale_shift_norm=use_scale_shift_norm,
567
+ ),
568
+ )
569
+ self._feature_size += ch
570
+
571
+ self.output_blocks = nn.ModuleList([])
572
+ for level, mult in list(enumerate(channel_mult))[::-1]:
573
+ for i in range(num_res_blocks + 1):
574
+ ich = input_block_chans.pop()
575
+ layers = [
576
+ ResBlock(
577
+ ch + ich,
578
+ time_embed_dim,
579
+ dropout,
580
+ out_channels=int(model_channels * mult),
581
+ dims=dims,
582
+ use_checkpoint=use_checkpoint,
583
+ use_scale_shift_norm=use_scale_shift_norm,
584
+ )
585
+ ]
586
+ ch = int(model_channels * mult)
587
+ if ds in attention_resolutions:
588
+ layers.append(
589
+ AttentionBlock(
590
+ ch,
591
+ use_checkpoint=use_checkpoint,
592
+ num_heads=num_heads_upsample,
593
+ num_head_channels=num_head_channels,
594
+ use_new_attention_order=use_new_attention_order,
595
+ )
596
+ )
597
+ if level and i == num_res_blocks:
598
+ out_ch = ch
599
+ layers.append(
600
+ ResBlock(
601
+ ch,
602
+ time_embed_dim,
603
+ dropout,
604
+ out_channels=out_ch,
605
+ dims=dims,
606
+ use_checkpoint=use_checkpoint,
607
+ use_scale_shift_norm=use_scale_shift_norm,
608
+ up=True,
609
+ )
610
+ if resblock_updown
611
+ else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
612
+ )
613
+ ds //= 2
614
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
615
+ self._feature_size += ch
616
+
617
+ self.out = nn.Sequential(
618
+ normalization(ch),
619
+ nn.SiLU(),
620
+ zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
621
+ )
622
+
623
+ def convert_to_fp16(self):
624
+ """
625
+ Convert the torso of the model to float16.
626
+ """
627
+ self.input_blocks.apply(convert_module_to_f16)
628
+ self.middle_block.apply(convert_module_to_f16)
629
+ self.output_blocks.apply(convert_module_to_f16)
630
+
631
+ def convert_to_fp32(self):
632
+ """
633
+ Convert the torso of the model to float32.
634
+ """
635
+ self.input_blocks.apply(convert_module_to_f32)
636
+ self.middle_block.apply(convert_module_to_f32)
637
+ self.output_blocks.apply(convert_module_to_f32)
638
+
639
+ def forward(self, x, timesteps, y=None, ref_img=None, edit_h=None):
640
+ """
641
+ Apply the model to an input batch.
642
+
643
+ :param x: an [N x C x ...] Tensor of inputs.
644
+ :param timesteps: a 1-D batch of timesteps.
645
+ :param y: an [N] Tensor of labels, if class-conditional.
646
+ :return: an [N x C x ...] Tensor of outputs.
647
+ """
648
+ # assert (y is not None) == (
649
+ # self.num_classes is not None
650
+ # ), "must specify y if and only if the model is class-conditional"
651
+
652
+ hs = []
653
+ emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
654
+
655
+ # if self.num_classes is not None:
656
+ # assert y.shape == (x.shape[0],)
657
+ # emb = emb + self.label_emb(y)
658
+
659
+ h = x.type(self.dtype)
660
+ for module in self.input_blocks:
661
+ h = module(h, emb)
662
+ hs.append(h)
663
+ h = self.middle_block(h, emb)
664
+ mid_h = h.detach().clone() # get the bottleneck h space embedding
665
+ # print("check Unet:", mid_h.size()) # [1, 512, 8, 8]
666
+ # exit()
667
+ if edit_h is not None:
668
+ h = edit_h
669
+
670
+ for module in self.output_blocks:
671
+ h = th.cat([h, hs.pop()], dim=1)
672
+ h = module(h, emb)
673
+ h = h.type(x.dtype)
674
+ # print("check U-NET output:", h.size(), mid_h.size(), self.out(h).size()) # [1,3,256,256]
675
+ # exit()
676
+
677
+ return mid_h, self.out(h)
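A sketch of how this forward pass is typically driven (the untrained weights and the 0.1-scaled perturbation are illustrative assumptions): it returns the bottleneck activation (`mid_h`, the h-space feature) alongside the usual prediction, and an edited bottleneck can be pushed back in through `edit_h`.

```python
import torch as th
from models.improved_ddpm.script_util import i_DDPM

model = i_DDPM('FFHQ').eval()          # weight loading omitted
x_t = th.randn(1, 3, 256, 256)
t = th.tensor([500])
with th.no_grad():
    mid_h, out = model(x_t, t)                      # mid_h: [1, 512, 8, 8]; out: [1, 6, 256, 256] (eps, sigma)
    edited = mid_h + 0.1 * th.randn_like(mid_h)     # stand-in for a semantic boundary edit
    _, out_edit = model(x_t, t, edit_h=edited)      # decode from the edited bottleneck
```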
models/insight_face/__init__.py ADDED
File without changes
models/insight_face/helpers.py ADDED
@@ -0,0 +1,178 @@
1
+ from collections import namedtuple
2
+ import torch
3
+ from torch.nn import Conv2d, BatchNorm2d, PReLU, ReLU, Sigmoid, MaxPool2d, AdaptiveAvgPool2d, Sequential, Module
4
+
5
+ """
6
+ ArcFace implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch)
7
+ """
8
+
9
+
10
+
11
+
12
+ class Conv_block(Module):
13
+ def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
14
+ super(Conv_block, self).__init__()
15
+ self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False)
16
+ self.bn = BatchNorm2d(out_c)
17
+ self.prelu = PReLU(out_c)
18
+ def forward(self, x):
19
+ x = self.conv(x)
20
+ x = self.bn(x)
21
+ x = self.prelu(x)
22
+ return x
23
+
24
+ class Linear_block(Module):
25
+ def __init__(self, in_c, out_c, kernel=(1, 1), stride=(1, 1), padding=(0, 0), groups=1):
26
+ super(Linear_block, self).__init__()
27
+ self.conv = Conv2d(in_c, out_channels=out_c, kernel_size=kernel, groups=groups, stride=stride, padding=padding, bias=False)
28
+ self.bn = BatchNorm2d(out_c)
29
+ def forward(self, x):
30
+ x = self.conv(x)
31
+ x = self.bn(x)
32
+ return x
33
+
34
+ class Depth_Wise(Module):
35
+ def __init__(self, in_c, out_c, residual = False, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=1):
36
+ super(Depth_Wise, self).__init__()
37
+ self.conv = Conv_block(in_c, out_c=groups, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
38
+ self.conv_dw = Conv_block(groups, groups, groups=groups, kernel=kernel, padding=padding, stride=stride)
39
+ self.project = Linear_block(groups, out_c, kernel=(1, 1), padding=(0, 0), stride=(1, 1))
40
+ self.residual = residual
41
+ def forward(self, x):
42
+ if self.residual:
43
+ short_cut = x
44
+ x = self.conv(x)
45
+ x = self.conv_dw(x)
46
+ x = self.project(x)
47
+ if self.residual:
48
+ output = short_cut + x
49
+ else:
50
+ output = x
51
+ return output
52
+
53
+ class Residual(Module):
54
+ def __init__(self, c, num_block, groups, kernel=(3, 3), stride=(1, 1), padding=(1, 1)):
55
+ super(Residual, self).__init__()
56
+ modules = []
57
+ for _ in range(num_block):
58
+ modules.append(Depth_Wise(c, c, residual=True, kernel=kernel, padding=padding, stride=stride, groups=groups))
59
+ self.model = Sequential(*modules)
60
+ def forward(self, x):
61
+ return self.model(x)
62
+
63
+
64
+
65
+
66
+ ######################################################################################
67
+
68
+
69
+ class Flatten(Module):
70
+ def forward(self, input):
71
+ return input.view(input.size(0), -1)
72
+
73
+
74
+ def l2_norm(input, axis=1):
75
+ norm = torch.norm(input, 2, axis, True)
76
+ output = torch.div(input, norm)
77
+ return output
78
+
79
+
80
+ class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])):
81
+ """ A named tuple describing a ResNet block. """
82
+
83
+
84
+ def get_block(in_channel, depth, num_units, stride=2):
85
+ return [Bottleneck(in_channel, depth, stride)] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)]
86
+
87
+
88
+ def get_blocks(num_layers):
89
+ if num_layers == 50:
90
+ blocks = [
91
+ get_block(in_channel=64, depth=64, num_units=3),
92
+ get_block(in_channel=64, depth=128, num_units=4),
93
+ get_block(in_channel=128, depth=256, num_units=14),
94
+ get_block(in_channel=256, depth=512, num_units=3)
95
+ ]
96
+ elif num_layers == 100:
97
+ blocks = [
98
+ get_block(in_channel=64, depth=64, num_units=3),
99
+ get_block(in_channel=64, depth=128, num_units=13),
100
+ get_block(in_channel=128, depth=256, num_units=30),
101
+ get_block(in_channel=256, depth=512, num_units=3)
102
+ ]
103
+ elif num_layers == 152:
104
+ blocks = [
105
+ get_block(in_channel=64, depth=64, num_units=3),
106
+ get_block(in_channel=64, depth=128, num_units=8),
107
+ get_block(in_channel=128, depth=256, num_units=36),
108
+ get_block(in_channel=256, depth=512, num_units=3)
109
+ ]
110
+ else:
111
+ raise ValueError("Invalid number of layers: {}. Must be one of [50, 100, 152]".format(num_layers))
112
+ return blocks
113
+
114
+
115
+ class SEModule(Module):
116
+ def __init__(self, channels, reduction):
117
+ super(SEModule, self).__init__()
118
+ self.avg_pool = AdaptiveAvgPool2d(1)
119
+ self.fc1 = Conv2d(channels, channels // reduction, kernel_size=1, padding=0, bias=False)
120
+ self.relu = ReLU(inplace=True)
121
+ self.fc2 = Conv2d(channels // reduction, channels, kernel_size=1, padding=0, bias=False)
122
+ self.sigmoid = Sigmoid()
123
+
124
+ def forward(self, x):
125
+ module_input = x
126
+ x = self.avg_pool(x)
127
+ x = self.fc1(x)
128
+ x = self.relu(x)
129
+ x = self.fc2(x)
130
+ x = self.sigmoid(x)
131
+ return module_input * x
132
+
133
+
134
+ class bottleneck_IR(Module):
135
+ def __init__(self, in_channel, depth, stride):
136
+ super(bottleneck_IR, self).__init__()
137
+ if in_channel == depth:
138
+ self.shortcut_layer = MaxPool2d(1, stride)
139
+ else:
140
+ self.shortcut_layer = Sequential(
141
+ Conv2d(in_channel, depth, (1, 1), stride, bias=False),
142
+ BatchNorm2d(depth)
143
+ )
144
+ self.res_layer = Sequential(
145
+ BatchNorm2d(in_channel),
146
+ Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), PReLU(depth),
147
+ Conv2d(depth, depth, (3, 3), stride, 1, bias=False), BatchNorm2d(depth)
148
+ )
149
+
150
+ def forward(self, x):
151
+ shortcut = self.shortcut_layer(x)
152
+ res = self.res_layer(x)
153
+ return res + shortcut
154
+
155
+
156
+ class bottleneck_IR_SE(Module):
157
+ def __init__(self, in_channel, depth, stride):
158
+ super(bottleneck_IR_SE, self).__init__()
159
+ if in_channel == depth:
160
+ self.shortcut_layer = MaxPool2d(1, stride)
161
+ else:
162
+ self.shortcut_layer = Sequential(
163
+ Conv2d(in_channel, depth, (1, 1), stride, bias=False),
164
+ BatchNorm2d(depth)
165
+ )
166
+ self.res_layer = Sequential(
167
+ BatchNorm2d(in_channel),
168
+ Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False),
169
+ PReLU(depth),
170
+ Conv2d(depth, depth, (3, 3), stride, 1, bias=False),
171
+ BatchNorm2d(depth),
172
+ SEModule(depth, 16)
173
+ )
174
+
175
+ def forward(self, x):
176
+ shortcut = self.shortcut_layer(x)
177
+ res = self.res_layer(x)
178
+ return res + shortcut
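A tiny sketch of the block-plan helper above:

```python
from models.insight_face.helpers import get_blocks

blocks = get_blocks(50)                  # four stages of Bottleneck specs for the 50-layer backbone
print([len(stage) for stage in blocks])  # [3, 4, 14, 3]
```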
models/insight_face/model_irse.py ADDED
@@ -0,0 +1,124 @@
1
+ from torch.nn import Linear, Conv2d, BatchNorm1d, BatchNorm2d, PReLU, Dropout, Sequential, Module
2
+ from models.insight_face.helpers import get_blocks, Flatten, bottleneck_IR, bottleneck_IR_SE, l2_norm
3
+ from models.insight_face.helpers import Conv_block, Linear_block, Depth_Wise, Residual
4
+ """
5
+ Modified Backbone implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch)
6
+ """
7
+
8
+
9
+ class MobileFaceNet(Module):
10
+ def __init__(self, embedding_size):
11
+ super(MobileFaceNet, self).__init__()
12
+ self.conv1 = Conv_block(3, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1))
13
+ self.conv2_dw = Conv_block(64, 64, kernel=(3, 3), stride=(1, 1), padding=(1, 1), groups=64)
14
+ self.conv_23 = Depth_Wise(64, 64, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=128)
15
+ self.conv_3 = Residual(64, num_block=4, groups=128, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
16
+ self.conv_34 = Depth_Wise(64, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=256)
17
+ self.conv_4 = Residual(128, num_block=6, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
18
+ self.conv_45 = Depth_Wise(128, 128, kernel=(3, 3), stride=(2, 2), padding=(1, 1), groups=512)
19
+ self.conv_5 = Residual(128, num_block=2, groups=256, kernel=(3, 3), stride=(1, 1), padding=(1, 1))
20
+ self.conv_6_sep = Conv_block(128, 512, kernel=(1, 1), stride=(1, 1), padding=(0, 0))
21
+ self.conv_6_dw = Linear_block(512, 512, groups=512, kernel=(7, 7), stride=(1, 1), padding=(0, 0))
22
+ self.conv_6_flatten = Flatten()
23
+ self.linear = Linear(512, embedding_size, bias=False)
24
+ self.bn = BatchNorm1d(embedding_size)
25
+
26
+ def forward(self, x):
27
+ out = self.conv1(x)
28
+ out = self.conv2_dw(out)
29
+ out = self.conv_23(out)
30
+ out = self.conv_3(out)
31
+ out = self.conv_34(out)
32
+ out = self.conv_4(out)
33
+ out = self.conv_45(out)
34
+ out = self.conv_5(out)
35
+ out = self.conv_6_sep(out)
36
+ out = self.conv_6_dw(out)
37
+ out = self.conv_6_flatten(out)
38
+ out = self.linear(out)
39
+ out = self.bn(out)
40
+ return l2_norm(out)
41
+
42
+
43
+
44
+
45
+
46
+
47
+ ######################################################################################
48
+
49
+ class Backbone(Module):
50
+ def __init__(self, input_size, num_layers, mode='ir', drop_ratio=0.4, affine=True):
51
+ super(Backbone, self).__init__()
52
+ assert input_size in [112, 224], "input_size should be 112 or 224"
53
+ assert num_layers in [50, 100, 152], "num_layers should be 50, 100 or 152"
54
+ assert mode in ['ir', 'ir_se'], "mode should be ir or ir_se"
55
+ blocks = get_blocks(num_layers)
56
+ if mode == 'ir':
57
+ unit_module = bottleneck_IR
58
+ elif mode == 'ir_se':
59
+ unit_module = bottleneck_IR_SE
60
+ self.input_layer = Sequential(Conv2d(3, 64, (3, 3), 1, 1, bias=False),
61
+ BatchNorm2d(64),
62
+ PReLU(64))
63
+ if input_size == 112:
64
+ self.output_layer = Sequential(BatchNorm2d(512),
65
+ Dropout(drop_ratio),
66
+ Flatten(),
67
+ Linear(512 * 7 * 7, 512),
68
+ BatchNorm1d(512, affine=affine))
69
+ else:
70
+ self.output_layer = Sequential(BatchNorm2d(512),
71
+ Dropout(drop_ratio),
72
+ Flatten(),
73
+ Linear(512 * 14 * 14, 512),
74
+ BatchNorm1d(512, affine=affine))
75
+
76
+ modules = []
77
+ for block in blocks:
78
+ for bottleneck in block:
79
+ modules.append(unit_module(bottleneck.in_channel,
80
+ bottleneck.depth,
81
+ bottleneck.stride))
82
+ self.body = Sequential(*modules)
83
+
84
+ def forward(self, x):
85
+ x = self.input_layer(x)
86
+ x = self.body(x)
87
+ x = self.output_layer(x)
88
+ return l2_norm(x)
89
+
90
+
91
+ def IR_50(input_size):
92
+ """Constructs a ir-50 model."""
93
+ model = Backbone(input_size, num_layers=50, mode='ir', drop_ratio=0.4, affine=False)
94
+ return model
95
+
96
+
97
+ def IR_101(input_size):
98
+ """Constructs a ir-101 model."""
99
+ model = Backbone(input_size, num_layers=100, mode='ir', drop_ratio=0.4, affine=False)
100
+ return model
101
+
102
+
103
+ def IR_152(input_size):
104
+ """Constructs a ir-152 model."""
105
+ model = Backbone(input_size, num_layers=152, mode='ir', drop_ratio=0.4, affine=False)
106
+ return model
107
+
108
+
109
+ def IR_SE_50(input_size):
110
+ """Constructs a ir_se-50 model."""
111
+ model = Backbone(input_size, num_layers=50, mode='ir_se', drop_ratio=0.4, affine=False)
112
+ return model
113
+
114
+
115
+ def IR_SE_101(input_size):
116
+ """Constructs a ir_se-101 model."""
117
+ model = Backbone(input_size, num_layers=100, mode='ir_se', drop_ratio=0.4, affine=False)
118
+ return model
119
+
120
+
121
+ def IR_SE_152(input_size):
122
+ """Constructs a ir_se-152 model."""
123
+ model = Backbone(input_size, num_layers=152, mode='ir_se', drop_ratio=0.4, affine=False)
124
+ return model
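A usage sketch for the identity backbone (the checkpoint path is a hypothetical placeholder): it is normally loaded frozen and used to extract 512-d face embeddings, e.g. for an identity-preservation loss.

```python
import torch
from models.insight_face.model_irse import IR_SE_50

facenet = IR_SE_50(input_size=112)
# facenet.load_state_dict(torch.load('pretrained/model_ir_se50.pth', map_location='cpu'))  # hypothetical path
facenet.eval()
with torch.no_grad():
    feats = facenet(torch.randn(2, 3, 112, 112))    # L2-normalized embeddings
print(feats.shape)   # torch.Size([2, 512])
```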
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ cmake>=3.22.3
2
+ lmdb>=1.2.1
3
+ numpy>=1.19.2
4
+ Pillow>=8.4.0
5
+ PyYAML>=6.0
6
+ tqdm>=4.55.1
7
+ opencv_python>=4.5.2.52
8
+ ftfy>=6.0.3
9
+ regex>=2021.10.23
10
+ dlib>=19.22.1
utils/align_utils.py ADDED
@@ -0,0 +1,213 @@
1
+ """
2
+ brief: face alignment with FFHQ method (https://github.com/NVlabs/ffhq-dataset)
3
+ author: lzhbrian (https://lzhbrian.me)
4
+ date: 2020.1.5
5
+ note: code is heavily borrowed from
6
+ https://github.com/NVlabs/ffhq-dataset
7
+ http://dlib.net/face_landmark_detection.py.html
8
+
9
+ requirements:
10
+ apt install cmake
11
+ conda install Pillow numpy scipy
12
+ pip install dlib
13
+ # download face landmark model from:
14
+ # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
15
+ """
16
+ from argparse import ArgumentParser
17
+ import time
18
+ import numpy as np
19
+ import PIL
20
+ import PIL.Image
21
+ import os
22
+ import scipy
23
+ import scipy.ndimage
24
+ import dlib
25
+ import multiprocessing as mp
26
+ import math
27
+
28
+ from configs.paths_config import MODEL_PATHS
29
+
30
+ SHAPE_PREDICTOR_PATH = MODEL_PATHS["shape_predictor"]
31
+
32
+
33
+ def run_alignment(image_path, output_size):
34
+ if not os.path.exists("pretrained/shape_predictor_68_face_landmarks.dat"):
35
+ print('Downloading files for aligning face image...')
36
+ os.system(f'wget -P pretrained/ http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2')
37
+ os.system('bzip2 -dk pretrained/shape_predictor_68_face_landmarks.dat.bz2')
38
+ print('Done.')
39
+ predictor = dlib.shape_predictor("pretrained/shape_predictor_68_face_landmarks.dat")
40
+ aligned_image = align_face(filepath=image_path, predictor=predictor, output_size=output_size, transform_size=output_size)
41
+ print("Aligned image has shape: {}".format(aligned_image.size))
42
+ return aligned_image
43
+
44
+
45
+ def get_landmark(filepath, predictor):
46
+ """get landmark with dlib
47
+ :return: np.array shape=(68, 2)
48
+ """
49
+ detector = dlib.get_frontal_face_detector()
50
+
51
+ img = dlib.load_rgb_image(filepath)
52
+ dets = detector(img, 1)
53
+
54
+ for k, d in enumerate(dets):
55
+ shape = predictor(img, d)
56
+
57
+ t = list(shape.parts())
58
+ a = []
59
+ for tt in t:
60
+ a.append([tt.x, tt.y])
61
+ lm = np.array(a)
62
+ return lm
63
+
64
+
65
+ def align_face(filepath, predictor, output_size=256, transform_size=256):
66
+ """
67
+ :param filepath: str
68
+ :return: PIL Image
69
+ """
70
+
71
+ lm = get_landmark(filepath, predictor)
72
+
73
+ lm_chin = lm[0: 17] # left-right
74
+ lm_eyebrow_left = lm[17: 22] # left-right
75
+ lm_eyebrow_right = lm[22: 27] # left-right
76
+ lm_nose = lm[27: 31] # top-down
77
+ lm_nostrils = lm[31: 36] # top-down
78
+ lm_eye_left = lm[36: 42] # left-clockwise
79
+ lm_eye_right = lm[42: 48] # left-clockwise
80
+ lm_mouth_outer = lm[48: 60] # left-clockwise
81
+ lm_mouth_inner = lm[60: 68] # left-clockwise
82
+
83
+ # Calculate auxiliary vectors.
84
+ eye_left = np.mean(lm_eye_left, axis=0)
85
+ eye_right = np.mean(lm_eye_right, axis=0)
86
+ eye_avg = (eye_left + eye_right) * 0.5
87
+ eye_to_eye = eye_right - eye_left
88
+ mouth_left = lm_mouth_outer[0]
89
+ mouth_right = lm_mouth_outer[6]
90
+ mouth_avg = (mouth_left + mouth_right) * 0.5
91
+ eye_to_mouth = mouth_avg - eye_avg
92
+
93
+ # Choose oriented crop rectangle.
94
+ x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
95
+ x /= np.hypot(*x)
96
+ x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
97
+ y = np.flipud(x) * [-1, 1]
98
+ c = eye_avg + eye_to_mouth * 0.1
99
+ quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
100
+ qsize = np.hypot(*x) * 2
101
+
102
+ # read image
103
+ img = PIL.Image.open(filepath)
104
+ enable_padding = True
105
+
106
+ # Shrink.
107
+ shrink = int(np.floor(qsize / output_size * 0.5))
108
+ if shrink > 1:
109
+ rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
110
+ img = img.resize(rsize, PIL.Image.ANTIALIAS)
111
+ quad /= shrink
112
+ qsize /= shrink
113
+
114
+ # Crop.
115
+ border = max(int(np.rint(qsize * 0.1)), 3)
116
+ crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
117
+ int(np.ceil(max(quad[:, 1]))))
118
+ crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]),
119
+ min(crop[3] + border, img.size[1]))
120
+ if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
121
+ img = img.crop(crop)
122
+ quad -= crop[0:2]
123
+
124
+ # Pad.
125
+ pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
126
+ int(np.ceil(max(quad[:, 1]))))
127
+ pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0),
128
+ max(pad[3] - img.size[1] + border, 0))
129
+ if enable_padding and max(pad) > border - 4:
130
+ pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
131
+ img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
132
+ h, w, _ = img.shape
133
+ y, x, _ = np.ogrid[:h, :w, :1]
134
+ mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
135
+ 1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3]))
136
+ blur = qsize * 0.02
137
+ img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
138
+ img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
139
+ img = PIL.Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
140
+ quad += pad[:2]
141
+
142
+ # Transform.
143
+ img = img.transform((transform_size, transform_size), PIL.Image.QUAD, (quad + 0.5).flatten(), PIL.Image.BILINEAR)
144
+ if output_size < transform_size:
145
+ img = img.resize((output_size, output_size), PIL.Image.ANTIALIAS)
146
+
147
+ # Save aligned image.
148
+ return img
149
+
150
+
151
+ def chunks(lst, n):
152
+ """Yield successive n-sized chunks from lst."""
153
+ for i in range(0, len(lst), n):
154
+ yield lst[i:i + n]
155
+
156
+
157
+ def extract_on_paths(file_paths):
158
+ predictor = dlib.shape_predictor(SHAPE_PREDICTOR_PATH)
159
+ pid = mp.current_process().name
160
+ print('\t{} is starting to extract on #{} images'.format(pid, len(file_paths)))
161
+ tot_count = len(file_paths)
162
+ count = 0
163
+ for file_path, res_path in file_paths:
164
+ count += 1
165
+ if count % 100 == 0:
166
+ print('{} done with {}/{}'.format(pid, count, tot_count))
167
+ try:
168
+ res = align_face(file_path, predictor)
169
+ res = res.convert('RGB')
170
+ os.makedirs(os.path.dirname(res_path), exist_ok=True)
171
+ res.save(res_path)
172
+ except Exception:
173
+ continue
174
+ print('\tDone!')
175
+
176
+
177
+ def parse_args():
178
+ parser = ArgumentParser(add_help=False)
179
+ parser.add_argument('--num_threads', type=int, default=1)
180
+ parser.add_argument('--root_path', type=str, default='')
181
+ args = parser.parse_args()
182
+ return args
183
+
184
+
185
+ def run(args):
186
+ root_path = args.root_path
187
+ out_crops_path = root_path + '_crops'
188
+ if not os.path.exists(out_crops_path):
189
+ os.makedirs(out_crops_path, exist_ok=True)
190
+
191
+ file_paths = []
192
+ for root, dirs, files in os.walk(root_path):
193
+ for file in files:
194
+ file_path = os.path.join(root, file)
195
+ fname = os.path.join(out_crops_path, os.path.relpath(file_path, root_path))
196
+ res_path = '{}.jpg'.format(os.path.splitext(fname)[0])
197
+ if os.path.splitext(file_path)[1] == '.txt' or os.path.exists(res_path):
198
+ continue
199
+ file_paths.append((file_path, res_path))
200
+
201
+ file_chunks = list(chunks(file_paths, int(math.ceil(len(file_paths) / args.num_threads))))
202
+ print(len(file_chunks))
203
+ pool = mp.Pool(args.num_threads)
204
+ print('Running on {} paths\nHere we goooo'.format(len(file_paths)))
205
+ tic = time.time()
206
+ pool.map(extract_on_paths, file_chunks)
207
+ toc = time.time()
208
+ print('Mischief managed in {}s'.format(toc - tic))
209
+
210
+
211
+ if __name__ == '__main__':
212
+ args = parse_args()
213
+ run(args)
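A minimal call sketch (the image path is a placeholder): `run_alignment` downloads the dlib landmark model on first use and returns the FFHQ-style crop as a PIL image.

```python
from utils.align_utils import run_alignment

aligned = run_alignment('imgs/example.jpg', output_size=256)   # hypothetical input path
aligned.save('imgs/example_aligned.png')
```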
utils/celeba_attr.txt ADDED
@@ -0,0 +1,40 @@
1
+ 5_o_Clock_Shadow
2
+ Arched_Eyebrows
3
+ Attractive
4
+ Bags_Under_Eyes
5
+ Bald
6
+ Bangs
7
+ Big_Lips
8
+ Big_Nose
9
+ Black_Hair
10
+ Blond_Hair
11
+ Blurry
12
+ Brown_Hair
13
+ Bushy_Eyebrows
14
+ Chubby
15
+ Double_Chin
16
+ Eyeglasses
17
+ Goatee
18
+ Gray_Hair
19
+ Heavy_Makeup
20
+ High_Cheekbones
21
+ Male
22
+ Mouth_Slightly_Open
23
+ Mustache
24
+ Narrow_Eyes
25
+ No_Beard
26
+ Oval_Face
27
+ Pale_Skin
28
+ Pointy_Nose
29
+ Receding_Hairline
30
+ Rosy_Cheeks
31
+ Sideburns
32
+ Smiling
33
+ Straight_Hair
34
+ Wavy_Hair
35
+ Wearing_Earrings
36
+ Wearing_Hat
37
+ Wearing_Lipstick
38
+ Wearing_Necklace
39
+ Wearing_Necktie
40
+ Young
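A small sketch relating this attribute list to list_attr_celeba.txt (the file location is an assumption): column 0 of each annotation row is the image name, so the k-th attribute here sits at split index k, which is why utils/prepare_lmdb_data.py reads 'Male' with attr_label[21].

```python
with open('utils/celeba_attr.txt') as f:
    attrs = [line.strip() for line in f if line.strip()]

attr_index = {name: i + 1 for i, name in enumerate(attrs)}
print(attr_index['Male'])   # 21
```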
utils/colab_utils.py ADDED
@@ -0,0 +1,36 @@
1
+ from pydrive.auth import GoogleAuth
2
+ from pydrive.drive import GoogleDrive
3
+ from google.colab import auth
4
+ from oauth2client.client import GoogleCredentials
5
+ import os
6
+
7
+
8
+ class GoogleDrive_Dowonloader(object):
9
+ def __init__(self, use_pydrive):
10
+ self.use_pydrive = use_pydrive
11
+
12
+ if self.use_pydrive:
13
+ self.authenticate()
14
+
15
+ def authenticate(self):
16
+ auth.authenticate_user()
17
+ gauth = GoogleAuth()
18
+ gauth.credentials = GoogleCredentials.get_application_default()
19
+ self.drive = GoogleDrive(gauth)
20
+
21
+ def ensure_file_exists(self, file_id, file_dst):
22
+ if not os.path.isfile(file_dst):
23
+ if self.use_pydrive:
24
+ print(f'Downloading {file_dst} ...')
25
+ downloaded = self.drive.CreateFile({'id':file_id})
26
+ downloaded.FetchMetadata(fetch_all=True)
27
+ downloaded.GetContentFile(file_dst)
28
+ print('Finished')
29
+ else:
30
+ from gdown import download as drive_download
31
+ drive_download(f'https://drive.google.com/uc?id={file_id}', file_dst, quiet=False)
32
+ else:
33
+ print(f'{file_dst} exists.')
34
+
35
+
36
+
utils/diffusion_utils.py ADDED
@@ -0,0 +1,134 @@
 
1
+ import numpy as np
2
+ import torch
3
+
4
+
5
+ def get_beta_schedule(*, beta_start, beta_end, num_diffusion_timesteps):
6
+ betas = np.linspace(beta_start, beta_end,
7
+ num_diffusion_timesteps, dtype=np.float64)
8
+ assert betas.shape == (num_diffusion_timesteps,)
9
+ return betas
10
+
11
+
12
+ def extract(a, t, x_shape):
13
+ """Extract coefficients from a based on t and reshape to make it
14
+ broadcastable with x_shape."""
15
+ bs, = t.shape
16
+ assert x_shape[0] == bs
17
+ out = torch.gather(torch.tensor(a, dtype=torch.float, device=t.device), 0, t.long())
18
+ assert out.shape == (bs,)
19
+ out = out.reshape((bs,) + (1,) * (len(x_shape) - 1))
20
+ return out
21
+
22
+
23
+ def denoising_step(xt, t, t_next, *,
24
+ models,
25
+ logvars,
26
+ b,
27
+ sampling_type='ddpm',
28
+ eta=0.0,
29
+ learn_sigma=False,
30
+ hybrid=False,
31
+ hybrid_config=None,
32
+ ratio=1.0,
33
+ out_x0_t=False,
34
+ edit_h=None,
35
+ ):
36
+
37
+ # Compute noise and variance
38
+ if type(models) != list:
39
+ model = models
40
+ if edit_h is None:
41
+ mid_h, et = model(xt, t)
42
+ # print("check mid_h and et:", mid_h.size(), et.size())
43
+ else:
44
+ mid_h, et = model(xt, t, edit_h=edit_h)  # pass by keyword so it binds to the edit_h argument rather than an earlier positional
45
+ # print("Denoising for editing!")
46
+ if learn_sigma:
47
+ et, logvar_learned = torch.split(et, et.shape[1] // 2, dim=1)
48
+ logvar = logvar_learned
49
+ # print("split et:", et.size())
50
+ else:
51
+ logvar = extract(logvars, t, xt.shape)
52
+ else:
53
+ if not hybrid:
54
+ et = 0
55
+ logvar = 0
56
+ if ratio != 0.0:
57
+ et_i = ratio * models[1](xt, t)
58
+ if learn_sigma:
59
+ et_i, logvar_learned = torch.split(et_i, et_i.shape[1] // 2, dim=1)
60
+ logvar += logvar_learned
61
+ else:
62
+ logvar += ratio * extract(logvars, t, xt.shape)
63
+ et += et_i
64
+
65
+ if ratio != 1.0:
66
+ et_i = (1 - ratio) * models[0](xt, t)
67
+ if learn_sigma:
68
+ et_i, logvar_learned = torch.split(et_i, et_i.shape[1] // 2, dim=1)
69
+ logvar += logvar_learned
70
+ else:
71
+ logvar += (1 - ratio) * extract(logvars, t, xt.shape)
72
+ et += et_i
73
+
74
+ else:
75
+ for thr in list(hybrid_config.keys()):
76
+ if t.item() >= thr:
77
+ et = 0
78
+ logvar = 0
79
+ for i, ratio in enumerate(hybrid_config[thr]):
80
+ ratio /= sum(hybrid_config[thr])
81
+ et_i = models[i+1](xt, t)
82
+ if learn_sigma:
83
+ et_i, logvar_learned = torch.split(et_i, et_i.shape[1] // 2, dim=1)
84
+ logvar_i = logvar_learned
85
+ else:
86
+ logvar_i = extract(logvars, t, xt.shape)
87
+ et += ratio * et_i
88
+ logvar += ratio * logvar_i
89
+ break
90
+
91
+ # Compute the next x
92
+ bt = extract(b, t, xt.shape)
93
+ at = extract((1.0 - b).cumprod(dim=0), t, xt.shape)
94
+
95
+ if t_next.sum() == -t_next.shape[0]:
96
+ at_next = torch.ones_like(at)
97
+ else:
98
+ at_next = extract((1.0 - b).cumprod(dim=0), t_next, xt.shape)
99
+
100
+ xt_next = torch.zeros_like(xt)
101
+ if sampling_type == 'ddpm':
102
+ weight = bt / torch.sqrt(1 - at)
103
+
104
+ mean = 1 / torch.sqrt(1.0 - bt) * (xt - weight * et)
105
+ noise = torch.randn_like(xt)
106
+ mask = 1 - (t == 0).float()
107
+ mask = mask.reshape((xt.shape[0],) + (1,) * (len(xt.shape) - 1))
108
+ xt_next = mean + mask * torch.exp(0.5 * logvar) * noise
109
+ xt_next = xt_next.float()
110
+
111
+ elif sampling_type == 'ddim':
112
+ # print("check ddim incersion:", et.size())
113
+ x0_t = (xt - et * (1 - at).sqrt()) / at.sqrt()
114
+ if eta == 0:
115
+ xt_next = at_next.sqrt() * x0_t + (1 - at_next).sqrt() * et
116
+ elif at > (at_next):
117
+ print('Inversion process is only possible with eta = 0')
118
+ raise ValueError
119
+ else:
120
+ c1 = eta * ((1 - at / (at_next)) * (1 - at_next) / (1 - at)).sqrt()
121
+ c2 = ((1 - at_next) - c1 ** 2).sqrt()
122
+ xt_next = at_next.sqrt() * x0_t + c2 * et + c1 * torch.randn_like(xt)
123
+
124
+
125
+
126
+ # print("check out:", xt_next.size(), mid_h.size(), x0_t.size())
127
+ if out_x0_t == True:
128
+ # print("three output!")
129
+ return xt_next, x0_t, mid_h
130
+ else:
131
+ # print("two output!")
132
+ return xt_next, mid_h
133
+
134
+
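A sketch of a deterministic DDIM sampling loop built on `denoising_step` (the beta schedule, the 40-step trajectory, and the untrained model are illustrative assumptions):

```python
import numpy as np
import torch
from models.improved_ddpm.script_util import i_DDPM
from utils.diffusion_utils import get_beta_schedule, denoising_step

model = i_DDPM('FFHQ').eval()                      # weight loading omitted
betas = torch.from_numpy(get_beta_schedule(beta_start=0.0001, beta_end=0.02,
                                           num_diffusion_timesteps=1000)).float()
logvars = np.log(np.maximum(betas.numpy(), 1e-20)) # unused when learn_sigma=True, but required

seq = list(range(0, 1000, 25))                     # 40 DDIM steps
seq_prev = [-1] + seq[:-1]
x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    for t_cur, t_prev in zip(reversed(seq), reversed(seq_prev)):
        t = torch.full((x.size(0),), t_cur)
        t_next = torch.full((x.size(0),), t_prev)
        x, mid_h = denoising_step(x, t, t_next, models=model, logvars=logvars, b=betas,
                                  sampling_type='ddim', eta=0.0, learn_sigma=True)
```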
utils/prepare_lmdb_data.py ADDED
@@ -0,0 +1,140 @@
1
+ """
2
+ Refer to https://github.com/rosinality/stylegan2-pytorch/blob/master/prepare_data.py
3
+ """
4
+
5
+ import argparse
6
+ from io import BytesIO
7
+ import multiprocessing
8
+ from functools import partial
9
+ import os, glob, sys
10
+
11
+ from PIL import Image
12
+ import lmdb
13
+ from tqdm import tqdm
14
+ from torchvision import datasets
15
+ from torchvision.transforms import functional as trans_fn
16
+
17
+
18
+ def resize_and_convert(img, size, resample, quality=100):
19
+ img = trans_fn.resize(img, (size, size), resample)
20
+ # img = trans_fn.center_crop(img, size)
21
+ buffer = BytesIO()
22
+ img.save(buffer, format="jpeg", quality=quality)
23
+ val = buffer.getvalue()
24
+
25
+ return val
26
+
27
+
28
+ def resize_multiple(
29
+ img, sizes=(128, 256, 512, 1024), resample=Image.LANCZOS, quality=100
30
+ ):
31
+ imgs = []
32
+
33
+ for size in sizes:
34
+ imgs.append(resize_and_convert(img, size, resample, quality))
35
+
36
+ return imgs
37
+
38
+
39
+ def resize_worker(img_file, sizes, resample):
40
+ i, file, img_id = img_file
41
+ # print("check resize_worker:", i, file, img_id)
42
+ img = Image.open(file)
43
+ img = img.convert("RGB")
44
+ out = resize_multiple(img, sizes=sizes, resample=resample)
45
+
46
+ return i, out, img_id
47
+
48
+
49
+ def file_to_list(filename):
50
+ with open(filename, encoding='utf-8') as f:
51
+ files = f.readlines()
52
+ files = [f.rstrip() for f in files]
53
+ return files
54
+
55
+
56
+
57
+ def prepare(
58
+ env, dataset, n_worker, sizes=(128, 256, 512, 1024), resample=Image.LANCZOS
59
+ ):
60
+ resize_fn = partial(resize_worker, sizes=sizes, resample=resample)
61
+ files = sorted(dataset.imgs, key=lambda x: x[0])
62
+ files = [(i, file, file.split('/')[-1].split('.')[0]) for i, (file, label) in enumerate(files)]
63
+ total = 0
64
+
65
+ with multiprocessing.Pool(n_worker) as pool:
66
+ for i, imgs, img_id in tqdm(pool.imap_unordered(resize_fn, files)):
67
+ key_label = f"{str(i).zfill(5)}".encode("utf-8")
68
+ for size, img in zip(sizes, imgs):
69
+ key = f"{size}-{str(i).zfill(5)}".encode("utf-8")
70
+ with env.begin(write=True) as txn:
71
+ txn.put(key, img)
72
+ txn.put(key_label, str(img_id).encode("utf-8"))
73
+
74
+ total += 1
75
+
76
+ with env.begin(write=True) as txn:
77
+ txn.put("length".encode("utf-8"), str(total).encode("utf-8"))
78
+
79
+
80
+ def prepare_attr(
81
+ env, dataset, n_worker, sizes=(128, 256, 512, 1024), resample=Image.LANCZOS, label_attr='gender'
82
+ ):
83
+ resize_fn = partial(resize_worker, sizes=sizes, resample=resample)
84
+ files = sorted(dataset.imgs, key=lambda x: x[0])
85
+ attr_file_path = '/n/fs/yz-diff/inversion/list_attr_celeba.txt'
86
+ labels = file_to_list(attr_file_path)
87
+ attr_dict = {}
88
+ files_attr = []
89
+ for i, (file, split) in enumerate(files):
90
+ img_id = int(file.split('/')[-1].split('.')[0])
91
+ # print("check i, file, and split:", i, file, split, img_id)
92
+ attr_label = labels[img_id-1].split()
93
+ label = int(attr_label[21])
94
+ # print("check attr_label:", attr_label, len(attr_label), label)
95
+ files_attr.append((i, file, label))
96
+ # exit()
97
+
98
+ files = files_attr
99
+ # files = [(i, file) for i, (file, label) in enumerate(files)]
100
+ total = 0
101
+
102
+
103
+ with multiprocessing.Pool(n_worker) as pool:
104
+ for i, imgs, label in tqdm(pool.imap_unordered(resize_fn, files)):
105
+ # print("check i, imgs, label:", label)
106
+ for size, img in zip(sizes, imgs):
107
+ key = f"{size}-{str(i).zfill(5)}".encode("utf-8")
108
+ key_label = f"{'label'}-{str(i).zfill(5)}".encode("utf-8")
109
+
110
+ with env.begin(write=True) as txn:
111
+ txn.put(key, img)
112
+ txn.put(key_label, str(label).encode("utf-8"))
113
+
114
+ total += 1
115
+
116
+ with env.begin(write=True) as txn:
117
+ txn.put("length".encode("utf-8"), str(total).encode("utf-8"))
118
+
119
+
120
+ if __name__ == "__main__":
121
+ parser = argparse.ArgumentParser()
122
+ parser.add_argument("--out", type=str)
123
+ parser.add_argument("--size", type=str, default="128,256,512,1024")
124
+ parser.add_argument("--n_worker", type=int, default=5)
125
+ parser.add_argument("--resample", type=str, default="bilinear")
126
+ parser.add_argument("--attr", type=str)
127
+ parser.add_argument("path", type=str)
128
+
129
+ args = parser.parse_args()
130
+
131
+ resample_map = {"lanczos": Image.LANCZOS, "bilinear": Image.BILINEAR}
132
+ resample = resample_map[args.resample]
133
+
134
+ sizes = [int(s.strip()) for s in args.size.split(",")]
135
+ print(f"Make dataset of image sizes:", ", ".join(str(s) for s in sizes))
136
+
137
+ imgset = datasets.ImageFolder(args.path)
138
+
139
+ with lmdb.open(args.out, map_size=1024 ** 4, readahead=False) as env:
140
+ prepare(env, imgset, args.n_worker, sizes=sizes, resample=resample)
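After building the database with prepare(), entries can be read back as follows (the LMDB path is a placeholder); keys follow the f"{size}-{str(i).zfill(5)}" scheme written above.

```python
from io import BytesIO

import lmdb
from PIL import Image

with lmdb.open('celeba_hq_lmdb', readonly=True, lock=False, readahead=False) as env:   # hypothetical path
    with env.begin(write=False) as txn:
        length = int(txn.get("length".encode("utf-8")).decode("utf-8"))
        img_bytes = txn.get(f"256-{str(0).zfill(5)}".encode("utf-8"))

img = Image.open(BytesIO(img_bytes))
print(length, img.size)
```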
utils/text_dic.py ADDED
@@ -0,0 +1,123 @@
+ SRC_TRG_TXT_DIC = {
+     # Human face
+     'tanned': (['face'],
+                ['tanned face']),
+     'pale': (['face'],
+              ['pale face']),
+     'makeup': (['person'],
+                ['person with makeup']),
+     'no_makeup': (['person'],
+                   ['person without makeup']),
+     'old': (['person'],
+             ['old person']),
+     'young': (['person'],
+               ['young person']),
+     'beards': (['person'],
+                ['person with beards']),
+     'angry': (['face'],
+               ['angry face']),
+     'surprised': (['face'],
+                   ['surprised face']),
+     'smiling': (['face'],
+                 ['smiling face']),
+     'blond_hair': (['person'],
+                    ['person with blond hair']),
+     'red_hair': (['person'],
+                  ['person with red hair']),
+     'grey_hair': (['person'],
+                   ['person with grey hair']),
+     'curly_hair': (['person'],
+                    ['person with curly hair']),
+
+     'nicolas': (['Person'],
+                 ['Nicolas Cage']),
+     'zuckerberg': (['Person'],
+                    ['Mark Zuckerberg']),
+     'benedict': (['Person'],
+                  ['Benedict Cumberbatch']),
+     'gogh': (['photo'],
+              ['painting by Gogh']),
+     'frida': (['photo'],
+               ['self-portrait by Frida Kahlo']),
+     'modigliani': (['photo'],
+                    ['Painting in Modigliani style']),
+     'sketch': (['photo'],
+                ['sketch']),
+     'watercolor': (['photo'],
+                    ['Watercolor Art with Thick Brushstrokes']),
+     'elf': (['Human'],
+             ['Tolkien elf']),
+     'super_saiyan': (['Human'],
+                      ['Super saiyan']),
+     'pixar': (['Human'],
+               ['3D render in the style of Pixar']),
+     'neanderthal': (['Human'],
+                     ['Neanderthal']),
+     'zombie': (['Human'],
+                ['Zombie']),
+     'jocker': (['Human'],
+                ['The Joker']),
+
+     # Dog face
+     'dog_nicolas': (['Dog'],
+                     ['Nicolas Cage']),
+     'dog_yorkshire': (['Dog'],
+                       ['Yorkshire Terrier']),
+     'dog_smiling': (['Dog'],
+                     ['Smiling Dog']),
+     'dog_zombie': (['Dog'],
+                    ['Zombie']),
+     'dog_super_saiyan': (['Dog'],
+                          ['Super saiyan']),
+     'dog_venom': (['Dog'],
+                   ['Venom']),
+     'dog_bear': (['Dog'],
+                  ['Bear']),
+     'dog_fox': (['Dog'],
+                 ['Fox']),
+     'dog_wolf': (['Dog'],
+                  ['Wolf']),
+     'dog_hamster': (['Dog'],
+                     ['Hamster']),
+
+     # Church
+     'church_snow': (['Church'],
+                     ['Snow Covered Church']),
+     'church_night': (['Church'],
+                      ['Church at night']),
+     'church_red_brick': (['Church'],
+                          ['Red brick wall Church']),
+     'church_golden': (['Church'],
+                       ['Golden Church']),
+     'church_wooden_house': (['Church'],
+                             ['Wooden House']),
+     'church_gothic': (['Church'],
+                       ['Gothic Church']),
+     'church_ancient_tower': (['Church'],
+                              ['Ancient traditional Asian tower']),
+     'church_temple': (['Church'],
+                       ['Temple']),
+     'church_factory': (['church'],
+                        ['factory with chimneys']),
+     'church_department_store': (['church'],
+                                 ['department store']),
+
+     # Bedroom
+     'bedroom_blue': (['Bedroom'],
+                      ['Blue tone Bedroom']),
+     'bedroom_green': (['Bedroom'],
+                       ['Green tone Bedroom']),
+     'bedroom_golden': (['Bedroom'],
+                        ['Golden Bedroom']),
+     'bedroom_princess': (['Bedroom'],
+                          ['Princess Bedroom']),
+     'bedroom_palace': (['Bedroom'],
+                        ['Palace Bedroom']),
+     'bedroom_wooden': (['Bedroom'],
+                        ['Wooden Bedroom']),
+ }
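
For context, a minimal sketch of how these entries are typically consumed; the import path follows this file's location, while the downstream use (building a source/target prompt pair for a CLIP-guided edit) is an assumption rather than code shown in this upload. The key 'smiling' is just an example.

from utils.text_dic import SRC_TRG_TXT_DIC

# Each entry maps an edit name to (source prompts, target prompts).
src_txts, trg_txts = SRC_TRG_TXT_DIC['smiling']
print(src_txts[0], '->', trg_txts[0])  # face -> smiling face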
utils/text_templates.py ADDED
@@ -0,0 +1,129 @@
+ imagenet_templates = [
+     'a bad photo of a {}.',
+     'a sculpture of a {}.',
+     'a photo of the hard to see {}.',
+     'a low resolution photo of the {}.',
+     'a rendering of a {}.',
+     'graffiti of a {}.',
+     'a bad photo of the {}.',
+     'a cropped photo of the {}.',
+     'a tattoo of a {}.',
+     'the embroidered {}.',
+     'a photo of a hard to see {}.',
+     'a bright photo of a {}.',
+     'a photo of a clean {}.',
+     'a photo of a dirty {}.',
+     'a dark photo of the {}.',
+     'a drawing of a {}.',
+     'a photo of my {}.',
+     'the plastic {}.',
+     'a photo of the cool {}.',
+     'a close-up photo of a {}.',
+     'a black and white photo of the {}.',
+     'a painting of the {}.',
+     'a painting of a {}.',
+     'a pixelated photo of the {}.',
+     'a sculpture of the {}.',
+     'a bright photo of the {}.',
+     'a cropped photo of a {}.',
+     'a plastic {}.',
+     'a photo of the dirty {}.',
+     'a jpeg corrupted photo of a {}.',
+     'a blurry photo of the {}.',
+     'a photo of the {}.',
+     'a good photo of the {}.',
+     'a rendering of the {}.',
+     'a {} in a video game.',
+     'a photo of one {}.',
+     'a doodle of a {}.',
+     'a close-up photo of the {}.',
+     'a photo of a {}.',
+     'the origami {}.',
+     'the {} in a video game.',
+     'a sketch of a {}.',
+     'a doodle of the {}.',
+     'a origami {}.',
+     'a low resolution photo of a {}.',
+     'the toy {}.',
+     'a rendition of the {}.',
+     'a photo of the clean {}.',
+     'a photo of a large {}.',
+     'a rendition of a {}.',
+     'a photo of a nice {}.',
+     'a photo of a weird {}.',
+     'a blurry photo of a {}.',
+     'a cartoon {}.',
+     'art of a {}.',
+     'a sketch of the {}.',
+     'a embroidered {}.',
+     'a pixelated photo of a {}.',
+     'itap of the {}.',
+     'a jpeg corrupted photo of the {}.',
+     'a good photo of a {}.',
+     'a plushie {}.',
+     'a photo of the nice {}.',
+     'a photo of the small {}.',
+     'a photo of the weird {}.',
+     'the cartoon {}.',
+     'art of the {}.',
+     'a drawing of the {}.',
+     'a photo of the large {}.',
+     'a black and white photo of a {}.',
+     'the plushie {}.',
+     'a dark photo of a {}.',
+     'itap of a {}.',
+     'graffiti of the {}.',
+     'a toy {}.',
+     'itap of my {}.',
+     'a photo of a cool {}.',
+     'a photo of a small {}.',
+     'a tattoo of the {}.',
+ ]
+
+ part_templates = [
+     'the paw of a {}.',
+     'the nose of a {}.',
+     'the eye of the {}.',
+     'the ears of a {}.',
+     'an eye of a {}.',
+     'the tongue of a {}.',
+     'the fur of the {}.',
+     'colorful {} fur.',
+     'a snout of a {}.',
+     'the teeth of the {}.',
+     'the {}s fangs.',
+     'a claw of the {}.',
+     'the face of the {}',
+     'a neck of a {}',
+     'the head of the {}',
+ ]
+
+ imagenet_templates_small = [
+     'a photo of a {}.',
+     'a rendering of a {}.',
+     'a cropped photo of the {}.',
+     'the photo of a {}.',
+     'a photo of a clean {}.',
+     'a photo of a dirty {}.',
+     'a dark photo of the {}.',
+     'a photo of my {}.',
+     'a photo of the cool {}.',
+     'a close-up photo of a {}.',
+     'a bright photo of the {}.',
+     'a cropped photo of a {}.',
+     'a photo of the {}.',
+     'a good photo of the {}.',
+     'a photo of one {}.',
+     'a close-up photo of the {}.',
+     'a rendition of the {}.',
+     'a photo of the clean {}.',
+     'a rendition of a {}.',
+     'a photo of a nice {}.',
+     'a good photo of a {}.',
+     'a photo of the nice {}.',
+     'a photo of the small {}.',
+     'a photo of the weird {}.',
+     'a photo of the large {}.',
+     'a photo of a cool {}.',
+     'a photo of a small {}.',
+ ]
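
A minimal sketch of the usual way such template lists are consumed: average CLIP text embeddings of one class name over every template phrasing. The openai/CLIP package calls below (clip.load, clip.tokenize, encode_text) are real, but the model name, device choice, and the helper name template_embedding are placeholder assumptions; this is not code shipped in this upload.

import clip
import torch

from utils.text_templates import imagenet_templates_small

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)  # placeholder backbone choice

def template_embedding(class_name, templates=imagenet_templates_small):
    # Fill every template with the class name and average the normalized text features.
    prompts = [t.format(class_name) for t in templates]
    tokens = clip.tokenize(prompts).to(device)
    with torch.no_grad():
        feats = model.encode_text(tokens)
    feats = feats / feats.norm(dim=-1, keepdim=True)
    return feats.mean(dim=0)

text_feat = template_embedding("smiling face")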