Spaces:

K00B404
/

pix2pix_flux_train

Running

App Files Files Community

K00B404 committed on Oct 23, 2024

Commit

82ee3f8

verified ·

1 Parent(s): dadfc60

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -18

app.py CHANGED Viewed

@@ -87,12 +87,65 @@ class Pix2PixDataset(torch.utils.data.Dataset):
         return original, target, original_tokens, enhanced_tokens
 class UNetWrapper:
-    def __init__(self, unet_model, repo_id):
         self.model = unet_model
         self.repo_id = repo_id
-        self.token = os.getenv('NEW_TOKEN')  # Make sure this environment variable is set
-        self.api = HfApi(token=os.getenv('NEW_TOKEN'))
     def push_to_hub(self):
         try:
@@ -103,7 +156,11 @@ class UNetWrapper:
                     'big': isinstance(self.model, big_UNet),
                     'img_size': 1024 if isinstance(self.model, big_UNet) else 256
                 },
-                'model_architecture': str(self.model)
             }
             # Save model locally
@@ -120,14 +177,18 @@ class UNetWrapper:
             except Exception as e:
                 print(f"Repository creation note: {e}")
-            # Upload the model file
-            self.api.upload_file(
-                path_or_fileobj=pth_name,
-                path_in_repo=pth_name,
-                repo_id=self.repo_id,
-                token=self.token,
-                repo_type="model"
-            )
             # Create and upload model card
             model_card = f"""---
@@ -222,14 +283,83 @@ def run_inference(image):
     # Convert output to image
     output = output.cpu().squeeze(0).permute(1, 2, 0).numpy()
     output = ((output - output.min()) / (output.max() - output.min()) * 255).astype(np.uint8)
-    rp(output)
     return output
-def to_hub(model):
-    wrapper = UNetWrapper(model, model_repo_id)
     wrapper.push_to_hub()
-def train_model(epochs):
     """Training function"""
     global global_model
@@ -282,7 +412,7 @@ def train_model(epochs):
                 output_text.append(status)
         # Push model to Hugging Face Hub at the end of each epoch
-        to_hub(model)
     global_model = model  # Update the global model after training
     return model, "\n".join(output_text)
@@ -295,7 +425,11 @@ def gradio_train(epochs):
 def gradio_inference(input_image):
     """Gradio inference interface function"""
-    return input_image, run_inference(input_image)
 # Create Gradio interface with tabs
 with gr.Blocks() as app:

         return original, target, original_tokens, enhanced_tokens
+class UNetWrapper:
+    """Helper that uploads a saved checkpoint file to a Hugging Face model repo.
+
+    NOTE(review): this class sets no attributes itself; `api`, `repo_id` and
+    `token` must already exist on the instance. Another `class UNetWrapper`
+    is declared immediately after this one - confirm which definition is
+    meant to be live.
+    """
+    def push_to_hub(self, pth_name):
+        """Upload the file at `pth_name` to `self.repo_id` under the same name.
+
+        Any exception raised while preparing or performing the upload is
+        caught and printed; nothing is re-raised and nothing is returned.
+        """
+        try:
+            # Attribute reads stay inside the try so a missing attribute is
+            # reported through the same printed error path as a failed upload.
+            upload_args = {
+                'path_or_fileobj': pth_name,
+                'path_in_repo': pth_name,
+                'repo_id': self.repo_id,
+                'token': self.token,
+                'repo_type': "model",
+            }
+            self.api.upload_file(**upload_args)
+            print(f"Model checkpoint successfully uploaded to {self.repo_id}")
+        except Exception as e:
+            print(f"Error uploading model: {e}")
 class UNetWrapper:
+    def __init__(self, unet_model, repo_id, epoch, loss, optimizer=None, scheduler=None):
+        """Bundle a model with its training state and Hub credentials.
+
+        Args:
+            unet_model: model to checkpoint and upload.
+            repo_id: Hub repository id used as the upload target.
+            epoch: epoch counter recorded alongside checkpoints.
+            loss: loss value recorded alongside checkpoints.
+            optimizer: optimizer whose state is written by `save_checkpoint`
+                and restored by `load_checkpoint`. Optional so that callers
+                which only push the model can omit it; the two checkpoint
+                methods still require it to be set.
+            scheduler: optional LR scheduler whose state is checkpointed.
+        """
+        self.loss = loss
+        self.epoch = epoch
         self.model = unet_model
+        self.optimizer = optimizer
+        self.scheduler = scheduler
         self.repo_id = repo_id
+        # The token is read once and the same value is handed to HfApi.
+        self.token = os.getenv('NEW_TOKEN')  # Ensure the token is set in the environment
+        self.api = HfApi(token=self.token)
+    def save_checkpoint(self, save_path):
+        """Write model, optimizer and scheduler state plus config, epoch and loss to `save_path`."""
+        model_state = self.model.state_dict()
+        optimizer_state = self.optimizer.state_dict()
+        # A wrapper without a scheduler stores None in the scheduler slot.
+        scheduler_state = self.scheduler.state_dict() if self.scheduler else None
+        # Evaluate the architecture check once and reuse it for both config fields.
+        is_big = isinstance(self.model, big_UNet)
+        checkpoint = {
+            'model_state_dict': model_state,
+            'optimizer_state_dict': optimizer_state,
+            'scheduler_state_dict': scheduler_state,
+            'model_config': {'big': is_big, 'img_size': 1024 if is_big else 256},
+            'epoch': self.epoch,
+            'loss': self.loss,
+        }
+        torch.save(checkpoint, save_path)
+        print(f"Checkpoint saved at epoch {self.epoch}, loss: {self.loss}")
+    def load_checkpoint(self, checkpoint_path):
+        """Restore model, optimizer, optional scheduler state, epoch and loss from a checkpoint file."""
+        state = torch.load(checkpoint_path, map_location=device)
+        self.model.load_state_dict(state['model_state_dict'])
+        self.optimizer.load_state_dict(state['optimizer_state_dict'])
+        # Scheduler state is restored only when a scheduler is attached and
+        # the file holds a non-empty entry for it.
+        if self.scheduler:
+            saved_scheduler_state = state['scheduler_state_dict']
+            if saved_scheduler_state:
+                self.scheduler.load_state_dict(saved_scheduler_state)
+        self.epoch = state['epoch']
+        self.loss = state['loss']
+        print(f"Checkpoint loaded: epoch {self.epoch}, loss: {self.loss}")
     def push_to_hub(self):
         try:
                     'big': isinstance(self.model, big_UNet),
                     'img_size': 1024 if isinstance(self.model, big_UNet) else 256
                 },
+                'model_architecture': str(self.model),
+                'model_state':{
+                    'epoch': self.epoch,
+                    'loss': self.loss
+                }
             }
             # Save model locally
             except Exception as e:
                 print(f"Repository creation note: {e}")
+            """Push model checkpoint and metadata to the Hugging Face Hub."""
+            try:
+                self.api.upload_file(
+                    path_or_fileobj=pth_name,
+                    path_in_repo=pth_name,
+                    repo_id=self.repo_id,
+                    token=self.token,
+                    repo_type="model"
+                )
+                print(f"Model checkpoint successfully uploaded to {self.repo_id}")
+            except Exception as e:
+                print(f"Error uploading model: {e}")
             # Create and upload model card
             model_card = f"""---
     # Convert output to image
     output = output.cpu().squeeze(0).permute(1, 2, 0).numpy()
     output = ((output - output.min()) / (output.max() - output.min()) * 255).astype(np.uint8)
+    rp(output[0])
     return output
+def to_hub(model, epoch, loss):
+    """Wrap `model` with its epoch and loss, then push it to `model_repo_id`.
+
+    Args:
+        model: model to upload.
+        epoch: epoch number recorded with the upload.
+        loss: loss value recorded with the upload.
+    """
+    # No optimizer exists on this path. It is passed explicitly as None so
+    # construction does not depend on the constructor supplying a default.
+    wrapper = UNetWrapper(model, model_repo_id, epoch, loss, optimizer=None)
     wrapper.push_to_hub()
+def train_model(epochs, save_interval=1):
+    """Train the global model with L1 loss, checkpointing and uploading periodically.
+
+    Args:
+        epochs: number of passes over the dataset.
+        save_interval: save and upload a checkpoint every this many epochs.
+            A falsy value (0 or None) disables checkpointing.
+
+    Returns:
+        Tuple of (trained model, newline-joined status log).
+    """
+    global global_model
+    # Load combined data CSV
+    data_path = 'combined_data.csv'
+    combined_data = pd.read_csv(data_path)
+    # Define the transformation
+    transform = transforms.Compose([
+        transforms.Resize((IMG_SIZE, IMG_SIZE)),
+        transforms.ToTensor(),
+    ])
+    # Initialize dataset and dataloader
+    dataset = Pix2PixDataset(combined_data, transform, clip_tokenizer)
+    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
+    num_batches = len(dataloader)
+    model = global_model
+    criterion = nn.L1Loss()
+    optimizer = optim.Adam(model.parameters(), lr=LR)
+    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # Example scheduler
+    wrapper = UNetWrapper(model, model_repo_id, epoch=0, loss=0.0, optimizer=optimizer, scheduler=scheduler)
+    output_text = []
+    for epoch in range(epochs):
+        model.train()
+        running_loss = 0.0
+        # The dataset also yields prompt tokens. The loss does not use them,
+        # so they are no longer converted and moved to the device each step.
+        for i, (original, target, _original_tokens, _enhanced_tokens) in enumerate(dataloader):
+            # Move data to device
+            original, target = original.to(device), target.to(device)
+            optimizer.zero_grad()
+            # Forward pass on the target image; the loss compares the output
+            # against the original image.
+            output = model(target)
+            total_loss = criterion(output, original)
+            total_loss.backward()
+            optimizer.step()
+            step_loss = total_loss.item()
+            running_loss += step_loss
+            if i % 10 == 0:
+                status = f"Epoch [{epoch}/{epochs}], Step [{i}/{num_batches}], Loss: {step_loss:.8f}"
+                print(status)
+                output_text.append(status)
+        # Update the epoch and loss for checkpoint. An empty dataloader has
+        # no mean loss, so record NaN instead of dividing by zero.
+        wrapper.epoch = epoch + 1
+        wrapper.loss = running_loss / num_batches if num_batches else float('nan')
+        # Save checkpoint at specified intervals (skipped when save_interval is falsy)
+        if save_interval and (epoch + 1) % save_interval == 0:
+            # NOTE(review): `big` is not defined in this function - presumably
+            # a module-level flag; verify.
+            size_prefix = 'big' if big else 'small'
+            checkpoint_path = f'{size_prefix}_checkpoint_epoch_{epoch+1}.pth'
+            wrapper.save_checkpoint(checkpoint_path)
+            # NOTE(review): the later `UNetWrapper` definition declares
+            # `push_to_hub(self)` with no path parameter - confirm which
+            # definition is live before relying on this call.
+            wrapper.push_to_hub(checkpoint_path)
+        scheduler.step()  # Update learning rate scheduler
+    global_model = model  # Update global model after training
+    return model, "\n".join(output_text)
+def train_model_old(epochs):
     """Training function"""
     global global_model
                 output_text.append(status)
         # Push model to Hugging Face Hub at the end of each epoch
+        to_hub(model, epoch, total_loss)
     global_model = model  # Update the global model after training
     return model, "\n".join(output_text)
 def gradio_inference(input_image):
+    """Run inference on `input_image` and return only the processed image."""
+    # The result of `run_inference` is handed back unchanged; `rp` is called
+    # on it first.
+    processed = run_inference(input_image)
+    rp(processed)
+    return processed
 # Create Gradio interface with tabs
 with gr.Blocks() as app: