In [2]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from artifact_classification.utils import ConfigLoader

In [3]:
############################## Load dataset ##############################
# Label columns and candidate split sizes examined further down the notebook.
label_cols = ["Object type", "Culture", "Materials", "Production place"]
split_sizes = [0.1, 0.12, 0.13, 0.15, 0.2]

# ConfigLoader receives the config name plus two YAML paths
# (presumably run-specific configs and defaults -- confirm in
# artifact_classification.utils).
config = "testing"
args = ConfigLoader(
    config,
    "../configs/train_bm_configs.yaml",
    "../configs/train_bm_default.yaml",
)

# Load the "train" split of the configured dataset as a pandas DataFrame.
# Nothing is filtered or encoded in this cell; rows are only loaded.
ds_lim = (
    load_dataset(args.dataset)["train"]
    .to_pandas()
)

Updating with:
{'config': 'testing', 'fast_dev_run': True, 'dataset': 'james-burton/BritishMuseum', 'wandb_proj_name': 'British Museum', 'model_base': 'google/efficientnet-b3', 'problem_type': 'image', 'lower_lim': 5, 'label_col': 'Object type'}


{'config': 'testing', 'fast_dev_run': True, 'do_train': True, 'do_predict': True, 'batch_size': 128, 'output_root': 'models/', 'num_epochs': 100, 'early_stopping_patience': 5, 'grad_accumulation_steps': 1, 'seed': 42, 'logging_steps': 10, 'lr_scheduler': 'linear', 'warmup_ratio': 0, 'weight_decay': 0, 'device': 'cuda', 'num_workers': 1, 'resume_from_checkpoint': False, 'predict_batch_size': 16, 'save_total_limit': 1, 'lr': 5e-05, 'pytorch2_0': True, 'max_length': 512, 'text_column': 'Description', 'fp16': True, 'testset_size': 0.1, 'dataset': 'james-burton/BritishMuseum', 'wandb_proj_name': 'British Museum', 'model_base': 'google/efficientnet-b3', 'problem_type': 'image', 'lower_lim': 5, 'label_col': 'Object type'}



Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [8]:
def test_split_size(split_size, label_col, ds_lim, lower_lim=None):
    """Check whether a stratified train/val/test split succeeds for one label column.

    Rows with a missing label are dropped, then only rows whose label occurs
    more than ``lower_lim`` times are kept. The remaining rows are split with
    ``train_test_split`` into train and a held-out part of relative size
    ``2 * split_size``, and the held-out part is split in half again, both
    times stratified on ``label_col``. The outcome is printed.

    Args:
        split_size: Fraction used for each of the two held-out halves; the
            first split holds out ``2 * split_size``.
        label_col: Name of the column in ``ds_lim`` to stratify on.
        ds_lim: pandas DataFrame containing ``label_col``. It is not modified.
        lower_lim: Labels must occur strictly more than this many times to be
            kept. Defaults to ``args.lower_lim`` from the notebook's loaded
            config when omitted.

    Returns:
        True if both stratified splits succeeded, False if either raised
        ``ValueError``.
    """
    if lower_lim is None:
        # Fall back to the notebook-level config, as the original code did.
        lower_lim = args.lower_lim

    try:
        # dropna returns a new frame; the result must be kept for it to have any effect.
        labelled = ds_lim.dropna(subset=[label_col])
        label_counts = labelled[label_col].value_counts()
        frequent_labels = label_counts[label_counts > lower_lim].index
        labelled = labelled[labelled[label_col].isin(frequent_labels)]

        # First split: train vs. a combined validation+test part.
        _, val_test = train_test_split(
            labelled,
            stratify=labelled[label_col],
            test_size=2 * split_size,
            random_state=42,
        )
        # Second split: halve the held-out part; only success or failure matters here.
        train_test_split(
            val_test, stratify=val_test[label_col], test_size=0.5, random_state=42
        )
    except ValueError:
        print(f"Split size {split_size} failed")
        return False

    print(f"Label col {label_col} Split size {split_size} passed")
    return True


# Try every (label column, split size) combination on the loaded frame.
for label_col in label_cols:
    for split_size in split_sizes:
        test_split_size(split_size=split_size, label_col=label_col, ds_lim=ds_lim)

Split size 0.1 failed
Split size 0.12 failed
Label col Object type Split size 0.13 passed
Label col Object type Split size 0.15 passed
Label col Object type Split size 0.2 passed
Split size 0.1 failed
Split size 0.12 failed
Label col Culture Split size 0.13 passed
Label col Culture Split size 0.15 passed
Label col Culture Split size 0.2 passed
Split size 0.1 failed
Split size 0.12 failed
Label col Materials Split size 0.13 passed
Label col Materials Split size 0.15 passed
Label col Materials Split size 0.2 passed
Split size 0.1 failed
Split size 0.12 failed
Label col Production place Split size 0.13 passed
Label col Production place Split size 0.15 passed
Label col Production place Split size 0.2 passed


In [1]:
import yaml

# Parse every YAML document in the training config file into a list
# (one entry per document). The encoding is pinned to UTF-8 so that reading
# the file does not depend on the platform's default locale encoding.
with open("../configs/train_configs.yaml", "r", encoding="utf-8") as file:
    configs = list(yaml.safe_load_all(file))

In [2]:
# Space-separated names of the configs whose name contains "bm" but not "num".
" ".join(
    cfg["config"]
    for cfg in configs
    if "bm" in cfg["config"] and "num" not in cfg["config"]
)

'om3-white_material_bm-pretrn om3-white_name_bm-pretrn om3-3Dwhite_material_bm-pretrn om3-3Dwhite_name_bm-pretrn om3-3Dwhite-1frame_material_bm-pretrn om3-3Dwhite-1frame_name_bm-pretrn om4-white_material_bm-pretrn om4-white_name_bm-pretrn om4-3Dwhite_material_bm-pretrn om4-3Dwhite_name_bm-pretrn om4-3Dwhite-1frame_material_bm-pretrn om4-3Dwhite-1frame_name_bm-pretrn om5-white_material_bm-pretrn om5-white_name_bm-pretrn om5-3Dwhite_material_bm-pretrn om5-3Dwhite_name_bm-pretrn om5-3Dwhite-1frame_material_bm-pretrn om5-3Dwhite-1frame_name_bm-pretrn om6-white_material_bm-pretrn om6-white_name_bm-pretrn om6-3Dwhite_material_bm-pretrn om6-3Dwhite_name_bm-pretrn om6-3Dwhite-1frame_material_bm-pretrn om6-3Dwhite-1frame_name_bm-pretrn'

In [18]:
# Display the full list of loaded config documents.
configs

[{'config': 'testing',
  'fast_dev_run': True,
  'dataset': 'james-burton/BritishMuseum',
  'wandb_proj_name': 'British Museum',
  'model_base': 'google/efficientnet-b3',
  'problem_type': 'image',
  'lower_lim': 5,
  'label_col': 'Object type'},
 {'config': 'bm3_type',
  'dataset': 'james-burton/BritishMuseum',
  'wandb_proj_name': 'British Museum',
  'model_base': 'google/efficientnet-b3',
  'problem_type': 'image',
  'lower_lim': 3,
  'label_col': 'Object type',
  'testset_size': 0.205},
 {'config': 'bm3_material',
  'dataset': 'james-burton/BritishMuseum',
  'wandb_proj_name': 'British Museum',
  'model_base': 'google/efficientnet-b3',
  'problem_type': 'image',
  'lower_lim': 3,
  'label_col': 'Materials',
  'testset_size': 0.205},
 {'config': 'bm3_culture',
  'dataset': 'james-burton/BritishMuseum',
  'wandb_proj_name': 'British Museum',
  'model_base': 'google/efficientnet-b3',
  'problem_type': 'image',
  'lower_lim': 3,
  'label_col': 'Culture',
  'testset_size': 0.205},
 {'co