{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "e:\\plant\\venv\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "from autogluon.tabular import TabularDataset, TabularPredictor\n", "from autogluon.common.utils.utils import setup_outputdir\n", "from autogluon.core.utils.loaders import load_pkl\n", "from autogluon.core.utils.savers import save_pkl\n", "import os.path\n", "import os\n", "import pandas as pd\n", "from PIL import Image\n", "import torch\n", "from transformers import ViTModel, ViTFeatureExtractor\n", "import pickle\n", "\n", "class MultilabelPredictor:\n", " \"\"\" Tabular Predictor for predicting multiple columns in table.\n", " Creates multiple TabularPredictor objects which you can also use individually.\n", " You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`\n", "\n", " Parameters\n", " ----------\n", " labels : List[str]\n", " The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.\n", " path : str, default = None\n", " Path to directory where models and intermediate outputs should be saved.\n", " If unspecified, a time-stamped folder called \"AutogluonModels/ag-[TIMESTAMP]\" will be created in the working directory to store all models.\n", " Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.\n", " Otherwise files from first `fit()` will be overwritten by second `fit()`.\n", " Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.\n", " problem_types : List[str], default = None\n", " The ith element is the `problem_type` for the ith TabularPredictor stored in this object.\n", " eval_metrics : List[str], default = None\n", " The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.\n", " consider_labels_correlation : bool, default = True\n", " Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.\n", " If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).\n", " Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.\n", " kwargs :\n", " Arguments passed into the initialization of each TabularPredictor.\n", "\n", " \"\"\"\n", "\n", " multi_predictor_file = 'multilabel_predictor.pkl'\n", "\n", " def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):\n", " if len(labels) < 2:\n", " raise ValueError(\"MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).\")\n", " if (problem_types is not None) and (len(problem_types) != len(labels)):\n", " raise ValueError(\"If provided, `problem_types` must have same length as `labels`\")\n", " if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):\n", " raise ValueError(\"If provided, `eval_metrics` must have same length as `labels`\")\n", " self.path = setup_outputdir(path, warn_if_exist=False)\n", " self.labels = labels\n", " self.consider_labels_correlation = consider_labels_correlation\n", " self.predictors = {} # key = label, value = TabularPredictor or str path to the TabularPredictor for this label\n", " if eval_metrics is None:\n", " self.eval_metrics = {}\n", " else:\n", " self.eval_metrics = {labels[i] : eval_metrics[i] for i in range(len(labels))}\n", " problem_type = None\n", " eval_metric = None\n", " for i in range(len(labels)):\n", " label = labels[i]\n", " path_i = os.path.join(self.path, \"Predictor_\" + str(label))\n", " if problem_types is not None:\n", " problem_type = problem_types[i]\n", " if eval_metrics is not None:\n", " eval_metric = eval_metrics[i]\n", " self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)\n", "\n", " def fit(self, train_data, tuning_data=None, **kwargs):\n", " \"\"\" Fits a separate TabularPredictor to predict each of the labels.\n", "\n", " Parameters\n", " ----------\n", " train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame\n", " See documentation for `TabularPredictor.fit()`.\n", " kwargs :\n", " Arguments passed into the `fit()` call for each TabularPredictor.\n", " \"\"\"\n", " if isinstance(train_data, str):\n", " train_data = TabularDataset(train_data)\n", " if tuning_data is not None and isinstance(tuning_data, str):\n", " tuning_data = TabularDataset(tuning_data)\n", " train_data_og = train_data.copy()\n", " if tuning_data is not None:\n", " tuning_data_og = tuning_data.copy()\n", " else:\n", " tuning_data_og = None\n", " save_metrics = len(self.eval_metrics) == 0\n", " for i in range(len(self.labels)):\n", " label = self.labels[i]\n", " predictor = self.get_predictor(label)\n", " if not self.consider_labels_correlation:\n", " labels_to_drop = [l for l in self.labels if l != label]\n", " else:\n", " labels_to_drop = [self.labels[j] for j in range(i+1, len(self.labels))]\n", " train_data = train_data_og.drop(labels_to_drop, axis=1)\n", " if tuning_data is not None:\n", " tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)\n", " print(f\"Fitting TabularPredictor for label: {label} ...\")\n", " predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)\n", " self.predictors[label] = predictor.path\n", " if save_metrics:\n", " self.eval_metrics[label] = predictor.eval_metric\n", " self.save()\n", "\n", " def predict(self, data, **kwargs):\n", " \"\"\" Returns DataFrame with label columns containing predictions for each label.\n", "\n", " Parameters\n", " ----------\n", " data_copy : str or autogluon.tabular.TabularDataset or pd.DataFrame\n", " Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.\n", " kwargs :\n", " Arguments passed into the predict() call for each TabularPredictor.\n", " \"\"\"\n", " return self._predict(data, as_proba=False, **kwargs)\n", "\n", " def predict_proba(self, data, **kwargs):\n", " \"\"\" Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.\n", "\n", " Parameters\n", " ----------\n", " data : str or autogluon.tabular.TabularDataset or pd.DataFrame\n", " Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.\n", " kwargs :\n", " Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).\n", " \"\"\"\n", " return self._predict(data, as_proba=True, **kwargs)\n", "\n", " def evaluate(self, data, **kwargs):\n", " \"\"\" Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.\n", "\n", " Parameters\n", " ----------\n", " data : str or autogluon.tabular.TabularDataset or pd.DataFrame\n", " Data to evalate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.\n", " kwargs :\n", " Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).\n", " \"\"\"\n", " data = self._get_data(data)\n", " eval_dict = {}\n", " for label in self.labels:\n", " print(f\"Evaluating TabularPredictor for label: {label} ...\")\n", " predictor = self.get_predictor(label)\n", " eval_dict[label] = predictor.evaluate(data, **kwargs)\n", " if self.consider_labels_correlation:\n", " data[label] = predictor.predict(data, **kwargs)\n", " return eval_dict\n", "\n", " def save(self):\n", " \"\"\" Save MultilabelPredictor to disk. \"\"\"\n", " for label in self.labels:\n", " if not isinstance(self.predictors[label], str):\n", " self.predictors[label] = self.predictors[label].path\n", " save_pkl.save(path=os.path.join(self.path, self.multi_predictor_file), object=self)\n", " print(f\"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')\")\n", "\n", " @classmethod\n", " def load(cls, path):\n", " \"\"\" Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. \"\"\"\n", " path = os.path.expanduser(path)\n", " return load_pkl.load(path=os.path.join(path, cls.multi_predictor_file))\n", "\n", " def get_predictor(self, label):\n", " \"\"\" Returns TabularPredictor which is used to predict this label. \"\"\"\n", " predictor = self.predictors[label]\n", " if isinstance(predictor, str):\n", " return TabularPredictor.load(path=predictor)\n", " return predictor\n", "\n", " def _get_data(self, data):\n", " if isinstance(data, str):\n", " return TabularDataset(data)\n", " return data.copy()\n", "\n", " def _predict(self, data, as_proba=False, **kwargs):\n", " data = self._get_data(data)\n", " if as_proba:\n", " predproba_dict = {}\n", " for label in self.labels:\n", " print(f\"Predicting with TabularPredictor for label: {label} ...\")\n", " predictor = self.get_predictor(label)\n", " if as_proba:\n", " predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)\n", " data[label] = predictor.predict(data, **kwargs)\n", " if not as_proba:\n", " return data[self.labels]\n", " else:\n", " return predproba_dict\n", "\n", "def extract_image_embeddings_batch(image_paths):\n", " \"\"\"Extract embeddings for a batch of images using Vision Transformer.\"\"\"\n", " images = []\n", " \n", " # Load and preprocess all images in the batch\n", " for image_path in image_paths:\n", " image = Image.open(image_path).convert(\"RGB\")\n", " images.append(image)\n", " \n", " # Prepare inputs as a batch\n", " inputs = feature_extractor(images=images, return_tensors=\"pt\", padding=True).to(device)\n", " \n", " # Get embeddings in a single forward pass\n", " with torch.no_grad():\n", " outputs = vit_model(**inputs)\n", " \n", " # Compute mean embeddings for each image in the batch\n", " return outputs.last_hidden_state.mean(dim=1).cpu().numpy()\n", "\n", "def preprocess_images(df, image_dir, image_column='id', batch_size=512):\n", " \"\"\"Generate image embeddings for all rows in a DataFrame in batches.\"\"\"\n", " embeddings = []\n", " n = len(df)\n", " \n", " for i in range(0, n, batch_size):\n", " # Get the current batch of image paths\n", " batch = df.iloc[i:i+batch_size]\n", " image_paths = [os.path.join(image_dir, f\"{int(row[image_column])}.jpeg\") for _, row in batch.iterrows()]\n", " # Extract embeddings for the batch\n", " batch_embeddings = extract_image_embeddings_batch(image_paths)\n", " embeddings.extend(batch_embeddings)\n", " \n", " print(f\"Processed batch {i//batch_size + 1}/{(n + batch_size - 1)//batch_size}\")\n", " # Convert to DataFrame\n", " return pd.DataFrame(embeddings, index=df.index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Extracting image embeddings for training data...\n", "Combining ancillary data and image embeddings...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Verbosity: 2 (Standard Logging)\n", "=================== System Info ===================\n", "AutoGluon Version: 1.1.1\n", "Python Version: 3.10.11\n", "Operating System: Windows\n", "Platform Machine: AMD64\n", "Platform Version: 10.0.22631\n", "CPU Count: 12\n", "Memory Avail: 5.11 GB / 15.79 GB (32.4%)\n", "Disk Space Avail: 79.69 GB / 150.79 GB (52.8%)\n", "===================================================\n", "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n", "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n", "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n", "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n", "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n", "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Training MultilabelPredictor...\n", "Fitting TabularPredictor for label: X4_mean ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 190.45 MB).\n", "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n", "Beginning AutoGluon training ...\n", "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X4_mean\"\n", "Train Data Rows: 43363\n", "Train Data Columns: 932\n", "Label Column: X4_mean\n", "Problem Type: regression\n", "Preprocessing data ...\n", "Using Feature Generators to preprocess the data ...\n", "Fitting AutoMLPipelineFeatureGenerator...\n", "\tAvailable Memory: 5219.75 MB\n", "\tTrain Data (Original) Memory Usage: 181.30 MB (3.5% of available memory)\n", "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n", "\tStage 1 Generators:\n", "\t\tFitting AsTypeFeatureGenerator...\n", "\tStage 2 Generators:\n", "\t\tFitting FillNaFeatureGenerator...\n", "\tStage 3 Generators:\n", "\t\tFitting IdentityFeatureGenerator...\n", "\tStage 4 Generators:\n", "\t\tFitting DropUniqueFeatureGenerator...\n", "\tStage 5 Generators:\n", "\t\tFitting DropDuplicatesFeatureGenerator...\n", "\tTypes of features in original data (raw dtype, special dtypes):\n", "\t\t('float', []) : 810 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\tTypes of features in processed data (raw dtype, special dtypes):\n", "\t\t('float', []) : 810 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\t5.1s = Fit runtime\n", "\t932 features in original data used to generate 932 features in processed data.\n", "\tTrain Data (Processed) Memory Usage: 181.30 MB (3.5% of available memory)\n", "Data preprocessing and feature engineering runtime = 5.57s ...\n", "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n", "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n", "\tTo change this, specify the eval_metric parameter of Predictor()\n", "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n", "User-specified model hyperparameters to be fit:\n", "{\n", "\t'NN_TORCH': {},\n", "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n", "\t'FASTAI': {},\n", "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n", "}\n", "Fitting 9 L1 models ...\n", "Fitting model: KNeighborsUnif ...\n", "\t-0.1421\t = Validation score (-root_mean_squared_error)\n", "\t1.47s\t = Training runtime\n", "\t2.66s\t = Validation runtime\n", "Fitting model: KNeighborsDist ...\n", "\t-0.1426\t = Validation score (-root_mean_squared_error)\n", "\t1.45s\t = Training runtime\n", "\t2.88s\t = Validation runtime\n", "Fitting model: LightGBMXT ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 0.10796\n", "[2000]\tvalid_set's rmse: 0.107227\n", "[3000]\tvalid_set's rmse: 0.106933\n", "[4000]\tvalid_set's rmse: 0.106685\n", "[5000]\tvalid_set's rmse: 0.106466\n", "[6000]\tvalid_set's rmse: 0.106427\n", "[7000]\tvalid_set's rmse: 0.106386\n", "[8000]\tvalid_set's rmse: 0.106361\n", "[9000]\tvalid_set's rmse: 0.106337\n", "[10000]\tvalid_set's rmse: 0.106303\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-0.1063\t = Validation score (-root_mean_squared_error)\n", "\t863.4s\t = Training runtime\n", "\t0.93s\t = Validation runtime\n", "Fitting model: LightGBM ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 0.108342\n", "[2000]\tvalid_set's rmse: 0.107862\n", "[3000]\tvalid_set's rmse: 0.107599\n", "[4000]\tvalid_set's rmse: 0.107513\n", "[5000]\tvalid_set's rmse: 0.107464\n", "[6000]\tvalid_set's rmse: 0.107424\n", "[7000]\tvalid_set's rmse: 0.107404\n", "[8000]\tvalid_set's rmse: 0.107379\n", "[9000]\tvalid_set's rmse: 0.107371\n", "[10000]\tvalid_set's rmse: 0.107365\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-0.1074\t = Validation score (-root_mean_squared_error)\n", "\t1027.06s\t = Training runtime\n", "\t0.83s\t = Validation runtime\n", "Fitting model: RandomForestMSE ...\n", "\t-0.112\t = Validation score (-root_mean_squared_error)\n", "\t3077.41s\t = Training runtime\n", "\t0.22s\t = Validation runtime\n", "Fitting model: ExtraTreesMSE ...\n", "\t-0.1119\t = Validation score (-root_mean_squared_error)\n", "\t1255.77s\t = Training runtime\n", "\t0.24s\t = Validation runtime\n", "Fitting model: NeuralNetFastAI ...\n", "No improvement since epoch 2: early stopping\n", "\t-0.1104\t = Validation score (-root_mean_squared_error)\n", "\t135.6s\t = Training runtime\n", "\t0.28s\t = Validation runtime\n", "Fitting model: NeuralNetTorch ...\n", "\t-0.1095\t = Validation score (-root_mean_squared_error)\n", "\t143.11s\t = Training runtime\n", "\t0.32s\t = Validation runtime\n", "Fitting model: LightGBMLarge ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 0.107068\n", "[2000]\tvalid_set's rmse: 0.10661\n", "[3000]\tvalid_set's rmse: 0.10653\n", "[4000]\tvalid_set's rmse: 0.106503\n", "[5000]\tvalid_set's rmse: 0.106497\n", "[6000]\tvalid_set's rmse: 0.106495\n", "[7000]\tvalid_set's rmse: 0.106495\n", "[8000]\tvalid_set's rmse: 0.106495\n", "[9000]\tvalid_set's rmse: 0.106495\n", "[10000]\tvalid_set's rmse: 0.106495\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-0.1065\t = Validation score (-root_mean_squared_error)\n", "\t2938.26s\t = Training runtime\n", "\t1.38s\t = Validation runtime\n", "Fitting model: WeightedEnsemble_L2 ...\n", "\tEnsemble Weights: {'LightGBMXT': 0.333, 'NeuralNetTorch': 0.238, 'LightGBMLarge': 0.238, 'NeuralNetFastAI': 0.095, 'KNeighborsDist': 0.048, 'LightGBM': 0.048}\n", "\t-0.1047\t = Validation score (-root_mean_squared_error)\n", "\t0.03s\t = Training runtime\n", "\t0.0s\t = Validation runtime\n", "AutoGluon training complete, total runtime = 9466.82s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 378.7 rows/s (2500 batch size)\n", "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X4_mean\")\n", "Verbosity: 2 (Standard Logging)\n", "=================== System Info ===================\n", "AutoGluon Version: 1.1.1\n", "Python Version: 3.10.11\n", "Operating System: Windows\n", "Platform Machine: AMD64\n", "Platform Version: 10.0.22631\n", "CPU Count: 12\n", "Memory Avail: 5.24 GB / 15.79 GB (33.2%)\n", "Disk Space Avail: 77.84 GB / 150.79 GB (51.6%)\n", "===================================================\n", "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n", "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n", "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n", "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n", "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n", "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n", "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 190.8 MB).\n", "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n", "Beginning AutoGluon training ...\n", "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X11_mean\"\n", "Train Data Rows: 43363\n", "Train Data Columns: 933\n", "Label Column: X11_mean\n", "Problem Type: regression\n", "Preprocessing data ...\n", "Using Feature Generators to preprocess the data ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting TabularPredictor for label: X11_mean ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Fitting AutoMLPipelineFeatureGenerator...\n", "\tAvailable Memory: 5340.17 MB\n", "\tTrain Data (Original) Memory Usage: 181.63 MB (3.4% of available memory)\n", "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n", "\tStage 1 Generators:\n", "\t\tFitting AsTypeFeatureGenerator...\n", "\tStage 2 Generators:\n", "\t\tFitting FillNaFeatureGenerator...\n", "\tStage 3 Generators:\n", "\t\tFitting IdentityFeatureGenerator...\n", "\tStage 4 Generators:\n", "\t\tFitting DropUniqueFeatureGenerator...\n", "\tStage 5 Generators:\n", "\t\tFitting DropDuplicatesFeatureGenerator...\n", "\tTypes of features in original data (raw dtype, special dtypes):\n", "\t\t('float', []) : 811 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\tTypes of features in processed data (raw dtype, special dtypes):\n", "\t\t('float', []) : 811 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\t5.5s = Fit runtime\n", "\t933 features in original data used to generate 933 features in processed data.\n", "\tTrain Data (Processed) Memory Usage: 181.63 MB (3.4% of available memory)\n", "Data preprocessing and feature engineering runtime = 5.89s ...\n", "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n", "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n", "\tTo change this, specify the eval_metric parameter of Predictor()\n", "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n", "User-specified model hyperparameters to be fit:\n", "{\n", "\t'NN_TORCH': {},\n", "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n", "\t'FASTAI': {},\n", "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n", "}\n", "Fitting 9 L1 models ...\n", "Fitting model: KNeighborsUnif ...\n", "\t-7.1893\t = Validation score (-root_mean_squared_error)\n", "\t1.57s\t = Training runtime\n", "\t2.38s\t = Validation runtime\n", "Fitting model: KNeighborsDist ...\n", "\t-7.2766\t = Validation score (-root_mean_squared_error)\n", "\t1.58s\t = Training runtime\n", "\t2.41s\t = Validation runtime\n", "Fitting model: LightGBMXT ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 5.34109\n", "[2000]\tvalid_set's rmse: 5.3167\n", "[3000]\tvalid_set's rmse: 5.29916\n", "[4000]\tvalid_set's rmse: 5.29677\n", "[5000]\tvalid_set's rmse: 5.29458\n", "[6000]\tvalid_set's rmse: 5.29489\n", "[7000]\tvalid_set's rmse: 5.29236\n", "[8000]\tvalid_set's rmse: 5.29263\n", "[9000]\tvalid_set's rmse: 5.29315\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-5.2913\t = Validation score (-root_mean_squared_error)\n", "\t831.77s\t = Training runtime\n", "\t0.34s\t = Validation runtime\n", "Fitting model: LightGBM ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 5.29744\n", "[2000]\tvalid_set's rmse: 5.26782\n", "[3000]\tvalid_set's rmse: 5.26091\n", "[4000]\tvalid_set's rmse: 5.25295\n", "[5000]\tvalid_set's rmse: 5.24923\n", "[6000]\tvalid_set's rmse: 5.24709\n", "[7000]\tvalid_set's rmse: 5.24592\n", "[8000]\tvalid_set's rmse: 5.24511\n", "[9000]\tvalid_set's rmse: 5.24443\n", "[10000]\tvalid_set's rmse: 5.24422\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-5.2442\t = Validation score (-root_mean_squared_error)\n", "\t1007.46s\t = Training runtime\n", "\t0.8s\t = Validation runtime\n", "Fitting model: RandomForestMSE ...\n", "\t-5.466\t = Validation score (-root_mean_squared_error)\n", "\t3405.54s\t = Training runtime\n", "\t0.21s\t = Validation runtime\n", "Fitting model: ExtraTreesMSE ...\n", "\t-5.5053\t = Validation score (-root_mean_squared_error)\n", "\t1100.81s\t = Training runtime\n", "\t0.19s\t = Validation runtime\n", "Fitting model: NeuralNetFastAI ...\n", "No improvement since epoch 8: early stopping\n", "\t-5.3575\t = Validation score (-root_mean_squared_error)\n", "\t156.5s\t = Training runtime\n", "\t0.26s\t = Validation runtime\n", "Fitting model: NeuralNetTorch ...\n", "\t-5.3648\t = Validation score (-root_mean_squared_error)\n", "\t123.3s\t = Training runtime\n", "\t0.3s\t = Validation runtime\n", "Fitting model: LightGBMLarge ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 5.22467\n", "[2000]\tvalid_set's rmse: 5.20862\n", "[3000]\tvalid_set's rmse: 5.20477\n", "[4000]\tvalid_set's rmse: 5.20326\n", "[5000]\tvalid_set's rmse: 5.20295\n", "[6000]\tvalid_set's rmse: 5.20281\n", "[7000]\tvalid_set's rmse: 5.20276\n", "[8000]\tvalid_set's rmse: 5.20275\n", "[9000]\tvalid_set's rmse: 5.20275\n", "[10000]\tvalid_set's rmse: 5.20275\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-5.2028\t = Validation score (-root_mean_squared_error)\n", "\t2423.97s\t = Training runtime\n", "\t1.28s\t = Validation runtime\n", "Fitting model: WeightedEnsemble_L2 ...\n", "\tEnsemble Weights: {'LightGBMLarge': 0.417, 'NeuralNetFastAI': 0.375, 'LightGBM': 0.208}\n", "\t-5.0914\t = Validation score (-root_mean_squared_error)\n", "\t0.02s\t = Training runtime\n", "\t0.0s\t = Validation runtime\n", "AutoGluon training complete, total runtime = 9074.56s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1068.5 rows/s (2500 batch size)\n", "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X11_mean\")\n", "Verbosity: 2 (Standard Logging)\n", "=================== System Info ===================\n", "AutoGluon Version: 1.1.1\n", "Python Version: 3.10.11\n", "Operating System: Windows\n", "Platform Machine: AMD64\n", "Platform Version: 10.0.22631\n", "CPU Count: 12\n", "Memory Avail: 7.64 GB / 15.79 GB (48.4%)\n", "Disk Space Avail: 75.99 GB / 150.79 GB (50.4%)\n", "===================================================\n", "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n", "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n", "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n", "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n", "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n", "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n", "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 191.14 MB).\n", "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n", "Beginning AutoGluon training ...\n", "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X18_mean\"\n", "Train Data Rows: 43363\n", "Train Data Columns: 934\n", "Label Column: X18_mean\n", "Problem Type: regression\n", "Preprocessing data ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting TabularPredictor for label: X18_mean ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using Feature Generators to preprocess the data ...\n", "Fitting AutoMLPipelineFeatureGenerator...\n", "\tAvailable Memory: 7901.67 MB\n", "\tTrain Data (Original) Memory Usage: 181.96 MB (2.3% of available memory)\n", "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n", "\tStage 1 Generators:\n", "\t\tFitting AsTypeFeatureGenerator...\n", "\tStage 2 Generators:\n", "\t\tFitting FillNaFeatureGenerator...\n", "\tStage 3 Generators:\n", "\t\tFitting IdentityFeatureGenerator...\n", "\tStage 4 Generators:\n", "\t\tFitting DropUniqueFeatureGenerator...\n", "\tStage 5 Generators:\n", "\t\tFitting DropDuplicatesFeatureGenerator...\n", "\tTypes of features in original data (raw dtype, special dtypes):\n", "\t\t('float', []) : 812 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\tTypes of features in processed data (raw dtype, special dtypes):\n", "\t\t('float', []) : 812 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\t4.8s = Fit runtime\n", "\t934 features in original data used to generate 934 features in processed data.\n", "\tTrain Data (Processed) Memory Usage: 181.96 MB (2.3% of available memory)\n", "Data preprocessing and feature engineering runtime = 5.04s ...\n", "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n", "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n", "\tTo change this, specify the eval_metric parameter of Predictor()\n", "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n", "User-specified model hyperparameters to be fit:\n", "{\n", "\t'NN_TORCH': {},\n", "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n", "\t'FASTAI': {},\n", "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n", "}\n", "Fitting 9 L1 models ...\n", "Fitting model: KNeighborsUnif ...\n", "\t-4.4719\t = Validation score (-root_mean_squared_error)\n", "\t1.33s\t = Training runtime\n", "\t2.34s\t = Validation runtime\n", "Fitting model: KNeighborsDist ...\n", "\t-4.4852\t = Validation score (-root_mean_squared_error)\n", "\t1.35s\t = Training runtime\n", "\t2.87s\t = Validation runtime\n", "Fitting model: LightGBMXT ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 2.7975\n", "[2000]\tvalid_set's rmse: 2.77084\n", "[3000]\tvalid_set's rmse: 2.76197\n", "[4000]\tvalid_set's rmse: 2.76049\n", "[5000]\tvalid_set's rmse: 2.75914\n", "[6000]\tvalid_set's rmse: 2.75773\n", "[7000]\tvalid_set's rmse: 2.75728\n", "[8000]\tvalid_set's rmse: 2.75624\n", "[9000]\tvalid_set's rmse: 2.75584\n", "[10000]\tvalid_set's rmse: 2.75552\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-2.7555\t = Validation score (-root_mean_squared_error)\n", "\t722.76s\t = Training runtime\n", "\t0.62s\t = Validation runtime\n", "Fitting model: LightGBM ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 2.79461\n", "[2000]\tvalid_set's rmse: 2.77581\n", "[3000]\tvalid_set's rmse: 2.76911\n", "[4000]\tvalid_set's rmse: 2.76665\n", "[5000]\tvalid_set's rmse: 2.76656\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-2.7665\t = Validation score (-root_mean_squared_error)\n", "\t455.92s\t = Training runtime\n", "\t0.25s\t = Validation runtime\n", "Fitting model: RandomForestMSE ...\n", "\t-3.0041\t = Validation score (-root_mean_squared_error)\n", "\t5707.16s\t = Training runtime\n", "\t0.29s\t = Validation runtime\n", "Fitting model: ExtraTreesMSE ...\n", "\t-3.0281\t = Validation score (-root_mean_squared_error)\n", "\t1414.74s\t = Training runtime\n", "\t0.24s\t = Validation runtime\n", "Fitting model: NeuralNetFastAI ...\n", "\t-2.7646\t = Validation score (-root_mean_squared_error)\n", "\t158.74s\t = Training runtime\n", "\t0.24s\t = Validation runtime\n", "Fitting model: NeuralNetTorch ...\n", "\t-2.7368\t = Validation score (-root_mean_squared_error)\n", "\t132.61s\t = Training runtime\n", "\t0.27s\t = Validation runtime\n", "Fitting model: LightGBMLarge ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 2.76306\n", "[2000]\tvalid_set's rmse: 2.75877\n", "[3000]\tvalid_set's rmse: 2.75837\n", "[4000]\tvalid_set's rmse: 2.75822\n", "[5000]\tvalid_set's rmse: 2.75819\n", "[6000]\tvalid_set's rmse: 2.75819\n", "[7000]\tvalid_set's rmse: 2.75818\n", "[8000]\tvalid_set's rmse: 2.75818\n", "[9000]\tvalid_set's rmse: 2.75818\n", "[10000]\tvalid_set's rmse: 2.75818\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-2.7582\t = Validation score (-root_mean_squared_error)\n", "\t2648.19s\t = Training runtime\n", "\t1.43s\t = Validation runtime\n", "Fitting model: WeightedEnsemble_L2 ...\n", "\tEnsemble Weights: {'NeuralNetTorch': 0.375, 'NeuralNetFastAI': 0.333, 'LightGBMLarge': 0.167, 'LightGBM': 0.125}\n", "\t-2.6075\t = Validation score (-root_mean_squared_error)\n", "\t0.03s\t = Training runtime\n", "\t0.0s\t = Validation runtime\n", "AutoGluon training complete, total runtime = 11264.22s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1140.4 rows/s (2500 batch size)\n", "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X18_mean\")\n", "Verbosity: 2 (Standard Logging)\n", "=================== System Info ===================\n", "AutoGluon Version: 1.1.1\n", "Python Version: 3.10.11\n", "Operating System: Windows\n", "Platform Machine: AMD64\n", "Platform Version: 10.0.22631\n", "CPU Count: 12\n", "Memory Avail: 7.60 GB / 15.79 GB (48.1%)\n", "Disk Space Avail: 74.16 GB / 150.79 GB (49.2%)\n", "===================================================\n", "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n", "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n", "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n", "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n", "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n", "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n", "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 191.49 MB).\n", "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n", "Beginning AutoGluon training ...\n", "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X26_mean\"\n", "Train Data Rows: 43363\n", "Train Data Columns: 935\n", "Label Column: X26_mean\n", "Problem Type: regression\n", "Preprocessing data ...\n", "Using Feature Generators to preprocess the data ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting TabularPredictor for label: X26_mean ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Fitting AutoMLPipelineFeatureGenerator...\n", "\tAvailable Memory: 7763.00 MB\n", "\tTrain Data (Original) Memory Usage: 182.29 MB (2.3% of available memory)\n", "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n", "\tStage 1 Generators:\n", "\t\tFitting AsTypeFeatureGenerator...\n", "\tStage 2 Generators:\n", "\t\tFitting FillNaFeatureGenerator...\n", "\tStage 3 Generators:\n", "\t\tFitting IdentityFeatureGenerator...\n", "\tStage 4 Generators:\n", "\t\tFitting DropUniqueFeatureGenerator...\n", "\tStage 5 Generators:\n", "\t\tFitting DropDuplicatesFeatureGenerator...\n", "\tTypes of features in original data (raw dtype, special dtypes):\n", "\t\t('float', []) : 813 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\tTypes of features in processed data (raw dtype, special dtypes):\n", "\t\t('float', []) : 813 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\t6.5s = Fit runtime\n", "\t935 features in original data used to generate 935 features in processed data.\n", "\tTrain Data (Processed) Memory Usage: 182.29 MB (2.4% of available memory)\n", "Data preprocessing and feature engineering runtime = 6.81s ...\n", "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n", "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n", "\tTo change this, specify the eval_metric parameter of Predictor()\n", "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n", "User-specified model hyperparameters to be fit:\n", "{\n", "\t'NN_TORCH': {},\n", "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n", "\t'FASTAI': {},\n", "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n", "}\n", "Fitting 9 L1 models ...\n", "Fitting model: KNeighborsUnif ...\n", "\t-75.2345\t = Validation score (-root_mean_squared_error)\n", "\t1.63s\t = Training runtime\n", "\t2.42s\t = Validation runtime\n", "Fitting model: KNeighborsDist ...\n", "\t-77.2557\t = Validation score (-root_mean_squared_error)\n", "\t1.57s\t = Training runtime\n", "\t2.46s\t = Validation runtime\n", "Fitting model: LightGBMXT ...\n", "\t-56.0706\t = Validation score (-root_mean_squared_error)\n", "\t45.17s\t = Training runtime\n", "\t0.06s\t = Validation runtime\n", "Fitting model: LightGBM ...\n", "\t-54.6852\t = Validation score (-root_mean_squared_error)\n", "\t41.69s\t = Training runtime\n", "\t0.04s\t = Validation runtime\n", "Fitting model: RandomForestMSE ...\n", "\t-55.0949\t = Validation score (-root_mean_squared_error)\n", "\t9653.14s\t = Training runtime\n", "\t0.3s\t = Validation runtime\n", "Fitting model: ExtraTreesMSE ...\n", "\t-55.9584\t = Validation score (-root_mean_squared_error)\n", "\t1874.15s\t = Training runtime\n", "\t0.27s\t = Validation runtime\n", "Fitting model: NeuralNetFastAI ...\n", "\t-57.9006\t = Validation score (-root_mean_squared_error)\n", "\t159.0s\t = Training runtime\n", "\t0.22s\t = Validation runtime\n", "Fitting model: NeuralNetTorch ...\n", "\t-59.0582\t = Validation score (-root_mean_squared_error)\n", "\t155.0s\t = Training runtime\n", "\t0.27s\t = Validation runtime\n", "Fitting model: LightGBMLarge ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 53.3837\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-53.3795\t = Validation score (-root_mean_squared_error)\n", "\t442.04s\t = Training runtime\n", "\t0.13s\t = Validation runtime\n", "Fitting model: WeightedEnsemble_L2 ...\n", "\tEnsemble Weights: {'LightGBMLarge': 0.84, 'NeuralNetFastAI': 0.16}\n", "\t-53.1964\t = Validation score (-root_mean_squared_error)\n", "\t0.03s\t = Training runtime\n", "\t0.0s\t = Validation runtime\n", "AutoGluon training complete, total runtime = 12390.51s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 7137.6 rows/s (2500 batch size)\n", "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X26_mean\")\n", "Verbosity: 2 (Standard Logging)\n", "=================== System Info ===================\n", "AutoGluon Version: 1.1.1\n", "Python Version: 3.10.11\n", "Operating System: Windows\n", "Platform Machine: AMD64\n", "Platform Version: 10.0.22631\n", "CPU Count: 12\n", "Memory Avail: 7.35 GB / 15.79 GB (46.5%)\n", "Disk Space Avail: 72.47 GB / 150.79 GB (48.1%)\n", "===================================================\n", "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n", "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n", "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n", "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n", "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n", "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n", "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 191.84 MB).\n", "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n", "Beginning AutoGluon training ...\n", "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X50_mean\"\n", "Train Data Rows: 43363\n", "Train Data Columns: 936\n", "Label Column: X50_mean\n", "Problem Type: regression\n", "Preprocessing data ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting TabularPredictor for label: X50_mean ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using Feature Generators to preprocess the data ...\n", "Fitting AutoMLPipelineFeatureGenerator...\n", "\tAvailable Memory: 7495.31 MB\n", "\tTrain Data (Original) Memory Usage: 182.62 MB (2.4% of available memory)\n", "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n", "\tStage 1 Generators:\n", "\t\tFitting AsTypeFeatureGenerator...\n", "\tStage 2 Generators:\n", "\t\tFitting FillNaFeatureGenerator...\n", "\tStage 3 Generators:\n", "\t\tFitting IdentityFeatureGenerator...\n", "\tStage 4 Generators:\n", "\t\tFitting DropUniqueFeatureGenerator...\n", "\tStage 5 Generators:\n", "\t\tFitting DropDuplicatesFeatureGenerator...\n", "\tTypes of features in original data (raw dtype, special dtypes):\n", "\t\t('float', []) : 814 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\tTypes of features in processed data (raw dtype, special dtypes):\n", "\t\t('float', []) : 814 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\t6.4s = Fit runtime\n", "\t936 features in original data used to generate 936 features in processed data.\n", "\tTrain Data (Processed) Memory Usage: 182.62 MB (2.4% of available memory)\n", "Data preprocessing and feature engineering runtime = 6.79s ...\n", "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n", "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n", "\tTo change this, specify the eval_metric parameter of Predictor()\n", "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n", "User-specified model hyperparameters to be fit:\n", "{\n", "\t'NN_TORCH': {},\n", "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n", "\t'FASTAI': {},\n", "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n", "}\n", "Fitting 9 L1 models ...\n", "Fitting model: KNeighborsUnif ...\n", "\t-0.6334\t = Validation score (-root_mean_squared_error)\n", "\t1.99s\t = Training runtime\n", "\t2.73s\t = Validation runtime\n", "Fitting model: KNeighborsDist ...\n", "\t-0.6393\t = Validation score (-root_mean_squared_error)\n", "\t1.95s\t = Training runtime\n", "\t2.72s\t = Validation runtime\n", "Fitting model: LightGBMXT ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 0.361925\n", "[2000]\tvalid_set's rmse: 0.357162\n", "[3000]\tvalid_set's rmse: 0.355106\n", "[4000]\tvalid_set's rmse: 0.353916\n", "[5000]\tvalid_set's rmse: 0.353093\n", "[6000]\tvalid_set's rmse: 0.352683\n", "[7000]\tvalid_set's rmse: 0.352526\n", "[8000]\tvalid_set's rmse: 0.352398\n", "[9000]\tvalid_set's rmse: 0.352323\n", "[10000]\tvalid_set's rmse: 0.352234\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-0.3522\t = Validation score (-root_mean_squared_error)\n", "\t744.88s\t = Training runtime\n", "\t0.8s\t = Validation runtime\n", "Fitting model: LightGBM ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 0.352549\n", "[2000]\tvalid_set's rmse: 0.349969\n", "[3000]\tvalid_set's rmse: 0.348952\n", "[4000]\tvalid_set's rmse: 0.348591\n", "[5000]\tvalid_set's rmse: 0.348339\n", "[6000]\tvalid_set's rmse: 0.348147\n", "[7000]\tvalid_set's rmse: 0.348034\n", "[8000]\tvalid_set's rmse: 0.347988\n", "[9000]\tvalid_set's rmse: 0.347937\n", "[10000]\tvalid_set's rmse: 0.347919\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-0.3479\t = Validation score (-root_mean_squared_error)\n", "\t921.95s\t = Training runtime\n", "\t0.8s\t = Validation runtime\n", "Fitting model: RandomForestMSE ...\n", "\t-0.344\t = Validation score (-root_mean_squared_error)\n", "\t3068.82s\t = Training runtime\n", "\t0.21s\t = Validation runtime\n", "Fitting model: ExtraTreesMSE ...\n", "\t-0.3735\t = Validation score (-root_mean_squared_error)\n", "\t1075.89s\t = Training runtime\n", "\t0.21s\t = Validation runtime\n", "Fitting model: NeuralNetFastAI ...\n", "\t-0.397\t = Validation score (-root_mean_squared_error)\n", "\t161.54s\t = Training runtime\n", "\t0.25s\t = Validation runtime\n", "Fitting model: NeuralNetTorch ...\n", "\t-0.3914\t = Validation score (-root_mean_squared_error)\n", "\t251.87s\t = Training runtime\n", "\t0.53s\t = Validation runtime\n", "Fitting model: LightGBMLarge ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 0.330805\n", "[2000]\tvalid_set's rmse: 0.329588\n", "[3000]\tvalid_set's rmse: 0.329333\n", "[4000]\tvalid_set's rmse: 0.329259\n", "[5000]\tvalid_set's rmse: 0.329238\n", "[6000]\tvalid_set's rmse: 0.329229\n", "[7000]\tvalid_set's rmse: 0.329227\n", "[8000]\tvalid_set's rmse: 0.329226\n", "[9000]\tvalid_set's rmse: 0.329226\n", "[10000]\tvalid_set's rmse: 0.329226\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-0.3292\t = Validation score (-root_mean_squared_error)\n", "\t2505.43s\t = Training runtime\n", "\t1.29s\t = Validation runtime\n", "Fitting model: WeightedEnsemble_L2 ...\n", "\tEnsemble Weights: {'LightGBMLarge': 0.857, 'NeuralNetFastAI': 0.095, 'RandomForestMSE': 0.048}\n", "\t-0.3284\t = Validation score (-root_mean_squared_error)\n", "\t0.02s\t = Training runtime\n", "\t0.0s\t = Validation runtime\n", "AutoGluon training complete, total runtime = 8758.55s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1436.0 rows/s (2500 batch size)\n", "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X50_mean\")\n", "Verbosity: 2 (Standard Logging)\n", "=================== System Info ===================\n", "AutoGluon Version: 1.1.1\n", "Python Version: 3.10.11\n", "Operating System: Windows\n", "Platform Machine: AMD64\n", "Platform Version: 10.0.22631\n", "CPU Count: 12\n", "Memory Avail: 6.87 GB / 15.79 GB (43.5%)\n", "Disk Space Avail: 70.62 GB / 150.79 GB (46.8%)\n", "===================================================\n", "No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.\n", "\tRecommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):\n", "\tpresets='best_quality' : Maximize accuracy. Default time_limit=3600.\n", "\tpresets='high_quality' : Strong accuracy with fast inference speed. Default time_limit=3600.\n", "\tpresets='good_quality' : Good accuracy with very fast inference speed. Default time_limit=3600.\n", "\tpresets='medium_quality' : Fast training time, ideal for initial prototyping.\n", "Warning: Training may take a very long time because `time_limit` was not specified and `train_data` is large (43363 samples, 192.18 MB).\n", "\tConsider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.\n", "Beginning AutoGluon training ...\n", "AutoGluon will save models to \"multilabel_predictor_source\\Predictor_X3112_mean\"\n", "Train Data Rows: 43363\n", "Train Data Columns: 937\n", "Label Column: X3112_mean\n", "Problem Type: regression\n", "Preprocessing data ...\n", "Using Feature Generators to preprocess the data ...\n", "Fitting AutoMLPipelineFeatureGenerator...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Fitting TabularPredictor for label: X3112_mean ...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\tAvailable Memory: 7019.43 MB\n", "\tTrain Data (Original) Memory Usage: 182.95 MB (2.6% of available memory)\n", "\tInferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.\n", "\tStage 1 Generators:\n", "\t\tFitting AsTypeFeatureGenerator...\n", "\tStage 2 Generators:\n", "\t\tFitting FillNaFeatureGenerator...\n", "\tStage 3 Generators:\n", "\t\tFitting IdentityFeatureGenerator...\n", "\tStage 4 Generators:\n", "\t\tFitting DropUniqueFeatureGenerator...\n", "\tStage 5 Generators:\n", "\t\tFitting DropDuplicatesFeatureGenerator...\n", "\tTypes of features in original data (raw dtype, special dtypes):\n", "\t\t('float', []) : 815 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\tTypes of features in processed data (raw dtype, special dtypes):\n", "\t\t('float', []) : 815 | ['WORLDCLIM_BIO1_annual_mean_temperature', 'WORLDCLIM_BIO12_annual_precipitation', 'WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month', 'WORLDCLIM_BIO15_precipitation_seasonality', 'WORLDCLIM_BIO4_temperature_seasonality', ...]\n", "\t\t('int', []) : 122 | ['id', 'SOIL_bdod_0.5cm_mean_0.01_deg', 'SOIL_bdod_100.200cm_mean_0.01_deg', 'SOIL_bdod_15.30cm_mean_0.01_deg', 'SOIL_bdod_30.60cm_mean_0.01_deg', ...]\n", "\t5.0s = Fit runtime\n", "\t937 features in original data used to generate 937 features in processed data.\n", "\tTrain Data (Processed) Memory Usage: 182.95 MB (2.6% of available memory)\n", "Data preprocessing and feature engineering runtime = 5.29s ...\n", "AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'\n", "\tThis metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.\n", "\tTo change this, specify the eval_metric parameter of Predictor()\n", "Automatically generating train/validation split with holdout_frac=0.05765283767267025, Train Rows: 40863, Val Rows: 2500\n", "User-specified model hyperparameters to be fit:\n", "{\n", "\t'NN_TORCH': {},\n", "\t'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],\n", "\t'FASTAI': {},\n", "\t'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],\n", "\t'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],\n", "}\n", "Fitting 9 L1 models ...\n", "Fitting model: KNeighborsUnif ...\n", "\t-2270.871\t = Validation score (-root_mean_squared_error)\n", "\t1.37s\t = Training runtime\n", "\t2.24s\t = Validation runtime\n", "Fitting model: KNeighborsDist ...\n", "\t-2230.0395\t = Validation score (-root_mean_squared_error)\n", "\t1.34s\t = Training runtime\n", "\t2.34s\t = Validation runtime\n", "Fitting model: LightGBMXT ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 1470.67\n", "[2000]\tvalid_set's rmse: 1460.77\n", "[3000]\tvalid_set's rmse: 1453.2\n", "[4000]\tvalid_set's rmse: 1449.16\n", "[5000]\tvalid_set's rmse: 1448\n", "[6000]\tvalid_set's rmse: 1447.65\n", "[7000]\tvalid_set's rmse: 1447.57\n", "[8000]\tvalid_set's rmse: 1446.92\n", "[9000]\tvalid_set's rmse: 1446.78\n", "[10000]\tvalid_set's rmse: 1446.71\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-1446.6537\t = Validation score (-root_mean_squared_error)\n", "\t680.41s\t = Training runtime\n", "\t0.54s\t = Validation runtime\n", "Fitting model: LightGBM ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 1401.6\n", "[2000]\tvalid_set's rmse: 1389.58\n", "[3000]\tvalid_set's rmse: 1386.45\n", "[4000]\tvalid_set's rmse: 1385.03\n", "[5000]\tvalid_set's rmse: 1384.81\n", "[6000]\tvalid_set's rmse: 1384.61\n", "[7000]\tvalid_set's rmse: 1384.48\n", "[8000]\tvalid_set's rmse: 1384.34\n", "[9000]\tvalid_set's rmse: 1384.35\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-1384.3118\t = Validation score (-root_mean_squared_error)\n", "\t820.56s\t = Training runtime\n", "\t0.42s\t = Validation runtime\n", "Fitting model: RandomForestMSE ...\n", "\t-1349.2685\t = Validation score (-root_mean_squared_error)\n", "\t4440.72s\t = Training runtime\n", "\t0.21s\t = Validation runtime\n", "Fitting model: ExtraTreesMSE ...\n", "\t-1451.9243\t = Validation score (-root_mean_squared_error)\n", "\t1308.72s\t = Training runtime\n", "\t0.22s\t = Validation runtime\n", "Fitting model: NeuralNetFastAI ...\n", "\t-1514.4165\t = Validation score (-root_mean_squared_error)\n", "\t158.34s\t = Training runtime\n", "\t0.24s\t = Validation runtime\n", "Fitting model: NeuralNetTorch ...\n", "\t-1537.7455\t = Validation score (-root_mean_squared_error)\n", "\t143.11s\t = Training runtime\n", "\t0.53s\t = Validation runtime\n", "Fitting model: LightGBMLarge ...\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "[1000]\tvalid_set's rmse: 1327.67\n", "[2000]\tvalid_set's rmse: 1325.67\n", "[3000]\tvalid_set's rmse: 1325.22\n", "[4000]\tvalid_set's rmse: 1325.1\n", "[5000]\tvalid_set's rmse: 1325.06\n", "[6000]\tvalid_set's rmse: 1325.05\n", "[7000]\tvalid_set's rmse: 1325.04\n", "[8000]\tvalid_set's rmse: 1325.04\n", "[9000]\tvalid_set's rmse: 1325.04\n", "[10000]\tvalid_set's rmse: 1325.04\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\t-1325.0433\t = Validation score (-root_mean_squared_error)\n", "\t2420.99s\t = Training runtime\n", "\t1.04s\t = Validation runtime\n", "Fitting model: WeightedEnsemble_L2 ...\n", "\tEnsemble Weights: {'LightGBMLarge': 0.571, 'RandomForestMSE': 0.333, 'NeuralNetFastAI': 0.095}\n", "\t-1313.9254\t = Validation score (-root_mean_squared_error)\n", "\t0.03s\t = Training runtime\n", "\t0.0s\t = Validation runtime\n", "AutoGluon training complete, total runtime = 9995.55s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 1683.5 rows/s (2500 batch size)\n", "TabularPredictor saved. To load, use: predictor = TabularPredictor.load(\"multilabel_predictor_source\\Predictor_X3112_mean\")\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('multilabel_predictor_source')\n" ] } ], "source": [ "# Define paths\n", "train_csv_path = 'train.csv'\n", "train_image_dir = 'train_images'\n", "test_csv_path = 'test.csv'\n", "test_image_dir = 'test_images'\n", "output_path = 'prediction.csv'\n", "\n", "# Load train and test datasets\n", "train_df = pd.read_csv(train_csv_path)\n", "\n", "# Columns for ancillary data and target traits\n", "ancillary_columns = train_df.columns[:-6] # First 164 columns are ancillary data\n", "target_columns = train_df.columns[-6:] # Last 6 columns are target traits\n", "\n", "# Load Vision Transformer model and feature extractor\n", "# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "# vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k').to(device)\n", "# feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')\n", "\n", "# Generate image embeddings for train and test datasets\n", "print(\"Extracting image embeddings for training data...\")\n", "# train_image_embeddings = preprocess_images(train_df, train_image_dir)\n", "with open('train_image_embeddings.pkl', 'rb') as f:\n", " train_image_embeddings = pickle.load(f)\n", "\n", "# Combine ancillary data and image embeddings\n", "print(\"Combining ancillary data and image embeddings...\")\n", "train_combined = pd.concat([train_df[ancillary_columns], train_image_embeddings, train_df[target_columns]], axis=1)\n", "\n", "# Initialize MultilabelPredictor\n", "targets = list(target_columns)\n", "problem_types = ['regression'] * len(targets)\n", "eval_metrics = ['mean_absolute_percentage_error'] * len(targets)\n", "hyperparameters = {\n", "\t'NN_TORCH': {},\n", "\t'GBM': ['GBMLarge'],\n", "\t'FASTAI': {}\n", "}\n", "\n", "multi_predictor = MultilabelPredictor(\n", " labels=targets,\n", " problem_types=problem_types,\n", " # eval_metrics=eval_metrics,\n", " path='multilabel_predictor_source'\n", ")\n", "\n", "# Train MultilabelPredictor\n", "print(\"Training MultilabelPredictor...\")\n", "multi_predictor.fit(train_combined, hyperparameters=hyperparameters)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Extracting image embeddings for test data...\n", "Making predictions on test data...\n", "Predicting with TabularPredictor for label: X4_mean ...\n", "Predicting with TabularPredictor for label: X11_mean ...\n", "Predicting with TabularPredictor for label: X18_mean ...\n", "Predicting with TabularPredictor for label: X26_mean ...\n", "Predicting with TabularPredictor for label: X50_mean ...\n", "Predicting with TabularPredictor for label: X3112_mean ...\n", "Saving predictions to prediction.csv...\n", "Predictions saved successfully!\n" ] } ], "source": [ "test_df = pd.read_csv(test_csv_path)\n", "print(\"Extracting image embeddings for test data...\")\n", "# test_image_embeddings = preprocess_images(test_df, test_image_dir)\n", "with open('test_image_embeddings.pkl', 'rb') as f:\n", " test_image_embeddings = pickle.load(f)\n", "\n", "test_combined = pd.concat([test_df[ancillary_columns], test_image_embeddings], axis=1)\n", "\n", "# Make predictions on test data\n", "print(\"Making predictions on test data...\")\n", "predictions = multi_predictor.predict(test_combined)\n", "\n", "# Save predictions to CSV\n", "print(f\"Saving predictions to {output_path}...\")\n", "predictions.insert(0, 'id', test_df['id'])\n", "predictions.to_csv(output_path, index=False)\n", "print(\"Predictions saved successfully!\")" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 2 }