PrecollatorForGeneAndCellClassification has no attribute save_pretrained

#528
by zzbb2266 - opened

Hi there, thanks for the great pipeline and maintenance! I am currently trying to replicate cell_classification.ipynb from the examples folder, but I get an error from the PrecollatorForGeneAndCellClassification module just after the first epoch finishes:

Epoch	Training Loss	Validation Loss	Accuracy	Macro F1
0	0.133900	0.411489	0.883736	0.684715
  0%|          | 0/1 [28:44<?, ?it/s]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[5], line 6
      1 train_valid_id_split_dict = {"attr_key": "individual",
      2                             "train": train_ids,
      3                             "eval": eval_ids}
      5 # Example 6 layer 30M Geneformer model: https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-6L-30M-i2048/model.safetensors
----> 6 all_metrics = cc.validate(model_directory="/home/data1/geneformer/Geneformer/gf-6L-30M-i2048/",
      7                           prepared_input_data_file=f"{output_dir}/{output_prefix}_labeled_train.dataset",
      8                           id_class_dict_file=f"{output_dir}/{output_prefix}_id_class_dict.pkl",
      9                           output_directory=output_dir,
     10                           output_prefix=output_prefix,
     11                           split_id_dict=train_valid_id_split_dict)
     12                           # to optimize hyperparameters, set n_hyperopt_trials=100 (or alternative desired # of trials)

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/geneformer/classifier.py:791, in Classifier.validate(self, model_directory, prepared_input_data_file, id_class_dict_file, output_directory, output_prefix, split_id_dict, attr_to_split, attr_to_balance, gene_balance, max_trials, pval_threshold, save_eval_output, predict_eval, predict_trainer, n_hyperopt_trials, save_gene_split_datasets, debug_gene_split_datasets)
    789     train_data = data.select(train_indices)
    790 if n_hyperopt_trials == 0:
--> 791     trainer = self.train_classifier(
    792         model_directory,
    793         num_classes,
    794         train_data,
    795         eval_data,
    796         ksplit_output_dir,
    797         predict_trainer,
    798     )
    799 else:
    800     trainer = self.hyperopt_classifier(
    801         model_directory,
    802         num_classes,
   (...)    806         n_trials=n_hyperopt_trials,
    807     )

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/geneformer/classifier.py:1269, in Classifier.train_classifier(self, model_directory, num_classes, train_data, eval_data, output_directory, predict)
   1259 trainer = Trainer(
   1260     model=model,
   1261     args=training_args_init,
   (...)   1265     compute_metrics=cu.compute_metrics,
   1266 )
   1268 # train the classifier
-> 1269 trainer.train()
   1270 trainer.save_model(output_directory)
   1271 if predict is True:
   1272     # make eval predictions and save predictions and metrics

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/trainer.py:2245, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   2243         hf_hub_utils.enable_progress_bars()
   2244 else:
-> 2245     return inner_training_loop(
   2246         args=args,
   2247         resume_from_checkpoint=resume_from_checkpoint,
   2248         trial=trial,
   2249         ignore_keys_for_eval=ignore_keys_for_eval,
   2250     )

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/trainer.py:2661, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2658     self.control.should_training_stop = True
   2660 self.control = self.callback_handler.on_epoch_end(args, self.state, self.control)
-> 2661 self._maybe_log_save_evaluate(
   2662     tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=learning_rate
   2663 )
   2665 if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
   2666     if is_torch_xla_available():
   2667         # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/trainer.py:3103, in Trainer._maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate)
   3100         self.control.should_save = is_new_best_metric
   3102 if self.control.should_save:
-> 3103     self._save_checkpoint(model, trial)
   3104     self.control = self.callback_handler.on_save(self.args, self.state, self.control)

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/trainer.py:3200, in Trainer._save_checkpoint(self, model, trial)
   3198 run_dir = self._get_output_dir(trial=trial)
   3199 output_dir = os.path.join(run_dir, checkpoint_folder)
-> 3200 self.save_model(output_dir, _internal_call=True)
   3202 if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH] and self.state.best_global_step:
   3203     best_checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.best_global_step}"

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/trainer.py:3902, in Trainer.save_model(self, output_dir, _internal_call)
   3899         self.model_wrapped.save_checkpoint(output_dir)
   3901 elif self.args.should_save:
-> 3902     self._save(output_dir)
   3904 # Push to the Hub when `save_model` is called by the user.
   3905 if self.args.push_to_hub and not _internal_call:

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/trainer.py:4018, in Trainer._save(self, output_dir, state_dict)
   4012 elif (
   4013     self.data_collator is not None
   4014     and hasattr(self.data_collator, "tokenizer")
   4015     and self.data_collator.tokenizer is not None
   4016 ):
   4017     logger.info("Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`")
-> 4018     self.data_collator.tokenizer.save_pretrained(output_dir)
   4020 # Good practice: save your training arguments together with the trained model
   4021 torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

File /opt/miniconda/envs/gnenformer/lib/python3.13/site-packages/transformers/tokenization_utils_base.py:1108, in SpecialTokensMixin.__getattr__(self, key)
   1105         return self.convert_tokens_to_ids(attr_as_tokens) if attr_as_tokens is not None else None
   1107 if key not in self.__dict__:
-> 1108     raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
   1109 else:
   1110     return super().__getattr__(key)

AttributeError: PrecollatorForGeneAndCellClassification has no attribute save_pretrained

Should I add a save_pretrained method manually in trainer.py or classifier.py? From the traceback, Trainer._save falls back to calling self.data_collator.tokenizer.save_pretrained(output_dir), which Geneformer's precollator does not implement, so it looks to me like a version mismatch with the transformers package. My environment is:

Package                   Version
------------------------- -----------
absl-py                   2.2.2
accelerate                1.6.0
accumulation_tree         0.6.4
aiohappyeyeballs          2.6.1
aiohttp                   3.11.18
aiosignal                 1.3.2
alembic                   1.15.2
anndata                   0.11.4
annotated-types           0.7.0
array_api_compat          1.11.2
asttokens                 3.0.0
attrs                     25.3.0
certifi                   2025.4.26
charset-normalizer        3.4.1
click                     8.1.8
colorlog                  6.9.0
comm                      0.2.2
contourpy                 1.3.2
cycler                    0.12.1
datasets                  3.5.1
debugpy                   1.8.14
decorator                 5.2.1
dill                      0.3.8
docker-pycreds            0.4.0
exceptiongroup            1.2.2
executing                 2.2.0
filelock                  3.18.0
fonttools                 4.57.0
frozenlist                1.6.0
fsspec                    2025.3.0
geneformer                0.1.0
gitdb                     4.0.12
GitPython                 3.1.44
greenlet                  3.2.1
grpcio                    1.71.0
h5py                      3.13.0
huggingface-hub           0.30.2
idna                      3.10
importlib_metadata        8.6.1
ipykernel                 6.29.5
ipython                   9.2.0
ipython_pygments_lexers   1.1.1
jedi                      0.19.2
Jinja2                    3.1.6
joblib                    1.4.2
jsonschema                4.23.0
jsonschema-specifications 2025.4.1
jupyter_client            8.6.3
jupyter_core              5.7.2
kiwisolver                1.4.8
legacy-api-wrap           1.4.1
llvmlite                  0.44.0
loompy                    3.0.8
Mako                      1.3.10
Markdown                  3.8
MarkupSafe                3.0.2
matplotlib                3.10.1
matplotlib-inline         0.1.7
mpmath                    1.3.0
msgpack                   1.1.0
multidict                 6.4.3
multiprocess              0.70.16
natsort                   8.4.0
nest_asyncio              1.6.0
networkx                  3.4.2
numba                     0.61.2
numpy                     2.2.5
numpy-groupies            0.11.2
nvidia-cublas-cu12        12.6.4.1
nvidia-cuda-cupti-cu12    12.6.80
nvidia-cuda-nvrtc-cu12    12.6.77
nvidia-cuda-runtime-cu12  12.6.77
nvidia-cudnn-cu12         9.5.1.17
nvidia-cufft-cu12         11.3.0.4
nvidia-cufile-cu12        1.11.1.6
nvidia-curand-cu12        10.3.7.77
nvidia-cusolver-cu12      11.7.1.2
nvidia-cusparse-cu12      12.5.4.2
nvidia-cusparselt-cu12    0.6.3
nvidia-nccl-cu12          2.26.2
nvidia-nvjitlink-cu12     12.6.85
nvidia-nvtx-cu12          12.6.77
optuna                    4.3.0
optuna-integration        4.3.0
packaging                 25.0
pandas                    2.2.3
parso                     0.8.4
patsy                     1.0.1
peft                      0.15.2
pexpect                   4.9.0
pickleshare               0.7.5
pillow                    11.2.1
pip                       25.1
platformdirs              4.3.7
prompt_toolkit            3.0.51
propcache                 0.3.1
protobuf                  6.30.2
psutil                    7.0.0
ptyprocess                0.7.0
pure_eval                 0.2.3
pyarrow                   20.0.0
pydantic                  2.11.4
pydantic_core             2.33.2
Pygments                  2.19.1
pynndescent               0.5.13
pyparsing                 3.2.3
python-dateutil           2.9.0.post0
pytz                      2025.2
pyudorandom               1.0.0
PyYAML                    6.0.2
pyzmq                     26.4.0
ray                       2.45.0
referencing               0.36.2
regex                     2024.11.6
requests                  2.32.3
rpds-py                   0.24.0
safetensors               0.5.3
scanpy                    1.11.1
scikit-learn              1.5.2
scikit-misc               0.5.1
scipy                     1.15.2
seaborn                   0.13.2
sentry-sdk                2.27.0
session-info2             0.1.2
setproctitle              1.3.6
setuptools                80.0.1
six                       1.17.0
smmap                     5.0.2
SQLAlchemy                2.0.40
stack_data                0.6.3
statsmodels               0.14.4
sympy                     1.14.0
tdigest                   0.5.2.2
tensorboard               2.19.0
tensorboard-data-server   0.7.2
threadpoolctl             3.6.0
tokenizers                0.21.1
torch                     2.7.0
tornado                   6.4.2
tqdm                      4.67.1
traitlets                 5.14.3
transformers              4.51.3
triton                    3.3.0
typing_extensions         4.13.2
typing-inspection         0.4.0
tzdata                    2025.2
umap-learn                0.5.7
urllib3                   2.4.0
wandb                     0.19.10
wcwidth                   0.2.13
Werkzeug                  3.1.3
xxhash                    3.5.0
yarl                      1.20.0
zipp                      3.21.0

Could you let me know how to solve this issue, or should I just downgrade the transformers package? Thank you!
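
In the meantime, one stopgap I am considering (an untested sketch, not an upstream fix; the module path and the no-op body are my assumptions) would be to give the precollator the missing method before calling cc.validate, instead of editing trainer.py:

from geneformer.collator_for_classification import (
    PrecollatorForGeneAndCellClassification,
)

def _noop_save_pretrained(self, save_directory, **kwargs):
    # The precollator carries no tokenizer files that need to be serialized
    # with the checkpoint, so Trainer._save can safely skip saving it.
    return []

# Patch the class before cc.validate(...) triggers checkpoint saving.
PrecollatorForGeneAndCellClassification.save_pretrained = _noop_save_pretrained

Because save_pretrained would then resolve through normal attribute lookup, the SpecialTokensMixin.__getattr__ fallback that raises the AttributeError should never be reached.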

Update: this issue has been solved by downgrading the transformers package, which also resolved another error about evaluation_strategy. FYI.
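
For reference, a minimal illustration of that second error: recent transformers releases renamed the TrainingArguments option evaluation_strategy to eval_strategy (deprecated around v4.41 and removed around v4.46, going by memory rather than the changelog), so on a new transformers the old keyword fails while the rename below works. The values here are placeholders, not Geneformer's actual training config:

from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",        # placeholder path
    eval_strategy="epoch",   # formerly evaluation_strategy="epoch"
)

Downgrading transformers below the removal restores the old keyword, which is why the single downgrade fixed both errors here.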
