# Bagel-7B-Demo / data / dataset_info.py
# KingNish's picture
# Upload 110 files
# e6af450 verified
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0
from .interleave_datasets import UnifiedEditIterableDataset
from .t2i_dataset import T2IIterableDataset
from .vlm_dataset import SftJSONLIterableDataset
# Lookup table from a dataset group name to the iterable dataset class
# imported above. The keys are the same group names used as the top-level
# keys of DATASET_INFO.
DATASET_REGISTRY = {
    't2i_pretrain': T2IIterableDataset,
    'vlm_sft': SftJSONLIterableDataset,
    'unified_edit': UnifiedEditIterableDataset,
}
# Per-dataset settings, nested as: group name -> dataset name -> options.
# The group names are the same keys used in DATASET_REGISTRY.
# NOTE(review): the 'your_data_path/...' values look like placeholders that
# must be replaced with real locations before use -- confirm.
DATASET_INFO = {
    't2i_pretrain': {
        't2i': {
            # Path of the parquet files.
            'data_dir': 'your_data_path/bagel_example/t2i',
            # Number of data units to be sharded across all ranks and workers.
            'num_files': 10,
            # Number of total samples in the dataset.
            'num_total_samples': 1000,
        },
    },
    'unified_edit': {
        'seedxedit_multi': {
            'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi',
            'num_files': 10,
            'num_total_samples': 1000,
            # Information of the parquet files.
            'parquet_info_path': 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json',
        },
    },
    'vlm_sft': {
        'llava_ov': {
            # Presumably the image root referenced by the JSONL records -- TODO confirm.
            'data_dir': 'your_data_path/bagel_example/vlm/images',
            'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl',
            'num_total_samples': 1000,
        },
    },
}