Spaces:
Running
on
Zero
Running
on
Zero
# Copyright 2025 Bytedance Ltd. and/or its affiliates.
# SPDX-License-Identifier: Apache-2.0
from .interleave_datasets import UnifiedEditIterableDataset | |
from .t2i_dataset import T2IIterableDataset | |
from .vlm_dataset import SftJSONLIterableDataset | |
# Maps a dataset group name to the iterable dataset class imported above.
# The same group names are used as the top-level keys of DATASET_INFO.
DATASET_REGISTRY = {
    't2i_pretrain': T2IIterableDataset,
    'vlm_sft': SftJSONLIterableDataset,
    'unified_edit': UnifiedEditIterableDataset,
}
# Per-dataset settings, grouped by dataset group name (the same names used as
# keys of DATASET_REGISTRY). Each group maps a dataset name to its settings.
# NOTE(review): the 'your_data_path/...' values look like placeholders to be
# replaced with real locations before use -- confirm.
DATASET_INFO = {
    't2i_pretrain': {
        't2i': {
            # path of the parquet files
            'data_dir': 'your_data_path/bagel_example/t2i',
            # number of data units to be sharded across all ranks and workers
            'num_files': 10,
            # number of total samples in the dataset
            'num_total_samples': 1000,
        },
    },
    'unified_edit': {
        'seedxedit_multi': {
            'data_dir': 'your_data_path/bagel_example/editing/seedxedit_multi',
            'num_files': 10,
            'num_total_samples': 1000,
            # information of the parquet files
            'parquet_info_path': 'your_data_path/bagel_example/editing/parquet_info/seedxedit_multi_nas.json',
        },
    },
    'vlm_sft': {
        'llava_ov': {
            'data_dir': 'your_data_path/bagel_example/vlm/images',
            'jsonl_path': 'your_data_path/bagel_example/vlm/llava_ov_si.jsonl',
            'num_total_samples': 1000,
        },
    },
}