Spaces:
Running
Running
Update process_hf_dataset.py
Browse files — process_hf_dataset.py (+4 −4)
process_hf_dataset.py
CHANGED
@@ -186,11 +186,11 @@ def process_hf_dataset(batch_size=100, use_gpu=False):
|
|
186 |
# Do not clear or populate with defaults here—let UI buttons handle this
|
187 |
try:
|
188 |
collection = client.get_or_create_collection(DB_NAME)
|
189 |
-
logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}")
|
190 |
# Verify collection is valid
|
191 |
if collection is None or not hasattr(collection, 'add'):
|
192 |
raise ValueError("ChromaDB collection access failed")
|
193 |
-
logger.info(
|
194 |
except Exception as e:
|
195 |
logger.error(f"Error accessing ChromaDB collection: {e}")
|
196 |
raise
|
@@ -277,8 +277,8 @@ def save_chromadb_to_hf(dataset_name=HF_DATASET_NAME, token=os.getenv("HF_KEY"))
|
|
277 |
logger.info(f"Created Hugging Face Dataset with {len(data['code'])} entries")
|
278 |
|
279 |
# Push to Hugging Face Hub
|
280 |
-
dataset.push_to_hub(dataset_name, token=token)
|
281 |
-
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}")
|
282 |
# Verify push (optional, could check dataset on Hub)
|
283 |
logger.info(f"Verified Hugging Face dataset push with {len(dataset)} entries")
|
284 |
except Exception as e:
|
|
|
186 |
# Do not clear or populate with defaults here—let UI buttons handle this
|
187 |
try:
|
188 |
collection = client.get_or_create_collection(DB_NAME)
|
189 |
+
logger.info(f"Using existing or new ChromaDB collection: {DB_NAME}, contains {collection.count()} entries")
|
190 |
# Verify collection is valid
|
191 |
if collection is None or not hasattr(collection, 'add'):
|
192 |
raise ValueError("ChromaDB collection access failed")
|
193 |
+
logger.info("Verified ChromaDB collection is valid")
|
194 |
except Exception as e:
|
195 |
logger.error(f"Error accessing ChromaDB collection: {e}")
|
196 |
raise
|
|
|
277 |
logger.info(f"Created Hugging Face Dataset with {len(data['code'])} entries")
|
278 |
|
279 |
# Push to Hugging Face Hub
|
280 |
+
dataset.push_to_hub(dataset_name, token=token, exist_ok=True) # Allow overwriting existing dataset
|
281 |
+
logger.info(f"Dataset pushed to Hugging Face Hub as {dataset_name}, overwriting existing dataset")
|
282 |
# Verify push (optional, could check dataset on Hub)
|
283 |
logger.info(f"Verified Hugging Face dataset push with {len(dataset)} entries")
|
284 |
except Exception as e:
|