pdf2dataset

Sleeping

pdf2dataset / app.py

Quentin Gallouédec

improve template

2aa695b 9 months ago

7.09 kB

	import os
	import random
	import re
	from string import Template

	import gradio as gr
	import pandas as pd
	from datasets import Dataset
	from huggingface_hub import HfApi
	from pypdf import PdfReader


	# Characters that `clean` strips outright from extracted PDF text.
	# NOTE(review): the trailing empty-string entries are no-ops for str.replace;
	# presumably they once held glyphs that were lost in transit - confirm against
	# the upstream file before relying on them.
	to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]
	# Single-character substitutions applied by `clean`, in insertion order:
	# typographic punctuation and circled-number glyphs are mapped to plain
	# equivalents, and every newline becomes a space.
	to_be_replaced = {
	"½": "1/2",
	"–": "-",
	"‘": "'",
	"’": "'",
	"…": "...",
	"₋": "-",
	"−": "-",
	"⓫": "11.",
	"⓬": "12.",
	"⓭": "13.",
	"⓮": "14.",
	"◦": "°",
	"❶": "1.",
	"❷": "2.",
	"❸": "3.",
	"❹": "4.",
	"❺": "5.",
	"❻": "6.",
	"❼": "7.",
	"❽": "8.",
	"❾": "9.",
	"❿": "10.",
	"\n": " ",
	}


	def clean(text):
	    """Normalize the raw text extracted from one PDF page.

	    The steps run in this order, and later steps see the output of earlier ones:

	    1. Delete every character listed in ``to_be_removed``.
	    2. Apply the substitutions in ``to_be_replaced`` (this also turns newlines
	       into spaces).
	    3. Insert a space after any "." that is not already followed by one.
	    4. Insert a space between a lowercase letter directly followed by an
	       uppercase letter (accented Latin letters included).
	    5. Drop the space before a comma, a period or a hyphen, and the space
	       after a hyphen.
	    6. Collapse runs of spaces into a single space.

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string.
	    """
	    # Remove all the unwanted characters
	    for char in to_be_removed:
	        text = text.replace(char, "")

	    # Replace all the characters that need to be replaced
	    for char, replacement in to_be_replaced.items():
	        text = text.replace(char, replacement)

	    # Make sure that every "." is followed by a space
	    text = re.sub(r"\.([^ ])", r". \1", text)

	    # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
	    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)

	    # Make sure that there is no space before a comma, a period, or a hyphen
	    # (and none after a hyphen either)
	    text = text.replace(" ,", ",")
	    text = text.replace(" .", ".")
	    text = text.replace(" -", "-")
	    text = text.replace("- ", "-")

	    # Collapse every run of two or more spaces into one. This is done in a
	    # single regex pass: a `while " " in text` loop that replaces a space with
	    # a space never changes the text and therefore never terminates.
	    return re.sub(r" {2,}", " ", text)


	def pdf2dataset(pathes, user_id, dataset_id, token, private, progress=gr.Progress()):
	    """Convert PDFs into a one-row-per-page dataset and push it to the Hub.

	    Args:
	        pathes: File paths of the PDFs to convert.
	        user_id: Target namespace. Must be given together with ``dataset_id``
	            and ``token``, or all three left empty.
	        dataset_id: Name of the dataset repository to create.
	        token: Hugging Face API token used for the upload.
	        private: Whether the pushed dataset is private. Forced to ``False``
	            when ``user_id`` is empty.
	        progress: Gradio progress tracker used to report conversion status.

	    Returns:
	        A tuple ``(instructions, preview, delete_dataset_id)``: the filled-in
	        usage instructions, a DataFrame with the first 10 rows, and the
	        dataset id when it was pushed to the ``pdf2dataset`` namespace
	        (otherwise an empty string).

	    Raises:
	        gr.Error: If only some of the three credentials are provided, if no
	            PDF was supplied, or if a PDF cannot be opened.
	    """
	    if any([user_id, dataset_id, token]) and not all([user_id, dataset_id, token]):
	        raise gr.Error("Please provide all three: User ID, Dataset ID, and API token.")

	    # Without this guard an empty upload would fail below with a raw TypeError
	    if not pathes:
	        raise gr.Error("Please upload at least one PDF file.")

	    # Fall back to the shared namespace, a random dataset name and the
	    # token taken from the environment
	    if user_id == "":
	        user_id = "pdf2dataset"
	        private = False
	    if dataset_id == "":
	        dataset_id = f"{random.getrandbits(128):x}"
	    if token == "":
	        token = os.getenv("HF_TOKEN")

	    progress(0, desc="Starting...")
	    filenames = [os.path.basename(path) for path in pathes]
	    readers = []
	    for path, filename in zip(pathes, filenames):
	        try:
	            readers.append(PdfReader(path))
	        except Exception as e:
	            # Keep the original exception as the cause for the server log
	            raise gr.Error(f"Failed to read {filename}.") from e
	    num_pages = sum(len(reader.pages) for reader in readers)

	    # Convert the PDFs to text
	    page_texts = []
	    page_filenames = []
	    progress(0, desc="Converting pages...")
	    for reader, filename in zip(readers, filenames):
	        for page in reader.pages:
	            page_text = page.extract_text()
	            page_text = clean(page_text)
	            page_texts.append(page_text)
	            page_filenames.append(filename)
	            # Only reached when at least one page exists, so num_pages > 0
	            progress(len(page_texts) / num_pages, desc="Converting pages...")

	    # Upload the dataset to Hugging Face
	    progress(0, desc="Uploading to Hugging Face...")
	    dataset = Dataset.from_dict({"text": page_texts, "source": page_filenames})
	    dataset.push_to_hub(f"{user_id}/{dataset_id}", token=token, private=private)
	    progress(1, desc="Done!")

	    instructions = instructions_template.substitute(user_id=user_id, dataset_id=dataset_id)
	    preview = pd.DataFrame(dataset[:10])
	    print(f"Dataset {dataset_id} uploaded successfully.")
	    # Only datasets in the shared namespace can be deleted from this app,
	    # so only those pre-fill the "dataset to delete" field
	    delete_dataset_id = dataset_id if user_id == "pdf2dataset" else ""
	    return instructions, preview, delete_dataset_id


	def delete_dataset(repo_id_or_dataset_id):
	    """Delete a dataset from the shared ``pdf2dataset`` namespace.

	    Args:
	        repo_id_or_dataset_id: Either a full repo id (``"namespace/name"``)
	            or a bare dataset name, which is resolved inside the
	            ``pdf2dataset`` namespace.

	    Returns:
	        A status message starting with "✅" on success or "❌" on failure.
	        Failures are reported through the message, not raised.
	    """
	    target = repo_id_or_dataset_id.strip()
	    if not target:
	        return "❌ Please enter the name of the dataset to delete."

	    # Get the user_id, dataset_id. `partition` splits on the first "/" only,
	    # so an input with several slashes cannot break the unpacking.
	    user_id, separator, dataset_id = target.partition("/")
	    if not separator:
	        user_id, dataset_id = "pdf2dataset", target
	    repo_id = f"{user_id}/{dataset_id}"

	    # Only allow the deletion of datasets in the pdf2dataset namespace
	    if user_id != "pdf2dataset":
	        print(f"Deleting datasets in the {user_id} namespace is not allowed.")
	        return f"❌ Deleting datasets in the {user_id} namespace is not allowed."

	    # Delete the dataset
	    api = HfApi()
	    try:
	        api.delete_repo(repo_id, repo_type="dataset")
	    except Exception as e:
	        print(f"Error deleting dataset {repo_id}: {e}")
	        return f"❌ Error deleting dataset: {e}"
	    print(f"Dataset {repo_id} deleted successfully.")
	    return "✅ Dataset deleted successfully."


	# Warning rendered as Markdown in the UI, right below the PDF upload widget.
	caution_text = """⚠️ Caution:
	- This process will upload your data to a public Hugging Face repository. Do not upload sensitive information.
	- Anyone (including you) will be able to delete the dataset once it is uploaded.

	To avoid this, you can push the dataset to your personal Hugging Face account ⬇️
	"""

	# Markdown snippet with the dataset URL and a `load_dataset` example.
	# The $user_id and $dataset_id placeholders are filled in with
	# `Template.substitute` wherever the instructions are shown.
	instructions_template = Template(
	"""
	🔗: https://huggingface.co/datasets/$user_id/$dataset_id.

	```python
	from datasets import load_dataset

	dataset = load_dataset("$user_id/$dataset_id")
	```
	"""
	)

	# Build the Gradio UI: upload, optional personal-namespace settings,
	# convert/upload, usage instructions and dataset deletion.
	with gr.Blocks() as demo:
	    gr.Markdown("# PDF to 🤗 Dataset")
	    gr.Markdown("## 1️⃣ Upload PDFs")
	    # File-type filters are extensions with a leading dot (or a category
	    # such as "image"); a bare "pdf" is neither
	    file = gr.File(file_types=[".pdf"], file_count="multiple")
	    gr.Markdown(caution_text)
	    with gr.Accordion("🔒 Pushing to my personal Hugging Face namespace", open=False):
	        gr.Markdown(
	            """Recommended for API token
	- Go to https://huggingface.co/settings/tokens?new_token=true
	- Choose _Fine-grained_
	- Check only _Repos/Write access to contents/settings of all repos under your personal namespace_
	- Revoke the token after use"""
	        )
	        user_id = gr.Textbox(label="User ID", placeholder="Enter your Hugging Face user ID")
	        dataset_id = gr.Textbox(label="Dataset ID", placeholder="Enter the desired dataset ID")
	        # The token is a secret: mask it instead of showing it in clear text
	        token = gr.Textbox(
	            label="API token",
	            placeholder="Enter a Hugging Face API token",
	            type="password",
	        )
	        private = gr.Checkbox(label="Make dataset private")

	    gr.Markdown("## 2️⃣ Convert the PDFs and upload")
	    convert_button = gr.Button("🔄 Convert and upload")
	    preview = gr.Dataframe(
	        label="Preview (first 10 rows)",
	        headers=["text", "source"],
	        datatype=["str", "str"],
	        row_count=10,
	        wrap=True,
	        height=200,
	    )
	    gr.Markdown("## 3️⃣ Use the dataset in your code")
	    instructions = gr.Markdown(
	        instructions_template.substitute(user_id="pdf2dataset", dataset_id="generated_dataset_id")
	    )
	    gr.Markdown("## 4️⃣ Delete the dataset (optional)")
	    dataset_id_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
	    delete_button = gr.Button("🗑️ Delete dataset")

	    # Define the actions
	    convert_button.click(
	        pdf2dataset,
	        inputs=[file, user_id, dataset_id, token, private],
	        outputs=[instructions, preview, dataset_id_to_delete],
	    )
	    # The status message returned by delete_dataset is sent to the button itself
	    delete_button.click(delete_dataset, inputs=[dataset_id_to_delete], outputs=[delete_button])
	    # Editing the field restores the button's original caption
	    dataset_id_to_delete.input(lambda: "🗑️ Delete dataset", outputs=[delete_button])

	demo.launch()