# pdf2dataset / app.py
# Quentin Gallouédec
# improve template
# 2aa695b
import os
import random
import re
from string import Template
import gradio as gr
import pandas as pd
from datasets import Dataset
from huggingface_hub import HfApi
from pypdf import PdfReader
# Characters that are dropped from the extracted text (bullets, arrows, boxes,
# the Unicode replacement character, ...).
# NOTE(review): several entries show up as empty strings here; they may be
# private-use glyphs that do not render -- confirm against the original file.
# Replacing an empty string with an empty string is a no-op, so they are harmless.
to_be_removed = ["ͳ", "•", "→", "□", "▪", "►", "�", "", "", "", ""]

# Characters that are rewritten to a plain-text equivalent. Line breaks are
# mapped to a single space, so each page ends up as one line of text.
to_be_replaced = {
    "½": "1/2",
    "–": "-",
    "‘": "'",
    "’": "'",
    "…": "...",
    "₋": "-",
    "−": "-",
    "⓫": "11.",
    "⓬": "12.",
    "⓭": "13.",
    "⓮": "14.",
    "◦": "°",
    "❶": "1.",
    "❷": "2.",
    "❸": "3.",
    "❹": "4.",
    "❺": "5.",
    "❻": "6.",
    "❼": "7.",
    "❽": "8.",
    "❾": "9.",
    "❿": "10.",
    "\n": " ",
}


def clean(text):
    """Normalize the raw text extracted from one PDF page.

    The steps are applied in this order:

    1. drop every character listed in ``to_be_removed``;
    2. substitute every key of ``to_be_replaced`` by its value;
    3. insert a space after any "." that is not already followed by one;
    4. insert a space between a lowercase letter directly followed by an
       uppercase letter (accented Latin letters included);
    5. remove the space before a comma or a period, and the spaces around a
       hyphen;
    6. collapse every run of consecutive spaces into a single space.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Remove all the unwanted characters
    for char in to_be_removed:
        text = text.replace(char, "")
    # Replace all the characters that need to be replaced
    for char, replacement in to_be_replaced.items():
        text = text.replace(char, replacement)
    # Make sure that every "." is followed by a space
    text = re.sub(r"\.([^ ])", r". \1", text)
    # Add a space between a lowercase followed by an uppercase "aA" -> "a A" (include accents)
    text = re.sub(r"([a-zà-öø-ÿ])([A-ZÀ-ÖØ-Þ])", r"\1 \2", text)
    # Make sure that there is no space before a comma, a period, or a hyphen
    text = text.replace(" ,", ",")
    text = text.replace(" .", ".")
    text = text.replace(" -", "-")
    text = text.replace("- ", "-")
    # Collapse runs of spaces in one pass. The explicit quantifier is used on
    # purpose: a loop that tests for and replaces a single space never ends.
    text = re.sub(r" {2,}", " ", text)
    return text
def pdf2dataset(pathes, user_id, dataset_id, token, private, progress=gr.Progress()):
    """Convert the uploaded PDFs to a text dataset and push it to the Hugging Face Hub.

    Each page of each PDF becomes one row with two columns: ``text`` (the
    cleaned page text) and ``source`` (the name of the file it came from).

    Args:
        pathes: Paths of the uploaded PDF files.
        user_id: Hub namespace to push to. When empty, the shared
            ``pdf2dataset`` namespace is used and the dataset is made public.
        dataset_id: Name of the dataset. When empty, a random hexadecimal
            name is generated.
        token: Hub API token. When empty, the ``HF_TOKEN`` environment
            variable is used.
        private: Whether the dataset should be private (ignored when pushing
            to the shared namespace).
        progress: Gradio progress tracker.

    Returns:
        A tuple ``(instructions, preview, delete_dataset_id)``: the markdown
        usage instructions, a DataFrame with the first 10 rows, and the
        dataset name to pre-fill in the deletion box (empty unless the dataset
        lives in the shared namespace).

    Raises:
        gr.Error: If no file was uploaded, if only some of the three
            credentials fields are filled, or if a PDF cannot be read.
    """
    # Without this guard an empty upload fails with an opaque TypeError.
    if not pathes:
        raise gr.Error("Please upload at least one PDF file.")

    # The three personal-namespace fields go together: all or none.
    if any([user_id, dataset_id, token]) and not all([user_id, dataset_id, token]):
        raise gr.Error("Please provide all three: User ID, Dataset ID, and API token.")

    if user_id == "":
        user_id = "pdf2dataset"
        private = False

    if dataset_id == "":
        dataset_id = f"{random.getrandbits(128):x}"

    if token == "":
        token = os.getenv("HF_TOKEN")

    progress(0, desc="Starting...")

    # Open every PDF up front so that an unreadable file is reported before
    # any conversion work starts.
    filenames = [os.path.basename(path) for path in pathes]
    readers = []
    for path, filename in zip(pathes, filenames):
        try:
            readers.append(PdfReader(path))
        except Exception as e:
            # Keep the original exception as the cause for the server logs.
            raise gr.Error(f"Failed to read {filename}.") from e

    num_pages = sum(len(reader.pages) for reader in readers)

    # Convert the PDFs to text
    page_texts = []
    page_filenames = []
    progress(0, desc="Converting pages...")
    for reader, filename in zip(readers, filenames):
        for page in reader.pages:
            page_text = page.extract_text()
            page_text = clean(page_text)
            page_texts.append(page_text)
            page_filenames.append(filename)
            progress(len(page_texts) / num_pages, desc="Converting pages...")

    # Upload the dataset to Hugging Face
    progress(0, desc="Uploading to Hugging Face...")
    dataset = Dataset.from_dict({"text": page_texts, "source": page_filenames})
    dataset.push_to_hub(f"{user_id}/{dataset_id}", token=token, private=private)
    progress(1, desc="Done!")

    instructions = instructions_template.substitute(user_id=user_id, dataset_id=dataset_id)
    preview = pd.DataFrame(dataset[:10])
    print(f"Dataset {dataset_id} uploaded successfully.")
    # Only datasets in the shared namespace can be deleted from this app.
    delete_dataset_id = dataset_id if user_id == "pdf2dataset" else ""
    return instructions, preview, delete_dataset_id
def delete_dataset(repo_id_or_dataset_id):
    """Delete a dataset from the shared ``pdf2dataset`` namespace.

    Args:
        repo_id_or_dataset_id: Either a full repo id (``namespace/name``) or a
            bare dataset name, in which case the ``pdf2dataset`` namespace is
            assumed.

    Returns:
        A short status message (prefixed with ✅ or ❌) describing the outcome.
        Failures are reported through the message; no exception is raised.
    """
    repo_id_or_dataset_id = (repo_id_or_dataset_id or "").strip()
    if not repo_id_or_dataset_id:
        return "❌ Please enter a dataset name to delete."

    # Get the user_id, dataset_id
    if "/" in repo_id_or_dataset_id:
        # Split on the first "/" only, so an input with extra slashes cannot
        # make the unpacking fail.
        user_id, _, dataset_id = repo_id_or_dataset_id.partition("/")
        repo_id = repo_id_or_dataset_id
    else:
        user_id = "pdf2dataset"
        dataset_id = repo_id_or_dataset_id
        repo_id = f"{user_id}/{dataset_id}"

    # Only allow the deletion of datasets in the pdf2dataset namespace
    if user_id != "pdf2dataset":
        print(f"Deleting datasets in the {user_id} namespace is not allowed.")
        return f"❌ Deleting datasets in the {user_id} namespace is not allowed."

    # Delete the dataset
    api = HfApi()
    try:
        api.delete_repo(repo_id, repo_type="dataset")
    except Exception as e:
        # Deliberately broad: any failure is reported back to the UI.
        print(f"Error deleting dataset {repo_id}: {e}")
        return f"❌ Error deleting dataset: {e}"
    print(f"Dataset {repo_id} deleted successfully.")
    return "✅ Dataset deleted successfully."
# Markdown warning rendered under the upload widget: by default the data is
# pushed to a public repository that anyone can delete.
caution_text = """⚠️ Caution:
- This process will upload your data to a public Hugging Face repository. Do not upload sensitive information.
- Anyone (including you) will be able to delete the dataset once it is uploaded.
To avoid this, you can push the dataset to your personal Hugging Face account ⬇️
"""
# Markdown template for the "use the dataset" section. The $user_id and
# $dataset_id placeholders are filled in with Template.substitute().
instructions_template = Template(
"""
🔗: https://huggingface.co/datasets/$user_id/$dataset_id.
```python
from datasets import load_dataset
dataset = load_dataset("$user_id/$dataset_id")
```
"""
)
# Gradio interface: upload PDFs, optionally give personal Hub credentials,
# convert and upload, then show usage instructions and a deletion control.
with gr.Blocks() as demo:
    gr.Markdown("# PDF to 🤗 Dataset")

    # Step 1: file upload and (optional) personal namespace settings.
    gr.Markdown("## 1️⃣ Upload PDFs")
    file = gr.File(file_types=["pdf"], file_count="multiple")
    gr.Markdown(caution_text)
    with gr.Accordion("🔒 Pushing to my personal Hugging Face namespace", open=False):
        gr.Markdown(
            """Recommended for API token
- Go to https://huggingface.co/settings/tokens?new_token=true
- Choose _Fine-grained_
- Check only _**Repos**/Write access to contents/settings of all repos under your personal namespace_
- Revoke the token after use"""
        )
        user_id = gr.Textbox(label="User ID", placeholder="Enter your Hugging Face user ID")
        dataset_id = gr.Textbox(label="Dataset ID", placeholder="Enter the desired dataset ID")
        # The token is a secret: mask it instead of showing it in clear text.
        token = gr.Textbox(label="API token", placeholder="Enter a Hugging Face API token", type="password")
        private = gr.Checkbox(label="Make dataset private")

    # Step 2: conversion trigger and preview of the resulting rows.
    gr.Markdown("## 2️⃣ Convert the PDFs and upload")
    convert_button = gr.Button("🔄 Convert and upload")
    preview = gr.Dataframe(
        label="Preview (first 10 rows)",
        headers=["text", "source"],
        datatype=["str", "str"],
        row_count=10,
        wrap=True,
        height=200,
    )

    # Step 3: usage snippet, shown with placeholder ids until a dataset is uploaded.
    gr.Markdown("## 3️⃣ Use the dataset in your code")
    instructions = gr.Markdown(instructions_template.substitute(user_id="pdf2dataset", dataset_id="generated_dataset_id"))

    # Step 4: deletion of a dataset.
    gr.Markdown("## 4️⃣ Delete the dataset (optional)")
    dataset_id_to_delete = gr.Textbox("", placeholder="Enter dataset name to delete", label="Dataset to delete")
    delete_button = gr.Button("🗑️ Delete dataset")

    # Define the actions
    convert_button.click(
        pdf2dataset,
        inputs=[file, user_id, dataset_id, token, private],
        outputs=[instructions, preview, dataset_id_to_delete],
    )
    # The status message returned by delete_dataset is written onto the button itself...
    delete_button.click(delete_dataset, inputs=[dataset_id_to_delete], outputs=[delete_button])
    # ...and the button label is restored as soon as the user edits the textbox.
    dataset_id_to_delete.input(lambda: "🗑️ Delete dataset", outputs=[delete_button])

demo.launch()