# Page-scrape residue (not part of the program): "Spaces: Sleeping"
import os | |
import gradio as gr | |
from pdf2image import convert_from_path, pdfinfo_from_path | |
import zipfile | |
import shutil | |
import tempfile | |
from pathlib import Path | |
import traceback | |
def zip_folder(folder_path, output_path):
    """Create a deflate-compressed zip archive of every file under a folder.

    Files are stored under their path relative to ``folder_path``. Entries are
    added in sorted order so the archive layout is reproducible. If
    ``output_path`` lies inside ``folder_path``, the archive being written is
    skipped instead of being added to itself.

    Args:
        folder_path: Directory to archive; it is walked recursively.
        output_path: Destination of the zip file (str or path-like).

    Returns:
        ``(True, "")`` on success, or ``(False, message)`` if any step raised.
        The failure message is also printed.
    """
    # Resolved once so the self-inclusion check below is a cheap comparison.
    archive_real = os.path.realpath(output_path)
    try:
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder_path):
                # Sorting in place makes os.walk descend in a stable order.
                dirs.sort()
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    if os.path.realpath(file_path) == archive_real:
                        # The zip file already exists on disk at this point;
                        # never archive the archive.
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
        return True, ""
    except Exception as e:
        error_msg = f"Error creating zip file: {str(e)}"
        print(error_msg)
        return False, error_msg
# Working area for the app, rooted in the system temp directory.
BASE_DIR = Path(tempfile.gettempdir()).joinpath("pdf_extractor")
DIRECTORY = BASE_DIR.joinpath("image_reference")  # rendered page images
DIRECTORY_OUTPUT = BASE_DIR.joinpath("output")  # zip archive of the pages
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Make sure both working directories exist as soon as the module is loaded.
for directory in DIRECTORIES:
    directory.mkdir(parents=True, exist_ok=True)
# File suffixes (lower-case, with the dot) that count as images.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']


def get_image_files(directory):
    """Return the image files located directly inside a directory.

    Only regular files whose suffix (compared case-insensitively) is listed in
    ``ALLOWED_EXTENSIONS`` are returned; sub-directories are ignored even if
    their name ends in an image suffix. The search is not recursive.

    Args:
        directory: Directory to scan (str or path-like).

    Returns:
        A sorted list of file paths as strings. The list is empty when the
        path does not exist or is not a directory.
    """
    directory = Path(directory)
    # is_dir() is already False for a missing path, so one check covers both.
    if not directory.is_dir():
        return []
    return sorted(
        str(entry)
        for entry in directory.iterdir()
        if entry.is_file() and entry.suffix.lower() in ALLOWED_EXTENSIONS
    )
def clear_directory(directory):
    """Delete the contents of a directory while keeping the directory itself.

    Regular files and symlinks are unlinked, real sub-directories are removed
    recursively, and any other kind of entry is left in place.

    Args:
        directory: Directory to empty (str or path-like).

    Returns:
        ``(True, "")`` when the path does not exist or the loop finished
        without error, otherwise ``(False, message)``. The failure message is
        also printed.
    """
    target = Path(directory)
    if not target.exists():
        # A missing directory has nothing to clear, which counts as success.
        return True, ""

    try:
        for entry in target.iterdir():
            if entry.is_symlink() or entry.is_file():
                # Symlinks are unlinked rather than followed, even when they
                # point at a directory.
                entry.unlink()
                continue
            if entry.is_dir():
                shutil.rmtree(entry)
    except Exception as exc:
        failure = f"Failed to clear directory {target}. Reason: {str(exc)}"
        print(failure)
        return False, failure
    return True, ""
def extract_photos_from_pdf(file_pdf):
    """Render every page of a PDF to a PNG image and bundle the pages in a zip.

    Both working directories are emptied first. The PDF is then rendered in
    batches of pages; each page is saved as ``<page number>.png`` in
    ``DIRECTORY``, and all pages are zipped to ``DIRECTORY_OUTPUT``.

    Args:
        file_pdf: The uploaded PDF. May be ``None`` (nothing uploaded), a
            filesystem path (``str`` or path-like), or an object whose
            ``name`` attribute holds the path.

    Returns:
        A ``(gallery_items, zip_path, status)`` tuple:
        ``gallery_items`` is a list of ``(image_path, file_name)`` tuples
        ordered by page number, ``zip_path`` is the archive path as a string
        (``None`` if zipping failed), and ``status`` is a human-readable
        message. On any error the result is ``([], None, error_message)``;
        this function does not raise.
    """
    # NOTE(review): the working directories are module-level and shared by
    # every call, so overlapping requests would overwrite each other's pages.
    if file_pdf is None:
        return [], None, "Error: No file uploaded"

    # Start from empty directories so pages of a previous run cannot leak
    # into this result.
    clear_success, clear_error = clear_directory(DIRECTORY)
    if not clear_success:
        return [], None, f"Error clearing directories: {clear_error}"
    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
    if not clear_success:
        return [], None, f"Error clearing output directory: {clear_error}"

    try:
        # Accept plain paths as well as upload objects. A pathlib.Path must be
        # handled here because its own `.name` is only the basename.
        if isinstance(file_pdf, (str, os.PathLike)):
            pdf_path = os.fspath(file_pdf)
        else:
            pdf_path = file_pdf.name

        try:
            info = pdfinfo_from_path(pdf_path)
            total_pages = info["Pages"]
        except Exception as e:
            error_details = traceback.format_exc()
            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

        # Rendering in small batches keeps only a few page images in memory.
        batch_size = 10
        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)
            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=start_page,
                    last_page=end_page,
                    dpi=150,  # trade-off between image quality and file size
                )
                # Files are named after their 1-based page number.
                for idx, image in enumerate(images, start=start_page):
                    image_path = DIRECTORY / f"{idx}.png"
                    image.save(str(image_path), 'PNG')
            except Exception as e:
                error_details = traceback.format_exc()
                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

        images_pdf_list = get_image_files(DIRECTORY)
        if not images_pdf_list:
            return [], None, "No images could be extracted from the PDF."

        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        try:
            # Sort by page number, not lexicographically ("10" after "9").
            sorted_names = sorted(image_names, key=lambda item: int(Path(item[1]).stem))
        except ValueError as e:
            # A non-numeric file name: keep the listing order instead.
            sorted_names = image_names
            print(f"Error sorting images: {e}")

        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)
        if not zip_success:
            # The pages are still usable in the gallery without the archive.
            return (
                sorted_names,
                None,
                f"Images extracted but zip creation failed: {zip_error}",
            )

        page_count = len(images_pdf_list)
        return (
            sorted_names,
            str(zip_path),
            f"Successfully extracted {page_count} page{'s' if page_count != 1 else ''} from PDF.",
        )
    except Exception as e:
        # Last-resort boundary: report the failure in the status box.
        error_details = traceback.format_exc()
        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"
# Build the Gradio UI: a title row, then one tab with the upload controls in
# the left column and the status / gallery / download outputs in the right.
# NOTE(review): the nesting below was reconstructed from a listing that had
# lost its indentation - confirm it against the deployed layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
# PDF Image Extractor
Upload a PDF file to extract all pages as images.
""")
    with gr.Tab("PDF Extractor"):
        with gr.Row():
            with gr.Column(scale=1):
                # Input: the PDF to process (upload restricted to .pdf).
                file_pdf = gr.File(
                    file_types=['.pdf'],
                    label="Upload PDF file"
                )
                with gr.Row():
                    btn = gr.Button("Extract Images", variant="primary")
                    clear_btn = gr.Button("Clear")
            with gr.Column():
                # Outputs, in the order the handlers return them below:
                # gallery, download_btn, status.
                status = gr.Textbox(
                    label="Status",
                    value="Upload a PDF and click 'Extract Images'",
                    visible=True
                )
                gallery = gr.Gallery(
                    label="Extracted Pages",
                    show_label=True,
                    elem_id="gallery",
                    columns=3,
                    object_fit="contain",
                    height="auto"
                )
                download_btn = gr.File(
                    label="Download All Images (ZIP)",
                    visible=True
                )
    # Event handlers
    # "Extract Images" runs the extraction; api_name also exposes it as the
    # "extract" API endpoint.
    btn.click(
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[gallery, download_btn, status],
        api_name="extract"
    )
    def clear_outputs():
        """Return empty values for the gallery and download, plus a reset status."""
        # Only the three output components are reset; the uploaded file and
        # the files on disk are not touched here.
        return [], None, "Cleared. Upload a PDF to begin."
    clear_btn.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[gallery, download_btn, status]
    )
    # Offer a sample PDF as a clickable example, but only when it is present
    # next to the app.
    example_path = "./examples/sample.pdf"
    if os.path.exists(example_path):
        gr.Examples(
            examples=[[example_path]],
            fn=extract_photos_from_pdf,
            inputs=[file_pdf],
            outputs=[gallery, download_btn, status],
            cache_examples=False
        )
# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()