extract-photos-from-pdf

Sleeping

App Files Files Community

extract-photos-from-pdf / app.py

Dejansimic

Update app.py

0c76214 verified 25 days ago

raw

history blame contribute delete

7.62 kB

	import os
	import gradio as gr
	from pdf2image import convert_from_path, pdfinfo_from_path
	import zipfile
	import shutil
	import tempfile
	from pathlib import Path
	import traceback

	def zip_folder(folder_path, output_path):
	    """Create a ZIP archive containing every file under a folder.

	    Files are stored under their path relative to ``folder_path``. If
	    ``output_path`` lies inside ``folder_path`` the archive itself is
	    skipped, so the ZIP never tries to include its own half-written file.

	    Args:
	        folder_path: Directory to archive; walked recursively.
	        output_path: Destination of the ZIP file (overwritten if present).

	    Returns:
	        A ``(success, error_message)`` tuple; ``error_message`` is ``""`` on
	        success. Failures are printed and reported through the tuple
	        instead of being raised.
	    """
	    archive_opened = False
	    try:
	        archive_abs = os.path.abspath(output_path)
	        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
	            archive_opened = True
	            for root, _, files in os.walk(folder_path):
	                for file_name in files:
	                    file_path = os.path.join(root, file_name)
	                    # The archive already exists on disk at this point; do
	                    # not add it to itself.
	                    if os.path.abspath(file_path) == archive_abs:
	                        continue
	                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
	        return True, ""
	    except Exception as e:
	        error_msg = f"Error creating zip file: {str(e)}"
	        print(error_msg)
	        # Best-effort cleanup: only remove the archive if this call created
	        # (and therefore truncated) it, so a broken partial ZIP is not left
	        # behind for a later download.
	        if archive_opened:
	            try:
	                os.remove(output_path)
	            except OSError:
	                pass
	        return False, error_msg

	# Scratch space lives under the system temp directory, modelled with pathlib.
	BASE_DIR = Path(tempfile.gettempdir()).joinpath("pdf_extractor")
	DIRECTORY = BASE_DIR.joinpath("image_reference")
	DIRECTORY_OUTPUT = BASE_DIR.joinpath("output")
	DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

	# File suffixes (lower-case, dot included) that are treated as images.
	ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']

	# Make sure every working directory exists as soon as the module is loaded.
	for directory in DIRECTORIES:
	    directory.mkdir(parents=True, exist_ok=True)

	def get_image_files(directory, extensions=None):
	    """Return the image files found directly inside a directory.

	    Only regular files are returned; a sub-directory whose name happens to
	    end in an image suffix is ignored. Suffix matching is case-insensitive
	    and the search is not recursive.

	    Args:
	        directory: Directory to scan (``str`` or ``Path``).
	        extensions: Optional iterable of suffixes such as ``'.png'``.
	            Defaults to the module-level ``ALLOWED_EXTENSIONS``.

	    Returns:
	        A sorted list of path strings, or ``[]`` when ``directory`` does not
	        exist or is not a directory.
	    """
	    directory = Path(directory)
	    if not directory.is_dir():
	        return []

	    if extensions is None:
	        extensions = ALLOWED_EXTENSIONS
	    allowed = {ext.lower() for ext in extensions}

	    # Sorting makes the result independent of filesystem enumeration order.
	    return sorted(
	        str(file_path)
	        for file_path in directory.glob('*')
	        if file_path.is_file() and file_path.suffix.lower() in allowed
	    )

	def clear_directory(directory):
	    """Empty a directory without removing the directory itself.

	    Files and symlinks are unlinked; sub-directories are removed
	    recursively. A directory that does not exist counts as already clear.

	    Args:
	        directory: Directory to empty (``str`` or ``Path``).

	    Returns:
	        ``(True, "")`` on success, otherwise ``(False, error_message)``.
	        Errors are printed and returned rather than raised.
	    """
	    target = Path(directory)

	    if target.exists():
	        try:
	            for entry in target.iterdir():
	                if entry.is_file() or entry.is_symlink():
	                    entry.unlink()
	                elif entry.is_dir():
	                    shutil.rmtree(entry)
	        except Exception as exc:
	            failure = f"Failed to clear directory {target}. Reason: {str(exc)}"
	            print(failure)
	            return False, failure

	    return True, ""

	def extract_photos_from_pdf(file_pdf):
	    """Render every page of a PDF as a PNG image and zip the results.

	    Both working directories are emptied first. The PDF is then converted
	    in batches of ``batch_size`` pages, each page saved as
	    ``<page number>.png`` in ``DIRECTORY``, and all pages are bundled into
	    ``all_photos.zip`` in ``DIRECTORY_OUTPUT``. Failures are reported
	    through the returned status text rather than raised.

	    Args:
	        file_pdf: The PDF to process: a path (``str`` or ``os.PathLike``),
	            an object whose ``name`` attribute is the path on disk, or
	            ``None`` when nothing was uploaded.

	    Returns:
	        A ``(gallery_items, zip_path, status)`` tuple where
	        ``gallery_items`` is a list of ``(image_path, file_name)`` tuples
	        ordered by page number, ``zip_path`` is the archive path as a
	        string (``None`` if unavailable) and ``status`` is a message for
	        the user.
	    """
	    # Check if file is provided
	    if file_pdf is None:
	        return [], None, "Error: No file uploaded"

	    # Clear directories for new extraction
	    clear_success, clear_error = clear_directory(DIRECTORY)
	    if not clear_success:
	        return [], None, f"Error clearing directories: {clear_error}"

	    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
	    if not clear_success:
	        return [], None, f"Error clearing output directory: {clear_error}"

	    try:
	        # Accept a plain path as well as a wrapper exposing ``.name``.
	        # A ``Path`` must be checked first because its own ``.name`` is only
	        # the final path component, not the full path.
	        if isinstance(file_pdf, (str, os.PathLike)):
	            pdf_path = os.fspath(file_pdf)
	        else:
	            pdf_path = file_pdf.name

	        try:
	            info = pdfinfo_from_path(pdf_path)
	            total_pages = info["Pages"]
	        except Exception as e:
	            error_details = traceback.format_exc()
	            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

	        # Pages converted per call to convert_from_path
	        batch_size = 10

	        # Process PDF in batches
	        for start_page in range(1, total_pages + 1, batch_size):
	            end_page = min(start_page + batch_size - 1, total_pages)

	            try:
	                images = convert_from_path(
	                    pdf_path,
	                    first_page=start_page,
	                    last_page=end_page,
	                    dpi=150  # Adjustable DPI for quality vs size
	                )

	                # Name each image after its 1-based page number so the
	                # files can be ordered numerically afterwards.
	                for idx, image in enumerate(images, start=start_page):
	                    image_path = DIRECTORY / f"{idx}.png"
	                    image.save(str(image_path), 'PNG')
	            except Exception as e:
	                error_details = traceback.format_exc()
	                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

	        # Get list of extracted images and sort them numerically
	        images_pdf_list = get_image_files(DIRECTORY)
	        if not images_pdf_list:
	            return [], None, "No images could be extracted from the PDF."

	        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
	        try:
	            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
	        except ValueError as e:
	            # A non-numeric file name was found; keep the listing order.
	            sorted_names = image_names
	            print(f"Error sorting images: {e}")

	        # Create zip file of all images
	        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
	        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)

	        if zip_success:
	            return (
	                sorted_names,
	                str(zip_path),
	                f"Successfully extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''} from PDF."
	            )
	        return (
	            sorted_names,
	            None,
	            f"Images extracted but zip creation failed: {zip_error}"
	        )

	    except Exception as e:
	        # Last-resort boundary so the caller always receives a status tuple.
	        error_details = traceback.format_exc()
	        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"

	# Build the Gradio Blocks UI: a PDF upload, Extract/Clear buttons, a status
	# box, a gallery of extracted pages and a ZIP download slot, then wire the
	# buttons to their handlers.
	# NOTE(review): this copy of the file has lost its indentation; the nesting
	# of the `with` layout containers below must be restored before it can run.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	with gr.Row():
	with gr.Column(scale=1):
	gr.Markdown("""
	# PDF Image Extractor
	Upload a PDF file to extract all pages as images.
	""")

	with gr.Tab("PDF Extractor"):
	with gr.Row():
	with gr.Column(scale=1):
	# Input: the upload widget only offers .pdf files.
	file_pdf = gr.File(
	file_types=['.pdf'],
	label="Upload PDF file"
	)

	with gr.Row():
	btn = gr.Button("Extract Images", variant="primary")
	clear_btn = gr.Button("Clear")

	with gr.Column():
	# Outputs: status message, page gallery and the ZIP archive.
	status = gr.Textbox(
	label="Status",
	value="Upload a PDF and click 'Extract Images'",
	visible=True
	)
	gallery = gr.Gallery(
	label="Extracted Pages",
	show_label=True,
	elem_id="gallery",
	columns=3,
	object_fit="contain",
	height="auto"
	)
	download_btn = gr.File(
	label="Download All Images (ZIP)",
	visible=True
	)

	# Event handlers
	# The three values returned by extract_photos_from_pdf fill the gallery, the
	# download file and the status box, in that order.
	btn.click(
	fn=extract_photos_from_pdf,
	inputs=[file_pdf],
	outputs=[gallery, download_btn, status],
	api_name="extract"
	)

	# Reset values for the gallery, the download file and the status box.
	def clear_outputs():
	return [], None, "Cleared. Upload a PDF to begin."

	clear_btn.click(
	fn=clear_outputs,
	inputs=[],
	outputs=[gallery, download_btn, status]
	)

	# Example for demonstration
	# The sample PDF is registered as an example input only if it exists on disk.
	example_path = "./examples/sample.pdf"
	if os.path.exists(example_path):
	gr.Examples(
	examples=[[example_path]],
	fn=extract_photos_from_pdf,
	inputs=[file_pdf],
	outputs=[gallery, download_btn, status],
	cache_examples=False
	)

	# Start the app only when this file is executed directly.
	if __name__ == "__main__":
	demo.launch()