# Dejansimic's picture
# Update app.py
# 0c76214 verified
import os
import gradio as gr
from pdf2image import convert_from_path, pdfinfo_from_path
import zipfile
import shutil
import tempfile
from pathlib import Path
import traceback
def zip_folder(folder_path, output_path):
    """Create a deflate-compressed zip archive of every file under a folder.

    Files are stored under their path relative to ``folder_path`` and are
    added in sorted order, so the archive layout is reproducible. If the
    archive itself is located inside ``folder_path`` it is skipped, so the
    zip never tries to include its own partially written file.

    Args:
        folder_path: Directory whose files are archived (walked recursively).
        output_path: Location of the zip file to write.

    Returns:
        ``(True, "")`` on success, or ``(False, error_message)`` when any
        exception is raised while building the archive. The error message is
        also printed.
    """
    try:
        # Resolve once up front; compared against every walked file below.
        archive_real = os.path.realpath(output_path)
        with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for root, dirs, files in os.walk(folder_path):
                # Sorting in place makes os.walk descend in a stable order.
                dirs.sort()
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    if os.path.realpath(file_path) == archive_real:
                        # Do not add the archive being written to itself.
                        continue
                    zipf.write(file_path, os.path.relpath(file_path, folder_path))
        return True, ""
    except Exception as e:
        error_msg = f"Error creating zip file: {str(e)}"
        print(error_msg)
        return False, error_msg
# Working directories are placed under the system temp directory.
BASE_DIR = Path(tempfile.gettempdir()).joinpath("pdf_extractor")
DIRECTORY = BASE_DIR.joinpath("image_reference")  # rendered page images
DIRECTORY_OUTPUT = BASE_DIR.joinpath("output")  # generated zip archive
DIRECTORIES = [DIRECTORY, DIRECTORY_OUTPUT]

# Create both working directories (and missing parents) at import time;
# directories that already exist are left as they are.
for directory in DIRECTORIES:
    directory.mkdir(parents=True, exist_ok=True)

# File suffixes (lower-case, leading dot) that count as images.
ALLOWED_EXTENSIONS = ['.jpg', '.jpeg', '.png', '.gif']
def get_image_files(directory):
    """Return the paths of image files located directly inside a directory.

    Only the top level of ``directory`` is inspected. An entry is kept when
    its lower-cased suffix is listed in ``ALLOWED_EXTENSIONS``. Paths are
    returned as strings in the order the directory listing yields them.

    Args:
        directory: Directory to scan (string or path object).

    Returns:
        A list of path strings; empty when ``directory`` does not exist or
        is not a directory.
    """
    folder = Path(directory)
    if not (folder.exists() and folder.is_dir()):
        return []
    return [
        str(entry)
        for entry in folder.glob('*')
        if entry.suffix.lower() in ALLOWED_EXTENSIONS
    ]
def clear_directory(directory):
    """Delete everything inside a directory while keeping the directory.

    Regular files and symlinks are unlinked; sub-directories are removed
    recursively. A directory that does not exist counts as already clear.

    Args:
        directory: Directory to empty (string or path object).

    Returns:
        ``(True, "")`` when the directory is missing or was emptied, or
        ``(False, error_message)`` when an exception interrupted the
        clean-up. The error message is also printed.
    """
    target = Path(directory)
    if not target.exists():
        return True, ""

    try:
        for entry in target.iterdir():
            # Symlinks are unlinked rather than followed, even when they
            # point at a directory.
            if entry.is_file() or entry.is_symlink():
                entry.unlink()
            elif entry.is_dir():
                shutil.rmtree(entry)
    except Exception as exc:
        error_msg = f"Failed to clear directory {target}. Reason: {str(exc)}"
        print(error_msg)
        return False, error_msg
    else:
        return True, ""
def extract_photos_from_pdf(file_pdf):
    """Render every page of a PDF to a PNG image and bundle the images in a zip.

    The page images are written to ``DIRECTORY`` as ``<page number>.png`` and
    the archive to ``DIRECTORY_OUTPUT / "all_photos.zip"``. Both directories
    are emptied at the start of each call.

    NOTE(review): ``DIRECTORY`` and ``DIRECTORY_OUTPUT`` are module-level
    paths shared by every call, so overlapping calls would clear each other's
    files — confirm the app is only used by one request at a time.

    Args:
        file_pdf: The PDF to process. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``name`` attribute holds the
            path. ``None`` means no file was provided.

    Returns:
        A 3-tuple ``(gallery_items, zip_path, status_message)``.
        ``gallery_items`` is a list of ``(image_path, file_name)`` tuples
        ordered by page number, or an empty list on failure. ``zip_path`` is
        the archive path as a string, or ``None`` when no archive was
        produced. ``status_message`` describes the outcome; conversion
        failures are reported there instead of being raised.
    """
    if file_pdf is None:
        return [], None, "Error: No file uploaded"

    # Start from empty working directories so files from an earlier call
    # cannot end up in this call's gallery or archive.
    clear_success, clear_error = clear_directory(DIRECTORY)
    if not clear_success:
        return [], None, f"Error clearing directories: {clear_error}"
    clear_success, clear_error = clear_directory(DIRECTORY_OUTPUT)
    if not clear_success:
        return [], None, f"Error clearing output directory: {clear_error}"

    try:
        # Plain paths are used directly. Checking for them first matters for
        # pathlib.Path, whose ``name`` attribute is only the basename.
        if isinstance(file_pdf, (str, os.PathLike)):
            pdf_path = os.fspath(file_pdf)
        else:
            pdf_path = file_pdf.name

        try:
            info = pdfinfo_from_path(pdf_path)
            total_pages = info["Pages"]
        except Exception as e:
            error_details = traceback.format_exc()
            return [], None, f"Error reading PDF: {str(e)}\n\nDetails: {error_details}"

        # Convert in batches so that at most ``batch_size`` rendered pages
        # are held in the ``images`` list at any one time.
        batch_size = 10
        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)
            try:
                images = convert_from_path(
                    pdf_path,
                    first_page=start_page,
                    last_page=end_page,
                    dpi=150  # Adjustable DPI for quality vs size
                )
                # Name each image after its 1-based page number.
                for idx, image in enumerate(images, start=start_page):
                    image_path = DIRECTORY / f"{idx}.png"
                    image.save(str(image_path), 'PNG')
            except Exception as e:
                error_details = traceback.format_exc()
                return [], None, f"Error converting PDF pages {start_page}-{end_page}: {str(e)}\n\nDetails: {error_details}"

        images_pdf_list = get_image_files(DIRECTORY)
        if not images_pdf_list:
            return [], None, "No images could be extracted from the PDF."

        image_names = [(path, os.path.basename(path)) for path in images_pdf_list]
        try:
            # File stems are page numbers; sort numerically so that page 10
            # comes after page 9 rather than after page 1.
            sorted_names = sorted(image_names, key=lambda x: int(Path(x[1]).stem))
        except ValueError as e:
            # A non-numeric file name: keep the listing order instead.
            sorted_names = image_names
            print(f"Error sorting images: {e}")

        zip_path = DIRECTORY_OUTPUT / "all_photos.zip"
        zip_success, zip_error = zip_folder(DIRECTORY, zip_path)
        if not zip_success:
            # The images are still usable even though the archive is missing.
            return (
                sorted_names,
                None,
                f"Images extracted but zip creation failed: {zip_error}"
            )

        return (
            sorted_names,
            str(zip_path),
            f"Successfully extracted {len(images_pdf_list)} page{'s' if len(images_pdf_list) != 1 else ''} from PDF."
        )
    except Exception as e:
        # Last-resort boundary: report the failure through the status message.
        error_details = traceback.format_exc()
        return [], None, f"Unexpected error: {str(e)}\n\nDetails: {error_details}"
# Gradio UI: a title banner, a "PDF Extractor" tab holding the upload
# controls and the result widgets, the event wiring, and the launch guard.
# NOTE(review): the nesting of the layout containers below was reconstructed
# by hand — confirm the Row/Column/Tab hierarchy against the running app.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Title and short usage hint.
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
# PDF Image Extractor
Upload a PDF file to extract all pages as images.
""")
    with gr.Tab("PDF Extractor"):
        with gr.Row():
            # Input side: PDF upload plus the two action buttons.
            with gr.Column(scale=1):
                file_pdf = gr.File(
                    file_types=['.pdf'],
                    label="Upload PDF file"
                )
                with gr.Row():
                    btn = gr.Button("Extract Images", variant="primary")
                    clear_btn = gr.Button("Clear")
            # Output side: status text, page gallery and zip download.
            with gr.Column():
                status = gr.Textbox(
                    label="Status",
                    value="Upload a PDF and click 'Extract Images'",
                    visible=True
                )
                gallery = gr.Gallery(
                    label="Extracted Pages",
                    show_label=True,
                    elem_id="gallery",
                    columns=3,
                    object_fit="contain",
                    height="auto"
                )
                download_btn = gr.File(
                    label="Download All Images (ZIP)",
                    visible=True
                )
    # Event handlers
    # The three outputs line up with the (gallery items, zip path, status
    # message) tuple returned by extract_photos_from_pdf.
    btn.click(
        fn=extract_photos_from_pdf,
        inputs=[file_pdf],
        outputs=[gallery, download_btn, status],
        api_name="extract"
    )
    def clear_outputs():
        """Return empty values for the gallery and download, plus a status hint."""
        return [], None, "Cleared. Upload a PDF to begin."
    # Resets the three output widgets; the uploaded file input is not among
    # the outputs, so it is left as it is.
    clear_btn.click(
        fn=clear_outputs,
        inputs=[],
        outputs=[gallery, download_btn, status]
    )
    # Example for demonstration
    # Registered only when the sample PDF is present on disk.
    example_path = "./examples/sample.pdf"
    if os.path.exists(example_path):
        gr.Examples(
            examples=[[example_path]],
            fn=extract_photos_from_pdf,
            inputs=[file_pdf],
            outputs=[gallery, download_btn, status],
            cache_examples=False
        )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()