# historical-ocr / test_pdf_preview.py
# milwright: submit pull for merge (#1)
# f475c01 verified
#!/usr/bin/env python3
"""
Test PDF preview functionality
"""
import os
import io
from pathlib import Path
from pdf2image import convert_from_path
from PIL import Image
def test_pdf_preview():
    """Smoke-test rendering the first page of a PDF as a JPEG preview.

    Looks for ``*.pdf`` files in the ``input`` directory next to this file,
    picks the first one in sorted order, asks ``convert_from_path`` for
    page 1 only, and writes the result to ``output/<pdf stem>_preview.jpg``
    (the ``output`` directory is created next to this file if missing).
    The encoded JPEG is also kept in an in-memory buffer to demonstrate
    how the preview bytes could be handed to Streamlit.

    Progress and failures are reported with ``print``. The function returns
    ``None`` in every case; exceptions raised during conversion or saving
    are caught and reported rather than propagated.
    """
    base_dir = Path(__file__).parent

    # glob() yields entries in an arbitrary, filesystem-dependent order;
    # sorting makes the selected file the same on every run.
    pdf_files = sorted((base_dir / "input").glob("*.pdf"))
    if not pdf_files:
        print("No PDF files found in the input directory")
        return

    pdf_path = pdf_files[0]
    print(f"Testing PDF preview with file: {pdf_path}")

    try:
        # Only page 1 is requested (first_page == last_page == 1).
        images = convert_from_path(pdf_path, first_page=1, last_page=1)
        if not images:
            print("No images extracted from PDF")
            return
        first_page = images[0]

        # Encode to JPEG exactly once; the same bytes are written to disk
        # and reported below, so the file and the in-memory preview match.
        img_bytes = io.BytesIO()
        first_page.save(img_bytes, format='JPEG')
        jpeg_data = img_bytes.getvalue()
        # Rewind so the buffer is ready to be read from the start.
        img_bytes.seek(0)

        # Save the preview image
        output_dir = base_dir / "output"
        output_dir.mkdir(exist_ok=True)
        output_path = output_dir / f"{pdf_path.stem}_preview.jpg"
        output_path.write_bytes(jpeg_data)
        print(f"PDF preview saved to: {output_path}")

        print(f"Successfully converted PDF to image bytes (size: {len(jpeg_data)} bytes)")
    except Exception as e:
        # Best-effort script: report the problem instead of crashing.
        print(f"Error converting PDF to image: {e}")
if __name__ == "__main__":
    # Run the preview check only when this file is executed as a script.
    test_pdf_preview()