Spaces:

milwright
/

historical-ocr

Running

historical-ocr / test_pdf_preview.py

submit pull for merge (#1)

f475c01 verified 8 days ago

1.58 kB

	#!/usr/bin/env python3
	"""
	Test PDF preview functionality
	"""
	import os
	import io
	from pathlib import Path
	from pdf2image import convert_from_path
	from PIL import Image

	def test_pdf_preview():
	    """Render the first page of one input PDF as a JPEG preview.

	    Searches the ``input`` directory next to this script for ``*.pdf``
	    files and uses the first one in sorted path order, so repeated runs
	    pick the same file. The first page is converted with ``pdf2image``,
	    encoded as JPEG in memory, and written to
	    ``output/<pdf stem>_preview.jpg`` next to this script.

	    Progress and failures are reported with ``print``. The function always
	    returns ``None``; exceptions raised during conversion or saving are
	    caught and printed rather than re-raised.
	    """
	    base_dir = Path(__file__).parent

	    # Path.glob yields matches in arbitrary filesystem order; sorting makes
	    # the "first" PDF the same on every run.
	    pdf_files = sorted((base_dir / "input").glob("*.pdf"))

	    if not pdf_files:
	        print("No PDF files found in the input directory")
	        return

	    pdf_path = pdf_files[0]
	    print(f"Testing PDF preview with file: {pdf_path}")

	    try:
	        # Only page 1 is requested; the remaining pages are not needed
	        # for a preview.
	        images = convert_from_path(pdf_path, first_page=1, last_page=1)

	        if not images:
	            print("No images extracted from PDF")
	            return

	        first_page = images[0]

	        # Encode to JPEG once, in memory; the same bytes are written to
	        # disk below instead of encoding the page a second time.
	        img_bytes = io.BytesIO()
	        first_page.save(img_bytes, format='JPEG')
	        jpeg_data = img_bytes.getvalue()

	        # Save the preview image
	        output_dir = base_dir / "output"
	        output_dir.mkdir(exist_ok=True)
	        output_path = output_dir / f"{pdf_path.stem}_preview.jpg"
	        output_path.write_bytes(jpeg_data)
	        print(f"PDF preview saved to: {output_path}")

	        # Rewind the buffer so it could be handed to a reader from the
	        # start (the original comment names Streamlit as the consumer).
	        img_bytes.seek(0)
	        print(f"Successfully converted PDF to image bytes (size: {len(jpeg_data)} bytes)")

	    except Exception as e:
	        # Script-level boundary: report the failure instead of crashing.
	        print(f"Error converting PDF to image: {e}")

	# Run the preview check only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    test_pdf_preview()