Spaces:

milwright
/

historical-ocr

Running

App Files Files Community

historical-ocr / pdf_ocr.py

milwright

Upload historical-ocr v1.1

e99f9b5 verified 11 days ago

raw

history blame contribute delete

2.41 kB

	#!/usr/bin/env python3
	"""
	PDFOCR - Module for processing PDF files with OCR and extracting structured data.
	"""

	import json
	from pathlib import Path
	from structured_ocr import StructuredOCR

	class PDFOCR:
	    """Process PDF files with OCR and extract structured data.

	    The OCR work itself is delegated to a ``StructuredOCR`` instance stored
	    in ``self.processor``; this class adds input-path validation and a JSON
	    export helper on top of it.
	    """

	    def __init__(self, api_key=None):
	        """Create the underlying ``StructuredOCR`` processor.

	        Args:
	            api_key: Forwarded unchanged to ``StructuredOCR``. Defaults to
	                None.
	        """
	        self.processor = StructuredOCR(api_key=api_key)

	    def process_pdf(self, pdf_path, use_vision: bool = True):
	        """Process a PDF file with OCR and extract structured data.

	        Args:
	            pdf_path: Path to the PDF file (anything ``pathlib.Path``
	                accepts).
	            use_vision: Whether to use the vision model for improved
	                analysis; forwarded to the processor.

	        Returns:
	            The value returned by ``self.processor.process_file`` (described
	            by the original author as a dictionary of structured OCR
	            results).

	        Raises:
	            FileNotFoundError: If ``pdf_path`` does not point to an existing
	                regular file (missing path or a directory).
	        """
	        pdf_path = Path(pdf_path)
	        # is_file() instead of exists(): a directory "exists" but is not a
	        # PDF, so reject it here rather than handing it to the OCR backend.
	        if not pdf_path.is_file():
	            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

	        return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)

	    def save_json_output(self, pdf_path, output_path, use_vision: bool = True) -> Path:
	        """Process a PDF file and save the structured output as JSON.

	        Args:
	            pdf_path: Path to the PDF file.
	            output_path: Path where the JSON output is written. Missing
	                parent directories are created.
	            use_vision: Whether to use the vision model for improved
	                analysis.

	        Returns:
	            ``output_path`` as a ``pathlib.Path``.

	        Raises:
	            FileNotFoundError: If ``pdf_path`` is not an existing file.
	            TypeError: If the OCR result is not JSON-serializable; in that
	                case nothing is written to disk.
	        """
	        result = self.process_pdf(pdf_path, use_vision=use_vision)

	        # Serialize before touching the filesystem so a non-serializable
	        # result cannot leave a truncated JSON file behind.
	        serialized = json.dumps(result, indent=2)

	        output_path = Path(output_path)
	        output_path.parent.mkdir(parents=True, exist_ok=True)
	        # Explicit encoding keeps the output independent of the platform's
	        # default locale encoding.
	        output_path.write_text(serialized, encoding="utf-8")

	        return output_path

	# Command-line entry point for quick manual testing:
	#   python pdf_ocr.py <pdf_path> [output_path]
	if __name__ == "__main__":
	    import sys

	    # Positional arguments only; the script name is dropped.
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
	        sys.exit(1)

	    pdf_path = cli_args[0]
	    output_path = cli_args[1] if len(cli_args) > 1 else None

	    processor = PDFOCR()

	    if not output_path:
	        # No destination given: dump the structured result to stdout.
	        result = processor.process_pdf(pdf_path)
	        print(json.dumps(result, indent=2))
	    else:
	        # Destination given: write JSON there and report where it went.
	        result_path = processor.save_json_output(pdf_path, output_path)
	        print(f"Results saved to: {result_path}")