historical-ocr / pdf_ocr.py
milwright's picture
Upload historical-ocr v1.1
e99f9b5 verified
#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""
import json
from pathlib import Path
from structured_ocr import StructuredOCR
class PDFOCR:
    """Process PDF files with OCR and extract structured data.

    Thin wrapper around ``StructuredOCR`` that always requests the
    ``"pdf"`` file type and adds a helper for writing the result to a
    JSON file.
    """

    def __init__(self, api_key=None):
        """Initialize the PDF OCR processor.

        Args:
            api_key: Optional API key, passed through unchanged to
                ``StructuredOCR``.
        """
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path, use_vision=True):
        """Process a PDF file with OCR and extract structured data.

        Args:
            pdf_path: Path to the PDF file (anything ``Path`` accepts).
            use_vision: Whether to use vision model for improved analysis;
                forwarded unchanged to the underlying processor.

        Returns:
            The value returned by ``self.processor.process_file`` for the
            given PDF (a dictionary with structured OCR results).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist or is not a
                regular file (for example, a directory).
        """
        pdf_path = Path(pdf_path)
        # is_file() rather than exists(): a directory also "exists", and
        # would otherwise be passed to the OCR backend as if it were a PDF.
        if not pdf_path.is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        return self.processor.process_file(
            pdf_path, file_type="pdf", use_vision=use_vision
        )

    def save_json_output(self, pdf_path, output_path, use_vision=True):
        """Process a PDF file and save the structured output as JSON.

        Missing parent directories of ``output_path`` are created. An
        existing file at ``output_path`` is overwritten.

        Args:
            pdf_path: Path to the PDF file.
            output_path: Path where to save the JSON output.
            use_vision: Whether to use vision model for improved analysis.

        Returns:
            ``Path`` to the saved JSON file.

        Raises:
            FileNotFoundError: If ``pdf_path`` is not an existing file
                (raised by ``process_pdf`` before anything is written).
        """
        # Run OCR first so a failure leaves no output directory/file behind.
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the write does not depend on the platform's
        # locale default. json.dump keeps its default ensure_ascii=True, so
        # the bytes written are the same as before.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        return output_path
# Command-line entry point for quick manual testing:
#   python pdf_ocr.py <pdf_path> [output_path]
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        sys.exit(1)

    source_pdf = cli_args[0]
    # A missing (or empty) second argument means "print the JSON to stdout".
    destination = cli_args[1] if len(cli_args) > 1 else None

    ocr = PDFOCR()
    if not destination:
        print(json.dumps(ocr.process_pdf(source_pdf), indent=2))
    else:
        saved_to = ocr.save_json_output(source_pdf, destination)
        print(f"Results saved to: {saved_to}")