Spaces:
Running
Running
#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""
import json | |
from pathlib import Path | |
from structured_ocr import StructuredOCR | |
class PDFOCR:
    """Class for processing PDF files with OCR and extracting structured data.

    Thin wrapper around ``StructuredOCR``: it validates the input path,
    delegates the actual work to ``StructuredOCR.process_file`` and can
    persist the result as a JSON file.
    """

    def __init__(self, api_key=None):
        """Initialize the PDF OCR processor.

        Args:
            api_key: Passed through unchanged to ``StructuredOCR``.
        """
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path, use_vision: bool = True):
        """Process a PDF file with OCR and extract structured data.

        Args:
            pdf_path: Path to the PDF file (string or ``Path``).
            use_vision: Whether to use vision model for improved analysis.

        Returns:
            Whatever ``StructuredOCR.process_file`` returns for the file
            (expected to be a dictionary with structured OCR results).

        Raises:
            FileNotFoundError: If ``pdf_path`` does not point to an existing
                regular file (this includes directories).
        """
        pdf_path = Path(pdf_path)
        # is_file() instead of exists(): a directory "exists" but is not a
        # PDF, so reject it here rather than failing inside the OCR backend.
        if not pdf_path.is_file():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        return self.processor.process_file(
            pdf_path, file_type="pdf", use_vision=use_vision
        )

    def save_json_output(self, pdf_path, output_path, use_vision: bool = True) -> Path:
        """Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file.
            output_path: Path where to save the JSON output. Missing parent
                directories are created.
            use_vision: Whether to use vision model for improved analysis.

        Returns:
            Path to the saved JSON file.

        Raises:
            FileNotFoundError: If ``pdf_path`` is not an existing file.
            TypeError: If the OCR result is not JSON-serializable.
        """
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        # Serialize before touching the filesystem so that a serialization
        # failure leaves neither a truncated file nor a stray directory.
        payload = json.dumps(result, indent=2)

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the output independent of the locale.
        output_path.write_text(payload, encoding="utf-8")
        return output_path
# Command-line entry point, intended for quick manual testing
if __name__ == "__main__":
    import sys

    # Positional arguments only: <pdf_path> and an optional [output_path].
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        sys.exit(1)

    pdf_path = cli_args[0]
    output_path = cli_args[1] if len(cli_args) > 1 else None
    processor = PDFOCR()

    if not output_path:
        # No destination given: dump the structured result to stdout.
        result = processor.process_pdf(pdf_path)
        print(json.dumps(result, indent=2))
    else:
        result_path = processor.save_json_output(pdf_path, output_path)
        print(f"Results saved to: {result_path}")