# historical-ocr / test_pdf.py
# milwright's picture
# Upload historical-ocr v1.1
# e99f9b5 verified
#!/usr/bin/env python3
"""
Test script for pdf_ocr.py
"""
from pdf_ocr import PDFOCR
import json
import os
def _truncate(value, limit):
    """Return *value* shortened for display.

    Strings longer than *limit* characters are cut to *limit* characters and
    suffixed with "..."; every other value is returned unchanged.
    """
    if isinstance(value, str) and len(value) > limit:
        return f"{value[:limit]}..."
    return value


def main(pdf_path="input/rubric.pdf", output_path="output/rubric_test.json",
         processor=None, preview_keys=3, preview_chars=100):
    """Run the PDF processor on one file and print a preview of its JSON output.

    Args:
        pdf_path: Path of the PDF handed to the processor.
        output_path: Path the processor is asked to write its JSON result to.
        processor: Object exposing ``save_json_output(pdf_path, output_path)``.
            When omitted, a fresh ``PDFOCR()`` is created.
        preview_keys: Maximum number of ``ocr_contents`` entries to print.
        preview_chars: String values longer than this are truncated in the
            preview.
    """
    # Initialize PDF processor (lazily, so callers can supply their own).
    if processor is None:
        processor = PDFOCR()

    # Create output directory if it doesn't exist. dirname() is "" for a bare
    # file name, and os.makedirs("") raises, so only create when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process PDF and save output
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back.
    # NOTE(review): the file is opened with the platform default encoding, as
    # before; confirm which encoding save_json_output writes with.
    with open(output_path, 'r') as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")

    # `or {}` also covers an explicit null in the JSON, which .get()'s default
    # would not replace.
    ocr_contents = result.get('ocr_contents') or {}
    if isinstance(ocr_contents, dict):
        for index, (key, value) in enumerate(ocr_contents.items()):
            if index >= preview_keys:  # Only show the first few keys
                break
            print(f" {key}: {_truncate(value, preview_chars)}")
    else:
        # Not a mapping (e.g. a plain string or list): show it as a single entry.
        print(f" {_truncate(ocr_contents, preview_chars)}")
# Run the test script only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()