#!/usr/bin/env python3
"""
Test script for pdf_ocr.py
"""

import json
import os
from itertools import islice

from pdf_ocr import PDFOCR

# Defaults used when main() is called without arguments.
DEFAULT_PDF_PATH = "input/rubric.pdf"
DEFAULT_OUTPUT_PATH = "output/rubric_test.json"

# Preview limits: how many OCR keys to show, and how many characters of a
# string value to print before truncating it with "...".
PREVIEW_KEY_LIMIT = 3
PREVIEW_TEXT_LIMIT = 100


def _format_preview_line(key, value):
    """Return one preview line for *key*, truncating long string values."""
    if isinstance(value, str) and len(value) > PREVIEW_TEXT_LIMIT:
        return f" {key}: {value[:PREVIEW_TEXT_LIMIT]}..."
    return f" {key}: {value}"


def main(pdf_path: str = DEFAULT_PDF_PATH,
         output_path: str = DEFAULT_OUTPUT_PATH) -> None:
    """Run PDFOCR on a PDF, save the JSON output, and print a short preview.

    Args:
        pdf_path: Path of the PDF handed to ``PDFOCR.save_json_output``.
        output_path: Path handed to ``PDFOCR.save_json_output`` and then
            read back as JSON for the preview.
    """
    # Initialize PDF processor
    processor = PDFOCR()

    # Create output directory if it doesn't exist. A bare file name has an
    # empty dirname, and os.makedirs("") raises FileNotFoundError.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process PDF and save output
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read and print the output
    # NOTE(review): this relies on the platform default text encoding;
    # confirm it matches how save_json_output writes the file.
    with open(output_path, 'r') as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")

    ocr_contents = result.get('ocr_contents', {})
    if isinstance(ocr_contents, dict):
        # Only show the first few keys
        for key, value in islice(ocr_contents.items(), PREVIEW_KEY_LIMIT):
            print(_format_preview_line(key, value))
    else:
        # A null or non-object value has no keys to iterate; show it as a
        # single (possibly truncated) line instead of failing on .items().
        print(_format_preview_line('ocr_contents', ocr_contents))


if __name__ == "__main__":
    main()