#!/usr/bin/env python3
"""
Simple test script for structured_ocr.py

Runs StructuredOCR on one sample image, writes the result as JSON into the
output directory and prints a short preview of the saved file.
"""

import os
import sys
import json
from pathlib import Path


def _sanitize_for_json(obj):
    """Fallback serializer for objects that ``json`` cannot encode natively.

    Tried in order: the object's own ``to_dict()`` method, its instance
    ``__dict__``, and finally its ``str()`` representation.
    """
    if hasattr(obj, 'to_dict'):
        return obj.to_dict()
    if hasattr(obj, '__dict__'):
        return obj.__dict__
    return str(obj)


def main():
    """Process the sample image and report on the JSON output.

    Returns:
        int: 0 when the image was processed and the output file exists;
        1 when the input image is missing, processing raised an exception,
        or the output file was not found afterwards.
    """
    print("Testing OCR with a sample image file")

    # Path to the sample image file
    image_path = os.path.join("input", "magician-satire.jpg")

    # Bail out before importing or initializing anything if there is no input
    if not os.path.isfile(image_path):
        print(f"Error: Image file not found at {image_path}")
        return 1

    print(f"File found: {image_path}")

    # Create the output directory if it doesn't exist
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "magician_test.json")

    # Imported here rather than at module level so the missing-input check
    # above runs without needing the OCR module
    from structured_ocr import StructuredOCR

    # Initialize OCR processor
    processor = StructuredOCR()

    try:
        # Process the image file
        print(f"Processing image file: {image_path}")
        result = processor.process_file(image_path, file_type="image")

        # Save the result; anything json cannot encode natively goes through
        # the fallback serializer
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, default=_sanitize_for_json)

        print(f"Image processing completed successfully. Output saved to {output_path}")

        # Check if the output file exists
        if not os.path.isfile(output_path):
            print(f"Error: Output file not found at {output_path}")
            return 1

        print(f"Output file exists at {output_path}")

        # Print the file size
        file_size = os.path.getsize(output_path)
        print(f"Output file size: {file_size} bytes")

        # Print a preview of the output file
        print("\nPreview of output file:")
        with open(output_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fields may be absent, null, or hold non-string items; coerce to
        # str so the preview itself cannot raise a TypeError
        topics = ', '.join(map(str, data.get('topics') or []))
        languages = ', '.join(map(str, data.get('languages') or []))
        print(f"File name: {data.get('file_name', '')}")
        print(f"Topics: {topics}")
        print(f"Languages: {languages}")
        print("OCR contents keys:", list((data.get('ocr_contents') or {}).keys()))
    except Exception as e:
        # Top-level boundary of a test script: report the error and signal
        # failure through the exit status instead of a traceback
        print(f"Error processing image: {e}")
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())