Spaces:
Running
Running
#!/usr/bin/env python3
"""
Test script for pdf_ocr.py.

Runs PDFOCR on a sample PDF, saves the JSON output, and prints a short
preview of the saved result.
"""
import json
import os
from itertools import islice

from pdf_ocr import PDFOCR
def main(
    pdf_path="input/rubric.pdf",
    output_path="output/rubric_test.json",
    processor=None,
):
    """Process a PDF with PDFOCR, save the JSON result, and print a preview.

    Args:
        pdf_path: Path of the PDF handed to the processor.
        output_path: Path the JSON output is written to. Its parent
            directory is created when it does not exist yet.
        processor: Object exposing ``save_json_output(pdf_path, output_path)``.
            When omitted, a fresh ``PDFOCR()`` is created.

    Raises:
        OSError: If the output directory cannot be created or the saved
            file cannot be read back.
        json.JSONDecodeError: If the saved file is not valid JSON.
    """
    # Initialize PDF processor
    if processor is None:
        processor = PDFOCR()

    # Create output directory if it doesn't exist. os.makedirs("") raises,
    # so skip the call when output_path has no directory component.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process PDF and save output
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back. The encoding is pinned so the result does not
    # depend on the platform's locale default.
    # NOTE(review): assumes the processor writes UTF-8 (or plain ASCII) JSON.
    with open(output_path, "r", encoding="utf-8") as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")

    # `or {}` also covers a key that is present but null in the JSON.
    ocr_contents = result.get('ocr_contents') or {}
    # Only show the first 3 keys; long string values are cut at 100 chars.
    for key, value in islice(ocr_contents.items(), 3):
        if isinstance(value, str) and len(value) > 100:
            print(f" {key}: {value[:100]}...")
        else:
            print(f" {key}: {value}")
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()