Spaces:

milwright
/

historical-ocr

Running

historical-ocr / test_pdf.py

Upload historical-ocr v1.1

e99f9b5 verified 11 days ago

1.26 kB

	#!/usr/bin/env python3
	"""
	Test script for pdf_ocr.py
	"""

	from pdf_ocr import PDFOCR
	import json
	import os

	def main():
	    """Run the OCR pipeline on the sample rubric PDF and print a preview.

	    Processes ``input/rubric.pdf`` through ``PDFOCR.save_json_output``,
	    writing the result to ``output/rubric_test.json`` (the output directory
	    is created if it does not exist). The JSON file is then read back and
	    its ``file_name``, ``topics`` and ``languages`` fields are printed,
	    followed by the first three entries of ``ocr_contents``.
	    """
	    # Initialize PDF processor
	    processor = PDFOCR()

	    # Define input and output paths
	    pdf_path = "input/rubric.pdf"
	    output_path = "output/rubric_test.json"

	    # Create output directory if it doesn't exist
	    os.makedirs(os.path.dirname(output_path), exist_ok=True)

	    # Process PDF and save output
	    print(f"Processing PDF: {pdf_path}")
	    processor.save_json_output(pdf_path, output_path)
	    print(f"Output saved to: {output_path}")

	    # Read the output back from disk so the preview reflects what was
	    # actually written, not an in-memory result.
	    with open(output_path, 'r') as f:
	        result = json.load(f)

	    print("\nOutput preview:")
	    print(f"File name: {result.get('file_name')}")
	    print(f"Topics: {result.get('topics')}")
	    print(f"Languages: {result.get('languages')}")
	    print("OCR contents preview (first few keys):")
	    ocr_contents = result.get('ocr_contents', {})
	    for i, (key, value) in enumerate(ocr_contents.items()):
	        if i >= 3:  # Only show first 3 keys
	            break
	        # Long string values are cut to 100 characters to keep the preview
	        # short; any other value is printed in full.
	        if isinstance(value, str) and len(value) > 100:
	            print(f" {key}: {value[:100]}...")
	        else:
	            print(f" {key}: {value}")

	# Run the test only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()