milwright commited on
Commit
e99f9b5
·
verified ·
1 Parent(s): 131b3a2

Upload historical-ocr v1.1

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
39
+ input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,63 @@
1
  ---
2
- title: Historical Ocr
3
- emoji: 🏆
4
- colorFrom: green
5
- colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: Uses Minstral OCR to parse historical documents
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Historical OCR
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Employs Mistral OCR for transcribing historical data
12
  ---
13
 
14
+ # Historical Document OCR
15
+
16
+ This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
17
+
18
+ ## Features
19
+
20
+ - OCR processing for both image and PDF files
21
+ - Automatic file type detection
22
+ - Structured output generation using Mistral models
23
+ - Interactive web interface with Streamlit
24
+ - Supports historical documents and manuscripts
25
+
26
+ ## Setup for Local Development
27
+
28
+ 1. This directory is standalone and can be moved anywhere
29
+ 2. Install dependencies:
30
+ ```
31
+ pip install -r requirements.txt
32
+ ```
33
+ 3. Set up your Mistral API key:
34
+ - Option 1: Create a `.env` file in this directory and add your Mistral API key:
35
+ ```
36
+ MISTRAL_API_KEY=your_api_key_here
37
+ ```
38
+ - Option 2: Set the `MISTRAL_API_KEY` environment variable directly:
39
+ ```
40
+ export MISTRAL_API_KEY=your_api_key_here
41
+ ```
42
+ - Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
43
+ 4. Run the Streamlit app using the script:
44
+ ```
45
+ ./run_local.sh
46
+ ```
47
+ Or directly:
48
+ ```
49
+ streamlit run app.py
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ 1. Upload an image or PDF file using the file uploader
55
+ 2. Select processing options in the sidebar (e.g., use vision model)
56
+ 3. Click "Process Document" to analyze the file
57
+ 4. View the structured results and extract information
58
+
59
+ ## Deployment on Hugging Face Spaces
60
+
61
+ This app is designed to be deployed on Hugging Face Spaces. The `README.md` contains the necessary configuration metadata.
62
+
63
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
import json
import sys
from pathlib import Path
import tempfile

# Import the StructuredOCR class and config from the local files
from structured_ocr import StructuredOCR
from config import MISTRAL_API_KEY

# Page configuration must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Historical OCR",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)


def process_file(uploaded_file, use_vision=True):
    """Process the uploaded file and return the OCR results"""
    # The OCR pipeline works on file paths, so persist the upload to a
    # scratch file first (deleted again in the finally block below).
    suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as scratch:
        scratch.write(uploaded_file.getvalue())
        scratch_path = scratch.name

    try:
        if not MISTRAL_API_KEY:
            # No API key configured: return canned sample data instead of
            # calling the Mistral API.
            return {
                "file_name": uploaded_file.name,
                "topics": ["Sample Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "title": "Sample Document",
                    "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
                }
            }

        # Pick the processing path from the file extension and run OCR.
        ocr = StructuredOCR()
        kind = "pdf" if Path(uploaded_file.name).suffix.lower() == ".pdf" else "image"
        return ocr.process_file(scratch_path, file_type=kind, use_vision=use_vision)
    finally:
        # Always remove the scratch copy, even when processing fails.
        if os.path.exists(scratch_path):
            os.unlink(scratch_path)


# App title and description
st.title("Historical Document OCR")
st.subheader("Powered by Mistral AI")
st.markdown("""
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
Upload an image or PDF file to get started.
""")

# Sidebar: processing options plus a short description of the app.
with st.sidebar:
    st.header("Options")
    use_vision = st.checkbox("Use Vision Model", value=True,
                             help="For image files, use the vision model for improved analysis (may be slower)")

    st.markdown("---")
    st.subheader("About")
    st.markdown("""
    This app uses Mistral AI's OCR API to extract text from historical documents.

    It can process:
    - Image files (jpg, png, etc.)
    - PDF documents

    The extracted content is processed into structured data based on the document type.
    """)

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file is None:
    # Nothing uploaded yet: show a hint plus up to three sample documents
    # from the bundled input directory.
    st.info("Upload a document to get started.")

    st.subheader("Example Documents")
    sample_cols = st.columns([1, 1, 1])

    samples_dir = Path(__file__).parent / "input"
    samples = list(samples_dir.glob("*.jpg"))[:3] if samples_dir.exists() else []

    for idx, sample in enumerate(samples):
        with sample_cols[idx % 3]:
            st.image(str(sample), caption=sample.name, use_container_width=True)
else:
    # Preview the uploaded document on the left; results go on the right.
    st.subheader("Uploaded Document")
    file_ext = Path(uploaded_file.name).suffix.lower()

    col1, col2 = st.columns([1, 1])

    with col1:
        if file_ext == ".pdf":
            st.info("Processing PDF document...")
            # No inline PDF preview; just show the file name.
            st.write(f"File: {uploaded_file.name}")
        else:
            st.image(uploaded_file, use_container_width=True)

    if st.button("Process Document"):
        with st.spinner("Processing document..."):
            try:
                result = process_file(uploaded_file, use_vision)

                with col2:
                    st.subheader("Extracted Information")
                    st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")

                    # Languages and topics are optional in the result.
                    if 'languages' in result:
                        detected = [lang for lang in result['languages'] if lang is not None]
                        if detected:
                            st.write(f"**Languages Detected:** {', '.join(detected)}")

                    if result.get('topics'):
                        st.write(f"**Topics:** {', '.join(result['topics'])}")

                    st.subheader("Document Contents")
                    if 'ocr_contents' not in result:
                        st.error("No OCR content was extracted from the document.")
                    else:
                        tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])

                        with tab1:
                            contents = result['ocr_contents']
                            if isinstance(contents, dict):
                                for section, content in contents.items():
                                    if not content:
                                        # Skip empty sections entirely.
                                        continue
                                    st.markdown(f"#### {section.replace('_', ' ').title()}")

                                    if isinstance(content, str):
                                        st.markdown(content)
                                    elif isinstance(content, list):
                                        for item in content:
                                            if isinstance(item, str):
                                                st.markdown(f"- {item}")
                                            elif isinstance(item, dict):
                                                st.json(item)
                                    elif isinstance(content, dict):
                                        for key, value in content.items():
                                            st.markdown(f"**{key}:** {value}")

                        with tab2:
                            # Raw JSON view for developers.
                            st.json(result)
            except Exception as e:
                st.error(f"Error processing document: {str(e)}")
config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
"""
Configuration for Mistral OCR processing.

Reads the API key from the environment and pins the model names used
throughout the OCR pipeline.
"""
import os

# Mistral API key, taken from the MISTRAL_API_KEY environment variable.
# Defaults to the empty string; callers treat that as "no key available".
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")

# Model settings
OCR_MODEL = "mistral-ocr-latest"      # dedicated OCR endpoint
TEXT_MODEL = "ministral-8b-latest"    # text-only structuring model
VISION_MODEL = "pixtral-12b-latest"   # multimodal (vision) structuring model
input/.DS_Store ADDED
Binary file (6.15 kB). View file
 
input/baldwin-letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/baldwin-letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/magellan-travels.jpg ADDED

Git LFS Details

  • SHA256: ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
input/okeefe-menu.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tesseract-ocr
2
+ poppler-utils
pdf_ocr.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""

import json
from pathlib import Path
from structured_ocr import StructuredOCR


class PDFOCR:
    """Class for processing PDF files with OCR and extracting structured data.

    Thin wrapper around StructuredOCR specialised for PDF input.
    """

    def __init__(self, api_key=None):
        """Initialize the PDF OCR processor."""
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path, use_vision=True):
        """Run OCR on a PDF and return the structured result.

        Args:
            pdf_path: Path to the PDF file
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Dictionary with structured OCR results

        Raises:
            FileNotFoundError: If the PDF does not exist.
        """
        source = Path(pdf_path)
        if not source.exists():
            raise FileNotFoundError(f"PDF file not found: {source}")

        return self.processor.process_file(source, file_type="pdf", use_vision=use_vision)

    def save_json_output(self, pdf_path, output_path, use_vision=True):
        """Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file
            output_path: Path where to save the JSON output
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Path to the saved JSON file
        """
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        # Make sure the destination directory exists before writing.
        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(json.dumps(result, indent=2))

        return destination


# For testing directly
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    processor = PDFOCR()

    if output_path is None:
        print(json.dumps(processor.process_pdf(pdf_path), indent=2))
    else:
        result_path = processor.save_json_output(pdf_path, output_path)
        print(f"Results saved to: {result_path}")
prepare_for_hf.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Prepare the repository for Hugging Face Spaces deployment.
This script:
1. Ensures the output directory exists
2. Removes local-only files that must not be uploaded (.env, .git, ...)
3. Verifies requirements.txt exists and makes run_local.sh executable
"""

import os
import shutil
import sys


def main():
    """Run all deployment-preparation steps in the current directory."""
    print("Preparing repository for Hugging Face Spaces deployment...")

    # Make sure the output directory exists.
    if not os.path.exists("output"):
        os.makedirs("output")
        print("Created output directory")

    # Strip local artifacts so secrets and git history are not uploaded.
    for entry in (".env", ".env.example", ".git"):
        if not os.path.exists(entry):
            continue
        remover = shutil.rmtree if os.path.isdir(entry) else os.remove
        remover(entry)
        print(f"Removed {entry}")

    # requirements.txt is mandatory for a Spaces deployment.
    if not os.path.exists("requirements.txt"):
        print("ERROR: requirements.txt not found. Please create it before deploying.")
        sys.exit(1)

    # The local run script must be executable.
    if os.path.exists("run_local.sh"):
        os.chmod("run_local.sh", 0o755)
        print("Made run_local.sh executable")

    # Remind the user to prune large demo files before uploading.
    print("NOTE: Large files in the input directory will be uploaded to Hugging Face.")
    print("You may want to remove unnecessary files before deployment.")

    print("Repository preparation complete!")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.43.2
2
+ mistralai
3
+ pydantic
4
+ pycountry
5
+ pillow
6
+ python-multipart
7
+ pdf2image
8
+ pytesseract
run_local.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run the Streamlit app locally
#
# A local copy of structured_ocr.py lives next to this script, so the
# parent directory no longer needs to be added to PYTHONPATH.

# Pull environment variables from a .env file when one is present;
# allexport makes every sourced assignment an exported variable.
if [ ! -f .env ]; then
  echo "No .env file found. Make sure to set MISTRAL_API_KEY environment variable manually."
else
  echo "Loading environment variables from .env file"
  set -o allexport
  source .env
  set +o allexport
fi

# Report whether the Mistral API key is available to the app.
if [ -n "$MISTRAL_API_KEY" ]; then
  echo "MISTRAL_API_KEY is set. The app will use the Mistral API for OCR processing."
else
  echo "WARNING: MISTRAL_API_KEY is not set. The app will run with sample data."
fi

# Launch the Streamlit app.
streamlit run app.py
setup_git.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Setup git repository for Hugging Face Spaces
#
# Requires HF_TOKEN to be exported. Prompts for the username and Space
# name, prepares the files, then creates the repo and pushes to the Space.

# Check if HF_TOKEN environment variable is set
if [ -z "$HF_TOKEN" ]; then
    echo "Error: HF_TOKEN environment variable is not set."
    echo "Please set it first with: export HF_TOKEN=your_hugging_face_token"
    exit 1
fi

# Get your username (-r keeps backslashes literal)
echo "Enter your Hugging Face username:"
read -r HF_USERNAME

# Get the space name
echo "Enter the name for your Hugging Face Space (e.g., historical-ocr):"
read -r HF_SPACE

# Prepare the files for deployment
echo "Preparing files for deployment..."
python3 prepare_for_hf.py

# Initialize git and commit everything
git init
git add .
git commit -m "Initial commit"
# BUG FIX: git init may create the default branch as "master", but the
# push below targets "main" — rename the branch so the push cannot fail.
git branch -M main

# Create the repository on Hugging Face
echo "Creating and pushing to Hugging Face Space..."
git remote add origin "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
huggingface-cli login --token "$HF_TOKEN"
git push -u origin main

echo "Deployment completed! Your app should be available at:"
echo "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
simple_test.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Simple test script for structured_ocr.py

Processes input/recipe.jpg (when present) and writes the structured OCR
result to output/recipe_test.json, then prints a short preview.
"""

import os
import sys
import json
from pathlib import Path


def main():
    """Run a single OCR pass over the sample image and preview the output.

    Returns None in all cases; progress and errors are reported via print.
    """
    print("Testing OCR with a sample image file")

    # Path to the sample image file
    image_path = os.path.join("input", "recipe.jpg")

    # Bail out early when the sample image is missing.
    if not os.path.isfile(image_path):
        print(f"Error: Image file not found at {image_path}")
        return

    print(f"File found: {image_path}")

    # Create the output directory if it doesn't exist
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "recipe_test.json")

    # Imported lazily so the early file check above runs even when the
    # OCR dependencies are not installed.
    from structured_ocr import StructuredOCR

    # Initialize OCR processor
    processor = StructuredOCR()

    try:
        # Process the image file
        print(f"Processing image file: {image_path}")
        result = processor.process_file(image_path, file_type="image")

        # Save the result to the output file
        with open(output_path, 'w') as f:
            json.dump(result, f, indent=2)

        print(f"Image processing completed successfully. Output saved to {output_path}")

        # Check if the output file exists
        if os.path.isfile(output_path):
            print(f"Output file exists at {output_path}")
            file_size = os.path.getsize(output_path)
            print(f"Output file size: {file_size} bytes")

            # Print a preview of the output file
            print("\nPreview of output file:")
            with open(output_path, 'r') as f:
                data = json.load(f)
                print(f"File name: {data.get('file_name', '')}")
                # BUG FIX: the OCR result can contain None entries (app.py
                # filters them the same way); ', '.join would raise
                # TypeError on a None, so drop them before joining.
                topics = [t for t in data.get('topics', []) if t is not None]
                print(f"Topics: {', '.join(topics)}")
                langs = [lang for lang in data.get('languages', []) if lang is not None]
                print(f"Languages: {', '.join(langs)}")
                print("OCR contents keys:", list(data.get('ocr_contents', {}).keys()))
        else:
            print(f"Error: Output file not found at {output_path}")

    except Exception as e:
        print(f"Error processing image: {e}")


if __name__ == "__main__":
    main()
streamlit_app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
import json
import sys
from pathlib import Path
import tempfile

# Add parent directory to path so we can import the OCR modules
parent_dir = Path(__file__).parent.parent.absolute()
sys.path.append(str(parent_dir))

# Import the StructuredOCR class from the parent directory
from structured_ocr import StructuredOCR

# Set page configuration (must precede any other st.* call)
st.set_page_config(
    page_title="Historical OCR",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Define functions
def process_file(uploaded_file, use_vision=True):
    """Process the uploaded file and return the OCR results.

    Args:
        uploaded_file: Streamlit UploadedFile from st.file_uploader.
        use_vision: Whether to use the vision model for improved analysis.

    Returns:
        Dictionary with structured OCR results.
    """
    # Save the uploaded file to a temporary file; the OCR pipeline works
    # on file paths, not in-memory buffers.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        temp_path = tmp.name

    try:
        # Initialize OCR processor
        processor = StructuredOCR()

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        # Process the file
        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision)
        return result
    finally:
        # Clean up the temporary file even when processing fails
        if os.path.exists(temp_path):
            os.unlink(temp_path)

# App title and description
st.title("Historical Document OCR")
st.subheader("Powered by Mistral AI")
st.markdown("""
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
Upload an image or PDF file to get started.
""")

# Sidebar with options
with st.sidebar:
    st.header("Options")
    use_vision = st.checkbox("Use Vision Model", value=True,
                             help="For image files, use the vision model for improved analysis (may be slower)")

    st.markdown("---")
    st.subheader("About")
    st.markdown("""
    This app uses Mistral AI's OCR API to extract text from historical documents.

    It can process:
    - Image files (jpg, png, etc.)
    - PDF documents

    The extracted content is processed into structured data based on the document type.
    """)

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Display the uploaded file
    st.subheader("Uploaded Document")
    file_ext = Path(uploaded_file.name).suffix.lower()

    col1, col2 = st.columns([1, 1])

    with col1:
        if file_ext == ".pdf":
            st.info("Processing PDF document...")
            # For PDFs, you might show preview of first page
            st.write(f"File: {uploaded_file.name}")
        else:
            # FIX: use_column_width is deprecated in Streamlit; use
            # use_container_width (matches app.py in this repository).
            st.image(uploaded_file, use_container_width=True)

    # Process button
    process_button = st.button("Process Document")

    if process_button:
        with st.spinner("Processing document..."):
            try:
                # Process the file
                result = process_file(uploaded_file, use_vision)

                # Display the results
                with col2:
                    st.subheader("Extracted Information")

                    # Display file info
                    st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")

                    # Display languages if available (drop None entries)
                    if 'languages' in result:
                        languages = [lang for lang in result['languages'] if lang is not None]
                        if languages:
                            st.write(f"**Languages Detected:** {', '.join(languages)}")

                    # Display topics if available
                    if 'topics' in result and result['topics']:
                        st.write(f"**Topics:** {', '.join(result['topics'])}")

                    # Display the OCR contents
                    st.subheader("Document Contents")
                    if 'ocr_contents' in result:
                        # Create tabs for different views
                        tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])

                        with tab1:
                            # Display in a more user-friendly format based on the content structure
                            if isinstance(result['ocr_contents'], dict):
                                for section, content in result['ocr_contents'].items():
                                    if content:  # Only display non-empty sections
                                        st.markdown(f"#### {section.replace('_', ' ').title()}")

                                        if isinstance(content, str):
                                            st.markdown(content)
                                        elif isinstance(content, list):
                                            for item in content:
                                                if isinstance(item, str):
                                                    st.markdown(f"- {item}")
                                                elif isinstance(item, dict):
                                                    st.json(item)
                                        elif isinstance(content, dict):
                                            for k, v in content.items():
                                                st.markdown(f"**{k}:** {v}")

                        with tab2:
                            # Show the raw JSON for developers
                            st.json(result)
                    else:
                        st.error("No OCR content was extracted from the document.")

            except Exception as e:
                st.error(f"Error processing document: {str(e)}")
else:
    # Display sample images when no file is uploaded
    st.info("Upload a document to get started.")

    # Show example images
    st.subheader("Example Documents")
    col1, col2, col3 = st.columns([1, 1, 1])

    # Find sample images from the input directory to display
    input_dir = parent_dir / "input"
    sample_images = []
    if input_dir.exists():
        sample_images = list(input_dir.glob("*.jpg"))[:3]  # Limit to 3 samples

    if sample_images:
        for i, img_path in enumerate(sample_images):
            col = [col1, col2, col3][i % 3]
            with col:
                # FIX: use_container_width replaces deprecated use_column_width
                st.image(str(img_path), caption=img_path.name, use_container_width=True)
structured_ocr.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
from enum import Enum
from pathlib import Path
import json
import base64
import pycountry
from pydantic import BaseModel
from mistralai import Mistral
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk

# Import config directly (now local to historical-ocr)
from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL

# Map ISO 639-1 codes to English language names, e.g. {"en": "English"}.
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}

# FIX: build the Language enum with the documented Enum functional API
# instead of subclassing Enum's metaclass (fragile and implementation-
# dependent). Members are upper-cased language names; values are the
# language names themselves — same interface as before.
Language = Enum('Language', {name.upper().replace(' ', '_'): name for name in languages.values()})


class StructuredOCRModel(BaseModel):
    """Schema for the structured output requested from the Mistral models."""
    file_name: str
    topics: list[str]
    languages: list[Language]
    ocr_contents: dict


class StructuredOCR:
    """OCR processor that turns PDFs/images into structured JSON via Mistral."""

    def __init__(self, api_key=None):
        """Initialize the OCR processor with API key"""
        self.api_key = api_key or MISTRAL_API_KEY
        self.client = Mistral(api_key=self.api_key)

    def process_file(self, file_path, file_type=None, use_vision=True):
        """Process a file and return structured OCR results

        Args:
            file_path: Path to the file to process
            file_type: 'pdf' or 'image' (will be auto-detected if None)
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Dictionary with structured OCR results
        """
        # Convert file_path to Path object if it's a string
        file_path = Path(file_path)

        # Auto-detect file type from the extension if not provided
        if file_type is None:
            suffix = file_path.suffix.lower()
            file_type = "pdf" if suffix == ".pdf" else "image"

        # Dispatch to the matching pipeline
        if file_type == "pdf":
            return self._process_pdf(file_path, use_vision)
        return self._process_image(file_path, use_vision)

    def _process_pdf(self, file_path, use_vision=True):
        """Process a PDF file with OCR"""
        # Upload the PDF file to Mistral's file store for OCR
        uploaded_file = self.client.files.upload(
            file={
                "file_name": file_path.stem,
                "content": file_path.read_bytes(),
            },
            purpose="ocr",
        )

        # Get a short-lived signed URL for the uploaded file
        signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

        # Process the PDF with OCR
        pdf_response = self.client.ocr.process(
            document=DocumentURLChunk(document_url=signed_url.url),
            model=OCR_MODEL,
            include_image_base64=True
        )

        # Combine all pages' markdown into a single string
        all_markdown = "\n\n".join([page.markdown for page in pdf_response.pages])

        # Extract structured data using the appropriate model
        if use_vision:
            # Use the first page's embedded image for the vision model, when present
            first_page_image = pdf_response.pages[0].images[0].image_base64 if pdf_response.pages and pdf_response.pages[0].images else None

            if first_page_image:
                return self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
            # Fall back to text-only model if no image is available
            return self._extract_structured_data_text_only(all_markdown, file_path.name)

        # Use text-only model
        return self._extract_structured_data_text_only(all_markdown, file_path.name)

    def _process_image(self, file_path, use_vision=True):
        """Process an image file with OCR"""
        # Read and encode the image file as a data URL.
        # NOTE(review): the MIME type is hard-coded to image/jpeg even for
        # PNG input — appears to work with the OCR endpoint, but confirm.
        encoded_image = base64.b64encode(file_path.read_bytes()).decode()
        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"

        # Process the image with OCR
        image_response = self.client.ocr.process(
            document=ImageURLChunk(image_url=base64_data_url),
            model=OCR_MODEL
        )

        # Get the OCR markdown from the first page
        image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""

        # Extract structured data using the appropriate model
        if use_vision:
            return self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
        return self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)

    @staticmethod
    def _parsed_response_to_dict(chat_response):
        """Convert a parsed chat response into a plain JSON-safe dict."""
        parsed = chat_response.choices[0].message.parsed
        # FIX: pydantic v2 renamed BaseModel.json() to model_dump_json();
        # support both so the code works with either pydantic major version.
        if hasattr(parsed, "model_dump_json"):
            raw = parsed.model_dump_json()
        else:
            raw = parsed.json()
        result = json.loads(raw)

        # Ensure languages is a list of strings, not Language enum objects
        if 'languages' in result:
            result['languages'] = [str(lang) for lang in result.get('languages', [])]
        return result

    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
        """Extract structured data using vision model"""
        try:
            # Parse with vision model
            chat_response = self.client.chat.parse(
                model=VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            ImageURLChunk(image_url=image_base64),
                            TextChunk(text=(
                                f"This is a historical document's OCR in markdown:\n"
                                f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
                                f"Extract topics, languages, and organize the content logically."
                            ))
                        ],
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            result = self._parsed_response_to_dict(chat_response)

        except Exception as e:
            # Fall back to text-only model if vision model fails
            print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
            result = self._extract_structured_data_text_only(ocr_markdown, filename)

        return result

    def _extract_structured_data_text_only(self, ocr_markdown, filename):
        """Extract structured data using text-only model"""
        try:
            # Parse with text-only model
            chat_response = self.client.chat.parse(
                model=TEXT_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"This is a historical document's OCR in markdown:\n"
                                   f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                   f"Convert this into a structured JSON response with the OCR contents. "
                                   f"Extract topics, languages, and organize the content logically."
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            result = self._parsed_response_to_dict(chat_response)

        except Exception as e:
            # Create a basic result if parsing fails
            print(f"Text model failed: {str(e)}. Creating basic result.")
            result = {
                "file_name": filename,
                "topics": ["Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "raw_text": ocr_markdown
                }
            }

        return result


# For testing directly
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python structured_ocr.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    processor = StructuredOCR()
    result = processor.process_file(file_path)

    print(json.dumps(result, indent=2))
test_pdf.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Test script for pdf_ocr.py
"""

from pdf_ocr import PDFOCR
import json
import os


def main():
    """Process input/rubric.pdf and print a preview of the JSON output."""
    # Initialize PDF processor
    processor = PDFOCR()

    # Input and output locations for this smoke test
    pdf_path = "input/rubric.pdf"
    output_path = "output/rubric_test.json"

    # Ensure the output directory exists before writing
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Process the PDF and persist the structured result
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back for the preview
    with open(output_path, 'r') as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")
    # Show at most the first three content entries, truncating long strings
    for key, value in list(result.get('ocr_contents', {}).items())[:3]:
        if isinstance(value, str) and len(value) > 100:
            print(f"  {key}: {value[:100]}...")
        else:
            print(f"  {key}: {value}")


if __name__ == "__main__":
    main()
+ main()