milwright commited on
Commit
e99f9b5
·
verified ·
1 Parent(s): 131b3a2

Upload historical-ocr v1.1

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
37
+ input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
38
+ input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
39
+ input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,63 @@
1
  ---
2
- title: Historical Ocr
3
- emoji: 🏆
4
- colorFrom: green
5
- colorTo: red
6
  sdk: streamlit
7
  sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: Uses Minstral OCR to parse historical documents
12
  ---
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Historical OCR
3
+ emoji: 🚀
4
+ colorFrom: red
5
+ colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.43.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Employs Mistral OCR for transcribing historical data
12
  ---
13
 
14
+ # Historical Document OCR
15
+
16
+ This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
17
+
18
+ ## Features
19
+
20
+ - OCR processing for both image and PDF files
21
+ - Automatic file type detection
22
+ - Structured output generation using Mistral models
23
+ - Interactive web interface with Streamlit
24
+ - Supports historical documents and manuscripts
25
+
26
+ ## Setup for Local Development
27
+
28
+ 1. This directory is standalone and can be moved anywhere
29
+ 2. Install dependencies:
30
+ ```
31
+ pip install -r requirements.txt
32
+ ```
33
+ 3. Set up your Mistral API key:
34
+ - Option 1: Create a `.env` file in this directory and add your Mistral API key:
35
+ ```
36
+ MISTRAL_API_KEY=your_api_key_here
37
+ ```
38
+ - Option 2: Set the `MISTRAL_API_KEY` environment variable directly:
39
+ ```
40
+ export MISTRAL_API_KEY=your_api_key_here
41
+ ```
42
+ - Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
43
+ 4. Run the Streamlit app using the script:
44
+ ```
45
+ ./run_local.sh
46
+ ```
47
+ Or directly:
48
+ ```
49
+ streamlit run app.py
50
+ ```
51
+
52
+ ## Usage
53
+
54
+ 1. Upload an image or PDF file using the file uploader
55
+ 2. Select processing options in the sidebar (e.g., use vision model)
56
+ 3. Click "Process Document" to analyze the file
57
+ 4. View the structured results and extract information
58
+
59
+ ## Deployment on Hugging Face Spaces
60
+
61
+ This app is designed to be deployed on Hugging Face Spaces. The `README.md` contains the necessary configuration metadata.
62
+
63
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
import json
import sys
from pathlib import Path
import tempfile

# Import the StructuredOCR class and config from the local files
from structured_ocr import StructuredOCR
from config import MISTRAL_API_KEY

# Page configuration must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Historical OCR",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)


def process_file(uploaded_file, use_vision=True):
    """Process the uploaded file and return the OCR results"""
    # The OCR pipeline works on file paths, so persist the upload to a
    # scratch file first (deleted again in the finally block below).
    suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as scratch:
        scratch.write(uploaded_file.getvalue())
        scratch_path = scratch.name

    try:
        if not MISTRAL_API_KEY:
            # No API key configured: return canned sample data instead of
            # calling the Mistral API.
            return {
                "file_name": uploaded_file.name,
                "topics": ["Sample Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "title": "Sample Document",
                    "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
                }
            }

        # Pick the processing path from the file extension and run OCR.
        ocr = StructuredOCR()
        kind = "pdf" if Path(uploaded_file.name).suffix.lower() == ".pdf" else "image"
        return ocr.process_file(scratch_path, file_type=kind, use_vision=use_vision)
    finally:
        # Always remove the scratch copy, even when processing fails.
        if os.path.exists(scratch_path):
            os.unlink(scratch_path)


# App title and description
st.title("Historical Document OCR")
st.subheader("Powered by Mistral AI")
st.markdown("""
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
Upload an image or PDF file to get started.
""")

# Sidebar: processing options plus a short description of the app.
with st.sidebar:
    st.header("Options")
    use_vision = st.checkbox("Use Vision Model", value=True,
                             help="For image files, use the vision model for improved analysis (may be slower)")

    st.markdown("---")
    st.subheader("About")
    st.markdown("""
    This app uses Mistral AI's OCR API to extract text from historical documents.

    It can process:
    - Image files (jpg, png, etc.)
    - PDF documents

    The extracted content is processed into structured data based on the document type.
    """)

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file is None:
    # Nothing uploaded yet: show a hint plus up to three sample documents
    # from the bundled input directory.
    st.info("Upload a document to get started.")

    st.subheader("Example Documents")
    sample_cols = st.columns([1, 1, 1])

    samples_dir = Path(__file__).parent / "input"
    samples = list(samples_dir.glob("*.jpg"))[:3] if samples_dir.exists() else []

    for idx, sample in enumerate(samples):
        with sample_cols[idx % 3]:
            st.image(str(sample), caption=sample.name, use_container_width=True)
else:
    # Preview the uploaded document on the left; results go on the right.
    st.subheader("Uploaded Document")
    file_ext = Path(uploaded_file.name).suffix.lower()

    col1, col2 = st.columns([1, 1])

    with col1:
        if file_ext == ".pdf":
            st.info("Processing PDF document...")
            # No inline PDF preview; just show the file name.
            st.write(f"File: {uploaded_file.name}")
        else:
            st.image(uploaded_file, use_container_width=True)

    if st.button("Process Document"):
        with st.spinner("Processing document..."):
            try:
                result = process_file(uploaded_file, use_vision)

                with col2:
                    st.subheader("Extracted Information")
                    st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")

                    # Languages and topics are optional in the result.
                    if 'languages' in result:
                        detected = [lang for lang in result['languages'] if lang is not None]
                        if detected:
                            st.write(f"**Languages Detected:** {', '.join(detected)}")

                    if result.get('topics'):
                        st.write(f"**Topics:** {', '.join(result['topics'])}")

                    st.subheader("Document Contents")
                    if 'ocr_contents' not in result:
                        st.error("No OCR content was extracted from the document.")
                    else:
                        tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])

                        with tab1:
                            contents = result['ocr_contents']
                            if isinstance(contents, dict):
                                for section, content in contents.items():
                                    if not content:
                                        # Skip empty sections entirely.
                                        continue
                                    st.markdown(f"#### {section.replace('_', ' ').title()}")

                                    if isinstance(content, str):
                                        st.markdown(content)
                                    elif isinstance(content, list):
                                        for item in content:
                                            if isinstance(item, str):
                                                st.markdown(f"- {item}")
                                            elif isinstance(item, dict):
                                                st.json(item)
                                    elif isinstance(content, dict):
                                        for key, value in content.items():
                                            st.markdown(f"**{key}:** {value}")

                        with tab2:
                            # Raw JSON view for developers.
                            st.json(result)
            except Exception as e:
                st.error(f"Error processing document: {str(e)}")
config.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
"""
Configuration for Mistral OCR processing.

Reads the API key from the environment and pins the model names used
throughout the OCR pipeline.
"""
import os

# Mistral API key, taken from the MISTRAL_API_KEY environment variable.
# Defaults to the empty string; callers treat that as "no key available".
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")

# Model settings
OCR_MODEL = "mistral-ocr-latest"      # dedicated OCR endpoint
TEXT_MODEL = "ministral-8b-latest"    # text-only structuring model
VISION_MODEL = "pixtral-12b-latest"   # multimodal (vision) structuring model
input/.DS_Store ADDED
Binary file (6.15 kB). View file
 
input/baldwin-letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/baldwin-letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/magellan-travels.jpg ADDED

Git LFS Details

  • SHA256: ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
input/okeefe-menu.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
3
+ size 2554815
packages.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ tesseract-ocr
2
+ poppler-utils
pdf_ocr.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""

import json
from pathlib import Path
from structured_ocr import StructuredOCR


class PDFOCR:
    """Class for processing PDF files with OCR and extracting structured data.

    Thin wrapper around StructuredOCR specialised for PDF input.
    """

    def __init__(self, api_key=None):
        """Initialize the PDF OCR processor."""
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path, use_vision=True):
        """Run OCR on a PDF and return the structured result.

        Args:
            pdf_path: Path to the PDF file
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Dictionary with structured OCR results

        Raises:
            FileNotFoundError: If the PDF does not exist.
        """
        source = Path(pdf_path)
        if not source.exists():
            raise FileNotFoundError(f"PDF file not found: {source}")

        return self.processor.process_file(source, file_type="pdf", use_vision=use_vision)

    def save_json_output(self, pdf_path, output_path, use_vision=True):
        """Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file
            output_path: Path where to save the JSON output
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Path to the saved JSON file
        """
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        # Make sure the destination directory exists before writing.
        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(json.dumps(result, indent=2))

        return destination


# For testing directly
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    processor = PDFOCR()

    if output_path is None:
        print(json.dumps(processor.process_pdf(pdf_path), indent=2))
    else:
        result_path = processor.save_json_output(pdf_path, output_path)
        print(f"Results saved to: {result_path}")
prepare_for_hf.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Prepare the repository for Hugging Face Spaces deployment.
This script:
1. Ensures the output directory exists
2. Removes local-only files that must not be uploaded (.env, .git, ...)
3. Verifies requirements.txt exists and makes run_local.sh executable
"""

import os
import shutil
import sys


def main():
    """Run all deployment-preparation steps in the current directory."""
    print("Preparing repository for Hugging Face Spaces deployment...")

    # Make sure the output directory exists.
    if not os.path.exists("output"):
        os.makedirs("output")
        print("Created output directory")

    # Strip local artifacts so secrets and git history are not uploaded.
    for entry in (".env", ".env.example", ".git"):
        if not os.path.exists(entry):
            continue
        remover = shutil.rmtree if os.path.isdir(entry) else os.remove
        remover(entry)
        print(f"Removed {entry}")

    # requirements.txt is mandatory for a Spaces deployment.
    if not os.path.exists("requirements.txt"):
        print("ERROR: requirements.txt not found. Please create it before deploying.")
        sys.exit(1)

    # The local run script must be executable.
    if os.path.exists("run_local.sh"):
        os.chmod("run_local.sh", 0o755)
        print("Made run_local.sh executable")

    # Remind the user to prune large demo files before uploading.
    print("NOTE: Large files in the input directory will be uploaded to Hugging Face.")
    print("You may want to remove unnecessary files before deployment.")

    print("Repository preparation complete!")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.43.2
2
+ mistralai
3
+ pydantic
4
+ pycountry
5
+ pillow
6
+ python-multipart
7
+ pdf2image
8
+ pytesseract
run_local.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Run the Streamlit app locally
#
# A local copy of structured_ocr.py lives next to this script, so the
# parent directory no longer needs to be added to PYTHONPATH.

# Pull environment variables from a .env file when one is present;
# allexport makes every sourced assignment an exported variable.
if [ ! -f .env ]; then
  echo "No .env file found. Make sure to set MISTRAL_API_KEY environment variable manually."
else
  echo "Loading environment variables from .env file"
  set -o allexport
  source .env
  set +o allexport
fi

# Report whether the Mistral API key is available to the app.
if [ -n "$MISTRAL_API_KEY" ]; then
  echo "MISTRAL_API_KEY is set. The app will use the Mistral API for OCR processing."
else
  echo "WARNING: MISTRAL_API_KEY is not set. The app will run with sample data."
fi

# Launch the Streamlit app.
streamlit run app.py
setup_git.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Setup git repository for Hugging Face Spaces
#
# Requires HF_TOKEN to be exported. Prompts for the username and Space
# name, prepares the files, then creates the repo and pushes to the Space.

# Check if HF_TOKEN environment variable is set
if [ -z "$HF_TOKEN" ]; then
    echo "Error: HF_TOKEN environment variable is not set."
    echo "Please set it first with: export HF_TOKEN=your_hugging_face_token"
    exit 1
fi

# Get your username (-r keeps backslashes literal)
echo "Enter your Hugging Face username:"
read -r HF_USERNAME

# Get the space name
echo "Enter the name for your Hugging Face Space (e.g., historical-ocr):"
read -r HF_SPACE

# Prepare the files for deployment
echo "Preparing files for deployment..."
python3 prepare_for_hf.py

# Initialize git and commit everything
git init
git add .
git commit -m "Initial commit"
# BUG FIX: git init may create the default branch as "master", but the
# push below targets "main" — rename the branch so the push cannot fail.
git branch -M main

# Create the repository on Hugging Face
echo "Creating and pushing to Hugging Face Space..."
git remote add origin "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
huggingface-cli login --token "$HF_TOKEN"
git push -u origin main

echo "Deployment completed! Your app should be available at:"
echo "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
simple_test.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Simple test script for structured_ocr.py

Processes input/recipe.jpg (when present) and writes the structured OCR
result to output/recipe_test.json, then prints a short preview.
"""

import os
import sys
import json
from pathlib import Path


def main():
    """Run a single OCR pass over the sample image and preview the output.

    Returns None in all cases; progress and errors are reported via print.
    """
    print("Testing OCR with a sample image file")

    # Path to the sample image file
    image_path = os.path.join("input", "recipe.jpg")

    # Bail out early when the sample image is missing.
    if not os.path.isfile(image_path):
        print(f"Error: Image file not found at {image_path}")
        return

    print(f"File found: {image_path}")

    # Create the output directory if it doesn't exist
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "recipe_test.json")

    # Imported lazily so the early file check above runs even when the
    # OCR dependencies are not installed.
    from structured_ocr import StructuredOCR

    # Initialize OCR processor
    processor = StructuredOCR()

    try:
        # Process the image file
        print(f"Processing image file: {image_path}")
        result = processor.process_file(image_path, file_type="image")

        # Save the result to the output file
        with open(output_path, 'w') as f:
            json.dump(result, f, indent=2)

        print(f"Image processing completed successfully. Output saved to {output_path}")

        # Check if the output file exists
        if os.path.isfile(output_path):
            print(f"Output file exists at {output_path}")
            file_size = os.path.getsize(output_path)
            print(f"Output file size: {file_size} bytes")

            # Print a preview of the output file
            print("\nPreview of output file:")
            with open(output_path, 'r') as f:
                data = json.load(f)
                print(f"File name: {data.get('file_name', '')}")
                # BUG FIX: the OCR result can contain None entries (app.py
                # filters them the same way); ', '.join would raise
                # TypeError on a None, so drop them before joining.
                topics = [t for t in data.get('topics', []) if t is not None]
                print(f"Topics: {', '.join(topics)}")
                langs = [lang for lang in data.get('languages', []) if lang is not None]
                print(f"Languages: {', '.join(langs)}")
                print("OCR contents keys:", list(data.get('ocr_contents', {}).keys()))
        else:
            print(f"Error: Output file not found at {output_path}")

    except Exception as e:
        print(f"Error processing image: {e}")


if __name__ == "__main__":
    main()
streamlit_app.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import streamlit as st
import json
import sys
from pathlib import Path
import tempfile

# Add parent directory to path so we can import the OCR modules
parent_dir = Path(__file__).parent.parent.absolute()
sys.path.append(str(parent_dir))

# Import the StructuredOCR class from the parent directory
from structured_ocr import StructuredOCR

# Set page configuration (must precede any other st.* call)
st.set_page_config(
    page_title="Historical OCR",
    page_icon="🚀",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Define functions
def process_file(uploaded_file, use_vision=True):
    """Process the uploaded file and return the OCR results.

    Args:
        uploaded_file: Streamlit UploadedFile from st.file_uploader.
        use_vision: Whether to use the vision model for improved analysis.

    Returns:
        Dictionary with structured OCR results.
    """
    # Save the uploaded file to a temporary file; the OCR pipeline works
    # on file paths, not in-memory buffers.
    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
        temp_path = tmp.name

    try:
        # Initialize OCR processor
        processor = StructuredOCR()

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        # Process the file
        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision)
        return result
    finally:
        # Clean up the temporary file even when processing fails
        if os.path.exists(temp_path):
            os.unlink(temp_path)

# App title and description
st.title("Historical Document OCR")
st.subheader("Powered by Mistral AI")
st.markdown("""
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
Upload an image or PDF file to get started.
""")

# Sidebar with options
with st.sidebar:
    st.header("Options")
    use_vision = st.checkbox("Use Vision Model", value=True,
                             help="For image files, use the vision model for improved analysis (may be slower)")

    st.markdown("---")
    st.subheader("About")
    st.markdown("""
    This app uses Mistral AI's OCR API to extract text from historical documents.

    It can process:
    - Image files (jpg, png, etc.)
    - PDF documents

    The extracted content is processed into structured data based on the document type.
    """)

# File uploader
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])

if uploaded_file is not None:
    # Display the uploaded file
    st.subheader("Uploaded Document")
    file_ext = Path(uploaded_file.name).suffix.lower()

    col1, col2 = st.columns([1, 1])

    with col1:
        if file_ext == ".pdf":
            st.info("Processing PDF document...")
            # For PDFs, you might show preview of first page
            st.write(f"File: {uploaded_file.name}")
        else:
            # FIX: use_column_width is deprecated in Streamlit; use
            # use_container_width (matches app.py in this repository).
            st.image(uploaded_file, use_container_width=True)

    # Process button
    process_button = st.button("Process Document")

    if process_button:
        with st.spinner("Processing document..."):
            try:
                # Process the file
                result = process_file(uploaded_file, use_vision)

                # Display the results
                with col2:
                    st.subheader("Extracted Information")

                    # Display file info
                    st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")

                    # Display languages if available (drop None entries)
                    if 'languages' in result:
                        languages = [lang for lang in result['languages'] if lang is not None]
                        if languages:
                            st.write(f"**Languages Detected:** {', '.join(languages)}")

                    # Display topics if available
                    if 'topics' in result and result['topics']:
                        st.write(f"**Topics:** {', '.join(result['topics'])}")

                    # Display the OCR contents
                    st.subheader("Document Contents")
                    if 'ocr_contents' in result:
                        # Create tabs for different views
                        tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])

                        with tab1:
                            # Display in a more user-friendly format based on the content structure
                            if isinstance(result['ocr_contents'], dict):
                                for section, content in result['ocr_contents'].items():
                                    if content:  # Only display non-empty sections
                                        st.markdown(f"#### {section.replace('_', ' ').title()}")

                                        if isinstance(content, str):
                                            st.markdown(content)
                                        elif isinstance(content, list):
                                            for item in content:
                                                if isinstance(item, str):
                                                    st.markdown(f"- {item}")
                                                elif isinstance(item, dict):
                                                    st.json(item)
                                        elif isinstance(content, dict):
                                            for k, v in content.items():
                                                st.markdown(f"**{k}:** {v}")

                        with tab2:
                            # Show the raw JSON for developers
                            st.json(result)
                    else:
                        st.error("No OCR content was extracted from the document.")

            except Exception as e:
                st.error(f"Error processing document: {str(e)}")
else:
    # Display sample images when no file is uploaded
    st.info("Upload a document to get started.")

    # Show example images
    st.subheader("Example Documents")
    col1, col2, col3 = st.columns([1, 1, 1])

    # Find sample images from the input directory to display
    input_dir = parent_dir / "input"
    sample_images = []
    if input_dir.exists():
        sample_images = list(input_dir.glob("*.jpg"))[:3]  # Limit to 3 samples

    if sample_images:
        for i, img_path in enumerate(sample_images):
            col = [col1, col2, col3][i % 3]
            with col:
                # FIX: use_container_width replaces deprecated use_column_width
                st.image(str(img_path), caption=img_path.name, use_container_width=True)
structured_ocr.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
from enum import Enum
from pathlib import Path
import json
import base64
import pycountry
from pydantic import BaseModel
from mistralai import Mistral
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk

# Import config directly (now local to historical-ocr)
from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL

# Map ISO 639-1 codes to English language names, e.g. {"en": "English"}.
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}

# FIX: build the Language enum with the documented Enum functional API
# instead of subclassing Enum's metaclass (fragile and implementation-
# dependent). Members are upper-cased language names; values are the
# language names themselves — same interface as before.
Language = Enum('Language', {name.upper().replace(' ', '_'): name for name in languages.values()})


class StructuredOCRModel(BaseModel):
    """Schema for the structured output requested from the Mistral models."""
    file_name: str
    topics: list[str]
    languages: list[Language]
    ocr_contents: dict


class StructuredOCR:
    """OCR processor that turns PDFs/images into structured JSON via Mistral."""

    def __init__(self, api_key=None):
        """Initialize the OCR processor with API key"""
        self.api_key = api_key or MISTRAL_API_KEY
        self.client = Mistral(api_key=self.api_key)

    def process_file(self, file_path, file_type=None, use_vision=True):
        """Process a file and return structured OCR results

        Args:
            file_path: Path to the file to process
            file_type: 'pdf' or 'image' (will be auto-detected if None)
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Dictionary with structured OCR results
        """
        # Convert file_path to Path object if it's a string
        file_path = Path(file_path)

        # Auto-detect file type from the extension if not provided
        if file_type is None:
            suffix = file_path.suffix.lower()
            file_type = "pdf" if suffix == ".pdf" else "image"

        # Dispatch to the matching pipeline
        if file_type == "pdf":
            return self._process_pdf(file_path, use_vision)
        return self._process_image(file_path, use_vision)

    def _process_pdf(self, file_path, use_vision=True):
        """Process a PDF file with OCR"""
        # Upload the PDF file to Mistral's file store for OCR
        uploaded_file = self.client.files.upload(
            file={
                "file_name": file_path.stem,
                "content": file_path.read_bytes(),
            },
            purpose="ocr",
        )

        # Get a short-lived signed URL for the uploaded file
        signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

        # Process the PDF with OCR
        pdf_response = self.client.ocr.process(
            document=DocumentURLChunk(document_url=signed_url.url),
            model=OCR_MODEL,
            include_image_base64=True
        )

        # Combine all pages' markdown into a single string
        all_markdown = "\n\n".join([page.markdown for page in pdf_response.pages])

        # Extract structured data using the appropriate model
        if use_vision:
            # Use the first page's embedded image for the vision model, when present
            first_page_image = pdf_response.pages[0].images[0].image_base64 if pdf_response.pages and pdf_response.pages[0].images else None

            if first_page_image:
                return self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
            # Fall back to text-only model if no image is available
            return self._extract_structured_data_text_only(all_markdown, file_path.name)

        # Use text-only model
        return self._extract_structured_data_text_only(all_markdown, file_path.name)

    def _process_image(self, file_path, use_vision=True):
        """Process an image file with OCR"""
        # Read and encode the image file as a data URL.
        # NOTE(review): the MIME type is hard-coded to image/jpeg even for
        # PNG input — appears to work with the OCR endpoint, but confirm.
        encoded_image = base64.b64encode(file_path.read_bytes()).decode()
        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"

        # Process the image with OCR
        image_response = self.client.ocr.process(
            document=ImageURLChunk(image_url=base64_data_url),
            model=OCR_MODEL
        )

        # Get the OCR markdown from the first page
        image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""

        # Extract structured data using the appropriate model
        if use_vision:
            return self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
        return self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)

    @staticmethod
    def _parsed_response_to_dict(chat_response):
        """Convert a parsed chat response into a plain JSON-safe dict."""
        parsed = chat_response.choices[0].message.parsed
        # FIX: pydantic v2 renamed BaseModel.json() to model_dump_json();
        # support both so the code works with either pydantic major version.
        if hasattr(parsed, "model_dump_json"):
            raw = parsed.model_dump_json()
        else:
            raw = parsed.json()
        result = json.loads(raw)

        # Ensure languages is a list of strings, not Language enum objects
        if 'languages' in result:
            result['languages'] = [str(lang) for lang in result.get('languages', [])]
        return result

    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
        """Extract structured data using vision model"""
        try:
            # Parse with vision model
            chat_response = self.client.chat.parse(
                model=VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            ImageURLChunk(image_url=image_base64),
                            TextChunk(text=(
                                f"This is a historical document's OCR in markdown:\n"
                                f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
                                f"Extract topics, languages, and organize the content logically."
                            ))
                        ],
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            result = self._parsed_response_to_dict(chat_response)

        except Exception as e:
            # Fall back to text-only model if vision model fails
            print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
            result = self._extract_structured_data_text_only(ocr_markdown, filename)

        return result

    def _extract_structured_data_text_only(self, ocr_markdown, filename):
        """Extract structured data using text-only model"""
        try:
            # Parse with text-only model
            chat_response = self.client.chat.parse(
                model=TEXT_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"This is a historical document's OCR in markdown:\n"
                                   f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                   f"Convert this into a structured JSON response with the OCR contents. "
                                   f"Extract topics, languages, and organize the content logically."
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            result = self._parsed_response_to_dict(chat_response)

        except Exception as e:
            # Create a basic result if parsing fails
            print(f"Text model failed: {str(e)}. Creating basic result.")
            result = {
                "file_name": filename,
                "topics": ["Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "raw_text": ocr_markdown
                }
            }

        return result


# For testing directly
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python structured_ocr.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    processor = StructuredOCR()
    result = processor.process_file(file_path)

    print(json.dumps(result, indent=2))
test_pdf.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Test script for pdf_ocr.py
"""

from pdf_ocr import PDFOCR
import json
import os


def main():
    """Process input/rubric.pdf and print a preview of the JSON output."""
    # Initialize PDF processor
    processor = PDFOCR()

    # Input and output locations for this smoke test
    pdf_path = "input/rubric.pdf"
    output_path = "output/rubric_test.json"

    # Ensure the output directory exists before writing
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Process the PDF and persist the structured result
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read the output back for the preview
    with open(output_path, 'r') as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")
    # Show at most the first three content entries, truncating long strings
    for key, value in list(result.get('ocr_contents', {}).items())[:3]:
        if isinstance(value, str) and len(value) > 100:
            print(f"  {key}: {value[:100]}...")
        else:
            print(f"  {key}: {value}")


if __name__ == "__main__":
    main()
+ main()