Spaces:
Running
Running
Upload historical-ocr v1.1
Browse files- .gitattributes +4 -0
- README.md +54 -5
- app.py +178 -0
- config.py +14 -0
- input/.DS_Store +0 -0
- input/baldwin-letter-1.jpg +3 -0
- input/baldwin-letter-2.jpg +3 -0
- input/magellan-travels.jpg +3 -0
- input/okeefe-menu.pdf +3 -0
- packages.txt +2 -0
- pdf_ocr.py +76 -0
- prepare_for_hf.py +50 -0
- requirements.txt +8 -0
- run_local.sh +25 -0
- setup_git.sh +35 -0
- simple_test.py +69 -0
- streamlit_app.py +168 -0
- structured_ocr.py +218 -0
- test_pdf.py +42 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
|
37 |
+
input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
|
38 |
+
input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
|
39 |
+
input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,14 +1,63 @@
|
|
1 |
---
|
2 |
-
title: Historical
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.43.2
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
-
short_description:
|
12 |
---
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: Historical OCR
|
3 |
+
emoji: 🚀
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: green
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.43.2
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
+
short_description: Employs Mistral OCR for transcribing historical data
|
12 |
---
|
13 |
|
14 |
+
# Historical Document OCR
|
15 |
+
|
16 |
+
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
|
17 |
+
|
18 |
+
## Features
|
19 |
+
|
20 |
+
- OCR processing for both image and PDF files
|
21 |
+
- Automatic file type detection
|
22 |
+
- Structured output generation using Mistral models
|
23 |
+
- Interactive web interface with Streamlit
|
24 |
+
- Supports historical documents and manuscripts
|
25 |
+
|
26 |
+
## Setup for Local Development
|
27 |
+
|
28 |
+
1. This directory is standalone and can be moved anywhere
|
29 |
+
2. Install dependencies:
|
30 |
+
```
|
31 |
+
pip install -r requirements.txt
|
32 |
+
```
|
33 |
+
3. Set up your Mistral API key:
|
34 |
+
- Option 1: Create a `.env` file in this directory and add your Mistral API key:
|
35 |
+
```
|
36 |
+
MISTRAL_API_KEY=your_api_key_here
|
37 |
+
```
|
38 |
+
- Option 2: Set the `MISTRAL_API_KEY` environment variable directly:
|
39 |
+
```
|
40 |
+
export MISTRAL_API_KEY=your_api_key_here
|
41 |
+
```
|
42 |
+
- Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
|
43 |
+
4. Run the Streamlit app using the script:
|
44 |
+
```
|
45 |
+
./run_local.sh
|
46 |
+
```
|
47 |
+
Or directly:
|
48 |
+
```
|
49 |
+
streamlit run app.py
|
50 |
+
```
|
51 |
+
|
52 |
+
## Usage
|
53 |
+
|
54 |
+
1. Upload an image or PDF file using the file uploader
|
55 |
+
2. Select processing options in the sidebar (e.g., use vision model)
|
56 |
+
3. Click "Process Document" to analyze the file
|
57 |
+
4. View the structured results and extract information
|
58 |
+
|
59 |
+
## Deployment on Hugging Face Spaces
|
60 |
+
|
61 |
+
This app is designed to be deployed on Hugging Face Spaces. The `README.md` contains the necessary configuration metadata.
|
62 |
+
|
63 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
import json
|
4 |
+
import sys
|
5 |
+
from pathlib import Path
|
6 |
+
import tempfile
|
7 |
+
|
8 |
+
# Import the StructuredOCR class and config from the local files
|
9 |
+
from structured_ocr import StructuredOCR
|
10 |
+
from config import MISTRAL_API_KEY
|
11 |
+
|
12 |
+
# Set page configuration
|
13 |
+
st.set_page_config(
|
14 |
+
page_title="Historical OCR",
|
15 |
+
page_icon="🚀",
|
16 |
+
layout="wide",
|
17 |
+
initial_sidebar_state="expanded"
|
18 |
+
)
|
19 |
+
|
20 |
+
# Define functions
|
21 |
+
def process_file(uploaded_file, use_vision=True):
|
22 |
+
"""Process the uploaded file and return the OCR results"""
|
23 |
+
# Save the uploaded file to a temporary file
|
24 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
|
25 |
+
tmp.write(uploaded_file.getvalue())
|
26 |
+
temp_path = tmp.name
|
27 |
+
|
28 |
+
try:
|
29 |
+
# Check if API key is available
|
30 |
+
if not MISTRAL_API_KEY:
|
31 |
+
# Return dummy data if no API key
|
32 |
+
return {
|
33 |
+
"file_name": uploaded_file.name,
|
34 |
+
"topics": ["Sample Document"],
|
35 |
+
"languages": ["English"],
|
36 |
+
"ocr_contents": {
|
37 |
+
"title": "Sample Document",
|
38 |
+
"content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
|
39 |
+
}
|
40 |
+
}
|
41 |
+
|
42 |
+
# Initialize OCR processor
|
43 |
+
processor = StructuredOCR()
|
44 |
+
|
45 |
+
# Determine file type from extension
|
46 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
47 |
+
file_type = "pdf" if file_ext == ".pdf" else "image"
|
48 |
+
|
49 |
+
# Process the file
|
50 |
+
result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision)
|
51 |
+
return result
|
52 |
+
finally:
|
53 |
+
# Clean up the temporary file
|
54 |
+
if os.path.exists(temp_path):
|
55 |
+
os.unlink(temp_path)
|
56 |
+
|
57 |
+
# App title and description
|
58 |
+
st.title("Historical Document OCR")
|
59 |
+
st.subheader("Powered by Mistral AI")
|
60 |
+
st.markdown("""
|
61 |
+
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
|
62 |
+
Upload an image or PDF file to get started.
|
63 |
+
""")
|
64 |
+
|
65 |
+
# Sidebar with options
|
66 |
+
with st.sidebar:
|
67 |
+
st.header("Options")
|
68 |
+
use_vision = st.checkbox("Use Vision Model", value=True,
|
69 |
+
help="For image files, use the vision model for improved analysis (may be slower)")
|
70 |
+
|
71 |
+
st.markdown("---")
|
72 |
+
st.subheader("About")
|
73 |
+
st.markdown("""
|
74 |
+
This app uses Mistral AI's OCR API to extract text from historical documents.
|
75 |
+
|
76 |
+
It can process:
|
77 |
+
- Image files (jpg, png, etc.)
|
78 |
+
- PDF documents
|
79 |
+
|
80 |
+
The extracted content is processed into structured data based on the document type.
|
81 |
+
""")
|
82 |
+
|
83 |
+
# File uploader
|
84 |
+
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
|
85 |
+
|
86 |
+
if uploaded_file is not None:
|
87 |
+
# Display the uploaded file
|
88 |
+
st.subheader("Uploaded Document")
|
89 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
90 |
+
|
91 |
+
col1, col2 = st.columns([1, 1])
|
92 |
+
|
93 |
+
with col1:
|
94 |
+
if file_ext == ".pdf":
|
95 |
+
st.info("Processing PDF document...")
|
96 |
+
# For PDFs, you might show preview of first page
|
97 |
+
st.write(f"File: {uploaded_file.name}")
|
98 |
+
else:
|
99 |
+
st.image(uploaded_file, use_container_width=True)
|
100 |
+
|
101 |
+
# Process button
|
102 |
+
process_button = st.button("Process Document")
|
103 |
+
|
104 |
+
if process_button:
|
105 |
+
with st.spinner("Processing document..."):
|
106 |
+
try:
|
107 |
+
# Process the file
|
108 |
+
result = process_file(uploaded_file, use_vision)
|
109 |
+
|
110 |
+
# Display the results
|
111 |
+
with col2:
|
112 |
+
st.subheader("Extracted Information")
|
113 |
+
|
114 |
+
# Display file info
|
115 |
+
st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
|
116 |
+
|
117 |
+
# Display languages if available
|
118 |
+
if 'languages' in result:
|
119 |
+
languages = [lang for lang in result['languages'] if lang is not None]
|
120 |
+
if languages:
|
121 |
+
st.write(f"**Languages Detected:** {', '.join(languages)}")
|
122 |
+
|
123 |
+
# Display topics if available
|
124 |
+
if 'topics' in result and result['topics']:
|
125 |
+
st.write(f"**Topics:** {', '.join(result['topics'])}")
|
126 |
+
|
127 |
+
# Display the OCR contents
|
128 |
+
st.subheader("Document Contents")
|
129 |
+
if 'ocr_contents' in result:
|
130 |
+
# Create tabs for different views
|
131 |
+
tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
|
132 |
+
|
133 |
+
with tab1:
|
134 |
+
# Display in a more user-friendly format based on the content structure
|
135 |
+
if isinstance(result['ocr_contents'], dict):
|
136 |
+
for section, content in result['ocr_contents'].items():
|
137 |
+
if content: # Only display non-empty sections
|
138 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
139 |
+
|
140 |
+
if isinstance(content, str):
|
141 |
+
st.markdown(content)
|
142 |
+
elif isinstance(content, list):
|
143 |
+
for item in content:
|
144 |
+
if isinstance(item, str):
|
145 |
+
st.markdown(f"- {item}")
|
146 |
+
elif isinstance(item, dict):
|
147 |
+
st.json(item)
|
148 |
+
elif isinstance(content, dict):
|
149 |
+
for k, v in content.items():
|
150 |
+
st.markdown(f"**{k}:** {v}")
|
151 |
+
|
152 |
+
with tab2:
|
153 |
+
# Show the raw JSON for developers
|
154 |
+
st.json(result)
|
155 |
+
else:
|
156 |
+
st.error("No OCR content was extracted from the document.")
|
157 |
+
|
158 |
+
except Exception as e:
|
159 |
+
st.error(f"Error processing document: {str(e)}")
|
160 |
+
else:
|
161 |
+
# Display sample images when no file is uploaded
|
162 |
+
st.info("Upload a document to get started.")
|
163 |
+
|
164 |
+
# Show example images
|
165 |
+
st.subheader("Example Documents")
|
166 |
+
col1, col2, col3 = st.columns([1, 1, 1])
|
167 |
+
|
168 |
+
# Find sample images from the input directory to display
|
169 |
+
input_dir = Path(__file__).parent / "input"
|
170 |
+
sample_images = []
|
171 |
+
if input_dir.exists():
|
172 |
+
sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
|
173 |
+
|
174 |
+
if sample_images:
|
175 |
+
for i, img_path in enumerate(sample_images):
|
176 |
+
col = [col1, col2, col3][i % 3]
|
177 |
+
with col:
|
178 |
+
st.image(str(img_path), caption=img_path.name, use_container_width=True)
|
config.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# config.py
|
2 |
+
"""
|
3 |
+
Configuration file for Mistral OCR processing.
|
4 |
+
Contains API key and other settings.
|
5 |
+
"""
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Your Mistral API key - get from environment variable
|
9 |
+
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
|
10 |
+
|
11 |
+
# Model settings
|
12 |
+
OCR_MODEL = "mistral-ocr-latest"
|
13 |
+
TEXT_MODEL = "ministral-8b-latest"
|
14 |
+
VISION_MODEL = "pixtral-12b-latest"
|
input/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
input/baldwin-letter-1.jpg
ADDED
![]() |
Git LFS Details
|
input/baldwin-letter-2.jpg
ADDED
![]() |
Git LFS Details
|
input/magellan-travels.jpg
ADDED
![]() |
Git LFS Details
|
input/okeefe-menu.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
|
3 |
+
size 2554815
|
packages.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
tesseract-ocr
|
2 |
+
poppler-utils
|
pdf_ocr.py
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import json
|
7 |
+
from pathlib import Path
|
8 |
+
from structured_ocr import StructuredOCR
|
9 |
+
|
10 |
+
class PDFOCR:
|
11 |
+
"""Class for processing PDF files with OCR and extracting structured data."""
|
12 |
+
|
13 |
+
def __init__(self, api_key=None):
|
14 |
+
"""Initialize the PDF OCR processor."""
|
15 |
+
self.processor = StructuredOCR(api_key=api_key)
|
16 |
+
|
17 |
+
def process_pdf(self, pdf_path, use_vision=True):
|
18 |
+
"""
|
19 |
+
Process a PDF file with OCR and extract structured data.
|
20 |
+
|
21 |
+
Args:
|
22 |
+
pdf_path: Path to the PDF file
|
23 |
+
use_vision: Whether to use vision model for improved analysis
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
Dictionary with structured OCR results
|
27 |
+
"""
|
28 |
+
pdf_path = Path(pdf_path)
|
29 |
+
if not pdf_path.exists():
|
30 |
+
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
|
31 |
+
|
32 |
+
return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
|
33 |
+
|
34 |
+
def save_json_output(self, pdf_path, output_path, use_vision=True):
|
35 |
+
"""
|
36 |
+
Process a PDF file and save the structured output as JSON.
|
37 |
+
|
38 |
+
Args:
|
39 |
+
pdf_path: Path to the PDF file
|
40 |
+
output_path: Path where to save the JSON output
|
41 |
+
use_vision: Whether to use vision model for improved analysis
|
42 |
+
|
43 |
+
Returns:
|
44 |
+
Path to the saved JSON file
|
45 |
+
"""
|
46 |
+
# Process the PDF
|
47 |
+
result = self.process_pdf(pdf_path, use_vision=use_vision)
|
48 |
+
|
49 |
+
# Save the result to JSON
|
50 |
+
output_path = Path(output_path)
|
51 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
52 |
+
|
53 |
+
with open(output_path, 'w') as f:
|
54 |
+
json.dump(result, f, indent=2)
|
55 |
+
|
56 |
+
return output_path
|
57 |
+
|
58 |
+
# For testing directly
|
59 |
+
if __name__ == "__main__":
|
60 |
+
import sys
|
61 |
+
|
62 |
+
if len(sys.argv) < 2:
|
63 |
+
print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
|
64 |
+
sys.exit(1)
|
65 |
+
|
66 |
+
pdf_path = sys.argv[1]
|
67 |
+
output_path = sys.argv[2] if len(sys.argv) > 2 else None
|
68 |
+
|
69 |
+
processor = PDFOCR()
|
70 |
+
|
71 |
+
if output_path:
|
72 |
+
result_path = processor.save_json_output(pdf_path, output_path)
|
73 |
+
print(f"Results saved to: {result_path}")
|
74 |
+
else:
|
75 |
+
result = processor.process_pdf(pdf_path)
|
76 |
+
print(json.dumps(result, indent=2))
|
prepare_for_hf.py
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Prepare the repository for Hugging Face Spaces deployment.
|
4 |
+
This script:
|
5 |
+
1. Creates a requirements.txt file with only the necessary dependencies
|
6 |
+
2. Ensures app.py is ready for HF deployment
|
7 |
+
3. Makes sure all configuration files are properly set up
|
8 |
+
"""
|
9 |
+
|
10 |
+
import os
|
11 |
+
import shutil
|
12 |
+
import sys
|
13 |
+
|
14 |
+
def main():
|
15 |
+
print("Preparing repository for Hugging Face Spaces deployment...")
|
16 |
+
|
17 |
+
# Make sure output directory exists
|
18 |
+
if not os.path.exists("output"):
|
19 |
+
os.makedirs("output")
|
20 |
+
print("Created output directory")
|
21 |
+
|
22 |
+
# Clean up unnecessary files
|
23 |
+
files_to_remove = [".env", ".env.example", ".git"]
|
24 |
+
for file in files_to_remove:
|
25 |
+
if os.path.exists(file):
|
26 |
+
if os.path.isdir(file):
|
27 |
+
shutil.rmtree(file)
|
28 |
+
else:
|
29 |
+
os.remove(file)
|
30 |
+
print(f"Removed {file}")
|
31 |
+
|
32 |
+
# Check requirements.txt exists
|
33 |
+
if not os.path.exists("requirements.txt"):
|
34 |
+
print("ERROR: requirements.txt not found. Please create it before deploying.")
|
35 |
+
sys.exit(1)
|
36 |
+
|
37 |
+
# Make sure run_local.sh is executable
|
38 |
+
if os.path.exists("run_local.sh"):
|
39 |
+
os.chmod("run_local.sh", 0o755)
|
40 |
+
print("Made run_local.sh executable")
|
41 |
+
|
42 |
+
# Remove any large unnecessary files from input directory
|
43 |
+
# Keep only sample files that are needed for demos
|
44 |
+
print("NOTE: Large files in the input directory will be uploaded to Hugging Face.")
|
45 |
+
print("You may want to remove unnecessary files before deployment.")
|
46 |
+
|
47 |
+
print("Repository preparation complete!")
|
48 |
+
|
49 |
+
if __name__ == "__main__":
|
50 |
+
main()
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit>=1.43.2
|
2 |
+
mistralai
|
3 |
+
pydantic
|
4 |
+
pycountry
|
5 |
+
pillow
|
6 |
+
python-multipart
|
7 |
+
pdf2image
|
8 |
+
pytesseract
|
run_local.sh
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Run the Streamlit app locally
|
3 |
+
|
4 |
+
# No longer need to add parent directory to PYTHONPATH
|
5 |
+
# as we now have a local copy of structured_ocr.py
|
6 |
+
|
7 |
+
# Load environment variables from .env file if it exists
|
8 |
+
if [ -f .env ]; then
|
9 |
+
echo "Loading environment variables from .env file"
|
10 |
+
set -o allexport
|
11 |
+
source .env
|
12 |
+
set +o allexport
|
13 |
+
else
|
14 |
+
echo "No .env file found. Make sure to set MISTRAL_API_KEY environment variable manually."
|
15 |
+
fi
|
16 |
+
|
17 |
+
# Check if MISTRAL_API_KEY is set
|
18 |
+
if [ -z "$MISTRAL_API_KEY" ]; then
|
19 |
+
echo "WARNING: MISTRAL_API_KEY is not set. The app will run with sample data."
|
20 |
+
else
|
21 |
+
echo "MISTRAL_API_KEY is set. The app will use the Mistral API for OCR processing."
|
22 |
+
fi
|
23 |
+
|
24 |
+
# Run the Streamlit app
|
25 |
+
streamlit run app.py
|
setup_git.sh
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
# Setup git repository for Hugging Face Spaces
|
3 |
+
|
4 |
+
# Check if HF_TOKEN environment variable is set
|
5 |
+
if [ -z "$HF_TOKEN" ]; then
|
6 |
+
echo "Error: HF_TOKEN environment variable is not set."
|
7 |
+
echo "Please set it first with: export HF_TOKEN=your_hugging_face_token"
|
8 |
+
exit 1
|
9 |
+
fi
|
10 |
+
|
11 |
+
# Get your username
|
12 |
+
echo "Enter your Hugging Face username:"
|
13 |
+
read HF_USERNAME
|
14 |
+
|
15 |
+
# Get the space name
|
16 |
+
echo "Enter the name for your Hugging Face Space (e.g., historical-ocr):"
|
17 |
+
read HF_SPACE
|
18 |
+
|
19 |
+
# Prepare the files for deployment
|
20 |
+
echo "Preparing files for deployment..."
|
21 |
+
python3 prepare_for_hf.py
|
22 |
+
|
23 |
+
# Initialize git
|
24 |
+
git init
|
25 |
+
git add .
|
26 |
+
git commit -m "Initial commit"
|
27 |
+
|
28 |
+
# Create the repository on Hugging Face
|
29 |
+
echo "Creating and pushing to Hugging Face Space..."
|
30 |
+
git remote add origin https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE
|
31 |
+
huggingface-cli login --token $HF_TOKEN
|
32 |
+
git push -u origin main
|
33 |
+
|
34 |
+
echo "Deployment completed! Your app should be available at:"
|
35 |
+
echo "https://huggingface.co/spaces/$HF_USERNAME/$HF_SPACE"
|
simple_test.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Simple test script for structured_ocr.py
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
import sys
|
8 |
+
import json
|
9 |
+
from pathlib import Path
|
10 |
+
|
11 |
+
def main():
|
12 |
+
print("Testing OCR with a sample image file")
|
13 |
+
|
14 |
+
# Path to the sample image file
|
15 |
+
image_path = os.path.join("input", "recipe.jpg")
|
16 |
+
|
17 |
+
# Check if the file exists
|
18 |
+
if not os.path.isfile(image_path):
|
19 |
+
print(f"Error: Image file not found at {image_path}")
|
20 |
+
return
|
21 |
+
|
22 |
+
print(f"File found: {image_path}")
|
23 |
+
|
24 |
+
# Create the output directory if it doesn't exist
|
25 |
+
output_dir = "output"
|
26 |
+
os.makedirs(output_dir, exist_ok=True)
|
27 |
+
|
28 |
+
output_path = os.path.join(output_dir, "recipe_test.json")
|
29 |
+
|
30 |
+
# Import the StructuredOCR class
|
31 |
+
from structured_ocr import StructuredOCR
|
32 |
+
|
33 |
+
# Initialize OCR processor
|
34 |
+
processor = StructuredOCR()
|
35 |
+
|
36 |
+
try:
|
37 |
+
# Process the image file
|
38 |
+
print(f"Processing image file: {image_path}")
|
39 |
+
result = processor.process_file(image_path, file_type="image")
|
40 |
+
|
41 |
+
# Save the result to the output file
|
42 |
+
with open(output_path, 'w') as f:
|
43 |
+
json.dump(result, f, indent=2)
|
44 |
+
|
45 |
+
print(f"Image processing completed successfully. Output saved to {output_path}")
|
46 |
+
|
47 |
+
# Check if the output file exists
|
48 |
+
if os.path.isfile(output_path):
|
49 |
+
print(f"Output file exists at {output_path}")
|
50 |
+
# Print the file size
|
51 |
+
file_size = os.path.getsize(output_path)
|
52 |
+
print(f"Output file size: {file_size} bytes")
|
53 |
+
|
54 |
+
# Print a preview of the output file
|
55 |
+
print("\nPreview of output file:")
|
56 |
+
with open(output_path, 'r') as f:
|
57 |
+
data = json.load(f)
|
58 |
+
print(f"File name: {data.get('file_name', '')}")
|
59 |
+
print(f"Topics: {', '.join(data.get('topics', []))}")
|
60 |
+
print(f"Languages: {', '.join(data.get('languages', []))}")
|
61 |
+
print("OCR contents keys:", list(data.get('ocr_contents', {}).keys()))
|
62 |
+
else:
|
63 |
+
print(f"Error: Output file not found at {output_path}")
|
64 |
+
|
65 |
+
except Exception as e:
|
66 |
+
print(f"Error processing image: {e}")
|
67 |
+
|
68 |
+
if __name__ == "__main__":
|
69 |
+
main()
|
streamlit_app.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
import json
|
4 |
+
import sys
|
5 |
+
from pathlib import Path
|
6 |
+
import tempfile
|
7 |
+
|
8 |
+
# Add parent directory to path so we can import the OCR modules
|
9 |
+
parent_dir = Path(__file__).parent.parent.absolute()
|
10 |
+
sys.path.append(str(parent_dir))
|
11 |
+
|
12 |
+
# Import the StructuredOCR class from the parent directory
|
13 |
+
from structured_ocr import StructuredOCR
|
14 |
+
|
15 |
+
# Set page configuration
|
16 |
+
st.set_page_config(
|
17 |
+
page_title="Historical OCR",
|
18 |
+
page_icon="🚀",
|
19 |
+
layout="wide",
|
20 |
+
initial_sidebar_state="expanded"
|
21 |
+
)
|
22 |
+
|
23 |
+
# Define functions
|
24 |
+
def process_file(uploaded_file, use_vision=True):
|
25 |
+
"""Process the uploaded file and return the OCR results"""
|
26 |
+
# Save the uploaded file to a temporary file
|
27 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
|
28 |
+
tmp.write(uploaded_file.getvalue())
|
29 |
+
temp_path = tmp.name
|
30 |
+
|
31 |
+
try:
|
32 |
+
# Initialize OCR processor
|
33 |
+
processor = StructuredOCR()
|
34 |
+
|
35 |
+
# Determine file type from extension
|
36 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
37 |
+
file_type = "pdf" if file_ext == ".pdf" else "image"
|
38 |
+
|
39 |
+
# Process the file
|
40 |
+
result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision)
|
41 |
+
return result
|
42 |
+
finally:
|
43 |
+
# Clean up the temporary file
|
44 |
+
if os.path.exists(temp_path):
|
45 |
+
os.unlink(temp_path)
|
46 |
+
|
47 |
+
# App title and description
|
48 |
+
st.title("Historical Document OCR")
|
49 |
+
st.subheader("Powered by Mistral AI")
|
50 |
+
st.markdown("""
|
51 |
+
This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
|
52 |
+
Upload an image or PDF file to get started.
|
53 |
+
""")
|
54 |
+
|
55 |
+
# Sidebar with options
|
56 |
+
with st.sidebar:
|
57 |
+
st.header("Options")
|
58 |
+
use_vision = st.checkbox("Use Vision Model", value=True,
|
59 |
+
help="For image files, use the vision model for improved analysis (may be slower)")
|
60 |
+
|
61 |
+
st.markdown("---")
|
62 |
+
st.subheader("About")
|
63 |
+
st.markdown("""
|
64 |
+
This app uses Mistral AI's OCR API to extract text from historical documents.
|
65 |
+
|
66 |
+
It can process:
|
67 |
+
- Image files (jpg, png, etc.)
|
68 |
+
- PDF documents
|
69 |
+
|
70 |
+
The extracted content is processed into structured data based on the document type.
|
71 |
+
""")
|
72 |
+
|
73 |
+
# File uploader
|
74 |
+
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
|
75 |
+
|
76 |
+
if uploaded_file is not None:
|
77 |
+
# Display the uploaded file
|
78 |
+
st.subheader("Uploaded Document")
|
79 |
+
file_ext = Path(uploaded_file.name).suffix.lower()
|
80 |
+
|
81 |
+
col1, col2 = st.columns([1, 1])
|
82 |
+
|
83 |
+
with col1:
|
84 |
+
if file_ext == ".pdf":
|
85 |
+
st.info("Processing PDF document...")
|
86 |
+
# For PDFs, you might show preview of first page
|
87 |
+
st.write(f"File: {uploaded_file.name}")
|
88 |
+
else:
|
89 |
+
st.image(uploaded_file, use_column_width=True)
|
90 |
+
|
91 |
+
# Process button
|
92 |
+
process_button = st.button("Process Document")
|
93 |
+
|
94 |
+
if process_button:
|
95 |
+
with st.spinner("Processing document..."):
|
96 |
+
try:
|
97 |
+
# Process the file
|
98 |
+
result = process_file(uploaded_file, use_vision)
|
99 |
+
|
100 |
+
# Display the results
|
101 |
+
with col2:
|
102 |
+
st.subheader("Extracted Information")
|
103 |
+
|
104 |
+
# Display file info
|
105 |
+
st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
|
106 |
+
|
107 |
+
# Display languages if available
|
108 |
+
if 'languages' in result:
|
109 |
+
languages = [lang for lang in result['languages'] if lang is not None]
|
110 |
+
if languages:
|
111 |
+
st.write(f"**Languages Detected:** {', '.join(languages)}")
|
112 |
+
|
113 |
+
# Display topics if available
|
114 |
+
if 'topics' in result and result['topics']:
|
115 |
+
st.write(f"**Topics:** {', '.join(result['topics'])}")
|
116 |
+
|
117 |
+
# Display the OCR contents
|
118 |
+
st.subheader("Document Contents")
|
119 |
+
if 'ocr_contents' in result:
|
120 |
+
# Create tabs for different views
|
121 |
+
tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
|
122 |
+
|
123 |
+
with tab1:
|
124 |
+
# Display in a more user-friendly format based on the content structure
|
125 |
+
if isinstance(result['ocr_contents'], dict):
|
126 |
+
for section, content in result['ocr_contents'].items():
|
127 |
+
if content: # Only display non-empty sections
|
128 |
+
st.markdown(f"#### {section.replace('_', ' ').title()}")
|
129 |
+
|
130 |
+
if isinstance(content, str):
|
131 |
+
st.markdown(content)
|
132 |
+
elif isinstance(content, list):
|
133 |
+
for item in content:
|
134 |
+
if isinstance(item, str):
|
135 |
+
st.markdown(f"- {item}")
|
136 |
+
elif isinstance(item, dict):
|
137 |
+
st.json(item)
|
138 |
+
elif isinstance(content, dict):
|
139 |
+
for k, v in content.items():
|
140 |
+
st.markdown(f"**{k}:** {v}")
|
141 |
+
|
142 |
+
with tab2:
|
143 |
+
# Show the raw JSON for developers
|
144 |
+
st.json(result)
|
145 |
+
else:
|
146 |
+
st.error("No OCR content was extracted from the document.")
|
147 |
+
|
148 |
+
except Exception as e:
|
149 |
+
st.error(f"Error processing document: {str(e)}")
|
150 |
+
else:
|
151 |
+
# Display sample images when no file is uploaded
|
152 |
+
st.info("Upload a document to get started.")
|
153 |
+
|
154 |
+
# Show example images
|
155 |
+
st.subheader("Example Documents")
|
156 |
+
col1, col2, col3 = st.columns([1, 1, 1])
|
157 |
+
|
158 |
+
# Find sample images from the input directory to display
|
159 |
+
input_dir = parent_dir / "input"
|
160 |
+
sample_images = []
|
161 |
+
if input_dir.exists():
|
162 |
+
sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
|
163 |
+
|
164 |
+
if sample_images:
|
165 |
+
for i, img_path in enumerate(sample_images):
|
166 |
+
col = [col1, col2, col3][i % 3]
|
167 |
+
with col:
|
168 |
+
st.image(str(img_path), caption=img_path.name, use_column_width=True)
|
structured_ocr.py
ADDED
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
from enum import Enum
|
4 |
+
from pathlib import Path
|
5 |
+
import json
|
6 |
+
import base64
|
7 |
+
import pycountry
|
8 |
+
from pydantic import BaseModel
|
9 |
+
from mistralai import Mistral
|
10 |
+
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
|
11 |
+
|
12 |
+
# Import config directly (now local to historical-ocr)
|
13 |
+
from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
|
14 |
+
|
15 |
+
# Create language enum for structured output
|
16 |
+
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
|
17 |
+
|
18 |
+
class LanguageMeta(Enum.__class__):
|
19 |
+
def __new__(metacls, cls, bases, classdict):
|
20 |
+
for code, name in languages.items():
|
21 |
+
classdict[name.upper().replace(' ', '_')] = name
|
22 |
+
return super().__new__(metacls, cls, bases, classdict)
|
23 |
+
|
24 |
+
class Language(Enum, metaclass=LanguageMeta):
|
25 |
+
pass
|
26 |
+
|
27 |
+
class StructuredOCRModel(BaseModel):
    """Pydantic schema the chat model is asked to fill for each document."""
    # Name of the processed source file
    file_name: str
    # High-level topics the model detected in the document
    topics: list[str]
    # Languages present in the document (Language enum members)
    languages: list[Language]
    # OCR text reorganized by the model into a sensible dictionary
    ocr_contents: dict
|
32 |
+
|
33 |
+
class StructuredOCR:
    """Run Mistral OCR on PDFs/images and extract structured metadata.

    Pipeline: upload/encode the document, OCR it with OCR_MODEL, then ask a
    chat model (vision or text-only) to reshape the OCR markdown into the
    StructuredOCRModel schema (file name, topics, languages, contents).
    """

    def __init__(self, api_key=None):
        """Initialize the OCR processor.

        Args:
            api_key: Mistral API key; falls back to MISTRAL_API_KEY from config.
        """
        self.api_key = api_key or MISTRAL_API_KEY
        self.client = Mistral(api_key=self.api_key)

    def process_file(self, file_path, file_type=None, use_vision=True):
        """Process a file and return structured OCR results.

        Args:
            file_path: Path to the file to process (str or Path).
            file_type: 'pdf' or 'image' (auto-detected from suffix if None).
            use_vision: Whether to use the vision model for improved analysis.

        Returns:
            Dictionary with structured OCR results.
        """
        # Convert file_path to Path object if it's a string.
        file_path = Path(file_path)

        # Auto-detect the file type from the suffix when not given explicitly.
        if file_type is None:
            file_type = "pdf" if file_path.suffix.lower() == ".pdf" else "image"

        if file_type == "pdf":
            return self._process_pdf(file_path, use_vision)
        return self._process_image(file_path, use_vision)

    @staticmethod
    def _parsed_to_dict(parsed):
        """Convert a parsed pydantic response object into a plain dict.

        Supports both pydantic v2 (``model_dump_json``) and v1 (``json``),
        since the installed mistralai version determines which is available
        and ``.json()`` is deprecated under pydantic v2.
        """
        if hasattr(parsed, "model_dump_json"):
            return json.loads(parsed.model_dump_json())
        return json.loads(parsed.json())

    def _process_pdf(self, file_path, use_vision=True):
        """Process a PDF file with OCR and return structured results."""
        # Upload the PDF so the OCR endpoint can fetch it by URL.
        uploaded_file = self.client.files.upload(
            file={
                "file_name": file_path.stem,
                "content": file_path.read_bytes(),
            },
            purpose="ocr",
        )

        # Short-lived signed URL for the uploaded file; only needed for the
        # single OCR call below.
        signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

        # Run OCR over the whole PDF, keeping page images so the vision
        # model can look at the first page.
        pdf_response = self.client.ocr.process(
            document=DocumentURLChunk(document_url=signed_url.url),
            model=OCR_MODEL,
            include_image_base64=True
        )

        # Combine all pages' markdown into a single string.
        all_markdown = "\n\n".join(page.markdown for page in pdf_response.pages)

        # Prefer the vision model when a rendered first page is available.
        first_page_image = None
        if use_vision and pdf_response.pages and pdf_response.pages[0].images:
            first_page_image = pdf_response.pages[0].images[0].image_base64

        if first_page_image:
            return self._extract_structured_data_with_vision(
                first_page_image, all_markdown, file_path.name
            )
        # Fall back to the text-only model when vision is disabled or no
        # page image came back.
        return self._extract_structured_data_text_only(all_markdown, file_path.name)

    def _process_image(self, file_path, use_vision=True):
        """Process an image file with OCR and return structured results."""
        import mimetypes

        # Detect the actual MIME type instead of assuming JPEG, so PNG/TIFF
        # uploads produce a correctly labeled data URL. Unknown or non-image
        # types keep the previous behavior (image/jpeg) as a fallback.
        mime_type, _ = mimetypes.guess_type(file_path.name)
        if mime_type is None or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        # Read and encode the image file as a base64 data URL.
        encoded_image = base64.b64encode(file_path.read_bytes()).decode()
        base64_data_url = f"data:{mime_type};base64,{encoded_image}"

        # Process the image with OCR.
        image_response = self.client.ocr.process(
            document=ImageURLChunk(image_url=base64_data_url),
            model=OCR_MODEL
        )

        # Single images come back as one page; guard against empty responses.
        image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""

        if use_vision:
            return self._extract_structured_data_with_vision(
                base64_data_url, image_ocr_markdown, file_path.name
            )
        return self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)

    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
        """Extract structured data using the vision model.

        Falls back to the text-only model if the vision call fails for any
        reason (network, model, or parsing errors).
        """
        try:
            # Parse with vision model: the image plus the OCR markdown.
            chat_response = self.client.chat.parse(
                model=VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            ImageURLChunk(image_url=image_base64),
                            TextChunk(text=(
                                f"This is a historical document's OCR in markdown:\n"
                                f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
                                f"Extract topics, languages, and organize the content logically."
                            ))
                        ],
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Convert the parsed pydantic object to a plain dictionary.
            result = self._parsed_to_dict(chat_response.choices[0].message.parsed)

            # Ensure languages is a list of strings, not Language enum objects.
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Fall back to text-only model if vision model fails.
            print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
            result = self._extract_structured_data_text_only(ocr_markdown, filename)

        return result

    def _extract_structured_data_text_only(self, ocr_markdown, filename):
        """Extract structured data using the text-only model.

        On failure, returns a minimal result dict that still carries the raw
        OCR markdown so callers always get usable content.
        """
        try:
            # Parse with text-only model.
            chat_response = self.client.chat.parse(
                model=TEXT_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"This is a historical document's OCR in markdown:\n"
                                   f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                   f"Convert this into a structured JSON response with the OCR contents. "
                                   f"Extract topics, languages, and organize the content logically."
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Convert the parsed pydantic object to a plain dictionary.
            result = self._parsed_to_dict(chat_response.choices[0].message.parsed)

            # Ensure languages is a list of strings, not Language enum objects.
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Create a basic result if parsing fails.
            print(f"Text model failed: {str(e)}. Creating basic result.")
            result = {
                "file_name": filename,
                "topics": ["Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "raw_text": ocr_markdown
                }
            }

        return result
|
205 |
+
|
206 |
+
# For testing directly: python structured_ocr.py <file_path>
if __name__ == "__main__":
    # `sys` and `json` are already imported at module top; the previous
    # redundant `import sys` here has been removed.
    if len(sys.argv) < 2:
        print("Usage: python structured_ocr.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    processor = StructuredOCR()
    result = processor.process_file(file_path)

    print(json.dumps(result, indent=2))
|
test_pdf.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Test script for pdf_ocr.py
|
4 |
+
"""
|
5 |
+
|
6 |
+
from pdf_ocr import PDFOCR
|
7 |
+
import json
|
8 |
+
import os
|
9 |
+
|
10 |
+
def main(pdf_path="input/rubric.pdf", output_path="output/rubric_test.json"):
    """Run PDFOCR over a PDF and print a short preview of the JSON output.

    Args:
        pdf_path: PDF to process. The default preserves the original
            hard-coded test file, but the fixture may be absent from the
            repo, so callers can now point at an existing PDF instead.
        output_path: Where to write the JSON result.
    """
    # Initialize PDF processor.
    processor = PDFOCR()

    # Fail early with a clear message instead of a deep traceback from the
    # OCR client when the input fixture is missing.
    if not os.path.exists(pdf_path):
        print(f"Error: input PDF not found: {pdf_path}")
        return

    # Create output directory if it doesn't exist (skip when the path has
    # no directory component — makedirs("") would raise).
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Process PDF and save output.
    print(f"Processing PDF: {pdf_path}")
    processor.save_json_output(pdf_path, output_path)
    print(f"Output saved to: {output_path}")

    # Read back the output and print a preview.
    with open(output_path, 'r') as f:
        result = json.load(f)

    print("\nOutput preview:")
    print(f"File name: {result.get('file_name')}")
    print(f"Topics: {result.get('topics')}")
    print(f"Languages: {result.get('languages')}")
    print("OCR contents preview (first few keys):")
    ocr_contents = result.get('ocr_contents', {})
    for i, (key, value) in enumerate(ocr_contents.items()):
        if i >= 3:  # Only show first 3 keys
            break
        # Truncate long string values so the preview stays readable.
        print(f"  {key}: {value[:100]}..." if isinstance(value, str) and len(value) > 100 else f"  {key}: {value}")
|
40 |
+
|
41 |
+
# Script entry point: run the PDF OCR smoke test.
if __name__ == "__main__":
    main()
|