submit pull for merge (#1)
Commit 85bdb4e2f53788772ec789554db9a859d4a957e8
This view is limited to 50 files because it contains too many changes.
- .gitattributes +17 -0
- README.md +91 -8
- __pycache__/config.cpython-312.pyc +0 -0
- __pycache__/ocr_utils.cpython-312.pyc +0 -0
- __pycache__/process_file.cpython-312.pyc +0 -0
- __pycache__/structured_ocr.cpython-312.pyc +0 -0
- app.py +525 -102
- backup/app.py +535 -0
- backup/config.py +17 -0
- backup/input/The Magician, or Bottle Cungerer.jpeg +3 -0
- backup/input/baldwin-letter-1.jpg +3 -0
- backup/input/baldwin-letter-2.jpg +3 -0
- backup/input/flier.png +0 -0
- backup/input/letter-1.jpg +3 -0
- backup/input/letter-2.jpg +3 -0
- backup/input/letter-3.jpg +3 -0
- backup/input/magellan-travels.jpg +3 -0
- backup/input/menu.pdf +3 -0
- backup/input/recipe.jpg +0 -0
- backup/ocr_utils.py +136 -0
- backup/pdf_ocr.py +76 -0
- backup/requirements.txt +10 -0
- backup/structured_ocr.py +414 -0
- config.py +6 -3
- input/The Magician, or Bottle Cungerer.jpeg +3 -0
- input/a-la-carte.pdf +3 -0
- input/flier.png +0 -0
- input/handwritten-letter.jpg +3 -0
- input/letter-1.jpg +3 -0
- input/letter-2.jpg +3 -0
- input/letter-3.jpg +3 -0
- input/magician-satire.jpg +3 -0
- input/menu.pdf +3 -0
- input/milgram-flier.png +0 -0
- input/okeefe-recipe.jpg +0 -0
- input/recipe.jpg +0 -0
- modules/content/__init__.py +36 -0
- modules/content/module1.py +85 -0
- modules/content/module2.py +88 -0
- modules/content/module3.py +106 -0
- modules/content/module4.py +124 -0
- modules/content/module5.py +547 -0
- modules/content/module6.py +154 -0
- modules/educational_module.py +547 -0
- modules/modular_app.py +276 -0
- ocr_utils.py +212 -0
- output/.gitkeep +0 -0
- output/example-1.html +0 -0
- output/recipe_test.json +16 -0
- output/ymca-letter.jpg +3 -0
.gitattributes
CHANGED
@@ -37,3 +37,20 @@ input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
 input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
 input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
 input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
+backup/input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/menu.pdf filter=lfs diff=lfs merge=lfs -text
+backup/input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
+input/a-la-carte.pdf filter=lfs diff=lfs merge=lfs -text
+input/handwritten-letter.jpg filter=lfs diff=lfs merge=lfs -text
+input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
+input/magician-satire.jpg filter=lfs diff=lfs merge=lfs -text
+input/menu.pdf filter=lfs diff=lfs merge=lfs -text
+input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
+output/ymca-letter.jpg filter=lfs diff=lfs merge=lfs -text
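Note: the `[[:space:]]` sequences above are how `git lfs track` escapes spaces in tracked paths. A quick post-clone sanity check is to look for files that are still LFS pointer stubs rather than real content; pointer files begin with a fixed version header. The script below is illustrative only and is not part of this PR — only the `input/` directory name is taken from the repo.

```python
from pathlib import Path

# Git LFS pointer files are small text stubs that begin with this header.
LFS_POINTER_HEADER = b"version https://git-lfs.github.com/spec/v1"

def find_lfs_pointers(root: str = ".") -> list[Path]:
    """Return files that are still LFS pointer stubs (content not yet pulled)."""
    pointers = []
    for path in Path(root).rglob("*"):
        # Stubs are ~130 bytes, so skip anything larger before reading.
        if path.is_file() and path.stat().st_size < 200:
            with open(path, "rb") as f:
                if f.read(len(LFS_POINTER_HEADER)) == LFS_POINTER_HEADER:
                    pointers.append(path)
    return pointers

if __name__ == "__main__":
    for p in find_lfs_pointers("input"):
        print(f"{p} is an LFS pointer - run 'git lfs pull' to fetch it")
```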
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Historical OCR
-emoji:
+emoji: 📜
 colorFrom: red
 colorTo: green
 sdk: streamlit
@@ -22,15 +22,78 @@ This application uses Mistral AI's OCR capabilities to transcribe and extract in
 - Structured output generation using Mistral models
 - Interactive web interface with Streamlit
 - Supports historical documents and manuscripts
+- PDF preview functionality for better user experience
+- Smart handling of large PDFs with automatic page limiting
+- Robust error handling with helpful messages
+- Image preprocessing options for enhanced OCR accuracy
+
+## Project Structure
+
+The project is organized as follows:
+
+```
+Historical OCR - Project Structure
+
+┌─ Main Applications
+│  ├─ app.py              # Standard Streamlit interface for OCR processing
+│  └─ streamlit_app.py    # Educational modular version with learning components
+│
+├─ Core Functionality
+│  ├─ structured_ocr.py   # Main OCR processing engine with Mistral AI integration
+│  ├─ ocr_utils.py        # Utility functions for OCR text and image processing
+│  ├─ pdf_ocr.py          # PDF-specific document processing functionality
+│  └─ config.py           # Configuration settings and API keys
+│
+├─ Testing & Development
+│  ├─ simple_test.py      # Basic OCR functionality test
+│  ├─ test_pdf.py         # PDF processing test
+│  ├─ test_pdf_preview.py # PDF preview generation test
+│  └─ prepare_for_hf.py   # Prepare project for Hugging Face deployment
+│
+├─ Scripts
+│  ├─ run_local.sh        # Launch standard or educational app locally
+│  ├─ run_large_files.sh  # Process large documents with optimized settings
+│  └─ setup_git.sh        # Configure Git repositories
+│
+├─ Educational Modules (streamlit/)
+│  ├─ modules/
+│  │  ├─ module1.py       # Introduction and Problematization
+│  │  ├─ module2.py       # Historical Typography & OCR Challenges
+│  │  ├─ module3.py       # Document Analysis Techniques
+│  │  ├─ module4.py       # Processing Methods
+│  │  ├─ module5.py       # Research Applications
+│  │  └─ module6.py       # Future Directions
+│  │
+│  ├─ modular_app.py      # Learning module framework
+│  ├─ layout.py           # UI components for educational interface
+│  └─ process_file.py     # File processing for educational app
+│
+├─ UI Components (ui/)
+│  └─ layout.py           # Shared UI components and styling
+│
+├─ Data Directories
+│  ├─ input/              # Sample documents for testing/demo
+│  └─ output/             # Output directory for processed files
+│
+└─ Dependencies
+   ├─ requirements.txt    # Python package dependencies
+   └─ packages.txt        # System-level dependencies
+```
 
 ## Setup for Local Development
 
-1.
-2. Install dependencies:
+1. Clone this repository
+2. Install system dependencies:
+   - For PDF processing, you need poppler:
+     - On macOS: `brew install poppler`
+     - On Ubuntu/Debian: `apt-get install poppler-utils`
+     - On Windows: Download from [poppler releases](https://github.com/oschwartz10612/poppler-windows/releases/) and add to PATH
+   - For text recognition: `tesseract-ocr`
+3. Install Python dependencies:
 ```
 pip install -r requirements.txt
 ```
-
+4. Set up your Mistral API key:
 - Option 1: Create a `.env` file in this directory and add your Mistral API key:
 ```
 MISTRAL_API_KEY=your_api_key_here
@@ -40,7 +103,7 @@ pip install -r requirements.txt
 export MISTRAL_API_KEY=your_api_key_here
 ```
 - Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
-
+5. Run the Streamlit app using the script:
 ```
 ./run_local.sh
 ```
@@ -52,12 +115,32 @@ streamlit run app.py
 ## Usage
 
 1. Upload an image or PDF file using the file uploader
-2. Select processing options in the sidebar (e.g., use vision model)
+2. Select processing options in the sidebar (e.g., use vision model, image preprocessing)
 3. Click "Process Document" to analyze the file
 4. View the structured results and extract information
 
+## Application Versions
+
+Two versions of the application are available:
+
+1. **Standard Version** (`app.py`): Focused on document processing with a clean interface
+2. **Educational Version** (`streamlit_app.py`): Enhanced with educational modules and interactive components
+
+To run the educational version:
+```
+streamlit run streamlit_app.py
+```
+
 ## Deployment on Hugging Face Spaces
 
-This app is designed to be deployed on Hugging Face Spaces.
+This app is designed to be deployed on Hugging Face Spaces. To deploy:
+
+1. Fork this repository to your GitHub account or directly create a new Space on [Hugging Face](https://huggingface.co/spaces)
+2. Connect your GitHub repository to your Hugging Face Space for automatic deployment
+3. Add your Mistral API key as a secret in your Hugging Face Space settings:
+   - Secret name: `HF_MISTRAL_API_KEY`
+   - Secret value: Your Mistral API key
+
+The `README.md` contains the necessary configuration metadata for Hugging Face Spaces.
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces-config-reference)
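The setup steps above rely on `config.py` reading `MISTRAL_API_KEY` from a `.env` file or the environment (and, on Spaces, from the `HF_MISTRAL_API_KEY` secret). `config.py` is touched by this PR (+6 −3) but its contents are not shown in this view, so the snippet below is only a hedged sketch of that loading order, assuming `python-dotenv` is installed; it is not the file's actual implementation.

```python
import os

from dotenv import load_dotenv  # assumes python-dotenv is available

# Read .env if present, then fall back to the process environment.
load_dotenv()

# On Hugging Face Spaces the key may be exposed via the HF_MISTRAL_API_KEY
# secret; locally it comes from .env or an exported MISTRAL_API_KEY variable.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY") or os.getenv("HF_MISTRAL_API_KEY", "")

if not MISTRAL_API_KEY:
    # app.py falls back to sample output when no key is configured.
    print("Warning: no Mistral API key found; the app will return sample data only.")
```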
__pycache__/config.cpython-312.pyc
ADDED
Binary file (619 Bytes)

__pycache__/ocr_utils.cpython-312.pyc
ADDED
Binary file (8.08 kB)

__pycache__/process_file.cpython-312.pyc
ADDED
Binary file (2.86 kB)

__pycache__/structured_ocr.cpython-312.pyc
ADDED
Binary file (16.5 kB)
app.py
CHANGED
@@ -2,24 +2,105 @@ import os
 import streamlit as st
 import json
 import sys
+import time
 from pathlib import Path
 import tempfile
+import io
+from pdf2image import convert_from_bytes
+from PIL import Image, ImageEnhance, ImageFilter
+import cv2
+import numpy as np
 
 # Import the StructuredOCR class and config from the local files
 from structured_ocr import StructuredOCR
 from config import MISTRAL_API_KEY
 
+# Check for modular UI components
+try:
+    from ui.layout import tool_container, key_concept, research_question
+    MODULAR_UI = True
+except ImportError:
+    MODULAR_UI = False
+
 # Set page configuration
 st.set_page_config(
     page_title="Historical OCR",
-    page_icon="
+    page_icon="📜",
     layout="wide",
     initial_sidebar_state="expanded"
 )
 
+# Enable caching for expensive operations
+@st.cache_data(ttl=3600, show_spinner=False)
+def convert_pdf_to_images(pdf_bytes, dpi=150):
+    """Convert PDF bytes to a list of images with caching"""
+    try:
+        return convert_from_bytes(pdf_bytes, dpi=dpi)
+    except Exception as e:
+        st.error(f"Error converting PDF: {str(e)}")
+        return []
+
+@st.cache_data(ttl=3600, show_spinner=False)
+def preprocess_image(image_bytes, preprocessing_options):
+    """Preprocess image with selected options"""
+    # Convert bytes to OpenCV format
+    image = Image.open(io.BytesIO(image_bytes))
+    img_array = np.array(image)
+
+    # Apply preprocessing based on selected options
+    if preprocessing_options.get("grayscale", False):
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+
+    if preprocessing_options.get("contrast", 0) != 0:
+        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
+        image = Image.fromarray(img_array)
+        enhancer = ImageEnhance.Contrast(image)
+        image = enhancer.enhance(contrast_factor)
+        img_array = np.array(image)
+
+    if preprocessing_options.get("denoise", False):
+        img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
+
+    if preprocessing_options.get("threshold", False):
+        # Convert to grayscale if not already
+        if len(img_array.shape) == 3:
+            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img_array
+        # Apply adaptive threshold
+        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                       cv2.THRESH_BINARY, 11, 2)
+        # Convert back to RGB
+        img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
+
+    # Convert back to PIL Image
+    processed_image = Image.fromarray(img_array)
+
+    # Convert to bytes
+    byte_io = io.BytesIO()
+    processed_image.save(byte_io, format='PNG')
+    byte_io.seek(0)
+
+    return byte_io.getvalue()
+
 # Define functions
-def process_file(uploaded_file, use_vision=True):
-    """Process the uploaded file and return the OCR results
+def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
+    """Process the uploaded file and return the OCR results
+
+    Args:
+        uploaded_file: The uploaded file to process
+        use_vision: Whether to use vision model
+        preprocessing_options: Dictionary of preprocessing options
+    """
+    if preprocessing_options is None:
+        preprocessing_options = {}
+
+    # Show progress indicator
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    status_text.text("Preparing file for processing...")
+
     # Save the uploaded file to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
        tmp.write(uploaded_file.getvalue())
@@ -29,6 +110,8 @@ def process_file(uploaded_file, use_vision=True):
         # Check if API key is available
         if not MISTRAL_API_KEY:
             # Return dummy data if no API key
+            progress_bar.progress(100)
+            status_text.empty()
             return {
                 "file_name": uploaded_file.name,
                 "topics": ["Sample Document"],
@@ -38,7 +121,11 @@ def process_file(uploaded_file, use_vision=True):
                     "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
                 }
             }
-
+
+        # Update progress
+        progress_bar.progress(20)
+        status_text.text("Initializing OCR processor...")
+
         # Initialize OCR processor
         processor = StructuredOCR()
 
@@ -46,9 +133,53 @@ def process_file(uploaded_file, use_vision=True):
         file_ext = Path(uploaded_file.name).suffix.lower()
         file_type = "pdf" if file_ext == ".pdf" else "image"
 
-        #
-
+        # Apply preprocessing if needed
+        if any(preprocessing_options.values()) and file_type == "image":
+            status_text.text("Applying image preprocessing...")
+            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+
+            # Save processed image to temp file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
+                proc_tmp.write(processed_bytes)
+                temp_path = proc_tmp.name
+
+        # Get file size in MB
+        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+
+        # Check if file exceeds API limits (50 MB)
+        if file_size_mb > 50:
+            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "confidence_score": 0.0,
+                "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                "ocr_contents": {
+                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                    "partial_text": "Document could not be processed due to size limitations."
+                }
+            }
+
+        # Update progress
+        progress_bar.progress(40)
+        status_text.text("Processing document with OCR...")
+
+        # Process the file with file size information for automatic page limiting
+        # Make sure we're using the latest mistral-ocr model
+        # See https://docs.mistral.ai/capabilities/document/ for more info
+        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
+
+        # Complete progress
+        progress_bar.progress(100)
+        status_text.empty()
+
         return result
+    except Exception as e:
+        progress_bar.progress(100)
+        status_text.empty()
+        st.error(f"Error during processing: {str(e)}")
+        raise
     finally:
         # Clean up the temporary file
         if os.path.exists(temp_path):
@@ -57,122 +188,414 @@ def process_file(uploaded_file, use_vision=True):
 # App title and description
 st.title("Historical Document OCR")
 st.subheader("Powered by Mistral AI")
+
+# Create main layout with tabs and columns
+main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
+
+with main_tab1:
+    # Create a two-column layout for file upload and preview
+    upload_col, preview_col = st.columns([1, 1])
+
+    # File uploader in the left column
+    with upload_col:
+        st.markdown("""
+        Upload an image or PDF file to get started.
+
+        Using the latest `mistral-ocr-latest` model for advanced document understanding.
+        """)
+        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 50MB per file")
 
 # Sidebar with options
 with st.sidebar:
     st.header("Options")
+
+    # Model options
+    st.subheader("Model Settings")
     use_vision = st.checkbox("Use Vision Model", value=True,
                             help="For image files, use the vision model for improved analysis (may be slower)")
 
-
-    st.subheader("
+    # Image preprocessing options (collapsible)
+    st.subheader("Image Preprocessing")
+    with st.expander("Preprocessing Options"):
+        preprocessing_options = {}
+        preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
+                                                         help="Convert image to grayscale before OCR")
+        preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
+                                                         help="Apply adaptive thresholding to enhance text")
+        preprocessing_options["denoise"] = st.checkbox("Denoise Image",
+                                                       help="Remove noise from the image")
+        preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
+                                                      help="Adjust image contrast (-5 to +5)")
+
+    # PDF options (collapsible)
+    st.subheader("PDF Options")
+    with st.expander("PDF Settings"):
+        pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
+                            help="Higher DPI gives better quality but slower processing")
+        max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
+                                    help="Limit number of pages to process")
+
+# About tab content
+with main_tab2:
     st.markdown("""
-
+    ### About This Application
+
+    This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
 
     It can process:
     - Image files (jpg, png, etc.)
-    - PDF documents
+    - PDF documents (multi-page support)
 
-    The extracted content is processed into structured data based on the document type
+    The extracted content is processed into structured data based on the document type, combining:
+    - Text extraction with `mistral-ocr-latest`
+    - Analysis with language models
+    - Layout preservation with images
+
+    View results in three formats:
+    - Structured HTML view
+    - Raw JSON (for developers)
+    - Markdown with images (preserves document layout)
+
+    **New Features:**
+    - Image preprocessing for better OCR quality
+    - PDF resolution and page controls
+    - Progress tracking during processing
     """)
 
-    uploaded_file
+with main_tab1:
+    if uploaded_file is not None:
+        # Check file size (cap at 50MB)
+        file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
+
+        if file_size_mb > 50:
+            with upload_col:
+                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
+                st.stop()
+
+        file_ext = Path(uploaded_file.name).suffix.lower()
+
+        # Display document preview in preview column
+        with preview_col:
+            st.subheader("Document Preview")
+            if file_ext == ".pdf":
+                try:
+                    # Convert first page of PDF to image for preview
+                    pdf_bytes = uploaded_file.getvalue()
+                    images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+                    if images:
+                        # Convert PIL image to bytes for Streamlit
+                        first_page = images[0]
+                        img_bytes = io.BytesIO()
+                        first_page.save(img_bytes, format='JPEG')
+                        img_bytes.seek(0)
+
+                        # Display the PDF preview
+                        st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+                    else:
+                        st.info(f"PDF uploaded: {uploaded_file.name}")
+                except Exception:
+                    # Simply show the file name without an error message
+                    st.info(f"PDF uploaded: {uploaded_file.name}")
+                    st.info("Click 'Process Document' to analyze the content.")
+            else:
+                st.image(uploaded_file, use_container_width=True)
 
-    st.
+        # Add image preprocessing preview in a collapsible section if needed
+        if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
+            with st.expander("Image Preprocessing Preview"):
+                preview_cols = st.columns(2)
+
+                with preview_cols[0]:
+                    st.markdown("**Original Image**")
+                    st.image(uploaded_file, use_container_width=True)
+
+                with preview_cols[1]:
+                    st.markdown("**Preprocessed Image**")
+                    try:
+                        processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+                        st.image(io.BytesIO(processed_bytes), use_container_width=True)
+                    except Exception as e:
+                        st.error(f"Error in preprocessing: {str(e)}")
+
+        # Process button - flush left with similar padding as file browser
+        with upload_col:
+            process_button = st.button("Process Document", use_container_width=True)
+
+        # Results section
+        if process_button:
             try:
-                st.subheader("Extracted Information")
-
-                # Display file info
-                st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
-
-                # Display languages if available
-                if 'languages' in result:
-                    languages = [lang for lang in result['languages'] if lang is not None]
-                    if languages:
-                        st.write(f"**Languages Detected:** {', '.join(languages)}")
-
-                # Display topics if available
-                if 'topics' in result and result['topics']:
-                    st.write(f"**Topics:** {', '.join(result['topics'])}")
+                # Get max_pages or default if not available
+                max_pages_value = max_pages if 'max_pages' in locals() else None
 
+                # Call process_file with all options
+                result = process_file(uploaded_file, use_vision, preprocessing_options)
 
+                # Create results tabs for better organization
+                results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
+
+                with results_tab1:
+                    # Create two columns for metadata and content
+                    meta_col, content_col = st.columns([1, 2])
+
+                    with meta_col:
+                        st.subheader("Document Metadata")
+                        st.success("**Document processed successfully**")
+
+                        # Display file info
+                        st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
+
+                        # Display info if only limited pages were processed
+                        if 'limited_pages' in result:
+                            st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
+
+                        # Display languages if available
+                        if 'languages' in result:
+                            languages = [lang for lang in result['languages'] if lang is not None]
+                            if languages:
+                                st.write(f"**Languages:** {', '.join(languages)}")
+
+                        # Confidence score if available
+                        if 'confidence_score' in result:
+                            confidence = result['confidence_score']
+                            st.write(f"**OCR Confidence:** {confidence:.1%}")
+
+                        # Display topics if available
+                        if 'topics' in result and result['topics']:
+                            st.write(f"**Topics:** {', '.join(result['topics'])}")
+
+                    with content_col:
+                        st.subheader("Document Contents")
+                        if 'ocr_contents' in result:
+                            # Check if there are images in the OCR result
+                            has_images = False
+                            if 'raw_response' in result:
+                                try:
+                                    has_images = any(page.images for page in result['raw_response'].pages)
+                                except Exception:
+                                    has_images = False
+
+                            # Create tabs for different views
+                            if has_images:
+                                view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+                            else:
+                                view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+
+                            with view_tab1:
+                                # Display in a more user-friendly format based on the content structure
+                                html_content = ""
+                                if isinstance(result['ocr_contents'], dict):
+                                    for section, content in result['ocr_contents'].items():
+                                        if content:  # Only display non-empty sections
+                                            section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
+                                            html_content += section_title
+
+                                            if isinstance(content, str):
+                                                html_content += f"<p>{content}</p>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                st.markdown(content)
+                                            elif isinstance(content, list):
+                                                html_list = "<ul>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                for item in content:
+                                                    if isinstance(item, str):
+                                                        html_list += f"<li>{item}</li>"
+                                                        st.markdown(f"- {item}")
+                                                    elif isinstance(item, dict):
+                                                        html_list += f"<li>{json.dumps(item)}</li>"
+                                                        st.json(item)
+                                                html_list += "</ul>"
+                                                html_content += html_list
+                                            elif isinstance(content, dict):
+                                                html_dict = "<dl>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                for k, v in content.items():
+                                                    html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
+                                                    st.markdown(f"**{k}:** {v}")
+                                                html_dict += "</dl>"
+                                                html_content += html_dict
+
+                                # Add download button in a smaller section
+                                with st.expander("Export Content"):
+                                    # Alternative download button
+                                    html_bytes = html_content.encode()
+                                    st.download_button(
+                                        label="Download as HTML",
+                                        data=html_bytes,
+                                        file_name="document_content.html",
+                                        mime="text/html"
+                                    )
+
+                            with view_tab2:
+                                # Show the raw JSON for developers
+                                st.json(result)
+
+                            if has_images:
+                                with view_tab3:
+                                    # Show loading indicator while preparing images
+                                    with st.spinner("Preparing document with embedded images..."):
+                                        try:
+                                            # Import function
+                                            try:
+                                                from ocr_utils import get_combined_markdown
+                                            except ImportError:
+                                                st.error("Required module ocr_utils not found.")
+                                                st.stop()
+
+                                            # Check if raw_response is available
+                                            if 'raw_response' not in result:
+                                                st.warning("Raw OCR response not available. Cannot display images.")
+                                                st.stop()
+
+                                            # Validate the raw_response structure before processing
+                                            if not hasattr(result['raw_response'], 'pages'):
+                                                st.warning("Invalid OCR response format. Cannot display images.")
+                                                st.stop()
+
+                                            # Get the combined markdown with images
+                                            # Set a flag to compress images if needed
+                                            compress_images = True
+                                            max_image_width = 800  # Maximum width for images
+
+                                            try:
+                                                # First try to get combined markdown with compressed images
+                                                if compress_images and hasattr(result['raw_response'], 'pages'):
+                                                    from ocr_utils import get_combined_markdown_compressed
+                                                    combined_markdown = get_combined_markdown_compressed(
+                                                        result['raw_response'],
+                                                        max_width=max_image_width,
+                                                        quality=85
+                                                    )
+                                                else:
+                                                    # Fall back to regular method if compression not available
+                                                    combined_markdown = get_combined_markdown(result['raw_response'])
+                                            except (ImportError, AttributeError):
+                                                # Fall back to regular method
+                                                combined_markdown = get_combined_markdown(result['raw_response'])
+
+                                            if not combined_markdown or combined_markdown.strip() == "":
+                                                st.warning("No image content found in the document.")
+                                                st.stop()
+
+                                            # Check if there are many images that might cause loading issues
+                                            image_count = sum(len(page.images) for page in result['raw_response'].pages if hasattr(page, 'images'))
+
+                                            # Add warning for image-heavy documents
+                                            if image_count > 10:
+                                                st.warning(f"This document contains {image_count} images. Rendering may take longer than usual.")
+
+                                            # Add CSS to ensure proper spacing and handling of text and images
+                                            st.markdown("""
+                                            <style>
+                                            .markdown-text-container {
+                                                padding: 10px;
+                                                background-color: #f9f9f9;
+                                                border-radius: 5px;
+                                            }
+                                            .markdown-text-container img {
+                                                margin: 15px 0;
+                                                max-width: 100%;
+                                                border: 1px solid #ddd;
+                                                border-radius: 4px;
+                                                display: block;
+                                            }
+                                            .markdown-text-container p {
+                                                margin-bottom: 16px;
+                                                line-height: 1.6;
+                                            }
+                                            /* Add lazy loading for images to improve performance */
+                                            .markdown-text-container img {
+                                                loading: lazy;
+                                            }
+                                            </style>
+                                            """, unsafe_allow_html=True)
+
+                                            # For very image-heavy documents, show images in a paginated way
+                                            if image_count > 20:
+                                                # Show image content in a paginated way
+                                                st.write("Document contains many images. Showing in a paginated format:")
+
+                                                # Split the combined markdown by page separators
+                                                pages = combined_markdown.split("---")
+
+                                                # Create a page selector
+                                                page_num = st.selectbox("Select page to view:",
+                                                                        options=list(range(1, len(pages)+1)),
+                                                                        index=0)
+
+                                                # Display only the selected page
+                                                st.markdown(f"""
+                                                <div class="markdown-text-container">
+                                                {pages[page_num-1]}
+                                                </div>
+                                                """, unsafe_allow_html=True)
+
+                                                # Add note about pagination
+                                                st.info(f"Showing page {page_num} of {len(pages)}. Select a different page from the dropdown above.")
+                                            else:
+                                                # Wrap the markdown in a div with the class for styling
+                                                st.markdown(f"""
+                                                <div class="markdown-text-container">
+                                                {combined_markdown}
+                                                </div>
+                                                """, unsafe_allow_html=True)
+
+                                            # Add a download button for the combined content
+                                            st.download_button(
+                                                label="Download with Images (HTML)",
+                                                data=f"""
+                                                <html>
+                                                <head>
+                                                <style>
+                                                body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
+                                                img {{ max-width: 100%; margin: 15px 0; }}
+                                                </style>
+                                                </head>
+                                                <body>
+                                                {combined_markdown}
+                                                </body>
+                                                </html>
+                                                """,
+                                                file_name="document_with_images.html",
+                                                mime="text/html"
+                                            )
+
+                                        except Exception as e:
+                                            st.error(f"Could not display document with images: {str(e)}")
+                                            st.info("Try refreshing or processing the document again.")
+                        else:
+                            st.error("No OCR content was extracted from the document.")
+
+                with results_tab2:
+                    st.subheader("Raw Processing Results")
+                    st.json(result)
 
             except Exception as e:
                 st.error(f"Error processing document: {str(e)}")
-else:
+    else:
+        # Display sample images in the main area when no file is uploaded
+        st.info("Upload a document to get started using the file uploader above.")
+
+        # Show example images in a grid
+        st.subheader("Example Documents")
+
+        # Add a sample images container
+        with st.container():
+            # Find sample images from the input directory to display
+            input_dir = Path(__file__).parent / "input"
+            sample_images = []
+            if input_dir.exists():
+                # Find valid jpg files (with size > 50KB to avoid placeholders)
+                sample_images = [
+                    path for path in input_dir.glob("*.jpg")
+                    if path.stat().st_size > 50000
+                ][:3]  # Limit to 3 samples
+
+            if sample_images:
+                columns = st.columns(3)
+                for i, img_path in enumerate(sample_images):
+                    with columns[i % 3]:
+                        try:
+                            st.image(str(img_path), caption=img_path.name, use_container_width=True)
+                        except Exception as e:
+                            st.error(f"Error loading image {img_path.name}: {str(e)}")
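The preprocessing chain introduced above (grayscale → contrast → denoise → adaptive threshold) is easiest to verify outside Streamlit. Below is a minimal standalone sketch of the same OpenCV/Pillow steps, assuming the dependencies this PR already imports; it is not part of the PR, and the output file name is hypothetical (the input path is one of the repo's sample documents).

```python
import cv2
import numpy as np
from PIL import Image

# Exercise the same steps app.py's preprocess_image applies, on a local file.
img = np.array(Image.open("input/letter-1.jpg").convert("RGB"))

# Grayscale round-trip keeps a 3-channel image, as the app does.
gray3 = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), cv2.COLOR_GRAY2RGB)

# Adaptive thresholding expects a single-channel image.
gray = cv2.cvtColor(gray3, cv2.COLOR_RGB2GRAY)
binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)

# Save the binarized result for visual inspection.
Image.fromarray(binary).save("output/letter-1-threshold.png")
```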
backup/app.py
ADDED
@@ -0,0 +1,535 @@
+import os
+import streamlit as st
+import json
+import sys
+import time
+from pathlib import Path
+import tempfile
+import io
+from pdf2image import convert_from_bytes
+from PIL import Image, ImageEnhance, ImageFilter
+import cv2
+import numpy as np
+
+# Import the StructuredOCR class and config from the local files
+from structured_ocr import StructuredOCR
+from config import MISTRAL_API_KEY
+
+# Set page configuration
+st.set_page_config(
+    page_title="Historical OCR",
+    page_icon="🚀",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
+# Enable caching for expensive operations
+@st.cache_data(ttl=3600, show_spinner=False)
+def convert_pdf_to_images(pdf_bytes, dpi=150):
+    """Convert PDF bytes to a list of images with caching"""
+    try:
+        return convert_from_bytes(pdf_bytes, dpi=dpi)
+    except Exception as e:
+        st.error(f"Error converting PDF: {str(e)}")
+        return []
+
+@st.cache_data(ttl=3600, show_spinner=False)
+def preprocess_image(image_bytes, preprocessing_options):
+    """Preprocess image with selected options"""
+    # Convert bytes to OpenCV format
+    image = Image.open(io.BytesIO(image_bytes))
+    img_array = np.array(image)
+
+    # Apply preprocessing based on selected options
+    if preprocessing_options.get("grayscale", False):
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+
+    if preprocessing_options.get("contrast", 0) != 0:
+        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
+        image = Image.fromarray(img_array)
+        enhancer = ImageEnhance.Contrast(image)
+        image = enhancer.enhance(contrast_factor)
+        img_array = np.array(image)
+
+    if preprocessing_options.get("denoise", False):
+        img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
+
+    if preprocessing_options.get("threshold", False):
+        # Convert to grayscale if not already
+        if len(img_array.shape) == 3:
+            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img_array
+        # Apply adaptive threshold
+        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                       cv2.THRESH_BINARY, 11, 2)
+        # Convert back to RGB
+        img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
+
+    # Convert back to PIL Image
+    processed_image = Image.fromarray(img_array)
+
+    # Convert to bytes
+    byte_io = io.BytesIO()
+    processed_image.save(byte_io, format='PNG')
+    byte_io.seek(0)
+
+    return byte_io.getvalue()
+
+# Define functions
+def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
+    """Process the uploaded file and return the OCR results
+
+    Args:
+        uploaded_file: The uploaded file to process
+        use_vision: Whether to use vision model
+        preprocessing_options: Dictionary of preprocessing options
+    """
+    if preprocessing_options is None:
+        preprocessing_options = {}
+
+    # Show progress indicator
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    status_text.text("Preparing file for processing...")
+
+    # Save the uploaded file to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
+        tmp.write(uploaded_file.getvalue())
+        temp_path = tmp.name
+
+    try:
+        # Check if API key is available
+        if not MISTRAL_API_KEY:
+            # Return dummy data if no API key
+            progress_bar.progress(100)
+            status_text.empty()
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Sample Document"],
+                "languages": ["English"],
+                "ocr_contents": {
+                    "title": "Sample Document",
+                    "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
+                }
+            }
+
+        # Update progress
+        progress_bar.progress(20)
+        status_text.text("Initializing OCR processor...")
+
+        # Initialize OCR processor
+        processor = StructuredOCR()
+
+        # Determine file type from extension
+        file_ext = Path(uploaded_file.name).suffix.lower()
+        file_type = "pdf" if file_ext == ".pdf" else "image"
+
+        # Apply preprocessing if needed
+        if any(preprocessing_options.values()) and file_type == "image":
+            status_text.text("Applying image preprocessing...")
+            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+
+            # Save processed image to temp file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
+                proc_tmp.write(processed_bytes)
+                temp_path = proc_tmp.name
+
+        # Get file size in MB
+        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+
+        # Check if file exceeds API limits (50 MB)
+        if file_size_mb > 50:
+            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "confidence_score": 0.0,
+                "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                "ocr_contents": {
+                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                    "partial_text": "Document could not be processed due to size limitations."
+                }
+            }
+
+        # Update progress
+        progress_bar.progress(40)
+        status_text.text("Processing document with OCR...")
+
+        # Process the file with file size information for automatic page limiting
+        # Make sure we're using the latest mistral-ocr model
+        # See https://docs.mistral.ai/capabilities/document/ for more info
+        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
+
+        # Complete progress
+        progress_bar.progress(100)
+        status_text.empty()
+
+        return result
+    except Exception as e:
+        progress_bar.progress(100)
+        status_text.empty()
+        st.error(f"Error during processing: {str(e)}")
+        raise
+    finally:
+        # Clean up the temporary file
+        if os.path.exists(temp_path):
+            os.unlink(temp_path)
+
+# App title and description
+st.title("Historical Document OCR")
+st.subheader("Powered by Mistral AI")
+
+# Create main layout with tabs and columns
+main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
+
+with main_tab1:
+    # Create a two-column layout for file upload and preview
+    upload_col, preview_col = st.columns([1, 1])
+
+    # File uploader in the left column
+    with upload_col:
+        st.markdown("""
+        Upload an image or PDF file to get started.
+
+        Using the latest `mistral-ocr-latest` model for advanced document understanding.
+        """)
+        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
+
+# Sidebar with options
+with st.sidebar:
+    st.header("Options")
+
+    # Model options
+    st.subheader("Model Settings")
+    use_vision = st.checkbox("Use Vision Model", value=True,
+                            help="For image files, use the vision model for improved analysis (may be slower)")
+
+    # Image preprocessing options (collapsible)
+    st.subheader("Image Preprocessing")
+    with st.expander("Preprocessing Options"):
+        preprocessing_options = {}
+        preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
+                                                         help="Convert image to grayscale before OCR")
+        preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
+                                                         help="Apply adaptive thresholding to enhance text")
+        preprocessing_options["denoise"] = st.checkbox("Denoise Image",
+                                                       help="Remove noise from the image")
+        preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
+                                                      help="Adjust image contrast (-5 to +5)")
+
+    # PDF options (collapsible)
+    st.subheader("PDF Options")
+    with st.expander("PDF Settings"):
+        pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
+                            help="Higher DPI gives better quality but slower processing")
+        max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
+                                    help="Limit number of pages to process")
+
+# About tab content
+with main_tab2:
+    st.markdown("""
+    ### About This Application
+
+    This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
+
+    It can process:
+    - Image files (jpg, png, etc.)
+    - PDF documents (multi-page support)
+
+    The extracted content is processed into structured data based on the document type, combining:
+    - Text extraction with `mistral-ocr-latest`
+    - Analysis with language models
+    - Layout preservation with images
+
+    View results in three formats:
+    - Structured HTML view
+    - Raw JSON (for developers)
+    - Markdown with images (preserves document layout)
+
+    **New Features:**
+    - Image preprocessing for better OCR quality
+    - PDF resolution and page controls
+    - Progress tracking during processing
+    """)
+
+with main_tab1:
+    if uploaded_file is not None:
+        # Check file size (cap at 50MB)
+        file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
+
+        if file_size_mb > 50:
+            with upload_col:
+                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
+                st.stop()
+
+        file_ext = Path(uploaded_file.name).suffix.lower()
+
+        # Display document preview in preview column
+        with preview_col:
+            st.subheader("Document Preview")
+            if file_ext == ".pdf":
+                try:
+                    # Convert first page of PDF to image for preview
+                    pdf_bytes = uploaded_file.getvalue()
+                    images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+                    if images:
+                        # Convert PIL image to bytes for Streamlit
+                        first_page = images[0]
+                        img_bytes = io.BytesIO()
+                        first_page.save(img_bytes, format='JPEG')
+                        img_bytes.seek(0)
+
+                        # Display the PDF preview
+                        st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+                    else:
+                        st.info(f"PDF uploaded: {uploaded_file.name}")
+                except Exception:
+                    # Simply show the file name without an error message
+                    st.info(f"PDF uploaded: {uploaded_file.name}")
+                    st.info("Click 'Process Document' to analyze the content.")
+            else:
+                st.image(uploaded_file, use_container_width=True)
+
+        # Add image preprocessing preview in a collapsible section if needed
+        if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
+            with st.expander("Image Preprocessing Preview"):
+                preview_cols = st.columns(2)
+
+                with preview_cols[0]:
+                    st.markdown("**Original Image**")
+                    st.image(uploaded_file, use_container_width=True)
+
+                with preview_cols[1]:
+                    st.markdown("**Preprocessed Image**")
+                    try:
+                        processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+                        st.image(io.BytesIO(processed_bytes), use_container_width=True)
+                    except Exception as e:
+                        st.error(f"Error in preprocessing: {str(e)}")
+
+        # Process button - flush left with similar padding as file browser
+        with upload_col:
+            process_button = st.button("Process Document", use_container_width=True)
+
+        # Results section
+        if process_button:
+            try:
+                # Get max_pages or default if not available
+                max_pages_value = max_pages if 'max_pages' in locals() else None
+
+                # Call process_file with all options
+                result = process_file(uploaded_file, use_vision, preprocessing_options)
+
+                # Create results tabs for better organization
+                results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
+
+                with results_tab1:
+                    # Create two columns for metadata and content
+                    meta_col, content_col = st.columns([1, 2])
+
+                    with meta_col:
+                        st.subheader("Document Metadata")
+                        st.success("**Document processed successfully**")
+
+                        # Display file info
+                        st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
+
+                        # Display info if only limited pages were processed
+                        if 'limited_pages' in result:
+                            st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
+
+                        # Display languages if available
+                        if 'languages' in result:
+                            languages = [lang for lang in result['languages'] if lang is not None]
+                            if languages:
+                                st.write(f"**Languages:** {', '.join(languages)}")
+
+                        # Confidence score if available
+                        if 'confidence_score' in result:
+                            confidence = result['confidence_score']
+                            st.write(f"**OCR Confidence:** {confidence:.1%}")
+
+                        # Display topics if available
+                        if 'topics' in result and result['topics']:
+                            st.write(f"**Topics:** {', '.join(result['topics'])}")
+
+                    with content_col:
+                        st.subheader("Document Contents")
+                        if 'ocr_contents' in result:
+                            # Check if there are images in the OCR result
+                            has_images = False
+                            if 'raw_response' in result:
+                                try:
+                                    has_images = any(page.images for page in result['raw_response'].pages)
+                                except Exception:
+                                    has_images = False
+
+                            # Create tabs for different views
+                            if has_images:
+                                view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+                            else:
+                                view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+
+                            with view_tab1:
+                                # Display in a more user-friendly format based on the content structure
+                                html_content = ""
+                                if isinstance(result['ocr_contents'], dict):
+                                    for section, content in result['ocr_contents'].items():
+                                        if content:  # Only display non-empty sections
+                                            section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
+                                            html_content += section_title
+
+                                            if isinstance(content, str):
+                                                html_content += f"<p>{content}</p>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                st.markdown(content)
+                                            elif isinstance(content, list):
+                                                html_list = "<ul>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                for item in content:
+                                                    if isinstance(item, str):
+                                                        html_list += f"<li>{item}</li>"
+                                                        st.markdown(f"- {item}")
+                                                    elif isinstance(item, dict):
+                                                        html_list += f"<li>{json.dumps(item)}</li>"
+                                                        st.json(item)
+                                                html_list += "</ul>"
+                                                html_content += html_list
+                                            elif isinstance(content, dict):
+                                                html_dict = "<dl>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                for k, v in content.items():
+                                                    html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
+                                                    st.markdown(f"**{k}:** {v}")
+                                                html_dict += "</dl>"
+                                                html_content += html_dict
+
+                                # Add download button in a smaller section
+                                with st.expander("Export Content"):
+                                    # Alternative download button
+                                    html_bytes = html_content.encode()
+                                    st.download_button(
+                                        label="Download as HTML",
+                                        data=html_bytes,
+                                        file_name="document_content.html",
+
mime="text/html"
|
420 |
+
)
|
421 |
+
|
422 |
+
with view_tab2:
|
423 |
+
# Show the raw JSON for developers
|
424 |
+
st.json(result)
|
425 |
+
|
426 |
+
if has_images:
|
427 |
+
with view_tab3:
|
428 |
+
# Show loading indicator while preparing images
|
429 |
+
with st.spinner("Preparing document with embedded images..."):
|
430 |
+
try:
|
431 |
+
# Import function
|
432 |
+
try:
|
433 |
+
from ocr_utils import get_combined_markdown
|
434 |
+
except ImportError:
|
435 |
+
st.error("Required module ocr_utils not found.")
|
436 |
+
st.stop()
|
437 |
+
|
438 |
+
# Check if raw_response is available
|
439 |
+
if 'raw_response' not in result:
|
440 |
+
st.warning("Raw OCR response not available. Cannot display images.")
|
441 |
+
st.stop()
|
442 |
+
|
443 |
+
# Validate the raw_response structure before processing
|
444 |
+
if not hasattr(result['raw_response'], 'pages'):
|
445 |
+
st.warning("Invalid OCR response format. Cannot display images.")
|
446 |
+
st.stop()
|
447 |
+
|
448 |
+
# Get the combined markdown with images
|
449 |
+
combined_markdown = get_combined_markdown(result['raw_response'])
|
450 |
+
|
451 |
+
if not combined_markdown or combined_markdown.strip() == "":
|
452 |
+
st.warning("No image content found in the document.")
|
453 |
+
st.stop()
|
454 |
+
|
455 |
+
# Add CSS to ensure proper spacing and handling of text and images
|
456 |
+
st.markdown("""
|
457 |
+
<style>
|
458 |
+
.markdown-text-container {
|
459 |
+
padding: 10px;
|
460 |
+
background-color: #f9f9f9;
|
461 |
+
border-radius: 5px;
|
462 |
+
}
|
463 |
+
.markdown-text-container img {
|
464 |
+
margin: 15px 0;
|
465 |
+
max-width: 100%;
|
466 |
+
border: 1px solid #ddd;
|
467 |
+
border-radius: 4px;
|
468 |
+
display: block;
|
469 |
+
}
|
470 |
+
.markdown-text-container p {
|
471 |
+
margin-bottom: 16px;
|
472 |
+
line-height: 1.6;
|
473 |
+
}
|
474 |
+
</style>
|
475 |
+
""", unsafe_allow_html=True)
|
476 |
+
|
477 |
+
# Wrap the markdown in a div with the class for styling
|
478 |
+
st.markdown(f"""
|
479 |
+
<div class="markdown-text-container">
|
480 |
+
{combined_markdown}
|
481 |
+
</div>
|
482 |
+
""", unsafe_allow_html=True)
|
483 |
+
|
484 |
+
# Add a download button for the combined content
|
485 |
+
st.download_button(
|
486 |
+
label="Download with Images (HTML)",
|
487 |
+
data=f"""
|
488 |
+
<html>
|
489 |
+
<head>
|
490 |
+
<style>
|
491 |
+
body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
|
492 |
+
img {{ max-width: 100%; margin: 15px 0; }}
|
493 |
+
</style>
|
494 |
+
</head>
|
495 |
+
<body>
|
496 |
+
{combined_markdown}
|
497 |
+
</body>
|
498 |
+
</html>
|
499 |
+
""",
|
500 |
+
file_name="document_with_images.html",
|
501 |
+
mime="text/html"
|
502 |
+
)
|
503 |
+
|
504 |
+
except Exception as e:
|
505 |
+
st.error(f"Could not display document with images: {str(e)}")
|
506 |
+
st.info("Try refreshing or processing the document again.")
|
507 |
+
else:
|
508 |
+
st.error("No OCR content was extracted from the document.")
|
509 |
+
|
510 |
+
with results_tab2:
|
511 |
+
st.subheader("Raw Processing Results")
|
512 |
+
st.json(result)
|
513 |
+
|
514 |
+
except Exception as e:
|
515 |
+
st.error(f"Error processing document: {str(e)}")
|
516 |
+
else:
|
517 |
+
# Display sample images in the main area when no file is uploaded
|
518 |
+
st.info("Upload a document to get started using the file uploader above.")
|
519 |
+
|
520 |
+
# Show example images in a grid
|
521 |
+
st.subheader("Example Documents")
|
522 |
+
|
523 |
+
# Add a sample images container
|
524 |
+
with st.container():
|
525 |
+
# Find sample images from the input directory to display
|
526 |
+
input_dir = Path(__file__).parent / "input"
|
527 |
+
sample_images = []
|
528 |
+
if input_dir.exists():
|
529 |
+
sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
|
530 |
+
|
531 |
+
if sample_images:
|
532 |
+
columns = st.columns(3)
|
533 |
+
for i, img_path in enumerate(sample_images):
|
534 |
+
with columns[i % 3]:
|
535 |
+
st.image(str(img_path), caption=img_path.name, use_container_width=True)
|
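
Note: the preprocessing preview above calls preprocess_image(), which is defined in ocr_utils.py and not shown in this hunk. A minimal sketch of what such a helper might look like, assuming OpenCV-based steps; the option keys ('grayscale', 'denoise', 'contrast') are illustrative assumptions, not necessarily the repo's actual keys:

import io
import cv2  # opencv-python-headless
import numpy as np
from PIL import Image

def preprocess_image(image_bytes: bytes, options: dict) -> bytes:
    """Apply optional preprocessing steps and return JPEG bytes.

    Option keys here are hypothetical; the repo's real helper may differ.
    """
    img = np.array(Image.open(io.BytesIO(image_bytes)).convert("RGB"))
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

    if options.get("grayscale"):
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    if options.get("denoise"):
        img = cv2.fastNlMeansDenoisingColored(img, None, 10, 10, 7, 21)
    if options.get("contrast"):
        # CLAHE on the lightness channel boosts faded text locally
        lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        l = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(l)
        img = cv2.cvtColor(cv2.merge((l, a, b)), cv2.COLOR_LAB2BGR)

    ok, buf = cv2.imencode(".jpg", img)
    return buf.tobytes()

CLAHE is a common choice for faded historical scans because it raises local contrast without blowing out already-bright regions.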
backup/config.py
ADDED
@@ -0,0 +1,17 @@
# config.py
"""
Configuration file for Mistral OCR processing.
Contains API key and other settings.
"""
import os

# Your Mistral API key - get from Hugging Face secrets or environment variable
# The priority order is: HF-specific env var (HF_MISTRAL_API_KEY) > regular environment var > empty string
# Note: No default API key is provided for security reasons
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",  # First check HF-specific env var
                                 os.environ.get("MISTRAL_API_KEY", ""))  # Then check regular env var

# Model settings
OCR_MODEL = "mistral-ocr-latest"
TEXT_MODEL = "ministral-8b-latest"
VISION_MODEL = "pixtral-12b-latest"
backup/input/The Magician, or Bottle Cungerer.jpeg
ADDED
(binary image, tracked with Git LFS)
backup/input/baldwin-letter-1.jpg
ADDED
(binary image, tracked with Git LFS)
backup/input/baldwin-letter-2.jpg
ADDED
(binary image, tracked with Git LFS)
backup/input/flier.png
ADDED
(binary image)
backup/input/letter-1.jpg
ADDED
(binary image, tracked with Git LFS)
backup/input/letter-2.jpg
ADDED
(binary image, tracked with Git LFS)
backup/input/letter-3.jpg
ADDED
(binary image, tracked with Git LFS)
backup/input/magellan-travels.jpg
ADDED
(binary image, tracked with Git LFS)
backup/input/menu.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
size 2554815
backup/input/recipe.jpg
ADDED
(binary image)
backup/ocr_utils.py
ADDED
@@ -0,0 +1,136 @@
"""
Utility functions for OCR processing with Mistral AI.
Contains helper functions for working with OCR responses and image handling.
"""

import json
import base64
from pathlib import Path
from typing import Dict, List, Optional, Union

from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str

def get_combined_markdown(ocr_response) -> str:
    """
    Combine OCR text and images into a single markdown document.
    Ensures proper spacing between text and images.

    Args:
        ocr_response: Response from OCR processing containing text and images
            See https://docs.mistral.ai/capabilities/document/ for API reference

    Returns:
        Combined markdown string with embedded images
    """
    markdowns: list[str] = []
    # Extract images from page
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64

        # Replace image placeholders with actual images
        page_markdown = replace_images_in_markdown(page.markdown, image_data)

        # Ensure proper spacing between paragraphs and images
        # Add extra newlines between paragraphs to improve rendering
        page_markdown = page_markdown.replace("\n", "\n\n")

        # Add page separator for multi-page documents
        markdowns.append(page_markdown)

    # Join pages with clear separators for multi-page documents
    return "\n\n---\n\n".join(markdowns)

def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as base64 for API use.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 data URL for the image
    """
    # Convert to Path object if string
    image_file = Path(image_path) if isinstance(image_path, str) else image_path

    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Encode image as base64
    encoded = base64.b64encode(image_file.read_bytes()).decode()
    return f"data:image/jpeg;base64,{encoded}"

def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
    """
    Process an image with OCR and return the response.

    Args:
        client: Mistral AI client
        image_path: Path to the image file
        model: OCR model to use

    Returns:
        OCR response object
    """
    # Encode image as base64
    base64_data_url = encode_image_for_api(image_path)

    # Process image with OCR
    image_response = client.ocr.process(
        document=ImageURLChunk(image_url=base64_data_url),
        model=model
    )

    return image_response

def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
    """
    Convert OCR response to a formatted JSON string.

    Args:
        ocr_response: OCR response object
        indent: Indentation level for JSON formatting

    Returns:
        Formatted JSON string
    """
    # Convert response to JSON
    response_dict = json.loads(ocr_response.model_dump_json())
    return json.dumps(response_dict, indent=indent)

# For display in notebooks
try:
    from IPython.display import Markdown, display

    def display_ocr_with_images(ocr_response):
        """
        Display OCR response with embedded images in IPython environments.

        Args:
            ocr_response: OCR response object
        """
        combined_markdown = get_combined_markdown(ocr_response)
        display(Markdown(combined_markdown))
except ImportError:
    # IPython not available
    pass
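
These helpers chain into a small pipeline: encode an image, run OCR, then render or serialize the result. A minimal usage sketch, assuming MISTRAL_API_KEY is set in the environment and the sample file input/letter-1.jpg exists:

import os
from mistralai import Mistral
from ocr_utils import process_image_with_ocr, ocr_response_to_json, get_combined_markdown

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

response = process_image_with_ocr(client, "input/letter-1.jpg")

# Raw OCR result as pretty-printed JSON
print(ocr_response_to_json(response, indent=2)[:500])

# Combined markdown; embedded images need include_image_base64=True upstream
print(get_combined_markdown(response)[:500])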
backup/pdf_ocr.py
ADDED
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
"""
PDFOCR - Module for processing PDF files with OCR and extracting structured data.
"""

import json
from pathlib import Path
from structured_ocr import StructuredOCR

class PDFOCR:
    """Class for processing PDF files with OCR and extracting structured data."""

    def __init__(self, api_key=None):
        """Initialize the PDF OCR processor."""
        self.processor = StructuredOCR(api_key=api_key)

    def process_pdf(self, pdf_path, use_vision=True):
        """
        Process a PDF file with OCR and extract structured data.

        Args:
            pdf_path: Path to the PDF file
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Dictionary with structured OCR results
        """
        pdf_path = Path(pdf_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)

    def save_json_output(self, pdf_path, output_path, use_vision=True):
        """
        Process a PDF file and save the structured output as JSON.

        Args:
            pdf_path: Path to the PDF file
            output_path: Path where to save the JSON output
            use_vision: Whether to use vision model for improved analysis

        Returns:
            Path to the saved JSON file
        """
        # Process the PDF
        result = self.process_pdf(pdf_path, use_vision=use_vision)

        # Drop the raw OCR response object before dumping: it is not JSON-serializable
        result.pop('raw_response', None)

        # Save the result to JSON
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w') as f:
            json.dump(result, f, indent=2)

        return output_path

# For testing directly
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    output_path = sys.argv[2] if len(sys.argv) > 2 else None

    processor = PDFOCR()

    if output_path:
        result_path = processor.save_json_output(pdf_path, output_path)
        print(f"Results saved to: {result_path}")
    else:
        result = processor.process_pdf(pdf_path)
        result.pop('raw_response', None)  # raw response object is not JSON-serializable
        print(json.dumps(result, indent=2))
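
Besides the CLI entry point above, the class can be driven directly from Python; a short sketch, assuming the Mistral API key is configured and input/menu.pdf exists:

from pdf_ocr import PDFOCR

processor = PDFOCR()  # falls back to MISTRAL_API_KEY from config

# Structured results as a plain dict (topics, languages, ocr_contents, ...)
result = processor.process_pdf("input/menu.pdf", use_vision=True)
print(result["topics"], result["languages"])

# Or write the JSON straight to disk
processor.save_json_output("input/menu.pdf", "output/menu.json")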
backup/requirements.txt
ADDED
@@ -0,0 +1,10 @@
streamlit>=1.43.2
mistralai>=0.0.7
pydantic>=2.0.0
pycountry>=23.12.11
pillow>=10.0.0
python-multipart>=0.0.6
pdf2image>=1.17.0
pytesseract>=0.3.10
opencv-python-headless>=4.6.0
numpy>=1.23.5
backup/structured_ocr.py
ADDED
@@ -0,0 +1,414 @@
import os
import sys
import time
from enum import Enum
from pathlib import Path
import json
import base64
import pycountry
import logging
from pydantic import BaseModel
from mistralai import Mistral
from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Import utilities for OCR processing
try:
    from ocr_utils import replace_images_in_markdown, get_combined_markdown
except ImportError:
    # Define fallback functions if module not found
    def replace_images_in_markdown(markdown_str, images_dict):
        for img_name, base64_str in images_dict.items():
            markdown_str = markdown_str.replace(
                f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
            )
        return markdown_str

    def get_combined_markdown(ocr_response):
        markdowns = []
        for page in ocr_response.pages:
            image_data = {}
            for img in page.images:
                image_data[img.id] = img.image_base64
            markdowns.append(replace_images_in_markdown(page.markdown, image_data))
        return "\n\n".join(markdowns)

# Import config directly (now local to historical-ocr)
from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL

# Create language enum for structured output
languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}

class LanguageMeta(Enum.__class__):
    def __new__(metacls, cls, bases, classdict):
        for code, name in languages.items():
            classdict[name.upper().replace(' ', '_')] = name
        return super().__new__(metacls, cls, bases, classdict)

class Language(Enum, metaclass=LanguageMeta):
    pass

class StructuredOCRModel(BaseModel):
    file_name: str
    topics: list[str]
    languages: list[Language]
    ocr_contents: dict

class StructuredOCR:
    def __init__(self, api_key=None):
        """Initialize the OCR processor with API key"""
        self.api_key = api_key or MISTRAL_API_KEY
        self.client = Mistral(api_key=self.api_key)

    def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
        """Process a file and return structured OCR results

        Args:
            file_path: Path to the file to process
            file_type: 'pdf' or 'image' (will be auto-detected if None)
            use_vision: Whether to use vision model for improved analysis
            max_pages: Optional limit on number of pages to process
            file_size_mb: Optional file size in MB (used for automatic page limiting)
            custom_pages: Optional list of specific page numbers to process

        Returns:
            Dictionary with structured OCR results
        """
        # Convert file_path to Path object if it's a string
        file_path = Path(file_path)

        # Auto-detect file type if not provided
        if file_type is None:
            suffix = file_path.suffix.lower()
            file_type = "pdf" if suffix == ".pdf" else "image"

        # Get file size if not provided
        if file_size_mb is None and file_path.exists():
            file_size_mb = file_path.stat().st_size / (1024 * 1024)  # Convert bytes to MB

        # Check if file exceeds API limits (50 MB)
        if file_size_mb and file_size_mb > 50:
            logging.warning(f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB")
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 50 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
                    "partial_text": "Document could not be processed due to size limitations."
                }
            }

        # For PDF files, limit pages based on file size if no explicit limit is given
        if file_type == "pdf" and file_size_mb and max_pages is None and custom_pages is None:
            if file_size_mb > 100:  # Very large files
                max_pages = 3
            elif file_size_mb > 50:  # Large files
                max_pages = 5
            elif file_size_mb > 20:  # Medium files
                max_pages = 10
            else:  # Small files
                max_pages = None  # Process all pages

        # Start processing timer
        start_time = time.time()

        # Read and process the file
        if file_type == "pdf":
            result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
        else:
            result = self._process_image(file_path, use_vision)

        # Add processing time information
        processing_time = time.time() - start_time
        result['processing_time'] = processing_time

        # Add a default confidence score if not present
        if 'confidence_score' not in result:
            result['confidence_score'] = 0.85  # Default confidence

        return result

    def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
        """Process a PDF file with OCR

        Args:
            file_path: Path to the PDF file
            use_vision: Whether to use vision model
            max_pages: Optional limit on the number of pages to process
            custom_pages: Optional list of specific page numbers to process
        """
        logger = logging.getLogger("pdf_processor")
        logger.info(f"Processing PDF: {file_path}")

        try:
            # Upload the PDF file
            logger.info("Uploading PDF file to Mistral API")
            uploaded_file = self.client.files.upload(
                file={
                    "file_name": file_path.stem,
                    "content": file_path.read_bytes(),
                },
                purpose="ocr",
            )

            # Get a signed URL for the uploaded file
            signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)

            # Process the PDF with OCR
            logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
            pdf_response = self.client.ocr.process(
                document=DocumentURLChunk(document_url=signed_url.url),
                model=OCR_MODEL,
                include_image_base64=True
            )

            # Limit pages if requested
            pages_to_process = pdf_response.pages
            total_pages = len(pdf_response.pages)
            limited_pages = False

            logger.info(f"PDF has {total_pages} total pages")

            # Handle custom page selection if provided
            if custom_pages:
                # Convert to 0-based indexing and filter valid page numbers
                valid_indices = [i - 1 for i in custom_pages if 0 < i <= total_pages]
                if valid_indices:
                    pages_to_process = [pdf_response.pages[i] for i in valid_indices]
                    limited_pages = True
                    logger.info(f"Processing {len(valid_indices)} custom-selected pages")
            # Otherwise handle max_pages limit
            elif max_pages and total_pages > max_pages:
                pages_to_process = pages_to_process[:max_pages]
                limited_pages = True
                logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")

            # Calculate average confidence score based on OCR response if available
            confidence_score = 0.0
            try:
                # Some OCR APIs provide confidence scores
                confidence_values = []
                for page in pages_to_process:
                    if hasattr(page, 'confidence'):
                        confidence_values.append(page.confidence)

                if confidence_values:
                    confidence_score = sum(confidence_values) / len(confidence_values)
                else:
                    confidence_score = 0.85  # Default if no confidence scores available
            except Exception:
                confidence_score = 0.85  # Default fallback

            # Combine pages' markdown into a single string
            all_markdown = "\n\n".join([page.markdown for page in pages_to_process])

            # Extract structured data using the appropriate model
            if use_vision:
                # Get base64 of first page for vision model
                first_page_image = None
                if pages_to_process and pages_to_process[0].images:
                    first_page_image = pages_to_process[0].images[0].image_base64

                if first_page_image:
                    # Use vision model
                    logger.info(f"Using vision model: {VISION_MODEL}")
                    result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
                else:
                    # Fall back to text-only model if no image available
                    logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
                    result = self._extract_structured_data_text_only(all_markdown, file_path.name)
            else:
                # Use text-only model
                logger.info(f"Using text-only model: {TEXT_MODEL}")
                result = self._extract_structured_data_text_only(all_markdown, file_path.name)

            # Add page limit info to result if needed
            if limited_pages:
                result['limited_pages'] = {
                    'processed': len(pages_to_process),
                    'total': total_pages
                }

            # Add confidence score
            result['confidence_score'] = confidence_score

            # Store the raw OCR response for image rendering
            result['raw_response'] = pdf_response

            logger.info("PDF processing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Error processing PDF: {str(e)}")
            # Return basic result on error
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": str(e),
                "ocr_contents": {
                    "error": f"Failed to process PDF: {str(e)}",
                    "partial_text": "Document could not be fully processed."
                }
            }

    def _process_image(self, file_path, use_vision=True):
        """Process an image file with OCR"""
        logger = logging.getLogger("image_processor")
        logger.info(f"Processing image: {file_path}")

        try:
            # Read and encode the image file
            logger.info("Encoding image for API")
            encoded_image = base64.b64encode(file_path.read_bytes()).decode()
            base64_data_url = f"data:image/jpeg;base64,{encoded_image}"

            # Process the image with OCR
            logger.info(f"Processing image with OCR using {OCR_MODEL}")
            image_response = self.client.ocr.process(
                document=ImageURLChunk(image_url=base64_data_url),
                model=OCR_MODEL,
                include_image_base64=True
            )

            # Get the OCR markdown from the first page
            image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""

            # Calculate confidence score if available
            confidence_score = 0.85  # Default value
            try:
                if hasattr(image_response.pages[0], 'confidence'):
                    confidence_score = image_response.pages[0].confidence
            except Exception:
                pass

            # Extract structured data using the appropriate model
            if use_vision:
                logger.info(f"Using vision model: {VISION_MODEL}")
                result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
            else:
                logger.info(f"Using text-only model: {TEXT_MODEL}")
                result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)

            # Add confidence score
            result['confidence_score'] = confidence_score

            # Store the raw OCR response for image rendering
            result['raw_response'] = image_response

            logger.info("Image processing completed successfully")
            return result

        except Exception as e:
            logger.error(f"Error processing image: {str(e)}")
            # Return basic result on error
            return {
                "file_name": file_path.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": str(e),
                "ocr_contents": {
                    "error": f"Failed to process image: {str(e)}",
                    "partial_text": "Image could not be processed."
                }
            }

    def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
        """Extract structured data using vision model"""
        try:
            # Parse with vision model with a timeout
            chat_response = self.client.chat.parse(
                model=VISION_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            ImageURLChunk(image_url=image_base64),
                            TextChunk(text=(
                                f"This is a historical document's OCR in markdown:\n"
                                f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
                                f"Extract topics, languages, and organize the content logically."
                            ))
                        ],
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Convert the response to a dictionary
            result = json.loads(chat_response.choices[0].message.parsed.json())

            # Ensure languages is a list of strings, not Language enum objects
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Fall back to text-only model if vision model fails
            print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
            result = self._extract_structured_data_text_only(ocr_markdown, filename)

        return result

    def _extract_structured_data_text_only(self, ocr_markdown, filename):
        """Extract structured data using text-only model"""
        try:
            # Parse with text-only model with a timeout
            chat_response = self.client.chat.parse(
                model=TEXT_MODEL,
                messages=[
                    {
                        "role": "user",
                        "content": f"This is a historical document's OCR in markdown:\n"
                                   f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
                                   f"Convert this into a structured JSON response with the OCR contents. "
                                   f"Extract topics, languages, and organize the content logically."
                    },
                ],
                response_format=StructuredOCRModel,
                temperature=0
            )

            # Convert the response to a dictionary
            result = json.loads(chat_response.choices[0].message.parsed.json())

            # Ensure languages is a list of strings, not Language enum objects
            if 'languages' in result:
                result['languages'] = [str(lang) for lang in result.get('languages', [])]

        except Exception as e:
            # Create a basic result if parsing fails
            print(f"Text model failed: {str(e)}. Creating basic result.")
            result = {
                "file_name": filename,
                "topics": ["Document"],
                "languages": ["English"],
                "ocr_contents": {
                    "raw_text": ocr_markdown
                }
            }

        return result

# For testing directly
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 2:
        print("Usage: python structured_ocr.py <file_path>")
        sys.exit(1)

    file_path = sys.argv[1]
    processor = StructuredOCR()
    result = processor.process_file(file_path)

    result.pop('raw_response', None)  # raw response object is not JSON-serializable
    print(json.dumps(result, indent=2))
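
The LanguageMeta metaclass above populates the Language enum dynamically from pycountry at import time, so the structured-output schema constrains the model to real language names. A quick sketch of what that yields:

from structured_ocr import Language

# Member names come from pycountry language names, upper-cased with spaces as underscores
print(Language.ENGLISH.value)  # "English"
print(Language.FRENCH.value)   # "French"

# Roughly 180+ members, one per language with an alpha_2 code in pycountry
print(len(list(Language)))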
config.py
CHANGED
@@ -5,10 +5,13 @@ Contains API key and other settings.
 """
 import os

-# Your Mistral API key - get from environment variable
-
+# Your Mistral API key - get from Hugging Face secrets or environment variable
+# The priority order is: HF-specific env var (HF_MISTRAL_API_KEY) > regular environment var > empty string
+# Note: No default API key is provided for security reasons
+MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",  # First check HF-specific env var
+                                 os.environ.get("MISTRAL_API_KEY", ""))  # Then check regular env var

-# Model settings
+# Model settings
 OCR_MODEL = "mistral-ocr-latest"
 TEXT_MODEL = "ministral-8b-latest"
 VISION_MODEL = "pixtral-12b-latest"
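
With this change, a Hugging Face Spaces secret named HF_MISTRAL_API_KEY takes priority over a locally exported MISTRAL_API_KEY. A small sketch of the resolution order (key values are placeholders):

import os

os.environ["MISTRAL_API_KEY"] = "local-key"         # placeholder
os.environ["HF_MISTRAL_API_KEY"] = "hf-secret-key"  # placeholder

key = os.environ.get("HF_MISTRAL_API_KEY",
                     os.environ.get("MISTRAL_API_KEY", ""))
print(key)  # -> "hf-secret-key": the HF-specific variable wins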
input/The Magician, or Bottle Cungerer.jpeg
ADDED
(binary image, tracked with Git LFS)
input/a-la-carte.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
size 2554815
input/flier.png
ADDED
(binary image)
input/handwritten-letter.jpg
ADDED
(binary image, tracked with Git LFS)
input/letter-1.jpg
ADDED
(binary image, tracked with Git LFS)
input/letter-2.jpg
ADDED
(binary image, tracked with Git LFS)
input/letter-3.jpg
ADDED
(binary image, tracked with Git LFS)
input/magician-satire.jpg
ADDED
(binary image, tracked with Git LFS)
input/menu.pdf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
size 2554815
input/milgram-flier.png
ADDED
(binary image)
input/okeefe-recipe.jpg
ADDED
(binary image)
input/recipe.jpg
ADDED
(binary image)
modules/content/__init__.py
ADDED
@@ -0,0 +1,36 @@
"""
Module initialization file for the workshop modules.
"""
from . import module1, module2, module3, module4, module5, module6

# Module registry for easy access by module number
modules = {
    1: module1,
    2: module2,
    3: module3,
    4: module4,
    5: module5,
    6: module6
}

# Module names for navigation and display
module_names = [
    "Introduction",
    "Text-Image Relations",
    "OCR Technology",
    "Methodological Approaches",
    "Interactive OCR",
    "Conclusion"
]

def get_module(module_number):
    """Get a module by its number (1-6)"""
    if module_number in modules:
        return modules[module_number]
    raise ValueError(f"Unknown module number: {module_number}")

def get_module_name(module_number):
    """Get a module name by its number (1-6)"""
    if 1 <= module_number <= len(module_names):
        return module_names[module_number - 1]
    return f"Module {module_number}"
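
A typical caller, such as a navigation sidebar, resolves a module number to its label and renderer. A minimal sketch, meant to run inside a Streamlit app:

import streamlit as st
from modules.content import get_module, get_module_name

selected = 3  # e.g. the value of a sidebar selectbox

st.header(get_module_name(selected))  # "OCR Technology"
get_module(selected).render()         # draws that module's page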
modules/content/module1.py
ADDED
@@ -0,0 +1,85 @@
import streamlit as st
from layout import gray_container, blue_container, yellow_container, card_grid, key_concept

def render():
    """Module 1: Introduction and Problematization"""

    st.title("Module 1: Introduction and Problematization")

    # Workshop overview in gray container
    overview_content = """
    <h3>Workshop Overview</h3>
    <p>
    This interactive workshop explores the application of OCR technology to historical documents,
    combining theoretical understanding with practical experience. Designed for historians,
    archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
    </p>
    """
    gray_container(overview_content)

    # For historians section with blue background
    historians_content = """
    <h3>For Historians:</h3>
    <p>
    How might OCR technology transform our access to and interpretation of historical
    documents? What new research questions become possible when large archives
    become machine-readable?
    </p>
    """
    blue_container(historians_content)

    # What is OCR section with yellow background
    ocr_content = """
    <h3>What is OCR?</h3>
    <p>
    Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
    Modern OCR uses AI vision models to understand both the text and its visual context.
    </p>
    """
    yellow_container(ocr_content)

    # What you'll learn section
    st.subheader("What You'll Learn")

    # Create cards for the learning outcomes
    cards = [
        """
        <h4>Conceptual Understanding</h4>
        <ul>
        <li>Text-image relationships in historical documents</li>
        <li>Evolution of OCR technology</li>
        <li>AI vision models for document analysis</li>
        <li>Historical typography challenges</li>
        </ul>
        """,

        """
        <h4>Methodological Approaches</h4>
        <ul>
        <li>Critical frameworks for OCR research</li>
        <li>Hybrid computational methods</li>
        <li>Error analysis and interpretation</li>
        <li>Contextual reading strategies</li>
        </ul>
        """,

        """
        <h4>Practical Skills</h4>
        <ul>
        <li>Processing historical documents</li>
        <li>Analyzing extracted information</li>
        <li>Integrating OCR into workflows</li>
        <li>Building searchable archives</li>
        </ul>
        """
    ]

    card_grid(cards)

    # Add a key concept
    concept_content = """
    <h4>Workshop Structure</h4>
    <p>This workshop combines theory and practice through six modules, each building on the previous ones.</p>
    <p>Navigate between modules using the buttons at the bottom of the page.</p>
    """
    key_concept(concept_content)
modules/content/module2.py
ADDED
@@ -0,0 +1,88 @@
import streamlit as st
from layout import gray_container, card_grid, key_concept, research_question

def render():
    """Module 2: Text-Image Relations in Historical Archives"""

    st.title("Module 2: Text-Image Relations in Historical Archives")

    col1, col2 = st.columns([1, 1])

    with col1:
        textual_content = """
        <h3>Textual Elements</h3>
        <ul>
        <li><strong>Typography</strong>: Varying fonts, sizes, and styles</li>
        <li><strong>Layout</strong>: Columns, margins, and spacing</li>
        <li><strong>Marginalia</strong>: Notes, comments, and additions</li>
        <li><strong>Decorative Text</strong>: Illuminated letters and calligraphy</li>
        </ul>
        """
        gray_container(textual_content)

        visual_content = """
        <h3>Visual Elements</h3>
        <ul>
        <li><strong>Illustrations</strong>: Diagrams, maps, and artistic representations</li>
        <li><strong>Watermarks</strong>: Hidden identifiers that help date and localize documents</li>
        <li><strong>Damage</strong>: Tears, stains, and fading affecting legibility</li>
        <li><strong>Material Features</strong>: Paper quality and physical dimensions</li>
        </ul>
        """
        gray_container(visual_content)

    with col2:
        interdependence_content = """
        <h3>Interdependence</h3>
        <p>The relationship between text and image in historical documents exists on a complex spectrum:</p>
        <ul>
        <li>Text functions as image (decorative headings)</li>
        <li>Images function as text (symbolic representations)</li>
        <li>Layout creates meaning through visual organization</li>
        <li>Material conditions affect both textual and visual elements</li>
        </ul>
        """
        gray_container(interdependence_content)

        # Display an example image
        st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Book_of_Kells_folio_292r.jpg/800px-Book_of_Kells_folio_292r.jpg",
                 caption="Book of Kells - Example of text-image integration")

    # OCR Challenges section
    challenge_content = """
    <h3>OCR Challenges</h3>
    <p>These complex text-image relationships create particular challenges for OCR:</p>
    """
    gray_container(challenge_content)

    # Cards for OCR challenges
    cards = [
        """
        <h4>Distinguishing Text from Decoration</h4>
        <p>Where does ornamental text end and functional text begin?</p>
        """,

        """
        <h4>Handling Illustrations</h4>
        <p>Should they be processed as images or described as text?</p>
        """,

        """
        <h4>Interpreting Layout</h4>
        <p>How to capture the significance of spacing and organization?</p>
        """,

        """
        <h4>Preserving Context</h4>
        <p>Maintaining the relationship between textual and visual elements</p>
        """
    ]

    card_grid(cards)

    # Research question box
    research_content = """
    <h4>Research Question</h4>
    <p>How do we approach documents where the visual presentation is as meaningful as the textual content itself?</p>
    """
    research_question(research_content)
modules/content/module3.py
ADDED
@@ -0,0 +1,106 @@
import streamlit as st
from pathlib import Path
from layout import gray_container, tool_container, key_concept, research_question

def render():
    """Module 3: OCR Technology and Historical Documents"""

    st.title("Module 3: OCR Technology and Historical Documents")

    col1, col2 = st.columns([1, 1])

    with col1:
        traditional_content = """
        <h3>Traditional OCR Approaches</h3>
        <ol>
        <li><strong>Pattern Matching</strong>: Early OCR compared characters to templates</li>
        <li><strong>Feature Extraction</strong>: Identifying key features of characters</li>
        <li><strong>Statistical Models</strong>: Using probabilities to improve recognition</li>
        </ol>
        """
        gray_container(traditional_content)

        modern_content = """
        <h3>Modern AI-Enhanced OCR</h3>
        <ol>
        <li><strong>Neural Networks</strong>: Deep learning models trained on vast datasets</li>
        <li><strong>Computer Vision</strong>: Advanced image processing techniques</li>
        <li><strong>Language Models</strong>: Contextual understanding to resolve ambiguities</li>
        <li><strong>Multimodal Models</strong>: Integration of text, layout, and visual understanding</li>
        </ol>
        """
        gray_container(modern_content)

    with col2:
        challenges_content = """
        <h3>Challenges with Historical Documents</h3>
        <p>Historical materials present unique difficulties:</p>
        <ul>
        <li><strong>Typography Variation</strong>: Non-standardized fonts and styles</li>
        <li><strong>Historical Language</strong>: Archaic vocabulary and grammar</li>
        <li><strong>Layout Complexity</strong>: Non-linear arrangements</li>
        <li><strong>Document Degradation</strong>: Fading, tears, stains, and damage</li>
        <li><strong>Material Artifacts</strong>: Paper texture, binding shadows, etc.</li>
        </ul>
        """
        gray_container(challenges_content)

        # Display OCR processing diagram
        st.image("https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg",
                 caption="OCR processing layers")

    # Key concept section
    concept_content = """
    <h3>Vision-Enhanced OCR</h3>
    <p>Modern OCR systems built on vision-language models (such as Pixtral, the vision model used in this app) combine:</p>
    <ol>
    <li>Image understanding capabilities to process the visual aspects</li>
    <li>Text recognition to extract characters accurately</li>
    <li>Layout analysis to understand structure</li>
    <li>Contextual language processing for improved accuracy</li>
    </ol>
    <p>This multimodal approach dramatically improves OCR results on historical documents compared to traditional OCR.</p>
    """
    key_concept(concept_content)

    # Technical details in a tool container
    tech_content = """
    <h3>Technical Evolution of OCR</h3>
    <p><strong>Traditional OCR Pipeline:</strong></p>
    <ol>
    <li>Preprocessing (binarization, noise removal)</li>
    <li>Layout analysis (segmentation)</li>
    <li>Character recognition (pattern matching)</li>
    <li>Post-processing (spell checking)</li>
    </ol>

    <p><strong>Modern LLM-Vision Pipeline:</strong></p>
    <ol>
    <li>Image normalization</li>
    <li>Image embedding via vision encoder</li>
    <li>Integration with language model</li>
    <li>Joint inference across modalities</li>
    <li>Structured extraction of information</li>
    </ol>
    """
    tool_container(tech_content)

    # Research question
    research_content = """
    <h4>Consider This:</h4>
    <p>How might the capabilities of vision-language models change our approach to digitizing historical archives?</p>
    """
    research_question(research_content)

    # Display history if available
    if 'processing_history' in st.session_state and st.session_state.processing_history:
        with st.expander("Your OCR Processing History"):
            st.markdown("You've already processed the following documents:")

            for item in st.session_state.processing_history:
                st.markdown(f"**{item['fileName']}**")
                col1, col2 = st.columns(2)
                with col1:
                    st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
                with col2:
                    st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
modules/content/module4.py
ADDED
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from pathlib import Path
|
3 |
+
from layout import gray_container, tool_container, key_concept, quote
|
4 |
+
|
5 |
+
def render():
|
6 |
+
"""Module 4: Methodological Approaches"""
|
7 |
+
|
8 |
+
st.title("Module 4: Methodological Approaches")
|
9 |
+
|
10 |
+
col1, col2 = st.columns([1, 1])
|
11 |
+
|
12 |
+
with col1:
|
13 |
+
hybrid_content = """
|
14 |
+
<h3>Hybrid Methodologies</h3>
|
15 |
+
|
16 |
+
<h4>1. Computational + Human Reading</h4>
|
17 |
+
<ul>
|
18 |
+
        <li>OCR for initial processing and discovery</li>
        <li>Human review for context and interpretation</li>
        <li>Iterative refinement of computational outputs</li>
        </ul>

        <h4>2. Close + Distant Reading</h4>
        <ul>
        <li>Distant reading through large-scale OCR processing</li>
        <li>Close reading of selected passages</li>
        <li>Zooming between scales of analysis</li>
        </ul>
        """
        gray_container(hybrid_content)

        # Check if the diagram image is available and display it
        input_dir = Path(__file__).parent.parent / "input"
        diagram_path = input_dir / "diagram.jpg"

        if diagram_path.exists():
            try:
                from PIL import Image
                with Image.open(diagram_path) as img:
                    st.image(img, caption="Historical VLM architecture", use_column_width=True)
            except Exception:
                # If there's an error, just show a placeholder
                st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
        else:
            # If the file doesn't exist, show a placeholder
            st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")

    with col2:
        mistral_content = """
        <h3>Mistral-OCR-Latest: State-of-the-Art</h3>

        <p>The Mistral-OCR model represents a significant advancement:</p>
        <ul>
        <li><strong>Multimodal Understanding</strong>: Processes both visual and textual information</li>
        <li><strong>Contextual Awareness</strong>: Considers historical context</li>
        <li><strong>Layout Recognition</strong>: Preserves complex document structures</li>
        <li><strong>Historical Font Adaptation</strong>: Trained on diverse historical typography</li>
        </ul>
        """
        gray_container(mistral_content)

        # Check if the workflow image is available and display it
        workflow_path = input_dir / "workflow.jpg"

        if workflow_path.exists():
            try:
                from PIL import Image
                with Image.open(workflow_path) as img:
                    st.image(img, caption="Mistral OCR workflow", use_column_width=True)
            except Exception:
                # If there's an error, just show a placeholder
                st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
        else:
            # If the file doesn't exist, show a placeholder
            st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")

    # Practical workflow section
    workflow_content = """
    <h3>Practical Workflow</h3>

    <p>A typical historical OCR workflow with Mistral-OCR includes:</p>
    <ol>
    <li><strong>Selection</strong>: Choosing appropriate documents</li>
    <li><strong>Preprocessing</strong>: Enhancing images before OCR</li>
    <li><strong>OCR Processing</strong>: Running documents through vision-enhanced OCR</li>
    <li><strong>Post-processing</strong>: Cleaning up outputs and structured extraction</li>
    <li><strong>Verification</strong>: Cross-checking results against originals</li>
    <li><strong>Integration</strong>: Incorporating OCR outputs into research materials</li>
    </ol>
    """
    tool_container(workflow_content)

    # Methodological considerations
    st.subheader("Methodological Considerations")

    col1, col2 = st.columns([1, 1])

    with col1:
        advantages_content = """
        <h4>Advantages of Hybrid Approaches</h4>
        <ul>
        <li>Balance between automation and expert judgment</li>
        <li>Ability to process large volumes while preserving detail</li>
        <li>Context-sensitive analysis of complex documents</li>
        <li>Iterative improvement of results</li>
        </ul>
        """
        gray_container(advantages_content)

    with col2:
        limitations_content = """
        <h4>Limitations and Challenges</h4>
        <ul>
        <li>OCR errors requiring expert correction</li>
        <li>Bias in training data affecting recognition</li>
        <li>Complexity in evaluating OCR quality</li>
        <li>Technical infrastructure requirements</li>
        </ul>
        """
        gray_container(limitations_content)

    # Quote
    quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
    quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")
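The six-step "Practical Workflow" listed in workflow_content above reads as purely conceptual, so a minimal sketch of the same Selection, Preprocessing, OCR, Post-processing, Verification, Integration loop may help. Everything below is illustrative: the helper names are hypothetical, the OCR call is a stub rather than the Mistral API, and none of this code appears in the commit.

# Hypothetical sketch of the six-step workflow above; the OCR call is a stub
# and none of these helpers exist in this repository.
from pathlib import Path

def ocr_stub(image_bytes: bytes) -> str:
    """Stand-in for a vision-enhanced OCR call (step 3)."""
    return "example transcription of a historical page"

def run_workflow(input_dir: str) -> list[dict]:
    # 1. Selection: choose appropriate documents (here: all JPEGs in a folder)
    pages = sorted(Path(input_dir).glob("*.jpg"))
    records = []
    for page in pages:
        # 2. Preprocessing: load bytes; real code might deskew or denoise here
        image_bytes = page.read_bytes()
        # 3. OCR processing: run the page through the OCR engine
        raw_text = ocr_stub(image_bytes)
        # 4. Post-processing: clean up the output (whitespace normalization)
        text = " ".join(raw_text.split())
        # 5. Verification: flag implausibly short transcriptions for review
        needs_review = len(text) < 50
        # 6. Integration: keep a structured record for downstream research
        records.append({"page": page.name, "text": text, "needs_review": needs_review})
    return records

if __name__ == "__main__":
    print(run_workflow("input"))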
modules/content/module5.py
ADDED
@@ -0,0 +1,547 @@
import streamlit as st
import io
import tempfile
from pathlib import Path
from datetime import datetime
from layout import tool_container, key_concept, research_question, upload_container, gray_container
import sys

# Import the necessary modules for OCR processing
sys.path.append(str(Path(__file__).parent.parent))
try:
    from process_file import process_file as process_file_util
    process_file = process_file_util
except ImportError:
    # Fallback if process_file is not available
    def process_file(uploaded_file, use_vision=True, custom_prompt=None):
        """Fallback function for processing files"""
        st.warning("Using mock processing function. Real OCR functionality is not available.")
        return {
            "file_name": uploaded_file.name,
            "languages": ["English"],
            "topics": ["History", "Document"],
            "ocr_contents": {
                "content": f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
            }
        }

def render():
    """Module 5: Interactive OCR Experiment"""

    st.title("Module 5: Interactive OCR Experiment")

    # Introduction to the interactive experiment
    intro_content = """
    <h3>Interactive OCR Experiment</h3>
    <p>
    This interactive experiment allows you to process historical documents with OCR and analyze the results.
    Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
    </p>
    """
    st.markdown(intro_content, unsafe_allow_html=True)

    # Create tabs for different activities
    experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])

    # Try to import PDF tools if available
    try:
        from pdf2image import convert_from_bytes
        pdf_support = True
    except ImportError:
        pdf_support = False
        st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")

    with experiment_tab:
        # Create a two-column layout
        col1, col2 = st.columns([1, 1])

        with col1:
            # Tool container for document selection and options
            st.subheader("Step 1: Select Document & Options")

            # Processing options
            use_vision = st.checkbox("Use Vision Model", value=True,
                                     help="Use the vision model for improved analysis")

            # Additional prompt
            st.markdown("### Custom Research Prompt (Optional)")
            st.markdown("""Provide additional instructions to guide the OCR analysis.
            Focus on specific aspects of historical research you're interested in.""")
            custom_prompt = st.text_area("Research Prompt",
                                         placeholder="E.g., Focus on identifying dates and historical figures...",
                                         help="Optional instructions to guide the analysis")

            # Sample document selection
            input_dir = Path(__file__).parent.parent / "input"
            sample_choice = "Upload my own document"  # default; reassigned below when samples exist (avoids a NameError later)

            if input_dir.exists():
                sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))

                if sample_files:
                    st.markdown("#### Sample Documents")
                    sample_options = ["Upload my own document"] + [f.name for f in sample_files]
                    sample_choice = st.selectbox("Choose a document:", sample_options)

                    if sample_choice != "Upload my own document":
                        # Process the selected sample file
                        selected_file = next((f for f in sample_files if f.name == sample_choice), None)

                        if selected_file:
                            # Store the selected sample file in session state
                            with open(selected_file, "rb") as f:
                                file_bytes = f.read()

                            st.session_state.sample_file = {
                                "name": selected_file.name,
                                "bytes": file_bytes
                            }

                            # Preview the selected sample
                            if selected_file.suffix.lower() == ".pdf" and pdf_support:
                                try:
                                    with st.spinner("Generating PDF preview..."):
                                        images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
                                        if images:
                                            st.image(images[0], caption=f"Preview: {selected_file.name}", use_column_width=True)
                                except Exception:
                                    st.info(f"PDF selected: {selected_file.name}")
                            else:
                                # For images display directly
                                try:
                                    from PIL import Image
                                    img = Image.open(io.BytesIO(file_bytes))
                                    st.image(img, caption=f"Preview: {selected_file.name}", use_column_width=True)
                                except Exception:
                                    st.info(f"Selected: {selected_file.name}")
                    else:
                        # Clear the sample file if "Upload my own" is selected
                        if 'sample_file' in st.session_state:
                            del st.session_state.sample_file

                        # Display file uploader
                        upload_html = """
                        <h4>Upload a document to get started</h4>
                        <p>Supported formats: PDF, JPG, PNG</p>
                        """

                        upload_container(upload_html)
                        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")

                        if uploaded_file is not None:
                            # Display preview of the uploaded file
                            file_ext = Path(uploaded_file.name).suffix.lower()

                            if file_ext == ".pdf" and pdf_support:
                                try:
                                    # Convert first page of PDF to image for preview
                                    pdf_bytes = uploaded_file.getvalue()
                                    with st.spinner("Generating PDF preview..."):
                                        images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
                                        if images:
                                            st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
                                        else:
                                            st.info(f"PDF uploaded: {uploaded_file.name}")
                                except Exception:
                                    st.info(f"PDF uploaded: {uploaded_file.name}")
                            elif file_ext != ".pdf":
                                st.image(uploaded_file, use_column_width=True)
                            else:
                                st.info(f"PDF uploaded: {uploaded_file.name}")
                else:
                    # No sample files, just show the uploader
                    upload_html = """
                    <h4>Upload a document to get started</h4>
                    <p>Supported formats: PDF, JPG, PNG</p>
                    """

                    upload_container(upload_html)
                    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")

                    if uploaded_file is not None:
                        # Display the file preview
                        file_ext = Path(uploaded_file.name).suffix.lower()
                        if file_ext == ".pdf" and pdf_support:
                            try:
                                pdf_bytes = uploaded_file.getvalue()
                                with st.spinner("Generating PDF preview..."):
                                    images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
                                    if images:
                                        st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
                            except Exception:
                                st.info(f"PDF uploaded: {uploaded_file.name}")
                        elif file_ext != ".pdf":
                            st.image(uploaded_file, use_column_width=True)
                        else:
                            st.info(f"PDF uploaded: {uploaded_file.name}")
            else:
                # No input directory
                upload_html = """
                <h4>Upload a document to get started</h4>
                <p>Supported formats: PDF, JPG, PNG</p>
                """

                upload_container(upload_html)
                uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")

            # Process button
            st.subheader("Step 2: Process the Document")

            # Get the file to process (either uploaded or sample)
            file_to_process = None
            if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
                # Create a FileUploader-like object from the sample file
                class SampleFileObject:
                    def __init__(self, name, data):
                        self.name = name
                        self._data = data

                    def getvalue(self):
                        return self._data

                file_to_process = SampleFileObject(
                    st.session_state.sample_file["name"],
                    st.session_state.sample_file["bytes"]
                )
            elif 'uploaded_file' in locals() and uploaded_file is not None:
                file_to_process = uploaded_file

            # Process button
            process_button = st.button(
                "Process Document",
                disabled=file_to_process is None,
                use_container_width=True
            )

            if process_button and file_to_process is not None:
                with st.spinner("Processing document..."):
                    try:
                        # Process the file
                        result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)

                        if result:
                            st.success("Document processed successfully!")

                            # Store result in session state for display in the right column
                            st.session_state.current_result = result

                            # Add to processing history
                            history_item = {
                                "id": datetime.now().timestamp(),
                                "fileName": file_to_process.name,
                                "timestamp": datetime.now().isoformat(),
                                "result": result,
                                "useVision": use_vision
                            }

                            if 'processing_history' not in st.session_state:
                                st.session_state.processing_history = []

                            st.session_state.processing_history.append(history_item)

                            st.experimental_rerun()
                        else:
                            st.error("Failed to process document.")
                    except Exception as e:
                        st.error(f"Error processing document: {str(e)}")

            # Experiment instructions
            experiment_content = """
            <h3>Experiment Instructions</h3>
            <ol>
            <li><strong>Step 1:</strong> Select a document and choose your options</li>
            <li><strong>Step 2:</strong> Process the document with the selected options</li>
            <li><strong>Step 3:</strong> Analyze the results in the panel on the right</li>
            <li><strong>Step 4:</strong> Try again with different settings (e.g., toggle vision model)</li>
            <li><strong>Step 5:</strong> Compare results between different runs</li>
            </ol>
            """
            key_concept(experiment_content)

        with col2:
            # Results display
            st.subheader("Step 3: View Results")

            if 'current_result' in st.session_state and st.session_state.current_result:
                result = st.session_state.current_result

                # Display results in a tool container
                result_html = f"""
                <h4>Results for: {result.get('file_name', 'Unknown')}</h4>
                <p><strong>Languages:</strong> {', '.join(result.get('languages', ['Unknown']))}</p>
                <p><strong>Topics:</strong> {', '.join(result.get('topics', ['Unknown']))}</p>
                """
                tool_container(result_html)

                # Create tabs for different views
                tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])

                with tab1:
                    # Display in a more user-friendly format
                    if 'ocr_contents' in result:
                        if isinstance(result['ocr_contents'], dict):
                            for section, content in result['ocr_contents'].items():
                                if content:  # Only display non-empty sections
                                    st.markdown(f"#### {section.replace('_', ' ').title()}")

                                    if isinstance(content, str):
                                        st.markdown(content)
                                    elif isinstance(content, list):
                                        for item in content:
                                            if isinstance(item, str):
                                                st.markdown(f"- {item}")
                                            elif isinstance(item, dict):
                                                st.json(item)
                                    elif isinstance(content, dict):
                                        for k, v in content.items():
                                            st.markdown(f"**{k}:** {v}")

                with tab2:
                    # Show the raw JSON
                    st.json(result)

                # Download options
                st.markdown("### Export Results")

                col1, col2 = st.columns(2)

                with col1:
                    # Export as JSON
                    import json
                    json_bytes = json.dumps(result, indent=2).encode()
                    st.download_button(
                        label="Download JSON",
                        data=json_bytes,
                        file_name="ocr_results.json",
                        mime="application/json",
                        use_container_width=True
                    )

                with col2:
                    # Export as text if content is available
                    if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']:
                        text_content = result['ocr_contents']['content']
                        st.download_button(
                            label="Download Text",
                            data=text_content.encode(),
                            file_name="ocr_text.txt",
                            mime="text/plain",
                            use_container_width=True
                        )
            else:
                # Show placeholder when no results are available
                placeholder_html = """
                <h4>Results will appear here</h4>
                <p>Upload and process a document to see the OCR results in this panel.</p>
                <p>The OCR tool will:</p>
                <ol>
                <li>Extract text from your document</li>
                <li>Identify languages and topics</li>
                <li>Provide structured content analysis</li>
                <li>Generate downloadable results</li>
                </ol>
                """
                tool_container(placeholder_html)

        # Display processing history if available
        if 'processing_history' in st.session_state and st.session_state.processing_history:
            st.subheader("Step 4: Review Processing History")

            # Most recent result
            latest = st.session_state.processing_history[-1]
            latest_html = f"""
            <h4>Latest Document: {latest['fileName']}</h4>
            <p><strong>Processed at:</strong> {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
            <p><strong>Vision model used:</strong> {'Yes' if latest['useVision'] else 'No'}</p>
            """
            tool_container(latest_html)

            # History in expander
            with st.expander("View Complete Processing History"):
                for i, item in enumerate(reversed(st.session_state.processing_history)):
                    st.markdown(f"""
                    <div style="background-color: var(--color-gray-700); padding: 0.75rem; border-radius: 0.5rem; margin-bottom: 0.5rem;">
                    <strong>{item['fileName']}</strong><br>
                    {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')} -
                    Vision model: {'Yes' if item['useVision'] else 'No'}
                    </div>
                    """, unsafe_allow_html=True)

                    # Option to view a previous result
                    if st.button("View This Result", key=f"view_history_{i}"):
                        st.session_state.current_result = item['result']
                        st.experimental_rerun()

    # Compare tab for side-by-side comparison
    with compare_tab:
        st.subheader("Compare OCR Results")

        if 'processing_history' in st.session_state and len(st.session_state.processing_history) >= 2:
            st.markdown("""
            Select two processing results to compare side by side. This allows you to see
            how different options (like using the vision model) affect OCR quality.
            """)

            # Create selection dropdowns for the documents
            col1, col2 = st.columns(2)
            with col1:
                # First document selector
                doc_options_1 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
                                 for i, item in enumerate(st.session_state.processing_history)]
                doc_choice_1 = st.selectbox("First Document:", doc_options_1, key="compare_doc_1")
                doc_index_1 = int(doc_choice_1.split(":")[0]) - 1

            with col2:
                # Second document selector
                doc_options_2 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
                                 for i, item in enumerate(st.session_state.processing_history)]
                default_index = min(1, len(st.session_state.processing_history) - 1)  # Default to second item
                doc_choice_2 = st.selectbox("Second Document:", doc_options_2, key="compare_doc_2", index=default_index)
                doc_index_2 = int(doc_choice_2.split(":")[0]) - 1

            # Retrieve the selected documents
            doc1 = st.session_state.processing_history[doc_index_1]
            doc2 = st.session_state.processing_history[doc_index_2]

            # Show comparison
            col1, col2 = st.columns(2)

            with col1:
                doc1_html = f"""
                <h4>Document 1: {doc1['fileName']}</h4>
                <p><strong>Processed at:</strong> {datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
                <p><strong>Vision model used:</strong> {'Yes' if doc1['useVision'] else 'No'}</p>
                """
                tool_container(doc1_html)

                # Display content summary
                if 'ocr_contents' in doc1['result'] and isinstance(doc1['result']['ocr_contents'], dict):
                    if 'content' in doc1['result']['ocr_contents']:
                        content = doc1['result']['ocr_contents']['content']
                        st.markdown(f"""
                        <div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
                        border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">
                        {content[:500]}{'...' if len(content) > 500 else ''}
                        </div>
                        """, unsafe_allow_html=True)

            with col2:
                doc2_html = f"""
                <h4>Document 2: {doc2['fileName']}</h4>
                <p><strong>Processed at:</strong> {datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
                <p><strong>Vision model used:</strong> {'Yes' if doc2['useVision'] else 'No'}</p>
                """
                tool_container(doc2_html)

                # Display content summary
                if 'ocr_contents' in doc2['result'] and isinstance(doc2['result']['ocr_contents'], dict):
                    if 'content' in doc2['result']['ocr_contents']:
                        content = doc2['result']['ocr_contents']['content']
                        st.markdown(f"""
                        <div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
                        border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">
                        {content[:500]}{'...' if len(content) > 500 else ''}
                        </div>
                        """, unsafe_allow_html=True)

            # Comparison analysis
            if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
                comparison_content = """
                <h3>Vision vs. Non-Vision Model Comparison</h3>
                <p>You're comparing the same document processed with different models.
                This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.</p>

                <p>Look for these differences:</p>
                <ul>
                <li>Completeness of extracted text</li>
                <li>Accuracy of layout understanding</li>
                <li>Recognition of complex elements (tables, figures)</li>
                <li>Topic and language detection accuracy</li>
                </ul>
                """
                key_concept(comparison_content)
        else:
            need_more_content = """
            <h3>Need More Documents to Compare</h3>
            <p>Process at least two documents to enable side-by-side comparison. Try processing
            the same document with and without the vision model to see the differences in OCR quality.</p>
            """
            research_question(need_more_content)

    # Analysis guide tab
    with analyze_tab:
        st.subheader("Analysis Guide")

        st.markdown("""
### How to Analyze OCR Results

When analyzing OCR results from historical documents, consider these key factors:

1. **Text Accuracy**
   - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
   - Assess recognition of period-specific typography and writing styles
   - Evaluate handling of degraded or damaged text areas

2. **Structure Preservation**
   - Does the OCR maintain paragraph and section breaks?
   - Are columns and tabular data correctly preserved?
   - How well are page transitions handled?

3. **Special Elements**
   - Recognition of footnotes, marginalia, and annotations
   - Handling of illustrations, diagrams, and decorative elements
   - Treatment of watermarks, signatures, and stamps

4. **Metadata Extraction**
   - Accuracy of detected languages, topics, and document type
   - Identification of dates, names, and key entities
   - Recognition of document purpose and context
        """)

        col1, col2 = st.columns(2)

        with col1:
            challenge_content = """
            <h3>Common OCR Challenges</h3>
            <ul>
            <li><strong>Typography Variations</strong>: Historical fonts that differ from modern text</li>
            <li><strong>Material Degradation</strong>: Fading, stains, tears affecting legibility</li>
            <li><strong>Handwritten Elements</strong>: Marginalia, signatures, and annotations</li>
            <li><strong>Complex Layouts</strong>: Multi-column formats and decorative elements</li>
            <li><strong>Language and Terminology</strong>: Archaic terms and multilingual content</li>
            </ul>
            """
            gray_container(challenge_content)

        with col2:
            tips_content = """
            <h3>Making the Most of OCR Results</h3>
            <ul>
            <li><strong>Contextual Reading</strong>: Use context to interpret unclear passages</li>
            <li><strong>Error Patterns</strong>: Identify and correct systematic OCR errors</li>
            <li><strong>Hybrid Analysis</strong>: Combine OCR search with close reading</li>
            <li><strong>Comparative Processing</strong>: Try different settings on documents</li>
            <li><strong>Iterative Refinement</strong>: Use insights to improve future processing</li>
            </ul>
            """
            gray_container(tips_content)

        # Show example analysis if there's processing history
        if 'processing_history' in st.session_state and st.session_state.processing_history:
            with st.expander("Example Analysis from Your Documents"):
                # Pick the latest document
                latest = st.session_state.processing_history[-1]

                st.markdown(f"""
#### Sample Analysis for: {latest['fileName']}

**Document Context:**
- Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
- Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
- Vision model used: {'Yes' if latest['useVision'] else 'No'}

**What to Look For:**
1. Check how well the model identified key topics and languages
2. Evaluate the completeness of extracted text
3. Note any systematic errors in text recognition
4. Assess how well document structure was preserved
                """)
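One thing the "Compare Results" tab above leaves entirely to the eye is how much two OCR runs actually agree. If a rough number would help, the standard library can supply one. The snippet below is an illustrative sketch, not code from this commit: difflib's ratio() gives a similarity in [0, 1], a coarse agreement proxy rather than a proper character error rate, and the two sample strings are invented.

# Sketch: quantify agreement between two OCR transcripts of the same page.
# SequenceMatcher.ratio() is only a rough proxy for OCR quality.
from difflib import SequenceMatcher

def ocr_agreement(text_a: str, text_b: str) -> float:
    """Return a 0..1 similarity score between two OCR transcripts."""
    return SequenceMatcher(None, text_a, text_b).ratio()

vision_run = "The Magician, or Bottle Conjurer"
plain_run = "The Magiclan, or Botlle Conjurer"  # typical letter-shape confusions
print(f"agreement: {ocr_agreement(vision_run, plain_run):.2f}")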
modules/content/module6.py
ADDED
@@ -0,0 +1,154 @@
import streamlit as st
from layout import gray_container, key_concept, quote, tool_container
from datetime import datetime

def render():
    """Module 6: Conclusion and Future Directions"""

    st.title("Module 6: Conclusion and Future Directions")

    col1, col2 = st.columns([3, 2])

    with col1:
        summary_content = """
        <h3>Workshop Summary</h3>
        <p>Throughout this workshop, we've explored:</p>
        <ol>
        <li><strong>Text-Image Interdependence</strong>: The complex relationship between textual and visual elements</li>
        <li><strong>OCR Technology</strong>: The evolution of OCR and its application to historical materials</li>
        <li><strong>Methodological Approaches</strong>: Hybrid strategies for working with historical texts</li>
        <li><strong>Practical Application</strong>: Hands-on experience with OCR processing tools</li>
        </ol>
        """
        gray_container(summary_content)

        takeaways_content = """
        <h3>Key Takeaways</h3>
        <ol>
        <li><strong>OCR is Not Perfect</strong>: Even advanced AI models face challenges with historical documents</li>
        <li><strong>Context Matters</strong>: Vision-enhanced models provide better results by understanding document context</li>
        <li><strong>Hybrid Approaches</strong>: Combining computational methods with traditional research yields best results</li>
        <li><strong>Critical Evaluation</strong>: Always evaluate OCR outputs with awareness of limitations</li>
        <li><strong>Structured Extraction</strong>: Modern OCR goes beyond text recognition to understand document structure</li>
        </ol>
        """
        gray_container(takeaways_content)

    with col2:
        # Display workshop statistics if there's processing history
        if 'processing_history' in st.session_state and st.session_state.processing_history:
            st.subheader("Your Workshop Statistics")

            # Calculate statistics
            total_docs = len(st.session_state.processing_history)
            vision_docs = len([item for item in st.session_state.processing_history if item['useVision']])
            non_vision_docs = total_docs - vision_docs

            # Create metrics for statistics
            col1, col2 = st.columns(2)

            with col1:
                st.metric("Documents Processed", total_docs)
                st.metric("With Vision Model", vision_docs)

            with col2:
                st.metric("Without Vision Model", non_vision_docs)

            # Topic frequencies (rendered as a bar chart, not a word cloud)
            if total_docs > 0:
                st.subheader("Topics Encountered")
                all_topics = []
                for item in st.session_state.processing_history:
                    if 'topics' in item['result']:
                        all_topics.extend(item['result']['topics'])

                if all_topics:
                    # Count topic frequencies
                    topic_counts = {}
                    for topic in all_topics:
                        if topic in topic_counts:
                            topic_counts[topic] += 1
                        else:
                            topic_counts[topic] = 1

                    # Display as a horizontal bar chart
                    st.bar_chart(topic_counts)
        else:
            # Show placeholder stats
            placeholder_content = """
            <h3>Workshop Outcomes</h3>
            <p>Complete the interactive OCR experiment in Module 5 to generate your personal workshop statistics.</p>
            <p>You'll be able to see:</p>
            <ul>
            <li>Number of documents processed</li>
            <li>Comparison of vision vs. non-vision models</li>
            <li>Topics identified across your documents</li>
            <li>Performance metrics for your processing tasks</li>
            </ul>
            """
            tool_container(placeholder_content)

    # Future directions section
    st.subheader("Future Directions")

    col1, col2 = st.columns(2)

    with col1:
        tech_content = """
        <h3>Technological Developments</h3>
        <ul>
        <li><strong>Multimodal AI models</strong>: Increasingly sophisticated understanding</li>
        <li><strong>Historical font training</strong>: Models trained on historical typography</li>
        <li><strong>Document intelligence</strong>: Enhanced understanding of structures</li>
        <li><strong>Collaborative correction</strong>: Platforms for collective improvement</li>
        </ul>
        """
        gray_container(tech_content)

    with col2:
        research_content = """
        <h3>Research Applications</h3>
        <ul>
        <li><strong>Large-scale corpus analysis</strong>: Processing entire archives</li>
        <li><strong>Multilingual historical research</strong>: Working across languages</li>
        <li><strong>Image-text integration</strong>: New methodologies for visual analysis</li>
        <li><strong>Computational paleography</strong>: AI-assisted handwriting analysis</li>
        </ul>
        """
        gray_container(research_content)

    # Inspiring quote
    quote_content = "The digital humanities are not about building, they're about sharing. The digital humanities are not about the digital at all. They're all about innovation and disruption. The digital humanities are really an insurgent humanities."
    quote(quote_content, "Matthew Kirschenbaum, Professor of Digital Humanities")

    # Additional resources
    resources_content = """
    <h3>Additional Resources</h3>
    <ul>
    <li><a href="https://docs.mistral.ai/" target="_blank">Mistral AI Documentation</a>: Learn more about the OCR models used in this workshop</li>
    <li><a href="https://readcoop.eu/transkribus/" target="_blank">Transkribus</a>: Platform for historical document transcription</li>
    <li><a href="https://ocr-d.de/en/" target="_blank">OCR-D</a>: Coordinated OCR research project for historical documents</li>
    <li><a href="https://scholar.google.com/scholar?q=historical+OCR" target="_blank">Historical OCR Research Papers</a>: Academic research on historical OCR</li>
    </ul>
    """
    tool_container(resources_content)

    # Acknowledgments
    st.subheader("Acknowledgments")

    acknowledgment_content = """
    <p>This workshop was designed as an educational resource for historians, archivists, and digital humanities scholars.</p>
    <p>It demonstrates the integration of modern AI vision-language models with historical research methodologies.</p>
    <p>Special thanks to the digital humanities community for continued innovation in computational approaches to historical research.</p>
    """
    st.markdown(acknowledgment_content, unsafe_allow_html=True)

    # Restart the workshop button
    if st.button("Start Workshop Again", use_container_width=True):
        # Reset the session state to start the workshop again
        if 'current_module' in st.session_state:
            st.session_state.current_module = 1

        # Do not reset the processing history
        st.experimental_rerun()
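The topic-frequency loop in module6.py is correct but verbose: collections.Counter does the same counting in one call, and since Counter is a dict subclass, st.bar_chart(topic_counts) would accept it unchanged. A small equivalent, offered only as a sketch with invented sample data:

# Equivalent of the manual topic_counts loop above. Counter is a dict
# subclass, so it can be passed straight to st.bar_chart().
from collections import Counter

all_topics = ["History", "Letters", "History", "Recipes", "History"]
topic_counts = Counter(all_topics)
print(topic_counts)  # Counter({'History': 3, 'Letters': 1, 'Recipes': 1})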
modules/educational_module.py
ADDED
@@ -0,0 +1,547 @@
The added lines duplicate modules/content/module5.py above verbatim (the render() docstring still reads "Module 5: Interactive OCR Experiment"), so the listing is not repeated here.
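For anyone who wants to confirm the duplication in a local checkout, a two-line check will do; the paths below assume it is run from the repository root:

# Compare the two files byte for byte; True means identical content.
import filecmp

print(filecmp.cmp("modules/content/module5.py",
                  "modules/educational_module.py", shallow=False))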
|
524 |
+
</ul>
|
525 |
+
"""
|
526 |
+
gray_container(tips_content)
|
527 |
+
|
528 |
+
# Show example analysis if there's processing history
|
529 |
+
if 'processing_history' in st.session_state and st.session_state.processing_history:
|
530 |
+
with st.expander("Example Analysis from Your Documents"):
|
531 |
+
# Pick the latest document
|
532 |
+
latest = st.session_state.processing_history[-1]
|
533 |
+
|
534 |
+
st.markdown(f"""
|
535 |
+
#### Sample Analysis for: {latest['fileName']}
|
536 |
+
|
537 |
+
**Document Context:**
|
538 |
+
- Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
|
539 |
+
- Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
|
540 |
+
- Vision model used: {'Yes' if latest['useVision'] else 'No'}
|
541 |
+
|
542 |
+
**What to Look For:**
|
543 |
+
1. Check how well the model identified key topics and languages
|
544 |
+
2. Evaluate the completeness of extracted text
|
545 |
+
3. Note any systematic errors in text recognition
|
546 |
+
4. Assess how well document structure was preserved
|
547 |
+
""")
|
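The compare tab above asks readers to eyeball differences between two OCR runs; that judgment can also be made quantitative. Below is a minimal sketch, not part of this PR, using only the Python standard library: compare_ocr_outputs is a hypothetical helper that scores two extracted texts and tallies single-character substitutions, surfacing systematic errors such as "l" read as "1". In the app, the two inputs would be the content fields pulled from st.session_state.processing_history.

# Hypothetical helper for quantifying the comparison above (not part of this PR).
import difflib
from collections import Counter

def compare_ocr_outputs(text_a: str, text_b: str):
    """Return a similarity ratio and the most common character substitutions."""
    matcher = difflib.SequenceMatcher(None, text_a, text_b)
    substitutions = Counter()
    for op, a0, a1, b0, b1 in matcher.get_opcodes():
        # Count only same-length replacements as character-level substitutions
        if op == "replace" and (a1 - a0) == (b1 - b0):
            for ca, cb in zip(text_a[a0:a1], text_b[b0:b1]):
                substitutions[(ca, cb)] += 1
    return matcher.ratio(), substitutions.most_common(5)

# Example: a classic OCR confusion between "1" and "l"
ratio, common_subs = compare_ocr_outputs("1 cup butter", "l cup butter")
print(f"similarity: {ratio:.2f}, top substitutions: {common_subs}")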
modules/modular_app.py
ADDED
@@ -0,0 +1,276 @@
import streamlit as st
from pathlib import Path
import sys
from layout import page_wrapper
from modules import get_module, get_module_name, module_names

# Set page configuration with dark theme
st.set_page_config(
    page_title="Historical OCR Workshop",
    page_icon="📜",
    layout="wide",
    initial_sidebar_state="collapsed"
)

# Initialize session state for workshop navigation
if 'current_module' not in st.session_state:
    st.session_state.current_module = 1

if 'workshop_started' not in st.session_state:
    st.session_state.workshop_started = False

if 'processing_history' not in st.session_state:
    st.session_state.processing_history = []

def navigate_to_module(module_number):
    """Navigate to a specific module"""
    st.session_state.current_module = module_number
    st.rerun()

# Welcome screen if workshop hasn't been started
if not st.session_state.workshop_started:
    def welcome_screen():
        """Renders the welcome/start screen"""
        # Hero section with eye-catching design
        st.markdown("""
        <div style="background: linear-gradient(135deg, #1E3A8A 0%, #2563EB 100%);
                    padding: 2rem; border-radius: 0.75rem; text-align: center;
                    margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.3);">
            <h1>Historical OCR Workshop</h1>
            <p style="font-size: 1.25rem;">Unlock the potential of historical documents with modern OCR technology</p>
        </div>
        """, unsafe_allow_html=True)

        # Introduction with cleaner layout
        col1, col2 = st.columns([3, 2])

        with col1:
            st.markdown("""
            <div style="background-color: #1f2937; padding: 1.5rem; border-radius: 0.75rem; margin-bottom: 1.5rem;">
                <h3>Workshop Overview</h3>

                This interactive workshop explores the application of OCR technology to historical documents,
                combining theoretical understanding with practical experience. Designed for historians,
                archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
            </div>
            """, unsafe_allow_html=True)

            st.markdown("""
            <div style="background-color: #374151; padding: 0.75rem; border-radius: 0.5rem;
                        margin: 1rem 0; border-left: 3px solid #3B82F6;">
                <h4>What is OCR?</h4>
                Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
                Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
                historical research and digital humanities.
            </div>
            """, unsafe_allow_html=True)

        with col2:
            # Add an engaging research question
            st.markdown("""
            <div style="background-color: #1E3A8A; color: white; padding: 0.75rem;
                        border-radius: 0.5rem; margin: 1rem 0; border-left: 3px solid #60A5FA;">
                <h4>For Historians:</h4>
                How might OCR technology transform our access to and interpretation of historical documents?
                What new research questions become possible when large archives become machine-readable?
            </div>
            """, unsafe_allow_html=True)

            # Display a sample historical document image
            input_dir = Path(__file__).parent / "input"
            sample_path = input_dir / "magellan-travels.jpg"
            if sample_path.exists():
                try:
                    from PIL import Image
                    with Image.open(sample_path) as img:
                        st.image(img, caption="Sample Historical Document", width=300)
                except Exception:
                    pass

        # What you'll learn section with visual learning outcomes
        st.markdown('<h3>What You\'ll Learn</h3>', unsafe_allow_html=True)

        # Create three columns for clean layout
        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("""
            <div style="background-color: #1f2937; padding: 1rem; border-radius: 0.5rem;">
                <h4>Conceptual Understanding</h4>

                - Text-image relationships in historical documents
                - Evolution of OCR technology
                - AI vision models for document analysis
                - Historical typography challenges
            </div>
            """, unsafe_allow_html=True)

        with col2:
            st.markdown("""
            <div style="background-color: #1f2937; padding: 1rem; border-radius: 0.5rem;">
                <h4>Methodological Approaches</h4>

                - Critical frameworks for OCR in historical research
                - Hybrid computational-traditional methods
                - Error analysis and interpretation
                - Contextual reading strategies
            </div>
            """, unsafe_allow_html=True)

        with col3:
            st.markdown("""
            <div style="background-color: #1f2937; padding: 1rem; border-radius: 0.5rem;">
                <h4>Practical Skills</h4>

                - Processing historical documents with OCR
                - Analyzing and structuring extracted information
                - Integrating OCR into research workflows
                - Building searchable archives
            </div>
            """, unsafe_allow_html=True)

        # Module overview
        st.markdown('<h3>Workshop Modules</h3>', unsafe_allow_html=True)

        # First row of modules
        col1, col2 = st.columns(2)

        with col1:
            for i in [1, 3, 5]:
                st.markdown(f"""
                <div style="background-color: #1f2937; border-radius: 8px; padding: 16px;
                            margin-bottom: 16px; border-top: 4px solid #3B82F6;">
                    <div style="background-color: #3B82F6; color: white; font-weight: bold;
                                padding: 4px 10px; border-radius: 12px; font-size: 0.9rem;
                                display: inline-block; margin-bottom: 8px;">Module {i}</div>
                    <div style="font-weight: 600; margin-bottom: 8px; font-size: 1.1rem; color: white;">
                        {module_names[i-1]}
                    </div>
                    <p>Module {i} of the historical OCR workshop.</p>
                </div>
                """, unsafe_allow_html=True)

        with col2:
            for i in [2, 4, 6]:
                st.markdown(f"""
                <div style="background-color: #1f2937; border-radius: 8px; padding: 16px;
                            margin-bottom: 16px; border-top: 4px solid #3B82F6;">
                    <div style="background-color: #3B82F6; color: white; font-weight: bold;
                                padding: 4px 10px; border-radius: 12px; font-size: 0.9rem;
                                display: inline-block; margin-bottom: 8px;">Module {i}</div>
                    <div style="font-weight: 600; margin-bottom: 8px; font-size: 1.1rem; color: white;">
                        {module_names[i-1]}
                    </div>
                    <p>Module {i} of the historical OCR workshop.</p>
                </div>
                """, unsafe_allow_html=True)

        # Inspirational quote
        st.markdown("""
        <div style="font-style: italic; color: #D1D5DB; padding: 0.5rem 1rem;
                    border-left: 3px solid #4B5563; margin: 1rem 0;">
            "The digital turn in historical research is not just about converting analog to digital;
            it's about transforming how we access, analyze, and interpret the past."
            <br/><br/>
            <span style="font-size:0.9rem; text-align:right; display:block;">— Dr. Jane Winters, Professor of Digital Humanities</span>
        </div>
        """, unsafe_allow_html=True)

        # Start button with enhanced styling
        st.markdown('<div style="text-align: center; margin-top: 2rem;">', unsafe_allow_html=True)
        col1, col2, col3 = st.columns([1, 2, 1])
        with col2:
            if st.button("Begin Workshop Journey", key="start_workshop", type="primary", use_container_width=True):
                st.session_state.workshop_started = True
                st.rerun()
            st.markdown('<p style="text-align:center; margin-top:8px; font-size:0.9rem; color:#666;">No installation required • Start immediately</p>', unsafe_allow_html=True)
        st.markdown('</div>', unsafe_allow_html=True)

    # Display the welcome screen (outside modules)
    welcome_screen()
else:
    # Get the current module to display
    current_module = st.session_state.current_module
    module = get_module(current_module)

    # Create navigation callbacks for the page wrapper
    def nav_to_prev():
        if current_module > 1:
            st.session_state.current_module = current_module - 1
            st.rerun()

    def nav_to_next():
        if current_module < 6:
            st.session_state.current_module = current_module + 1
            st.rerun()

    # Create the sidebar navigation
    with st.sidebar:
        st.markdown("<h1>Workshop Navigation</h1>", unsafe_allow_html=True)

        # Visual header
        st.markdown("<div style='display:flex; align-items:center; margin-bottom:20px;'>", unsafe_allow_html=True)

        # Show a progress indicator
        st.markdown(f"<div><b>Your Progress:</b> Module {current_module} of 6</div>", unsafe_allow_html=True)
        st.progress(current_module / 6)

        # Module navigation buttons
        st.markdown("<h3>Modules</h3>", unsafe_allow_html=True)

        for i, name in enumerate(module_names, 1):
            btn_style = "primary" if i == current_module else "secondary"
            if st.button(f"{i}: {name}", key=f"nav_module_{i}", type=btn_style, use_container_width=True):
                st.session_state.current_module = i
                st.rerun()

        # About the workshop in a collapsible section
        with st.expander("About the Workshop"):
            st.markdown("""
            This interactive workshop explores OCR technology for historical documents.

            **How to use this workshop:**
            1. Navigate through modules sequentially
            2. Expand content sections to read more
            3. Try the interactive OCR experiment
            4. Reflect on research questions

            For help or more information, use the reference materials in Module 6.
            """)

        # Processing history if available
        if st.session_state.processing_history:
            with st.expander("Your Activity"):
                st.markdown(f"<b>Documents processed:</b> {len(st.session_state.processing_history)}", unsafe_allow_html=True)

                # Show the most recent document processed
                latest = st.session_state.processing_history[-1]
                st.markdown(f"""
                <div style="background:#f9f9f9; padding:8px; border-radius:4px; margin-top:10px; color:#333;">
                    <b>Latest document:</b> {latest['fileName']}<br>
                    <span style="font-size:0.9rem;">Processed with {'vision model' if latest['useVision'] else 'basic OCR'}</span>
                </div>
                """, unsafe_allow_html=True)

    # Render the current module content using the page wrapper
    page_wrapper(module.render, current_module)

# At the bottom of the page, create the hidden navigation buttons for the fixed navigation bar
if st.session_state.workshop_started:
    # Previous navigation button (hidden, activated by the fixed nav)
    if st.session_state.current_module > 1:
        if st.button("←", key=f"nav_prev_{st.session_state.current_module-1}", label_visibility="collapsed"):
            st.session_state.current_module -= 1
            st.rerun()

    # Next navigation button (hidden, activated by the fixed nav)
    if st.session_state.current_module < 6:
        if st.button("→", key=f"nav_next_{st.session_state.current_module+1}", label_visibility="collapsed"):
            st.session_state.current_module += 1
            st.rerun()

    # Module navigation dots (hidden, activated by the fixed nav)
    for i in range(1, 7):
        if st.button(f"{i}", key=f"nav_dot_{i}", label_visibility="collapsed"):
            st.session_state.current_module = i
            st.rerun()
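The navigation logic in modular_app.py reduces to one pattern: st.session_state.current_module is the single source of truth, and every button simply mutates it and calls st.rerun(). A stripped-down sketch of that pattern follows, with placeholder page names rather than the workshop's actual module titles:

# Minimal standalone sketch of the session-state navigation pattern above.
# Page names here are placeholders, not the workshop's module titles.
import streamlit as st

PAGES = ["Welcome", "Fundamentals", "Hands-on OCR"]

if "page" not in st.session_state:
    st.session_state.page = 0  # index into PAGES; the single source of truth

st.progress((st.session_state.page + 1) / len(PAGES))
st.header(PAGES[st.session_state.page])

prev_col, next_col = st.columns(2)
with prev_col:
    if st.button("Previous", disabled=st.session_state.page == 0):
        st.session_state.page -= 1
        st.rerun()  # mutate state, then rerun so the new page renders
with next_col:
    if st.button("Next", disabled=st.session_state.page == len(PAGES) - 1):
        st.session_state.page += 1
        st.rerun()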
ocr_utils.py
ADDED
@@ -0,0 +1,212 @@
"""
Utility functions for OCR processing with Mistral AI.
Contains helper functions for working with OCR responses and image handling.
"""

import json
import base64
import io
from pathlib import Path
from typing import Dict, List, Optional, Union, Any

try:
    from PIL import Image
    PILLOW_AVAILABLE = True
except ImportError:
    PILLOW_AVAILABLE = False

from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """
    Replace image placeholders in markdown with base64-encoded images.

    Args:
        markdown_str: Markdown text containing image placeholders
        images_dict: Dictionary mapping image IDs to base64 strings

    Returns:
        Markdown text with images replaced by base64 data
    """
    for img_name, base64_str in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
        )
    return markdown_str

def get_combined_markdown(ocr_response) -> str:
    """
    Combine OCR text and images into a single markdown document.
    Ensures proper spacing between text and images.

    Args:
        ocr_response: Response from OCR processing containing text and images
            See https://docs.mistral.ai/capabilities/document/ for API reference

    Returns:
        Combined markdown string with embedded images
    """
    markdowns: list[str] = []
    # Extract images from page
    for page in ocr_response.pages:
        image_data = {}
        for img in page.images:
            image_data[img.id] = img.image_base64

        # Replace image placeholders with actual images
        page_markdown = replace_images_in_markdown(page.markdown, image_data)

        # Ensure proper spacing between paragraphs and images
        # Add extra newlines between paragraphs to improve rendering
        page_markdown = page_markdown.replace("\n", "\n\n")

        # Add page separator for multi-page documents
        markdowns.append(page_markdown)

    # Join pages with clear separators for multi-page documents
    return "\n\n---\n\n".join(markdowns)

def encode_image_for_api(image_path: Union[str, Path]) -> str:
    """
    Encode an image as base64 for API use.

    Args:
        image_path: Path to the image file

    Returns:
        Base64 data URL for the image
    """
    # Convert to Path object if string
    image_file = Path(image_path) if isinstance(image_path, str) else image_path

    # Verify image exists
    if not image_file.is_file():
        raise FileNotFoundError(f"Image file not found: {image_file}")

    # Encode image as base64
    encoded = base64.b64encode(image_file.read_bytes()).decode()
    return f"data:image/jpeg;base64,{encoded}"

def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
    """
    Process an image with OCR and return the response.

    Args:
        client: Mistral AI client
        image_path: Path to the image file
        model: OCR model to use

    Returns:
        OCR response object
    """
    # Encode image as base64
    base64_data_url = encode_image_for_api(image_path)

    # Process image with OCR
    image_response = client.ocr.process(
        document=ImageURLChunk(image_url=base64_data_url),
        model=model
    )

    return image_response

def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
    """
    Convert OCR response to a formatted JSON string.

    Args:
        ocr_response: OCR response object
        indent: Indentation level for JSON formatting

    Returns:
        Formatted JSON string
    """
    # Convert response to JSON
    response_dict = json.loads(ocr_response.model_dump_json())
    return json.dumps(response_dict, indent=indent)

def get_combined_markdown_compressed(ocr_response, max_width: int = 800, quality: int = 85) -> str:
    """
    Combine OCR text and images into a single markdown document with compressed images.
    Reduces image sizes to improve performance.

    Args:
        ocr_response: Response from OCR processing containing text and images
        max_width: Maximum width to resize images to (preserves aspect ratio)
        quality: JPEG quality (0-100) for compression

    Returns:
        Combined markdown string with embedded compressed images
    """
    if not PILLOW_AVAILABLE:
        # Fall back to regular method if PIL is not available
        return get_combined_markdown(ocr_response)

    markdowns: list[str] = []

    # Process each page
    for page in ocr_response.pages:
        image_data = {}

        # Process and compress each image
        for img in page.images:
            try:
                # Decode base64 image
                img_bytes = base64.b64decode(img.image_base64.split(',')[1] if ',' in img.image_base64 else img.image_base64)

                # Open with PIL
                pil_img = Image.open(io.BytesIO(img_bytes))

                # Resize if needed (maintain aspect ratio)
                original_width, original_height = pil_img.size
                if original_width > max_width:
                    ratio = max_width / original_width
                    new_height = int(original_height * ratio)
                    pil_img = pil_img.resize((max_width, new_height), Image.LANCZOS)

                # Convert to bytes with compression
                buffer = io.BytesIO()
                format = pil_img.format if pil_img.format else 'JPEG'
                if format.upper() == 'JPEG' or format.upper() == 'JPG':
                    pil_img.save(buffer, format=format, quality=quality, optimize=True)
                else:
                    # For non-JPEG formats (PNG, etc.)
                    pil_img.save(buffer, format=format, optimize=True)

                # Convert back to base64
                compressed_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
                mime_type = f"image/{format.lower()}" if format else "image/jpeg"
                image_data[img.id] = f"data:{mime_type};base64,{compressed_base64}"

            except Exception:
                # If compression fails, use original image
                image_data[img.id] = img.image_base64

        # Replace image placeholders with compressed images
        page_markdown = replace_images_in_markdown(page.markdown, image_data)

        # Ensure proper spacing between paragraphs and images
        page_markdown = page_markdown.replace("\n", "\n\n")

        # Add page to list
        markdowns.append(page_markdown)

    # Join pages with clear separators
    return "\n\n---\n\n".join(markdowns)

# For display in notebooks
try:
    from IPython.display import Markdown, display

    def display_ocr_with_images(ocr_response):
        """
        Display OCR response with embedded images in IPython environments.

        Args:
            ocr_response: OCR response object
        """
        combined_markdown = get_combined_markdown(ocr_response)
        display(Markdown(combined_markdown))
except ImportError:
    # IPython not available
    pass
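A typical call sequence through these helpers might look like the sketch below. It rests on two assumptions not shown in this file: the Mistral client class from the mistralai package, and an API key supplied via a MISTRAL_API_KEY environment variable. The file paths are illustrative.

# Illustrative usage of the helpers above (assumes the mistralai
# Mistral client class and a MISTRAL_API_KEY environment variable).
import os
from mistralai import Mistral
from ocr_utils import process_image_with_ocr, get_combined_markdown_compressed, ocr_response_to_json

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

# Run OCR on a sample document (path is illustrative)
response = process_image_with_ocr(client, "input/letter-1.jpg")

# Render text plus compressed inline images as one markdown document
markdown = get_combined_markdown_compressed(response, max_width=600, quality=80)

# Or keep the raw structured response for later analysis
with open("output/letter-1.json", "w") as f:
    f.write(ocr_response_to_json(response))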
output/.gitkeep
ADDED
File without changes
output/example-1.html
ADDED
The diff for this file is too large to render.
See raw diff
output/recipe_test.json
ADDED
@@ -0,0 +1,16 @@
{
    "file_name": "img-0.jpeg",
    "topics": [
        "Cooking",
        "Recipes",
        "Baking"
    ],
    "languages": [
        "English"
    ],
    "ocr_contents": {
        "title": "Pecan Butterballs Cookies",
        "recipe": "1 cup butter, creamy if possible\n1/4 inch honey\n2 \" ounces flour\n1/2 teaspoon salt\n2 \" ounces pecans\n2 cups finely chopped pecans\nForm into small balls, bake at 300 40-45 min roll in uncoated sugar"
    },
    "confidence_score": 0.85,
    "raw_response":
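Note that the diff cuts the file off at the raw_response key. Assuming the complete file on disk is valid JSON, downstream scripts can consume this structured result directly; a small illustrative sketch:

# Sketch: reading the structured OCR result shown above
# (assumes the truncated raw_response field is complete on disk).
import json

with open("output/recipe_test.json") as f:
    result = json.load(f)

print(result["ocr_contents"]["title"])       # "Pecan Butterballs Cookies"
print(", ".join(result["topics"]))           # "Cooking, Recipes, Baking"
print(f"confidence: {result['confidence_score']:.0%}")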
output/ymca-letter.jpg
ADDED
Git LFS Details