milwright committed on
Commit f475c01 · verified · 1 Parent(s): e99f9b5

submit pull for merge (#1)


- submit pull for merge (85bdb4e2f53788772ec789554db9a859d4a957e8)

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +17 -0
  2. README.md +91 -8
  3. __pycache__/config.cpython-312.pyc +0 -0
  4. __pycache__/ocr_utils.cpython-312.pyc +0 -0
  5. __pycache__/process_file.cpython-312.pyc +0 -0
  6. __pycache__/structured_ocr.cpython-312.pyc +0 -0
  7. app.py +525 -102
  8. backup/app.py +535 -0
  9. backup/config.py +17 -0
  10. backup/input/The Magician, or Bottle Cungerer.jpeg +3 -0
  11. backup/input/baldwin-letter-1.jpg +3 -0
  12. backup/input/baldwin-letter-2.jpg +3 -0
  13. backup/input/flier.png +0 -0
  14. backup/input/letter-1.jpg +3 -0
  15. backup/input/letter-2.jpg +3 -0
  16. backup/input/letter-3.jpg +3 -0
  17. backup/input/magellan-travels.jpg +3 -0
  18. backup/input/menu.pdf +3 -0
  19. backup/input/recipe.jpg +0 -0
  20. backup/ocr_utils.py +136 -0
  21. backup/pdf_ocr.py +76 -0
  22. backup/requirements.txt +10 -0
  23. backup/structured_ocr.py +414 -0
  24. config.py +6 -3
  25. input/The Magician, or Bottle Cungerer.jpeg +3 -0
  26. input/a-la-carte.pdf +3 -0
  27. input/flier.png +0 -0
  28. input/handwritten-letter.jpg +3 -0
  29. input/letter-1.jpg +3 -0
  30. input/letter-2.jpg +3 -0
  31. input/letter-3.jpg +3 -0
  32. input/magician-satire.jpg +3 -0
  33. input/menu.pdf +3 -0
  34. input/milgram-flier.png +0 -0
  35. input/okeefe-recipe.jpg +0 -0
  36. input/recipe.jpg +0 -0
  37. modules/content/__init__.py +36 -0
  38. modules/content/module1.py +85 -0
  39. modules/content/module2.py +88 -0
  40. modules/content/module3.py +106 -0
  41. modules/content/module4.py +124 -0
  42. modules/content/module5.py +547 -0
  43. modules/content/module6.py +154 -0
  44. modules/educational_module.py +547 -0
  45. modules/modular_app.py +276 -0
  46. ocr_utils.py +212 -0
  47. output/.gitkeep +0 -0
  48. output/example-1.html +0 -0
  49. output/recipe_test.json +16 -0
  50. output/ymca-letter.jpg +3 -0
.gitattributes CHANGED
@@ -37,3 +37,20 @@ input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
 input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
 input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
 input/okeefe-menu.pdf filter=lfs diff=lfs merge=lfs -text
+backup/input/baldwin-letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/baldwin-letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/magellan-travels.jpg filter=lfs diff=lfs merge=lfs -text
+backup/input/menu.pdf filter=lfs diff=lfs merge=lfs -text
+backup/input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
+input/a-la-carte.pdf filter=lfs diff=lfs merge=lfs -text
+input/handwritten-letter.jpg filter=lfs diff=lfs merge=lfs -text
+input/letter-1.jpg filter=lfs diff=lfs merge=lfs -text
+input/letter-2.jpg filter=lfs diff=lfs merge=lfs -text
+input/letter-3.jpg filter=lfs diff=lfs merge=lfs -text
+input/magician-satire.jpg filter=lfs diff=lfs merge=lfs -text
+input/menu.pdf filter=lfs diff=lfs merge=lfs -text
+input/The[[:space:]]Magician,[[:space:]]or[[:space:]]Bottle[[:space:]]Cungerer.jpeg filter=lfs diff=lfs merge=lfs -text
+output/ymca-letter.jpg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Historical OCR
-emoji: 🚀
+emoji: 📜
 colorFrom: red
 colorTo: green
 sdk: streamlit
@@ -22,15 +22,78 @@ This application uses Mistral AI's OCR capabilities to transcribe and extract in
 - Structured output generation using Mistral models
 - Interactive web interface with Streamlit
 - Supports historical documents and manuscripts
+- PDF preview functionality for better user experience
+- Smart handling of large PDFs with automatic page limiting
+- Robust error handling with helpful messages
+- Image preprocessing options for enhanced OCR accuracy
+
+## Project Structure
+
+The project is organized as follows:
+
+```
+Historical OCR - Project Structure
+
+┌─ Main Applications
+│  ├─ app.py               # Standard Streamlit interface for OCR processing
+│  └─ streamlit_app.py     # Educational modular version with learning components
+
+├─ Core Functionality
+│  ├─ structured_ocr.py    # Main OCR processing engine with Mistral AI integration
+│  ├─ ocr_utils.py         # Utility functions for OCR text and image processing
+│  ├─ pdf_ocr.py           # PDF-specific document processing functionality
+│  └─ config.py            # Configuration settings and API keys
+
+├─ Testing & Development
+│  ├─ simple_test.py       # Basic OCR functionality test
+│  ├─ test_pdf.py          # PDF processing test
+│  ├─ test_pdf_preview.py  # PDF preview generation test
+│  └─ prepare_for_hf.py    # Prepare project for Hugging Face deployment
+
+├─ Scripts
+│  ├─ run_local.sh         # Launch standard or educational app locally
+│  ├─ run_large_files.sh   # Process large documents with optimized settings
+│  └─ setup_git.sh         # Configure Git repositories
+
+├─ Educational Modules (streamlit/)
+│  ├─ modules/
+│  │  ├─ module1.py        # Introduction and Problematization
+│  │  ├─ module2.py        # Historical Typography & OCR Challenges
+│  │  ├─ module3.py        # Document Analysis Techniques
+│  │  ├─ module4.py        # Processing Methods
+│  │  ├─ module5.py        # Research Applications
+│  │  └─ module6.py        # Future Directions
+│  │
+│  ├─ modular_app.py       # Learning module framework
+│  ├─ layout.py            # UI components for educational interface
+│  └─ process_file.py      # File processing for educational app
+
+├─ UI Components (ui/)
+│  └─ layout.py            # Shared UI components and styling
+
+├─ Data Directories
+│  ├─ input/               # Sample documents for testing/demo
+│  └─ output/              # Output directory for processed files
+
+└─ Dependencies
+   ├─ requirements.txt     # Python package dependencies
+   └─ packages.txt         # System-level dependencies
+```
 
 ## Setup for Local Development
 
-1. This directory is standalone and can be moved anywhere
-2. Install dependencies:
+1. Clone this repository
+2. Install system dependencies:
+   - For PDF processing, you need poppler:
+     - On macOS: `brew install poppler`
+     - On Ubuntu/Debian: `apt-get install poppler-utils`
+     - On Windows: Download from [poppler releases](https://github.com/oschwartz10612/poppler-windows/releases/) and add to PATH
+   - For text recognition: `tesseract-ocr`
+3. Install Python dependencies:
 ```
 pip install -r requirements.txt
 ```
-3. Set up your Mistral API key:
+4. Set up your Mistral API key:
 - Option 1: Create a `.env` file in this directory and add your Mistral API key:
 ```
 MISTRAL_API_KEY=your_api_key_here
@@ -40,7 +103,7 @@ pip install -r requirements.txt
 export MISTRAL_API_KEY=your_api_key_here
 ```
 - Get your API key from [Mistral AI Console](https://console.mistral.ai/api-keys/)
-4. Run the Streamlit app using the script:
+5. Run the Streamlit app using the script:
 ```
 ./run_local.sh
 ```
@@ -52,12 +115,32 @@ streamlit run app.py
 ## Usage
 
 1. Upload an image or PDF file using the file uploader
-2. Select processing options in the sidebar (e.g., use vision model)
+2. Select processing options in the sidebar (e.g., use vision model, image preprocessing)
 3. Click "Process Document" to analyze the file
 4. View the structured results and extract information
 
+## Application Versions
+
+Two versions of the application are available:
+
+1. **Standard Version** (`app.py`): Focused on document processing with a clean interface
+2. **Educational Version** (`streamlit_app.py`): Enhanced with educational modules and interactive components
+
+To run the educational version:
+```
+streamlit run streamlit_app.py
+```
+
 ## Deployment on Hugging Face Spaces
 
-This app is designed to be deployed on Hugging Face Spaces. The `README.md` contains the necessary configuration metadata.
+This app is designed to be deployed on Hugging Face Spaces. To deploy:
+
+1. Fork this repository to your GitHub account or directly create a new Space on [Hugging Face](https://huggingface.co/spaces)
+2. Connect your GitHub repository to your Hugging Face Space for automatic deployment
+3. Add your Mistral API key as a secret in your Hugging Face Space settings:
+   - Secret name: `HF_MISTRAL_API_KEY`
+   - Secret value: Your Mistral API key
+
+The `README.md` contains the necessary configuration metadata for Hugging Face Spaces.
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at [Hugging Face Spaces documentation](https://huggingface.co/docs/hub/spaces-config-reference)
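As a quick sanity check of the key setup the README above describes, here is a minimal sketch that mirrors the lookup in the `config.py` included later in this diff; the fallback order (Hugging Face secret, then local environment variable, then empty string) comes from that file's own comments:

```python
import os

# Mirrors config.py from this commit: prefer the Hugging Face Spaces secret,
# then the regular environment variable, else fall back to an empty string.
MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",
                                 os.environ.get("MISTRAL_API_KEY", ""))

if not MISTRAL_API_KEY:
    # app.py returns placeholder sample output in this case instead of failing.
    print("MISTRAL_API_KEY not set; the app will fall back to sample output.")
```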
__pycache__/config.cpython-312.pyc ADDED
Binary file (619 Bytes)

__pycache__/ocr_utils.cpython-312.pyc ADDED
Binary file (8.08 kB)

__pycache__/process_file.cpython-312.pyc ADDED
Binary file (2.86 kB)

__pycache__/structured_ocr.cpython-312.pyc ADDED
Binary file (16.5 kB)
 
app.py CHANGED
@@ -2,24 +2,105 @@ import os
 import streamlit as st
 import json
 import sys
+import time
 from pathlib import Path
 import tempfile
+import io
+from pdf2image import convert_from_bytes
+from PIL import Image, ImageEnhance, ImageFilter
+import cv2
+import numpy as np
 
 # Import the StructuredOCR class and config from the local files
 from structured_ocr import StructuredOCR
 from config import MISTRAL_API_KEY
 
+# Check for modular UI components
+try:
+    from ui.layout import tool_container, key_concept, research_question
+    MODULAR_UI = True
+except ImportError:
+    MODULAR_UI = False
+
 # Set page configuration
 st.set_page_config(
     page_title="Historical OCR",
-    page_icon="🚀",
+    page_icon="📜",
     layout="wide",
     initial_sidebar_state="expanded"
 )
 
+# Enable caching for expensive operations
+@st.cache_data(ttl=3600, show_spinner=False)
+def convert_pdf_to_images(pdf_bytes, dpi=150):
+    """Convert PDF bytes to a list of images with caching"""
+    try:
+        return convert_from_bytes(pdf_bytes, dpi=dpi)
+    except Exception as e:
+        st.error(f"Error converting PDF: {str(e)}")
+        return []
+
+@st.cache_data(ttl=3600, show_spinner=False)
+def preprocess_image(image_bytes, preprocessing_options):
+    """Preprocess image with selected options"""
+    # Convert bytes to OpenCV format
+    image = Image.open(io.BytesIO(image_bytes))
+    img_array = np.array(image)
+
+    # Apply preprocessing based on selected options
+    if preprocessing_options.get("grayscale", False):
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
+
+    if preprocessing_options.get("contrast", 0) != 0:
+        contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
+        image = Image.fromarray(img_array)
+        enhancer = ImageEnhance.Contrast(image)
+        image = enhancer.enhance(contrast_factor)
+        img_array = np.array(image)
+
+    if preprocessing_options.get("denoise", False):
+        img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
+
+    if preprocessing_options.get("threshold", False):
+        # Convert to grayscale if not already
+        if len(img_array.shape) == 3:
+            gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img_array
+        # Apply adaptive threshold
+        binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                       cv2.THRESH_BINARY, 11, 2)
+        # Convert back to RGB
+        img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
+
+    # Convert back to PIL Image
+    processed_image = Image.fromarray(img_array)
+
+    # Convert to bytes
+    byte_io = io.BytesIO()
+    processed_image.save(byte_io, format='PNG')
+    byte_io.seek(0)
+
+    return byte_io.getvalue()
+
 # Define functions
-def process_file(uploaded_file, use_vision=True):
-    """Process the uploaded file and return the OCR results"""
+def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
+    """Process the uploaded file and return the OCR results
+
+    Args:
+        uploaded_file: The uploaded file to process
+        use_vision: Whether to use vision model
+        preprocessing_options: Dictionary of preprocessing options
+    """
+    if preprocessing_options is None:
+        preprocessing_options = {}
+
+    # Show progress indicator
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    status_text.text("Preparing file for processing...")
+
     # Save the uploaded file to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
         tmp.write(uploaded_file.getvalue())
@@ -29,6 +110,8 @@ def process_file(uploaded_file, use_vision=True):
         # Check if API key is available
         if not MISTRAL_API_KEY:
             # Return dummy data if no API key
+            progress_bar.progress(100)
+            status_text.empty()
             return {
                 "file_name": uploaded_file.name,
                 "topics": ["Sample Document"],
@@ -38,7 +121,11 @@ def process_file(uploaded_file, use_vision=True):
                 "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
             }
         }
-
+
+        # Update progress
+        progress_bar.progress(20)
+        status_text.text("Initializing OCR processor...")
+
         # Initialize OCR processor
         processor = StructuredOCR()
 
@@ -46,9 +133,53 @@ def process_file(uploaded_file, use_vision=True):
         file_ext = Path(uploaded_file.name).suffix.lower()
         file_type = "pdf" if file_ext == ".pdf" else "image"
 
-        # Process the file
-        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision)
+        # Apply preprocessing if needed
+        if any(preprocessing_options.values()) and file_type == "image":
+            status_text.text("Applying image preprocessing...")
+            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+
+            # Save processed image to temp file
+            with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
+                proc_tmp.write(processed_bytes)
+                temp_path = proc_tmp.name
+
+        # Get file size in MB
+        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+
+        # Check if file exceeds API limits (50 MB)
+        if file_size_mb > 50:
+            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
+            return {
+                "file_name": uploaded_file.name,
+                "topics": ["Document"],
+                "languages": ["English"],
+                "confidence_score": 0.0,
+                "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                "ocr_contents": {
+                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                    "partial_text": "Document could not be processed due to size limitations."
+                }
+            }
+
+        # Update progress
+        progress_bar.progress(40)
+        status_text.text("Processing document with OCR...")
+
+        # Process the file with file size information for automatic page limiting
+        # Make sure we're using the latest mistral-ocr model
+        # See https://docs.mistral.ai/capabilities/document/ for more info
+        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
+
+        # Complete progress
+        progress_bar.progress(100)
+        status_text.empty()
+
         return result
+    except Exception as e:
+        progress_bar.progress(100)
+        status_text.empty()
+        st.error(f"Error during processing: {str(e)}")
+        raise
     finally:
         # Clean up the temporary file
        if os.path.exists(temp_path):
@@ -57,122 +188,414 @@ def process_file(uploaded_file, use_vision=True):
 # App title and description
 st.title("Historical Document OCR")
 st.subheader("Powered by Mistral AI")
-st.markdown("""
-This application uses Mistral AI's OCR capabilities to transcribe and extract information from historical documents.
-Upload an image or PDF file to get started.
-""")
+
+# Create main layout with tabs and columns
+main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
+
+with main_tab1:
+    # Create a two-column layout for file upload and preview
+    upload_col, preview_col = st.columns([1, 1])
+
+    # File uploader in the left column
+    with upload_col:
+        st.markdown("""
+        Upload an image or PDF file to get started.
+
+        Using the latest `mistral-ocr-latest` model for advanced document understanding.
+        """)
+        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 50MB per file")
 
 # Sidebar with options
 with st.sidebar:
     st.header("Options")
+
+    # Model options
+    st.subheader("Model Settings")
     use_vision = st.checkbox("Use Vision Model", value=True,
                              help="For image files, use the vision model for improved analysis (may be slower)")
 
-    st.markdown("---")
-    st.subheader("About")
+    # Image preprocessing options (collapsible)
+    st.subheader("Image Preprocessing")
+    with st.expander("Preprocessing Options"):
+        preprocessing_options = {}
+        preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
+                                                         help="Convert image to grayscale before OCR")
+        preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
+                                                         help="Apply adaptive thresholding to enhance text")
+        preprocessing_options["denoise"] = st.checkbox("Denoise Image",
+                                                       help="Remove noise from the image")
+        preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
+                                                      help="Adjust image contrast (-5 to +5)")
+
+    # PDF options (collapsible)
+    st.subheader("PDF Options")
+    with st.expander("PDF Settings"):
+        pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
+                            help="Higher DPI gives better quality but slower processing")
+        max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
+                                    help="Limit number of pages to process")
+
+# About tab content
+with main_tab2:
     st.markdown("""
-    This app uses Mistral AI's OCR API to extract text from historical documents.
+    ### About This Application
+
+    This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
 
     It can process:
     - Image files (jpg, png, etc.)
-    - PDF documents
+    - PDF documents (multi-page support)
 
-    The extracted content is processed into structured data based on the document type.
+    The extracted content is processed into structured data based on the document type, combining:
+    - Text extraction with `mistral-ocr-latest`
+    - Analysis with language models
+    - Layout preservation with images
+
+    View results in three formats:
+    - Structured HTML view
+    - Raw JSON (for developers)
+    - Markdown with images (preserves document layout)
+
+    **New Features:**
+    - Image preprocessing for better OCR quality
+    - PDF resolution and page controls
+    - Progress tracking during processing
     """)
 
-# File uploader
-uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
-
-if uploaded_file is not None:
-    # Display the uploaded file
-    st.subheader("Uploaded Document")
-    file_ext = Path(uploaded_file.name).suffix.lower()
-
-    col1, col2 = st.columns([1, 1])
-
-    with col1:
-        if file_ext == ".pdf":
-            st.info("Processing PDF document...")
-            # For PDFs, you might show preview of first page
-            st.write(f"File: {uploaded_file.name}")
-        else:
-            st.image(uploaded_file, use_container_width=True)
-
-    # Process button
-    process_button = st.button("Process Document")
-
-    if process_button:
-        with st.spinner("Processing document..."):
+with main_tab1:
+    if uploaded_file is not None:
+        # Check file size (cap at 50MB)
+        file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
+
+        if file_size_mb > 50:
+            with upload_col:
+                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
+                st.stop()
+
+        file_ext = Path(uploaded_file.name).suffix.lower()
+
+        # Display document preview in preview column
+        with preview_col:
+            st.subheader("Document Preview")
+            if file_ext == ".pdf":
+                try:
+                    # Convert first page of PDF to image for preview
+                    pdf_bytes = uploaded_file.getvalue()
+                    images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
+
+                    if images:
+                        # Convert PIL image to bytes for Streamlit
+                        first_page = images[0]
+                        img_bytes = io.BytesIO()
+                        first_page.save(img_bytes, format='JPEG')
+                        img_bytes.seek(0)
+
+                        # Display the PDF preview
+                        st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
+                    else:
+                        st.info(f"PDF uploaded: {uploaded_file.name}")
+                except Exception:
+                    # Simply show the file name without an error message
+                    st.info(f"PDF uploaded: {uploaded_file.name}")
+                    st.info("Click 'Process Document' to analyze the content.")
+            else:
+                st.image(uploaded_file, use_container_width=True)
+
+            # Add image preprocessing preview in a collapsible section if needed
+            if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
+                with st.expander("Image Preprocessing Preview"):
+                    preview_cols = st.columns(2)
+
+                    with preview_cols[0]:
+                        st.markdown("**Original Image**")
+                        st.image(uploaded_file, use_container_width=True)
+
+                    with preview_cols[1]:
+                        st.markdown("**Preprocessed Image**")
+                        try:
+                            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
+                            st.image(io.BytesIO(processed_bytes), use_container_width=True)
+                        except Exception as e:
+                            st.error(f"Error in preprocessing: {str(e)}")
+
+        # Process button - flush left with similar padding as file browser
+        with upload_col:
+            process_button = st.button("Process Document", use_container_width=True)
+
+        # Results section
+        if process_button:
             try:
-                # Process the file
-                result = process_file(uploaded_file, use_vision)
+                # Get max_pages or default if not available
+                max_pages_value = max_pages if 'max_pages' in locals() else None
 
-                # Display the results
-                with col2:
-                    st.subheader("Extracted Information")
-
-                    # Display file info
-                    st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
-
-                    # Display languages if available
-                    if 'languages' in result:
-                        languages = [lang for lang in result['languages'] if lang is not None]
-                        if languages:
-                            st.write(f"**Languages Detected:** {', '.join(languages)}")
-
-                    # Display topics if available
-                    if 'topics' in result and result['topics']:
-                        st.write(f"**Topics:** {', '.join(result['topics'])}")
-
-                    # Display the OCR contents
-                    st.subheader("Document Contents")
-                    if 'ocr_contents' in result:
-                        # Create tabs for different views
-                        tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
-
-                        with tab1:
-                            # Display in a more user-friendly format based on the content structure
-                            if isinstance(result['ocr_contents'], dict):
-                                for section, content in result['ocr_contents'].items():
-                                    if content:  # Only display non-empty sections
-                                        st.markdown(f"#### {section.replace('_', ' ').title()}")
-
-                                        if isinstance(content, str):
-                                            st.markdown(content)
-                                        elif isinstance(content, list):
-                                            for item in content:
-                                                if isinstance(item, str):
-                                                    st.markdown(f"- {item}")
-                                                elif isinstance(item, dict):
-                                                    st.json(item)
-                                        elif isinstance(content, dict):
-                                            for k, v in content.items():
-                                                st.markdown(f"**{k}:** {v}")
-
-                        with tab2:
-                            # Show the raw JSON for developers
-                            st.json(result)
-                    else:
-                        st.error("No OCR content was extracted from the document.")
+                # Call process_file with all options
+                result = process_file(uploaded_file, use_vision, preprocessing_options)
+
+                # Create results tabs for better organization
+                results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
+
+                with results_tab1:
+                    # Create two columns for metadata and content
+                    meta_col, content_col = st.columns([1, 2])
+
+                    with meta_col:
+                        st.subheader("Document Metadata")
+                        st.success("**Document processed successfully**")
+
+                        # Display file info
+                        st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
+
+                        # Display info if only limited pages were processed
+                        if 'limited_pages' in result:
+                            st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
+
+                        # Display languages if available
+                        if 'languages' in result:
+                            languages = [lang for lang in result['languages'] if lang is not None]
+                            if languages:
+                                st.write(f"**Languages:** {', '.join(languages)}")
+
+                        # Confidence score if available
+                        if 'confidence_score' in result:
+                            confidence = result['confidence_score']
+                            st.write(f"**OCR Confidence:** {confidence:.1%}")
+
+                        # Display topics if available
+                        if 'topics' in result and result['topics']:
+                            st.write(f"**Topics:** {', '.join(result['topics'])}")
+
+                    with content_col:
+                        st.subheader("Document Contents")
+                        if 'ocr_contents' in result:
+                            # Check if there are images in the OCR result
+                            has_images = False
+                            if 'raw_response' in result:
+                                try:
+                                    has_images = any(page.images for page in result['raw_response'].pages)
+                                except Exception:
+                                    has_images = False
+
+                            # Create tabs for different views
+                            if has_images:
+                                view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
+                            else:
+                                view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
+
+                            with view_tab1:
+                                # Display in a more user-friendly format based on the content structure
+                                html_content = ""
+                                if isinstance(result['ocr_contents'], dict):
+                                    for section, content in result['ocr_contents'].items():
+                                        if content:  # Only display non-empty sections
+                                            section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
+                                            html_content += section_title
+
+                                            if isinstance(content, str):
+                                                html_content += f"<p>{content}</p>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                st.markdown(content)
+                                            elif isinstance(content, list):
+                                                html_list = "<ul>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                for item in content:
+                                                    if isinstance(item, str):
+                                                        html_list += f"<li>{item}</li>"
+                                                        st.markdown(f"- {item}")
+                                                    elif isinstance(item, dict):
+                                                        html_list += f"<li>{json.dumps(item)}</li>"
+                                                        st.json(item)
+                                                html_list += "</ul>"
+                                                html_content += html_list
+                                            elif isinstance(content, dict):
+                                                html_dict = "<dl>"
+                                                st.markdown(f"#### {section.replace('_', ' ').title()}")
+                                                for k, v in content.items():
+                                                    html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
+                                                    st.markdown(f"**{k}:** {v}")
+                                                html_dict += "</dl>"
+                                                html_content += html_dict
+
+                                # Add download button in a smaller section
+                                with st.expander("Export Content"):
+                                    # Alternative download button
+                                    html_bytes = html_content.encode()
+                                    st.download_button(
+                                        label="Download as HTML",
+                                        data=html_bytes,
+                                        file_name="document_content.html",
+                                        mime="text/html"
+                                    )
+
+                            with view_tab2:
+                                # Show the raw JSON for developers
+                                st.json(result)
+
+                            if has_images:
+                                with view_tab3:
+                                    # Show loading indicator while preparing images
+                                    with st.spinner("Preparing document with embedded images..."):
+                                        try:
+                                            # Import function
+                                            try:
+                                                from ocr_utils import get_combined_markdown
+                                            except ImportError:
+                                                st.error("Required module ocr_utils not found.")
+                                                st.stop()
+
+                                            # Check if raw_response is available
+                                            if 'raw_response' not in result:
+                                                st.warning("Raw OCR response not available. Cannot display images.")
+                                                st.stop()
+
+                                            # Validate the raw_response structure before processing
+                                            if not hasattr(result['raw_response'], 'pages'):
+                                                st.warning("Invalid OCR response format. Cannot display images.")
+                                                st.stop()
+
+                                            # Get the combined markdown with images
+                                            # Set a flag to compress images if needed
+                                            compress_images = True
+                                            max_image_width = 800  # Maximum width for images
+
+                                            try:
+                                                # First try to get combined markdown with compressed images
+                                                if compress_images and hasattr(result['raw_response'], 'pages'):
+                                                    from ocr_utils import get_combined_markdown_compressed
+                                                    combined_markdown = get_combined_markdown_compressed(
+                                                        result['raw_response'],
+                                                        max_width=max_image_width,
+                                                        quality=85
+                                                    )
+                                                else:
+                                                    # Fall back to regular method if compression not available
+                                                    combined_markdown = get_combined_markdown(result['raw_response'])
+                                            except (ImportError, AttributeError):
+                                                # Fall back to regular method
+                                                combined_markdown = get_combined_markdown(result['raw_response'])
+
+                                            if not combined_markdown or combined_markdown.strip() == "":
+                                                st.warning("No image content found in the document.")
+                                                st.stop()
+
+                                            # Check if there are many images that might cause loading issues
+                                            image_count = sum(len(page.images) for page in result['raw_response'].pages if hasattr(page, 'images'))
+
+                                            # Add warning for image-heavy documents
+                                            if image_count > 10:
+                                                st.warning(f"This document contains {image_count} images. Rendering may take longer than usual.")
+
+                                            # Add CSS to ensure proper spacing and handling of text and images
+                                            st.markdown("""
+                                            <style>
+                                            .markdown-text-container {
+                                                padding: 10px;
+                                                background-color: #f9f9f9;
+                                                border-radius: 5px;
+                                            }
+                                            .markdown-text-container img {
+                                                margin: 15px 0;
+                                                max-width: 100%;
+                                                border: 1px solid #ddd;
+                                                border-radius: 4px;
+                                                display: block;
+                                            }
+                                            .markdown-text-container p {
+                                                margin-bottom: 16px;
+                                                line-height: 1.6;
+                                            }
+                                            /* Add lazy loading for images to improve performance */
+                                            .markdown-text-container img {
+                                                loading: lazy;
+                                            }
+                                            </style>
+                                            """, unsafe_allow_html=True)
+
+                                            # For very image-heavy documents, show images in a paginated way
+                                            if image_count > 20:
+                                                # Show image content in a paginated way
+                                                st.write("Document contains many images. Showing in a paginated format:")
+
+                                                # Split the combined markdown by page separators
+                                                pages = combined_markdown.split("---")
+
+                                                # Create a page selector
+                                                page_num = st.selectbox("Select page to view:",
+                                                                        options=list(range(1, len(pages)+1)),
+                                                                        index=0)
+
+                                                # Display only the selected page
+                                                st.markdown(f"""
+                                                <div class="markdown-text-container">
+                                                {pages[page_num-1]}
+                                                </div>
+                                                """, unsafe_allow_html=True)
+
+                                                # Add note about pagination
+                                                st.info(f"Showing page {page_num} of {len(pages)}. Select a different page from the dropdown above.")
+                                            else:
+                                                # Wrap the markdown in a div with the class for styling
+                                                st.markdown(f"""
+                                                <div class="markdown-text-container">
+                                                {combined_markdown}
+                                                </div>
+                                                """, unsafe_allow_html=True)
+
+                                            # Add a download button for the combined content
+                                            st.download_button(
+                                                label="Download with Images (HTML)",
+                                                data=f"""
+                                                <html>
+                                                <head>
+                                                <style>
+                                                body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
+                                                img {{ max-width: 100%; margin: 15px 0; }}
+                                                </style>
+                                                </head>
+                                                <body>
+                                                {combined_markdown}
+                                                </body>
+                                                </html>
+                                                """,
+                                                file_name="document_with_images.html",
+                                                mime="text/html"
+                                            )
+
+                                        except Exception as e:
+                                            st.error(f"Could not display document with images: {str(e)}")
+                                            st.info("Try refreshing or processing the document again.")
+                        else:
+                            st.error("No OCR content was extracted from the document.")
+
+                with results_tab2:
+                    st.subheader("Raw Processing Results")
+                    st.json(result)
 
             except Exception as e:
                 st.error(f"Error processing document: {str(e)}")
-else:
-    # Display sample images when no file is uploaded
-    st.info("Upload a document to get started.")
-
-    # Show example images
-    st.subheader("Example Documents")
-    col1, col2, col3 = st.columns([1, 1, 1])
-
-    # Find sample images from the input directory to display
-    input_dir = Path(__file__).parent / "input"
-    sample_images = []
-    if input_dir.exists():
-        sample_images = list(input_dir.glob("*.jpg"))[:3]  # Limit to 3 samples
-
-    if sample_images:
-        for i, img_path in enumerate(sample_images):
-            col = [col1, col2, col3][i % 3]
-            with col:
-                st.image(str(img_path), caption=img_path.name, use_container_width=True)
+    else:
+        # Display sample images in the main area when no file is uploaded
+        st.info("Upload a document to get started using the file uploader above.")
+
+        # Show example images in a grid
+        st.subheader("Example Documents")
+
+        # Add a sample images container
+        with st.container():
+            # Find sample images from the input directory to display
+            input_dir = Path(__file__).parent / "input"
+            sample_images = []
+            if input_dir.exists():
+                # Find valid jpg files (with size > 50KB to avoid placeholders)
+                sample_images = [
+                    path for path in input_dir.glob("*.jpg")
+                    if path.stat().st_size > 50000
+                ][:3]  # Limit to 3 samples
+
+            if sample_images:
+                columns = st.columns(3)
+                for i, img_path in enumerate(sample_images):
+                    with columns[i % 3]:
+                        try:
+                            st.image(str(img_path), caption=img_path.name, use_container_width=True)
+                        except Exception as e:
+                            st.error(f"Error loading image {img_path.name}: {str(e)}")
backup/app.py ADDED
@@ -0,0 +1,535 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ import json
4
+ import sys
5
+ import time
6
+ from pathlib import Path
7
+ import tempfile
8
+ import io
9
+ from pdf2image import convert_from_bytes
10
+ from PIL import Image, ImageEnhance, ImageFilter
11
+ import cv2
12
+ import numpy as np
13
+
14
+ # Import the StructuredOCR class and config from the local files
15
+ from structured_ocr import StructuredOCR
16
+ from config import MISTRAL_API_KEY
17
+
18
+ # Set page configuration
19
+ st.set_page_config(
20
+ page_title="Historical OCR",
21
+ page_icon="🚀",
22
+ layout="wide",
23
+ initial_sidebar_state="expanded"
24
+ )
25
+
26
+ # Enable caching for expensive operations
27
+ @st.cache_data(ttl=3600, show_spinner=False)
28
+ def convert_pdf_to_images(pdf_bytes, dpi=150):
29
+ """Convert PDF bytes to a list of images with caching"""
30
+ try:
31
+ return convert_from_bytes(pdf_bytes, dpi=dpi)
32
+ except Exception as e:
33
+ st.error(f"Error converting PDF: {str(e)}")
34
+ return []
35
+
36
+ @st.cache_data(ttl=3600, show_spinner=False)
37
+ def preprocess_image(image_bytes, preprocessing_options):
38
+ """Preprocess image with selected options"""
39
+ # Convert bytes to OpenCV format
40
+ image = Image.open(io.BytesIO(image_bytes))
41
+ img_array = np.array(image)
42
+
43
+ # Apply preprocessing based on selected options
44
+ if preprocessing_options.get("grayscale", False):
45
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
46
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
47
+
48
+ if preprocessing_options.get("contrast", 0) != 0:
49
+ contrast_factor = 1 + (preprocessing_options.get("contrast", 0) / 10)
50
+ image = Image.fromarray(img_array)
51
+ enhancer = ImageEnhance.Contrast(image)
52
+ image = enhancer.enhance(contrast_factor)
53
+ img_array = np.array(image)
54
+
55
+ if preprocessing_options.get("denoise", False):
56
+ img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
57
+
58
+ if preprocessing_options.get("threshold", False):
59
+ # Convert to grayscale if not already
60
+ if len(img_array.shape) == 3:
61
+ gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
62
+ else:
63
+ gray = img_array
64
+ # Apply adaptive threshold
65
+ binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
66
+ cv2.THRESH_BINARY, 11, 2)
67
+ # Convert back to RGB
68
+ img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
69
+
70
+ # Convert back to PIL Image
71
+ processed_image = Image.fromarray(img_array)
72
+
73
+ # Convert to bytes
74
+ byte_io = io.BytesIO()
75
+ processed_image.save(byte_io, format='PNG')
76
+ byte_io.seek(0)
77
+
78
+ return byte_io.getvalue()
79
+
80
+ # Define functions
81
+ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
82
+ """Process the uploaded file and return the OCR results
83
+
84
+ Args:
85
+ uploaded_file: The uploaded file to process
86
+ use_vision: Whether to use vision model
87
+ preprocessing_options: Dictionary of preprocessing options
88
+ """
89
+ if preprocessing_options is None:
90
+ preprocessing_options = {}
91
+
92
+ # Show progress indicator
93
+ progress_bar = st.progress(0)
94
+ status_text = st.empty()
95
+ status_text.text("Preparing file for processing...")
96
+
97
+ # Save the uploaded file to a temporary file
98
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
99
+ tmp.write(uploaded_file.getvalue())
100
+ temp_path = tmp.name
101
+
102
+ try:
103
+ # Check if API key is available
104
+ if not MISTRAL_API_KEY:
105
+ # Return dummy data if no API key
106
+ progress_bar.progress(100)
107
+ status_text.empty()
108
+ return {
109
+ "file_name": uploaded_file.name,
110
+ "topics": ["Sample Document"],
111
+ "languages": ["English"],
112
+ "ocr_contents": {
113
+ "title": "Sample Document",
114
+ "content": "This is sample content. To process real documents, please set the MISTRAL_API_KEY environment variable."
115
+ }
116
+ }
117
+
118
+ # Update progress
119
+ progress_bar.progress(20)
120
+ status_text.text("Initializing OCR processor...")
121
+
122
+ # Initialize OCR processor
123
+ processor = StructuredOCR()
124
+
125
+ # Determine file type from extension
126
+ file_ext = Path(uploaded_file.name).suffix.lower()
127
+ file_type = "pdf" if file_ext == ".pdf" else "image"
128
+
129
+ # Apply preprocessing if needed
130
+ if any(preprocessing_options.values()) and file_type == "image":
131
+ status_text.text("Applying image preprocessing...")
132
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
133
+
134
+ # Save processed image to temp file
135
+ with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as proc_tmp:
136
+ proc_tmp.write(processed_bytes)
137
+ temp_path = proc_tmp.name
138
+
139
+ # Get file size in MB
140
+ file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
141
+
142
+ # Check if file exceeds API limits (50 MB)
143
+ if file_size_mb > 50:
144
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size allowed by Mistral API is 50MB.")
145
+ return {
146
+ "file_name": uploaded_file.name,
147
+ "topics": ["Document"],
148
+ "languages": ["English"],
149
+ "confidence_score": 0.0,
150
+ "error": f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
151
+ "ocr_contents": {
152
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
153
+ "partial_text": "Document could not be processed due to size limitations."
154
+ }
155
+ }
156
+
157
+ # Update progress
158
+ progress_bar.progress(40)
159
+ status_text.text("Processing document with OCR...")
160
+
161
+ # Process the file with file size information for automatic page limiting
162
+ # Make sure we're using the latest mistral-ocr model
163
+ # See https://docs.mistral.ai/capabilities/document/ for more info
164
+ result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)
165
+
166
+ # Complete progress
167
+ progress_bar.progress(100)
168
+ status_text.empty()
169
+
170
+ return result
171
+ except Exception as e:
172
+ progress_bar.progress(100)
173
+ status_text.empty()
174
+ st.error(f"Error during processing: {str(e)}")
175
+ raise
176
+ finally:
177
+ # Clean up the temporary file
178
+ if os.path.exists(temp_path):
179
+ os.unlink(temp_path)
180
+
181
+ # App title and description
182
+ st.title("Historical Document OCR")
183
+ st.subheader("Powered by Mistral AI")
184
+
185
+ # Create main layout with tabs and columns
186
+ main_tab1, main_tab2 = st.tabs(["Document Processing", "About"])
187
+
188
+ with main_tab1:
189
+ # Create a two-column layout for file upload and preview
190
+ upload_col, preview_col = st.columns([1, 1])
191
+
192
+ # File uploader in the left column
193
+ with upload_col:
194
+ st.markdown("""
195
+ Upload an image or PDF file to get started.
196
+
197
+ Using the latest `mistral-ocr-latest` model for advanced document understanding.
198
+ """)
199
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
200
+
201
+ # Sidebar with options
202
+ with st.sidebar:
203
+ st.header("Options")
204
+
205
+ # Model options
206
+ st.subheader("Model Settings")
207
+ use_vision = st.checkbox("Use Vision Model", value=True,
208
+ help="For image files, use the vision model for improved analysis (may be slower)")
209
+
210
+ # Image preprocessing options (collapsible)
211
+ st.subheader("Image Preprocessing")
212
+ with st.expander("Preprocessing Options"):
213
+ preprocessing_options = {}
214
+ preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
215
+ help="Convert image to grayscale before OCR")
216
+ preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
217
+ help="Apply adaptive thresholding to enhance text")
218
+ preprocessing_options["denoise"] = st.checkbox("Denoise Image",
219
+ help="Remove noise from the image")
220
+ preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
221
+ help="Adjust image contrast (-5 to +5)")
222
+
223
+ # PDF options (collapsible)
224
+ st.subheader("PDF Options")
225
+ with st.expander("PDF Settings"):
226
+ pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
227
+ help="Higher DPI gives better quality but slower processing")
228
+ max_pages = st.number_input("Maximum Pages to Process", 1, 20, 5,
229
+ help="Limit number of pages to process")
230
+
231
+ # About tab content
232
+ with main_tab2:
233
+ st.markdown("""
234
+ ### About This Application
235
+
236
+ This app uses [Mistral AI's Document OCR](https://docs.mistral.ai/capabilities/document/) to extract text and images from historical documents.
237
+
238
+ It can process:
239
+ - Image files (jpg, png, etc.)
240
+ - PDF documents (multi-page support)
241
+
242
+ The extracted content is processed into structured data based on the document type, combining:
243
+ - Text extraction with `mistral-ocr-latest`
244
+ - Analysis with language models
245
+ - Layout preservation with images
246
+
247
+ View results in three formats:
248
+ - Structured HTML view
249
+ - Raw JSON (for developers)
250
+ - Markdown with images (preserves document layout)
251
+
252
+ **New Features:**
253
+ - Image preprocessing for better OCR quality
254
+ - PDF resolution and page controls
255
+ - Progress tracking during processing
256
+ """)
257
+
258
+ with main_tab1:
259
+ if uploaded_file is not None:
260
+ # Check file size (cap at 50MB)
261
+ file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
262
+
263
+ if file_size_mb > 50:
264
+ with upload_col:
265
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 50MB.")
266
+ st.stop()
267
+
268
+ file_ext = Path(uploaded_file.name).suffix.lower()
269
+
270
+ # Display document preview in preview column
271
+ with preview_col:
272
+ st.subheader("Document Preview")
273
+ if file_ext == ".pdf":
274
+ try:
275
+ # Convert first page of PDF to image for preview
276
+ pdf_bytes = uploaded_file.getvalue()
277
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
278
+
279
+ if images:
280
+ # Convert PIL image to bytes for Streamlit
281
+ first_page = images[0]
282
+ img_bytes = io.BytesIO()
283
+ first_page.save(img_bytes, format='JPEG')
284
+ img_bytes.seek(0)
285
+
286
+ # Display the PDF preview
287
+ st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
288
+ else:
289
+ st.info(f"PDF uploaded: {uploaded_file.name}")
290
+ except Exception:
291
+ # Simply show the file name without an error message
292
+ st.info(f"PDF uploaded: {uploaded_file.name}")
293
+ st.info("Click 'Process Document' to analyze the content.")
294
+ else:
295
+ st.image(uploaded_file, use_container_width=True)
296
+
297
+ # Add image preprocessing preview in a collapsible section if needed
298
+ if any(preprocessing_options.values()) and uploaded_file.type.startswith('image/'):
299
+ with st.expander("Image Preprocessing Preview"):
300
+ preview_cols = st.columns(2)
301
+
302
+ with preview_cols[0]:
303
+ st.markdown("**Original Image**")
304
+ st.image(uploaded_file, use_container_width=True)
305
+
306
+ with preview_cols[1]:
307
+ st.markdown("**Preprocessed Image**")
308
+ try:
309
+ processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
310
+ st.image(io.BytesIO(processed_bytes), use_container_width=True)
311
+ except Exception as e:
312
+ st.error(f"Error in preprocessing: {str(e)}")
313
+
314
+ # Process button - flush left with similar padding as file browser
315
+ with upload_col:
316
+ process_button = st.button("Process Document", use_container_width=True)
317
+
318
+ # Results section
319
+ if process_button:
320
+ try:
321
+ # Get max_pages or default if not available
322
+ max_pages_value = max_pages if 'max_pages' in locals() else None
323
+
324
+ # Call process_file with all options
325
+ result = process_file(uploaded_file, use_vision, preprocessing_options)
326
+
327
+ # Create results tabs for better organization
328
+ results_tab1, results_tab2 = st.tabs(["Document Analysis", "Technical Details"])
329
+
330
+ with results_tab1:
331
+ # Create two columns for metadata and content
332
+ meta_col, content_col = st.columns([1, 2])
333
+
334
+ with meta_col:
335
+ st.subheader("Document Metadata")
336
+ st.success("**Document processed successfully**")
337
+
338
+ # Display file info
339
+ st.write(f"**File Name:** {result.get('file_name', uploaded_file.name)}")
340
+
341
+ # Display info if only limited pages were processed
342
+ if 'limited_pages' in result:
343
+ st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")
344
+
345
+ # Display languages if available
346
+ if 'languages' in result:
347
+ languages = [lang for lang in result['languages'] if lang is not None]
348
+ if languages:
349
+ st.write(f"**Languages:** {', '.join(languages)}")
350
+
351
+ # Confidence score if available
352
+ if 'confidence_score' in result:
353
+ confidence = result['confidence_score']
354
+ st.write(f"**OCR Confidence:** {confidence:.1%}")
355
+
356
+ # Display topics if available
357
+ if 'topics' in result and result['topics']:
358
+ st.write(f"**Topics:** {', '.join(result['topics'])}")
359
+
360
+ with content_col:
361
+ st.subheader("Document Contents")
362
+ if 'ocr_contents' in result:
363
+ # Check if there are images in the OCR result
364
+ has_images = False
365
+ if 'raw_response' in result:
366
+ try:
367
+ has_images = any(page.images for page in result['raw_response'].pages)
368
+ except Exception:
369
+ has_images = False
370
+
371
+ # Create tabs for different views
372
+ if has_images:
373
+ view_tab1, view_tab2, view_tab3 = st.tabs(["Structured View", "Raw JSON", "With Images"])
374
+ else:
375
+ view_tab1, view_tab2 = st.tabs(["Structured View", "Raw JSON"])
376
+
377
+ with view_tab1:
378
+ # Display in a more user-friendly format based on the content structure
379
+ html_content = ""
380
+ if isinstance(result['ocr_contents'], dict):
381
+ for section, content in result['ocr_contents'].items():
382
+ if content: # Only display non-empty sections
383
+ section_title = f"<h4>{section.replace('_', ' ').title()}</h4>"
384
+ html_content += section_title
385
+
386
+ if isinstance(content, str):
387
+ html_content += f"<p>{content}</p>"
388
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
389
+ st.markdown(content)
390
+ elif isinstance(content, list):
391
+ html_list = "<ul>"
392
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
393
+ for item in content:
394
+ if isinstance(item, str):
395
+ html_list += f"<li>{item}</li>"
396
+ st.markdown(f"- {item}")
397
+ elif isinstance(item, dict):
398
+ html_list += f"<li>{json.dumps(item)}</li>"
399
+ st.json(item)
400
+ html_list += "</ul>"
401
+ html_content += html_list
402
+ elif isinstance(content, dict):
403
+ html_dict = "<dl>"
404
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
405
+ for k, v in content.items():
406
+ html_dict += f"<dt><strong>{k}</strong></dt><dd>{v}</dd>"
407
+ st.markdown(f"**{k}:** {v}")
408
+ html_dict += "</dl>"
409
+ html_content += html_dict
410
+
411
+ # Add download button in a smaller section
412
+ with st.expander("Export Content"):
413
+ # Alternative download button
414
+ html_bytes = html_content.encode()
415
+ st.download_button(
416
+ label="Download as HTML",
417
+ data=html_bytes,
418
+ file_name="document_content.html",
419
+ mime="text/html"
420
+ )
421
+
422
+ with view_tab2:
423
+ # Show the raw JSON for developers
424
+ st.json(result)
425
+
426
+ if has_images:
427
+ with view_tab3:
428
+ # Show loading indicator while preparing images
429
+ with st.spinner("Preparing document with embedded images..."):
430
+ try:
431
+ # Import function
432
+ try:
433
+ from ocr_utils import get_combined_markdown
434
+ except ImportError:
435
+ st.error("Required module ocr_utils not found.")
436
+ st.stop()
437
+
438
+ # Check if raw_response is available
439
+ if 'raw_response' not in result:
440
+ st.warning("Raw OCR response not available. Cannot display images.")
441
+ st.stop()
442
+
443
+ # Validate the raw_response structure before processing
444
+ if not hasattr(result['raw_response'], 'pages'):
445
+ st.warning("Invalid OCR response format. Cannot display images.")
446
+ st.stop()
447
+
448
+ # Get the combined markdown with images
449
+ combined_markdown = get_combined_markdown(result['raw_response'])
450
+
451
+ if not combined_markdown or combined_markdown.strip() == "":
452
+ st.warning("No image content found in the document.")
453
+ st.stop()
454
+
455
+ # Add CSS to ensure proper spacing and handling of text and images
456
+ st.markdown("""
457
+ <style>
458
+ .markdown-text-container {
459
+ padding: 10px;
460
+ background-color: #f9f9f9;
461
+ border-radius: 5px;
462
+ }
463
+ .markdown-text-container img {
464
+ margin: 15px 0;
465
+ max-width: 100%;
466
+ border: 1px solid #ddd;
467
+ border-radius: 4px;
468
+ display: block;
469
+ }
470
+ .markdown-text-container p {
471
+ margin-bottom: 16px;
472
+ line-height: 1.6;
473
+ }
474
+ </style>
475
+ """, unsafe_allow_html=True)
476
+
477
+ # Wrap the markdown in a div with the class for styling
478
+ st.markdown(f"""
479
+ <div class="markdown-text-container">
480
+ {combined_markdown}
481
+ </div>
482
+ """, unsafe_allow_html=True)
483
+
484
+ # Add a download button for the combined content
485
+ st.download_button(
486
+ label="Download with Images (HTML)",
487
+ data=f"""
488
+ <html>
489
+ <head>
490
+ <style>
491
+ body {{ font-family: Arial, sans-serif; line-height: 1.6; }}
492
+ img {{ max-width: 100%; margin: 15px 0; }}
493
+ </style>
494
+ </head>
495
+ <body>
496
+ {combined_markdown}
497
+ </body>
498
+ </html>
499
+ """,
500
+ file_name="document_with_images.html",
501
+ mime="text/html"
502
+ )
503
+
504
+ except Exception as e:
505
+ st.error(f"Could not display document with images: {str(e)}")
506
+ st.info("Try refreshing or processing the document again.")
507
+ else:
508
+ st.error("No OCR content was extracted from the document.")
509
+
510
+ with results_tab2:
511
+ st.subheader("Raw Processing Results")
512
+ st.json(result)
513
+
514
+ except Exception as e:
515
+ st.error(f"Error processing document: {str(e)}")
516
+ else:
517
+ # Display sample images in the main area when no file is uploaded
518
+ st.info("Upload a document to get started using the file uploader above.")
519
+
520
+ # Show example images in a grid
521
+ st.subheader("Example Documents")
522
+
523
+ # Add a sample images container
524
+ with st.container():
525
+ # Find sample images from the input directory to display
526
+ input_dir = Path(__file__).parent / "input"
527
+ sample_images = []
528
+ if input_dir.exists():
529
+ sample_images = list(input_dir.glob("*.jpg"))[:3] # Limit to 3 samples
530
+
531
+ if sample_images:
532
+ columns = st.columns(3)
533
+ for i, img_path in enumerate(sample_images):
534
+ with columns[i % 3]:
535
+ st.image(str(img_path), caption=img_path.name, use_container_width=True)
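
Note: the structured view above dispatches purely on the Python type of each `ocr_contents` value, so a result only needs to follow that shape to render. An illustrative payload (field names are examples, not a schema):

```python
# Illustrative ocr_contents payload for the structured view above:
# str values render as paragraphs, lists as bullets, dicts as key/value pairs.
example_ocr_contents = {
    "title": "Dinner Menu",                              # str  -> <p> + st.markdown
    "dishes": ["Consommé", "Roast duck", "Éclair"],      # list -> <ul> + bullet list
    "metadata": {"date": "1912", "language": "French"},  # dict -> <dl> + key/value
}
```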
backup/config.py ADDED
@@ -0,0 +1,17 @@
+ # config.py
+ """
+ Configuration file for Mistral OCR processing.
+ Contains API key and other settings.
+ """
+ import os
+
+ # Your Mistral API key - get from Hugging Face secrets or environment variable
+ # The priority order is: HF_MISTRAL_API_KEY env var > MISTRAL_API_KEY env var > empty string
+ # Note: No default API key is provided for security reasons
+ MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",                 # First check HF-specific env var
+                                  os.environ.get("MISTRAL_API_KEY", ""))  # Then check regular env var
+
+ # Model settings
+ OCR_MODEL = "mistral-ocr-latest"
+ TEXT_MODEL = "ministral-8b-latest"
+ VISION_MODEL = "pixtral-12b-latest"
backup/input/The Magician, or Bottle Cungerer.jpeg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
backup/input/baldwin-letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
backup/input/baldwin-letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
backup/input/flier.png ADDED
backup/input/letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
backup/input/letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
backup/input/letter-3.jpg ADDED

Git LFS Details

  • SHA256: 7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
  • Pointer size: 131 Bytes
  • Size of remote file: 231 kB
backup/input/magellan-travels.jpg ADDED

Git LFS Details

  • SHA256: ae3e860789e2c3c8032499e5326864294dbc1b01059169fd08203c980577010b
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
backup/input/menu.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
+ size 2554815
backup/input/recipe.jpg ADDED
backup/ocr_utils.py ADDED
@@ -0,0 +1,136 @@
+ """
+ Utility functions for OCR processing with Mistral AI.
+ Contains helper functions for working with OCR responses and image handling.
+ """
+
+ import json
+ import base64
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union
+
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+
+ def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+     """
+     Replace image placeholders in markdown with base64-encoded images.
+
+     Args:
+         markdown_str: Markdown text containing image placeholders
+         images_dict: Dictionary mapping image IDs to base64 strings
+
+     Returns:
+         Markdown text with images replaced by base64 data
+     """
+     for img_name, base64_str in images_dict.items():
+         markdown_str = markdown_str.replace(
+             f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+         )
+     return markdown_str
+
+ def get_combined_markdown(ocr_response) -> str:
+     """
+     Combine OCR text and images into a single markdown document.
+     Ensures proper spacing between text and images.
+
+     Args:
+         ocr_response: Response from OCR processing containing text and images
+             See https://docs.mistral.ai/capabilities/document/ for API reference
+
+     Returns:
+         Combined markdown string with embedded images
+     """
+     markdowns: list[str] = []
+     # Extract images from page
+     for page in ocr_response.pages:
+         image_data = {}
+         for img in page.images:
+             image_data[img.id] = img.image_base64
+
+         # Replace image placeholders with actual images
+         page_markdown = replace_images_in_markdown(page.markdown, image_data)
+
+         # Ensure proper spacing between paragraphs and images
+         # Add extra newlines between paragraphs to improve rendering
+         page_markdown = page_markdown.replace("\n", "\n\n")
+
+         # Add page separator for multi-page documents
+         markdowns.append(page_markdown)
+
+     # Join pages with clear separators for multi-page documents
+     return "\n\n---\n\n".join(markdowns)
+
+ def encode_image_for_api(image_path: Union[str, Path]) -> str:
+     """
+     Encode an image as base64 for API use.
+
+     Args:
+         image_path: Path to the image file
+
+     Returns:
+         Base64 data URL for the image
+     """
+     # Convert to Path object if string
+     image_file = Path(image_path) if isinstance(image_path, str) else image_path
+
+     # Verify image exists
+     if not image_file.is_file():
+         raise FileNotFoundError(f"Image file not found: {image_file}")
+
+     # Encode image as base64
+     encoded = base64.b64encode(image_file.read_bytes()).decode()
+     return f"data:image/jpeg;base64,{encoded}"
+
+ def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
+     """
+     Process an image with OCR and return the response.
+
+     Args:
+         client: Mistral AI client
+         image_path: Path to the image file
+         model: OCR model to use
+
+     Returns:
+         OCR response object
+     """
+     # Encode image as base64
+     base64_data_url = encode_image_for_api(image_path)
+
+     # Process image with OCR
+     image_response = client.ocr.process(
+         document=ImageURLChunk(image_url=base64_data_url),
+         model=model
+     )
+
+     return image_response
+
+ def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
+     """
+     Convert OCR response to a formatted JSON string.
+
+     Args:
+         ocr_response: OCR response object
+         indent: Indentation level for JSON formatting
+
+     Returns:
+         Formatted JSON string
+     """
+     # Convert response to JSON
+     response_dict = json.loads(ocr_response.model_dump_json())
+     return json.dumps(response_dict, indent=indent)
+
+ # For display in notebooks
+ try:
+     from IPython.display import Markdown, display
+
+     def display_ocr_with_images(ocr_response):
+         """
+         Display OCR response with embedded images in IPython environments.
+
+         Args:
+             ocr_response: OCR response object
+         """
+         combined_markdown = get_combined_markdown(ocr_response)
+         display(Markdown(combined_markdown))
+ except ImportError:
+     # IPython not available
+     pass
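
A minimal usage sketch for these helpers, assuming `MISTRAL_API_KEY` is set in the environment and using the repo's `input/letter-1.jpg` as the sample file:

```python
# Sketch: chaining the helpers above into a one-shot OCR call.
import os
from mistralai import Mistral
from ocr_utils import process_image_with_ocr, get_combined_markdown, ocr_response_to_json

client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

response = process_image_with_ocr(client, "input/letter-1.jpg")
print(get_combined_markdown(response)[:500])  # preview the combined markdown
print(ocr_response_to_json(response))         # or dump the full response as JSON
```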
backup/pdf_ocr.py ADDED
@@ -0,0 +1,76 @@
+ #!/usr/bin/env python3
+ """
+ PDFOCR - Module for processing PDF files with OCR and extracting structured data.
+ """
+
+ import json
+ from pathlib import Path
+ from structured_ocr import StructuredOCR
+
+ class PDFOCR:
+     """Class for processing PDF files with OCR and extracting structured data."""
+
+     def __init__(self, api_key=None):
+         """Initialize the PDF OCR processor."""
+         self.processor = StructuredOCR(api_key=api_key)
+
+     def process_pdf(self, pdf_path, use_vision=True):
+         """
+         Process a PDF file with OCR and extract structured data.
+
+         Args:
+             pdf_path: Path to the PDF file
+             use_vision: Whether to use vision model for improved analysis
+
+         Returns:
+             Dictionary with structured OCR results
+         """
+         pdf_path = Path(pdf_path)
+         if not pdf_path.exists():
+             raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+         return self.processor.process_file(pdf_path, file_type="pdf", use_vision=use_vision)
+
+     def save_json_output(self, pdf_path, output_path, use_vision=True):
+         """
+         Process a PDF file and save the structured output as JSON.
+
+         Args:
+             pdf_path: Path to the PDF file
+             output_path: Path where to save the JSON output
+             use_vision: Whether to use vision model for improved analysis
+
+         Returns:
+             Path to the saved JSON file
+         """
+         # Process the PDF
+         result = self.process_pdf(pdf_path, use_vision=use_vision)
+
+         # Save the result to JSON
+         output_path = Path(output_path)
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(output_path, 'w') as f:
+             json.dump(result, f, indent=2)
+
+         return output_path
+
+ # For testing directly
+ if __name__ == "__main__":
+     import sys
+
+     if len(sys.argv) < 2:
+         print("Usage: python pdf_ocr.py <pdf_path> [output_path]")
+         sys.exit(1)
+
+     pdf_path = sys.argv[1]
+     output_path = sys.argv[2] if len(sys.argv) > 2 else None
+
+     processor = PDFOCR()
+
+     if output_path:
+         result_path = processor.save_json_output(pdf_path, output_path)
+         print(f"Results saved to: {result_path}")
+     else:
+         result = processor.process_pdf(pdf_path)
+         print(json.dumps(result, indent=2))
backup/requirements.txt ADDED
@@ -0,0 +1,10 @@
+ streamlit>=1.43.2
+ mistralai>=0.0.7
+ pydantic>=2.0.0
+ pycountry>=23.12.11
+ pillow>=10.0.0
+ python-multipart>=0.0.6
+ pdf2image>=1.17.0
+ pytesseract>=0.3.10
+ opencv-python-headless>=4.6.0
+ numpy>=1.23.5
backup/structured_ocr.py ADDED
@@ -0,0 +1,414 @@
+ import os
+ import sys
+ import time
+ from enum import Enum
+ from pathlib import Path
+ import json
+ import base64
+ import pycountry
+ import logging
+ from pydantic import BaseModel
+ from mistralai import Mistral
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+ # Import utilities for OCR processing
+ try:
+     from ocr_utils import replace_images_in_markdown, get_combined_markdown
+ except ImportError:
+     # Define fallback functions if module not found
+     def replace_images_in_markdown(markdown_str, images_dict):
+         for img_name, base64_str in images_dict.items():
+             markdown_str = markdown_str.replace(
+                 f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+             )
+         return markdown_str
+
+     def get_combined_markdown(ocr_response):
+         markdowns = []
+         for page in ocr_response.pages:
+             image_data = {}
+             for img in page.images:
+                 image_data[img.id] = img.image_base64
+             markdowns.append(replace_images_in_markdown(page.markdown, image_data))
+         return "\n\n".join(markdowns)
+
+ # Import config directly (now local to historical-ocr)
+ from config import MISTRAL_API_KEY, OCR_MODEL, TEXT_MODEL, VISION_MODEL
+
+ # Create language enum for structured output
+ languages = {lang.alpha_2: lang.name for lang in pycountry.languages if hasattr(lang, 'alpha_2')}
+
+ class LanguageMeta(Enum.__class__):
+     def __new__(metacls, cls, bases, classdict):
+         for code, name in languages.items():
+             classdict[name.upper().replace(' ', '_')] = name
+         return super().__new__(metacls, cls, bases, classdict)
+
+ class Language(Enum, metaclass=LanguageMeta):
+     pass
+
+ class StructuredOCRModel(BaseModel):
+     file_name: str
+     topics: list[str]
+     languages: list[Language]
+     ocr_contents: dict
+
+ class StructuredOCR:
+     def __init__(self, api_key=None):
+         """Initialize the OCR processor with API key"""
+         self.api_key = api_key or MISTRAL_API_KEY
+         self.client = Mistral(api_key=self.api_key)
+
+     def process_file(self, file_path, file_type=None, use_vision=True, max_pages=None, file_size_mb=None, custom_pages=None):
+         """Process a file and return structured OCR results
+
+         Args:
+             file_path: Path to the file to process
+             file_type: 'pdf' or 'image' (will be auto-detected if None)
+             use_vision: Whether to use vision model for improved analysis
+             max_pages: Optional limit on number of pages to process
+             file_size_mb: Optional file size in MB (used for automatic page limiting)
+             custom_pages: Optional list of specific page numbers to process
+
+         Returns:
+             Dictionary with structured OCR results
+         """
+         # Convert file_path to Path object if it's a string
+         file_path = Path(file_path)
+
+         # Auto-detect file type if not provided
+         if file_type is None:
+             suffix = file_path.suffix.lower()
+             file_type = "pdf" if suffix == ".pdf" else "image"
+
+         # Get file size if not provided
+         if file_size_mb is None and file_path.exists():
+             file_size_mb = file_path.stat().st_size / (1024 * 1024)  # Convert bytes to MB
+
+         # Check if file exceeds API limits (50 MB)
+         if file_size_mb and file_size_mb > 50:
+             logging.warning(f"File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB")
+             return {
+                 "file_name": file_path.name,
+                 "topics": ["Document"],
+                 "languages": ["English"],
+                 "confidence_score": 0.0,
+                 "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 50 MB",
+                 "ocr_contents": {
+                     "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds Mistral API limit of 50 MB",
+                     "partial_text": "Document could not be processed due to size limitations."
+                 }
+             }
+
+         # For PDF files, limit pages based on file size if no explicit limit is given
+         if file_type == "pdf" and file_size_mb and max_pages is None and custom_pages is None:
+             if file_size_mb > 100:  # Very large files
+                 max_pages = 3
+             elif file_size_mb > 50:  # Large files
+                 max_pages = 5
+             elif file_size_mb > 20:  # Medium files
+                 max_pages = 10
+             else:  # Small files
+                 max_pages = None  # Process all pages
+
+         # Start processing timer
+         start_time = time.time()
+
+         # Read and process the file
+         if file_type == "pdf":
+             result = self._process_pdf(file_path, use_vision, max_pages, custom_pages)
+         else:
+             result = self._process_image(file_path, use_vision)
+
+         # Add processing time information
+         processing_time = time.time() - start_time
+         result['processing_time'] = processing_time
+
+         # Add a default confidence score if not present
+         if 'confidence_score' not in result:
+             result['confidence_score'] = 0.85  # Default confidence
+
+         return result
+
+     def _process_pdf(self, file_path, use_vision=True, max_pages=None, custom_pages=None):
+         """Process a PDF file with OCR
+
+         Args:
+             file_path: Path to the PDF file
+             use_vision: Whether to use vision model
+             max_pages: Optional limit on the number of pages to process
+             custom_pages: Optional list of specific page numbers to process
+         """
+         logger = logging.getLogger("pdf_processor")
+         logger.info(f"Processing PDF: {file_path}")
+
+         try:
+             # Upload the PDF file
+             logger.info("Uploading PDF file to Mistral API")
+             uploaded_file = self.client.files.upload(
+                 file={
+                     "file_name": file_path.stem,
+                     "content": file_path.read_bytes(),
+                 },
+                 purpose="ocr",
+             )
+
+             # Get a signed URL for the uploaded file
+             signed_url = self.client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
+
+             # Process the PDF with OCR
+             logger.info(f"Processing PDF with OCR using {OCR_MODEL}")
+             pdf_response = self.client.ocr.process(
+                 document=DocumentURLChunk(document_url=signed_url.url),
+                 model=OCR_MODEL,
+                 include_image_base64=True
+             )
+
+             # Limit pages if requested
+             pages_to_process = pdf_response.pages
+             total_pages = len(pdf_response.pages)
+             limited_pages = False
+
+             logger.info(f"PDF has {total_pages} total pages")
+
+             # Handle custom page selection if provided
+             if custom_pages:
+                 # Convert to 0-based indexing and filter valid page numbers
+                 valid_indices = [i-1 for i in custom_pages if 0 < i <= total_pages]
+                 if valid_indices:
+                     pages_to_process = [pdf_response.pages[i] for i in valid_indices]
+                     limited_pages = True
+                     logger.info(f"Processing {len(valid_indices)} custom-selected pages")
+             # Otherwise handle max_pages limit
+             elif max_pages and total_pages > max_pages:
+                 pages_to_process = pages_to_process[:max_pages]
+                 limited_pages = True
+                 logger.info(f"Processing only first {max_pages} pages out of {total_pages} total pages")
+
+             # Calculate average confidence score based on OCR response if available
+             confidence_score = 0.0
+             try:
+                 # Some OCR APIs provide confidence scores
+                 confidence_values = []
+                 for page in pages_to_process:
+                     if hasattr(page, 'confidence'):
+                         confidence_values.append(page.confidence)
+
+                 if confidence_values:
+                     confidence_score = sum(confidence_values) / len(confidence_values)
+                 else:
+                     confidence_score = 0.85  # Default if no confidence scores available
+             except:
+                 confidence_score = 0.85  # Default fallback
+
+             # Combine pages' markdown into a single string
+             all_markdown = "\n\n".join([page.markdown for page in pages_to_process])
+
+             # Extract structured data using the appropriate model
+             if use_vision:
+                 # Get base64 of first page for vision model
+                 first_page_image = None
+                 if pages_to_process and pages_to_process[0].images:
+                     first_page_image = pages_to_process[0].images[0].image_base64
+
+                 if first_page_image:
+                     # Use vision model
+                     logger.info(f"Using vision model: {VISION_MODEL}")
+                     result = self._extract_structured_data_with_vision(first_page_image, all_markdown, file_path.name)
+                 else:
+                     # Fall back to text-only model if no image available
+                     logger.info(f"No images in PDF, falling back to text model: {TEXT_MODEL}")
+                     result = self._extract_structured_data_text_only(all_markdown, file_path.name)
+             else:
+                 # Use text-only model
+                 logger.info(f"Using text-only model: {TEXT_MODEL}")
+                 result = self._extract_structured_data_text_only(all_markdown, file_path.name)
+
+             # Add page limit info to result if needed
+             if limited_pages:
+                 result['limited_pages'] = {
+                     'processed': len(pages_to_process),
+                     'total': total_pages
+                 }
+
+             # Add confidence score
+             result['confidence_score'] = confidence_score
+
+             # Store the raw OCR response for image rendering
+             result['raw_response'] = pdf_response
+
+             logger.info(f"PDF processing completed successfully")
+             return result
+
+         except Exception as e:
+             logger.error(f"Error processing PDF: {str(e)}")
+             # Return basic result on error
+             return {
+                 "file_name": file_path.name,
+                 "topics": ["Document"],
+                 "languages": ["English"],
+                 "confidence_score": 0.0,
+                 "error": str(e),
+                 "ocr_contents": {
+                     "error": f"Failed to process PDF: {str(e)}",
+                     "partial_text": "Document could not be fully processed."
+                 }
+             }
+
+     def _process_image(self, file_path, use_vision=True):
+         """Process an image file with OCR"""
+         logger = logging.getLogger("image_processor")
+         logger.info(f"Processing image: {file_path}")
+
+         try:
+             # Read and encode the image file
+             logger.info("Encoding image for API")
+             encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+             base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+
+             # Process the image with OCR
+             logger.info(f"Processing image with OCR using {OCR_MODEL}")
+             image_response = self.client.ocr.process(
+                 document=ImageURLChunk(image_url=base64_data_url),
+                 model=OCR_MODEL,
+                 include_image_base64=True
+             )
+
+             # Get the OCR markdown from the first page
+             image_ocr_markdown = image_response.pages[0].markdown if image_response.pages else ""
+
+             # Calculate confidence score if available
+             confidence_score = 0.85  # Default value
+             try:
+                 if hasattr(image_response.pages[0], 'confidence'):
+                     confidence_score = image_response.pages[0].confidence
+             except:
+                 pass
+
+             # Extract structured data using the appropriate model
+             if use_vision:
+                 logger.info(f"Using vision model: {VISION_MODEL}")
+                 result = self._extract_structured_data_with_vision(base64_data_url, image_ocr_markdown, file_path.name)
+             else:
+                 logger.info(f"Using text-only model: {TEXT_MODEL}")
+                 result = self._extract_structured_data_text_only(image_ocr_markdown, file_path.name)
+
+             # Add confidence score
+             result['confidence_score'] = confidence_score
+
+             # Store the raw OCR response for image rendering
+             result['raw_response'] = image_response
+
+             logger.info("Image processing completed successfully")
+             return result
+
+         except Exception as e:
+             logger.error(f"Error processing image: {str(e)}")
+             # Return basic result on error
+             return {
+                 "file_name": file_path.name,
+                 "topics": ["Document"],
+                 "languages": ["English"],
+                 "confidence_score": 0.0,
+                 "error": str(e),
+                 "ocr_contents": {
+                     "error": f"Failed to process image: {str(e)}",
+                     "partial_text": "Image could not be processed."
+                 }
+             }
+
+     def _extract_structured_data_with_vision(self, image_base64, ocr_markdown, filename):
+         """Extract structured data using vision model"""
+         try:
+             # Parse with vision model with a timeout
+             chat_response = self.client.chat.parse(
+                 model=VISION_MODEL,
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": [
+                             ImageURLChunk(image_url=image_base64),
+                             TextChunk(text=(
+                                 f"This is a historical document's OCR in markdown:\n"
+                                 f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
+                                 f"Convert this into a structured JSON response with the OCR contents in a sensible dictionary. "
+                                 f"Extract topics, languages, and organize the content logically."
+                             ))
+                         ],
+                     },
+                 ],
+                 response_format=StructuredOCRModel,
+                 temperature=0
+             )
+
+             # Convert the response to a dictionary
+             result = json.loads(chat_response.choices[0].message.parsed.json())
+
+             # Ensure languages is a list of strings, not Language enum objects
+             if 'languages' in result:
+                 result['languages'] = [str(lang) for lang in result.get('languages', [])]
+
+         except Exception as e:
+             # Fall back to text-only model if vision model fails
+             print(f"Vision model failed: {str(e)}. Falling back to text-only model.")
+             result = self._extract_structured_data_text_only(ocr_markdown, filename)
+
+         return result
+
+     def _extract_structured_data_text_only(self, ocr_markdown, filename):
+         """Extract structured data using text-only model"""
+         try:
+             # Parse with text-only model with a timeout
+             chat_response = self.client.chat.parse(
+                 model=TEXT_MODEL,
+                 messages=[
+                     {
+                         "role": "user",
+                         "content": f"This is a historical document's OCR in markdown:\n"
+                                    f"<BEGIN_IMAGE_OCR>\n{ocr_markdown}\n<END_IMAGE_OCR>.\n"
+                                    f"Convert this into a structured JSON response with the OCR contents. "
+                                    f"Extract topics, languages, and organize the content logically."
+                     },
+                 ],
+                 response_format=StructuredOCRModel,
+                 temperature=0
+             )
+
+             # Convert the response to a dictionary
+             result = json.loads(chat_response.choices[0].message.parsed.json())
+
+             # Ensure languages is a list of strings, not Language enum objects
+             if 'languages' in result:
+                 result['languages'] = [str(lang) for lang in result.get('languages', [])]
+
+         except Exception as e:
+             # Create a basic result if parsing fails
+             print(f"Text model failed: {str(e)}. Creating basic result.")
+             result = {
+                 "file_name": filename,
+                 "topics": ["Document"],
+                 "languages": ["English"],
+                 "ocr_contents": {
+                     "raw_text": ocr_markdown
+                 }
+             }
+
+         return result
+
+ # For testing directly
+ if __name__ == "__main__":
+     import sys
+
+     if len(sys.argv) < 2:
+         print("Usage: python structured_ocr.py <file_path>")
+         sys.exit(1)
+
+     file_path = sys.argv[1]
+     processor = StructuredOCR()
+     result = processor.process_file(file_path)
+
+     print(json.dumps(result, indent=2))
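
Besides the CLI entry point above, the class is meant to be driven programmatically; a short sketch of the page-selection options documented in `process_file`, using the repo's `input/menu.pdf`:

```python
# Sketch: programmatic use of StructuredOCR with page selection.
from structured_ocr import StructuredOCR

processor = StructuredOCR()

# Process only pages 1 and 3 (1-based) with the text-only model
result = processor.process_file("input/menu.pdf", use_vision=False, custom_pages=[1, 3])

print(result["confidence_score"])
if "limited_pages" in result:
    info = result["limited_pages"]
    print(f"Processed {info['processed']} of {info['total']} pages")
```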
config.py CHANGED
@@ -5,10 +5,13 @@ Contains API key and other settings.
  """
  import os
 
- # Your Mistral API key - get from environment variable
- MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
+ # Your Mistral API key - get from Hugging Face secrets or environment variable
+ # The priority order is: HF_MISTRAL_API_KEY env var > MISTRAL_API_KEY env var > empty string
+ # Note: No default API key is provided for security reasons
+ MISTRAL_API_KEY = os.environ.get("HF_MISTRAL_API_KEY",                 # First check HF-specific env var
+                                  os.environ.get("MISTRAL_API_KEY", ""))  # Then check regular env var
 
- # Model settings
+ # Model settings
  OCR_MODEL = "mistral-ocr-latest"
  TEXT_MODEL = "ministral-8b-latest"
  VISION_MODEL = "pixtral-12b-latest"
input/The Magician, or Bottle Cungerer.jpeg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
input/a-la-carte.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
+ size 2554815
input/flier.png ADDED
input/handwritten-letter.jpg ADDED

Git LFS Details

  • SHA256: 7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
  • Pointer size: 131 Bytes
  • Size of remote file: 231 kB
input/letter-1.jpg ADDED

Git LFS Details

  • SHA256: a30d7d9f224c777a1697507200a87e41be5fd590efbe8271fa41dbd8bd8a158d
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
input/letter-2.jpg ADDED

Git LFS Details

  • SHA256: 8b605a6eabd466da265e9e1aa3576160c4dbee06643ece5a18cdb1e45f3f683a
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
input/letter-3.jpg ADDED

Git LFS Details

  • SHA256: 7fe2d81bb4e8bef7cdbf87c58a8cc180c49c313e5099de167ae37bbbfb895e88
  • Pointer size: 131 Bytes
  • Size of remote file: 231 kB
input/magician-satire.jpg ADDED

Git LFS Details

  • SHA256: 3becaf6f5548a794436864885bb125f3fa09f1e6f7bdd76e8878f2d36ff26232
  • Pointer size: 132 Bytes
  • Size of remote file: 2.96 MB
input/menu.pdf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:42d96008f374f5be8046b569c868e33f4e5a0e5e166c245d324b44140c7e6c2e
+ size 2554815
input/milgram-flier.png ADDED
input/okeefe-recipe.jpg ADDED
input/recipe.jpg ADDED
modules/content/__init__.py ADDED
@@ -0,0 +1,36 @@
+ """
+ Module initialization file for the workshop modules.
+ """
+ from . import module1, module2, module3, module4, module5, module6
+
+ # Module registry for easy access by module number
+ modules = {
+     1: module1,
+     2: module2,
+     3: module3,
+     4: module4,
+     5: module5,
+     6: module6
+ }
+
+ # Module names for navigation and display
+ module_names = [
+     "Introduction",
+     "Text-Image Relations",
+     "OCR Technology",
+     "Methodological Approaches",
+     "Interactive OCR",
+     "Conclusion"
+ ]
+
+ def get_module(module_number):
+     """Get a module by its number (1-6)"""
+     if module_number in modules:
+         return modules[module_number]
+     raise ValueError(f"Unknown module number: {module_number}")
+
+ def get_module_name(module_number):
+     """Get a module name by its number (1-6)"""
+     if 1 <= module_number <= len(module_names):
+         return module_names[module_number - 1]
+     return f"Module {module_number}"
modules/content/module1.py ADDED
@@ -0,0 +1,85 @@
+ import streamlit as st
+ from layout import gray_container, blue_container, yellow_container, card_grid, key_concept
+
+ def render():
+     """Module 1: Introduction and Problematization"""
+
+     st.title("Module 1: Introduction and Problematization")
+
+     # Workshop overview in gray container
+     overview_content = """
+     <h3>Workshop Overview</h3>
+     <p>
+     This interactive workshop explores the application of OCR technology to historical documents,
+     combining theoretical understanding with practical experiences. Designed for historians,
+     archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
+     </p>
+     """
+     gray_container(overview_content)
+
+     # For historians section with blue background
+     historians_content = """
+     <h3>For Historians:</h3>
+     <p>
+     How might OCR technology transform our access to and interpretation of historical
+     documents? What new research questions become possible when large archives
+     become machine-readable?
+     </p>
+     """
+     blue_container(historians_content)
+
+     # What is OCR section with yellow background
+     ocr_content = """
+     <h3>What is OCR?</h3>
+     <p>
+     Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
+     Modern OCR uses AI vision models to understand both the text and its visual context.
+     </p>
+     """
+     yellow_container(ocr_content)
+
+     # What you'll learn section
+     st.subheader("What You'll Learn")
+
+     # Create cards for the learning outcomes
+     cards = [
+         """
+         <h4>Conceptual Understanding</h4>
+         <ul>
+             <li>Text-image relationships in historical documents</li>
+             <li>Evolution of OCR technology</li>
+             <li>AI vision models for document analysis</li>
+             <li>Historical typography challenges</li>
+         </ul>
+         """,
+
+         """
+         <h4>Methodological Approaches</h4>
+         <ul>
+             <li>Critical frameworks for OCR research</li>
+             <li>Hybrid computational methods</li>
+             <li>Error analysis and interpretation</li>
+             <li>Contextual reading strategies</li>
+         </ul>
+         """,
+
+         """
+         <h4>Practical Skills</h4>
+         <ul>
+             <li>Processing historical documents</li>
+             <li>Analyzing extracted information</li>
+             <li>Integrating OCR into workflows</li>
+             <li>Building searchable archives</li>
+         </ul>
+         """
+     ]
+
+     card_grid(cards)
+
+     # Add a key concept
+     concept_content = """
+     <h4>Workshop Structure</h4>
+     <p>This workshop combines theory and practice through six modules, each building on the previous ones.</p>
+     <p>Navigate between modules using the buttons at the bottom of the page.</p>
+     """
+     key_concept(concept_content)
modules/content/module2.py ADDED
@@ -0,0 +1,88 @@
+ import streamlit as st
+ from layout import gray_container, card_grid, key_concept, research_question
+
+ def render():
+     """Module 2: Text-Image Relations in Historical Archives"""
+
+     st.title("Module 2: Text-Image Relations in Historical Archives")
+
+     col1, col2 = st.columns([1, 1])
+
+     with col1:
+         textual_content = """
+         <h3>Textual Elements</h3>
+         <ul>
+             <li><strong>Typography</strong>: Varying fonts, sizes, and styles</li>
+             <li><strong>Layout</strong>: Columns, margins, and spacing</li>
+             <li><strong>Marginalia</strong>: Notes, comments, and additions</li>
+             <li><strong>Decorative Text</strong>: Illuminated letters and calligraphy</li>
+         </ul>
+         """
+         gray_container(textual_content)
+
+         visual_content = """
+         <h3>Visual Elements</h3>
+         <ul>
+             <li><strong>Illustrations</strong>: Diagrams, maps, and artistic representations</li>
+             <li><strong>Watermarks</strong>: Hidden identifiers that locate documents</li>
+             <li><strong>Damage</strong>: Tears, stains, and fading affecting legibility</li>
+             <li><strong>Material Features</strong>: Paper quality and physical dimensions</li>
+         </ul>
+         """
+         gray_container(visual_content)
+
+     with col2:
+         interdependence_content = """
+         <h3>Interdependence</h3>
+         <p>The relationship between text and image in historical documents exists on a complex spectrum:</p>
+         <ul>
+             <li>Text functions as image (decorative headings)</li>
+             <li>Images function as text (symbolic representations)</li>
+             <li>Layout creates meaning through visual organization</li>
+             <li>Material conditions affect both textual and visual elements</li>
+         </ul>
+         """
+         gray_container(interdependence_content)
+
+         # Display an example image
+         st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Book_of_Kells_folio_292r.jpg/800px-Book_of_Kells_folio_292r.jpg",
+                  caption="Book of Kells - Example of text-image integration")
+
+     # OCR Challenges section
+     challenge_content = """
+     <h3>OCR Challenges</h3>
+     <p>These complex text-image relationships create particular challenges for OCR:</p>
+     """
+     gray_container(challenge_content)
+
+     # Cards for OCR challenges
+     cards = [
+         """
+         <h4>Distinguishing Text from Decoration</h4>
+         <p>Where does ornamental text end and functional text begin?</p>
+         """,
+
+         """
+         <h4>Handling Illustrations</h4>
+         <p>Should they be processed as images or described as text?</p>
+         """,
+
+         """
+         <h4>Interpreting Layout</h4>
+         <p>How to capture the significance of spacing and organization?</p>
+         """,
+
+         """
+         <h4>Preserving Context</h4>
+         <p>Maintaining the relationship between textual and visual elements</p>
+         """
+     ]
+
+     card_grid(cards)
+
+     # Research question box
+     research_content = """
+     <h4>Research Question</h4>
+     <p>How do we approach documents where the visual presentation is as meaningful as the textual content itself?</p>
+     """
+     research_question(research_content)
modules/content/module3.py ADDED
@@ -0,0 +1,106 @@
+ import streamlit as st
+ from pathlib import Path
+ from layout import gray_container, tool_container, key_concept, research_question
+
+ def render():
+     """Module 3: OCR Technology and Historical Documents"""
+
+     st.title("Module 3: OCR Technology and Historical Documents")
+
+     col1, col2 = st.columns([1, 1])
+
+     with col1:
+         traditional_content = """
+         <h3>Traditional OCR Approaches</h3>
+         <ol>
+             <li><strong>Pattern Matching</strong>: Early OCR compared characters to templates</li>
+             <li><strong>Feature Extraction</strong>: Identifying key features of characters</li>
+             <li><strong>Statistical Models</strong>: Using probabilities to improve recognition</li>
+         </ol>
+         """
+         gray_container(traditional_content)
+
+         modern_content = """
+         <h3>Modern AI-Enhanced OCR</h3>
+         <ol>
+             <li><strong>Neural Networks</strong>: Deep learning models trained on vast datasets</li>
+             <li><strong>Computer Vision</strong>: Advanced image processing techniques</li>
+             <li><strong>Language Models</strong>: Contextual understanding to resolve ambiguities</li>
+             <li><strong>Multimodal Models</strong>: Integration of text, layout, and visual understanding</li>
+         </ol>
+         """
+         gray_container(modern_content)
+
+     with col2:
+         challenges_content = """
+         <h3>Challenges with Historical Documents</h3>
+         <p>Historical materials present unique difficulties:</p>
+         <ul>
+             <li><strong>Typography Variation</strong>: Non-standardized fonts and styles</li>
+             <li><strong>Historical Language</strong>: Archaic vocabulary and grammar</li>
+             <li><strong>Layout Complexity</strong>: Non-linear arrangements</li>
+             <li><strong>Document Degradation</strong>: Fading, tears, stains, and damage</li>
+             <li><strong>Material Artifacts</strong>: Paper texture, binding shadows, etc.</li>
+         </ul>
+         """
+         gray_container(challenges_content)
+
+         # Display OCR processing diagram
+         st.image("https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg",
+                  caption="OCR processing layers")
+
+     # Key concept section
+     concept_content = """
+     <h3>Vision-Enhanced OCR</h3>
+     <p>Modern OCR systems like those based on Pixtral-12B (the vision model used in this workshop) combine:</p>
+     <ol>
+         <li>Image understanding capabilities to process the visual aspects</li>
+         <li>Text recognition to extract characters accurately</li>
+         <li>Layout analysis to understand structure</li>
+         <li>Contextual language processing for improved accuracy</li>
+     </ol>
+     <p>This multimodal approach dramatically improves OCR results on historical documents compared to traditional OCR.</p>
+     """
+     key_concept(concept_content)
+
+     # Technical details in a tool container
+     tech_content = """
+     <h3>Technical Evolution of OCR</h3>
+     <p><strong>Traditional OCR Pipeline:</strong></p>
+     <ol>
+         <li>Preprocessing (binarization, noise removal)</li>
+         <li>Layout analysis (segmentation)</li>
+         <li>Character recognition (pattern matching)</li>
+         <li>Post-processing (spell checking)</li>
+     </ol>
+
+     <p><strong>Modern LLM-Vision Pipeline:</strong></p>
+     <ol>
+         <li>Image normalization</li>
+         <li>Image embedding via vision encoder</li>
+         <li>Integration with language model</li>
+         <li>Joint inference across modalities</li>
+         <li>Structured extraction of information</li>
+     </ol>
+     """
+     tool_container(tech_content)
+
+     # Research question
+     research_content = """
+     <h4>Consider This:</h4>
+     <p>How might the capabilities of vision-language models change our approach to digitizing historical archives?</p>
+     """
+     research_question(research_content)
+
+     # Display history if available
+     if 'processing_history' in st.session_state and st.session_state.processing_history:
+         with st.expander("Your OCR Processing History"):
+             st.markdown("You've already processed the following documents:")
+
+             for item in st.session_state.processing_history:
+                 st.markdown(f"**{item['fileName']}**")
+                 col1, col2 = st.columns(2)
+                 with col1:
+                     st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
+                 with col2:
+                     st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
modules/content/module4.py ADDED
@@ -0,0 +1,124 @@
+ import streamlit as st
+ from pathlib import Path
+ from layout import gray_container, tool_container, key_concept, quote
+
+ def render():
+     """Module 4: Methodological Approaches"""
+
+     st.title("Module 4: Methodological Approaches")
+
+     col1, col2 = st.columns([1, 1])
+
+     with col1:
+         hybrid_content = """
+         <h3>Hybrid Methodologies</h3>
+
+         <h4>1. Computational + Human Reading</h4>
+         <ul>
+             <li>OCR for initial processing and discovery</li>
+             <li>Human review for context and interpretation</li>
+             <li>Iterative refinement of computational outputs</li>
+         </ul>
+
+         <h4>2. Close + Distant Reading</h4>
+         <ul>
+             <li>Distant reading through large-scale OCR processing</li>
+             <li>Close reading of selected passages</li>
+             <li>Zooming between scales of analysis</li>
+         </ul>
+         """
+         gray_container(hybrid_content)
+
+         # Check if the diagram image is available and display it
+         input_dir = Path(__file__).parent.parent / "input"
+         diagram_path = input_dir / "diagram.jpg"
+
+         if diagram_path.exists():
+             try:
+                 from PIL import Image
+                 with Image.open(diagram_path) as img:
+                     st.image(img, caption="Historical VLM architecture", use_column_width=True)
+             except Exception:
+                 # If there's an error, just show a placeholder
+                 st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
+         else:
+             # If the file doesn't exist, show a placeholder
+             st.image("https://placekitten.com/800/400", caption="Historical VLM architecture placeholder")
+
+     with col2:
+         mistral_content = """
+         <h3>Mistral-OCR-Latest: State-of-the-Art</h3>
+
+         <p>The Mistral-OCR model represents a significant advancement:</p>
+         <ul>
+             <li><strong>Multimodal Understanding</strong>: Processes both visual and textual information</li>
+             <li><strong>Contextual Awareness</strong>: Considers historical context</li>
+             <li><strong>Layout Recognition</strong>: Preserves complex document structures</li>
+             <li><strong>Historical Font Adaptation</strong>: Trained on diverse historical typography</li>
+         </ul>
+         """
+         gray_container(mistral_content)
+
+         # Check if the workflow image is available and display it
+         workflow_path = input_dir / "workflow.jpg"
+
+         if workflow_path.exists():
+             try:
+                 from PIL import Image
+                 with Image.open(workflow_path) as img:
+                     st.image(img, caption="Mistral OCR workflow", use_column_width=True)
+             except Exception:
+                 # If there's an error, just show a placeholder
+                 st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
+         else:
+             # If the file doesn't exist, show a placeholder
+             st.image("https://placekitten.com/800/400", caption="Mistral OCR workflow placeholder")
+
+     # Practical workflow section
+     workflow_content = """
+     <h3>Practical Workflow</h3>
+
+     <p>A typical historical OCR workflow with Mistral-OCR includes:</p>
+     <ol>
+         <li><strong>Selection</strong>: Choosing appropriate documents</li>
+         <li><strong>Preprocessing</strong>: Enhancing images before OCR</li>
+         <li><strong>OCR Processing</strong>: Running documents through vision-enhanced OCR</li>
+         <li><strong>Post-processing</strong>: Cleaning up outputs and structured extraction</li>
+         <li><strong>Verification</strong>: Cross-checking results against originals</li>
+         <li><strong>Integration</strong>: Incorporating OCR outputs into research materials</li>
+     </ol>
+     """
+     tool_container(workflow_content)
+
+     # Methodological considerations
+     st.subheader("Methodological Considerations")
+
+     col1, col2 = st.columns([1, 1])
+
+     with col1:
+         advantages_content = """
+         <h4>Advantages of Hybrid Approaches</h4>
+         <ul>
+             <li>Balance between automation and expert judgment</li>
+             <li>Ability to process large volumes while preserving detail</li>
+             <li>Context-sensitive analysis of complex documents</li>
+             <li>Iterative improvement of results</li>
+         </ul>
+         """
+         gray_container(advantages_content)
+
+     with col2:
+         limitations_content = """
+         <h4>Limitations and Challenges</h4>
+         <ul>
+             <li>OCR errors requiring expert correction</li>
+             <li>Bias in training data affecting recognition</li>
+             <li>Complexity in evaluating OCR quality</li>
+             <li>Technical infrastructure requirements</li>
+         </ul>
+         """
+         gray_container(limitations_content)
+
+     # Quote
+     quote_content = "The most powerful digital humanities work occurs at the intersection of computational methods and traditional humanistic inquiry."
+     quote(quote_content, "Dr. Sarah E. Bond, Digital Humanities Scholar")
modules/content/module5.py ADDED
@@ -0,0 +1,547 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import io
3
+ import tempfile
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ from layout import tool_container, key_concept, research_question, upload_container
7
+ import sys
8
+
9
+ # Import the necessary modules for OCR processing
10
+ sys.path.append(str(Path(__file__).parent.parent))
11
+ try:
12
+ from process_file import process_file as process_file_util
13
+ process_file = process_file_util
14
+ except ImportError:
15
+ # Fallback if process_file is not available
16
+ def process_file(uploaded_file, use_vision=True, custom_prompt=None):
17
+ """Fallback function for processing files"""
18
+ st.warning("Using mock processing function. Real OCR functionality is not available.")
19
+ return {
20
+ "file_name": uploaded_file.name,
21
+ "languages": ["English"],
22
+ "topics": ["History", "Document"],
23
+ "ocr_contents": {
24
+ "content": f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
25
+ }
26
+ }
27
+
28
+ def render():
29
+ """Module 5: Interactive OCR Experiment"""
30
+
31
+ st.title("Module 5: Interactive OCR Experiment")
32
+
33
+ # Introduction to the interactive experiment
34
+ intro_content = """
35
+ <h3>Interactive OCR Experiment</h3>
36
+ <p>
37
+ This interactive experiment allows you to process historical documents with OCR and analyze the results.
38
+ Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
39
+ </p>
40
+ """
41
+ st.markdown(intro_content, unsafe_allow_html=True)
42
+
43
+ # Create tabs for different activities
44
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
45
+
46
+ # Try to import PDF tools if available
47
+ try:
48
+ from pdf2image import convert_from_bytes
49
+ pdf_support = True
50
+ except ImportError:
51
+ pdf_support = False
52
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
53
+
54
+ with experiment_tab:
55
+ # Create a two-column layout
56
+ col1, col2 = st.columns([1, 1])
57
+
58
+ with col1:
59
+ # Tool container for document selection and options
60
+ st.subheader("Step 1: Select Document & Options")
61
+
62
+ # Processing options
63
+ use_vision = st.checkbox("Use Vision Model", value=True,
64
+ help="Use the vision model for improved analysis")
65
+
66
+ # Additional prompt
67
+ st.markdown("### Custom Research Prompt (Optional)")
68
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
69
+ Focus on specific aspects of historical research you're interested in.""")
70
+ custom_prompt = st.text_area("Research Prompt",
71
+ placeholder="E.g., Focus on identifying dates and historical figures...",
72
+ help="Optional instructions to guide the analysis")
73
+
74
+ # Sample document selection
75
+ input_dir = Path(__file__).parent.parent / "input"
76
+
77
+ if input_dir.exists():
78
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
79
+
80
+ if sample_files:
81
+ st.markdown("#### Sample Documents")
82
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
83
+ sample_choice = st.selectbox("Choose a document:", sample_options)
84
+
85
+ if sample_choice != "Upload my own document":
86
+ # Process the selected sample file
87
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
88
+
89
+ if selected_file:
90
+ # Store the selected sample file in session state
91
+ with open(selected_file, "rb") as f:
92
+ file_bytes = f.read()
93
+
94
+ st.session_state.sample_file = {
95
+ "name": selected_file.name,
96
+ "bytes": file_bytes
97
+ }
98
+
99
+ # Preview the selected sample
100
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
101
+ try:
102
+ with st.spinner("Generating PDF preview..."):
103
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
104
+ if images:
105
+ st.image(images[0], caption=f"Preview: {selected_file.name}", use_column_width=True)
106
+ except Exception:
107
+ st.info(f"PDF selected: {selected_file.name}")
108
+ else:
109
+ # For images display directly
110
+ try:
111
+ from PIL import Image
112
+ img = Image.open(io.BytesIO(file_bytes))
113
+ st.image(img, caption=f"Preview: {selected_file.name}", use_column_width=True)
114
+ except Exception:
115
+ st.info(f"Selected: {selected_file.name}")
116
+ else:
117
+ # Clear the sample file if "Upload my own" is selected
118
+ if 'sample_file' in st.session_state:
119
+ del st.session_state.sample_file
120
+
121
+ # Display file uploader
122
+ upload_html = """
123
+ <h4>Upload a document to get started</h4>
124
+ <p>Supported formats: PDF, JPG, PNG</p>
125
+ """
126
+
127
+ upload_container(upload_html)
128
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
129
+
130
+ if uploaded_file is not None:
131
+ # Display preview of the uploaded file
132
+ file_ext = Path(uploaded_file.name).suffix.lower()
133
+
134
+ if file_ext == ".pdf" and pdf_support:
135
+ try:
136
+ # Convert first page of PDF to image for preview
137
+ pdf_bytes = uploaded_file.getvalue()
138
+ with st.spinner("Generating PDF preview..."):
139
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
140
+ if images:
141
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
142
+ else:
143
+ st.info(f"PDF uploaded: {uploaded_file.name}")
144
+ except Exception:
145
+ st.info(f"PDF uploaded: {uploaded_file.name}")
146
+ elif file_ext != ".pdf":
147
+ st.image(uploaded_file, use_column_width=True)
148
+ else:
149
+ st.info(f"PDF uploaded: {uploaded_file.name}")
150
+ else:
151
+ # No sample files, just show the uploader
152
+ upload_html = """
153
+ <h4>Upload a document to get started</h4>
154
+ <p>Supported formats: PDF, JPG, PNG</p>
155
+ """
156
+
157
+ upload_container(upload_html)
158
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
159
+
160
+ if uploaded_file is not None:
161
+ # Display the file preview
162
+ file_ext = Path(uploaded_file.name).suffix.lower()
163
+ if file_ext == ".pdf" and pdf_support:
164
+ try:
165
+ pdf_bytes = uploaded_file.getvalue()
166
+ with st.spinner("Generating PDF preview..."):
167
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
168
+ if images:
169
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
170
+ except Exception:
171
+ st.info(f"PDF uploaded: {uploaded_file.name}")
172
+ elif file_ext != ".pdf":
173
+ st.image(uploaded_file, use_column_width=True)
174
+ else:
175
+ st.info(f"PDF uploaded: {uploaded_file.name}")
176
+ else:
177
+ # No input directory
178
+ upload_html = """
179
+ <h4>Upload a document to get started</h4>
180
+ <p>Supported formats: PDF, JPG, PNG</p>
181
+ """
182
+
183
+ upload_container(upload_html)
184
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
185
+
186
+ # Process button
187
+ st.subheader("Step 2: Process the Document")
188
+
189
+ # Get the file to process (either uploaded or sample)
190
+ file_to_process = None
191
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
192
+ # Create a FileUploader-like object from the sample file
193
+ class SampleFileObject:
194
+ def __init__(self, name, data):
195
+ self.name = name
196
+ self._data = data
197
+
198
+ def getvalue(self):
199
+ return self._data
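+ # Duck-types the two attributes this page reads from a Streamlit upload (.name and .getvalue()),
+ # so sample files can flow through process_file on the same path as real uploads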
200
+
201
+ file_to_process = SampleFileObject(
202
+ st.session_state.sample_file["name"],
203
+ st.session_state.sample_file["bytes"]
204
+ )
205
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
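+ # locals() guard: the uploader widget is only created on branches where no sample document was chosen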
206
+ file_to_process = uploaded_file
207
+
208
+ # Process button
209
+ process_button = st.button(
210
+ "Process Document",
211
+ disabled=file_to_process is None,
212
+ use_container_width=True
213
+ )
214
+
215
+ if process_button and file_to_process is not None:
216
+ with st.spinner("Processing document..."):
217
+ try:
218
+ # Process the file
219
+ result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
220
+
221
+ if result:
222
+ st.success("Document processed successfully!")
223
+
224
+ # Store result in session state for display in the right column
225
+ st.session_state.current_result = result
226
+
227
+ # Add to processing history
228
+ history_item = {
229
+ "id": datetime.now().timestamp(),
230
+ "fileName": file_to_process.name,
231
+ "timestamp": datetime.now().isoformat(),
232
+ "result": result,
233
+ "useVision": use_vision
234
+ }
235
+
236
+ if 'processing_history' not in st.session_state:
237
+ st.session_state.processing_history = []
238
+
239
+ st.session_state.processing_history.append(history_item)
240
+
241
+ st.rerun()
242
+ else:
243
+ st.error("Failed to process document.")
244
+ except Exception as e:
245
+ st.error(f"Error processing document: {str(e)}")
246
+
247
+ # Experiment instructions
248
+ experiment_content = """
249
+ <h3>Experiment Instructions</h3>
250
+ <ol>
251
+ <li><strong>Step 1:</strong> Select a document and choose your options</li>
252
+ <li><strong>Step 2:</strong> Process the document with the selected options</li>
253
+ <li><strong>Step 3:</strong> Analyze the results in the panel on the right</li>
254
+ <li><strong>Step 4:</strong> Try again with different settings (e.g., toggle vision model)</li>
255
+ <li><strong>Step 5:</strong> Compare results between different runs</li>
256
+ </ol>
257
+ """
258
+ key_concept(experiment_content)
259
+
260
+ with col2:
261
+ # Results display
262
+ st.subheader("Step 3: View Results")
263
+
264
+ if 'current_result' in st.session_state and st.session_state.current_result:
265
+ result = st.session_state.current_result
266
+
267
+ # Display results in a tool container
268
+ result_html = f"""
269
+ <h4>Results for: {result.get('file_name', 'Unknown')}</h4>
270
+ <p><strong>Languages:</strong> {', '.join(result.get('languages', ['Unknown']))}</p>
271
+ <p><strong>Topics:</strong> {', '.join(result.get('topics', ['Unknown']))}</p>
272
+ """
273
+ tool_container(result_html)
274
+
275
+ # Create tabs for different views
276
+ tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
277
+
278
+ with tab1:
279
+ # Display in a more user-friendly format
280
+ if 'ocr_contents' in result:
281
+ if isinstance(result['ocr_contents'], dict):
282
+ for section, content in result['ocr_contents'].items():
283
+ if content: # Only display non-empty sections
284
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
285
+
286
+ if isinstance(content, str):
287
+ st.markdown(content)
288
+ elif isinstance(content, list):
289
+ for item in content:
290
+ if isinstance(item, str):
291
+ st.markdown(f"- {item}")
292
+ elif isinstance(item, dict):
293
+ st.json(item)
294
+ elif isinstance(content, dict):
295
+ for k, v in content.items():
296
+ st.markdown(f"**{k}:** {v}")
297
+
298
+ with tab2:
299
+ # Show the raw JSON
300
+ st.json(result)
301
+
302
+ # Download options
303
+ st.markdown("### Export Results")
304
+
305
+ col1, col2 = st.columns(2)
306
+
307
+ with col1:
308
+ # Export as JSON
309
+ import json
310
+ json_bytes = json.dumps(result, indent=2).encode()
311
+ st.download_button(
312
+ label="Download JSON",
313
+ data=json_bytes,
314
+ file_name="ocr_results.json",
315
+ mime="application/json",
316
+ use_container_width=True
317
+ )
318
+
319
+ with col2:
320
+ # Export as text if content is available
321
+ if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']:
322
+ text_content = result['ocr_contents']['content']
323
+ st.download_button(
324
+ label="Download Text",
325
+ data=text_content.encode(),
326
+ file_name="ocr_text.txt",
327
+ mime="text/plain",
328
+ use_container_width=True
329
+ )
330
+ else:
331
+ # Show placeholder when no results are available
332
+ placeholder_html = """
333
+ <h4>Results will appear here</h4>
334
+ <p>Upload and process a document to see the OCR results in this panel.</p>
335
+ <p>The OCR tool will:</p>
336
+ <ol>
337
+ <li>Extract text from your document</li>
338
+ <li>Identify languages and topics</li>
339
+ <li>Provide structured content analysis</li>
340
+ <li>Generate downloadable results</li>
341
+ </ol>
342
+ """
343
+ tool_container(placeholder_html)
344
+
345
+ # Display processing history if available
346
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
347
+ st.subheader("Step 4: Review Processing History")
348
+
349
+ # Most recent result
350
+ latest = st.session_state.processing_history[-1]
351
+ latest_html = f"""
352
+ <h4>Latest Document: {latest['fileName']}</h4>
353
+ <p><strong>Processed at:</strong> {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
354
+ <p><strong>Vision model used:</strong> {'Yes' if latest['useVision'] else 'No'}</p>
355
+ """
356
+ tool_container(latest_html)
357
+
358
+ # History in expander
359
+ with st.expander("View Complete Processing History"):
360
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
361
+ st.markdown(f"""
362
+ <div style="background-color: var(--color-gray-700); padding: 0.75rem; border-radius: 0.5rem; margin-bottom: 0.5rem;">
363
+ <strong>{item['fileName']}</strong><br>
364
+ {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')} -
365
+ Vision model: {'Yes' if item['useVision'] else 'No'}
366
+ </div>
367
+ """, unsafe_allow_html=True)
368
+
369
+ # Option to view a previous result
370
+ if st.button("View This Result", key=f"view_history_{i}"):
371
+ st.session_state.current_result = item['result']
372
+ st.rerun()
373
+
374
+ # Compare tab for side-by-side comparison
375
+ with compare_tab:
376
+ st.subheader("Compare OCR Results")
377
+
378
+ if 'processing_history' in st.session_state and len(st.session_state.processing_history) >= 2:
379
+ st.markdown("""
380
+ Select two processing results to compare side by side. This allows you to see
381
+ how different options (like using the vision model) affect OCR quality.
382
+ """)
383
+
384
+ # Create selection dropdowns for the documents
385
+ col1, col2 = st.columns(2)
386
+ with col1:
387
+ # First document selector
388
+ doc_options_1 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
389
+ for i, item in enumerate(st.session_state.processing_history)]
390
+ doc_choice_1 = st.selectbox("First Document:", doc_options_1, key="compare_doc_1")
391
+ doc_index_1 = int(doc_choice_1.split(":")[0]) - 1
392
+
393
+ with col2:
394
+ # Second document selector
395
+ doc_options_2 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
396
+ for i, item in enumerate(st.session_state.processing_history)]
397
+ default_index = min(1, len(st.session_state.processing_history) - 1) # Default to second item
398
+ doc_choice_2 = st.selectbox("Second Document:", doc_options_2, key="compare_doc_2", index=default_index)
399
+ doc_index_2 = int(doc_choice_2.split(":")[0]) - 1
400
+
401
+ # Retrieve the selected documents
402
+ doc1 = st.session_state.processing_history[doc_index_1]
403
+ doc2 = st.session_state.processing_history[doc_index_2]
404
+
405
+ # Show comparison
406
+ col1, col2 = st.columns(2)
407
+
408
+ with col1:
409
+ doc1_html = f"""
410
+ <h4>Document 1: {doc1['fileName']}</h4>
411
+ <p><strong>Processed at:</strong> {datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
412
+ <p><strong>Vision model used:</strong> {'Yes' if doc1['useVision'] else 'No'}</p>
413
+ """
414
+ tool_container(doc1_html)
415
+
416
+ # Display content summary
417
+ if 'ocr_contents' in doc1['result'] and isinstance(doc1['result']['ocr_contents'], dict):
418
+ if 'content' in doc1['result']['ocr_contents']:
419
+ content = doc1['result']['ocr_contents']['content']
420
+ st.markdown(f"""
421
+ <div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
422
+ border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">
423
+ {content[:500]}{'...' if len(content) > 500 else ''}
424
+ </div>
425
+ """, unsafe_allow_html=True)
426
+
427
+ with col2:
428
+ doc2_html = f"""
429
+ <h4>Document 2: {doc2['fileName']}</h4>
430
+ <p><strong>Processed at:</strong> {datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
431
+ <p><strong>Vision model used:</strong> {'Yes' if doc2['useVision'] else 'No'}</p>
432
+ """
433
+ tool_container(doc2_html)
434
+
435
+ # Display content summary
436
+ if 'ocr_contents' in doc2['result'] and isinstance(doc2['result']['ocr_contents'], dict):
437
+ if 'content' in doc2['result']['ocr_contents']:
438
+ content = doc2['result']['ocr_contents']['content']
439
+ st.markdown(f"""
440
+ <div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
441
+ border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">
442
+ {content[:500]}{'...' if len(content) > 500 else ''}
443
+ </div>
444
+ """, unsafe_allow_html=True)
445
+
446
+ # Comparison analysis
447
+ if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
448
+ comparison_content = """
449
+ <h3>Vision vs. Non-Vision Model Comparison</h3>
450
+ <p>You're comparing the same document processed with different models.
451
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.</p>
452
+
453
+ <p>Look for these differences:</p>
454
+ <ul>
455
+ <li>Completeness of extracted text</li>
456
+ <li>Accuracy of layout understanding</li>
457
+ <li>Recognition of complex elements (tables, figures)</li>
458
+ <li>Topic and language detection accuracy</li>
459
+ </ul>
460
+ """
461
+ key_concept(comparison_content)
462
+ else:
463
+ need_more_content = """
464
+ <h3>Need More Documents to Compare</h3>
465
+ <p>Process at least two documents to enable side-by-side comparison. Try processing
466
+ the same document with and without the vision model to see the differences in OCR quality.</p>
467
+ """
468
+ research_question(need_more_content)
469
+
470
+ # Analysis guide tab
471
+ with analyze_tab:
472
+ st.subheader("Analysis Guide")
473
+
474
+ st.markdown("""
475
+ ### How to Analyze OCR Results
476
+
477
+ When analyzing OCR results from historical documents, consider these key factors:
478
+
479
+ 1. **Text Accuracy**
480
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
481
+ - Assess recognition of period-specific typography and writing styles
482
+ - Evaluate handling of degraded or damaged text areas
483
+
484
+ 2. **Structure Preservation**
485
+ - Does the OCR maintain paragraph and section breaks?
486
+ - Are columns and tabular data correctly preserved?
487
+ - How well are page transitions handled?
488
+
489
+ 3. **Special Elements**
490
+ - Recognition of footnotes, marginalia, and annotations
491
+ - Handling of illustrations, diagrams, and decorative elements
492
+ - Treatment of watermarks, signatures, and stamps
493
+
494
+ 4. **Metadata Extraction**
495
+ - Accuracy of detected languages, topics, and document type
496
+ - Identification of dates, names, and key entities
497
+ - Recognition of document purpose and context
498
+ """)
499
+
500
+ col1, col2 = st.columns(2)
501
+
502
+ with col1:
503
+ challenge_content = """
504
+ <h3>Common OCR Challenges</h3>
505
+ <ul>
506
+ <li><strong>Typography Variations</strong>: Historical fonts that differ from modern text</li>
507
+ <li><strong>Material Degradation</strong>: Fading, stains, tears affecting legibility</li>
508
+ <li><strong>Handwritten Elements</strong>: Marginalia, signatures, and annotations</li>
509
+ <li><strong>Complex Layouts</strong>: Multi-column formats and decorative elements</li>
510
+ <li><strong>Language and Terminology</strong>: Archaic terms and multilingual content</li>
511
+ </ul>
512
+ """
513
+ gray_container(challenge_content)
514
+
515
+ with col2:
516
+ tips_content = """
517
+ <h3>Making the Most of OCR Results</h3>
518
+ <ul>
519
+ <li><strong>Contextual Reading</strong>: Use context to interpret unclear passages</li>
520
+ <li><strong>Error Patterns</strong>: Identify and correct systematic OCR errors</li>
521
+ <li><strong>Hybrid Analysis</strong>: Combine OCR search with close reading</li>
522
+ <li><strong>Comparative Processing</strong>: Try different settings on documents</li>
523
+ <li><strong>Iterative Refinement</strong>: Use insights to improve future processing</li>
524
+ </ul>
525
+ """
526
+ gray_container(tips_content)
527
+
528
+ # Show example analysis if there's processing history
529
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
530
+ with st.expander("Example Analysis from Your Documents"):
531
+ # Pick the latest document
532
+ latest = st.session_state.processing_history[-1]
533
+
534
+ st.markdown(f"""
535
+ #### Sample Analysis for: {latest['fileName']}
536
+
537
+ **Document Context:**
538
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
539
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
540
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
541
+
542
+ **What to Look For:**
543
+ 1. Check how well the model identified key topics and languages
544
+ 2. Evaluate the completeness of extracted text
545
+ 3. Note any systematic errors in text recognition
546
+ 4. Assess how well document structure was preserved
547
+ """)
modules/content/module6.py ADDED
@@ -0,0 +1,154 @@
1
+ import streamlit as st
2
+ from layout import gray_container, key_concept, quote, tool_container
3
+ from datetime import datetime
4
+
5
+ def render():
6
+ """Module 6: Conclusion and Future Directions"""
7
+
8
+ st.title("Module 6: Conclusion and Future Directions")
9
+
10
+ col1, col2 = st.columns([3, 2])
11
+
12
+ with col1:
13
+ summary_content = """
14
+ <h3>Workshop Summary</h3>
15
+ <p>Throughout this workshop, we've explored:</p>
16
+ <ol>
17
+ <li><strong>Text-Image Interdependence</strong>: The complex relationship between textual and visual elements</li>
18
+ <li><strong>OCR Technology</strong>: The evolution of OCR and its application to historical materials</li>
19
+ <li><strong>Methodological Approaches</strong>: Hybrid strategies for working with historical texts</li>
20
+ <li><strong>Practical Application</strong>: Hands-on experience with OCR processing tools</li>
21
+ </ol>
22
+ """
23
+ gray_container(summary_content)
24
+
25
+ takeaways_content = """
26
+ <h3>Key Takeaways</h3>
27
+ <ol>
28
+ <li><strong>OCR is Not Perfect</strong>: Even advanced AI models face challenges with historical documents</li>
29
+ <li><strong>Context Matters</strong>: Vision-enhanced models provide better results by understanding document context</li>
30
+ <li><strong>Hybrid Approaches</strong>: Combining computational methods with traditional research yields best results</li>
31
+ <li><strong>Critical Evaluation</strong>: Always evaluate OCR outputs with awareness of limitations</li>
32
+ <li><strong>Structured Extraction</strong>: Modern OCR goes beyond text recognition to understand document structure</li>
33
+ </ol>
34
+ """
35
+ gray_container(takeaways_content)
36
+
37
+ with col2:
38
+ # Display workshop statistics if there's processing history
39
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
40
+ st.subheader("Your Workshop Statistics")
41
+
42
+ # Calculate statistics
43
+ total_docs = len(st.session_state.processing_history)
44
+ vision_docs = len([item for item in st.session_state.processing_history if item['useVision']])
45
+ non_vision_docs = total_docs - vision_docs
46
+
47
+ # Create metrics for statistics
48
+ col1, col2 = st.columns(2)
49
+
50
+ with col1:
51
+ st.metric("Documents Processed", total_docs)
52
+ st.metric("With Vision Model", vision_docs)
53
+
54
+ with col2:
55
+ st.metric("Without Vision Model", non_vision_docs)
56
+
57
+ # Topics word cloud
58
+ if total_docs > 0:
59
+ st.subheader("Topics Encountered")
60
+ all_topics = []
61
+ for item in st.session_state.processing_history:
62
+ if 'topics' in item['result']:
63
+ all_topics.extend(item['result']['topics'])
64
+
65
+ if all_topics:
66
+ # Count topic frequencies
67
+ topic_counts = {}
68
+ for topic in all_topics:
69
+ if topic in topic_counts:
70
+ topic_counts[topic] += 1
71
+ else:
72
+ topic_counts[topic] = 1
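+ # (equivalent to: topic_counts = dict(collections.Counter(all_topics)))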
73
+
74
+ # Display topic frequencies as a bar chart
75
+ st.bar_chart(topic_counts)
76
+ else:
77
+ # Show placeholder stats
78
+ placeholder_content = """
79
+ <h3>Workshop Outcomes</h3>
80
+ <p>Complete the interactive OCR experiment in Module 5 to generate your personal workshop statistics.</p>
81
+ <p>You'll be able to see:</p>
82
+ <ul>
83
+ <li>Number of documents processed</li>
84
+ <li>Comparison of vision vs. non-vision models</li>
85
+ <li>Topics identified across your documents</li>
86
+ <li>Performance metrics for your processing tasks</li>
87
+ </ul>
88
+ """
89
+ tool_container(placeholder_content)
90
+
91
+ # Future directions section
92
+ st.subheader("Future Directions")
93
+
94
+ col1, col2 = st.columns(2)
95
+
96
+ with col1:
97
+ tech_content = """
98
+ <h3>Technological Developments</h3>
99
+ <ul>
100
+ <li><strong>Multimodal AI models</strong>: Increasingly sophisticated understanding</li>
101
+ <li><strong>Historical font training</strong>: Models trained on historical typography</li>
102
+ <li><strong>Document intelligence</strong>: Enhanced understanding of structures</li>
103
+ <li><strong>Collaborative correction</strong>: Platforms for collective improvement</li>
104
+ </ul>
105
+ """
106
+ gray_container(tech_content)
107
+
108
+ with col2:
109
+ research_content = """
110
+ <h3>Research Applications</h3>
111
+ <ul>
112
+ <li><strong>Large-scale corpus analysis</strong>: Processing entire archives</li>
113
+ <li><strong>Multilingual historical research</strong>: Working across languages</li>
114
+ <li><strong>Image-text integration</strong>: New methodologies for visual analysis</li>
115
+ <li><strong>Computational paleography</strong>: AI-assisted handwriting analysis</li>
116
+ </ul>
117
+ """
118
+ gray_container(research_content)
119
+
120
+ # Inspiring quote
121
+ quote_content = "The digital humanities are not about building, they're about sharing. The digital humanities are not about the digital at all. They're all about innovation and disruption. The digital humanities are really an insurgent humanities."
122
+ quote(quote_content, "Matthew Kirschenbaum, Professor of Digital Humanities")
123
+
124
+ # Additional resources
125
+ resources_content = """
126
+ <h3>Additional Resources</h3>
127
+ <ul>
128
+ <li><a href="https://docs.mistral.ai/" target="_blank">Mistral AI Documentation</a>: Learn more about the OCR models used in this workshop</li>
129
+ <li><a href="https://readcoop.eu/transkribus/" target="_blank">Transkribus</a>: Platform for historical document transcription</li>
130
+ <li><a href="https://ocr-d.de/en/" target="_blank">OCR-D</a>: Coordinated OCR research project for historical documents</li>
131
+ <li><a href="https://scholar.google.com/scholar?q=historical+OCR" target="_blank">Historical OCR Research Papers</a>: Academic research on historical OCR</li>
132
+ </ul>
133
+ """
134
+ tool_container(resources_content)
135
+
136
+ # Acknowledgments
137
+ st.subheader("Acknowledgments")
138
+
139
+ acknowledgment_content = """
140
+ <p>This workshop was designed as an educational resource for historians, archivists, and digital humanities scholars.</p>
141
+ <p>It demonstrates the integration of modern AI vision-language models with historical research methodologies.</p>
142
+ <p>Special thanks to the digital humanities community for continued innovation in computational approaches to historical research.</p>
143
+ """
144
+ st.markdown(acknowledgment_content, unsafe_allow_html=True)
145
+
146
+ # Restart the workshop button
147
+ if st.button("Start Workshop Again", use_container_width=True):
148
+ # Reset the session state to start the workshop again
149
+ if 'current_module' in st.session_state:
150
+ st.session_state.current_module = 1
151
+
152
+ # Do not reset the processing history
153
+
154
+ st.rerun()
modules/educational_module.py ADDED
@@ -0,0 +1,547 @@
1
+ import streamlit as st
2
+ import io
3
+ import tempfile
4
+ from pathlib import Path
5
+ from datetime import datetime
6
+ from layout import tool_container, key_concept, research_question, upload_container
7
+ import sys
8
+
9
+ # Import the necessary modules for OCR processing
10
+ sys.path.append(str(Path(__file__).parent.parent))
11
+ try:
12
+ from process_file import process_file as process_file_util
13
+ process_file = process_file_util
14
+ except ImportError:
15
+ # Fallback if process_file is not available
16
+ def process_file(uploaded_file, use_vision=True, custom_prompt=None):
17
+ """Fallback function for processing files"""
18
+ st.warning("Using mock processing function. Real OCR functionality is not available.")
19
+ return {
20
+ "file_name": uploaded_file.name,
21
+ "languages": ["English"],
22
+ "topics": ["History", "Document"],
23
+ "ocr_contents": {
24
+ "content": f"This is mock OCR content for {uploaded_file.name}. Vision model: {use_vision}"
25
+ }
26
+ }
27
+
28
+ def render():
29
+ """Module 5: Interactive OCR Experiment"""
30
+
31
+ st.title("Module 5: Interactive OCR Experiment")
32
+
33
+ # Introduction to the interactive experiment
34
+ intro_content = """
35
+ <h3>Interactive OCR Experiment</h3>
36
+ <p>
37
+ This interactive experiment allows you to process historical documents with OCR and analyze the results.
38
+ Try different settings and compare the outcomes to understand the strengths and limitations of OCR technology.
39
+ </p>
40
+ """
41
+ st.markdown(intro_content, unsafe_allow_html=True)
42
+
43
+ # Create tabs for different activities
44
+ experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
45
+
46
+ # Try to import PDF tools if available
47
+ try:
48
+ from pdf2image import convert_from_bytes
49
+ pdf_support = True
50
+ except ImportError:
51
+ pdf_support = False
52
+ st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")
53
+
54
+ with experiment_tab:
55
+ # Create a two-column layout
56
+ col1, col2 = st.columns([1, 1])
57
+
58
+ with col1:
59
+ # Tool container for document selection and options
60
+ st.subheader("Step 1: Select Document & Options")
61
+
62
+ # Processing options
63
+ use_vision = st.checkbox("Use Vision Model", value=True,
64
+ help="Use the vision model for improved analysis")
65
+
66
+ # Additional prompt
67
+ st.markdown("### Custom Research Prompt (Optional)")
68
+ st.markdown("""Provide additional instructions to guide the OCR analysis.
69
+ Focus on specific aspects of historical research you're interested in.""")
70
+ custom_prompt = st.text_area("Research Prompt",
71
+ placeholder="E.g., Focus on identifying dates and historical figures...",
72
+ help="Optional instructions to guide the analysis")
73
+
74
+ # Sample document selection
75
+ input_dir = Path(__file__).parent.parent / "input"
76
+
77
+ if input_dir.exists():
78
+ sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
79
+
80
+ if sample_files:
81
+ st.markdown("#### Sample Documents")
82
+ sample_options = ["Upload my own document"] + [f.name for f in sample_files]
83
+ sample_choice = st.selectbox("Choose a document:", sample_options)
84
+
85
+ if sample_choice != "Upload my own document":
86
+ # Process the selected sample file
87
+ selected_file = next((f for f in sample_files if f.name == sample_choice), None)
88
+
89
+ if selected_file:
90
+ # Store the selected sample file in session state
91
+ with open(selected_file, "rb") as f:
92
+ file_bytes = f.read()
93
+
94
+ st.session_state.sample_file = {
95
+ "name": selected_file.name,
96
+ "bytes": file_bytes
97
+ }
98
+
99
+ # Preview the selected sample
100
+ if selected_file.suffix.lower() == ".pdf" and pdf_support:
101
+ try:
102
+ with st.spinner("Generating PDF preview..."):
103
+ images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
104
+ if images:
105
+ st.image(images[0], caption=f"Preview: {selected_file.name}", use_column_width=True)
106
+ except Exception:
107
+ st.info(f"PDF selected: {selected_file.name}")
108
+ else:
109
+ # For images, display the file directly with PIL
110
+ try:
111
+ from PIL import Image
112
+ img = Image.open(io.BytesIO(file_bytes))
113
+ st.image(img, caption=f"Preview: {selected_file.name}", use_column_width=True)
114
+ except Exception:
115
+ st.info(f"Selected: {selected_file.name}")
116
+ else:
117
+ # Clear the sample file if "Upload my own" is selected
118
+ if 'sample_file' in st.session_state:
119
+ del st.session_state.sample_file
120
+
121
+ # Display file uploader
122
+ upload_html = """
123
+ <h4>Upload a document to get started</h4>
124
+ <p>Supported formats: PDF, JPG, PNG</p>
125
+ """
126
+
127
+ upload_container(upload_html)
128
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
129
+
130
+ if uploaded_file is not None:
131
+ # Display preview of the uploaded file
132
+ file_ext = Path(uploaded_file.name).suffix.lower()
133
+
134
+ if file_ext == ".pdf" and pdf_support:
135
+ try:
136
+ # Convert first page of PDF to image for preview
137
+ pdf_bytes = uploaded_file.getvalue()
138
+ with st.spinner("Generating PDF preview..."):
139
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
140
+ if images:
141
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
142
+ else:
143
+ st.info(f"PDF uploaded: {uploaded_file.name}")
144
+ except Exception:
145
+ st.info(f"PDF uploaded: {uploaded_file.name}")
146
+ elif file_ext != ".pdf":
147
+ st.image(uploaded_file, use_column_width=True)
148
+ else:
149
+ st.info(f"PDF uploaded: {uploaded_file.name}")
150
+ else:
151
+ # No sample files, just show the uploader
152
+ upload_html = """
153
+ <h4>Upload a document to get started</h4>
154
+ <p>Supported formats: PDF, JPG, PNG</p>
155
+ """
156
+
157
+ upload_container(upload_html)
158
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
159
+
160
+ if uploaded_file is not None:
161
+ # Display the file preview
162
+ file_ext = Path(uploaded_file.name).suffix.lower()
163
+ if file_ext == ".pdf" and pdf_support:
164
+ try:
165
+ pdf_bytes = uploaded_file.getvalue()
166
+ with st.spinner("Generating PDF preview..."):
167
+ images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
168
+ if images:
169
+ st.image(images[0], caption=f"PDF Preview: {uploaded_file.name}", use_column_width=True)
170
+ except Exception:
171
+ st.info(f"PDF uploaded: {uploaded_file.name}")
172
+ elif file_ext != ".pdf":
173
+ st.image(uploaded_file, use_column_width=True)
174
+ else:
175
+ st.info(f"PDF uploaded: {uploaded_file.name}")
176
+ else:
177
+ # No input directory
178
+ upload_html = """
179
+ <h4>Upload a document to get started</h4>
180
+ <p>Supported formats: PDF, JPG, PNG</p>
181
+ """
182
+
183
+ upload_container(upload_html)
184
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
185
+
186
+ # Process button
187
+ st.subheader("Step 2: Process the Document")
188
+
189
+ # Get the file to process (either uploaded or sample)
190
+ file_to_process = None
191
+ if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
192
+ # Create a FileUploader-like object from the sample file
193
+ class SampleFileObject:
194
+ def __init__(self, name, data):
195
+ self.name = name
196
+ self._data = data
197
+
198
+ def getvalue(self):
199
+ return self._data
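+ # Duck-types the two attributes this page reads from a Streamlit upload (.name and .getvalue()),
+ # so sample files can flow through process_file on the same path as real uploads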
200
+
201
+ file_to_process = SampleFileObject(
202
+ st.session_state.sample_file["name"],
203
+ st.session_state.sample_file["bytes"]
204
+ )
205
+ elif 'uploaded_file' in locals() and uploaded_file is not None:
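+ # locals() guard: the uploader widget is only created on branches where no sample document was chosen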
206
+ file_to_process = uploaded_file
207
+
208
+ # Process button
209
+ process_button = st.button(
210
+ "Process Document",
211
+ disabled=file_to_process is None,
212
+ use_container_width=True
213
+ )
214
+
215
+ if process_button and file_to_process is not None:
216
+ with st.spinner("Processing document..."):
217
+ try:
218
+ # Process the file
219
+ result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
220
+
221
+ if result:
222
+ st.success("Document processed successfully!")
223
+
224
+ # Store result in session state for display in the right column
225
+ st.session_state.current_result = result
226
+
227
+ # Add to processing history
228
+ history_item = {
229
+ "id": datetime.now().timestamp(),
230
+ "fileName": file_to_process.name,
231
+ "timestamp": datetime.now().isoformat(),
232
+ "result": result,
233
+ "useVision": use_vision
234
+ }
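+ # Timestamps are stored as ISO strings and re-parsed below with datetime.fromisoformat()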
235
+
236
+ if 'processing_history' not in st.session_state:
237
+ st.session_state.processing_history = []
238
+
239
+ st.session_state.processing_history.append(history_item)
240
+
241
+ st.rerun()
242
+ else:
243
+ st.error("Failed to process document.")
244
+ except Exception as e:
245
+ st.error(f"Error processing document: {str(e)}")
246
+
247
+ # Experiment instructions
248
+ experiment_content = """
249
+ <h3>Experiment Instructions</h3>
250
+ <ol>
251
+ <li><strong>Step 1:</strong> Select a document and choose your options</li>
252
+ <li><strong>Step 2:</strong> Process the document with the selected options</li>
253
+ <li><strong>Step 3:</strong> Analyze the results in the panel on the right</li>
254
+ <li><strong>Step 4:</strong> Try again with different settings (e.g., toggle vision model)</li>
255
+ <li><strong>Step 5:</strong> Compare results between different runs</li>
256
+ </ol>
257
+ """
258
+ key_concept(experiment_content)
259
+
260
+ with col2:
261
+ # Results display
262
+ st.subheader("Step 3: View Results")
263
+
264
+ if 'current_result' in st.session_state and st.session_state.current_result:
265
+ result = st.session_state.current_result
266
+
267
+ # Display results in a tool container
268
+ result_html = f"""
269
+ <h4>Results for: {result.get('file_name', 'Unknown')}</h4>
270
+ <p><strong>Languages:</strong> {', '.join(result.get('languages', ['Unknown']))}</p>
271
+ <p><strong>Topics:</strong> {', '.join(result.get('topics', ['Unknown']))}</p>
272
+ """
273
+ tool_container(result_html)
274
+
275
+ # Create tabs for different views
276
+ tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
277
+
278
+ with tab1:
279
+ # Display in a more user-friendly format
280
+ if 'ocr_contents' in result:
281
+ if isinstance(result['ocr_contents'], dict):
282
+ for section, content in result['ocr_contents'].items():
283
+ if content: # Only display non-empty sections
284
+ st.markdown(f"#### {section.replace('_', ' ').title()}")
285
+
286
+ if isinstance(content, str):
287
+ st.markdown(content)
288
+ elif isinstance(content, list):
289
+ for item in content:
290
+ if isinstance(item, str):
291
+ st.markdown(f"- {item}")
292
+ elif isinstance(item, dict):
293
+ st.json(item)
294
+ elif isinstance(content, dict):
295
+ for k, v in content.items():
296
+ st.markdown(f"**{k}:** {v}")
297
+
298
+ with tab2:
299
+ # Show the raw JSON
300
+ st.json(result)
301
+
302
+ # Download options
303
+ st.markdown("### Export Results")
304
+
305
+ col1, col2 = st.columns(2)
306
+
307
+ with col1:
308
+ # Export as JSON
309
+ import json
310
+ json_bytes = json.dumps(result, indent=2).encode()
311
+ st.download_button(
312
+ label="Download JSON",
313
+ data=json_bytes,
314
+ file_name="ocr_results.json",
315
+ mime="application/json",
316
+ use_container_width=True
317
+ )
318
+
319
+ with col2:
320
+ # Export as text if content is available
321
+ if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']:
322
+ text_content = result['ocr_contents']['content']
323
+ st.download_button(
324
+ label="Download Text",
325
+ data=text_content.encode(),
326
+ file_name="ocr_text.txt",
327
+ mime="text/plain",
328
+ use_container_width=True
329
+ )
330
+ else:
331
+ # Show placeholder when no results are available
332
+ placeholder_html = """
333
+ <h4>Results will appear here</h4>
334
+ <p>Upload and process a document to see the OCR results in this panel.</p>
335
+ <p>The OCR tool will:</p>
336
+ <ol>
337
+ <li>Extract text from your document</li>
338
+ <li>Identify languages and topics</li>
339
+ <li>Provide structured content analysis</li>
340
+ <li>Generate downloadable results</li>
341
+ </ol>
342
+ """
343
+ tool_container(placeholder_html)
344
+
345
+ # Display processing history if available
346
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
347
+ st.subheader("Step 4: Review Processing History")
348
+
349
+ # Most recent result
350
+ latest = st.session_state.processing_history[-1]
351
+ latest_html = f"""
352
+ <h4>Latest Document: {latest['fileName']}</h4>
353
+ <p><strong>Processed at:</strong> {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
354
+ <p><strong>Vision model used:</strong> {'Yes' if latest['useVision'] else 'No'}</p>
355
+ """
356
+ tool_container(latest_html)
357
+
358
+ # History in expander
359
+ with st.expander("View Complete Processing History"):
360
+ for i, item in enumerate(reversed(st.session_state.processing_history)):
361
+ st.markdown(f"""
362
+ <div style="background-color: var(--color-gray-700); padding: 0.75rem; border-radius: 0.5rem; margin-bottom: 0.5rem;">
363
+ <strong>{item['fileName']}</strong><br>
364
+ {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')} -
365
+ Vision model: {'Yes' if item['useVision'] else 'No'}
366
+ </div>
367
+ """, unsafe_allow_html=True)
368
+
369
+ # Option to view a previous result
370
+ if st.button("View This Result", key=f"view_history_{i}"):
371
+ st.session_state.current_result = item['result']
372
+ st.rerun()
373
+
374
+ # Compare tab for side-by-side comparison
375
+ with compare_tab:
376
+ st.subheader("Compare OCR Results")
377
+
378
+ if 'processing_history' in st.session_state and len(st.session_state.processing_history) >= 2:
379
+ st.markdown("""
380
+ Select two processing results to compare side by side. This allows you to see
381
+ how different options (like using the vision model) affect OCR quality.
382
+ """)
383
+
384
+ # Create selection dropdowns for the documents
385
+ col1, col2 = st.columns(2)
386
+ with col1:
387
+ # First document selector
388
+ doc_options_1 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
389
+ for i, item in enumerate(st.session_state.processing_history)]
390
+ doc_choice_1 = st.selectbox("First Document:", doc_options_1, key="compare_doc_1")
391
+ doc_index_1 = int(doc_choice_1.split(":")[0]) - 1
392
+
393
+ with col2:
394
+ # Second document selector
395
+ doc_options_2 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
396
+ for i, item in enumerate(st.session_state.processing_history)]
397
+ default_index = min(1, len(st.session_state.processing_history) - 1) # Default to second item
398
+ doc_choice_2 = st.selectbox("Second Document:", doc_options_2, key="compare_doc_2", index=default_index)
399
+ doc_index_2 = int(doc_choice_2.split(":")[0]) - 1
400
+
401
+ # Retrieve the selected documents
402
+ doc1 = st.session_state.processing_history[doc_index_1]
403
+ doc2 = st.session_state.processing_history[doc_index_2]
404
+
405
+ # Show comparison
406
+ col1, col2 = st.columns(2)
407
+
408
+ with col1:
409
+ doc1_html = f"""
410
+ <h4>Document 1: {doc1['fileName']}</h4>
411
+ <p><strong>Processed at:</strong> {datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
412
+ <p><strong>Vision model used:</strong> {'Yes' if doc1['useVision'] else 'No'}</p>
413
+ """
414
+ tool_container(doc1_html)
415
+
416
+ # Display content summary
417
+ if 'ocr_contents' in doc1['result'] and isinstance(doc1['result']['ocr_contents'], dict):
418
+ if 'content' in doc1['result']['ocr_contents']:
419
+ content = doc1['result']['ocr_contents']['content']
420
+ st.markdown(f"""
421
+ <div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
422
+ border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">
423
+ {content[:500]}{'...' if len(content) > 500 else ''}
424
+ </div>
425
+ """, unsafe_allow_html=True)
426
+
427
+ with col2:
428
+ doc2_html = f"""
429
+ <h4>Document 2: {doc2['fileName']}</h4>
430
+ <p><strong>Processed at:</strong> {datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}</p>
431
+ <p><strong>Vision model used:</strong> {'Yes' if doc2['useVision'] else 'No'}</p>
432
+ """
433
+ tool_container(doc2_html)
434
+
435
+ # Display content summary
436
+ if 'ocr_contents' in doc2['result'] and isinstance(doc2['result']['ocr_contents'], dict):
437
+ if 'content' in doc2['result']['ocr_contents']:
438
+ content = doc2['result']['ocr_contents']['content']
439
+ st.markdown(f"""
440
+ <div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
441
+ border: 1px solid #374151; padding: 1rem; background-color: #1f2937;">
442
+ {content[:500]}{'...' if len(content) > 500 else ''}
443
+ </div>
444
+ """, unsafe_allow_html=True)
445
+
446
+ # Comparison analysis
447
+ if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
448
+ comparison_content = """
449
+ <h3>Vision vs. Non-Vision Model Comparison</h3>
450
+ <p>You're comparing the same document processed with different models.
451
+ This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.</p>
452
+
453
+ <p>Look for these differences:</p>
454
+ <ul>
455
+ <li>Completeness of extracted text</li>
456
+ <li>Accuracy of layout understanding</li>
457
+ <li>Recognition of complex elements (tables, figures)</li>
458
+ <li>Topic and language detection accuracy</li>
459
+ </ul>
460
+ """
461
+ key_concept(comparison_content)
462
+ else:
463
+ need_more_content = """
464
+ <h3>Need More Documents to Compare</h3>
465
+ <p>Process at least two documents to enable side-by-side comparison. Try processing
466
+ the same document with and without the vision model to see the differences in OCR quality.</p>
467
+ """
468
+ research_question(need_more_content)
469
+
470
+ # Analysis guide tab
471
+ with analyze_tab:
472
+ st.subheader("Analysis Guide")
473
+
474
+ st.markdown("""
475
+ ### How to Analyze OCR Results
476
+
477
+ When analyzing OCR results from historical documents, consider these key factors:
478
+
479
+ 1. **Text Accuracy**
480
+ - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
481
+ - Assess recognition of period-specific typography and writing styles
482
+ - Evaluate handling of degraded or damaged text areas
483
+
484
+ 2. **Structure Preservation**
485
+ - Does the OCR maintain paragraph and section breaks?
486
+ - Are columns and tabular data correctly preserved?
487
+ - How well are page transitions handled?
488
+
489
+ 3. **Special Elements**
490
+ - Recognition of footnotes, marginalia, and annotations
491
+ - Handling of illustrations, diagrams, and decorative elements
492
+ - Treatment of watermarks, signatures, and stamps
493
+
494
+ 4. **Metadata Extraction**
495
+ - Accuracy of detected languages, topics, and document type
496
+ - Identification of dates, names, and key entities
497
+ - Recognition of document purpose and context
498
+ """)
499
+
500
+ col1, col2 = st.columns(2)
501
+
502
+ with col1:
503
+ challenge_content = """
504
+ <h3>Common OCR Challenges</h3>
505
+ <ul>
506
+ <li><strong>Typography Variations</strong>: Historical fonts that differ from modern text</li>
507
+ <li><strong>Material Degradation</strong>: Fading, stains, tears affecting legibility</li>
508
+ <li><strong>Handwritten Elements</strong>: Marginalia, signatures, and annotations</li>
509
+ <li><strong>Complex Layouts</strong>: Multi-column formats and decorative elements</li>
510
+ <li><strong>Language and Terminology</strong>: Archaic terms and multilingual content</li>
511
+ </ul>
512
+ """
513
+ gray_container(challenge_content)
514
+
515
+ with col2:
516
+ tips_content = """
517
+ <h3>Making the Most of OCR Results</h3>
518
+ <ul>
519
+ <li><strong>Contextual Reading</strong>: Use context to interpret unclear passages</li>
520
+ <li><strong>Error Patterns</strong>: Identify and correct systematic OCR errors</li>
521
+ <li><strong>Hybrid Analysis</strong>: Combine OCR search with close reading</li>
522
+ <li><strong>Comparative Processing</strong>: Try different settings on documents</li>
523
+ <li><strong>Iterative Refinement</strong>: Use insights to improve future processing</li>
524
+ </ul>
525
+ """
526
+ gray_container(tips_content)
527
+
528
+ # Show example analysis if there's processing history
529
+ if 'processing_history' in st.session_state and st.session_state.processing_history:
530
+ with st.expander("Example Analysis from Your Documents"):
531
+ # Pick the latest document
532
+ latest = st.session_state.processing_history[-1]
533
+
534
+ st.markdown(f"""
535
+ #### Sample Analysis for: {latest['fileName']}
536
+
537
+ **Document Context:**
538
+ - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))}
539
+ - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))}
540
+ - Vision model used: {'Yes' if latest['useVision'] else 'No'}
541
+
542
+ **What to Look For:**
543
+ 1. Check how well the model identified key topics and languages
544
+ 2. Evaluate the completeness of extracted text
545
+ 3. Note any systematic errors in text recognition
546
+ 4. Assess how well document structure was preserved
547
+ """)
modules/modular_app.py ADDED
@@ -0,0 +1,276 @@
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ import sys
4
+ from layout import page_wrapper
5
+ from modules import get_module, get_module_name, module_names
6
+
7
+ # Set page configuration with dark theme
8
+ st.set_page_config(
9
+ page_title="Historical OCR Workshop",
10
+ page_icon="📜",
11
+ layout="wide",
12
+ initial_sidebar_state="collapsed"
13
+ )
14
+
15
+ # Initialize session state for workshop navigation
16
+ if 'current_module' not in st.session_state:
17
+ st.session_state.current_module = 1
18
+
19
+ if 'workshop_started' not in st.session_state:
20
+ st.session_state.workshop_started = False
21
+
22
+ if 'processing_history' not in st.session_state:
23
+ st.session_state.processing_history = []
24
+
25
+ def navigate_to_module(module_number):
26
+ """Navigate to a specific module"""
27
+ st.session_state.current_module = module_number
28
+ st.rerun()
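+ # Helper for programmatic navigation; the sidebar buttons below update session state directly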
29
+
30
+ # Welcome screen if workshop hasn't been started
31
+ if not st.session_state.workshop_started:
32
+ def welcome_screen():
33
+ """Renders the welcome/start screen"""
34
+ # Hero section with eye-catching design
35
+ st.markdown("""
36
+ <div style="background: linear-gradient(135deg, #1E3A8A 0%, #2563EB 100%);
37
+ padding: 2rem; border-radius: 0.75rem; text-align: center;
38
+ margin-bottom: 2rem; box-shadow: 0 4px 6px rgba(0,0,0,0.3);">
39
+ <h1>Historical OCR Workshop</h1>
40
+ <p style="font-size: 1.25rem;">Unlock the potential of historical documents with modern OCR technology</p>
41
+ </div>
42
+ """, unsafe_allow_html=True)
43
+
44
+ # Introduction with cleaner layout
45
+ col1, col2 = st.columns([3, 2])
46
+
47
+ with col1:
48
+ st.markdown("""
49
+ <div style="background-color: #1f2937; padding: 1.5rem; border-radius: 0.75rem; margin-bottom: 1.5rem;">
50
+ <h3>Workshop Overview</h3>
51
+
52
+ This interactive workshop explores the application of OCR technology to historical documents,
53
+ combining theoretical understanding with practical experiences. Designed for historians,
54
+ archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
55
+ </div>
56
+ """, unsafe_allow_html=True)
57
+
58
+ st.markdown("""
59
+ <div style="background-color: #374151; padding: 0.75rem; border-radius: 0.5rem;
60
+ margin: 1rem 0; border-left: 3px solid #3B82F6;">
61
+ <h4>What is OCR?</h4>
62
+ Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
63
+ Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
64
+ historical research and digital humanities.
65
+ </div>
66
+ """, unsafe_allow_html=True)
67
+
68
+ with col2:
69
+ # Add an engaging research question
70
+ st.markdown("""
71
+ <div style="background-color: #1E3A8A; color: white; padding: 0.75rem;
72
+ border-radius: 0.5rem; margin: 1rem 0; border-left: 3px solid #60A5FA;">
73
+ <h4>For Historians:</h4>
74
+ How might OCR technology transform our access to and interpretation of historical documents?
75
+ What new research questions become possible when large archives become machine-readable?
76
+ </div>
77
+ """, unsafe_allow_html=True)
78
+
79
+ # Display a sample historical document image
80
+ input_dir = Path(__file__).parent.parent / "input"  # input/ lives at the repo root, one level above modules/
81
+ sample_path = input_dir / "magellan-travels.jpg"
82
+ if sample_path.exists():
83
+ try:
84
+ from PIL import Image
85
+ with Image.open(sample_path) as img:
86
+ st.image(img, caption="Sample Historical Document", width=300)
87
+ except Exception:
88
+ pass
89
+
90
+ # What you'll learn section with visual learning outcomes
91
+ st.markdown('<h3>What You\'ll Learn</h3>', unsafe_allow_html=True)
92
+
93
+ # Create three columns for clean layout
94
+ col1, col2, col3 = st.columns(3)
95
+
96
+ with col1:
97
+ st.markdown("""
98
+ <div style="background-color: #1f2937; padding: 1rem; border-radius: 0.5rem;">
99
+ <h4>Conceptual Understanding</h4>
100
+
101
+ - Text-image relationships in historical documents
102
+ - Evolution of OCR technology
103
+ - AI vision models for document analysis
104
+ - Historical typography challenges
105
+ </div>
106
+ """, unsafe_allow_html=True)
107
+
108
+ with col2:
109
+ st.markdown("""
110
+ <div style="background-color: #1f2937; padding: 1rem; border-radius: 0.5rem;">
111
+ <h4>Methodological Approaches</h4>
112
+
113
+ - Critical frameworks for OCR in historical research
114
+ - Hybrid computational-traditional methods
115
+ - Error analysis and interpretation
116
+ - Contextual reading strategies
117
+ </div>
118
+ """, unsafe_allow_html=True)
119
+
120
+ with col3:
121
+ st.markdown("""
122
+ <div style="background-color: #1f2937; padding: 1rem; border-radius: 0.5rem;">
123
+ <h4>Practical Skills</h4>
124
+
125
+ - Processing historical documents with OCR
126
+ - Analyzing and structuring extracted information
127
+ - Integrating OCR into research workflows
128
+ - Building searchable archives
129
+ </div>
130
+ """, unsafe_allow_html=True)
131
+
132
+ # Module overview
133
+ st.markdown('<h3>Workshop Modules</h3>', unsafe_allow_html=True)
134
+
135
+ # First row of modules
136
+ col1, col2 = st.columns(2)
137
+
138
+ with col1:
139
+ for i in [1, 3, 5]:
140
+ st.markdown(f"""
141
+ <div style="background-color: #1f2937; border-radius: 8px; padding: 16px;
142
+ margin-bottom: 16px; border-top: 4px solid #3B82F6;">
143
+ <div style="background-color: #3B82F6; color: white; font-weight: bold;
144
+ padding: 4px 10px; border-radius: 12px; font-size: 0.9rem;
145
+ display: inline-block; margin-bottom: 8px;">Module {i}</div>
146
+ <div style="font-weight: 600; margin-bottom: 8px; font-size: 1.1rem; color: white;">
147
+ {module_names[i-1]}
148
+ </div>
149
+ <p>Module {i} of the historical OCR workshop.</p>
150
+ </div>
151
+ """, unsafe_allow_html=True)
152
+
153
+ with col2:
154
+ for i in [2, 4, 6]:
155
+ st.markdown(f"""
156
+ <div style="background-color: #1f2937; border-radius: 8px; padding: 16px;
157
+ margin-bottom: 16px; border-top: 4px solid #3B82F6;">
158
+ <div style="background-color: #3B82F6; color: white; font-weight: bold;
159
+ padding: 4px 10px; border-radius: 12px; font-size: 0.9rem;
160
+ display: inline-block; margin-bottom: 8px;">Module {i}</div>
161
+ <div style="font-weight: 600; margin-bottom: 8px; font-size: 1.1rem; color: white;">
162
+ {module_names[i-1]}
163
+ </div>
164
+ <p>Module {i} of the historical OCR workshop.</p>
165
+ </div>
166
+ """, unsafe_allow_html=True)
167
+
168
+ # Inspirational quote
169
+ st.markdown("""
170
+ <div style="font-style: italic; color: #D1D5DB; padding: 0.5rem 1rem;
171
+ border-left: 3px solid #4B5563; margin: 1rem 0;">
172
+ "The digital turn in historical research is not just about converting analog to digital;
173
+ it's about transforming how we access, analyze, and interpret the past."
174
+ <br/><br/>
175
+ <span style="font-size:0.9rem; text-align:right; display:block;">— Dr. Jane Winters, Professor of Digital Humanities</span>
176
+ </div>
177
+ """, unsafe_allow_html=True)
178
+
179
+ # Start button with enhanced styling
180
+ st.markdown('<div style="text-align: center; margin-top: 2rem;">', unsafe_allow_html=True)
181
+ col1, col2, col3 = st.columns([1, 2, 1])
182
+ with col2:
183
+ if st.button("Begin Workshop Journey", key="start_workshop", type="primary", use_container_width=True):
184
+ st.session_state.workshop_started = True
185
+ st.rerun()
186
+ st.markdown('<p style="text-align:center; margin-top:8px; font-size:0.9rem; color:#666;">No installation required • Start immediately</p>', unsafe_allow_html=True)
187
+ st.markdown('</div>', unsafe_allow_html=True)
188
+
189
+ # Display the welcome screen (outside modules)
190
+ welcome_screen()
191
+ else:
192
+ # Get the current module to display
193
+ current_module = st.session_state.current_module
194
+ module = get_module(current_module)
195
+
196
+ # Create navigation callbacks for the page wrapper
197
+ def nav_to_prev():
198
+ if current_module > 1:
199
+ st.session_state.current_module = current_module - 1
200
+ st.rerun()
201
+
202
+ def nav_to_next():
203
+ if current_module < 6:
204
+ st.session_state.current_module = current_module + 1
205
+ st.rerun()
206
+
207
+ # Create the sidebar navigation
208
+ with st.sidebar:
209
+ st.markdown("<h1>Workshop Navigation</h1>", unsafe_allow_html=True)
210
+
211
+ # Visual header
212
+ st.markdown("<div style='display:flex; align-items:center; margin-bottom:20px;'>", unsafe_allow_html=True)
213
+
214
+ # Show a progress indicator
215
+ st.markdown(f"<div><b>Your Progress:</b> Module {current_module} of 6</div>", unsafe_allow_html=True)
216
+ st.progress(current_module / 6)
217
+
218
+ # Module navigation buttons
219
+ st.markdown("<h3>Modules</h3>", unsafe_allow_html=True)
220
+
221
+ for i, name in enumerate(module_names, 1):
222
+ btn_style = "primary" if i == current_module else "secondary"
223
+ if st.button(f"{i}: {name}", key=f"nav_module_{i}", type=btn_style, use_container_width=True):
224
+ st.session_state.current_module = i
225
+ st.rerun()
226
+
227
+ # About the workshop in a collapsible section
228
+ with st.expander("About the Workshop"):
229
+ st.markdown("""
230
+ This interactive workshop explores OCR technology for historical documents.
231
+
232
+ **How to use this workshop:**
233
+ 1. Navigate through modules sequentially
234
+ 2. Expand content sections to read more
235
+ 3. Try the interactive OCR experiment
236
+ 4. Reflect on research questions
237
+
238
+ For help or more information, use the reference materials in Module 6.
239
+ """)
240
+
241
+ # Processing history if available
242
+ if st.session_state.processing_history:
243
+ with st.expander("Your Activity"):
244
+ st.markdown(f"<b>Documents processed:</b> {len(st.session_state.processing_history)}", unsafe_allow_html=True)
245
+
246
+ # Show the most recent document processed
247
+ latest = st.session_state.processing_history[-1]
248
+ st.markdown(f"""
249
+ <div style="background:#f9f9f9; padding:8px; border-radius:4px; margin-top:10px; color:#333;">
250
+ <b>Latest document:</b> {latest['fileName']}<br>
251
+ <span style="font-size:0.9rem;">Processed with {' vision model' if latest['useVision'] else ' basic OCR'}</span>
252
+ </div>
253
+ """, unsafe_allow_html=True)
254
+
255
+ # Render the current module content using the page wrapper
256
+ page_wrapper(module.render, current_module)
257
+
258
+ # At the bottom of the page, create the hidden navigation buttons for the fixed navigation bar
259
+ if st.session_state.workshop_started:
260
+ # Previous navigation button (hidden, activated by the fixed nav)
261
+ if st.session_state.current_module > 1:
262
+ if st.button("←", key=f"nav_prev_{st.session_state.current_module-1}"):
263
+ st.session_state.current_module -= 1
264
+ st.rerun()
265
+
266
+ # Next navigation button (hidden, activated by the fixed nav)
267
+ if st.session_state.current_module < 6:
268
+ if st.button("→", key=f"nav_next_{st.session_state.current_module+1}"):
269
+ st.session_state.current_module += 1
270
+ st.rerun()
271
+
272
+ # Module navigation dots (hidden, activated by the fixed nav)
273
+ for i in range(1, 7):
274
+ if st.button(f"{i}", key=f"nav_dot_{i}"):
275
+ st.session_state.current_module = i
276
+ st.rerun()
ocr_utils.py ADDED
@@ -0,0 +1,212 @@
+ """
+ Utility functions for OCR processing with Mistral AI.
+ Contains helper functions for working with OCR responses and image handling.
+ """
+ 
+ import json
+ import base64
+ import io
+ import mimetypes
+ from pathlib import Path
+ from typing import Dict, List, Optional, Union, Any
+ 
+ try:
+     from PIL import Image
+     PILLOW_AVAILABLE = True
+ except ImportError:
+     PILLOW_AVAILABLE = False
+ 
+ from mistralai import DocumentURLChunk, ImageURLChunk, TextChunk
+ 
+ def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
+     """
+     Replace image placeholders in markdown with base64-encoded images.
+ 
+     Args:
+         markdown_str: Markdown text containing image placeholders
+         images_dict: Dictionary mapping image IDs to base64 data URLs
+ 
+     Returns:
+         Markdown text with placeholders replaced by base64 data
+     """
+     for img_name, base64_str in images_dict.items():
+         markdown_str = markdown_str.replace(
+             f"![{img_name}]({img_name})", f"![{img_name}]({base64_str})"
+         )
+     return markdown_str
+ 
+ def get_combined_markdown(ocr_response) -> str:
+     """
+     Combine OCR text and images into a single markdown document.
+     Ensures proper spacing between text and images.
+ 
+     Args:
+         ocr_response: Response from OCR processing containing text and images.
+             See https://docs.mistral.ai/capabilities/document/ for the API reference.
+ 
+     Returns:
+         Combined markdown string with embedded images
+     """
+     markdowns: list[str] = []
+     for page in ocr_response.pages:
+         # Extract images from the page
+         image_data = {}
+         for img in page.images:
+             image_data[img.id] = img.image_base64
+ 
+         # Replace image placeholders with the actual images
+         page_markdown = replace_images_in_markdown(page.markdown, image_data)
+ 
+         # Add extra newlines between paragraphs and images to improve rendering
+         page_markdown = page_markdown.replace("\n", "\n\n")
+ 
+         markdowns.append(page_markdown)
+ 
+     # Join pages with clear separators for multi-page documents
+     return "\n\n---\n\n".join(markdowns)
+ 
+ def encode_image_for_api(image_path: Union[str, Path]) -> str:
+     """
+     Encode an image as a base64 data URL for API use.
+ 
+     Args:
+         image_path: Path to the image file
+ 
+     Returns:
+         Base64 data URL for the image
+     """
+     # Convert to a Path object if a string was given
+     image_file = Path(image_path) if isinstance(image_path, str) else image_path
+ 
+     # Verify the image exists
+     if not image_file.is_file():
+         raise FileNotFoundError(f"Image file not found: {image_file}")
+ 
+     # Guess the MIME type from the file suffix rather than assuming JPEG
+     mime_type, _ = mimetypes.guess_type(image_file.name)
+     mime_type = mime_type or "image/jpeg"
+ 
+     # Encode the image as base64
+     encoded = base64.b64encode(image_file.read_bytes()).decode()
+     return f"data:{mime_type};base64,{encoded}"
+ 
+ def process_image_with_ocr(client, image_path: Union[str, Path], model: str = "mistral-ocr-latest"):
+     """
+     Process an image with OCR and return the response.
+ 
+     Args:
+         client: Mistral AI client
+         image_path: Path to the image file
+         model: OCR model to use
+ 
+     Returns:
+         OCR response object
+     """
+     # Encode the image as a base64 data URL
+     base64_data_url = encode_image_for_api(image_path)
+ 
+     # Process the image with OCR
+     image_response = client.ocr.process(
+         document=ImageURLChunk(image_url=base64_data_url),
+         model=model
+     )
+ 
+     return image_response
+ 
+ def ocr_response_to_json(ocr_response, indent: int = 4) -> str:
+     """
+     Convert an OCR response to a formatted JSON string.
+ 
+     Args:
+         ocr_response: OCR response object
+         indent: Indentation level for JSON formatting
+ 
+     Returns:
+         Formatted JSON string
+     """
+     # Round-trip through the model's JSON serializer, then pretty-print
+     response_dict = json.loads(ocr_response.model_dump_json())
+     return json.dumps(response_dict, indent=indent)
+ 
+ def get_combined_markdown_compressed(ocr_response, max_width: int = 800, quality: int = 85) -> str:
+     """
+     Combine OCR text and images into a single markdown document with compressed images.
+     Reduces image sizes to improve performance.
+ 
+     Args:
+         ocr_response: Response from OCR processing containing text and images
+         max_width: Maximum width to resize images to (aspect ratio is preserved)
+         quality: JPEG quality (0-100) for compression
+ 
+     Returns:
+         Combined markdown string with embedded compressed images
+     """
+     if not PILLOW_AVAILABLE:
+         # Fall back to the uncompressed method if PIL is not available
+         return get_combined_markdown(ocr_response)
+ 
+     markdowns: list[str] = []
+ 
+     # Process each page
+     for page in ocr_response.pages:
+         image_data = {}
+ 
+         # Process and compress each image
+         for img in page.images:
+             try:
+                 # Decode the base64 payload (strip any data-URL prefix first)
+                 img_bytes = base64.b64decode(
+                     img.image_base64.split(',')[1] if ',' in img.image_base64 else img.image_base64
+                 )
+ 
+                 # Open with PIL and record the format now, since resize()
+                 # returns a new image whose .format attribute is None
+                 pil_img = Image.open(io.BytesIO(img_bytes))
+                 img_format = pil_img.format or 'JPEG'
+ 
+                 # Resize if needed (maintain aspect ratio)
+                 original_width, original_height = pil_img.size
+                 if original_width > max_width:
+                     ratio = max_width / original_width
+                     new_height = int(original_height * ratio)
+                     pil_img = pil_img.resize((max_width, new_height), Image.LANCZOS)
+ 
+                 # Re-encode with compression
+                 buffer = io.BytesIO()
+                 if img_format.upper() in ('JPEG', 'JPG'):
+                     pil_img.save(buffer, format=img_format, quality=quality, optimize=True)
+                 else:
+                     # Non-JPEG formats (PNG, etc.) do not take a quality setting
+                     pil_img.save(buffer, format=img_format, optimize=True)
+ 
+                 # Convert back to a base64 data URL
+                 compressed_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+                 mime_type = f"image/{img_format.lower()}"
+                 image_data[img.id] = f"data:{mime_type};base64,{compressed_base64}"
+ 
+             except Exception:
+                 # If compression fails, fall back to the original image
+                 image_data[img.id] = img.image_base64
+ 
+         # Replace image placeholders with the compressed images
+         page_markdown = replace_images_in_markdown(page.markdown, image_data)
+ 
+         # Add extra newlines between paragraphs and images to improve rendering
+         page_markdown = page_markdown.replace("\n", "\n\n")
+ 
+         markdowns.append(page_markdown)
+ 
+     # Join pages with clear separators
+     return "\n\n---\n\n".join(markdowns)
+ 
+ # For display in notebooks
+ try:
+     from IPython.display import Markdown, display
+ 
+     def display_ocr_with_images(ocr_response):
+         """
+         Display an OCR response with embedded images in IPython environments.
+ 
+         Args:
+             ocr_response: OCR response object
+         """
+         combined_markdown = get_combined_markdown(ocr_response)
+         display(Markdown(combined_markdown))
+ except ImportError:
+     # IPython not available
+     pass
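Taken together, these helpers form a small pipeline: encode an image as a data URL, send it to Mistral OCR, then render or serialize the response. A minimal usage sketch, assuming a MISTRAL_API_KEY environment variable and one of the repo's sample images — both are assumptions about the runtime environment, not part of this file:

import os
from mistralai import Mistral
from ocr_utils import process_image_with_ocr, get_combined_markdown, ocr_response_to_json

# Assumes the API key is set in the environment
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

# Run OCR on a sample image from the repo's input directory
response = process_image_with_ocr(client, "input/recipe.jpg")

# Two views of the same response: rendered markdown and raw JSON
print(get_combined_markdown(response)[:500])
print(ocr_response_to_json(response)[:500])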
output/.gitkeep ADDED
File without changes
output/example-1.html ADDED
The diff for this file is too large to render. See raw diff
 
output/recipe_test.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "file_name": "img-0.jpeg",
+     "topics": [
+         "Cooking",
+         "Recipes",
+         "Baking"
+     ],
+     "languages": [
+         "English"
+     ],
+     "ocr_contents": {
+         "title": "Pecan Butterballs Cookies",
+         "recipe": "1 cup butter, creamy if possible\n1/4 inch honey\n2 \" ounces flour\n1/2 teaspoon salt\n2 \" ounces pecans\n2 cups finely chopped pecans\nForm into small balls, bake at 300 40-45 min roll in uncoated sugar"
+     },
+     "confidence_score": 0.85,
+     "raw_response":
output/ymca-letter.jpg ADDED

Git LFS Details

  • SHA256: 22e0102df7d37ad482169f796435fa228a0b42b2a1661380044f781589ccbac8
  • Pointer size: 131 Bytes
  • Size of remote file: 211 kB