import os
import streamlit as st
import json
import sys
from pathlib import Path
import tempfile
from datetime import datetime
import io
import base64
from io import BytesIO
from enum import Enum
import inspect
# Add this script's directory to the import path so the sibling OCR modules
# (structured_ocr, process_file) can be imported regardless of the working directory.
parent_dir = Path(__file__).parent.absolute()
if str(parent_dir) not in sys.path:
    # Previously the path was computed but never registered, so the comment's
    # promise was not actually kept.
    sys.path.append(str(parent_dir))
# Import the StructuredOCR class and process_file utility
from structured_ocr import StructuredOCR
# Add API endpoint support for the React app
from streamlit.web.server.server import Server
from streamlit.runtime.scriptrunner import get_script_run_ctx
# Custom JSON encoder to handle Enum types and other non-serializable objects
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that also handles enums, model-like objects and plain objects.

    Use it via ``json.dumps(value, cls=EnhancedJSONEncoder)``.
    """

    def default(self, obj):
        """Return a JSON-compatible replacement for ``obj``.

        Resolution order:
          1. ``Enum`` members -> their ``.value``.
          2. Objects exposing a callable ``model_dump()`` (Pydantic-style models).
          3. Objects exposing a callable ``to_dict()``.
          4. Any object with a ``__dict__`` -> its public (non-underscore) attributes.

        Raises:
            TypeError: raised by the base class when none of the above applies.
        """
        if isinstance(obj, Enum):
            return obj.value

        # Explicit serialisation hooks must be tried BEFORE the generic __dict__
        # fallback: nearly every object that defines model_dump()/to_dict() also
        # has a __dict__, so the old ordering made these branches unreachable.
        # Classes themselves are skipped because their hooks are unbound.
        if not isinstance(obj, type):
            for hook_name in ("model_dump", "to_dict"):
                hook = getattr(obj, hook_name, None)
                if callable(hook):
                    return hook()

        if hasattr(obj, "__dict__"):
            # Generic fallback: expose public attributes only
            return {
                key: value
                for key, value in obj.__dict__.items()
                if not key.startswith("_")
            }

        # Let the base class raise TypeError for anything else
        return super().default(obj)
# Helper function to convert any result to JSON-serializable
def make_serializable(obj):
    """Recursively convert ``obj`` into a JSON-serializable structure.

    Handling, in order:
      * ``dict`` -> dict with every value converted (keys are kept as-is).
      * ``list`` -> list with every item converted.
      * ``Enum`` -> its ``.value``.
      * objects with a callable ``model_dump()`` or ``to_dict()`` -> the converted
        result of that call.
      * objects with a ``__dict__`` -> dict of converted public (non-underscore)
        attributes; if the object also exposes ``pages`` (e.g. an OCR response),
        the pages are converted explicitly so they are present even when
        ``pages`` is not stored in ``__dict__``.
      * anything else is returned unchanged.

    Args:
        obj: Any value.

    Returns:
        The converted value as described above.
    """
    if isinstance(obj, dict):
        return {key: make_serializable(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [make_serializable(item) for item in obj]
    if isinstance(obj, Enum):
        return obj.value

    # Explicit serialisation hooks win over the generic __dict__ walk; with the
    # previous ordering they could never run for ordinary objects because the
    # __dict__ branch matched first. Classes are skipped (unbound hooks).
    if not isinstance(obj, type):
        for hook_name in ("model_dump", "to_dict"):
            hook = getattr(obj, hook_name, None)
            if callable(hook):
                return make_serializable(hook())

    if hasattr(obj, "__dict__"):
        converted = {
            key: make_serializable(value)
            for key, value in obj.__dict__.items()
            if not key.startswith("_")
        }
        # Special case for response objects that expose their pages
        pages = getattr(obj, "pages", None)
        if pages is not None:  # guard: iterating None used to raise TypeError
            converted["pages"] = [make_serializable(page) for page in pages]
        return converted

    # Basic types (str, int, float, bool, None, ...) are returned as is
    return obj
# API response handler
def process_api_request():
    """Handle an API request coming from the React frontend.

    Looks up the current Streamlit session, fetches the upload registered under
    the ``'file'`` key, runs it through ``process_file`` and converts the result
    with ``make_serializable``.

    Returns:
        ``None`` when there is no script-run context, no session info, or no
        uploaded ``'file'`` entry; otherwise the JSON-serializable OCR result,
        or ``{"error": <message>}`` if processing raised.
    """
    # Get the current Streamlit session; bail out early when there is none
    # (the guards previously fell through and dereferenced the None value).
    ctx = get_script_run_ctx()
    if ctx is None:
        return None

    # NOTE(review): Server.get_current(), _get_session_info() and
    # uploaded_file_mgr._uploaded_files are private Streamlit internals and may
    # not exist in the installed Streamlit version - confirm before relying on this.
    session_info = Server.get_current()._get_session_info(ctx.session_id)
    if session_info is None:
        return None

    request = session_info.uploaded_file_mgr._uploaded_files.get('file')
    if not request:
        return None

    try:
        # Extract file and parameters
        uploaded_file = request[0]
        # Query values are read as lists; vision defaults to enabled
        use_vision = session_info.query_string.get('use_vision', ['true'])[0].lower() == 'true'
        # Process file and convert the result to a JSON-serializable format
        result = process_file(uploaded_file, use_vision=use_vision)
        return make_serializable(result)
    except Exception as e:
        # Top-level API boundary: report the failure to the caller instead of raising
        return {"error": str(e)}
try:
    from process_file import process_file as process_file_util
    # Use the utility function instead of the local function
    process_file = process_file_util
except ImportError:
    # Define the process_file function if it's not available
    def process_file(uploaded_file, use_vision=True):
        """Process the uploaded file and return the OCR results.

        The upload is copied to a temporary file (keeping its extension), run
        through ``StructuredOCR.process_file`` and recorded in
        ``st.session_state.processing_history``.

        Args:
            uploaded_file: Upload object exposing ``.name``.
                NOTE(review): assumed to also expose ``.getvalue()`` like a
                Streamlit UploadedFile - confirm against callers.
            use_vision: Forwarded to the OCR processor as ``use_vision``.

        Returns:
            The OCR result, or ``None`` if processing failed (the error is
            shown with ``st.error``).
        """
        suffix = Path(uploaded_file.name).suffix
        # Save the uploaded file to a temporary file. The bytes must actually be
        # written: previously the temp file was created empty and then processed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(uploaded_file.getvalue())
            temp_path = tmp.name

        try:
            # Initialize OCR processor
            processor = StructuredOCR()
            # Determine file type from extension
            file_type = "pdf" if suffix.lower() == ".pdf" else "image"
            # Process the file
            result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision)

            # Add to processing history
            now = datetime.now()
            history_item = {
                "id": now.timestamp(),
                "fileName": uploaded_file.name,
                "timestamp": now.isoformat(),
                "result": result,
                "useVision": use_vision
            }
            if 'processing_history' not in st.session_state:
                st.session_state.processing_history = []
            st.session_state.processing_history.append(history_item)
            return result
        except Exception as e:
            st.error(f"Error processing document: {str(e)}")
            return None
        finally:
            # Clean up the temporary file (delete=False means we own its removal)
            if os.path.exists(temp_path):
                os.unlink(temp_path)
# Set page configuration. The keyword arguments were orphaned without their
# enclosing call; st.set_page_config is the Streamlit API that accepts them.
st.set_page_config(
    page_title="Historical OCR Workshop",
    initial_sidebar_state="collapsed",  # Start with sidebar collapsed for cleaner landing
)
# Custom CSS to match React dark theme and improve Streamlit integration.
# The stylesheet is injected as raw HTML; every rule needs its closing brace and
# the whole sheet must sit inside <style> tags, otherwise the browser discards it.
st.markdown("""
<style>
    /* Global theme alignment with React */
    .stApp {
        background-color: #111827; /* bg-gray-900 in Tailwind */
        color: white;
        position: relative;
    }
    /* Add subtle grid background to the entire app */
    .stApp::before {
        content: "";
        position: fixed;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        background-image:
            linear-gradient(rgba(55, 65, 81, 0.1) 1px, transparent 1px),
            linear-gradient(90deg, rgba(55, 65, 81, 0.1) 1px, transparent 1px);
        background-size: 25px 25px;
        opacity: 0.3;
        z-index: 0;
        pointer-events: none;
    }
    /* Make sure all text is visible */
    p, h1, h2, h3, h4, h5, h6, div, span, li, label, a {
        color: white !important;
        position: relative;
    }
    /* Add spacier typography */
    p, li {
        line-height: 1.7;
        margin-bottom: 0.8rem;
    }
    h3 {
        margin-top: 1.5rem;
        margin-bottom: 1.2rem;
        font-weight: 600;
        letter-spacing: 0.015em;
    }
    /* Fix empty-looking containers */
    div:empty {
        min-height: 0 !important;
        padding: 0 !important;
        margin: 0 !important;
    }
    /* Custom header */
    .main-header {
        background-color: #000000;
        padding: 1rem;
        border-bottom: 1px solid #374151; /* border-gray-700 */
        margin-bottom: 1.5rem;
    }
    /* Content containers */
    .content-container {
        background-color: #1f2937; /* bg-gray-800 in Tailwind */
        color: white;
        padding: 1.8rem;
        border-radius: 0.75rem;
        margin-bottom: 2rem;
        box-shadow: 0 6px 12px rgba(0,0,0,0.2);
        border: 1px solid rgba(75, 85, 99, 0.3);
        position: relative;
        overflow: hidden;
    }
    .content-container::before {
        content: "";
        position: absolute;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        background-image: linear-gradient(rgba(55, 65, 81, 0.2) 1px, transparent 1px),
                          linear-gradient(90deg, rgba(55, 65, 81, 0.2) 1px, transparent 1px);
        background-size: 20px 20px;
        opacity: 0.05;
        z-index: 0;
    }
    .content-container > * {
        position: relative;
        z-index: 1;
        color: white !important;
    }
    .content-container h4 {
        color: #60A5FA !important;
        margin-bottom: 1rem;
        font-size: 1.2rem;
        letter-spacing: 0.02em;
    }
    .content-container ul, .content-container ol {
        padding-left: 1.5rem;
        margin-top: 0.8rem;
        margin-bottom: 0.8rem;
    }
    .content-container li {
        margin-bottom: 0.5rem;
        line-height: 1.5;
    }
    /* Interactive elements */
    .tool-container {
        background-color: #1f2937; /* bg-gray-800 */
        color: white;
        padding: 1.5rem;
        border-radius: 0.5rem;
        border: 1px solid #374151; /* border-gray-700 */
        margin-bottom: 1.5rem;
    }
    .tool-container * {
        color: white !important;
    }
    /* Special containers */
    .key-concept {
        background-color: #374151; /* bg-gray-700 */
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
        border-left: 3px solid #3B82F6; /* border-blue-500 */
        color: white;
    }
    .key-concept * {
        color: white !important;
    }
    .research-question {
        background-color: #1E3A8A; /* bg-blue-900 */
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
        border-left: 3px solid #60A5FA; /* border-blue-400 */
        color: white;
    }
    .research-question * {
        color: white !important;
    }
    .quote-container {
        font-style: italic;
        color: #D1D5DB; /* text-gray-300 */
        padding: 0.5rem 1rem;
        border-left: 3px solid #4B5563; /* border-gray-600 */
        margin: 1rem 0;
    }
    /* Hero section */
    .hero-container {
        background: linear-gradient(135deg, #1E3A8A 0%, #2563EB 100%); /* blue-900 to blue-600 */
        color: white;
        padding: 3rem 2rem;
        border-radius: 0.75rem;
        margin-bottom: 3rem;
        box-shadow: 0 8px 16px rgba(0,0,0,0.4);
        text-align: center;
        position: relative;
        overflow: hidden;
    }
    .hero-container::before {
        content: "";
        position: absolute;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        background-image:
            linear-gradient(rgba(30, 58, 138, 0.8) 1px, transparent 1px),
            linear-gradient(90deg, rgba(30, 58, 138, 0.8) 1px, transparent 1px);
        background-size: 20px 20px;
        opacity: 0.2;
        z-index: 0;
    }
    .hero-title {
        font-size: 2.5rem;
        font-weight: 700;
        margin-bottom: 1rem;
        position: relative;
        z-index: 1;
    }
    .hero-subtitle {
        font-size: 1.25rem;
        opacity: 0.9;
        max-width: 700px;
        margin: 0 auto;
        position: relative;
        z-index: 1;
    }
    /* File upload styling */
    .upload-container {
        border: 2px dashed #4B5563; /* border-gray-600 */
        padding: 1.5rem;
        text-align: center;
        border-radius: 0.5rem;
        margin-bottom: 1rem;
        background-color: #374151; /* bg-gray-700 */
        color: white;
    }
    /* History cards */
    .history-card {
        padding: 0.75rem;
        background-color: #374151; /* bg-gray-700 */
        color: white;
        border-radius: 0.25rem;
        border: 1px solid #4B5563; /* border-gray-600 */
        margin-bottom: 0.5rem;
    }
    /* Override Streamlit defaults */
    .stTextInput > div > div > input {
        background-color: #374151; /* bg-gray-700 */
        color: white !important;
    }
    .stSelectbox > div > div > div {
        background-color: #374151; /* bg-gray-700 */
        color: white !important;
    }
    .stCheckbox > div > label {
        color: white !important;
    }
    /* Fix empty containers */
    .tool-container:empty {
        display: none;
    }
    /* Ensure all text is visible */
    div, span, p, h1, h2, h3, h4, h5, h6, label, li {
        color: white !important;
    }
    /* Tab panel content */
    .stTabs [data-baseweb="tab-panel"] * {
        color: white !important;
    }
    /* Button styling */
    .stButton > button {
        background-color: #2563EB; /* bg-blue-600 */
        color: white;
    }
    .stButton > button:hover {
        background-color: #1D4ED8; /* bg-blue-700 */
    }
    /* Make sure all text is readable */
    p, h1, h2, h3, h4, h5, h6, span, label {
        color: white;
    }
    .stMarkdown a {
        color: #93C5FD; /* text-blue-300 */
    }
    /* Tabs */
    .stTabs [data-baseweb="tab"] {
        color: white;
    }
    .stTabs [data-baseweb="tab-highlight"] {
        background-color: #2563EB; /* bg-blue-600 */
    }
    /* Expander */
    .streamlit-expanderHeader {
        color: white;
        background-color: #1f2937; /* bg-gray-800 */
    }
    /* Sidebar */
    [data-testid="stSidebar"] {
        background-color: #111827; /* bg-gray-900 */
    }
    [data-testid="stSidebar"] .stMarkdown {
        color: white;
    }
</style>
""", unsafe_allow_html=True)
# Seed the per-session workshop state on the first run only; values that are
# already present (from an earlier rerun of the script) are left untouched.
for _state_key, _initial_value in (
    ('current_module', 1),
    ('processing_history', []),
    ('workshop_started', False),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
def navigate_to_module(module_number):
    """Make ``module_number`` the active workshop module.

    Used as an ``on_click`` callback by the navigation buttons; it only stores
    the number in session state under ``current_module``.
    """
    st.session_state["current_module"] = module_number
# Welcome/Start screen if workshop hasn't been started
if not st.session_state.workshop_started:
# Hero section with eye-catching design
<div class="hero-container">
<h1 class="hero-title">Historical OCR Workshop</h1>
<p class="hero-subtitle">Unlock the potential of historical documents with modern OCR technology</p>
""", unsafe_allow_html=True)
# Introduction with cleaner layout
col1, col2 = st.columns([3, 2])
with col1:
<div class="content-container">
<h3>Workshop Overview</h3>
This interactive workshop explores the application of OCR technology to historical documents,
combining theoretical understanding with practical experiences. Designed for historians,
archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
""", unsafe_allow_html=True)
<div class="key-concept">
<h4>What is OCR?</h4>
Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
historical research and digital humanities.
""", unsafe_allow_html=True)
with col2:
# Add an engaging research question to connect with historians
<div class="research-question">
<h4>For Historians:</h4>
How might OCR technology transform our access to and interpretation of historical documents?
What new research questions become possible when large archives become machine-readable?
""", unsafe_allow_html=True)
# Display sample historical document images in a better format
input_dir = Path(__file__).parent / "input"
sample_images = [
{"path": input_dir / "letter-1.jpg", "caption": "Historical Letter"},
{"path": input_dir / "recipe.jpg", "caption": "Historical Recipe"}
# Try to find any of the sample images
for sample in sample_images:
if sample["path"].exists():
from PIL import Image
with Image.open(sample["path"]) as img:
# Add a better styled border and shadow
<div style="
border: 1px solid rgba(75, 85, 99, 0.6);
padding: 12px;
border-radius: 8px;
margin-bottom: 1rem;
box-shadow: 0 6px 15px rgba(0,0,0,0.3);
background-color: #1f2937;
position: relative;
""", unsafe_allow_html=True)
st.image(img, caption=sample["caption"], width=280)
<p style="
font-size: 0.85rem;
opacity: 0.8;
margin-top: 8px;
font-style: italic;
text-align: center;
">Sample document for OCR analysis</p>
""", unsafe_allow_html=True)
break # Only show one image
except Exception:
# What you'll learn section combined with Workshop Modules in parallel columns
col1, col2 = st.columns(2)
with col1:
st.markdown('<h3 class="workshop-heading" style="margin-bottom:1.5rem; padding-bottom:0.5rem; border-bottom:1px solid rgba(75, 85, 99, 0.5);">What You\'ll Learn</h3>', unsafe_allow_html=True)
<div class="content-container">
<h4><i class="fas fa-book-open"></i> Conceptual Understanding</h4>
- Text-image relationships in historical documents
- Evolution of OCR technology
- AI vision models for document analysis
- Historical typography challenges
""", unsafe_allow_html=True)
<div class="content-container">
<h4><i class="fas fa-microscope"></i> Methodological Approaches</h4>
- Critical frameworks for OCR in historical research
- Hybrid computational-traditional methods
- Error analysis and interpretation
- Contextual reading strategies
""", unsafe_allow_html=True)
<div class="content-container">
<h4><i class="fas fa-tools"></i> Practical Skills</h4>
- Processing historical documents with OCR
- Analyzing and structuring extracted information
- Integrating OCR into research workflows
- Building searchable archives
""", unsafe_allow_html=True)
with col2:
# Workshop modules with visually appealing cards
st.markdown('<h3 class="workshop-heading" style="margin-bottom:1.5rem; padding-bottom:0.5rem; border-bottom:1px solid rgba(75, 85, 99, 0.5);">Workshop Modules</h3>', unsafe_allow_html=True)
# Add some styling for the module cards
.module-card {
background-color: #1f2937; /* bg-gray-800 */
border-radius: 10px;
box-shadow: 0 4px 8px rgba(0,0,0,0.2);
padding: 1.5rem;
margin-bottom: 1.5rem;
transition: all 0.3s ease;
border-left: 4px solid #3B82F6; /* border-blue-500 */
color: white;
position: relative;
overflow: hidden;
.module-card::before {
content: "";
position: absolute;
top: 0;
left: 0;
right: 0;
bottom: 0;
background-image: linear-gradient(rgba(55, 65, 81, 0.2) 1px, transparent 1px),
linear-gradient(90deg, rgba(55, 65, 81, 0.2) 1px, transparent 1px);
background-size: 16px 16px;
opacity: 0.05;
z-index: 0;
.module-card > * {
position: relative;
z-index: 1;
.module-card:hover {
transform: translateY(-5px);
box-shadow: 0 10px 20px rgba(0,0,0,0.25);
border-left-color: #60A5FA;
.module-number {
background-color: #3B82F6; /* bg-blue-500 */
color: white;
font-weight: bold;
padding: 0.4rem 0.8rem;
border-radius: 20px;
font-size: 0.9rem;
display: inline-block;
margin-bottom: 12px;
box-shadow: 0 2px 4px rgba(0,0,0,0.2);
.module-title {
font-weight: 600;
margin-bottom: 1rem;
font-size: 1.25rem;
color: white;
letter-spacing: 0.015em;
.module-card p {
line-height: 1.6;
opacity: 0.9;
font-size: 0.95rem;
""", unsafe_allow_html=True)
# Modules inside the col2 from earlier
with col2:
<div class="module-card">
<div class="module-number">Module 1</div>
<div class="module-title">Introduction and Problematization</div>
<p>Explore the challenges of historical document digitization and the potential of OCR technologies
to transform historical research. Examine key problems and opportunities in historical OCR.</p>
""", unsafe_allow_html=True)
<div class="module-card">
<div class="module-number">Module 2</div>
<div class="module-title">Text-Image Relations in Historical Archives</div>
<p>Analyze the complex relationships between text and images in historical documents,
from typography and layout to marginalia and decorative elements.</p>
""", unsafe_allow_html=True)
<div class="module-card">
<div class="module-number">Module 3</div>
<div class="module-title">OCR Technology and Historical Documents</div>
<p>Understand the evolution of OCR technology from pattern matching to modern AI vision-language models,
and how they address the unique challenges of historical documents.</p>
""", unsafe_allow_html=True)
<div class="module-card">
<div class="module-number">Module 4</div>
<div class="module-title">Methodological Approaches</div>
<p>Develop hybrid methodologies that combine computational processing with traditional
historical research practices, balancing distant and close reading.</p>
""", unsafe_allow_html=True)
<div class="module-card">
<div class="module-number">Module 5</div>
<div class="module-title">Interactive OCR Experiment</div>
<p>Gain hands-on experience processing historical documents with OCR technology,
analyzing results, and comparing different approaches.</p>
""", unsafe_allow_html=True)
<div class="module-card">
<div class="module-number">Module 6</div>
<div class="module-title">Conclusion and Future Directions</div>
<p>Synthesize workshop insights and explore future directions for OCR in historical research,
from large-scale corpus analysis to computational paleography.</p>
""", unsafe_allow_html=True)
# Engaging quote to inspire participation with citation - in a better styled container
<div style="
background: linear-gradient(to right, rgba(30, 58, 138, 0.3), rgba(37, 99, 235, 0.1));
border-left: 4px solid #3B82F6;
padding: 1.5rem 2rem;
border-radius: 8px;
margin: 2.5rem 0;
position: relative;
font-style: italic;
box-shadow: 0 4px 12px rgba(0,0,0,0.2);
<div style="
position: absolute;
left: 20px;
top: -18px;
font-size: 2rem;
color: #60A5FA;
opacity: 0.7;
<p style="
font-size: 1.15rem;
line-height: 1.7;
max-width: 800px;
margin: 0 auto;
text-align: center;
color: #E5E7EB !important;
">The digital turn in historical research is not just about converting analog to digital;
it's about transforming how we access, analyze, and interpret the past.</p>
<div style="
text-align: right;
margin-top: 1rem;
color: #9CA3AF !important;
opacity: 0.9;
font-size: 0.9rem;
">— Dr. Jane Winters, Professor of Digital Humanities</div>
""", unsafe_allow_html=True)
# Feature highlight before call to action with better styling
<div style="
background: linear-gradient(135deg, #1E3A8A 0%, #1E40AF 100%);
border-radius: 12px;
padding: 2rem;
margin: 3rem 0;
border-top: 5px solid #3B82F6;
color: white;
box-shadow: 0 8px 20px rgba(0,0,0,0.3);
position: relative;
overflow: hidden;
<div style="
position: absolute;
top: 0;
right: 0;
bottom: 0;
left: 0;
linear-gradient(rgba(30, 58, 138, 0.2) 1px, transparent 1px),
linear-gradient(90deg, rgba(30, 58, 138, 0.2) 1px, transparent 1px);
background-size: 20px 20px;
opacity: 0.1;
<h3 style="
margin-top: 0;
color: white !important;
font-size: 1.5rem;
margin-bottom: 1.5rem;
position: relative;
z-index: 1;
">Workshop Highlights</h3>
<div style="
display: grid;
grid-template-columns: 1fr 1fr;
gap: 1.5rem;
position: relative;
z-index: 1;
<div style="
background-color: rgba(30, 64, 175, 0.5);
padding: 1rem;
border-radius: 8px;
border-left: 3px solid #60A5FA;
<h4 style="
margin-top: 0;
color: #93C5FD !important;
font-size: 1.1rem;
">Interactive Learning</h4>
<p style="
margin-bottom: 0;
opacity: 0.9;
font-size: 0.95rem;
">Hands-on document processing with real-time feedback and analysis</p>
<div style="
background-color: rgba(30, 64, 175, 0.5);
padding: 1rem;
border-radius: 8px;
border-left: 3px solid #60A5FA;
<h4 style="
margin-top: 0;
color: #93C5FD !important;
font-size: 1.1rem;
">Real Historical Documents</h4>
<p style="
margin-bottom: 0;
opacity: 0.9;
font-size: 0.95rem;
">Work with authentic materials spanning different eras and formats</p>
<div style="
background-color: rgba(30, 64, 175, 0.5);
padding: 1rem;
border-radius: 8px;
border-left: 3px solid #60A5FA;
<h4 style="
margin-top: 0;
color: #93C5FD !important;
font-size: 1.1rem;
">Vision AI Models</h4>
<p style="
margin-bottom: 0;
opacity: 0.9;
font-size: 0.95rem;
">Experience state-of-the-art OCR technology powered by advanced AI</p>
<div style="
background-color: rgba(30, 64, 175, 0.5);
padding: 1rem;
border-radius: 8px;
border-left: 3px solid #60A5FA;
<h4 style="
margin-top: 0;
color: #93C5FD !important;
font-size: 1.1rem;
">Research Applications</h4>
<p style="
margin-bottom: 0;
opacity: 0.9;
font-size: 0.95rem;
">Learn to integrate OCR into historical research workflows</p>
""", unsafe_allow_html=True)
# Enhanced start button with dynamic styling and clear call to action
<div style="
text-align: center;
margin: 3.5rem 0;
padding: 2rem;
background: linear-gradient(180deg, rgba(31, 41, 55, 0.6) 0%, rgba(17, 24, 39, 0.8) 100%);
border-radius: 12px;
border: 1px solid rgba(75, 85, 99, 0.3);
box-shadow: 0 10px 25px rgba(0,0,0,0.2);
<h3 style="
margin-bottom: 1.5rem;
font-size: 1.5rem;
color: white !important;
">Ready to Start Your Journey?</h3>
<button id="start-button" style="
background: linear-gradient(90deg, #2563EB 0%, #1D4ED8 100%);
color: white;
border: none;
padding: 0.8rem 2rem;
font-size: 1.1rem;
font-weight: 500;
border-radius: 8px;
cursor: pointer;
transition: all 0.3s ease;
box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
margin-bottom: 1rem;
width: 280px;
" onclick="document.getElementById('streamlit-button').click()">Begin Workshop Journey</button>
<p style="
text-align: center;
margin-top: 1rem;
font-size: 0.95rem;
color: #9CA3AF !important;
">No installation required • Start immediately • Interactive experience</p>
// Animation for the button
document.getElementById('start-button').addEventListener('mouseover', function() {
this.style.transform = 'translateY(-3px)';
this.style.boxShadow = '0 6px 15px rgba(37, 99, 235, 0.4)';
document.getElementById('start-button').addEventListener('mouseout', function() {
this.style.transform = 'translateY(0)';
this.style.boxShadow = '0 4px 12px rgba(37, 99, 235, 0.3)';
""", unsafe_allow_html=True)
# Hidden button to trigger the workshop start
col1, col2, col3 = st.columns([1, 1, 1])
with col2:
if st.button("Begin Workshop", key="streamlit-button", use_container_width=True, type="primary"):
st.session_state.workshop_started = True
# Display workshop navigation sidebar only if workshop has started
elif st.session_state.workshop_started:
# Define input directory for images
input_dir = Path(__file__).parent / "input"
# Enhanced sidebar navigation
with st.sidebar:
st.markdown("<h1 style='margin-bottom:15px;'>Workshop Navigation</h1>", unsafe_allow_html=True)
# Improved visual header with logo/image
st.markdown("<div style='display:flex; align-items:center; margin-bottom:20px;'>", unsafe_allow_html=True)
# Add a visual element with better sizing/styling
workflow_path = input_dir / "workflow.jpg"
if workflow_path.exists():
from PIL import Image
with Image.open(workflow_path) as img:
st.image(img, width=160, output_format="PNG")
except Exception:
st.markdown("</div>", unsafe_allow_html=True)
# Show enhanced progress indicator
current_module = st.session_state.current_module
st.markdown(f"<div style='margin-bottom:15px;'><b>Your Progress:</b> Module {current_module} of 6</div>", unsafe_allow_html=True)
st.progress(current_module / 6)
# More visually appealing module navigation
modules = [
"Text-Image Relations",
"OCR Technology",
"Methodological Approaches",
"Interactive OCR Experiment",
# Custom styling for navigation buttons
.nav-button {
padding: 8px 12px;
margin-bottom: 8px;
border-radius: 6px;
background-color: #f5f5f5;
display: block;
text-decoration: none;
color: #333;
font-weight: 500;
border-left: 3px solid transparent;
transition: all 0.2s;
.nav-button:hover {
background-color: #e6e6e6;
.nav-button.active {
background-color: #e8f0fe;
border-left: 3px solid #0d3c84;
font-weight: 600;
.nav-section {
margin: 20px 0 10px 0;
font-weight: 600;
color: #555;
""", unsafe_allow_html=True)
# Group into clearer sections
st.markdown("<div class='nav-section'>Theory & Concepts</div>", unsafe_allow_html=True)
for i in range(1, 4): # Modules 1-3
active_class = "active" if i == current_module else ""
<div class="nav-button {active_class}" onclick="document.getElementById('nav_{i}').click()">
<span style="display:inline-block; width:22px; text-align:center; margin-right:8px;
background-color:{'#0d3c84' if i == current_module else '#ddd'}; color:{'white' if i == current_module else '#555'};
border-radius:11px; font-size:0.8rem; font-weight:bold;">{i}</span>
""", unsafe_allow_html=True)
# Hidden button to handle the click
if st.button(f"{i}", key=f"nav_{i}"):
st.markdown("<div class='nav-section'>Application & Practice</div>", unsafe_allow_html=True)
for i in range(4, 7): # Modules 4-6
active_class = "active" if i == current_module else ""
<div class="nav-button {active_class}" onclick="document.getElementById('nav_{i}').click()">
<span style="display:inline-block; width:22px; text-align:center; margin-right:8px;
background-color:{'#0d3c84' if i == current_module else '#ddd'}; color:{'white' if i == current_module else '#555'};
border-radius:11px; font-size:0.8rem; font-weight:bold;">{i}</span>
""", unsafe_allow_html=True)
# Hidden button to handle the click
if st.button(f"{i}", key=f"nav_{i}"):
# Enhanced quick jump button
.jump-button {
background-color: #f0f7ff;
padding: 10px;
border-radius: 6px;
border-left: 3px solid #0d3c84;
margin-bottom: 15px;
cursor: pointer;
.jump-button:hover {
background-color: #e3f0ff;
""", unsafe_allow_html=True)
<div class="jump-button" onclick="document.getElementById('jump_exp').click()">
<span style="font-weight:500;">📊 Jump to OCR Experiment</span>
""", unsafe_allow_html=True)
# Hidden button for jump
if st.button("Jump to Experiment", key="jump_exp"):
# Workshop information in a cleaner collapsible section
with st.expander("About the Workshop"):
This interactive workshop explores OCR technology for historical documents.
**How to use this workshop:**
1. Navigate through modules sequentially
2. Expand content sections to read more
3. Try the interactive OCR experiment
4. Reflect on research questions
For help or more information, use the reference materials in Module 6.
# Enhanced progress tracking
if st.session_state.processing_history:
with st.expander("Your Activity"):
st.markdown(f"<b>Documents processed:</b> {len(st.session_state.processing_history)}", unsafe_allow_html=True)
# Show the most recent document processed with better formatting
latest = st.session_state.processing_history[-1]
<div style="background:#f9f9f9; padding:8px; border-radius:4px; margin-top:10px;">
<b>Latest document:</b> {latest['fileName']}<br>
<span style="font-size:0.9rem; color:#666;">Processed with {' vision model' if latest['useVision'] else ' basic OCR'}</span>
""", unsafe_allow_html=True)
# Main content based on current module
if st.session_state.current_module == 1:
# MODULE 1: Introduction
st.title("Module 1: Introduction and Problematization")
col1, col2 = st.columns([2, 1])
with col1:
## Historical OCR Workshop
### The Problem
Historical documents present unique challenges for OCR technology:
- Varying typography and handwriting styles
- Document degradation and damage
- Complex layouts and formatting
- Multiple languages and archaic terminology
- Illustrations and decorative elements
with col2:
### Workshop Goals
By the end of this workshop, you will:
1. Understand text-image relationships in historical archives
2. Learn about advanced OCR technology
3. Explore methodological approaches
4. Gain hands-on experience with OCR tools
5. Develop research integration strategies
# Next button
st.button("Next: Text-Image Relations", key="next_to_2", on_click=navigate_to_module, args=(2,))
elif st.session_state.current_module == 2:
# MODULE 2: Text-Image Relations
st.title("Module 2: Text-Image Relations in Historical Archives")
col1, col2 = st.columns([1, 1])
with col1:
### Textual Elements
- **Typography**: Varying fonts, sizes, and styles
- **Layout**: Columns, margins, and spacing
- **Marginalia**: Notes, comments, and additions
- **Decorative Text**: Illuminated letters and calligraphy
### Visual Elements
- **Illustrations**: Diagrams, maps, and artistic representations
- **Watermarks**: Hidden identifiers that locate documents
- **Damage**: Tears, stains, and fading affecting legibility
- **Material Features**: Paper quality and physical dimensions
with col2:
### Interdependence
The relationship between text and image in historical documents exists on a complex spectrum:
- Text functions as image (decorative headings)
- Images function as text (symbolic representations)
- Layout creates meaning through visual organization
- Material conditions affect both textual and visual elements
caption="Book of Kells - Example of text-image integration")
### OCR Challenges
These complex text-image relationships create particular challenges for OCR:
1. **Distinguishing Text from Decoration**: Where does ornamental text end and functional text begin?
2. **Handling Illustrations**: Should they be processed as images or described as text?
3. **Interpreting Layout**: How to capture the significance of spacing and organization?
4. **Preserving Context**: Maintaining the relationship between textual and visual elements
# Navigation buttons
col1, col2 = st.columns(2)
with col1:
st.button("Previous: Introduction", key="prev_to_1", on_click=navigate_to_module, args=(1,))
with col2:
st.button("Next: OCR Technology", key="next_to_3", on_click=navigate_to_module, args=(3,))
elif st.session_state.current_module == 3:
# MODULE 3: OCR Technology
st.title("Module 3: OCR Technology and Historical Documents")
col1, col2 = st.columns([1, 1])
with col1:
### Traditional OCR Approaches
1. **Pattern Matching**: Early OCR compared characters to templates
2. **Feature Extraction**: Identifying key features of characters
3. **Statistical Models**: Using probabilities to improve recognition
### Modern AI-Enhanced OCR
1. **Neural Networks**: Deep learning models trained on vast datasets
2. **Computer Vision**: Advanced image processing techniques
3. **Language Models**: Contextual understanding to resolve ambiguities
4. **Multimodal Models**: Integration of text, layout, and visual understanding
with col2:
### Challenges with Historical Documents
Historical materials present unique difficulties:
- **Typography Variation**: Non-standardized fonts and styles
- **Historical Language**: Archaic vocabulary and grammar
- **Layout Complexity**: Non-linear arrangements
- **Document Degradation**: Fading, tears, stains, and damage
- **Material Artifacts**: Paper texture, binding shadows, etc.
caption="OCR processing layers")
# Display processing history if available
if st.session_state.processing_history:
with st.expander("Your OCR Processing History"):
st.markdown("You've already processed the following documents:")
for item in st.session_state.processing_history:
st.markdown(f"**{item['fileName']}** - {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')}")
col1, col2 = st.columns(2)
with col1:
st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
with col2:
st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")
# Quick link to experiment
st.button("Jump to OCR Experiment", key="jump_to_5", on_click=navigate_to_module, args=(5,))
# Navigation buttons
col1, col2 = st.columns(2)
with col1:
st.button("Previous: Text-Image Relations", key="prev_to_2", on_click=navigate_to_module, args=(2,))
with col2:
st.button("Next: Methodological Approaches", key="next_to_4", on_click=navigate_to_module, args=(4,))
elif st.session_state.current_module == 4:
# MODULE 4: Methodological Approaches
st.title("Module 4: Methodological Approaches")
col1, col2 = st.columns([1, 1])
with col1:
### Hybrid Methodologies
1. **Computational + Human Reading**
- OCR for initial processing and discovery
- Human review for context and interpretation
- Iterative refinement of computational outputs
2. **Close + Distant Reading**
- Distant reading through large-scale OCR processing
- Close reading of selected passages
- Zooming between scales of analysis
# Reference to diagram.jpg
input_dir = Path(__file__).parent / "input"
diagram_path = input_dir / "diagram.jpg"
if diagram_path.exists():
# Load image file directly from disk
from PIL import Image
with Image.open(diagram_path) as img:
st.image(img, caption="Historical VLM architecture", use_column_width=True)
except Exception:
with col2:
### Mistral-OCR-Latest: State-of-the-Art
The Mistral-OCR model represents a significant advancement:
- **Multimodal Understanding**: Processes both visual and textual information
- **Contextual Awareness**: Considers historical context
- **Layout Recognition**: Preserves complex document structures
- **Historical Font Adaptation**: Trained on diverse historical typography
# Reference to workflow.jpg
workflow_path = input_dir / "workflow.jpg"
if workflow_path.exists():
# Load image file directly from disk
from PIL import Image
with Image.open(workflow_path) as img:
st.image(img, caption="Mistral OCR workflow", use_column_width=True)
except Exception:
### Practical Workflow
A typical historical OCR workflow with Mistral-OCR includes:
1. **Selection**: Choosing appropriate documents
2. **Preprocessing**: Enhancing images before OCR
3. **OCR Processing**: Running documents through vision-enhanced OCR
4. **Post-processing**: Cleaning up outputs and structured extraction
5. **Verification**: Cross-checking results against originals
6. **Integration**: Incorporating OCR outputs into research materials
# Navigation buttons
col1, col2 = st.columns(2)
with col1:
st.button("Previous: OCR Technology", key="prev_to_3", on_click=navigate_to_module, args=(3,))
with col2:
st.button("Next: Interactive OCR Experiment", key="next_to_5", on_click=navigate_to_module, args=(5,))
elif st.session_state.current_module == 5:
# MODULE 5: Interactive OCR Experiment
st.title("Module 5: Interactive OCR Experiment")
# More modular design with sequenced steps
<div class="workshop-container">
This interactive module allows you to process historical documents with OCR and analyze the results.
Follow the sequenced steps below to experiment with historical document analysis.
""", unsafe_allow_html=True)
# Tabbed interface for different activities
experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])
with experiment_tab:
    # Import additional libraries for enhanced functionality.
    # pdf2image is optional: `pdf_support` records whether it imported, and
    # the preview code checks that flag before calling convert_from_bytes.
    try:
        from pdf2image import convert_from_bytes
        pdf_support = True
    except ImportError:
        pdf_support = False
        st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")

    # OCR tool in a compact layout
    col1, col2 = st.columns([1, 1])
with col1:
    st.markdown('<div class="tool-container" style="color:white !important; background-color:#1f2937; padding:1.5rem; border-radius:0.5rem; border:1px solid #374151;">', unsafe_allow_html=True)
    st.markdown("<h3 style='color:white !important;'>Step 1: Select Document & Options</h3>", unsafe_allow_html=True)

    # Processing options
    use_vision = st.checkbox("Use Vision Model", value=True,
                             help="Use the vision model for improved analysis")

    # Additional prompt for the model
    st.markdown("### Custom Research Prompt (Optional)")
    st.markdown(
        "Provide additional instructions to guide the OCR analysis.\n"
        "Focus on specific aspects of historical research you're interested in."
    )
    custom_prompt = st.text_area("Research Prompt",
                                 placeholder="E.g., Focus on identifying dates and historical figures; Analyze the writing style for period-specific terminology; Highlight any cultural or social indicators of the time period...",
                                 help="Optional instructions to guide the analysis of the historical document")

    # Dropdown entry meaning "no bundled sample, use the uploader instead".
    UPLOAD_OWN_OPTION = "Upload my own document"

    # Bound on every path so the processing step can always read both names,
    # whether or not a sample list or an uploader was rendered.
    sample_choice = UPLOAD_OWN_OPTION
    uploaded_file = None

    def _preview_pdf_first_page(pdf_bytes, caption, fallback_message):
        """Show page 1 of a PDF as an image, or *fallback_message* if that fails.

        Only call this when ``pdf_support`` is true: it relies on
        ``convert_from_bytes`` having been imported.
        """
        try:
            with st.spinner("Generating PDF preview..."):
                images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
        except Exception:
            # The preview is best-effort: any conversion failure degrades to
            # the plain info line instead of interrupting the page.
            images = []
        if images:
            # Convert PIL image to bytes for Streamlit
            img_bytes = io.BytesIO()
            images[0].save(img_bytes, format='JPEG')
            st.image(img_bytes.getvalue(), caption=caption, use_container_width=True)
        else:
            st.info(fallback_message)

    # Example preset documents selection
    input_dir = Path(__file__).parent / "input"
    sample_files = []
    if input_dir.exists():
        sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))

    if sample_files:
        st.markdown("#### Sample Documents")
        sample_options = [UPLOAD_OWN_OPTION] + [f.name for f in sample_files]
        sample_choice = st.selectbox("Choose a document:", sample_options)

    selected_file = None
    if sample_choice != UPLOAD_OWN_OPTION:
        selected_file = next((f for f in sample_files if f.name == sample_choice), None)

    if selected_file is not None:
        # Store the selected sample file in session state
        with open(selected_file, "rb") as f:
            file_bytes = f.read()
        st.session_state.sample_file = {
            "name": selected_file.name,
            "bytes": file_bytes,
        }

        # Preview the selected sample
        if selected_file.suffix.lower() == ".pdf" and pdf_support:
            _preview_pdf_first_page(
                file_bytes,
                caption=f"Preview: {selected_file.name}",
                fallback_message=f"PDF selected: {selected_file.name}",
            )
        else:
            # For images display directly. A PDF without pdf2image also lands
            # here, fails to decode, and falls back to the info line.
            try:
                from PIL import Image
                img = Image.open(BytesIO(file_bytes))
                st.image(img, caption=f"Preview: {selected_file.name}", use_container_width=True)
            except Exception:
                st.info(f"Selected: {selected_file.name}")
    elif sample_choice == UPLOAD_OWN_OPTION:
        # No sample in play (none available, or "Upload my own" chosen):
        # drop any cached sample and show the single shared uploader.
        if 'sample_file' in st.session_state:
            del st.session_state.sample_file

        # File uploader with styling matched to React theme
        st.markdown('<div class="upload-container">', unsafe_allow_html=True)
        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
        if uploaded_file is None:
            st.markdown("### Upload a document to get started")
            st.markdown("Supported formats: PDF, JPG, PNG")
        else:
            # Display the uploaded file
            file_ext = Path(uploaded_file.name).suffix.lower()
            uploaded_notice = f"PDF uploaded: {uploaded_file.name}"
            if file_ext != ".pdf":
                st.image(uploaded_file, use_container_width=True)
            elif pdf_support:
                # Convert first page of PDF to image for preview
                _preview_pdf_first_page(
                    uploaded_file.getvalue(),
                    caption=f"PDF Preview: {uploaded_file.name}",
                    fallback_message=uploaded_notice,
                )
            else:
                # Simply show the file name without an error message
                st.info(uploaded_notice)
        st.markdown('</div>', unsafe_allow_html=True)
# Step 2: Process document
st.subheader("Step 2: Process the Document")
# Get the file to process (either uploaded or sample).
# `file_to_process` stays None when nothing is selected; the process button
# further down is disabled on exactly that condition.
file_to_process = None
# A sample counts as selected only when its bytes are cached in session state
# AND the dropdown is not on the "Upload my own document" entry.
# NOTE(review): `sample_choice` is assigned only where the sample dropdown is
# rendered. The session-state test runs first, so the name is not read when
# no sample is cached -- confirm a stale cache entry cannot outlive the dropdown.
if 'sample_file' in st.session_state and sample_choice != "Upload my own document":
# Create a FileUploader-like object from the sample file
class SampleFileObject(io.BytesIO):
    """In-memory stand-in for an uploaded file, built from a bundled sample.

    Streamlit's uploaded-file objects are ``io.BytesIO`` subclasses that also
    carry a ``name`` and a ``size``. Subclassing ``io.BytesIO`` gives this
    stand-in the same file-like surface (``getvalue``, ``read``, ``seek``),
    so downstream code can treat both kinds of object alike.

    Args:
        name: File name to report, e.g. ``"letter.jpg"``.
        data: Raw file content as bytes.
    """

    def __init__(self, name, data):
        super().__init__(data)
        self.name = name
        # Byte length of the content, mirroring an uploaded file's `size`.
        self.size = len(data)
# Wrap the cached sample so it can be passed on in place of an uploaded file.
# NOTE(review): the argument list and closing parenthesis of this call are not
# present in this view. The `locals()` membership test on the following branch
# presumably guards against `uploaded_file` never having been assigned -- confirm.
file_to_process = SampleFileObject(
elif 'uploaded_file' in locals() and uploaded_file is not None:
file_to_process = uploaded_file
# Process button (disabled if no file selected)
process_button = st.button(
    "Process Document",
    disabled=file_to_process is None,
)

if process_button and file_to_process is not None:
    with st.spinner("Processing document..."):
        # Only the processing call is guarded, so the success path (including
        # the rerun that refreshes the page) is never affected by the handler.
        try:
            # Process the file
            result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
        except Exception as e:
            st.error(f"Error processing document: {str(e)}")
        else:
            if result:
                st.success("Document processed successfully!")
                # Store result in session state for display in the right column
                st.session_state.current_result = result
                st.rerun()  # Refresh to show result
            else:
                st.error("Failed to process document.")

st.markdown('</div>', unsafe_allow_html=True)
# Experiment instructions in a compact format.
# The whole box is one markdown call: a <div> opened in one st.markdown call
# and closed in another cannot enclose the elements rendered in between, so
# the heading and the list are written as HTML inside the wrapper itself.
st.markdown(
    """
    <div class="key-concept" style="background-color:#374151; padding:0.75rem; border-radius:0.5rem; margin:1rem 0; border-left:3px solid #3B82F6; color:white;">
    <h3 style='color:white !important;'>Experiment Instructions</h3>
    <ol>
    <li><strong>Step 1:</strong> Select a document and choose your options</li>
    <li><strong>Step 2:</strong> Process the document with the selected options</li>
    <li><strong>Step 3:</strong> Analyze the results in the panel on the right</li>
    <li><strong>Step 4:</strong> Try again with different settings (e.g., toggle vision model)</li>
    <li><strong>Step 5:</strong> Compare results between different runs</li>
    </ol>
    </div>
    """,
    unsafe_allow_html=True,
)
with col2:
    # Local import: OCR output is document-derived text and must be escaped
    # before it is placed inside markup rendered with unsafe_allow_html.
    import html

    def _esc(value):
        """Return *value* as text that is safe to embed in HTML."""
        return html.escape(str(value))

    def _white_paragraph(inner_html):
        """Render already-escaped *inner_html* as a white paragraph."""
        st.markdown(f"<p style='color:white !important;'>{inner_html}</p>", unsafe_allow_html=True)

    def _white_key_values(mapping):
        """Render each key/value pair of *mapping* on its own line."""
        for k, v in mapping.items():
            _white_paragraph(f"<strong>{_esc(k)}:</strong> {_esc(v)}")

    st.markdown('<div class="tool-container" style="color:white !important; background-color:#1f2937; padding:1.5rem; border-radius:0.5rem; border:1px solid #374151;">', unsafe_allow_html=True)
    st.markdown("<h3 style='color:white !important;'>Step 3: View Results</h3>", unsafe_allow_html=True)

    result = st.session_state.get('current_result')
    if result:
        # File info in a compact format
        st.markdown(f"**File:** {result.get('file_name', 'Unknown')}")

        # Horizontal display of metadata
        meta_left, meta_right = st.columns(2)
        with meta_left:
            # Empty entries are dropped before joining.
            languages = [str(lang) for lang in (result.get('languages') or []) if lang]
            if languages:
                st.markdown(f"**Languages:** {', '.join(languages)}")
        with meta_right:
            topics = [str(topic) for topic in (result.get('topics') or [])]
            if topics:
                st.markdown(f"**Topics:** {', '.join(topics)}")

        # Create tabs for different views with inline styling to ensure visibility
        tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
        st.markdown("""
        <style>
        .stTabs [data-baseweb="tab-panel"] * {color: white !important;}
        </style>
        """, unsafe_allow_html=True)

        # Convert to serializable format once; used by the JSON tab and the export.
        serializable_result = make_serializable(result)
        ocr_contents = result.get('ocr_contents')

        with tab1:
            # Display in a more user-friendly format based on the content structure
            if isinstance(ocr_contents, dict):
                for section, content in ocr_contents.items():
                    if not content:  # Only display non-empty sections
                        continue
                    heading = _esc(str(section).replace('_', ' ').title())
                    st.markdown(f"<h4 style='color:white !important;'>{heading}</h4>", unsafe_allow_html=True)

                    if isinstance(content, str):
                        _white_paragraph(_esc(content))
                    elif isinstance(content, list):
                        for item in content:
                            if isinstance(item, str):
                                _white_paragraph(f"- {_esc(item)}")
                            elif isinstance(item, dict):
                                _white_key_values(item)
                    elif isinstance(content, dict):
                        _white_key_values(content)

        with tab2:
            # Show the raw JSON for developers
            st.json(serializable_result)

        # Download options
        st.markdown("### Export Results")
        export_stem = Path(str(result.get('file_name', 'document'))).stem or 'document'
        export_left, export_right = st.columns(2)
        with export_left:
            # Export as JSON (using the serializable converter).
            # ensure_ascii=False keeps accented and non-Latin text readable in the file.
            json_bytes = json.dumps(serializable_result, indent=2, ensure_ascii=False).encode("utf-8")
            st.download_button(
                label="Download JSON",
                data=json_bytes,
                file_name=f"{export_stem}_ocr.json",
                mime="application/json",
            )
        with export_right:
            # Export as text
            if isinstance(ocr_contents, dict) and 'content' in ocr_contents:
                text_content = ocr_contents['content']
                if not isinstance(text_content, (str, bytes)):
                    # The download needs text or bytes; structured content is dumped as JSON text.
                    text_content = json.dumps(make_serializable(text_content), indent=2, ensure_ascii=False)
                st.download_button(
                    label="Download Text",
                    data=text_content,
                    file_name=f"{export_stem}_ocr.txt",
                    mime="text/plain",
                )
    else:
        st.markdown("""
        <div style="background-color:#1f2937; padding:1rem; border-radius:0.5rem;">
        <h3 style="color:white !important;">Results will appear here</h3>
        <p style="color:white !important;">Upload and process a document to see the OCR results in this panel.</p>
        <p style="color:white !important;">The OCR tool will:</p>
        <ol style="color:white !important;">
        <li style="color:white !important;">Extract text from your document</li>
        <li style="color:white !important;">Identify languages and topics</li>
        <li style="color:white !important;">Provide structured content analysis</li>
        <li style="color:white !important;">Generate downloadable results</li>
        </ol>
        </div>
        """, unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)
# Processing history
if st.session_state.processing_history:
    # Local import: file names come from user uploads and are escaped before
    # being embedded in the HTML history cards.
    import html

    st.markdown('<div class="tool-container" style="color:white !important; background-color:#1f2937; padding:1.5rem; border-radius:0.5rem; border:1px solid #374151;">', unsafe_allow_html=True)
    st.markdown("<h3 style='color:white !important;'>Step 4: Review Processing History</h3>", unsafe_allow_html=True)

    # Most recent result summary
    latest = st.session_state.processing_history[-1]
    st.markdown(f"**Latest Document:** {latest['fileName']}")
    st.markdown(f"**Processed at:** {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}")
    st.markdown(f"**Vision model used:** {'Yes' if latest['useVision'] else 'No'}")

    # Full history in expander (newest first)
    with st.expander("View Complete Processing History"):
        for i, item in enumerate(reversed(st.session_state.processing_history)):
            safe_name = html.escape(str(item['fileName']))
            processed_at = datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')
            vision_label = 'Yes' if item['useVision'] else 'No'
            # Built from single-line pieces so the card is one complete,
            # self-closed HTML block.
            st.markdown(
                '<div class="history-card" style="padding:0.75rem; background-color:#374151; color:white; border-radius:0.25rem; border:1px solid #4B5563; margin-bottom:0.5rem;">'
                f'<strong style="color:white !important;">{safe_name}</strong><br>'
                f'<span style="color:white !important;">{processed_at} - '
                f'Vision model: {vision_label}</span>'
                '</div>',
                unsafe_allow_html=True,
            )

            # Add option to view a previous result
            if st.button("View This Result", key=f"view_history_{i}"):
                st.session_state.current_result = item['result']
                # The results panel is drawn before this point in the script,
                # so rerun to make it show the result that was just selected.
                st.rerun()

    st.markdown('</div>', unsafe_allow_html=True)
# Compare tab for side-by-side comparison
with compare_tab:
    # Local import: OCR text is escaped before being embedded in HTML.
    import html

    st.subheader("Compare OCR Results")

    history = st.session_state.processing_history

    def _history_label(index):
        """Return the dropdown label for the history entry at *index*."""
        entry = history[index]
        return f"{index+1}: {entry['fileName']} ({'Vision' if entry['useVision'] else 'No Vision'})"

    def _render_comparison(position, doc):
        """Render the summary and a 500-character excerpt of one history entry."""
        st.markdown(f"### Document {position}: {doc['fileName']}")
        st.markdown(f"**Processed at:** {datetime.fromisoformat(doc['timestamp']).strftime('%Y-%m-%d %H:%M')}")
        st.markdown(f"**Vision model used:** {'Yes' if doc['useVision'] else 'No'}")

        # Display content summary
        ocr_contents = doc['result'].get('ocr_contents')
        if isinstance(ocr_contents, dict) and 'content' in ocr_contents:
            content = ocr_contents['content']
            if not isinstance(content, str):
                content = str(content)
            # Display first 500 characters with word wrap. Line breaks become
            # <br> so the excerpt stays inside one HTML block.
            excerpt = html.escape(content[:500]).replace('\n', '<br>')
            if len(content) > 500:
                excerpt += '...'
            st.markdown(
                '<div style="max-height: 300px; overflow-y: auto; word-wrap: break-word; '
                'border: 1px solid #ddd; padding: 1rem; background-color: #f9f9f9;">'
                f'{excerpt}'
                '</div>',
                unsafe_allow_html=True,
            )

    if len(history) >= 2:
        st.markdown("""
        Select two processing results to compare side by side. This allows you to see
        how different options (like using the vision model) affect OCR quality.
        """)

        # Create selection dropdowns for the documents. The options are the
        # history indices themselves and the label is only a display format,
        # so the chosen index never has to be parsed back out of label text.
        history_indices = list(range(len(history)))
        col1, col2 = st.columns(2)
        with col1:
            # First document selector
            doc_index_1 = st.selectbox("First Document:", history_indices,
                                       format_func=_history_label, key="compare_doc_1")
        with col2:
            # Second document selector
            default_index = min(1, len(history) - 1)  # Default to second item
            doc_index_2 = st.selectbox("Second Document:", history_indices, index=default_index,
                                       format_func=_history_label, key="compare_doc_2")

        # Retrieve the selected documents
        doc1 = history[doc_index_1]
        doc2 = history[doc_index_2]

        # Show comparison
        col1, col2 = st.columns(2)
        with col1:
            _render_comparison(1, doc1)
        with col2:
            _render_comparison(2, doc2)

        # Comparison analysis
        if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
            st.markdown("""
            <div class="key-concept">
            <h3>Vision vs. Non-Vision Model Comparison</h3>
            <p>You're comparing the same document processed with different models.
            This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.</p>
            <p>Look for these differences:</p>
            <ul>
            <li>Completeness of extracted text</li>
            <li>Accuracy of layout understanding</li>
            <li>Recognition of complex elements (tables, figures)</li>
            <li>Topic and language detection accuracy</li>
            </ul>
            </div>
            """, unsafe_allow_html=True)
    else:
        st.markdown("""
        <div class="research-question">
        <h3>Need More Documents to Compare</h3>
        <p>Process at least two documents to enable side-by-side comparison. Try processing
        the same document with and without the vision model to see the differences in OCR quality.</p>
        </div>
        """, unsafe_allow_html=True)
# Analysis tab for guidance on working with OCR results
with analyze_tab:
    st.subheader("Analysis Guide")

    st.markdown("""
    <div class="workshop-container">
    <h3>How to Analyze OCR Results</h3>
    <p>This guide helps you assess the quality and usefulness of OCR output for historical research.</p>
    </div>
    """, unsafe_allow_html=True)

    st.markdown("""
    ### Evaluating OCR Quality
    When analyzing OCR results from historical documents, consider these key factors:
    1. **Text Accuracy**
    - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1")
    - Assess recognition of period-specific typography and writing styles
    - Evaluate handling of degraded or damaged text areas
    2. **Structure Preservation**
    - Does the OCR maintain paragraph and section breaks?
    - Are columns and tabular data correctly preserved?
    - How well are page transitions handled?
    3. **Special Elements**
    - Recognition of footnotes, marginalia, and annotations
    - Handling of illustrations, diagrams, and decorative elements
    - Treatment of watermarks, signatures, and stamps
    4. **Metadata Extraction**
    - Accuracy of detected languages, topics, and document type
    - Identification of dates, names, and key entities
    - Recognition of document purpose and context
    """)

    col1, col2 = st.columns(2)
    with col1:
        st.markdown("""
        ### Common OCR Challenges in Historical Documents
        - **Typography Variations**: Historical fonts and writing styles that differ from modern text
        - **Material Degradation**: Fading, stains, tears, and other damage affecting legibility
        - **Handwritten Elements**: Marginalia, signatures, and handwritten annotations
        - **Complex Layouts**: Multi-column formats, non-linear reading order, and decorative elements
        - **Language and Terminology**: Archaic terms, specialized vocabulary, and multilingual content
        """)
    with col2:
        st.markdown("""
        ### Making the Most of OCR Results
        - **Contextual Reading**: Use historical context to interpret unclear passages
        - **Error Patterns**: Identify and mentally correct for systematic OCR errors
        - **Hybrid Analysis**: Combine OCR-based search with close reading of original images
        - **Comparative Processing**: Try different OCR settings and models on the same document
        - **Iterative Refinement**: Use insights from each document to improve future processing
        """)

    # The blank line before the closing paragraph keeps it from being folded
    # into the last list item when the markdown is rendered.
    st.markdown("""
    ### Research Integration
    Once you've obtained and analyzed OCR results from historical documents, consider these approaches for integrating them into your research:
    1. **Digital Corpus Building**: Create searchable collections of processed texts
    2. **Computational Analysis**: Apply text mining, topic modeling, or network analysis
    3. **Cross-Document Linking**: Identify connections across multiple sources
    4. **Annotation and Enrichment**: Add context, translations, or explanatory notes
    5. **Collaborative Research**: Share processed texts with other researchers

    Remember that OCR is a tool to assist your research, not replace careful reading and analysis. The most effective approaches combine computational methods with traditional historical research practices.
    """)

    # Example of what to look for
    if st.session_state.processing_history:
        with st.expander("Example Analysis from Your Documents"):
            # Pick the latest document
            latest = st.session_state.processing_history[-1]
            latest_result = latest['result']

            # Empty entries are dropped before joining, and an empty list
            # falls back to 'Unknown'.
            languages = [str(lang) for lang in (latest_result.get('languages') or []) if lang] or ['Unknown']
            topics = [str(topic) for topic in (latest_result.get('topics') or []) if topic] or ['Unknown']

            # Assembled line by line (with blank lines between sections) so
            # interpolated values cannot disturb the surrounding markdown.
            st.markdown("\n".join([
                f"#### Sample Analysis for: {latest['fileName']}",
                "",
                "**Document Context:**",
                f"- Languages: {', '.join(languages)}",
                f"- Topics: {', '.join(topics)}",
                f"- Vision model used: {'Yes' if latest['useVision'] else 'No'}",
                "",
                "**What to Look For:**",
                "1. Check how well the model identified key topics and languages",
                "2. Evaluate the completeness of extracted text",
                "3. Note any systematic errors in text recognition",
                "4. Assess how well document structure was preserved",
            ]))

# Navigation buttons
col1, col2 = st.columns(2)
with col1:
    st.button("Previous: Methodological Approaches", key="prev_to_4", on_click=navigate_to_module, args=(4,))
with col2:
    st.button("Next: Conclusion", key="next_to_6", on_click=navigate_to_module, args=(6,))
else: # Module 6
# MODULE 6: Conclusion
st.title("Module 6: Conclusion and Future Directions")
col1, col2 = st.columns([3, 2])
with col1:
### Workshop Summary
Throughout this workshop, we've explored:
1. **Text-Image Interdependence**: The complex relationship between textual and visual elements
2. **OCR Technology**: The evolution of OCR and its application to historical materials
3. **Methodological Approaches**: Hybrid strategies for working with historical texts
4. **Practical Application**: Hands-on experience with OCR processing tools
### Key Takeaways
1. **OCR is Not Perfect**: Even advanced AI models face challenges with historical documents
2. **Context Matters**: Vision-enhanced models provide better results by understanding document context
3. **Hybrid Approaches**: Combining computational methods with traditional research yields best results
4. **Critical Evaluation**: Always evaluate OCR outputs with awareness of limitations
5. **Structured Extraction**: Modern OCR goes beyond text recognition to understand document structure
with col2:
# Display statistics if there's processing history
if st.session_state.processing_history:
st.subheader("Your Workshop Statistics")
# Calculate statistics
total_docs = len(st.session_state.processing_history)
vision_docs = len([item for item in st.session_state.processing_history if item['useVision']])
non_vision_docs = total_docs - vision_docs
# Create metrics for statistics
col1, col2 = st.columns(2)
with col1:
st.metric("Documents Processed", total_docs)
st.metric("With Vision Model", vision_docs)
with col2:
st.metric("Without Vision Model", non_vision_docs)
# Topics word cloud
if total_docs > 0:
st.subheader("Topics Encountered")
all_topics = []
for item in st.session_state.processing_history:
if 'topics' in item['result']:
if all_topics:
# Count topic frequencies
topic_counts = {}
for topic in all_topics:
if topic in topic_counts:
topic_counts[topic] += 1
topic_counts[topic] = 1
# Display as a horizontal bar chart
st.subheader("Future Directions")
col1, col2 = st.columns(2)
with col1:
### Technological Developments
- **Multimodal AI models**: Increasingly sophisticated understanding
- **Historical font training**: Models trained on historical typography
- **Document intelligence**: Enhanced understanding of structures
- **Collaborative correction**: Platforms for collective improvement
with col2:
### Research Applications
- **Large-scale corpus analysis**: Processing entire archives
- **Multilingual historical research**: Working across languages
- **Image-text integration**: New methodologies for visual analysis
- **Computational paleography**: AI-assisted handwriting analysis
### Additional Resources
- **[Mistral AI Documentation](https://docs.mistral.ai/)**: Learn more about the OCR models used in this workshop
- **[Transkribus](https://readcoop.eu/transkribus/)**: Platform for historical document transcription
- **[OCR-D](https://ocr-d.de/en/)**: Coordinated OCR research project for historical documents
- **[Historical OCR Research Papers](https://scholar.google.com/scholar?q=historical+OCR)**: Academic research on historical OCR
# Reset button to start over
if st.button("Start Workshop Again", key="reset_workshop", use_container_width=True):
st.session_state.current_module = 1
# Handle API requests if the URL contains /api/process
# NOTE(review): this reads the query parameter whose key is the empty string;
# confirm that is really where the request path ends up.
if 'api/process' in st.query_params.get('', ''):
    # Process the API request
    result = process_api_request()
    if result:
        # Return the result as JSON
        # Make sure result is serializable
        serializable_result = make_serializable(result)
        st.json(serializable_result)
    else:
        # Only a failed or empty request reports an error.
        st.json({"error": "Invalid request"})