import os
import streamlit as st
import json
import sys
from pathlib import Path
import tempfile
from datetime import datetime
import io
import base64
from io import BytesIO
from enum import Enum
import inspect

# Add parent directory to path so we can import the OCR modules
parent_dir = Path(__file__).parent.absolute()
sys.path.append(str(parent_dir))

# Import the StructuredOCR class and process_file utility
from structured_ocr import StructuredOCR

# Add API endpoint support for the React app
from streamlit.web.server.server import Server
from streamlit.runtime.scriptrunner import get_script_run_ctx
# Custom JSON encoder to handle Enum types and other non-serializable objects
class EnhancedJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, Enum):
            return obj.value
        elif hasattr(obj, 'model_dump'):
            # For Pydantic models
            return obj.model_dump()
        elif hasattr(obj, 'to_dict'):
            # For objects with a to_dict method
            return obj.to_dict()
        elif hasattr(obj, '__dict__'):
            # Generic fallback for objects that have a __dict__ but aren't directly
            # serializable; checked last so it doesn't shadow the more specific
            # serializers above
            return {key: value for key, value in obj.__dict__.items()
                    if not key.startswith('_')}
        # Let the base class handle other types or raise TypeError
        return super().default(obj)
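
# Minimal illustrative use of the encoder (the Lang enum here is hypothetical,
# shown only to make the behavior concrete):
#   class Lang(Enum):
#       EN = "en"
#   json.dumps({"language": Lang.EN}, cls=EnhancedJSONEncoder)  # -> '{"language": "en"}'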
# Helper function to convert any result to JSON-serializable
def make_serializable(obj):
    """Convert any object to a JSON-serializable form"""
    if isinstance(obj, dict):
        return {k: make_serializable(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [make_serializable(item) for item in obj]
    elif isinstance(obj, Enum):
        return obj.value
    elif hasattr(obj, 'pages'):
        # Special case for OCRResponse objects, which have a pages attribute
        if hasattr(obj, '__dict__'):
            result = {k: make_serializable(v) for k, v in obj.__dict__.items()
                      if not k.startswith('_')}
            # Explicitly handle the pages attribute
            result['pages'] = [make_serializable(page) for page in obj.pages]
            return result
    elif hasattr(obj, 'model_dump'):
        # For Pydantic models (checked before the generic __dict__ fallback,
        # which would otherwise shadow it)
        return make_serializable(obj.model_dump())
    elif hasattr(obj, 'to_dict'):
        # For objects with a to_dict method
        return make_serializable(obj.to_dict())
    elif hasattr(obj, '__dict__'):
        # Generic fallback for objects with a __dict__ attribute
        return {k: make_serializable(v) for k, v in obj.__dict__.items()
                if not k.startswith('_')}
    # Basic types will be returned as is
    return obj
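
# Illustrative round trip (values hypothetical, reusing the Lang enum from the
# note above): nested containers are walked recursively, so
#   make_serializable({"langs": [Lang.EN], "meta": {"n": 1}})
# yields plain JSON-ready data: {"langs": ["en"], "meta": {"n": 1}}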
# API response handler
def process_api_request():
    """Handle API requests from the React frontend"""
    # Get the current Streamlit session
    ctx = get_script_run_ctx()
    if ctx is None:
        return

    session_id = ctx.session_id
    session_info = Server.get_current()._get_session_info(session_id)
    if session_info is None:
        return

    request = session_info.uploaded_file_mgr._uploaded_files.get('file')
    if not request:
        return

    # Extract file and parameters
    uploaded_file = request[0]
    use_vision = session_info.query_string.get('use_vision', ['true'])[0].lower() == 'true'

    try:
        # Process file
        result = process_file(uploaded_file, use_vision=use_vision)
        # Convert result to JSON-serializable format
        serializable_result = make_serializable(result)
        # Return JSON response
        return serializable_result
    except Exception as e:
        # Return error response
        return {"error": str(e)}
try:
    from process_file import process_file as process_file_util
    # Use the utility function instead of the local function
    process_file = process_file_util
except ImportError:
    # Define the process_file function if it's not available
    def process_file(uploaded_file, use_vision=True, custom_prompt=None):
        """Process the uploaded file and return the OCR results"""
        # Save the uploaded file to a temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
            tmp.write(uploaded_file.getvalue())
            temp_path = tmp.name

        try:
            # Initialize OCR processor
            processor = StructuredOCR()

            # Determine file type from extension
            file_ext = Path(uploaded_file.name).suffix.lower()
            file_type = "pdf" if file_ext == ".pdf" else "image"

            # Process the file. The experiment page passes a custom_prompt
            # keyword, so this fallback accepts it too; it is only forwarded
            # when the installed StructuredOCR.process_file supports it.
            kwargs = {"file_type": file_type, "use_vision": use_vision}
            if custom_prompt and "custom_prompt" in inspect.signature(processor.process_file).parameters:
                kwargs["custom_prompt"] = custom_prompt
            result = processor.process_file(temp_path, **kwargs)

            # Add to processing history
            history_item = {
                "id": datetime.now().timestamp(),
                "fileName": uploaded_file.name,
                "timestamp": datetime.now().isoformat(),
                "result": result,
                "useVision": use_vision
            }
            if 'processing_history' not in st.session_state:
                st.session_state.processing_history = []
            st.session_state.processing_history.append(history_item)

            return result
        except Exception as e:
            st.error(f"Error processing document: {str(e)}")
            return None
        finally:
            # Clean up the temporary file
            if os.path.exists(temp_path):
                os.unlink(temp_path)
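
# Sketch of the expected call shape (assuming an UploadedFile-like object
# exposing .name and .getvalue(), as Streamlit's uploader provides):
#   result = process_file(uploaded_file, use_vision=True)
#   if result:
#       topics = result.get('topics', [])  # result is treated as a dict elsewhere in this app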

# Set page configuration
st.set_page_config(
    page_title="Historical OCR Workshop",
    page_icon="📜",
    layout="wide",
    initial_sidebar_state="collapsed"  # Start with sidebar collapsed for cleaner landing
)
# Custom CSS to match React dark theme and improve Streamlit integration
st.markdown("""
<style>
    /* Global theme alignment with React */
    .stApp {
        background-color: #111827; /* bg-gray-900 in Tailwind */
        color: white;
        position: relative;
    }

    /* Add subtle grid background to the entire app */
    .stApp::before {
        content: "";
        position: fixed;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        background-image:
            linear-gradient(rgba(55, 65, 81, 0.1) 1px, transparent 1px),
            linear-gradient(90deg, rgba(55, 65, 81, 0.1) 1px, transparent 1px);
        background-size: 25px 25px;
        opacity: 0.3;
        z-index: 0;
        pointer-events: none;
    }

    /* Make sure all text is visible */
    p, h1, h2, h3, h4, h5, h6, div, span, li, label, a {
        color: white !important;
        position: relative;
    }

    /* Add spacier typography */
    p, li {
        line-height: 1.7;
        margin-bottom: 0.8rem;
    }

    h3 {
        margin-top: 1.5rem;
        margin-bottom: 1.2rem;
        font-weight: 600;
        letter-spacing: 0.015em;
    }

    /* Fix empty-looking containers */
    div:empty {
        min-height: 0 !important;
        padding: 0 !important;
        margin: 0 !important;
    }

    /* Custom header */
    .main-header {
        background-color: #000000;
        padding: 1rem;
        border-bottom: 1px solid #374151; /* border-gray-700 */
        margin-bottom: 1.5rem;
    }

    /* Content containers */
    .content-container {
        background-color: #1f2937; /* bg-gray-800 in Tailwind */
        color: white;
        padding: 1.8rem;
        border-radius: 0.75rem;
        margin-bottom: 2rem;
        box-shadow: 0 6px 12px rgba(0,0,0,0.2);
        border: 1px solid rgba(75, 85, 99, 0.3);
        position: relative;
        overflow: hidden;
    }

    .content-container::before {
        content: "";
        position: absolute;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        background-image: linear-gradient(rgba(55, 65, 81, 0.2) 1px, transparent 1px),
                          linear-gradient(90deg, rgba(55, 65, 81, 0.2) 1px, transparent 1px);
        background-size: 20px 20px;
        opacity: 0.05;
        z-index: 0;
    }

    .content-container > * {
        position: relative;
        z-index: 1;
        color: white !important;
    }

    .content-container h4 {
        color: #60A5FA !important;
        margin-bottom: 1rem;
        font-size: 1.2rem;
        letter-spacing: 0.02em;
    }

    .content-container ul, .content-container ol {
        padding-left: 1.5rem;
        margin-top: 0.8rem;
        margin-bottom: 0.8rem;
    }

    .content-container li {
        margin-bottom: 0.5rem;
        line-height: 1.5;
    }

    /* Interactive elements */
    .tool-container {
        background-color: #1f2937; /* bg-gray-800 */
        color: white;
        padding: 1.5rem;
        border-radius: 0.5rem;
        border: 1px solid #374151; /* border-gray-700 */
        margin-bottom: 1.5rem;
    }

    .tool-container * {
        color: white !important;
    }

    /* Special containers */
    .key-concept {
        background-color: #374151; /* bg-gray-700 */
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
        border-left: 3px solid #3B82F6; /* border-blue-500 */
        color: white;
    }

    .key-concept * {
        color: white !important;
    }

    .research-question {
        background-color: #1E3A8A; /* bg-blue-900 */
        padding: 0.75rem;
        border-radius: 0.5rem;
        margin: 1rem 0;
        border-left: 3px solid #60A5FA; /* border-blue-400 */
        color: white;
    }

    .research-question * {
        color: white !important;
    }

    .quote-container {
        font-style: italic;
        color: #D1D5DB; /* text-gray-300 */
        padding: 0.5rem 1rem;
        border-left: 3px solid #4B5563; /* border-gray-600 */
        margin: 1rem 0;
    }

    /* Hero section */
    .hero-container {
        background: linear-gradient(135deg, #1E3A8A 0%, #2563EB 100%); /* blue-900 to blue-600 */
        color: white;
        padding: 3rem 2rem;
        border-radius: 0.75rem;
        margin-bottom: 3rem;
        box-shadow: 0 8px 16px rgba(0,0,0,0.4);
        text-align: center;
        position: relative;
        overflow: hidden;
    }

    .hero-container::before {
        content: "";
        position: absolute;
        top: 0;
        left: 0;
        right: 0;
        bottom: 0;
        background-image:
            linear-gradient(rgba(30, 58, 138, 0.8) 1px, transparent 1px),
            linear-gradient(90deg, rgba(30, 58, 138, 0.8) 1px, transparent 1px);
        background-size: 20px 20px;
        opacity: 0.2;
        z-index: 0;
    }

    .hero-title {
        font-size: 2.5rem;
        font-weight: 700;
        margin-bottom: 1rem;
        position: relative;
        z-index: 1;
    }

    .hero-subtitle {
        font-size: 1.25rem;
        opacity: 0.9;
        max-width: 700px;
        margin: 0 auto;
        position: relative;
        z-index: 1;
    }

    /* File upload styling */
    .upload-container {
        border: 2px dashed #4B5563; /* border-gray-600 */
        padding: 1.5rem;
        text-align: center;
        border-radius: 0.5rem;
        margin-bottom: 1rem;
        background-color: #374151; /* bg-gray-700 */
        color: white;
    }

    /* History cards */
    .history-card {
        padding: 0.75rem;
        background-color: #374151; /* bg-gray-700 */
        color: white;
        border-radius: 0.25rem;
        border: 1px solid #4B5563; /* border-gray-600 */
        margin-bottom: 0.5rem;
    }

    /* Override Streamlit defaults */
    .stTextInput > div > div > input {
        background-color: #374151; /* bg-gray-700 */
        color: white !important;
    }

    .stSelectbox > div > div > div {
        background-color: #374151; /* bg-gray-700 */
        color: white !important;
    }

    .stCheckbox > div > label {
        color: white !important;
    }

    /* Fix empty containers */
    .tool-container:empty {
        display: none;
    }

    /* Ensure all text is visible */
    div, span, p, h1, h2, h3, h4, h5, h6, label, li {
        color: white !important;
    }

    /* Tab panel content */
    .stTabs [data-baseweb="tab-panel"] * {
        color: white !important;
    }

    /* Button styling */
    .stButton > button {
        background-color: #2563EB; /* bg-blue-600 */
        color: white;
    }

    .stButton > button:hover {
        background-color: #1D4ED8; /* bg-blue-700 */
    }

    /* Make sure all text is readable */
    p, h1, h2, h3, h4, h5, h6, span, label {
        color: white;
    }

    .stMarkdown a {
        color: #93C5FD; /* text-blue-300 */
    }

    /* Tabs */
    .stTabs [data-baseweb="tab"] {
        color: white;
    }

    .stTabs [data-baseweb="tab-highlight"] {
        background-color: #2563EB; /* bg-blue-600 */
    }

    /* Expander */
    .streamlit-expanderHeader {
        color: white;
        background-color: #1f2937; /* bg-gray-800 */
    }

    /* Sidebar */
    [data-testid="stSidebar"] {
        background-color: #111827; /* bg-gray-900 */
    }

    [data-testid="stSidebar"] .stMarkdown {
        color: white;
    }
</style>
""", unsafe_allow_html=True)
# Initialize session state for workshop progress
if 'current_module' not in st.session_state:
    st.session_state.current_module = 1
if 'processing_history' not in st.session_state:
    st.session_state.processing_history = []
if 'workshop_started' not in st.session_state:
    st.session_state.workshop_started = False


def navigate_to_module(module_number):
    """Navigate to a specific module"""
    st.session_state.current_module = module_number
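
# The navigation callbacks only mutate session state; the page itself is
# redrawn either by Streamlit's automatic rerun after an on_click callback
# or by the explicit st.rerun() calls used alongside direct button checks.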

# Welcome/Start screen if workshop hasn't been started
if not st.session_state.workshop_started:
    # Hero section with eye-catching design
    st.markdown("""
    <div class="hero-container">
        <h1 class="hero-title">Historical OCR Workshop</h1>
        <p class="hero-subtitle">Unlock the potential of historical documents with modern OCR technology</p>
    </div>
    """, unsafe_allow_html=True)

    # Introduction with cleaner layout
    col1, col2 = st.columns([3, 2])

    with col1:
        st.markdown("""
        <div class="content-container">
            <h3>Workshop Overview</h3>
            This interactive workshop explores the application of OCR technology to historical documents,
            combining theoretical understanding with practical experience. Designed for historians,
            archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="key-concept">
            <h4>What is OCR?</h4>
            Optical Character Recognition (OCR) technology enables computers to extract text from images and documents.
            Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for
            historical research and digital humanities.
        </div>
        """, unsafe_allow_html=True)

    with col2:
        # Add an engaging research question to connect with historians
        st.markdown("""
        <div class="research-question">
            <h4>For Historians:</h4>
            How might OCR technology transform our access to and interpretation of historical documents?
            What new research questions become possible when large archives become machine-readable?
        </div>
        """, unsafe_allow_html=True)

        # Display sample historical document images in a better format
        input_dir = Path(__file__).parent / "input"
        sample_images = [
            {"path": input_dir / "letter-1.jpg", "caption": "Historical Letter"},
            {"path": input_dir / "recipe.jpg", "caption": "Historical Recipe"}
        ]

        # Try to find any of the sample images
        for sample in sample_images:
            if sample["path"].exists():
                try:
                    from PIL import Image
                    with Image.open(sample["path"]) as img:
                        # Add a better styled border and shadow
                        st.markdown("""
                        <div style="
                            border: 1px solid rgba(75, 85, 99, 0.6);
                            padding: 12px;
                            border-radius: 8px;
                            margin-bottom: 1rem;
                            box-shadow: 0 6px 15px rgba(0,0,0,0.3);
                            background-color: #1f2937;
                            position: relative;
                        ">
                        """, unsafe_allow_html=True)
                        st.image(img, caption=sample["caption"], width=280)
                        st.markdown("""
                        <p style="
                            font-size: 0.85rem;
                            opacity: 0.8;
                            margin-top: 8px;
                            font-style: italic;
                            text-align: center;
                        ">Sample document for OCR analysis</p>
                        </div>
                        """, unsafe_allow_html=True)
                    break  # Only show one image
                except Exception:
                    pass

    # What you'll learn section combined with Workshop Modules in parallel columns
    col1, col2 = st.columns(2)

    with col1:
        st.markdown('<h3 class="workshop-heading" style="margin-bottom:1.5rem; padding-bottom:0.5rem; border-bottom:1px solid rgba(75, 85, 99, 0.5);">What You\'ll Learn</h3>', unsafe_allow_html=True)

        st.markdown("""
        <div class="content-container">
            <h4><i class="fas fa-book-open"></i> Conceptual Understanding</h4>
            - Text-image relationships in historical documents
            - Evolution of OCR technology
            - AI vision models for document analysis
            - Historical typography challenges
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="content-container">
            <h4><i class="fas fa-microscope"></i> Methodological Approaches</h4>
            - Critical frameworks for OCR in historical research
            - Hybrid computational-traditional methods
            - Error analysis and interpretation
            - Contextual reading strategies
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="content-container">
            <h4><i class="fas fa-tools"></i> Practical Skills</h4>
            - Processing historical documents with OCR
            - Analyzing and structuring extracted information
            - Integrating OCR into research workflows
            - Building searchable archives
        </div>
        """, unsafe_allow_html=True)

    with col2:
        # Workshop modules with visually appealing cards
        st.markdown('<h3 class="workshop-heading" style="margin-bottom:1.5rem; padding-bottom:0.5rem; border-bottom:1px solid rgba(75, 85, 99, 0.5);">Workshop Modules</h3>', unsafe_allow_html=True)

        # Add some styling for the module cards
        st.markdown("""
        <style>
            .module-card {
                background-color: #1f2937; /* bg-gray-800 */
                border-radius: 10px;
                box-shadow: 0 4px 8px rgba(0,0,0,0.2);
                padding: 1.5rem;
                margin-bottom: 1.5rem;
                transition: all 0.3s ease;
                border-left: 4px solid #3B82F6; /* border-blue-500 */
                color: white;
                position: relative;
                overflow: hidden;
            }

            .module-card::before {
                content: "";
                position: absolute;
                top: 0;
                left: 0;
                right: 0;
                bottom: 0;
                background-image: linear-gradient(rgba(55, 65, 81, 0.2) 1px, transparent 1px),
                                  linear-gradient(90deg, rgba(55, 65, 81, 0.2) 1px, transparent 1px);
                background-size: 16px 16px;
                opacity: 0.05;
                z-index: 0;
            }

            .module-card > * {
                position: relative;
                z-index: 1;
            }

            .module-card:hover {
                transform: translateY(-5px);
                box-shadow: 0 10px 20px rgba(0,0,0,0.25);
                border-left-color: #60A5FA;
            }

            .module-number {
                background-color: #3B82F6; /* bg-blue-500 */
                color: white;
                font-weight: bold;
                padding: 0.4rem 0.8rem;
                border-radius: 20px;
                font-size: 0.9rem;
                display: inline-block;
                margin-bottom: 12px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.2);
            }

            .module-title {
                font-weight: 600;
                margin-bottom: 1rem;
                font-size: 1.25rem;
                color: white;
                letter-spacing: 0.015em;
            }

            .module-card p {
                line-height: 1.6;
                opacity: 0.9;
                font-size: 0.95rem;
            }
        </style>
        """, unsafe_allow_html=True)

    # Modules inside the col2 from earlier
    with col2:
        st.markdown("""
        <div class="module-card">
            <div class="module-number">Module 1</div>
            <div class="module-title">Introduction and Problematization</div>
            <p>Explore the challenges of historical document digitization and the potential of OCR technologies
            to transform historical research. Examine key problems and opportunities in historical OCR.</p>
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="module-card">
            <div class="module-number">Module 2</div>
            <div class="module-title">Text-Image Relations in Historical Archives</div>
            <p>Analyze the complex relationships between text and images in historical documents,
            from typography and layout to marginalia and decorative elements.</p>
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="module-card">
            <div class="module-number">Module 3</div>
            <div class="module-title">OCR Technology and Historical Documents</div>
            <p>Understand the evolution of OCR technology from pattern matching to modern AI vision-language models,
            and how they address the unique challenges of historical documents.</p>
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="module-card">
            <div class="module-number">Module 4</div>
            <div class="module-title">Methodological Approaches</div>
            <p>Develop hybrid methodologies that combine computational processing with traditional
            historical research practices, balancing distant and close reading.</p>
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="module-card">
            <div class="module-number">Module 5</div>
            <div class="module-title">Interactive OCR Experiment</div>
            <p>Gain hands-on experience processing historical documents with OCR technology,
            analyzing results, and comparing different approaches.</p>
        </div>
        """, unsafe_allow_html=True)

        st.markdown("""
        <div class="module-card">
            <div class="module-number">Module 6</div>
            <div class="module-title">Conclusion and Future Directions</div>
            <p>Synthesize workshop insights and explore future directions for OCR in historical research,
            from large-scale corpus analysis to computational paleography.</p>
        </div>
        """, unsafe_allow_html=True)

    # Engaging quote to inspire participation with citation - in a better styled container
    st.markdown("""
    <div style="
        background: linear-gradient(to right, rgba(30, 58, 138, 0.3), rgba(37, 99, 235, 0.1));
        border-left: 4px solid #3B82F6;
        padding: 1.5rem 2rem;
        border-radius: 8px;
        margin: 2.5rem 0;
        position: relative;
        font-style: italic;
        box-shadow: 0 4px 12px rgba(0,0,0,0.2);
    ">
        <div style="
            position: absolute;
            left: 20px;
            top: -18px;
            font-size: 2rem;
            color: #60A5FA;
            opacity: 0.7;
        ">"</div>
        <p style="
            font-size: 1.15rem;
            line-height: 1.7;
            max-width: 800px;
            margin: 0 auto;
            text-align: center;
            color: #E5E7EB !important;
        ">The digital turn in historical research is not just about converting analog to digital;
        it's about transforming how we access, analyze, and interpret the past.</p>
        <div style="
            text-align: right;
            margin-top: 1rem;
            color: #9CA3AF !important;
            opacity: 0.9;
            font-size: 0.9rem;
        ">— Dr. Jane Winters, Professor of Digital Humanities</div>
    </div>
    """, unsafe_allow_html=True)

    # Feature highlight before call to action with better styling
    st.markdown("""
    <div style="
        background: linear-gradient(135deg, #1E3A8A 0%, #1E40AF 100%);
        border-radius: 12px;
        padding: 2rem;
        margin: 3rem 0;
        border-top: 5px solid #3B82F6;
        color: white;
        box-shadow: 0 8px 20px rgba(0,0,0,0.3);
        position: relative;
        overflow: hidden;
    ">
        <div style="
            position: absolute;
            top: 0;
            right: 0;
            bottom: 0;
            left: 0;
            background-image:
                linear-gradient(rgba(30, 58, 138, 0.2) 1px, transparent 1px),
                linear-gradient(90deg, rgba(30, 58, 138, 0.2) 1px, transparent 1px);
            background-size: 20px 20px;
            opacity: 0.1;
        "></div>
        <h3 style="
            margin-top: 0;
            color: white !important;
            font-size: 1.5rem;
            margin-bottom: 1.5rem;
            position: relative;
            z-index: 1;
        ">Workshop Highlights</h3>
        <div style="
            display: grid;
            grid-template-columns: 1fr 1fr;
            gap: 1.5rem;
            position: relative;
            z-index: 1;
        ">
            <div style="
                background-color: rgba(30, 64, 175, 0.5);
                padding: 1rem;
                border-radius: 8px;
                border-left: 3px solid #60A5FA;
            ">
                <h4 style="
                    margin-top: 0;
                    color: #93C5FD !important;
                    font-size: 1.1rem;
                ">Interactive Learning</h4>
                <p style="
                    margin-bottom: 0;
                    opacity: 0.9;
                    font-size: 0.95rem;
                ">Hands-on document processing with real-time feedback and analysis</p>
            </div>
            <div style="
                background-color: rgba(30, 64, 175, 0.5);
                padding: 1rem;
                border-radius: 8px;
                border-left: 3px solid #60A5FA;
            ">
                <h4 style="
                    margin-top: 0;
                    color: #93C5FD !important;
                    font-size: 1.1rem;
                ">Real Historical Documents</h4>
                <p style="
                    margin-bottom: 0;
                    opacity: 0.9;
                    font-size: 0.95rem;
                ">Work with authentic materials spanning different eras and formats</p>
            </div>
            <div style="
                background-color: rgba(30, 64, 175, 0.5);
                padding: 1rem;
                border-radius: 8px;
                border-left: 3px solid #60A5FA;
            ">
                <h4 style="
                    margin-top: 0;
                    color: #93C5FD !important;
                    font-size: 1.1rem;
                ">Vision AI Models</h4>
                <p style="
                    margin-bottom: 0;
                    opacity: 0.9;
                    font-size: 0.95rem;
                ">Experience state-of-the-art OCR technology powered by advanced AI</p>
            </div>
            <div style="
                background-color: rgba(30, 64, 175, 0.5);
                padding: 1rem;
                border-radius: 8px;
                border-left: 3px solid #60A5FA;
            ">
                <h4 style="
                    margin-top: 0;
                    color: #93C5FD !important;
                    font-size: 1.1rem;
                ">Research Applications</h4>
                <p style="
                    margin-bottom: 0;
                    opacity: 0.9;
                    font-size: 0.95rem;
                ">Learn to integrate OCR into historical research workflows</p>
            </div>
        </div>
    </div>
    """, unsafe_allow_html=True)

    # Enhanced start button with dynamic styling and clear call to action
    st.markdown("""
    <div style="
        text-align: center;
        margin: 3.5rem 0;
        padding: 2rem;
        background: linear-gradient(180deg, rgba(31, 41, 55, 0.6) 0%, rgba(17, 24, 39, 0.8) 100%);
        border-radius: 12px;
        border: 1px solid rgba(75, 85, 99, 0.3);
        box-shadow: 0 10px 25px rgba(0,0,0,0.2);
    ">
        <h3 style="
            margin-bottom: 1.5rem;
            font-size: 1.5rem;
            color: white !important;
        ">Ready to Start Your Journey?</h3>
        <button id="start-button" style="
            background: linear-gradient(90deg, #2563EB 0%, #1D4ED8 100%);
            color: white;
            border: none;
            padding: 0.8rem 2rem;
            font-size: 1.1rem;
            font-weight: 500;
            border-radius: 8px;
            cursor: pointer;
            transition: all 0.3s ease;
            box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
            margin-bottom: 1rem;
            width: 280px;
        " onclick="document.getElementById('streamlit-button').click()">Begin Workshop Journey</button>
        <p style="
            text-align: center;
            margin-top: 1rem;
            font-size: 0.95rem;
            color: #9CA3AF !important;
        ">No installation required • Start immediately • Interactive experience</p>
    </div>
    <script>
        // Animation for the button
        document.getElementById('start-button').addEventListener('mouseover', function() {
            this.style.transform = 'translateY(-3px)';
            this.style.boxShadow = '0 6px 15px rgba(37, 99, 235, 0.4)';
        });
        document.getElementById('start-button').addEventListener('mouseout', function() {
            this.style.transform = 'translateY(0)';
            this.style.boxShadow = '0 4px 12px rgba(37, 99, 235, 0.3)';
        });
    </script>
    """, unsafe_allow_html=True)

    # Hidden button to trigger the workshop start
    col1, col2, col3 = st.columns([1, 1, 1])
    with col2:
        if st.button("Begin Workshop", key="streamlit-button", use_container_width=True, type="primary"):
            st.session_state.workshop_started = True
            st.rerun()
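
    # Note: st.markdown sanitizes <script> tags and widget keys are not DOM
    # ids, so the decorative HTML button above cannot actually trigger the
    # Streamlit button; the visible "Begin Workshop" button is what drives
    # the state change. The same caveat applies to the onclick handlers on
    # the sidebar navigation cards further down.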

# Display workshop navigation sidebar only if workshop has started
elif st.session_state.workshop_started:
    # Define input directory for images
    input_dir = Path(__file__).parent / "input"

    # Enhanced sidebar navigation
    with st.sidebar:
        st.markdown("<h1 style='margin-bottom:15px;'>Workshop Navigation</h1>", unsafe_allow_html=True)

        # Improved visual header with logo/image
        st.markdown("<div style='display:flex; align-items:center; margin-bottom:20px;'>", unsafe_allow_html=True)
        # Add a visual element with better sizing/styling
        workflow_path = input_dir / "workflow.jpg"
        if workflow_path.exists():
            try:
                from PIL import Image
                with Image.open(workflow_path) as img:
                    st.image(img, width=160, output_format="PNG")
            except Exception:
                pass
        st.markdown("</div>", unsafe_allow_html=True)

        # Show enhanced progress indicator
        current_module = st.session_state.current_module
        st.markdown(f"<div style='margin-bottom:15px;'><b>Your Progress:</b> Module {current_module} of 6</div>", unsafe_allow_html=True)
        st.progress(current_module / 6)

        # More visually appealing module navigation
        modules = [
            "Introduction",
            "Text-Image Relations",
            "OCR Technology",
            "Methodological Approaches",
            "Interactive OCR Experiment",
            "Conclusion"
        ]

        # Custom styling for navigation buttons
        st.markdown("""
        <style>
            .nav-button {
                padding: 8px 12px;
                margin-bottom: 8px;
                border-radius: 6px;
                background-color: #f5f5f5;
                display: block;
                text-decoration: none;
                color: #333;
                font-weight: 500;
                border-left: 3px solid transparent;
                transition: all 0.2s;
            }
            .nav-button:hover {
                background-color: #e6e6e6;
            }
            .nav-button.active {
                background-color: #e8f0fe;
                border-left: 3px solid #0d3c84;
                font-weight: 600;
            }
            .nav-section {
                margin: 20px 0 10px 0;
                font-weight: 600;
                color: #555;
            }
        </style>
        """, unsafe_allow_html=True)

        # Group into clearer sections
        st.markdown("<div class='nav-section'>Theory & Concepts</div>", unsafe_allow_html=True)
        for i in range(1, 4):  # Modules 1-3
            active_class = "active" if i == current_module else ""
            st.markdown(f"""
            <div class="nav-button {active_class}" onclick="document.getElementById('nav_{i}').click()">
                <span style="display:inline-block; width:22px; text-align:center; margin-right:8px;
                      background-color:{'#0d3c84' if i == current_module else '#ddd'}; color:{'white' if i == current_module else '#555'};
                      border-radius:11px; font-size:0.8rem; font-weight:bold;">{i}</span>
                {modules[i-1]}
            </div>
            """, unsafe_allow_html=True)
            # Hidden button to handle the click
            if st.button(f"{i}", key=f"nav_{i}"):
                navigate_to_module(i)
                st.rerun()

        st.markdown("<div class='nav-section'>Application & Practice</div>", unsafe_allow_html=True)
        for i in range(4, 7):  # Modules 4-6
            active_class = "active" if i == current_module else ""
            st.markdown(f"""
            <div class="nav-button {active_class}" onclick="document.getElementById('nav_{i}').click()">
                <span style="display:inline-block; width:22px; text-align:center; margin-right:8px;
                      background-color:{'#0d3c84' if i == current_module else '#ddd'}; color:{'white' if i == current_module else '#555'};
                      border-radius:11px; font-size:0.8rem; font-weight:bold;">{i}</span>
                {modules[i-1]}
            </div>
            """, unsafe_allow_html=True)
            # Hidden button to handle the click
            if st.button(f"{i}", key=f"nav_{i}"):
                navigate_to_module(i)
                st.rerun()

        st.markdown("---")

        # Enhanced quick jump button
        st.markdown("""
        <style>
            .jump-button {
                background-color: #f0f7ff;
                padding: 10px;
                border-radius: 6px;
                border-left: 3px solid #0d3c84;
                margin-bottom: 15px;
                cursor: pointer;
            }
            .jump-button:hover {
                background-color: #e3f0ff;
            }
        </style>
        """, unsafe_allow_html=True)
        st.markdown("""
        <div class="jump-button" onclick="document.getElementById('jump_exp').click()">
            <span style="font-weight:500;">📊 Jump to OCR Experiment</span>
        </div>
        """, unsafe_allow_html=True)
        # Hidden button for jump
        if st.button("Jump to Experiment", key="jump_exp"):
            navigate_to_module(5)
            st.rerun()

        # Workshop information in a cleaner collapsible section
        with st.expander("About the Workshop"):
            st.markdown("""
            This interactive workshop explores OCR technology for historical documents.

            **How to use this workshop:**
            1. Navigate through modules sequentially
            2. Expand content sections to read more
            3. Try the interactive OCR experiment
            4. Reflect on research questions

            For help or more information, use the reference materials in Module 6.
            """)

        # Enhanced progress tracking
        if st.session_state.processing_history:
            with st.expander("Your Activity"):
                st.markdown(f"<b>Documents processed:</b> {len(st.session_state.processing_history)}", unsafe_allow_html=True)

                # Show the most recent document processed with better formatting
                latest = st.session_state.processing_history[-1]
                st.markdown(f"""
                <div style="background:#f9f9f9; padding:8px; border-radius:4px; margin-top:10px;">
                    <b>Latest document:</b> {latest['fileName']}<br>
                    <span style="font-size:0.9rem; color:#666;">Processed with {'vision model' if latest['useVision'] else 'basic OCR'}</span>
                </div>
                """, unsafe_allow_html=True)

    # Main content based on current module
    if st.session_state.current_module == 1:
        # MODULE 1: Introduction
        st.title("Module 1: Introduction and Problematization")

        col1, col2 = st.columns([2, 1])

        with col1:
            st.markdown("""
            ## Historical OCR Workshop

            ### The Problem

            Historical documents present unique challenges for OCR technology:

            - Varying typography and handwriting styles
            - Document degradation and damage
            - Complex layouts and formatting
            - Multiple languages and archaic terminology
            - Illustrations and decorative elements
            """)

        with col2:
            st.markdown("""
            ### Workshop Goals

            By the end of this workshop, you will:

            1. Understand text-image relationships in historical archives
            2. Learn about advanced OCR technology
            3. Explore methodological approaches
            4. Gain hands-on experience with OCR tools
            5. Develop research integration strategies
            """)

        # Next button
        st.button("Next: Text-Image Relations", key="next_to_2", on_click=navigate_to_module, args=(2,))

    elif st.session_state.current_module == 2:
        # MODULE 2: Text-Image Relations
        st.title("Module 2: Text-Image Relations in Historical Archives")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.markdown("""
            ### Textual Elements

            - **Typography**: Varying fonts, sizes, and styles
            - **Layout**: Columns, margins, and spacing
            - **Marginalia**: Notes, comments, and additions
            - **Decorative Text**: Illuminated letters and calligraphy
            """)

            st.markdown("""
            ### Visual Elements

            - **Illustrations**: Diagrams, maps, and artistic representations
            - **Watermarks**: Hidden identifiers that help date and locate documents
            - **Damage**: Tears, stains, and fading affecting legibility
            - **Material Features**: Paper quality and physical dimensions
            """)

        with col2:
            st.markdown("""
            ### Interdependence

            The relationship between text and image in historical documents exists on a complex spectrum:

            - Text functions as image (decorative headings)
            - Images function as text (symbolic representations)
            - Layout creates meaning through visual organization
            - Material conditions affect both textual and visual elements
            """)

            st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Book_of_Kells_folio_292r.jpg/800px-Book_of_Kells_folio_292r.jpg",
                     caption="Book of Kells - Example of text-image integration")

        st.markdown("""
        ### OCR Challenges

        These complex text-image relationships create particular challenges for OCR:

        1. **Distinguishing Text from Decoration**: Where does ornamental text end and functional text begin?
        2. **Handling Illustrations**: Should they be processed as images or described as text?
        3. **Interpreting Layout**: How to capture the significance of spacing and organization?
        4. **Preserving Context**: Maintaining the relationship between textual and visual elements
        """)

        # Navigation buttons
        col1, col2 = st.columns(2)
        with col1:
            st.button("Previous: Introduction", key="prev_to_1", on_click=navigate_to_module, args=(1,))
        with col2:
            st.button("Next: OCR Technology", key="next_to_3", on_click=navigate_to_module, args=(3,))

    elif st.session_state.current_module == 3:
        # MODULE 3: OCR Technology
        st.title("Module 3: OCR Technology and Historical Documents")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.markdown("""
            ### Traditional OCR Approaches

            1. **Pattern Matching**: Early OCR compared characters to templates
            2. **Feature Extraction**: Identifying key features of characters
            3. **Statistical Models**: Using probabilities to improve recognition
            """)

            st.markdown("""
            ### Modern AI-Enhanced OCR

            1. **Neural Networks**: Deep learning models trained on vast datasets
            2. **Computer Vision**: Advanced image processing techniques
            3. **Language Models**: Contextual understanding to resolve ambiguities
            4. **Multimodal Models**: Integration of text, layout, and visual understanding
            """)

        with col2:
            st.markdown("""
            ### Challenges with Historical Documents

            Historical materials present unique difficulties:

            - **Typography Variation**: Non-standardized fonts and styles
            - **Historical Language**: Archaic vocabulary and grammar
            - **Layout Complexity**: Non-linear arrangements
            - **Document Degradation**: Fading, tears, stains, and damage
            - **Material Artifacts**: Paper texture, binding shadows, etc.
            """)

            st.image("https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg",
                     caption="OCR processing layers")

        # Display processing history if available
        if st.session_state.processing_history:
            with st.expander("Your OCR Processing History"):
                st.markdown("You've already processed the following documents:")
                for item in st.session_state.processing_history:
                    st.markdown(f"**{item['fileName']}** - {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')}")
                    col1, col2 = st.columns(2)
                    with col1:
                        st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}")
                    with col2:
                        st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}")

                # Quick link to experiment
                st.button("Jump to OCR Experiment", key="jump_to_5", on_click=navigate_to_module, args=(5,))

        # Navigation buttons
        col1, col2 = st.columns(2)
        with col1:
            st.button("Previous: Text-Image Relations", key="prev_to_2", on_click=navigate_to_module, args=(2,))
        with col2:
            st.button("Next: Methodological Approaches", key="next_to_4", on_click=navigate_to_module, args=(4,))

    elif st.session_state.current_module == 4:
        # MODULE 4: Methodological Approaches
        st.title("Module 4: Methodological Approaches")

        col1, col2 = st.columns([1, 1])

        with col1:
            st.markdown("""
            ### Hybrid Methodologies

            1. **Computational + Human Reading**
               - OCR for initial processing and discovery
               - Human review for context and interpretation
               - Iterative refinement of computational outputs

            2. **Close + Distant Reading**
               - Distant reading through large-scale OCR processing
               - Close reading of selected passages
               - Zooming between scales of analysis
            """)

            # Reference to diagram.jpg
            input_dir = Path(__file__).parent / "input"
            diagram_path = input_dir / "diagram.jpg"
            if diagram_path.exists():
                try:
                    # Load image file directly from disk
                    from PIL import Image
                    with Image.open(diagram_path) as img:
                        st.image(img, caption="Historical VLM architecture", use_container_width=True)
                except Exception:
                    pass

        with col2:
            st.markdown("""
            ### Mistral-OCR-Latest: State-of-the-Art

            The Mistral-OCR model represents a significant advancement:

            - **Multimodal Understanding**: Processes both visual and textual information
            - **Contextual Awareness**: Considers historical context
            - **Layout Recognition**: Preserves complex document structures
            - **Historical Font Adaptation**: Trained on diverse historical typography
            """)

            # Reference to workflow.jpg
            workflow_path = input_dir / "workflow.jpg"
            if workflow_path.exists():
                try:
                    # Load image file directly from disk
                    from PIL import Image
                    with Image.open(workflow_path) as img:
                        st.image(img, caption="Mistral OCR workflow", use_container_width=True)
                except Exception:
                    pass

        st.markdown("""
        ### Practical Workflow

        A typical historical OCR workflow with Mistral-OCR includes:

        1. **Selection**: Choosing appropriate documents
        2. **Preprocessing**: Enhancing images before OCR
        3. **OCR Processing**: Running documents through vision-enhanced OCR
        4. **Post-processing**: Cleaning up outputs and structured extraction
        5. **Verification**: Cross-checking results against originals
        6. **Integration**: Incorporating OCR outputs into research materials
        """)

        # Navigation buttons
        col1, col2 = st.columns(2)
        with col1:
            st.button("Previous: OCR Technology", key="prev_to_3", on_click=navigate_to_module, args=(3,))
        with col2:
            st.button("Next: Interactive OCR Experiment", key="next_to_5", on_click=navigate_to_module, args=(5,))

    elif st.session_state.current_module == 5:
        # MODULE 5: Interactive OCR Experiment
        st.title("Module 5: Interactive OCR Experiment")

        # More modular design with sequenced steps
        st.markdown("""
        <div class="workshop-container">
            This interactive module allows you to process historical documents with OCR and analyze the results.
            Follow the sequenced steps below to experiment with historical document analysis.
        </div>
        """, unsafe_allow_html=True)

        # Tabbed interface for different activities
        experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"])

        with experiment_tab:
            # Import additional libraries for enhanced functionality
            try:
                from pdf2image import convert_from_bytes
                pdf_support = True
            except ImportError:
                pdf_support = False
                st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.")

            # OCR tool in a compact layout
            col1, col2 = st.columns([1, 1])

            with col1:
                st.markdown('<div class="tool-container" style="color:white !important; background-color:#1f2937; padding:1.5rem; border-radius:0.5rem; border:1px solid #374151;">', unsafe_allow_html=True)
                st.markdown("<h3 style='color:white !important;'>Step 1: Select Document & Options</h3>", unsafe_allow_html=True)

                # Processing options
                use_vision = st.checkbox("Use Vision Model", value=True,
                                         help="Use the vision model for improved analysis")

                # Additional prompt for the model
                st.markdown("### Custom Research Prompt (Optional)")
                st.markdown("""Provide additional instructions to guide the OCR analysis.
                Focus on specific aspects of historical research you're interested in.""")

                custom_prompt = st.text_area("Research Prompt",
                                             placeholder="E.g., Focus on identifying dates and historical figures; Analyze the writing style for period-specific terminology; Highlight any cultural or social indicators of the time period...",
                                             help="Optional instructions to guide the analysis of the historical document")

                # Example preset documents selection
                input_dir = Path(__file__).parent / "input"
                if input_dir.exists():
                    sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf"))
                    if sample_files:
                        st.markdown("#### Sample Documents")
                        sample_options = ["Upload my own document"] + [f.name for f in sample_files]
                        sample_choice = st.selectbox("Choose a document:", sample_options)

                        if sample_choice != "Upload my own document":
                            selected_file = next((f for f in sample_files if f.name == sample_choice), None)
                            if selected_file:
                                # Store the selected sample file in session state
                                with open(selected_file, "rb") as f:
                                    file_bytes = f.read()
                                st.session_state.sample_file = {
                                    "name": selected_file.name,
                                    "bytes": file_bytes
                                }

                                # Preview the selected sample
                                if selected_file.suffix.lower() == ".pdf" and pdf_support:
                                    try:
                                        with st.spinner("Generating PDF preview..."):
                                            images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150)
                                            if images:
                                                # Convert PIL image to bytes for Streamlit
                                                first_page = images[0]
                                                img_bytes = io.BytesIO()
                                                first_page.save(img_bytes, format='JPEG')
                                                img_bytes.seek(0)
                                                st.image(img_bytes, caption=f"Preview: {selected_file.name}", use_container_width=True)
                                    except Exception:
                                        st.info(f"PDF selected: {selected_file.name}")
                                else:
                                    # For images display directly
                                    try:
                                        from PIL import Image
                                        img = Image.open(BytesIO(file_bytes))
                                        st.image(img, caption=f"Preview: {selected_file.name}", use_container_width=True)
                                    except Exception:
                                        st.info(f"Selected: {selected_file.name}")
                        else:
                            # Clear the sample file if "Upload my own" is selected
                            if 'sample_file' in st.session_state:
                                del st.session_state.sample_file

                            # File uploader with styling matched to React theme
                            st.markdown('<div class="upload-container">', unsafe_allow_html=True)
                            uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
                            if uploaded_file is None:
                                st.markdown("### Upload a document to get started")
                                st.markdown("Supported formats: PDF, JPG, PNG")
                            else:
                                # Display the uploaded file
                                file_ext = Path(uploaded_file.name).suffix.lower()
                                if file_ext == ".pdf" and pdf_support:
                                    try:
                                        # Convert first page of PDF to image for preview
                                        pdf_bytes = uploaded_file.getvalue()
                                        with st.spinner("Generating PDF preview..."):
                                            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
                                            if images:
                                                # Convert PIL image to bytes for Streamlit
                                                first_page = images[0]
                                                img_bytes = io.BytesIO()
                                                first_page.save(img_bytes, format='JPEG')
                                                img_bytes.seek(0)
                                                # Display the PDF preview
                                                st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
                                            else:
                                                st.info(f"PDF uploaded: {uploaded_file.name}")
                                    except Exception:
                                        # Simply show the file name without an error message
                                        st.info(f"PDF uploaded: {uploaded_file.name}")
                                elif file_ext != ".pdf":
                                    st.image(uploaded_file, use_container_width=True)
                                else:
                                    st.info(f"PDF uploaded: {uploaded_file.name}")
                            st.markdown('</div>', unsafe_allow_html=True)
                    else:
                        # No sample files found, just show the uploader
                        st.markdown('<div class="upload-container">', unsafe_allow_html=True)
                        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
                        if uploaded_file is None:
                            st.markdown("### Upload a document to get started")
                            st.markdown("Supported formats: PDF, JPG, PNG")
                        else:
                            # Display the uploaded file preview
                            file_ext = Path(uploaded_file.name).suffix.lower()
                            if file_ext == ".pdf" and pdf_support:
                                try:
                                    # PDF preview logic
                                    pdf_bytes = uploaded_file.getvalue()
                                    with st.spinner("Generating PDF preview..."):
                                        images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
                                        if images:
                                            first_page = images[0]
                                            img_bytes = io.BytesIO()
                                            first_page.save(img_bytes, format='JPEG')
                                            img_bytes.seek(0)
                                            st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True)
                                        else:
                                            st.info(f"PDF uploaded: {uploaded_file.name}")
                                except Exception:
                                    st.info(f"PDF uploaded: {uploaded_file.name}")
                            elif file_ext != ".pdf":
                                st.image(uploaded_file, use_container_width=True)
                            else:
                                st.info(f"PDF uploaded: {uploaded_file.name}")
                        st.markdown('</div>', unsafe_allow_html=True)
                else:
                    # Input directory doesn't exist, just show the uploader
                    st.markdown('<div class="upload-container">', unsafe_allow_html=True)
                    uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed")
                    if uploaded_file is None:
                        st.markdown("### Upload a document to get started")
                        st.markdown("Supported formats: PDF, JPG, PNG")
                    st.markdown('</div>', unsafe_allow_html=True)

                # Step 2: Process document
                st.subheader("Step 2: Process the Document")

                # Get the file to process (either uploaded or sample).
                # The 'sample_choice' in locals() guard avoids a NameError when
                # a sample file lingers in session state but no sample documents
                # were found on this run.
                file_to_process = None
                if 'sample_file' in st.session_state and 'sample_choice' in locals() and sample_choice != "Upload my own document":
                    # Create a FileUploader-like object from the sample file
                    class SampleFileObject:
                        def __init__(self, name, data):
                            self.name = name
                            self._data = data

                        def getvalue(self):
                            return self._data

                    file_to_process = SampleFileObject(
                        st.session_state.sample_file["name"],
                        st.session_state.sample_file["bytes"]
                    )
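
                    # SampleFileObject mirrors just the slice of Streamlit's
                    # UploadedFile interface that process_file relies on
                    # (.name and .getvalue()), so samples and uploads share
                    # one code path.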
                elif 'uploaded_file' in locals() and uploaded_file is not None:
                    file_to_process = uploaded_file

                # Process button (disabled if no file selected)
                process_button = st.button(
                    "Process Document",
                    disabled=file_to_process is None,
                    use_container_width=True
                )

                if process_button and file_to_process is not None:
                    with st.spinner("Processing document..."):
                        try:
                            # Process the file
                            result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None)
                            if result:
                                st.success("Document processed successfully!")
                                # Store result in session state for display in the right column
                                st.session_state.current_result = result
                                st.rerun()  # Refresh to show result
                            else:
                                st.error("Failed to process document.")
                        except Exception as e:
                            st.error(f"Error processing document: {str(e)}")

                st.markdown('</div>', unsafe_allow_html=True)

                # Experiment instructions in a compact format
                st.markdown('<div class="key-concept" style="background-color:#374151; padding:0.75rem; border-radius:0.5rem; margin:1rem 0; border-left:3px solid #3B82F6; color:white;">', unsafe_allow_html=True)
                st.markdown("<h3 style='color:white !important;'>Experiment Instructions</h3>", unsafe_allow_html=True)
                st.markdown("""
                1. **Step 1:** Select a document and choose your options
                2. **Step 2:** Process the document with the selected options
                3. **Step 3:** Analyze the results in the panel on the right
                4. **Step 4:** Try again with different settings (e.g., toggle vision model)
                5. **Step 5:** Compare results between different runs
                """)
                st.markdown('</div>', unsafe_allow_html=True)

            with col2:
                st.markdown('<div class="tool-container" style="color:white !important; background-color:#1f2937; padding:1.5rem; border-radius:0.5rem; border:1px solid #374151;">', unsafe_allow_html=True)
                st.markdown("<h3 style='color:white !important;'>Step 3: View Results</h3>", unsafe_allow_html=True)

                if 'current_result' in st.session_state and st.session_state.current_result:
                    result = st.session_state.current_result

                    # File info in a compact format
                    st.markdown(f"**File:** {result.get('file_name', 'Unknown')}")

                    # Horizontal display of metadata
                    col1, col2 = st.columns(2)
                    with col1:
                        if 'languages' in result and result['languages']:
                            languages = [lang for lang in result['languages'] if lang]
                            if languages:
                                st.markdown(f"**Languages:** {', '.join(languages)}")
                    with col2:
                        if 'topics' in result and result['topics']:
                            st.markdown(f"**Topics:** {', '.join(result['topics'])}")

                    # Create tabs for different views with inline styling to ensure visibility
                    tab1, tab2 = st.tabs(["Structured View", "Raw JSON"])
                    st.markdown("""
                    <style>
                    .stTabs [data-baseweb="tab-panel"] * {color: white !important;}
                    </style>
                    """, unsafe_allow_html=True)

                    with tab1:
                        # Display in a more user-friendly format based on the content structure
                        if 'ocr_contents' in result:
                            if isinstance(result['ocr_contents'], dict):
                                for section, content in result['ocr_contents'].items():
                                    if content:  # Only display non-empty sections
                                        st.markdown(f"<h4 style='color:white !important;'>{section.replace('_', ' ').title()}</h4>", unsafe_allow_html=True)
                                        if isinstance(content, str):
                                            st.markdown(f"<p style='color:white !important;'>{content}</p>", unsafe_allow_html=True)
                                        elif isinstance(content, list):
                                            for item in content:
                                                if isinstance(item, str):
                                                    st.markdown(f"<p style='color:white !important;'>- {item}</p>", unsafe_allow_html=True)
                                                elif isinstance(item, dict):
                                                    st.markdown("<div style='color:white !important;'>", unsafe_allow_html=True)
                                                    st.json(item)
                                                    st.markdown("</div>", unsafe_allow_html=True)
                                        elif isinstance(content, dict):
                                            for k, v in content.items():
                                                st.markdown(f"<p style='color:white !important;'><strong>{k}:</strong> {v}</p>", unsafe_allow_html=True)

                    with tab2:
                        # Show the raw JSON for developers
                        # Convert to serializable format first
                        serializable_result = make_serializable(result)
                        st.json(serializable_result)

                    # Download options
                    st.markdown("### Export Results")
                    col1, col2 = st.columns(2)
                    with col1:
                        # Export as JSON (using the serializable converter)
                        serializable_result = make_serializable(result)
                        json_bytes = json.dumps(serializable_result, indent=2).encode()
                        st.download_button(
                            label="Download JSON",
                            data=json_bytes,
                            file_name="ocr_results.json",
                            mime="application/json",
                            use_container_width=True
                        )
                    with col2:
                        # Export as text
                        if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']:
                            text_content = result['ocr_contents']['content']
                            st.download_button(
                                label="Download Text",
                                data=text_content.encode(),
                                file_name="ocr_text.txt",
                                mime="text/plain",
                                use_container_width=True
                            )
                else:
                    st.markdown("""
                    <div style="background-color:#1f2937; padding:1rem; border-radius:0.5rem;">
                        <h3 style="color:white !important;">Results will appear here</h3>
                        <p style="color:white !important;">Upload and process a document to see the OCR results in this panel.</p>
                        <p style="color:white !important;">The OCR tool will:</p>
                        <ol style="color:white !important;">
                            <li style="color:white !important;">Extract text from your document</li>
                            <li style="color:white !important;">Identify languages and topics</li>
                            <li style="color:white !important;">Provide structured content analysis</li>
                            <li style="color:white !important;">Generate downloadable results</li>
                        </ol>
                    </div>
                    """, unsafe_allow_html=True)

                st.markdown('</div>', unsafe_allow_html=True)

            # Processing history
            if st.session_state.processing_history:
                st.markdown('<div class="tool-container" style="color:white !important; background-color:#1f2937; padding:1.5rem; border-radius:0.5rem; border:1px solid #374151;">', unsafe_allow_html=True)
                st.markdown("<h3 style='color:white !important;'>Step 4: Review Processing History</h3>", unsafe_allow_html=True)

                # Most recent result summary
                latest = st.session_state.processing_history[-1]
                st.markdown(f"**Latest Document:** {latest['fileName']}")
                st.markdown(f"**Processed at:** {datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}")
                st.markdown(f"**Vision model used:** {'Yes' if latest['useVision'] else 'No'}")

                # Full history in expander
                with st.expander("View Complete Processing History"):
                    for i, item in enumerate(reversed(st.session_state.processing_history)):
                        st.markdown(f"""
                        <div class="history-card" style="padding:0.75rem; background-color:#374151; color:white; border-radius:0.25rem; border:1px solid #4B5563; margin-bottom:0.5rem;">
                            <strong style="color:white !important;">{item['fileName']}</strong><br>
                            <span style="color:white !important;">{datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')} -
                            Vision model: {'Yes' if item['useVision'] else 'No'}</span>
                        </div>
                        """, unsafe_allow_html=True)

                        # Add option to view a previous result
                        if st.button("View This Result", key=f"view_history_{i}"):
                            st.session_state.current_result = item['result']
                            st.rerun()

                st.markdown('</div>', unsafe_allow_html=True)
# Compare tab for side-by-side comparison | |
with compare_tab: | |
st.subheader("Compare OCR Results") | |
if len(st.session_state.processing_history) >= 2: | |
st.markdown(""" | |
Select two processing results to compare side by side. This allows you to see | |
how different options (like using the vision model) affect OCR quality. | |
""") | |
# Create selection dropdowns for the documents | |
col1, col2 = st.columns(2) | |
with col1: | |
# First document selector | |
doc_options_1 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})" | |
for i, item in enumerate(st.session_state.processing_history)] | |
doc_choice_1 = st.selectbox("First Document:", doc_options_1, key="compare_doc_1") | |
doc_index_1 = int(doc_choice_1.split(":")[0]) - 1 | |
with col2: | |
# Second document selector | |
doc_options_2 = [f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})" | |
for i, item in enumerate(st.session_state.processing_history)] | |
default_index = min(1, len(st.session_state.processing_history) - 1) # Default to second item | |
doc_choice_2 = st.selectbox("Second Document:", doc_options_2, key="compare_doc_2", index=default_index) | |
doc_index_2 = int(doc_choice_2.split(":")[0]) - 1 | |
# Retrieve the selected documents | |
doc1 = st.session_state.processing_history[doc_index_1] | |
doc2 = st.session_state.processing_history[doc_index_2] | |
# Show comparison | |
col1, col2 = st.columns(2) | |
with col1: | |
st.markdown(f"### Document 1: {doc1['fileName']}") | |
st.markdown(f"**Processed at:** {datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}") | |
st.markdown(f"**Vision model used:** {'Yes' if doc1['useVision'] else 'No'}") | |
# Display content summary, escaping the raw OCR text so stray characters cannot break the HTML wrapper
if 'ocr_contents' in doc1['result'] and isinstance(doc1['result']['ocr_contents'], dict):
if 'content' in doc1['result']['ocr_contents']:
import html
content = str(doc1['result']['ocr_contents']['content'])
preview = html.escape(content[:500]) + ('...' if len(content) > 500 else '')
# Display the first 500 characters with word wrap
st.markdown(f"""
<div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
border: 1px solid #ddd; padding: 1rem; background-color: #f9f9f9;">
{preview}
</div>
""", unsafe_allow_html=True)
with col2: | |
st.markdown(f"### Document 2: {doc2['fileName']}") | |
st.markdown(f"**Processed at:** {datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}") | |
st.markdown(f"**Vision model used:** {'Yes' if doc2['useVision'] else 'No'}") | |
# Display content summary, escaping the raw OCR text so stray characters cannot break the HTML wrapper
if 'ocr_contents' in doc2['result'] and isinstance(doc2['result']['ocr_contents'], dict):
if 'content' in doc2['result']['ocr_contents']:
import html
content = str(doc2['result']['ocr_contents']['content'])
preview = html.escape(content[:500]) + ('...' if len(content) > 500 else '')
# Display the first 500 characters with word wrap
st.markdown(f"""
<div style="max-height: 300px; overflow-y: auto; word-wrap: break-word;
border: 1px solid #ddd; padding: 1rem; background-color: #f9f9f9;">
{preview}
</div>
""", unsafe_allow_html=True)
# Comparison analysis | |
if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']: | |
st.markdown(""" | |
<div class="key-concept"> | |
<h3>Vision vs. Non-Vision Model Comparison</h3> | |
<p>You're comparing the same document processed with different models. | |
This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.</p> | |
<p>Look for these differences:</p> | |
<ul> | |
<li>Completeness of extracted text</li> | |
<li>Accuracy of layout understanding</li> | |
<li>Recognition of complex elements (tables, figures)</li> | |
<li>Topic and language detection accuracy</li> | |
</ul> | |
</div> | |
""", unsafe_allow_html=True) | |
else: | |
st.markdown(""" | |
<div class="research-question"> | |
<h3>Need More Documents to Compare</h3> | |
<p>Process at least two documents to enable side-by-side comparison. Try processing | |
the same document with and without the vision model to see the differences in OCR quality.</p> | |
</div> | |
""", unsafe_allow_html=True) | |
# Analysis tab for guidance on working with OCR results | |
with analyze_tab: | |
st.subheader("Analysis Guide") | |
st.markdown(""" | |
<div class="workshop-container"> | |
<h3>How to Analyze OCR Results</h3> | |
<p>This guide helps you assess the quality and usefulness of OCR output for historical research.</p> | |
</div> | |
""", unsafe_allow_html=True) | |
st.markdown(""" | |
### Evaluating OCR Quality | |
When analyzing OCR results from historical documents, consider these key factors: | |
1. **Text Accuracy** | |
- Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1"); a small interactive example follows after this list
- Assess recognition of period-specific typography and writing styles | |
- Evaluate handling of degraded or damaged text areas | |
2. **Structure Preservation** | |
- Does the OCR maintain paragraph and section breaks? | |
- Are columns and tabular data correctly preserved? | |
- How well are page transitions handled? | |
3. **Special Elements** | |
- Recognition of footnotes, marginalia, and annotations | |
- Handling of illustrations, diagrams, and decorative elements | |
- Treatment of watermarks, signatures, and stamps | |
4. **Metadata Extraction** | |
- Accuracy of detected languages, topics, and document type | |
- Identification of dates, names, and key entities | |
- Recognition of document purpose and context | |
""") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.markdown(""" | |
### Common OCR Challenges in Historical Documents | |
- **Typography Variations**: Historical fonts and writing styles that differ from modern text | |
- **Material Degradation**: Fading, stains, tears, and other damage affecting legibility | |
- **Handwritten Elements**: Marginalia, signatures, and handwritten annotations | |
- **Complex Layouts**: Multi-column formats, non-linear reading order, and decorative elements | |
- **Language and Terminology**: Archaic terms, specialized vocabulary, and multilingual content | |
""") | |
with col2: | |
st.markdown(""" | |
### Making the Most of OCR Results | |
- **Contextual Reading**: Use historical context to interpret unclear passages | |
- **Error Patterns**: Identify and mentally correct for systematic OCR errors | |
- **Hybrid Analysis**: Combine OCR-based search with close reading of original images | |
- **Comparative Processing**: Try different OCR settings and models on the same document | |
- **Iterative Refinement**: Use insights from each document to improve future processing | |
""") | |
st.markdown(""" | |
### Research Integration | |
Once you've obtained and analyzed OCR results from historical documents, consider these approaches for integrating them into your research: | |
1. **Digital Corpus Building**: Create searchable collections of processed texts | |
2. **Computational Analysis**: Apply text mining, topic modeling, or network analysis (see the word-frequency sketch after this section)
3. **Cross-Document Linking**: Identify connections across multiple sources | |
4. **Annotation and Enrichment**: Add context, translations, or explanatory notes | |
5. **Collaborative Research**: Share processed texts with other researchers

Remember that OCR is a tool to assist your research, not to replace careful reading and analysis. The most effective approaches combine computational methods with traditional historical research practices.
""") | |
# Example of what to look for | |
if st.session_state.processing_history: | |
with st.expander("Example Analysis from Your Documents"): | |
# Pick the latest document | |
latest = st.session_state.processing_history[-1] | |
st.markdown(f""" | |
#### Sample Analysis for: {latest['fileName']} | |
**Document Context:** | |
- Languages: {', '.join(latest['result'].get('languages', ['Unknown']))} | |
- Topics: {', '.join(latest['result'].get('topics', ['Unknown']))} | |
- Vision model used: {'Yes' if latest['useVision'] else 'No'} | |
**What to Look For:** | |
1. Check how well the model identified key topics and languages | |
2. Evaluate the completeness of extracted text | |
3. Note any systematic errors in text recognition | |
4. Assess how well document structure was preserved | |
""") | |
# Navigation buttons | |
col1, col2 = st.columns(2) | |
with col1: | |
st.button("Previous: Methodological Approaches", key="prev_to_4", on_click=navigate_to_module, args=(4,)) | |
with col2: | |
st.button("Next: Conclusion", key="next_to_6", on_click=navigate_to_module, args=(6,)) | |
else: # Module 6 | |
# MODULE 6: Conclusion | |
st.title("Module 6: Conclusion and Future Directions") | |
col1, col2 = st.columns([3, 2]) | |
with col1: | |
st.markdown(""" | |
### Workshop Summary | |
Throughout this workshop, we've explored: | |
1. **Text-Image Interdependence**: The complex relationship between textual and visual elements | |
2. **OCR Technology**: The evolution of OCR and its application to historical materials | |
3. **Methodological Approaches**: Hybrid strategies for working with historical texts | |
4. **Practical Application**: Hands-on experience with OCR processing tools | |
""") | |
st.markdown(""" | |
### Key Takeaways | |
1. **OCR is Not Perfect**: Even advanced AI models face challenges with historical documents | |
2. **Context Matters**: Vision-enhanced models provide better results by understanding document context | |
3. **Hybrid Approaches**: Combining computational methods with traditional research yields best results | |
4. **Critical Evaluation**: Always evaluate OCR outputs with awareness of limitations | |
5. **Structured Extraction**: Modern OCR goes beyond text recognition to understand document structure | |
""") | |
with col2: | |
# Display statistics if there's processing history | |
if st.session_state.processing_history: | |
st.subheader("Your Workshop Statistics") | |
# Calculate statistics | |
total_docs = len(st.session_state.processing_history) | |
vision_docs = sum(1 for item in st.session_state.processing_history if item['useVision'])
non_vision_docs = total_docs - vision_docs | |
# Create metrics for statistics | |
col1, col2 = st.columns(2) | |
with col1: | |
st.metric("Documents Processed", total_docs) | |
st.metric("With Vision Model", vision_docs) | |
with col2: | |
st.metric("Without Vision Model", non_vision_docs) | |
# Topics word cloud | |
if total_docs > 0: | |
st.subheader("Topics Encountered") | |
all_topics = [] | |
for item in st.session_state.processing_history: | |
if 'topics' in item['result']: | |
all_topics.extend(item['result']['topics']) | |
if all_topics: | |
# Count topic frequencies
topic_counts = {}
for topic in all_topics:
topic_counts[topic] = topic_counts.get(topic, 0) + 1
# Display as a bar chart (st.bar_chart renders vertical bars)
st.bar_chart(topic_counts)
st.subheader("Future Directions") | |
col1, col2 = st.columns(2) | |
with col1: | |
st.markdown(""" | |
### Technological Developments | |
- **Multimodal AI models**: Increasingly sophisticated understanding | |
- **Historical font training**: Models trained on historical typography | |
- **Document intelligence**: Enhanced understanding of structures | |
- **Collaborative correction**: Platforms for collective improvement | |
""") | |
with col2: | |
st.markdown(""" | |
### Research Applications | |
- **Large-scale corpus analysis**: Processing entire archives | |
- **Multilingual historical research**: Working across languages | |
- **Image-text integration**: New methodologies for visual analysis | |
- **Computational paleography**: AI-assisted handwriting analysis | |
""") | |
st.markdown(""" | |
### Additional Resources | |
- **[Mistral AI Documentation](https://docs.mistral.ai/)**: Learn more about the OCR models used in this workshop | |
- **[Transkribus](https://readcoop.eu/transkribus/)**: Platform for historical document transcription | |
- **[OCR-D](https://ocr-d.de/en/)**: Coordinated OCR research project for historical documents | |
- **[Historical OCR Research Papers](https://scholar.google.com/scholar?q=historical+OCR)**: Academic research on historical OCR | |
""") | |
# Reset button to start over | |
if st.button("Start Workshop Again", key="reset_workshop", use_container_width=True): | |
st.session_state.current_module = 1 | |
st.rerun() | |
# Handle API requests from the React frontend.
# NOTE: Streamlit does not expose the URL path, so the endpoint is signalled via a
# query parameter here; the name 'endpoint' is an assumed convention that the client
# must send (see the hedged client sketch at the end of this file).
if st.query_params.get('endpoint', '') == 'api/process':
# Process the API request | |
result = process_api_request() | |
if result: | |
# Return the result as JSON | |
# Make sure result is serializable | |
serializable_result = make_serializable(result) | |
st.json(serializable_result) | |
else: | |
st.json({"error": "Invalid request"}) |