import os import streamlit as st import json import sys from pathlib import Path import tempfile from datetime import datetime import io import base64 from io import BytesIO from enum import Enum import inspect # Add parent directory to path so we can import the OCR modules parent_dir = Path(__file__).parent.absolute() sys.path.append(str(parent_dir)) # Import the StructuredOCR class and process_file utility from structured_ocr import StructuredOCR # Add API endpoint support for the React app from streamlit.web.server.server import Server from streamlit.runtime.scriptrunner import get_script_run_ctx # Custom JSON encoder to handle Enum types and other non-serializable objects class EnhancedJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, Enum): return obj.value elif hasattr(obj, '__dict__'): # For objects that have a __dict__ but aren't directly serializable return {key: value for key, value in obj.__dict__.items() if not key.startswith('_')} elif hasattr(obj, 'model_dump'): # For Pydantic models return obj.model_dump() elif hasattr(obj, 'to_dict'): # For objects with to_dict method return obj.to_dict() # Let the base class handle other types or raise TypeError return super().default(obj) # Helper function to convert any result to JSON-serializable def make_serializable(obj): """Convert any object to a JSON-serializable form""" if isinstance(obj, dict): return {k: make_serializable(v) for k, v in obj.items()} elif isinstance(obj, list): return [make_serializable(item) for item in obj] elif isinstance(obj, Enum): return obj.value elif hasattr(obj, 'pages'): # Special case for OCRResponse objects which have pages attribute if hasattr(obj, '__dict__'): result = {k: make_serializable(v) for k, v in obj.__dict__.items() if not k.startswith('_')} # Explicitly handle pages attribute if hasattr(obj, 'pages'): result['pages'] = [make_serializable(page) for page in obj.pages] return result elif hasattr(obj, '__dict__'): # For objects with __dict__ 
attribute return {k: make_serializable(v) for k, v in obj.__dict__.items() if not k.startswith('_')} elif hasattr(obj, 'model_dump'): # For Pydantic models return make_serializable(obj.model_dump()) elif hasattr(obj, 'to_dict'): # For objects with to_dict method return make_serializable(obj.to_dict()) # Basic types will be returned as is return obj # API response handler def process_api_request(): """Handle API requests from the React frontend""" # Get the current Streamlit session ctx = get_script_run_ctx() if ctx is None: return session_id = ctx.session_id session_info = Server.get_current()._get_session_info(session_id) if session_info is None: return request = session_info.uploaded_file_mgr._uploaded_files.get('file') if not request: return # Extract file and parameters uploaded_file = request[0] use_vision = session_info.query_string.get('use_vision', ['true'])[0].lower() == 'true' try: # Process file result = process_file(uploaded_file, use_vision=use_vision) # Convert result to JSON-serializable format serializable_result = make_serializable(result) # Return JSON response return serializable_result except Exception as e: # Return error response return {"error": str(e)} try: from process_file import process_file as process_file_util # Use the utility function instead of the local function process_file = process_file_util except ImportError: # Define the process_file function if it's not available def process_file(uploaded_file, use_vision=True): """Process the uploaded file and return the OCR results""" # Save the uploaded file to a temporary file with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp: tmp.write(uploaded_file.getvalue()) temp_path = tmp.name try: # Initialize OCR processor processor = StructuredOCR() # Determine file type from extension file_ext = Path(uploaded_file.name).suffix.lower() file_type = "pdf" if file_ext == ".pdf" else "image" # Process the file result = processor.process_file(temp_path, 
file_type=file_type, use_vision=use_vision) # Add to processing history history_item = { "id": datetime.now().timestamp(), "fileName": uploaded_file.name, "timestamp": datetime.now().isoformat(), "result": result, "useVision": use_vision } if 'processing_history' not in st.session_state: st.session_state.processing_history = [] st.session_state.processing_history.append(history_item) return result except Exception as e: st.error(f"Error processing document: {str(e)}") return None finally: # Clean up the temporary file if os.path.exists(temp_path): os.unlink(temp_path) # Set page configuration st.set_page_config( page_title="Historical OCR Workshop", page_icon="📜", layout="wide", initial_sidebar_state="collapsed" # Start with sidebar collapsed for cleaner landing ) # Custom CSS to match React dark theme and improve Streamlit integration st.markdown(""" """, unsafe_allow_html=True) # Initialize session state for workshop progress if 'current_module' not in st.session_state: st.session_state.current_module = 1 if 'processing_history' not in st.session_state: st.session_state.processing_history = [] if 'workshop_started' not in st.session_state: st.session_state.workshop_started = False def navigate_to_module(module_number): """Navigate to a specific module""" st.session_state.current_module = module_number # Welcome/Start screen if workshop hasn't been started if not st.session_state.workshop_started: # Hero section with eye-catching design st.markdown("""

Historical OCR Workshop

Unlock the potential of historical documents with modern OCR technology

""", unsafe_allow_html=True) # Introduction with cleaner layout col1, col2 = st.columns([3, 2]) with col1: st.markdown("""

Workshop Overview

This interactive workshop explores the application of OCR technology to historical documents, combining theoretical understanding with practical experiences. Designed for historians, archivists, and digital humanities scholars, it offers both conceptual frameworks and hands-on skills.
""", unsafe_allow_html=True) st.markdown("""

What is OCR?

Optical Character Recognition (OCR) technology enables computers to extract text from images and documents. Modern OCR uses AI vision models to understand both the text and its visual context, making it powerful for historical research and digital humanities.
""", unsafe_allow_html=True) with col2: # Add an engaging research question to connect with historians st.markdown("""

For Historians:

How might OCR technology transform our access to and interpretation of historical documents? What new research questions become possible when large archives become machine-readable?
""", unsafe_allow_html=True) # Display sample historical document images in a better format input_dir = Path(__file__).parent / "input" sample_images = [ {"path": input_dir / "letter-1.jpg", "caption": "Historical Letter"}, {"path": input_dir / "recipe.jpg", "caption": "Historical Recipe"} ] # Try to find any of the sample images for sample in sample_images: if sample["path"].exists(): try: from PIL import Image with Image.open(sample["path"]) as img: # Add a better styled border and shadow st.markdown(f"""
""", unsafe_allow_html=True) st.image(img, caption=sample["caption"], width=280) st.markdown("""

Sample document for OCR analysis

""", unsafe_allow_html=True) break # Only show one image except Exception: pass # What you'll learn section combined with Workshop Modules in parallel columns col1, col2 = st.columns(2) with col1: st.markdown('

What You\'ll Learn

', unsafe_allow_html=True) st.markdown("""

Conceptual Understanding

- Text-image relationships in historical documents - Evolution of OCR technology - AI vision models for document analysis - Historical typography challenges
""", unsafe_allow_html=True) st.markdown("""

Methodological Approaches

- Critical frameworks for OCR in historical research - Hybrid computational-traditional methods - Error analysis and interpretation - Contextual reading strategies
""", unsafe_allow_html=True) st.markdown("""

Practical Skills

- Processing historical documents with OCR - Analyzing and structuring extracted information - Integrating OCR into research workflows - Building searchable archives
""", unsafe_allow_html=True) with col2: # Workshop modules with visually appealing cards st.markdown('

Workshop Modules

', unsafe_allow_html=True) # Add some styling for the module cards st.markdown(""" """, unsafe_allow_html=True) # Modules inside the col2 from earlier with col2: st.markdown("""
Module 1
Introduction and Problematization

Explore the challenges of historical document digitization and the potential of OCR technologies to transform historical research. Examine key problems and opportunities in historical OCR.

""", unsafe_allow_html=True) st.markdown("""
Module 2
Text-Image Relations in Historical Archives

Analyze the complex relationships between text and images in historical documents, from typography and layout to marginalia and decorative elements.

""", unsafe_allow_html=True) st.markdown("""
Module 3
OCR Technology and Historical Documents

Understand the evolution of OCR technology from pattern matching to modern AI vision-language models, and how they address the unique challenges of historical documents.

""", unsafe_allow_html=True) st.markdown("""
Module 4
Methodological Approaches

Develop hybrid methodologies that combine computational processing with traditional historical research practices, balancing distant and close reading.

""", unsafe_allow_html=True) st.markdown("""
Module 5
Interactive OCR Experiment

Gain hands-on experience processing historical documents with OCR technology, analyzing results, and comparing different approaches.

""", unsafe_allow_html=True) st.markdown("""
Module 6
Conclusion and Future Directions

Synthesize workshop insights and explore future directions for OCR in historical research, from large-scale corpus analysis to computational paleography.

""", unsafe_allow_html=True) # Engaging quote to inspire participation with citation - in a better styled container st.markdown("""
"

The digital turn in historical research is not just about converting analog to digital; it's about transforming how we access, analyze, and interpret the past.

— Dr. Jane Winters, Professor of Digital Humanities
""", unsafe_allow_html=True) # Feature highlight before call to action with better styling st.markdown("""

Workshop Highlights

Interactive Learning

Hands-on document processing with real-time feedback and analysis

Real Historical Documents

Work with authentic materials spanning different eras and formats

Vision AI Models

Experience state-of-the-art OCR technology powered by advanced AI

Research Applications

Learn to integrate OCR into historical research workflows

""", unsafe_allow_html=True) # Enhanced start button with dynamic styling and clear call to action st.markdown("""

Ready to Start Your Journey?

No installation required • Start immediately • Interactive experience

""", unsafe_allow_html=True) # Hidden button to trigger the workshop start col1, col2, col3 = st.columns([1, 1, 1]) with col2: if st.button("Begin Workshop", key="streamlit-button", use_container_width=True, type="primary"): st.session_state.workshop_started = True st.rerun() # Display workshop navigation sidebar only if workshop has started elif st.session_state.workshop_started: # Define input directory for images input_dir = Path(__file__).parent / "input" # Enhanced sidebar navigation with st.sidebar: st.markdown("

Workshop Navigation

", unsafe_allow_html=True) # Improved visual header with logo/image st.markdown("
", unsafe_allow_html=True) # Add a visual element with better sizing/styling workflow_path = input_dir / "workflow.jpg" if workflow_path.exists(): try: from PIL import Image with Image.open(workflow_path) as img: st.image(img, width=160, output_format="PNG") except Exception: pass st.markdown("
", unsafe_allow_html=True) # Show enhanced progress indicator current_module = st.session_state.current_module st.markdown(f"
Your Progress: Module {current_module} of 6
", unsafe_allow_html=True) st.progress(current_module / 6) # More visually appealing module navigation modules = [ "Introduction", "Text-Image Relations", "OCR Technology", "Methodological Approaches", "Interactive OCR Experiment", "Conclusion" ] # Custom styling for navigation buttons st.markdown(""" """, unsafe_allow_html=True) # Group into clearer sections st.markdown("", unsafe_allow_html=True) for i in range(1, 4): # Modules 1-3 active_class = "active" if i == current_module else "" st.markdown(f""" """, unsafe_allow_html=True) # Hidden button to handle the click if st.button(f"{i}", key=f"nav_{i}"): navigate_to_module(i) st.rerun() st.markdown("", unsafe_allow_html=True) for i in range(4, 7): # Modules 4-6 active_class = "active" if i == current_module else "" st.markdown(f""" """, unsafe_allow_html=True) # Hidden button to handle the click if st.button(f"{i}", key=f"nav_{i}"): navigate_to_module(i) st.rerun() st.markdown("---") # Enhanced quick jump button st.markdown(""" """, unsafe_allow_html=True) st.markdown("""
📊 Jump to OCR Experiment
""", unsafe_allow_html=True) # Hidden button for jump if st.button("Jump to Experiment", key="jump_exp"): navigate_to_module(5) st.rerun() # Workshop information in a cleaner collapsible section with st.expander("About the Workshop"): st.markdown(""" This interactive workshop explores OCR technology for historical documents. **How to use this workshop:** 1. Navigate through modules sequentially 2. Expand content sections to read more 3. Try the interactive OCR experiment 4. Reflect on research questions For help or more information, use the reference materials in Module 6. """) # Enhanced progress tracking if st.session_state.processing_history: with st.expander("Your Activity"): st.markdown(f"Documents processed: {len(st.session_state.processing_history)}", unsafe_allow_html=True) # Show the most recent document processed with better formatting latest = st.session_state.processing_history[-1] st.markdown(f"""
Latest document: {latest['fileName']}
Processed with {' vision model' if latest['useVision'] else ' basic OCR'}
""", unsafe_allow_html=True) # Main content based on current module if st.session_state.current_module == 1: # MODULE 1: Introduction st.title("Module 1: Introduction and Problematization") col1, col2 = st.columns([2, 1]) with col1: st.markdown(""" ## Historical OCR Workshop ### The Problem Historical documents present unique challenges for OCR technology: - Varying typography and handwriting styles - Document degradation and damage - Complex layouts and formatting - Multiple languages and archaic terminology - Illustrations and decorative elements """) with col2: st.markdown(""" ### Workshop Goals By the end of this workshop, you will: 1. Understand text-image relationships in historical archives 2. Learn about advanced OCR technology 3. Explore methodological approaches 4. Gain hands-on experience with OCR tools 5. Develop research integration strategies """) # Next button st.button("Next: Text-Image Relations", key="next_to_2", on_click=navigate_to_module, args=(2,)) elif st.session_state.current_module == 2: # MODULE 2: Text-Image Relations st.title("Module 2: Text-Image Relations in Historical Archives") col1, col2 = st.columns([1, 1]) with col1: st.markdown(""" ### Textual Elements - **Typography**: Varying fonts, sizes, and styles - **Layout**: Columns, margins, and spacing - **Marginalia**: Notes, comments, and additions - **Decorative Text**: Illuminated letters and calligraphy """) st.markdown(""" ### Visual Elements - **Illustrations**: Diagrams, maps, and artistic representations - **Watermarks**: Hidden identifiers that locate documents - **Damage**: Tears, stains, and fading affecting legibility - **Material Features**: Paper quality and physical dimensions """) with col2: st.markdown(""" ### Interdependence The relationship between text and image in historical documents exists on a complex spectrum: - Text functions as image (decorative headings) - Images function as text (symbolic representations) - Layout creates meaning through visual organization 
- Material conditions affect both textual and visual elements """) st.image("https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Book_of_Kells_folio_292r.jpg/800px-Book_of_Kells_folio_292r.jpg", caption="Book of Kells - Example of text-image integration") st.markdown(""" ### OCR Challenges These complex text-image relationships create particular challenges for OCR: 1. **Distinguishing Text from Decoration**: Where does ornamental text end and functional text begin? 2. **Handling Illustrations**: Should they be processed as images or described as text? 3. **Interpreting Layout**: How to capture the significance of spacing and organization? 4. **Preserving Context**: Maintaining the relationship between textual and visual elements """) # Navigation buttons col1, col2 = st.columns(2) with col1: st.button("Previous: Introduction", key="prev_to_1", on_click=navigate_to_module, args=(1,)) with col2: st.button("Next: OCR Technology", key="next_to_3", on_click=navigate_to_module, args=(3,)) elif st.session_state.current_module == 3: # MODULE 3: OCR Technology st.title("Module 3: OCR Technology and Historical Documents") col1, col2 = st.columns([1, 1]) with col1: st.markdown(""" ### Traditional OCR Approaches 1. **Pattern Matching**: Early OCR compared characters to templates 2. **Feature Extraction**: Identifying key features of characters 3. **Statistical Models**: Using probabilities to improve recognition """) st.markdown(""" ### Modern AI-Enhanced OCR 1. **Neural Networks**: Deep learning models trained on vast datasets 2. **Computer Vision**: Advanced image processing techniques 3. **Language Models**: Contextual understanding to resolve ambiguities 4. 
**Multimodal Models**: Integration of text, layout, and visual understanding """) with col2: st.markdown(""" ### Challenges with Historical Documents Historical materials present unique difficulties: - **Typography Variation**: Non-standardized fonts and styles - **Historical Language**: Archaic vocabulary and grammar - **Layout Complexity**: Non-linear arrangements - **Document Degradation**: Fading, tears, stains, and damage - **Material Artifacts**: Paper texture, binding shadows, etc. """) st.image("https://cdn.dribbble.com/users/412119/screenshots/16353886/media/82e593c60a5e4d460db917236eab6ece.jpg", caption="OCR processing layers") # Display processing history if available if st.session_state.processing_history: with st.expander("Your OCR Processing History"): st.markdown("You've already processed the following documents:") for item in st.session_state.processing_history: st.markdown(f"**{item['fileName']}** - {datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')}") col1, col2 = st.columns(2) with col1: st.write(f"**Topics:** {', '.join(item['result'].get('topics', ['Unknown']))}") with col2: st.write(f"**Vision model used:** {'Yes' if item['useVision'] else 'No'}") # Quick link to experiment st.button("Jump to OCR Experiment", key="jump_to_5", on_click=navigate_to_module, args=(5,)) # Navigation buttons col1, col2 = st.columns(2) with col1: st.button("Previous: Text-Image Relations", key="prev_to_2", on_click=navigate_to_module, args=(2,)) with col2: st.button("Next: Methodological Approaches", key="next_to_4", on_click=navigate_to_module, args=(4,)) elif st.session_state.current_module == 4: # MODULE 4: Methodological Approaches st.title("Module 4: Methodological Approaches") col1, col2 = st.columns([1, 1]) with col1: st.markdown(""" ### Hybrid Methodologies 1. **Computational + Human Reading** - OCR for initial processing and discovery - Human review for context and interpretation - Iterative refinement of computational outputs 2. 
**Close + Distant Reading** - Distant reading through large-scale OCR processing - Close reading of selected passages - Zooming between scales of analysis """) # Reference to diagram.jpg input_dir = Path(__file__).parent / "input" diagram_path = input_dir / "diagram.jpg" if diagram_path.exists(): try: # Load image file directly from disk from PIL import Image with Image.open(diagram_path) as img: st.image(img, caption="Historical VLM architecture", use_column_width=True) except Exception: pass with col2: st.markdown(""" ### Mistral-OCR-Latest: State-of-the-Art The Mistral-OCR model represents a significant advancement: - **Multimodal Understanding**: Processes both visual and textual information - **Contextual Awareness**: Considers historical context - **Layout Recognition**: Preserves complex document structures - **Historical Font Adaptation**: Trained on diverse historical typography """) # Reference to workflow.jpg workflow_path = input_dir / "workflow.jpg" if workflow_path.exists(): try: # Load image file directly from disk from PIL import Image with Image.open(workflow_path) as img: st.image(img, caption="Mistral OCR workflow", use_column_width=True) except Exception: pass st.markdown(""" ### Practical Workflow A typical historical OCR workflow with Mistral-OCR includes: 1. **Selection**: Choosing appropriate documents 2. **Preprocessing**: Enhancing images before OCR 3. **OCR Processing**: Running documents through vision-enhanced OCR 4. **Post-processing**: Cleaning up outputs and structured extraction 5. **Verification**: Cross-checking results against originals 6. 
**Integration**: Incorporating OCR outputs into research materials """) # Navigation buttons col1, col2 = st.columns(2) with col1: st.button("Previous: OCR Technology", key="prev_to_3", on_click=navigate_to_module, args=(3,)) with col2: st.button("Next: Interactive OCR Experiment", key="next_to_5", on_click=navigate_to_module, args=(5,)) elif st.session_state.current_module == 5: # MODULE 5: Interactive OCR Experiment st.title("Module 5: Interactive OCR Experiment") # More modular design with sequenced steps st.markdown("""
This interactive module allows you to process historical documents with OCR and analyze the results. Follow the sequenced steps below to experiment with historical document analysis.
""", unsafe_allow_html=True) # Tabbed interface for different activities experiment_tab, compare_tab, analyze_tab = st.tabs(["Process Documents", "Compare Results", "Analysis Guide"]) with experiment_tab: # Import additional libraries for enhanced functionality try: from pdf2image import convert_from_bytes pdf_support = True except ImportError: pdf_support = False st.warning("PDF preview functionality is limited. The pdf2image module is required for PDF previews.") # OCR tool in a compact layout col1, col2 = st.columns([1, 1]) with col1: st.markdown('
', unsafe_allow_html=True) st.markdown("

Step 1: Select Document & Options

", unsafe_allow_html=True) # Processing options use_vision = st.checkbox("Use Vision Model", value=True, help="Use the vision model for improved analysis") # Additional prompt for the model st.markdown("### Custom Research Prompt (Optional)") st.markdown("""Provide additional instructions to guide the OCR analysis. Focus on specific aspects of historical research you're interested in.""") custom_prompt = st.text_area("Research Prompt", placeholder="E.g., Focus on identifying dates and historical figures; Analyze the writing style for period-specific terminology; Highlight any cultural or social indicators of the time period...", help="Optional instructions to guide the analysis of the historical document") # Example preset documents selection input_dir = Path(__file__).parent / "input" if input_dir.exists(): sample_files = list(input_dir.glob("*.jpg")) + list(input_dir.glob("*.png")) + list(input_dir.glob("*.pdf")) if sample_files: st.markdown("#### Sample Documents") sample_options = ["Upload my own document"] + [f.name for f in sample_files] sample_choice = st.selectbox("Choose a document:", sample_options) if sample_choice != "Upload my own document": selected_file = next((f for f in sample_files if f.name == sample_choice), None) if selected_file: # Store the selected sample file in session state with open(selected_file, "rb") as f: file_bytes = f.read() st.session_state.sample_file = { "name": selected_file.name, "bytes": file_bytes } # Preview the selected sample if selected_file.suffix.lower() == ".pdf" and pdf_support: try: with st.spinner("Generating PDF preview..."): images = convert_from_bytes(file_bytes, first_page=1, last_page=1, dpi=150) if images: # Convert PIL image to bytes for Streamlit first_page = images[0] img_bytes = io.BytesIO() first_page.save(img_bytes, format='JPEG') img_bytes.seek(0) st.image(img_bytes, caption=f"Preview: {selected_file.name}", use_container_width=True) except Exception: st.info(f"PDF selected: {selected_file.name}") 
else: # For images display directly try: from PIL import Image img = Image.open(BytesIO(file_bytes)) st.image(img, caption=f"Preview: {selected_file.name}", use_container_width=True) except Exception: st.info(f"Selected: {selected_file.name}") else: # Clear the sample file if "Upload my own" is selected if 'sample_file' in st.session_state: del st.session_state.sample_file # File uploader with styling matched to React theme st.markdown('
', unsafe_allow_html=True) uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed") if uploaded_file is None: st.markdown("### Upload a document to get started") st.markdown("Supported formats: PDF, JPG, PNG") else: # Display the uploaded file file_ext = Path(uploaded_file.name).suffix.lower() if file_ext == ".pdf" and pdf_support: try: # Convert first page of PDF to image for preview pdf_bytes = uploaded_file.getvalue() with st.spinner("Generating PDF preview..."): images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150) if images: # Convert PIL image to bytes for Streamlit first_page = images[0] img_bytes = io.BytesIO() first_page.save(img_bytes, format='JPEG') img_bytes.seek(0) # Display the PDF preview st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True) else: st.info(f"PDF uploaded: {uploaded_file.name}") except Exception: # Simply show the file name without an error message st.info(f"PDF uploaded: {uploaded_file.name}") elif file_ext != ".pdf": st.image(uploaded_file, use_container_width=True) else: st.info(f"PDF uploaded: {uploaded_file.name}") st.markdown('
', unsafe_allow_html=True) else: # No sample files found, just show the uploader st.markdown('
', unsafe_allow_html=True) uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed") if uploaded_file is None: st.markdown("### Upload a document to get started") st.markdown("Supported formats: PDF, JPG, PNG") else: # Display the uploaded file preview file_ext = Path(uploaded_file.name).suffix.lower() if file_ext == ".pdf" and pdf_support: try: # PDF preview logic pdf_bytes = uploaded_file.getvalue() with st.spinner("Generating PDF preview..."): images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150) if images: first_page = images[0] img_bytes = io.BytesIO() first_page.save(img_bytes, format='JPEG') img_bytes.seek(0) st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", use_container_width=True) else: st.info(f"PDF uploaded: {uploaded_file.name}") except Exception: st.info(f"PDF uploaded: {uploaded_file.name}") elif file_ext != ".pdf": st.image(uploaded_file, use_container_width=True) else: st.info(f"PDF uploaded: {uploaded_file.name}") st.markdown('
', unsafe_allow_html=True) else: # Input directory doesn't exist, just show the uploader st.markdown('
', unsafe_allow_html=True) uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], label_visibility="collapsed") if uploaded_file is None: st.markdown("### Upload a document to get started") st.markdown("Supported formats: PDF, JPG, PNG") st.markdown('
', unsafe_allow_html=True) # Step 2: Process document st.subheader("Step 2: Process the Document") # Get the file to process (either uploaded or sample) file_to_process = None if 'sample_file' in st.session_state and sample_choice != "Upload my own document": # Create a FileUploader-like object from the sample file class SampleFileObject: def __init__(self, name, data): self.name = name self._data = data def getvalue(self): return self._data file_to_process = SampleFileObject( st.session_state.sample_file["name"], st.session_state.sample_file["bytes"] ) elif 'uploaded_file' in locals() and uploaded_file is not None: file_to_process = uploaded_file # Process button (disabled if no file selected) process_button = st.button( "Process Document", disabled=file_to_process is None, use_container_width=True ) if process_button and file_to_process is not None: with st.spinner("Processing document..."): try: # Process the file result = process_file(file_to_process, use_vision, custom_prompt=custom_prompt if custom_prompt else None) if result: st.success("Document processed successfully!") # Store result in session state for display in the right column st.session_state.current_result = result st.rerun() # Refresh to show result else: st.error("Failed to process document.") except Exception as e: st.error(f"Error processing document: {str(e)}") st.markdown('
', unsafe_allow_html=True) # Experiment instructions in a compact format st.markdown('
', unsafe_allow_html=True) st.markdown("

Experiment Instructions

", unsafe_allow_html=True) st.markdown(""" 1. **Step 1:** Select a document and choose your options 2. **Step 2:** Process the document with the selected options 3. **Step 3:** Analyze the results in the panel on the right 4. **Step 4:** Try again with different settings (e.g., toggle vision model) 5. **Step 5:** Compare results between different runs """) st.markdown('
', unsafe_allow_html=True) with col2: st.markdown('
', unsafe_allow_html=True) st.markdown("

Step 3: View Results

", unsafe_allow_html=True) if 'current_result' in st.session_state and st.session_state.current_result: result = st.session_state.current_result # File info in a compact format st.markdown(f"**File:** {result.get('file_name', 'Unknown')}") # Horizontal display of metadata col1, col2 = st.columns(2) with col1: if 'languages' in result and result['languages']: languages = [lang for lang in result['languages'] if lang] if languages: st.markdown(f"**Languages:** {', '.join(languages)}") with col2: if 'topics' in result and result['topics']: st.markdown(f"**Topics:** {', '.join(result['topics'])}") # Create tabs for different views with inline styling to ensure visibility tab1, tab2 = st.tabs(["Structured View", "Raw JSON"]) st.markdown(""" """, unsafe_allow_html=True) with tab1: # Display in a more user-friendly format based on the content structure if 'ocr_contents' in result: if isinstance(result['ocr_contents'], dict): for section, content in result['ocr_contents'].items(): if content: # Only display non-empty sections st.markdown(f"

{section.replace('_', ' ').title()}

", unsafe_allow_html=True) if isinstance(content, str): st.markdown(f"

{content}

", unsafe_allow_html=True) elif isinstance(content, list): for item in content: if isinstance(item, str): st.markdown(f"

- {item}

", unsafe_allow_html=True) elif isinstance(item, dict): st.markdown("
", unsafe_allow_html=True) st.json(item) st.markdown("
", unsafe_allow_html=True) elif isinstance(content, dict): for k, v in content.items(): st.markdown(f"

{k}: {v}

", unsafe_allow_html=True) with tab2: # Show the raw JSON for developers # Convert to serializable format first serializable_result = make_serializable(result) st.json(serializable_result) # Download options st.markdown("### Export Results") col1, col2 = st.columns(2) with col1: # Export as JSON (using the serializable converter) serializable_result = make_serializable(result) json_bytes = json.dumps(serializable_result, indent=2).encode() st.download_button( label="Download JSON", data=json_bytes, file_name="ocr_results.json", mime="application/json", use_container_width=True ) with col2: # Export as text if 'ocr_contents' in result and isinstance(result['ocr_contents'], dict) and 'content' in result['ocr_contents']: text_content = result['ocr_contents']['content'] st.download_button( label="Download Text", data=text_content.encode(), file_name="ocr_text.txt", mime="text/plain", use_container_width=True ) else: st.markdown("""

Results will appear here

Upload and process a document to see the OCR results in this panel.

The OCR tool will:

  1. Extract text from your document
  2. Identify languages and topics
  3. Provide structured content analysis
  4. Generate downloadable results
""", unsafe_allow_html=True) st.markdown('
# --- Processing history --------------------------------------------------
# NOTE(review): wrapper HTML for this panel was stripped during extraction;
# the empty-string markdown calls stand in for the lost container markup.
if st.session_state.processing_history:
    st.markdown('', unsafe_allow_html=True)
    st.markdown("""
Step 4: Review Processing History
""", unsafe_allow_html=True)

    # Summary of the most recent result.
    latest = st.session_state.processing_history[-1]
    st.markdown(f"**Latest Document:** {latest['fileName']}")
    st.markdown(f"**Processed at:** "
                f"{datetime.fromisoformat(latest['timestamp']).strftime('%Y-%m-%d %H:%M')}")
    st.markdown(f"**Vision model used:** {'Yes' if latest['useVision'] else 'No'}")

    # Complete history, newest first, inside an expander.
    with st.expander("View Complete Processing History"):
        for i, item in enumerate(reversed(st.session_state.processing_history)):
            stamp = datetime.fromisoformat(item['timestamp']).strftime('%Y-%m-%d %H:%M')
            st.markdown(f"""
{item['fileName']}
{stamp} - Vision model: {'Yes' if item['useVision'] else 'No'}
""", unsafe_allow_html=True)
            # Let the user re-open any previous result.
            if st.button("View This Result", key=f"view_history_{i}"):
                st.session_state.current_result = item['result']
                st.rerun()
    st.markdown('', unsafe_allow_html=True)
# --- Compare tab: side-by-side comparison --------------------------------
# NOTE(review): reconstructed from a mangled source; HTML wrapper markup in
# the unsafe_allow_html markdown calls was lost during extraction.
with compare_tab:
    st.subheader("Compare OCR Results")
    if len(st.session_state.processing_history) >= 2:
        st.markdown("""
Select two processing results to compare side by side. This allows you to see how different options (like using the vision model) affect OCR quality.
""")

        # Selection dropdowns for the two documents.
        col1, col2 = st.columns(2)
        with col1:
            doc_options_1 = [
                f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
                for i, item in enumerate(st.session_state.processing_history)
            ]
            doc_choice_1 = st.selectbox("First Document:", doc_options_1,
                                        key="compare_doc_1")
            # Option labels start with "N:" — recover the zero-based index.
            doc_index_1 = int(doc_choice_1.split(":")[0]) - 1
        with col2:
            doc_options_2 = [
                f"{i+1}: {item['fileName']} ({'Vision' if item['useVision'] else 'No Vision'})"
                for i, item in enumerate(st.session_state.processing_history)
            ]
            # Default to the second item when one exists.
            default_index = min(1, len(st.session_state.processing_history) - 1)
            doc_choice_2 = st.selectbox("Second Document:", doc_options_2,
                                        key="compare_doc_2", index=default_index)
            doc_index_2 = int(doc_choice_2.split(":")[0]) - 1

        # Fetch the two selected history entries.
        doc1 = st.session_state.processing_history[doc_index_1]
        doc2 = st.session_state.processing_history[doc_index_2]

        # Side-by-side rendering.
        col1, col2 = st.columns(2)
        with col1:
            st.markdown(f"### Document 1: {doc1['fileName']}")
            st.markdown(f"**Processed at:** "
                        f"{datetime.fromisoformat(doc1['timestamp']).strftime('%Y-%m-%d %H:%M')}")
            st.markdown(f"**Vision model used:** {'Yes' if doc1['useVision'] else 'No'}")
            if ('ocr_contents' in doc1['result']
                    and isinstance(doc1['result']['ocr_contents'], dict)
                    and 'content' in doc1['result']['ocr_contents']):
                content = doc1['result']['ocr_contents']['content']
                # First 500 characters with word wrap.
                st.markdown(f"""
{content[:500]}{'...' if len(content) > 500 else ''}
""", unsafe_allow_html=True)
        with col2:
            st.markdown(f"### Document 2: {doc2['fileName']}")
            st.markdown(f"**Processed at:** "
                        f"{datetime.fromisoformat(doc2['timestamp']).strftime('%Y-%m-%d %H:%M')}")
            st.markdown(f"**Vision model used:** {'Yes' if doc2['useVision'] else 'No'}")
            if ('ocr_contents' in doc2['result']
                    and isinstance(doc2['result']['ocr_contents'], dict)
                    and 'content' in doc2['result']['ocr_contents']):
                content = doc2['result']['ocr_contents']['content']
                # First 500 characters with word wrap.
                st.markdown(f"""
{content[:500]}{'...' if len(content) > 500 else ''}
""", unsafe_allow_html=True)

        # Highlight when the user is comparing vision vs. non-vision runs of
        # the same document.
        if doc1['fileName'] == doc2['fileName'] and doc1['useVision'] != doc2['useVision']:
            st.markdown("""
Vision vs. Non-Vision Model Comparison

You're comparing the same document processed with different models. This is an excellent way to evaluate the impact of vision capabilities on OCR accuracy.

Look for these differences:

""", unsafe_allow_html=True)
    else:
        st.markdown("""
Need More Documents to Compare

Process at least two documents to enable side-by-side comparison. Try processing the same document with and without the vision model to see the differences in OCR quality.

""", unsafe_allow_html=True)
""", unsafe_allow_html=True) # Analysis tab for guidance on working with OCR results with analyze_tab: st.subheader("Analysis Guide") st.markdown("""

How to Analyze OCR Results

This guide helps you assess the quality and usefulness of OCR output for historical research.

""", unsafe_allow_html=True) st.markdown(""" ### Evaluating OCR Quality When analyzing OCR results from historical documents, consider these key factors: 1. **Text Accuracy** - Check for common OCR errors (e.g., mistaking "e" for "c", "l" for "1") - Assess recognition of period-specific typography and writing styles - Evaluate handling of degraded or damaged text areas 2. **Structure Preservation** - Does the OCR maintain paragraph and section breaks? - Are columns and tabular data correctly preserved? - How well are page transitions handled? 3. **Special Elements** - Recognition of footnotes, marginalia, and annotations - Handling of illustrations, diagrams, and decorative elements - Treatment of watermarks, signatures, and stamps 4. **Metadata Extraction** - Accuracy of detected languages, topics, and document type - Identification of dates, names, and key entities - Recognition of document purpose and context """) col1, col2 = st.columns(2) with col1: st.markdown(""" ### Common OCR Challenges in Historical Documents - **Typography Variations**: Historical fonts and writing styles that differ from modern text - **Material Degradation**: Fading, stains, tears, and other damage affecting legibility - **Handwritten Elements**: Marginalia, signatures, and handwritten annotations - **Complex Layouts**: Multi-column formats, non-linear reading order, and decorative elements - **Language and Terminology**: Archaic terms, specialized vocabulary, and multilingual content """) with col2: st.markdown(""" ### Making the Most of OCR Results - **Contextual Reading**: Use historical context to interpret unclear passages - **Error Patterns**: Identify and mentally correct for systematic OCR errors - **Hybrid Analysis**: Combine OCR-based search with close reading of original images - **Comparative Processing**: Try different OCR settings and models on the same document - **Iterative Refinement**: Use insights from each document to improve future processing """) st.markdown(""" 
### Research Integration Once you've obtained and analyzed OCR results from historical documents, consider these approaches for integrating them into your research: 1. **Digital Corpus Building**: Create searchable collections of processed texts 2. **Computational Analysis**: Apply text mining, topic modeling, or network analysis 3. **Cross-Document Linking**: Identify connections across multiple sources 4. **Annotation and Enrichment**: Add context, translations, or explanatory notes 5. **Collaborative Research**: Share processed texts with other researchers Remember that OCR is a tool to assist your research, not replace careful reading and analysis. The most effective approaches combine computational methods with traditional historical research practices. """) # Example of what to look for if st.session_state.processing_history: with st.expander("Example Analysis from Your Documents"): # Pick the latest document latest = st.session_state.processing_history[-1] st.markdown(f""" #### Sample Analysis for: {latest['fileName']} **Document Context:** - Languages: {', '.join(latest['result'].get('languages', ['Unknown']))} - Topics: {', '.join(latest['result'].get('topics', ['Unknown']))} - Vision model used: {'Yes' if latest['useVision'] else 'No'} **What to Look For:** 1. Check how well the model identified key topics and languages 2. Evaluate the completeness of extracted text 3. Note any systematic errors in text recognition 4. Assess how well document structure was preserved """) # Navigation buttons col1, col2 = st.columns(2) with col1: st.button("Previous: Methodological Approaches", key="prev_to_4", on_click=navigate_to_module, args=(4,)) with col2: st.button("Next: Conclusion", key="next_to_6", on_click=navigate_to_module, args=(6,)) else: # Module 6 # MODULE 6: Conclusion st.title("Module 6: Conclusion and Future Directions") col1, col2 = st.columns([3, 2]) with col1: st.markdown(""" ### Workshop Summary Throughout this workshop, we've explored: 1. 
**Text-Image Interdependence**: The complex relationship between textual and visual elements 2. **OCR Technology**: The evolution of OCR and its application to historical materials 3. **Methodological Approaches**: Hybrid strategies for working with historical texts 4. **Practical Application**: Hands-on experience with OCR processing tools """) st.markdown(""" ### Key Takeaways 1. **OCR is Not Perfect**: Even advanced AI models face challenges with historical documents 2. **Context Matters**: Vision-enhanced models provide better results by understanding document context 3. **Hybrid Approaches**: Combining computational methods with traditional research yields best results 4. **Critical Evaluation**: Always evaluate OCR outputs with awareness of limitations 5. **Structured Extraction**: Modern OCR goes beyond text recognition to understand document structure """) with col2: # Display statistics if there's processing history if st.session_state.processing_history: st.subheader("Your Workshop Statistics") # Calculate statistics total_docs = len(st.session_state.processing_history) vision_docs = len([item for item in st.session_state.processing_history if item['useVision']]) non_vision_docs = total_docs - vision_docs # Create metrics for statistics col1, col2 = st.columns(2) with col1: st.metric("Documents Processed", total_docs) st.metric("With Vision Model", vision_docs) with col2: st.metric("Without Vision Model", non_vision_docs) # Topics word cloud if total_docs > 0: st.subheader("Topics Encountered") all_topics = [] for item in st.session_state.processing_history: if 'topics' in item['result']: all_topics.extend(item['result']['topics']) if all_topics: # Count topic frequencies topic_counts = {} for topic in all_topics: if topic in topic_counts: topic_counts[topic] += 1 else: topic_counts[topic] = 1 # Display as a horizontal bar chart st.bar_chart(topic_counts) st.subheader("Future Directions") col1, col2 = st.columns(2) with col1: st.markdown(""" ### 
Technological Developments - **Multimodal AI models**: Increasingly sophisticated understanding - **Historical font training**: Models trained on historical typography - **Document intelligence**: Enhanced understanding of structures - **Collaborative correction**: Platforms for collective improvement """) with col2: st.markdown(""" ### Research Applications - **Large-scale corpus analysis**: Processing entire archives - **Multilingual historical research**: Working across languages - **Image-text integration**: New methodologies for visual analysis - **Computational paleography**: AI-assisted handwriting analysis """) st.markdown(""" ### Additional Resources - **[Mistral AI Documentation](https://docs.mistral.ai/)**: Learn more about the OCR models used in this workshop - **[Transkribus](https://readcoop.eu/transkribus/)**: Platform for historical document transcription - **[OCR-D](https://ocr-d.de/en/)**: Coordinated OCR research project for historical documents - **[Historical OCR Research Papers](https://scholar.google.com/scholar?q=historical+OCR)**: Academic research on historical OCR """) # Reset button to start over if st.button("Start Workshop Again", key="reset_workshop", use_container_width=True): st.session_state.current_module = 1 st.rerun() # Handle API requests if the URL contains /api/process if 'api/process' in st.query_params.get('', ''): # Process the API request result = process_api_request() if result: # Return the result as JSON # Make sure result is serializable serializable_result = make_serializable(result) st.json(serializable_result) else: st.json({"error": "Invalid request"})