historical-ocr / app.py
milwright's picture
Show preprocessing preview only when substantive options are selected
793fc87
import os
import streamlit as st
import json
import sys
import time
from pathlib import Path
import tempfile
import io
from pdf2image import convert_from_bytes
from PIL import Image, ImageEnhance, ImageFilter, UnidentifiedImageError
import PIL
import cv2
import numpy as np
# Import the StructuredOCR class and config from the local files
from structured_ocr import StructuredOCR
from config import MISTRAL_API_KEY
# Optional UI layout helper: UI_LAYOUT_AVAILABLE records whether the import
# of ui.layout succeeded.
try:
    from ui.layout import tool_container
except ImportError:
    UI_LAYOUT_AVAILABLE = False
else:
    UI_LAYOUT_AVAILABLE = True
# Page-level Streamlit settings, gathered in one mapping and applied in a
# single call.
_PAGE_SETTINGS = {
    "page_title": "Historical OCR",
    "page_icon": "πŸ“œ",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)
# Cached for one hour (ttl=3600) so identical PDF bytes are not re-rendered.
@st.cache_data(ttl=3600, show_spinner=False)
def convert_pdf_to_images(pdf_bytes, dpi=150):
    """Render PDF bytes into a list of page images.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Resolution passed to ``convert_from_bytes``.

    Returns:
        The list produced by ``convert_from_bytes``, or an empty list when
        the conversion raises (the error is shown with ``st.error``).
    """
    try:
        pages = convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as exc:
        st.error(f"Error converting PDF: {str(exc)}")
        return []
    return pages
def safe_open_image(image_bytes):
    """Open raw image bytes with PIL without ever raising.

    Args:
        image_bytes: Encoded image data.

    Returns:
        The object returned by ``Image.open``, or ``None`` if opening the
        bytes raised any exception.
    """
    try:
        buffer = io.BytesIO(image_bytes)
        opened = Image.open(buffer)
    except Exception:
        # Callers treat None as "no preview / nothing to process".
        opened = None
    return opened
@st.cache_data(ttl=3600, show_spinner=False)
def preprocess_image(image_bytes, preprocessing_options):
    """Apply the selected preprocessing steps to an encoded image.

    Args:
        image_bytes: Encoded image data.
        preprocessing_options: Mapping read with ``.get`` for the keys
            ``rotation``, ``grayscale``, ``contrast``, ``denoise`` and
            ``threshold``.

    Returns:
        JPEG-encoded bytes of the processed image. If the image cannot be
        opened, or an unexpected error occurs, the input bytes are returned
        unchanged. If the array conversion or an OpenCV/PIL step fails, the
        current PIL image is returned as JPEG instead.
    """

    def encode_jpeg(pil_image):
        # Shared by the success path and both intermediate fallbacks.
        buffer = io.BytesIO()
        pil_image.save(buffer, format='JPEG')
        buffer.seek(0)
        return buffer.getvalue()

    # Fixed rotations, mapped to the name of the PIL transpose constant.
    # The constant is looked up only for the option that actually matches.
    fixed_rotations = (
        ("Rotate 90Β° clockwise", "ROTATE_270"),
        ("Rotate 90Β° counterclockwise", "ROTATE_90"),
        ("Rotate 180Β°", "ROTATE_180"),
    )

    try:
        image = safe_open_image(image_bytes)
        if image is None:
            return image_bytes

        # Normalise to plain RGB for OpenCV; RGBA is flattened onto white.
        if image.mode == 'RGBA':
            background = Image.new('RGB', image.size, (255, 255, 255))
            background.paste(image, mask=image.split()[3])  # band 3 is alpha
            image = background
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        rotation_option = preprocessing_options.get("rotation", "None")
        transpose_name = next(
            (name for label, name in fixed_rotations if rotation_option == label),
            None,
        )
        if transpose_name is not None:
            image = image.transpose(getattr(Image, transpose_name))
        elif rotation_option == "Auto-detect":
            # Heuristic: a clearly landscape image is turned to portrait.
            width, height = image.size
            if width > height and (width / height) > 1.5:
                image = image.transpose(Image.ROTATE_90)

        try:
            img_array = np.array(image)
        except Exception:
            return encode_jpeg(image)

        try:
            if preprocessing_options.get("grayscale", False):
                # Drop colour, then go back to 3 channels for later steps.
                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

            contrast_level = preprocessing_options.get("contrast", 0)
            if contrast_level != 0:
                contrast_factor = 1 + (contrast_level / 10)
                image = Image.fromarray(img_array)
                image = ImageEnhance.Contrast(image).enhance(contrast_factor)
                img_array = np.array(image)

            if preprocessing_options.get("denoise", False):
                is_three_channel = (
                    len(img_array.shape) == 3 and img_array.shape[2] == 3
                )
                if not is_three_channel and len(img_array.shape) == 2:
                    img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
                img_array = cv2.fastNlMeansDenoisingColored(
                    img_array, None, 10, 10, 7, 21
                )

            if preprocessing_options.get("threshold", False):
                gray = (
                    cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                    if len(img_array.shape) == 3
                    else img_array
                )
                binary = cv2.adaptiveThreshold(
                    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                    cv2.THRESH_BINARY, 11, 2,
                )
                img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
        except Exception:
            # A processing step failed: fall back to the current PIL image.
            return encode_jpeg(image)

        try:
            return encode_jpeg(Image.fromarray(img_array))
        except Exception:
            return image_bytes
    except Exception:
        # Last resort: hand back exactly what was passed in.
        return image_bytes
# Define functions

# Help text shown when no API key is configured.
_MISSING_API_KEY_HELP = """
**How to add your API key:**
For Hugging Face Spaces:
1. Go to your Space settings
2. Add a secret named `MISTRAL_API_KEY` with your API key value
For local development:
1. Add to your shell: `export MISTRAL_API_KEY=your_key_here`
2. Or create a `.env` file with `MISTRAL_API_KEY=your_key_here`
"""


def _has_active_preprocessing(preprocessing_options):
    """Return True when at least one option would actually alter the image."""
    toggles = ("grayscale", "threshold", "denoise")
    if any(preprocessing_options.get(name, False) for name in toggles):
        return True
    if preprocessing_options.get("contrast", 0) != 0:
        return True
    # The UI stores "no rotation" as the (truthy) string "None".
    return preprocessing_options.get("rotation", "None") not in (None, "None")


def _suffix_for_image_bytes(image_bytes, default_suffix):
    """Pick a file suffix matching the encoded bytes (JPEG/PNG signatures)."""
    if image_bytes[:3] == b"\xff\xd8\xff":
        return ".jpg"
    if image_bytes[:8] == b"\x89PNG\r\n\x1a\n":
        return ".png"
    return default_suffix


def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
    """Process the uploaded file and return the OCR results.

    The upload is written to a temporary file. For image uploads with at
    least one active preprocessing option, it is preprocessed into a second
    temporary file. The file is then handed to ``StructuredOCR.process_file``.
    Every temporary file created here is removed before returning.

    Args:
        uploaded_file: The uploaded file to process; ``.name`` and
            ``.getvalue()`` are used.
        use_vision: Whether to use vision model.
        preprocessing_options: Dictionary of preprocessing options.

    Returns:
        The result dictionary from the OCR processor with ``file_name`` set
        to the uploaded file's name, or a placeholder dictionary when the
        API key is missing, the processor cannot be initialised, or the
        file is larger than 10 MB.

    Raises:
        Exception: Any error raised during processing is shown with
            ``st.error`` and then re-raised.
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Show progress indicator
    progress_bar = st.progress(0)
    status_text = st.empty()
    status_text.text("Preparing file for processing...")

    # Every temp file created for this run; all are removed in `finally`.
    temp_paths = []

    # Save the uploaded file to a temporary file
    upload_suffix = Path(uploaded_file.name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=upload_suffix) as tmp:
        temp_paths.append(tmp.name)
        tmp.write(uploaded_file.getvalue())
        temp_path = tmp.name

    try:
        # Check if API key is available
        if not MISTRAL_API_KEY:
            progress_bar.progress(100)
            status_text.empty()
            st.error("πŸ”‘ **Missing API Key**: Cannot process document without a valid Mistral AI API key.")
            st.info(_MISSING_API_KEY_HELP)
            # Placeholder result so the UI still has something to render.
            return {
                "file_name": uploaded_file.name,
                "topics": ["API Key Required"],
                "languages": ["English"],
                "ocr_contents": {
                    "title": "Missing Mistral API Key",
                    "content": "To process real documents, please set the MISTRAL_API_KEY environment variable as described above."
                }
            }

        progress_bar.progress(20)
        status_text.text("Initializing OCR processor...")

        # Initialize OCR processor with explicit API key
        try:
            api_key = MISTRAL_API_KEY.strip()
            processor = StructuredOCR(api_key=api_key)
        except Exception as e:
            st.error(f"Error initializing OCR processor: {str(e)}")
            return {
                "file_name": uploaded_file.name,
                "error": "API authentication failed",
                "ocr_contents": {
                    "error": "Could not authenticate with Mistral API. Please check your API key."
                }
            }

        # Determine file type from extension
        file_ext = Path(uploaded_file.name).suffix.lower()
        file_type = "pdf" if file_ext == ".pdf" else "image"

        # Store original filename in session state for preservation
        st.session_state.original_filename = uploaded_file.name

        # Preprocess only when an option is really active. A plain
        # any(options.values()) is always true because rotation defaults to
        # the string "None".
        if file_type == "image" and _has_active_preprocessing(preprocessing_options):
            status_text.text("Applying image preprocessing...")
            try:
                processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
                original_ext = Path(uploaded_file.name).suffix.lower()
                if original_ext in ['.jpg', '.jpeg', '.png']:
                    fallback_suffix = original_ext
                else:
                    fallback_suffix = '.jpg'  # Default fallback to JPEG
                # preprocess_image normally returns JPEG data, so the suffix
                # follows the bytes rather than the uploaded file's name.
                suffix = _suffix_for_image_bytes(processed_bytes, fallback_suffix)
                with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as proc_tmp:
                    # Register before writing so a failed write is cleaned up.
                    temp_paths.append(proc_tmp.name)
                    proc_tmp.write(processed_bytes)
                temp_path = proc_tmp.name
            except Exception as e:
                # Best effort: OCR continues with the original upload.
                st.warning(f"Image preprocessing failed: {str(e)}. Proceeding with original image.")

        # Get file size in MB
        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

        # 10 MB API limit; lower than the UI upload cap so requests don't fail.
        if file_size_mb > 10:
            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size for API processing is 10MB.")
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
                    "partial_text": "Document could not be processed due to API limitations. Try reducing the file size or resolution."
                }
            }

        progress_bar.progress(40)
        status_text.text("Processing document with OCR...")

        # File size is passed along for the processor's automatic page limiting.
        # See https://docs.mistral.ai/capabilities/document/ for more info
        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)

        progress_bar.progress(100)
        status_text.empty()

        # Preserve original filename in results
        if hasattr(st.session_state, 'original_filename'):
            result['file_name'] = st.session_state.original_filename
            # Clear the stored filename for next run
            del st.session_state.original_filename
        return result
    except Exception as e:
        progress_bar.progress(100)
        status_text.empty()
        st.error(f"Error during processing: {str(e)}")
        raise
    finally:
        # Clean up every temporary file, not just the last one used.
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
# Seed the per-session storage for OCR results on the first run only.
for state_key, initial_value in (("previous_results", []), ("current_result", None)):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# App title and description
st.title("Historical Document OCR")
st.write("Process historical documents and images with AI-powered OCR.")

# Banner shown while no API key is configured
if not MISTRAL_API_KEY:
    st.warning("⚠️ **No Mistral API key found.** Please set the MISTRAL_API_KEY environment variable.")
    st.info("For Hugging Face Spaces, add it as a secret. For local development, export it in your shell or add it to a .env file.")

# Top-level tabs of the app
main_tab_labels = ["Document Processing", "Previous Results", "About"]
main_tab1, main_tab2, main_tab3 = st.tabs(main_tab_labels)
# Sidebar: model, preprocessing and PDF settings
with st.sidebar:
    st.header("Options")

    # Model options
    st.subheader("Model Settings")
    use_vision = st.checkbox(
        "Use Vision Model",
        value=True,
        help="For image files, use the vision model for improved analysis",
    )

    # Image preprocessing options
    st.subheader("Image Preprocessing")
    with st.expander("Preprocessing Options"):
        rotation_options = [
            "None",
            "Rotate 90Β° clockwise",
            "Rotate 90Β° counterclockwise",
            "Rotate 180Β°",
            "Auto-detect",
        ]
        # Dict values are evaluated top to bottom, so the widgets are created
        # in the order listed here.
        preprocessing_options = {
            "grayscale": st.checkbox(
                "Convert to Grayscale",
                help="Convert image to grayscale before OCR",
            ),
            "threshold": st.checkbox(
                "Apply Thresholding",
                help="Apply adaptive thresholding to enhance text",
            ),
            "denoise": st.checkbox(
                "Denoise Image",
                help="Remove noise from the image",
            ),
            "contrast": st.slider(
                "Adjust Contrast", -5, 5, 0,
                help="Adjust image contrast (-5 to +5)",
            ),
            "rotation": st.selectbox(
                "Image Orientation", rotation_options, index=0,
                help="Rotate image to correct orientation",
            ),
        }

    # PDF options
    st.subheader("PDF Options")
    with st.expander("PDF Settings"):
        pdf_dpi = st.slider(
            "PDF Resolution (DPI)", 72, 300, 150,
            help="Higher DPI gives better quality but slower processing",
        )
        max_pages = st.number_input(
            "Maximum Pages", 1, 20, 5,
            help="Limit number of pages to process",
        )
# Previous Results tab: browse documents processed earlier in this session
with main_tab2:
    if not st.session_state.previous_results:
        st.info("No previous documents have been processed yet. Process a document to see results here.")
    else:
        st.subheader("Previously Processed Documents")

        # One numbered label per stored result, used by the selector below.
        previous_files = [
            f"{number}. {stored.get('file_name', 'Document')}"
            for number, stored in enumerate(st.session_state.previous_results, start=1)
        ]
        selected_index = st.selectbox(
            "Select a previous document:",
            options=range(len(previous_files)),
            format_func=lambda position: previous_files[position],
        )
        selected_result = st.session_state.previous_results[selected_index]

        # The third tab exists only for results flagged as having images.
        has_images = selected_result.get('has_images', False)
        prev_tab_labels = ["Document Info", "Content"]
        if has_images:
            prev_tab_labels = ["Document Info", "Content", "With Images"]
        prev_tabs = st.tabs(prev_tab_labels)

        # Document Info tab
        with prev_tabs[0]:
            st.write(f"**File:** {selected_result.get('file_name', 'Document')}")

            if selected_result.get('languages'):
                languages = [
                    lang for lang in selected_result['languages'] if lang is not None
                ]
                if languages:
                    st.write(f"**Languages:** {', '.join(languages)}")

            if selected_result.get('topics'):
                st.write(f"**Topics:** {', '.join(selected_result['topics'])}")

            if 'limited_pages' in selected_result:
                page_limits = selected_result['limited_pages']
                st.info(f"Processed {page_limits['processed']} of {page_limits['total']} pages")

        # Content tab
        with prev_tabs[1]:
            if 'ocr_contents' not in selected_result:
                st.warning("No content available for this document.")
            else:
                st.markdown("## Document Contents")
                stored_contents = selected_result['ocr_contents']
                if isinstance(stored_contents, dict):
                    for section, content in stored_contents.items():
                        if not content:
                            continue
                        section_key = section.lower()
                        # Title and subtitle get their own formatting.
                        if section_key == 'title':
                            st.markdown(f"# {content}")
                        elif section_key == 'subtitle':
                            st.markdown(f"*{content}*")
                        else:
                            section_title = section.replace('_', ' ').title()
                            st.markdown(f"### {section_title}")
                            if isinstance(content, str):
                                st.markdown(content)
                            elif isinstance(content, list):
                                for entry in content:
                                    if isinstance(entry, str):
                                        st.markdown(f"* {entry}")
                                    else:
                                        st.json(entry)
                            elif isinstance(content, dict):
                                for field_name, field_value in content.items():
                                    st.markdown(f"**{field_name}:** {field_value}")

        # Images tab if available
        if has_images and len(prev_tabs) > 2:
            with prev_tabs[2]:
                try:
                    from ocr_utils import create_html_with_images

                    if 'pages_data' not in selected_result:
                        st.warning("No image data available for this document.")
                    else:
                        html_with_images = create_html_with_images(selected_result)
                        st.components.v1.html(html_with_images, height=600, scrolling=True)
                        # Key combines file-name hash and list position.
                        st.download_button(
                            label="Download with Images (HTML)",
                            data=html_with_images,
                            file_name=f"{selected_result.get('file_name', 'document')}_with_images.html",
                            mime="text/html",
                            key=f"prev_download_{hash(selected_result.get('file_name', 'doc'))}_{selected_index}",
                        )
                except Exception as e:
                    st.error(f"Could not display document with images: {str(e)}")
# About tab content: static description of the app, kept in one constant.
_ABOUT_MARKDOWN = """
### About This Application
This app uses Mistral AI's Document OCR to extract text and images from historical documents with enhanced formatting.
It can process:
- Image files (jpg, png, etc.)
- PDF documents (multi-page support)
The extracted content is processed into structured data based on the document type, combining:
- Text extraction with `mistral-ocr-latest`
- Analysis with language models
- Layout preservation with images
- Enhanced typography for historical documents
View results in three formats:
- **Structured View**: Beautifully formatted HTML with proper document structure
- **Raw JSON**: Complete data structure for developers
- **With Images**: Document with embedded images preserving original layout
**History Feature:**
- All processed documents are saved in the session history
- Access previous documents in the "Previous Results" tab
- No need to reprocess the same document multiple times
"""

with main_tab3:
    st.markdown(_ABOUT_MARKDOWN)
# Main tab content

# Opening part of the exported HTML document (doctype, styles, container).
_EXPORT_HTML_HEAD = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCR Document</title>
<style>
body {
font-family: 'Georgia', serif;
line-height: 1.6;
margin: 0;
padding: 20px;
background-color: #f9f9f9;
color: #333;
}
.container {
max-width: 1000px;
margin: 0 auto;
background-color: #fff;
padding: 30px;
border-radius: 8px;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
}
h1, h2, h3 {
font-family: 'Bookman', 'Georgia', serif;
margin-top: 1.5em;
margin-bottom: 0.5em;
color: #222;
}
h1 { font-size: 2.2em; border-bottom: 2px solid #e0e0e0; padding-bottom: 10px; }
h2 { font-size: 1.8em; border-bottom: 1px solid #e0e0e0; padding-bottom: 6px; }
h3 { font-size: 1.5em; }
p { margin-bottom: 1.2em; text-align: justify; }
ul { margin-bottom: 1.5em; }
li { margin-bottom: 0.3em; }
dl { margin-bottom: 1.5em; }
dt { font-weight: bold; margin-top: 1em; }
dd { margin-left: 2em; margin-bottom: 0.5em; }
.poem {
font-family: 'Baskerville', 'Georgia', serif;
margin-left: 2em;
line-height: 1.8;
white-space: pre-wrap;
}
</style>
</head>
<body>
<div class="container">'''

# Closing part of the exported HTML document.
_EXPORT_HTML_TAIL = '''
</div>
</body>
</html>'''

# Tips shown while no file has been uploaded.
_TIPS_MARKDOWN = """
**For best OCR results:**
1. **Image quality** - Higher resolution images produce better results
2. **Document orientation** - Use rotation options for incorrectly oriented documents
3. **Preprocessing** - Try grayscale and thresholding for low-contrast documents
4. **File size** - Keep files under 10MB for best API performance
**File preservation:** Original filenames are preserved in the results.
"""


def _count_pdf_pages(pdf_bytes):
    """Return the page count of a PDF without rasterizing any page."""
    from pdf2image import pdfinfo_from_bytes

    return int(pdfinfo_from_bytes(pdf_bytes)["Pages"])


def _render_structured_contents(ocr_contents):
    """Render the sections of an ``ocr_contents`` dict as Streamlit elements."""
    for section, content in ocr_contents.items():
        if not content:  # Skip empty sections
            continue

        # Special handling for title and subtitle
        section_key = section.lower()
        if section_key == 'title':
            st.markdown(f"# {content}")
            continue
        if section_key == 'subtitle':
            st.markdown(f"*{content}*")
            continue

        st.markdown(f"### {section.replace('_', ' ').title()}")

        if isinstance(content, str):
            st.markdown(content)
        elif isinstance(content, list):
            st.write("")  # Add spacing
            for item in content:
                if isinstance(item, str):
                    st.markdown(f"* {item}")
                elif isinstance(item, dict):
                    if 'src' in item or 'alt' in item:
                        # For image data, show only alt text if available
                        if item.get('alt'):
                            st.markdown(f"* {item['alt']}")
                        else:
                            st.markdown("* Image")
                    else:
                        # Other dicts: show the first key (and its value if text)
                        key = list(item.keys())[0] if item else 'Item'
                        if isinstance(item.get(key), str):
                            st.markdown(f"* **{key}**: {item[key]}")
                        else:
                            st.markdown(f"* **{key}**")
        elif isinstance(content, dict):
            if content.get('type') == 'poem' and 'lines' in content:
                # One preformatted element keeps the poem's spacing. Separate
                # "```" markdown calls never form a single fenced block.
                st.code("\n".join(str(line) for line in content['lines']), language=None)
            else:
                st.write("")  # Add spacing
                for k, v in content.items():
                    if isinstance(v, list):
                        st.markdown(f"**{k}:**")
                        for item in v:
                            st.markdown(f" * {item}")
                    else:
                        st.markdown(f"**{k}:** {v}")


def _build_export_html(result):
    """Build the downloadable HTML document for an OCR result.

    All text taken from the result is HTML-escaped before insertion, so OCR
    output containing ``<``, ``>`` or ``&`` cannot break the document.
    """
    from html import escape

    def esc(value):
        return escape(str(value))

    parts = [_EXPORT_HTML_HEAD]
    ocr_contents = result.get('ocr_contents')
    if isinstance(ocr_contents, dict):
        for section, content in ocr_contents.items():
            if not content:
                continue

            section_key = section.lower()
            if section_key == 'title':
                parts.append(f'<h1>{esc(content)}</h1>\n')
                continue
            if section_key == 'subtitle':
                parts.append(
                    '<div style="font-style:italic;font-size:1.1em;margin-bottom:1.5em;">'
                    f'{esc(content)}</div>\n'
                )
                continue

            section_title = section.replace('_', ' ').title()
            parts.append(f'<h3>{esc(section_title)}</h3>\n')

            if isinstance(content, str):
                # Blank-line separated chunks become paragraphs.
                for paragraph in content.split('\n\n'):
                    if paragraph.strip():
                        parts.append(f'<p>{esc(paragraph.strip())}</p>\n')
            elif isinstance(content, list):
                parts.append('<ul>\n')
                for item in content:
                    if isinstance(item, dict):
                        if 'src' in item or 'alt' in item:
                            # For image data, show only alt text if available
                            label = esc(item['alt']) if item.get('alt') else 'Image'
                            parts.append(f'<li>{label}</li>\n')
                        else:
                            key = list(item.keys())[0] if item else 'Item'
                            if isinstance(item.get(key), str):
                                parts.append(
                                    f'<li><strong>{esc(key)}</strong>: {esc(item[key])}</li>\n'
                                )
                            else:
                                parts.append(f'<li><strong>{esc(key)}</strong></li>\n')
                    else:
                        parts.append(f'<li>{esc(item)}</li>\n')
                parts.append('</ul>\n')
            elif isinstance(content, dict):
                if content.get('type') == 'poem' and 'lines' in content:
                    parts.append('<div class="poem">\n')
                    for line in content['lines']:
                        parts.append(f'{esc(line)}<br>\n')
                    parts.append('</div>\n')
                else:
                    parts.append('<dl>\n')
                    for k, v in content.items():
                        parts.append(f'<dt>{esc(k)}</dt>\n')
                        if isinstance(v, list):
                            parts.append('<dd><ul>\n')
                            for item in v:
                                parts.append(f'<li>{esc(item)}</li>\n')
                            parts.append('</ul></dd>\n')
                        else:
                            parts.append(f'<dd>{esc(v)}</dd>\n')
                    parts.append('</dl>\n')
    parts.append(_EXPORT_HTML_TAIL)
    return "".join(parts)


# Defined up front so the results section can test it even when the button is
# not rendered (no upload yet, or upload above the 20 MB cap).
process_button = False

with main_tab1:
    # Create a more compact layout using custom CSS
    st.markdown('<div class="compact-layout">', unsafe_allow_html=True)

    # Two columns: upload controls on the left, preview on the right
    col1, col2 = st.columns([1, 1.2])

    # File upload column
    with col1:
        with st.container():
            st.markdown('<div class="upload-section">', unsafe_allow_html=True)
            st.subheader("Upload Document")
            uploaded_file = st.file_uploader("Choose an image or PDF file",
                                             type=["pdf", "png", "jpg", "jpeg"],
                                             help="Select a document to process with OCR")
            st.markdown('</div>', unsafe_allow_html=True)

        # Rotation alone does not count as substantive here; it is listed in
        # the summary only alongside one of these options.
        substantive_options = (preprocessing_options.get("grayscale", False) or
                               preprocessing_options.get("threshold", False) or
                               preprocessing_options.get("denoise", False) or
                               preprocessing_options.get("contrast", 0) != 0)

        if uploaded_file is not None and substantive_options:
            st.write("**Active preprocessing:**")
            prep_list = []
            if preprocessing_options.get("grayscale", False):
                prep_list.append("Grayscale conversion")
            if preprocessing_options.get("threshold", False):
                prep_list.append("Adaptive thresholding")
            if preprocessing_options.get("denoise", False):
                prep_list.append("Noise reduction")
            contrast_value = preprocessing_options.get("contrast", 0)
            if contrast_value != 0:
                direction = "increased" if contrast_value > 0 else "decreased"
                prep_list.append(f"Contrast {direction} by {abs(contrast_value)}")
            rotation = preprocessing_options.get("rotation", "None")
            if rotation != "None":
                prep_list.append(f"{rotation}")
            for item in prep_list:
                st.write(f"- {item}")

        # Process button - show only when file is uploaded
        if uploaded_file is not None:
            # Check file size (cap at 20MB)
            file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
            if file_size_mb > 20:
                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 20MB.")
            else:
                st.write(f"**File:** {uploaded_file.name} ({file_size_mb:.2f} MB)")
                st.markdown('<div class="process-button">', unsafe_allow_html=True)
                process_button = st.button("Process Document",
                                           type="primary",
                                           use_container_width=True,
                                           help="Start OCR processing with the selected options")
                st.markdown('</div>', unsafe_allow_html=True)

    # Preview column
    with col2:
        if uploaded_file is not None:
            with st.expander("Document Preview", expanded=False):
                file_ext = Path(uploaded_file.name).suffix.lower()

                if uploaded_file.type and uploaded_file.type.startswith('image/'):
                    # For image files
                    preview_tabs = st.tabs(["Original"])
                    with preview_tabs[0]:
                        try:
                            image = safe_open_image(uploaded_file.getvalue())
                            if image:
                                # Display with controlled size
                                st.image(image, caption=uploaded_file.name, width=400)
                            else:
                                st.info("Image preview not available")
                        except Exception:
                            st.info("Image preview could not be displayed")

                    # Processed preview ONLY if substantive options are selected
                    if substantive_options:
                        st.subheader("Preprocessing Preview")
                        try:
                            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
                            processed_image = safe_open_image(processed_bytes)

                            # Distinct names: do not rebind the outer col1/col2
                            before_col, after_col = st.columns(2)
                            with before_col:
                                st.write("**Original**")
                                image = safe_open_image(uploaded_file.getvalue())
                                if image:
                                    st.image(image, width=300)
                            with after_col:
                                st.write("**Processed**")
                                if processed_image:
                                    st.image(processed_image, width=300)
                                else:
                                    st.info("Processed preview not available")
                        except Exception:
                            st.info("Preprocessing preview could not be generated")

                elif file_ext == ".pdf":
                    # For PDF files
                    try:
                        pdf_bytes = uploaded_file.getvalue()
                        with st.spinner("Generating PDF preview..."):
                            # Only the first page is rendered for the preview
                            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
                        if images:
                            first_page = images[0]
                            img_bytes = io.BytesIO()
                            first_page.save(img_bytes, format='JPEG')
                            img_bytes.seek(0)
                            st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", width=400)

                            # Read the page count from the PDF metadata instead
                            # of rendering every page just to count them.
                            try:
                                page_total = _count_pdf_pages(pdf_bytes)
                            except Exception:
                                page_total = None
                            if page_total is not None:
                                st.info(f"PDF document with {page_total} pages")
                        else:
                            st.info(f"PDF preview not available: {uploaded_file.name}")
                    except Exception:
                        st.info(f"PDF preview could not be displayed: {uploaded_file.name}")

    # Results section - spans full width
    if process_button:
        st.markdown('<div class="processing-results">', unsafe_allow_html=True)
        st.markdown("---")
        st.subheader("Processing Results")

        try:
            # Process the file with selected options
            result = process_file(uploaded_file, use_vision, preprocessing_options)

            # Save result to session state
            st.session_state.current_result = result
            if result not in st.session_state.previous_results:
                st.session_state.previous_results.append(result)
            # Keep only the last 10 results to avoid memory issues
            if len(st.session_state.previous_results) > 10:
                st.session_state.previous_results.pop(0)

            # Create tabs for viewing results
            has_images = result.get('has_images', False)
            if has_images:
                result_tabs = st.tabs(["Structured View", "Raw JSON", "With Images"])
            else:
                result_tabs = st.tabs(["Structured View", "Raw JSON"])

            # Structured view tab
            with result_tabs[0]:
                st.write(f"**File:** {result.get('file_name', uploaded_file.name)}")

                if result.get('languages'):
                    languages = [lang for lang in result['languages'] if lang is not None]
                    if languages:
                        st.write(f"**Languages:** {', '.join(languages)}")

                if result.get('topics'):
                    st.write(f"**Topics:** {', '.join(result['topics'])}")

                if 'limited_pages' in result:
                    st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

                if 'ocr_contents' in result:
                    st.markdown("## Document Contents")
                    if isinstance(result['ocr_contents'], dict):
                        _render_structured_contents(result['ocr_contents'])

                st.markdown('</div>', unsafe_allow_html=True)  # Close processing-results div

                # Download button
                with st.expander("Export Content"):
                    html_bytes = _build_export_html(result).encode()
                    st.download_button(
                        label="Download as HTML",
                        data=html_bytes,
                        file_name="document_content.html",
                        mime="text/html",
                        key=f"download_html_{hash(result.get('file_name', 'doc'))}"
                    )

            # Raw JSON tab
            with result_tabs[1]:
                st.json(result)

            # Images tab (if available)
            if has_images:
                with result_tabs[2]:
                    try:
                        from ocr_utils import create_html_with_images

                        if 'pages_data' not in result:
                            st.warning("No image data available in the OCR response.")
                        else:
                            pages_data = result.get('pages_data', [])
                            image_count = sum(len(page.get('images', [])) for page in pages_data)
                            if image_count > 10:
                                st.warning(f"This document contains {image_count} images. Rendering may take longer.")

                            page_count = len(pages_data)
                            st.write(f"**Document contains {page_count} page{'' if page_count == 1 else 's'} with {image_count} image{'' if image_count == 1 else 's'} total**")

                            # NOTE(review): the selector only updates the
                            # "Viewing" label; the rendered HTML below always
                            # contains the whole document.
                            if page_count > 1:
                                page_options = [f"Page {i+1}" for i in range(page_count)]
                                selected_page = st.selectbox("Select page to view:", options=page_options)
                                st.write(f"**Viewing {selected_page}**")

                            with st.spinner("Generating document with embedded images..."):
                                html_with_images = create_html_with_images(result)

                            st.write("**Document with Original Images**")
                            st.components.v1.html(html_with_images, height=600, scrolling=True)

                            # Provide a download option
                            info_col, download_col = st.columns([3, 1])
                            with download_col:
                                st.download_button(
                                    label="Download with Images",
                                    data=html_with_images,
                                    file_name=f"{result.get('file_name', 'document')}_with_images.html",
                                    mime="text/html",
                                    use_container_width=True,
                                    key=f"download_images_{hash(result.get('file_name', 'doc'))}"
                                )
                            with info_col:
                                st.info("This HTML document includes the original document images embedded at their correct positions.")
                                st.write("Original filenames and image positions are preserved in the downloaded file.")
                    except Exception as e:
                        st.error(f"Could not display document with images: {str(e)}")
        except Exception as e:
            st.error(f"Error processing document: {str(e)}")

    # Show sample examples when no file is uploaded
    elif uploaded_file is None:
        st.markdown('</div>', unsafe_allow_html=True)  # Close compact-layout div
        st.info("πŸ“ Upload a document to get started. Supported formats: JPG, PNG, PDF")
        with st.expander("Tips for best results"):
            st.markdown(_TIPS_MARKDOWN)