# Streamlit app: Historical Document OCR (runs as a Hugging Face Space).
import html
import io
import json
import os
import sys
import tempfile
import time
from pathlib import Path

import cv2
import numpy as np
import PIL
import streamlit as st
from pdf2image import convert_from_bytes
from PIL import Image, ImageEnhance, ImageFilter, UnidentifiedImageError
# Import the StructuredOCR class and config from the local files | |
from structured_ocr import StructuredOCR | |
from config import MISTRAL_API_KEY | |
# Import UI layout if available | |
try: | |
from ui.layout import tool_container | |
UI_LAYOUT_AVAILABLE = True | |
except ImportError: | |
UI_LAYOUT_AVAILABLE = False | |
# Set page configuration
st.set_page_config(
    page_title="Historical OCR",
    # The icon was a mis-encoded multi-byte emoji; restored as a scroll.
    # NOTE(review): confirm this is the intended icon.
    page_icon="📜",
    layout="wide",
    initial_sidebar_state="expanded",
)
# Enable caching for expensive operations | |
@st.cache_data(show_spinner=False, max_entries=8)
def convert_pdf_to_images(pdf_bytes, dpi=150):
    """Render a PDF into a list of page images, cached by Streamlit.

    The result is memoized on ``(pdf_bytes, dpi)`` so Streamlit reruns do not
    re-render the same document.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        dpi: Rendering resolution passed to ``convert_from_bytes``.

    Returns:
        Whatever ``convert_from_bytes`` returns, or an empty list when the
        conversion raises (the error is reported with ``st.error``).
    """
    try:
        return convert_from_bytes(pdf_bytes, dpi=dpi)
    except Exception as e:
        # Best-effort by design: report in the UI and let callers treat an
        # empty list as "no pages" rather than crash the app.
        st.error(f"Error converting PDF: {e}")
        return []
def safe_open_image(image_bytes):
    """Open raw image bytes with PIL without ever raising.

    Args:
        image_bytes: Encoded image data.

    Returns:
        The object returned by ``Image.open``, or ``None`` when opening
        fails for any reason.
    """
    opened = None
    try:
        stream = io.BytesIO(image_bytes)
        opened = Image.open(stream)
    except Exception:
        # Any failure means "no image"; callers check for None.
        opened = None
    return opened
def _encode_jpeg(image):
    """Serialize a PIL image to JPEG bytes."""
    buffer = io.BytesIO()
    image.save(buffer, format='JPEG')
    return buffer.getvalue()


def _rotation_key(label):
    """Reduce a rotation label to lowercase ASCII words.

    Non-ASCII characters (the degree sign, however it happens to be encoded)
    are dropped, so "Rotate 90<deg> clockwise" becomes "rotate 90 clockwise".
    """
    ascii_only = str(label or "None").encode("ascii", "ignore").decode("ascii")
    return " ".join(ascii_only.lower().split())


def _to_rgb(image):
    """Return the image in RGB mode, flattening RGBA onto a white background."""
    if image.mode == 'RGBA':
        background = Image.new('RGB', image.size, (255, 255, 255))
        background.paste(image, mask=image.split()[3])  # 3 is the alpha channel
        return background
    if image.mode != 'RGB':
        return image.convert('RGB')
    return image


def _rotate_image(image, rotation):
    """Apply the rotation named by a normalized key; unknown keys are a no-op."""
    if rotation == "rotate 90 clockwise":
        return image.transpose(Image.ROTATE_270)
    if rotation == "rotate 90 counterclockwise":
        return image.transpose(Image.ROTATE_90)
    if rotation == "rotate 180":
        return image.transpose(Image.ROTATE_180)
    if rotation == "auto-detect":
        # Heuristic only: a clearly landscape image (ratio above 1.5) is
        # turned to portrait.
        width, height = image.size
        if width > 1.5 * height:
            return image.transpose(Image.ROTATE_90)
    return image


def preprocess_image(image_bytes, preprocessing_options):
    """Apply the selected preprocessing steps to an encoded image.

    Steps run in a fixed order: rotation, grayscale, contrast, denoise,
    adaptive threshold. This function never raises; every failure degrades
    to returning usable bytes.

    Args:
        image_bytes: Encoded image data.
        preprocessing_options: Dict with optional keys ``grayscale``,
            ``threshold``, ``denoise`` (booleans), ``contrast`` (number,
            0 means unchanged) and ``rotation`` (label string). ``None`` is
            treated as an empty dict.

    Returns:
        JPEG-encoded bytes of the processed image. The original
        ``image_bytes`` are returned unchanged when no step is requested,
        when the image cannot be opened, or when encoding fails.
    """
    try:
        options = preprocessing_options or {}
        grayscale = bool(options.get("grayscale", False))
        denoise = bool(options.get("denoise", False))
        threshold = bool(options.get("threshold", False))
        contrast = options.get("contrast", 0) or 0
        rotation = _rotation_key(options.get("rotation", "None"))

        # Nothing requested: skip the decode and lossy JPEG re-encode entirely.
        if not (grayscale or denoise or threshold or contrast) and rotation in ("", "none"):
            return image_bytes

        image = safe_open_image(image_bytes)
        if image is None:
            return image_bytes

        image = _rotate_image(_to_rgb(image), rotation)

        # Convert to numpy array for OpenCV processing
        try:
            img_array = np.array(image)
        except Exception:
            # Keep the rotation/mode conversion even if the array step fails.
            return _encode_jpeg(image)

        try:
            if grayscale:
                img_array = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

            if contrast:
                contrast_factor = 1 + (contrast / 10)
                image = ImageEnhance.Contrast(Image.fromarray(img_array)).enhance(contrast_factor)
                img_array = np.array(image)

            if denoise:
                # fastNlMeansDenoisingColored needs a 3-channel image
                if len(img_array.shape) != 3 or img_array.shape[2] != 3:
                    if len(img_array.shape) == 2:  # Grayscale
                        img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
                img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)

            if threshold:
                if len(img_array.shape) == 3:
                    gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
                else:
                    gray = img_array
                binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                               cv2.THRESH_BINARY, 11, 2)
                img_array = cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
        except Exception:
            # A filter failed: fall back to the last good PIL image.
            return _encode_jpeg(image)

        return _encode_jpeg(Image.fromarray(img_array))
    except Exception:
        # Final fallback - hand back the bytes we were given.
        return image_bytes
# Define functions | |
def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
    """Run OCR on an uploaded file and return the result dictionary.

    Args:
        uploaded_file: Uploaded file object; its ``name`` and ``getvalue()``
            are used.
        use_vision: Forwarded to ``StructuredOCR.process_file``.
        preprocessing_options: Optional dict of image preprocessing options
            (``grayscale``, ``threshold``, ``denoise``, ``contrast``,
            ``rotation``). Only applied to non-PDF uploads.

    Returns:
        The dictionary produced by ``StructuredOCR.process_file`` with
        ``file_name`` set to the uploaded name, or a placeholder dictionary
        when the API key is missing, the processor cannot be created, or the
        file is larger than 10 MB.

    Raises:
        Exception: Any error raised while preparing or processing the file
            is shown with ``st.error`` and re-raised.
    """
    if preprocessing_options is None:
        preprocessing_options = {}

    # Show progress indicator
    progress_bar = st.progress(0)
    status_text = st.empty()
    status_text.text("Preparing file for processing...")

    file_bytes = uploaded_file.getvalue()
    file_ext = Path(uploaded_file.name).suffix.lower()
    # Every temp file created below is recorded so the finally block removes
    # all of them, not just the last one.
    temp_paths = []

    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=Path(uploaded_file.name).suffix) as tmp:
            temp_paths.append(tmp.name)
            tmp.write(file_bytes)
            temp_path = tmp.name

        if not MISTRAL_API_KEY:
            # Return placeholder data with a clear message about the missing key.
            # NOTE(review): the key emoji replaces a mis-encoded character.
            st.error("🔑 **Missing API Key**: Cannot process document without a valid Mistral AI API key.")
            st.info(
                "**How to add your API key:**\n\n"
                "For Hugging Face Spaces:\n"
                "1. Go to your Space settings\n"
                "2. Add a secret named `MISTRAL_API_KEY` with your API key value\n\n"
                "For local development:\n"
                "1. Add to your shell: `export MISTRAL_API_KEY=your_key_here`\n"
                "2. Or create a `.env` file with `MISTRAL_API_KEY=your_key_here`"
            )
            return {
                "file_name": uploaded_file.name,
                "topics": ["API Key Required"],
                "languages": ["English"],
                "ocr_contents": {
                    "title": "Missing Mistral API Key",
                    "content": "To process real documents, please set the MISTRAL_API_KEY environment variable as described above."
                }
            }

        progress_bar.progress(20)
        status_text.text("Initializing OCR processor...")

        # Initialize OCR processor with explicit API key
        try:
            api_key = MISTRAL_API_KEY.strip()
            processor = StructuredOCR(api_key=api_key)
        except Exception as e:
            st.error(f"Error initializing OCR processor: {str(e)}")
            return {
                "file_name": uploaded_file.name,
                "error": "API authentication failed",
                "ocr_contents": {
                    "error": "Could not authenticate with Mistral API. Please check your API key."
                }
            }

        file_type = "pdf" if file_ext == ".pdf" else "image"

        # Store original filename in session state for preservation
        st.session_state.original_filename = uploaded_file.name

        # The default rotation is the non-empty string "None", so it must be
        # tested by value; a plain truthiness check would always be true.
        rotation = preprocessing_options.get("rotation", "None")
        wants_preprocessing = (
            rotation not in (None, "", "None")
            or any(value for name, value in preprocessing_options.items() if name != "rotation")
        )

        if wants_preprocessing and file_type == "image":
            status_text.text("Applying image preprocessing...")
            try:
                processed_bytes = preprocess_image(file_bytes, preprocessing_options)
                # Unchanged bytes mean preprocessing was a no-op or fell back;
                # the first temp file is already correct in that case.
                if processed_bytes != file_bytes:
                    if processed_bytes[:3] == b"\xff\xd8\xff":
                        # JPEG magic number: name the file after its real format
                        suffix = '.jpg'
                    elif file_ext in ('.jpg', '.jpeg', '.png'):
                        suffix = file_ext
                    else:
                        suffix = '.jpg'  # Default fallback to JPEG
                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as proc_tmp:
                        temp_paths.append(proc_tmp.name)
                        proc_tmp.write(processed_bytes)
                        temp_path = proc_tmp.name
            except Exception as e:
                # OCR continues with the original file if preprocessing fails.
                st.warning(f"Image preprocessing failed: {str(e)}. Proceeding with original image.")

        file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)

        # 10 MB is lower than the UI upload cap so that API requests don't fail.
        if file_size_mb > 10:
            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size for API processing is 10MB.")
            return {
                "file_name": uploaded_file.name,
                "topics": ["Document"],
                "languages": ["English"],
                "confidence_score": 0.0,
                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
                "ocr_contents": {
                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
                    "partial_text": "Document could not be processed due to API limitations. Try reducing the file size or resolution."
                }
            }

        progress_bar.progress(40)
        status_text.text("Processing document with OCR...")

        # File size is passed along for automatic page limiting.
        # See https://docs.mistral.ai/capabilities/document/ for more info
        result = processor.process_file(temp_path, file_type=file_type, use_vision=use_vision, file_size_mb=file_size_mb)

        # Preserve original filename in results
        if 'original_filename' in st.session_state:
            result['file_name'] = st.session_state.original_filename
        return result
    except Exception as e:
        st.error(f"Error during processing: {str(e)}")
        raise
    finally:
        # Runs on every exit path, so early returns no longer leave a stale
        # status line, a lingering session key, or orphaned temp files.
        progress_bar.progress(100)
        status_text.empty()
        if 'original_filename' in st.session_state:
            del st.session_state['original_filename']
        for path in temp_paths:
            if os.path.exists(path):
                os.unlink(path)
# Initialize session state for storing results
if 'previous_results' not in st.session_state:
    st.session_state.previous_results = []
if 'current_result' not in st.session_state:
    st.session_state.current_result = None

# App title and description
st.title("Historical Document OCR")
st.write("Process historical documents and images with AI-powered OCR.")

# Warn up front when no API key is configured
if not MISTRAL_API_KEY:
    # The warning sign replaces a mis-encoded character sequence.
    st.warning("⚠️ **No Mistral API key found.** Please set the MISTRAL_API_KEY environment variable.")
    st.info("For Hugging Face Spaces, add it as a secret. For local development, export it in your shell or add it to a .env file.")

# Create main layout with tabs
main_tab1, main_tab2, main_tab3 = st.tabs(["Document Processing", "Previous Results", "About"])
# Sidebar with options
with st.sidebar:
    st.header("Options")

    # Model options
    st.subheader("Model Settings")
    use_vision = st.checkbox("Use Vision Model", value=True,
                             help="For image files, use the vision model for improved analysis")

    # Image preprocessing options; the resulting dict is read later by the
    # preview and by process_file.
    st.subheader("Image Preprocessing")
    with st.expander("Preprocessing Options"):
        preprocessing_options = {}
        preprocessing_options["grayscale"] = st.checkbox("Convert to Grayscale",
                                                         help="Convert image to grayscale before OCR")
        preprocessing_options["threshold"] = st.checkbox("Apply Thresholding",
                                                         help="Apply adaptive thresholding to enhance text")
        preprocessing_options["denoise"] = st.checkbox("Denoise Image",
                                                       help="Remove noise from the image")
        preprocessing_options["contrast"] = st.slider("Adjust Contrast", -5, 5, 0,
                                                      help="Adjust image contrast (-5 to +5)")

        # The option values are kept exactly as they were because other code
        # compares against these strings; only the label shown to the user is
        # cleaned of the mis-encoded degree sign.
        rotation_options = ["None", "Rotate 90Β° clockwise", "Rotate 90Β° counterclockwise", "Rotate 180Β°", "Auto-detect"]
        preprocessing_options["rotation"] = st.selectbox(
            "Image Orientation",
            rotation_options,
            index=0,
            format_func=lambda label: label.replace("Β°", "°"),
            help="Rotate image to correct orientation",
        )

    # PDF options
    st.subheader("PDF Options")
    with st.expander("PDF Settings"):
        pdf_dpi = st.slider("PDF Resolution (DPI)", 72, 300, 150,
                            help="Higher DPI gives better quality but slower processing")
        max_pages = st.number_input("Maximum Pages", 1, 20, 5,
                                    help="Limit number of pages to process")
# Previous Results tab: browse results stored in st.session_state.previous_results
with main_tab2:
    if not st.session_state.previous_results:
        st.info("No previous documents have been processed yet. Process a document to see results here.")
    else:
        st.subheader("Previously Processed Documents")

        # Display previous results in a selectable list
        previous_files = [f"{i+1}. {result.get('file_name', 'Document')}"
                          for i, result in enumerate(st.session_state.previous_results)]
        selected_index = st.selectbox("Select a previous document:",
                                      options=range(len(previous_files)),
                                      format_func=lambda i: previous_files[i])
        selected_result = st.session_state.previous_results[selected_index]

        # The "With Images" tab only exists when the result reports images
        has_images = selected_result.get('has_images', False)
        if has_images:
            prev_tabs = st.tabs(["Document Info", "Content", "With Images"])
        else:
            prev_tabs = st.tabs(["Document Info", "Content"])

        # Document Info tab
        with prev_tabs[0]:
            st.write(f"**File:** {selected_result.get('file_name', 'Document')}")

            # Entries may be None or non-strings; drop/convert them so the
            # join cannot raise and take the whole tab down.
            languages = [str(lang) for lang in (selected_result.get('languages') or []) if lang is not None]
            if languages:
                st.write(f"**Languages:** {', '.join(languages)}")

            topics = [str(topic) for topic in (selected_result.get('topics') or []) if topic is not None]
            if topics:
                st.write(f"**Topics:** {', '.join(topics)}")

            # Show any limited pages info
            if 'limited_pages' in selected_result:
                st.info(f"Processed {selected_result['limited_pages']['processed']} of {selected_result['limited_pages']['total']} pages")

        # Content tab
        with prev_tabs[1]:
            if 'ocr_contents' in selected_result:
                st.markdown("## Document Contents")
                if isinstance(selected_result['ocr_contents'], dict):
                    for section, content in selected_result['ocr_contents'].items():
                        if not content:
                            continue
                        section_title = section.replace('_', ' ').title()

                        # Title and subtitle are rendered as headings by themselves
                        if section.lower() == 'title':
                            st.markdown(f"# {content}")
                        elif section.lower() == 'subtitle':
                            st.markdown(f"*{content}*")
                        else:
                            st.markdown(f"### {section_title}")

                            # Handle different content types
                            if isinstance(content, str):
                                st.markdown(content)
                            elif isinstance(content, list):
                                for item in content:
                                    if isinstance(item, str):
                                        st.markdown(f"* {item}")
                                    else:
                                        st.json(item)
                            elif isinstance(content, dict):
                                for k, v in content.items():
                                    st.markdown(f"**{k}:** {v}")
            else:
                st.warning("No content available for this document.")

        # Images tab if available
        if has_images and len(prev_tabs) > 2:
            with prev_tabs[2]:
                try:
                    # Imported here so a missing helper only disables this tab
                    from ocr_utils import create_html_with_images

                    if 'pages_data' in selected_result:
                        html_with_images = create_html_with_images(selected_result)
                        st.components.v1.html(html_with_images, height=600, scrolling=True)

                        # Key includes the index so each entry gets its own widget
                        st.download_button(
                            label="Download with Images (HTML)",
                            data=html_with_images,
                            file_name=f"{selected_result.get('file_name', 'document')}_with_images.html",
                            mime="text/html",
                            key=f"prev_download_{hash(selected_result.get('file_name', 'doc'))}_{selected_index}"
                        )
                    else:
                        st.warning("No image data available for this document.")
                except Exception as e:
                    st.error(f"Could not display document with images: {str(e)}")
# About tab content
with main_tab3:
    # Blank lines separate paragraphs from lists; without them each paragraph
    # is absorbed into the preceding list item when the markdown is rendered.
    st.markdown(
        "### About This Application\n\n"
        "This app uses Mistral AI's Document OCR to extract text and images from historical documents with enhanced formatting.\n\n"
        "It can process:\n"
        "- Image files (jpg, png, etc.)\n"
        "- PDF documents (multi-page support)\n\n"
        "The extracted content is processed into structured data based on the document type, combining:\n"
        "- Text extraction with `mistral-ocr-latest`\n"
        "- Analysis with language models\n"
        "- Layout preservation with images\n"
        "- Enhanced typography for historical documents\n\n"
        "View results in three formats:\n"
        "- **Structured View**: Beautifully formatted HTML with proper document structure\n"
        "- **Raw JSON**: Complete data structure for developers\n"
        "- **With Images**: Document with embedded images preserving original layout\n\n"
        "**History Feature:**\n"
        "- All processed documents are saved in the session history\n"
        "- Access previous documents in the \"Previous Results\" tab\n"
        "- No need to reprocess the same document multiple times"
    )
# Main tab content

# Static head/tail of the exported HTML document
_EXPORT_HTML_HEAD = '''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OCR Document</title>
<style>
body {
    font-family: 'Georgia', serif;
    line-height: 1.6;
    margin: 0;
    padding: 20px;
    background-color: #f9f9f9;
    color: #333;
}
.container {
    max-width: 1000px;
    margin: 0 auto;
    background-color: #fff;
    padding: 30px;
    border-radius: 8px;
    box-shadow: 0 4px 12px rgba(0,0,0,0.1);
}
h1, h2, h3 {
    font-family: 'Bookman', 'Georgia', serif;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
    color: #222;
}
h1 { font-size: 2.2em; border-bottom: 2px solid #e0e0e0; padding-bottom: 10px; }
h2 { font-size: 1.8em; border-bottom: 1px solid #e0e0e0; padding-bottom: 6px; }
h3 { font-size: 1.5em; }
p { margin-bottom: 1.2em; text-align: justify; }
ul { margin-bottom: 1.5em; }
li { margin-bottom: 0.3em; }
dl { margin-bottom: 1.5em; }
dt { font-weight: bold; margin-top: 1em; }
dd { margin-left: 2em; margin-bottom: 0.5em; }
.poem {
    font-family: 'Baskerville', 'Georgia', serif;
    margin-left: 2em;
    line-height: 1.8;
    white-space: pre-wrap;
}
</style>
</head>
<body>
<div class="container">'''

_EXPORT_HTML_TAIL = '''
</div>
</body>
</html>'''


def _esc(value):
    """Escape any value for safe inclusion in HTML text."""
    return html.escape(str(value))


def _join_labels(values):
    """Join list entries for display, skipping None and stringifying the rest."""
    return ', '.join(str(value) for value in (values or []) if value is not None)


def _export_list_item(item):
    """Return the <li> markup for one entry of a list-valued OCR section."""
    if isinstance(item, str):
        return f'<li>{_esc(item)}</li>\n'
    if isinstance(item, dict):
        if 'src' in item or 'alt' in item:
            # For image data, show only alt text if available
            if item.get('alt'):
                return f'<li>{_esc(item["alt"])}</li>\n'
            return '<li>Image</li>\n'
        # For other dictionaries, show only the first key (and its string value)
        key = list(item.keys())[0] if item else 'Item'
        if isinstance(item.get(key), str):
            return f'<li><strong>{_esc(key)}</strong>: {_esc(item[key])}</li>\n'
        return f'<li><strong>{_esc(key)}</strong></li>\n'
    return f'<li>{_esc(item)}</li>\n'


def _build_export_html(ocr_contents):
    """Build the downloadable HTML document from the OCR sections.

    All OCR-derived text is HTML-escaped: it comes from arbitrary uploaded
    documents and must not be able to inject markup into the export.
    """
    parts = [_EXPORT_HTML_HEAD]
    if isinstance(ocr_contents, dict):
        for section, content in ocr_contents.items():
            if not content:
                continue
            section_key = section.lower()
            if section_key == 'title':
                parts.append(f'<h1>{_esc(content)}</h1>\n')
            elif section_key == 'subtitle':
                parts.append(f'<div style="font-style:italic;font-size:1.1em;margin-bottom:1.5em;">{_esc(content)}</div>\n')
            else:
                section_title = section.replace('_', ' ').title()
                parts.append(f'<h3>{_esc(section_title)}</h3>\n')
                if isinstance(content, str):
                    # Blank-line separated chunks become paragraphs
                    for paragraph in content.split('\n\n'):
                        if paragraph.strip():
                            parts.append(f'<p>{_esc(paragraph.strip())}</p>\n')
                elif isinstance(content, list):
                    parts.append('<ul>\n')
                    parts.extend(_export_list_item(item) for item in content)
                    parts.append('</ul>\n')
                elif isinstance(content, dict):
                    if content.get('type') == 'poem' and 'lines' in content:
                        parts.append('<div class="poem">\n')
                        parts.extend(f'{_esc(line)}<br>\n' for line in content['lines'])
                        parts.append('</div>\n')
                    else:
                        parts.append('<dl>\n')
                        for k, v in content.items():
                            parts.append(f'<dt>{_esc(k)}</dt>\n')
                            if isinstance(v, list):
                                parts.append('<dd><ul>\n')
                                parts.extend(f'<li>{_esc(entry)}</li>\n' for entry in v)
                                parts.append('</ul></dd>\n')
                            else:
                                parts.append(f'<dd>{_esc(v)}</dd>\n')
                        parts.append('</dl>\n')
    parts.append(_EXPORT_HTML_TAIL)
    return ''.join(parts)


def _render_structured_contents(ocr_contents):
    """Render each non-empty OCR section into the current Streamlit container."""
    for section, content in ocr_contents.items():
        if not content:  # Skip empty sections
            continue
        section_key = section.lower()
        if section_key == 'title':
            st.markdown(f"# {content}")
            continue
        if section_key == 'subtitle':
            st.markdown(f"*{content}*")
            continue

        # Section headers for non-title sections
        st.markdown(f"### {section.replace('_', ' ').title()}")

        if isinstance(content, str):
            st.markdown(content)
        elif isinstance(content, list):
            st.write("")  # Add spacing
            for item in content:
                if isinstance(item, str):
                    st.markdown(f"* {item}")
                elif isinstance(item, dict):
                    if 'src' in item or 'alt' in item:
                        # For image data, show only alt text if available
                        if item.get('alt'):
                            st.markdown(f"* {item['alt']}")
                        else:
                            st.markdown("* Image")
                    else:
                        # For other dictionaries, show a simplified version
                        key = list(item.keys())[0] if item else 'Item'
                        if isinstance(item.get(key), str):
                            st.markdown(f"* **{key}**: {item[key]}")
                        else:
                            st.markdown(f"* **{key}**")
        elif isinstance(content, dict):
            if content.get('type') == 'poem' and 'lines' in content:
                # One code block keeps the poem's spacing; separate markdown
                # calls with fence markers never form a single block.
                st.code("\n".join(str(line) for line in content['lines']), language=None)
            else:
                st.write("")  # Add spacing
                for k, v in content.items():
                    if isinstance(v, list):
                        st.markdown(f"**{k}:**")
                        for entry in v:
                            st.markdown(f" * {entry}")
                    else:
                        st.markdown(f"**{k}:** {v}")


# Defined up front so the results section can test it even when the button
# is never rendered (no upload, or upload too large).
process_button = False

with main_tab1:
    # Create a more compact layout using custom CSS
    st.markdown('<div class="compact-layout">', unsafe_allow_html=True)

    # Create two columns for the main interface with a better ratio
    col1, col2 = st.columns([1, 1.2])

    # Preview and summary only react to these options (rotation alone does not count)
    substantive_options = (preprocessing_options.get("grayscale", False) or
                           preprocessing_options.get("threshold", False) or
                           preprocessing_options.get("denoise", False) or
                           preprocessing_options.get("contrast", 0) != 0)

    # File upload column
    with col1:
        with st.container():
            st.markdown('<div class="upload-section">', unsafe_allow_html=True)
            st.subheader("Upload Document")
            uploaded_file = st.file_uploader("Choose an image or PDF file",
                                             type=["pdf", "png", "jpg", "jpeg"],
                                             help="Select a document to process with OCR")
            st.markdown('</div>', unsafe_allow_html=True)

        # Show preprocessing summary only if substantive options are selected
        if uploaded_file is not None and substantive_options:
            st.write("**Active preprocessing:**")
            prep_list = []
            if preprocessing_options.get("grayscale", False):
                prep_list.append("Grayscale conversion")
            if preprocessing_options.get("threshold", False):
                prep_list.append("Adaptive thresholding")
            if preprocessing_options.get("denoise", False):
                prep_list.append("Noise reduction")
            contrast_value = preprocessing_options.get("contrast", 0)
            if contrast_value != 0:
                direction = "increased" if contrast_value > 0 else "decreased"
                prep_list.append(f"Contrast {direction} by {abs(contrast_value)}")
            rotation = preprocessing_options.get("rotation", "None")
            if rotation != "None":
                # Display-only cleanup of the mis-encoded degree sign
                prep_list.append(str(rotation).replace("Β°", "°"))
            for item in prep_list:
                st.write(f"- {item}")

        # Process button - show only when file is uploaded
        if uploaded_file is not None:
            # Check file size (cap at 20MB)
            file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
            if file_size_mb > 20:
                st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 20MB.")
            else:
                st.write(f"**File:** {uploaded_file.name} ({file_size_mb:.2f} MB)")
                st.markdown('<div class="process-button">', unsafe_allow_html=True)
                process_button = st.button("Process Document",
                                           type="primary",
                                           use_container_width=True,
                                           help="Start OCR processing with the selected options")
                st.markdown('</div>', unsafe_allow_html=True)

    # Preview column
    with col2:
        if uploaded_file is not None:
            with st.expander("Document Preview", expanded=False):
                file_ext = Path(uploaded_file.name).suffix.lower()

                if uploaded_file.type and uploaded_file.type.startswith('image/'):
                    # For image files
                    preview_tabs = st.tabs(["Original"])
                    with preview_tabs[0]:
                        try:
                            image = safe_open_image(uploaded_file.getvalue())
                            if image:
                                st.image(image, caption=uploaded_file.name, width=400)
                            else:
                                st.info("Image preview not available")
                        except Exception:
                            st.info("Image preview could not be displayed")

                    # Add processed preview ONLY if substantive preprocessing options are selected
                    if substantive_options:
                        st.subheader("Preprocessing Preview")
                        try:
                            processed_bytes = preprocess_image(uploaded_file.getvalue(), preprocessing_options)
                            processed_image = safe_open_image(processed_bytes)

                            # Separate names so the outer layout columns are not rebound
                            before_col, after_col = st.columns(2)
                            with before_col:
                                st.write("**Original**")
                                image = safe_open_image(uploaded_file.getvalue())
                                if image:
                                    st.image(image, width=300)
                            with after_col:
                                st.write("**Processed**")
                                if processed_image:
                                    st.image(processed_image, width=300)
                                else:
                                    st.info("Processed preview not available")
                        except Exception:
                            st.info("Preprocessing preview could not be generated")

                elif file_ext == ".pdf":
                    # For PDF files
                    try:
                        pdf_bytes = uploaded_file.getvalue()
                        with st.spinner("Generating PDF preview..."):
                            # Only the first page is rendered for the preview
                            images = convert_from_bytes(pdf_bytes, first_page=1, last_page=1, dpi=150)
                        if images:
                            first_page = images[0]
                            img_bytes = io.BytesIO()
                            first_page.save(img_bytes, format='JPEG')
                            img_bytes.seek(0)
                            st.image(img_bytes, caption=f"PDF Preview: {uploaded_file.name}", width=400)

                            # Read the page count from the PDF metadata instead
                            # of rendering every page just to count them. The
                            # count is optional, so any failure here (including
                            # an older pdf2image without this helper) only
                            # hides the page-count line.
                            try:
                                from pdf2image import pdfinfo_from_bytes
                                page_total = pdfinfo_from_bytes(pdf_bytes)["Pages"]
                            except Exception:
                                page_total = None
                            if page_total is not None:
                                st.info(f"PDF document with {page_total} pages")
                        else:
                            st.info(f"PDF preview not available: {uploaded_file.name}")
                    except Exception:
                        st.info(f"PDF preview could not be displayed: {uploaded_file.name}")

    # Results section - spans full width
    # NOTE(review): results are only drawn on the run where the button was
    # clicked; widgets inside this section trigger a rerun without them.
    if process_button:
        st.markdown('<div class="processing-results">', unsafe_allow_html=True)
        st.markdown("---")
        st.subheader("Processing Results")

        try:
            # Process the file with selected options
            result = process_file(uploaded_file, use_vision, preprocessing_options)

            # Save result to session state
            st.session_state.current_result = result
            if result not in st.session_state.previous_results:
                st.session_state.previous_results.append(result)
            # Keep only the last 10 results to avoid memory issues
            if len(st.session_state.previous_results) > 10:
                st.session_state.previous_results.pop(0)

            # Create tabs for viewing results
            has_images = result.get('has_images', False)
            if has_images:
                result_tabs = st.tabs(["Structured View", "Raw JSON", "With Images"])
            else:
                result_tabs = st.tabs(["Structured View", "Raw JSON"])

            # Structured view tab
            with result_tabs[0]:
                st.write(f"**File:** {result.get('file_name', uploaded_file.name)}")

                languages_label = _join_labels(result.get('languages'))
                if languages_label:
                    st.write(f"**Languages:** {languages_label}")

                topics_label = _join_labels(result.get('topics'))
                if topics_label:
                    st.write(f"**Topics:** {topics_label}")

                if 'limited_pages' in result:
                    st.info(f"Processed {result['limited_pages']['processed']} of {result['limited_pages']['total']} pages")

                if 'ocr_contents' in result:
                    st.markdown("## Document Contents")
                    if isinstance(result['ocr_contents'], dict):
                        _render_structured_contents(result['ocr_contents'])

                st.markdown('</div>', unsafe_allow_html=True)  # Close processing-results div

                # Download button
                with st.expander("Export Content"):
                    html_content = _build_export_html(result.get('ocr_contents'))
                    html_bytes = html_content.encode()
                    st.download_button(
                        label="Download as HTML",
                        data=html_bytes,
                        file_name="document_content.html",
                        mime="text/html",
                        key=f"download_html_{hash(result.get('file_name', 'doc'))}"
                    )

            # Raw JSON tab
            with result_tabs[1]:
                st.json(result)

            # Images tab (if available)
            if has_images:
                with result_tabs[2]:
                    try:
                        # Imported here so a missing helper only disables this tab
                        from ocr_utils import create_html_with_images

                        if 'pages_data' not in result:
                            st.warning("No image data available in the OCR response.")
                        else:
                            pages_data = result.get('pages_data', [])
                            image_count = sum(len(page.get('images', [])) for page in pages_data)
                            if image_count > 10:
                                st.warning(f"This document contains {image_count} images. Rendering may take longer.")

                            page_count = len(pages_data)
                            st.write(f"**Document contains {page_count} page{'' if page_count == 1 else 's'} with {image_count} image{'' if image_count == 1 else 's'} total**")

                            # Page selector (the rendered HTML below always
                            # contains the whole document)
                            if page_count > 1:
                                page_options = [f"Page {i+1}" for i in range(page_count)]
                                selected_page = st.selectbox("Select page to view:", options=page_options)
                                st.write(f"**Viewing {selected_page}**")

                            with st.spinner("Generating document with embedded images..."):
                                html_with_images = create_html_with_images(result)

                            # Display document in a fixed height container with scrolling
                            st.write("**Document with Original Images**")
                            st.components.v1.html(html_with_images, height=600, scrolling=True)

                            # Provide a download option
                            info_col, button_col = st.columns([3, 1])
                            with button_col:
                                st.download_button(
                                    label="Download with Images",
                                    data=html_with_images,
                                    file_name=f"{result.get('file_name', 'document')}_with_images.html",
                                    mime="text/html",
                                    use_container_width=True,
                                    key=f"download_images_{hash(result.get('file_name', 'doc'))}"
                                )
                            with info_col:
                                st.info("This HTML document includes the original document images embedded at their correct positions.")
                                st.write("Original filenames and image positions are preserved in the downloaded file.")
                    except Exception as e:
                        st.error(f"Could not display document with images: {str(e)}")
        except Exception as e:
            st.error(f"Error processing document: {str(e)}")

    # Show guidance when no file is uploaded
    elif uploaded_file is None:
        st.markdown('</div>', unsafe_allow_html=True)  # Close compact-layout div
        # NOTE(review): the pointing emoji replaces a mis-encoded character.
        st.info("👆 Upload a document to get started. Supported formats: JPG, PNG, PDF")

        with st.expander("Tips for best results"):
            st.markdown(
                "**For best OCR results:**\n\n"
                "1. **Image quality** - Higher resolution images produce better results\n"
                "2. **Document orientation** - Use rotation options for incorrectly oriented documents\n"
                "3. **Preprocessing** - Try grayscale and thresholding for low-contrast documents\n"
                "4. **File size** - Keep files under 10MB for best API performance\n\n"
                "**File preservation:** Original filenames are preserved in the results."
            )