Spaces:

milwright
/

historical-ocr

Running

App Files Community

milwright committed 7 days ago

Commit

b4338aa

1 Parent(s): 8a9c37d

Fix image processing issues

Browse files

Files changed (3) hide show

app.py +16 -7
ocr_utils.py +4 -0
structured_ocr.py +53 -4

app.py CHANGED Viewed

@@ -45,6 +45,9 @@ def preprocess_image(image_bytes, preprocessing_options):
     """Preprocess image with selected options"""
     # Convert bytes to OpenCV format
     image = Image.open(io.BytesIO(image_bytes))
     img_array = np.array(image)
     # Apply preprocessing based on selected options
@@ -60,6 +63,11 @@ def preprocess_image(image_bytes, preprocessing_options):
         img_array = np.array(image)
     if preprocessing_options.get("denoise", False):
         img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
     if preprocessing_options.get("threshold", False):
@@ -146,18 +154,19 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
         # Get file size in MB
         file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
-        # Check if file exceeds size limits (20 MB)
-        if file_size_mb > 20:
-            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 20MB.")
             return {
                 "file_name": uploaded_file.name,
                 "topics": ["Document"],
                 "languages": ["English"],
                 "confidence_score": 0.0,
-                "error": f"File size {file_size_mb:.2f} MB exceeds limit of 20 MB",
                 "ocr_contents": {
-                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds limit of 20 MB",
-                    "partial_text": "Document could not be processed due to size limitations."
                 }
             }
@@ -203,7 +212,7 @@ with main_tab1:
         Using the `mistral-ocr-latest` model for advanced document understanding.
         """)
-        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 20MB per file")
 # Sidebar with options
 with st.sidebar:

     """Preprocess image with selected options"""
     # Convert bytes to OpenCV format
     image = Image.open(io.BytesIO(image_bytes))
+    # Ensure image is in RGB mode for OpenCV processing
+    if image.mode != 'RGB':
+        image = image.convert('RGB')
     img_array = np.array(image)
     # Apply preprocessing based on selected options
         img_array = np.array(image)
     if preprocessing_options.get("denoise", False):
+        # Ensure the image is in the correct format for denoising (CV_8UC3)
+        if len(img_array.shape) != 3 or img_array.shape[2] != 3:
+            # Convert to RGB if it's not already a 3-channel color image
+            if len(img_array.shape) == 2:  # Grayscale
+                img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
         img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
     if preprocessing_options.get("threshold", False):
         # Get file size in MB
         file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
+        # Check if file exceeds size limits (10 MB for API processing)
+        # This is a lower limit than the UI file size to ensure API requests don't fail
+        if file_size_mb > 10:
+            st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size for API processing is 10MB.")
             return {
                 "file_name": uploaded_file.name,
                 "topics": ["Document"],
                 "languages": ["English"],
                 "confidence_score": 0.0,
+                "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
                 "ocr_contents": {
+                    "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
+                    "partial_text": "Document could not be processed due to API limitations. Try reducing the file size or resolution."
                 }
             }
         Using the `mistral-ocr-latest` model for advanced document understanding.
         """)
+        uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
 # Sidebar with options
 with st.sidebar:

ocr_utils.py CHANGED Viewed

@@ -157,6 +157,10 @@ def get_combined_markdown_compressed(ocr_response, max_width: int = 1200, qualit
                 # Open with PIL
                 pil_img = Image.open(io.BytesIO(img_bytes))
                 # Resize if needed (maintain aspect ratio)
                 original_width, original_height = pil_img.size
                 if original_width > max_width:

                 # Open with PIL
                 pil_img = Image.open(io.BytesIO(img_bytes))
+                # Convert to RGB if not already (to ensure CV_8UC3 format)
+                if pil_img.mode != 'RGB':
+                    pil_img = pil_img.convert('RGB')
                 # Resize if needed (maintain aspect ratio)
                 original_width, original_height = pil_img.size
                 if original_width > max_width:

structured_ocr.py CHANGED Viewed

@@ -288,10 +288,59 @@ class StructuredOCR:
         logger.info(f"Processing image: {file_path}")
         try:
-            # Read and encode the image file
-            logger.info("Encoding image for API")
-            encoded_image = base64.b64encode(file_path.read_bytes()).decode()
-            base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
             # Process the image with OCR
             logger.info(f"Processing image with OCR using {OCR_MODEL}")

         logger.info(f"Processing image: {file_path}")
         try:
+            # Check file size
+            file_size_mb = file_path.stat().st_size / (1024 * 1024)
+            logger.info(f"Original image size: {file_size_mb:.2f} MB")
+            # If image is larger than 4MB, resize it to reduce API payload size
+            if file_size_mb > 4:
+                logger.info("Image is large, resizing before API submission")
+                try:
+                    from PIL import Image
+                    import io
+                    # Open and resize the image
+                    with Image.open(file_path) as img:
+                        # Convert to RGB if not already (prevents mode errors)
+                        if img.mode != 'RGB':
+                            img = img.convert('RGB')
+                        # Calculate new dimensions (maintain aspect ratio)
+                        # Target around 2000-3000 pixels on longest side for good OCR quality
+                        width, height = img.size
+                        max_dimension = max(width, height)
+                        target_dimension = 2500  # Good balance between quality and size
+                        if max_dimension > target_dimension:
+                            scale_factor = target_dimension / max_dimension
+                            new_width = int(width * scale_factor)
+                            new_height = int(height * scale_factor)
+                            img = img.resize((new_width, new_height), Image.LANCZOS)
+                        # Save to bytes with compression
+                        buffer = io.BytesIO()
+                        img.save(buffer, format="JPEG", quality=85, optimize=True)
+                        buffer.seek(0)
+                        # Get the base64
+                        encoded_image = base64.b64encode(buffer.getvalue()).decode()
+                        base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+                        # Log the new size
+                        new_size_mb = len(buffer.getvalue()) / (1024 * 1024)
+                        logger.info(f"Resized image to {new_size_mb:.2f} MB")
+                except ImportError:
+                    logger.warning("PIL not available for resizing. Using original image.")
+                    encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                    base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+                except Exception as e:
+                    logger.warning(f"Image resize failed: {str(e)}. Using original image.")
+                    encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                    base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
+            else:
+                # For smaller images, use as-is
+                encoded_image = base64.b64encode(file_path.read_bytes()).decode()
+                base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
             # Process the image with OCR
             logger.info(f"Processing image with OCR using {OCR_MODEL}")