milwright committed on
Commit
b4338aa
·
1 Parent(s): 8a9c37d

Fix image processing issues

Browse files
Files changed (3) hide show
  1. app.py +16 -7
  2. ocr_utils.py +4 -0
  3. structured_ocr.py +53 -4
app.py CHANGED
@@ -45,6 +45,9 @@ def preprocess_image(image_bytes, preprocessing_options):
45
  """Preprocess image with selected options"""
46
  # Convert bytes to OpenCV format
47
  image = Image.open(io.BytesIO(image_bytes))
 
 
 
48
  img_array = np.array(image)
49
 
50
  # Apply preprocessing based on selected options
@@ -60,6 +63,11 @@ def preprocess_image(image_bytes, preprocessing_options):
60
  img_array = np.array(image)
61
 
62
  if preprocessing_options.get("denoise", False):
 
 
 
 
 
63
  img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
64
 
65
  if preprocessing_options.get("threshold", False):
@@ -146,18 +154,19 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
146
  # Get file size in MB
147
  file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
148
 
149
- # Check if file exceeds size limits (20 MB)
150
- if file_size_mb > 20:
151
- st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size is 20MB.")
 
152
  return {
153
  "file_name": uploaded_file.name,
154
  "topics": ["Document"],
155
  "languages": ["English"],
156
  "confidence_score": 0.0,
157
- "error": f"File size {file_size_mb:.2f} MB exceeds limit of 20 MB",
158
  "ocr_contents": {
159
- "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds limit of 20 MB",
160
- "partial_text": "Document could not be processed due to size limitations."
161
  }
162
  }
163
 
@@ -203,7 +212,7 @@ with main_tab1:
203
 
204
  Using the `mistral-ocr-latest` model for advanced document understanding.
205
  """)
206
- uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"], help="Limit 20MB per file")
207
 
208
  # Sidebar with options
209
  with st.sidebar:
 
45
  """Preprocess image with selected options"""
46
  # Convert bytes to OpenCV format
47
  image = Image.open(io.BytesIO(image_bytes))
48
+ # Ensure image is in RGB mode for OpenCV processing
49
+ if image.mode != 'RGB':
50
+ image = image.convert('RGB')
51
  img_array = np.array(image)
52
 
53
  # Apply preprocessing based on selected options
 
63
  img_array = np.array(image)
64
 
65
  if preprocessing_options.get("denoise", False):
66
+ # Ensure the image is in the correct format for denoising (CV_8UC3)
67
+ if len(img_array.shape) != 3 or img_array.shape[2] != 3:
68
+ # Convert to RGB if it's not already a 3-channel color image
69
+ if len(img_array.shape) == 2: # Grayscale
70
+ img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
71
  img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
72
 
73
  if preprocessing_options.get("threshold", False):
 
154
  # Get file size in MB
155
  file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
156
 
157
+ # Check if file exceeds size limits (10 MB for API processing)
158
+ # This is a lower limit than the UI file size to ensure API requests don't fail
159
+ if file_size_mb > 10:
160
+ st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size for API processing is 10MB.")
161
  return {
162
  "file_name": uploaded_file.name,
163
  "topics": ["Document"],
164
  "languages": ["English"],
165
  "confidence_score": 0.0,
166
+ "error": f"File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
167
  "ocr_contents": {
168
+ "error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
169
+ "partial_text": "Document could not be processed due to API limitations. Try reducing the file size or resolution."
170
  }
171
  }
172
 
 
212
 
213
  Using the `mistral-ocr-latest` model for advanced document understanding.
214
  """)
215
+ uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
216
 
217
  # Sidebar with options
218
  with st.sidebar:
ocr_utils.py CHANGED
@@ -157,6 +157,10 @@ def get_combined_markdown_compressed(ocr_response, max_width: int = 1200, qualit
157
  # Open with PIL
158
  pil_img = Image.open(io.BytesIO(img_bytes))
159
 
 
 
 
 
160
  # Resize if needed (maintain aspect ratio)
161
  original_width, original_height = pil_img.size
162
  if original_width > max_width:
 
157
  # Open with PIL
158
  pil_img = Image.open(io.BytesIO(img_bytes))
159
 
160
+ # Convert to RGB if not already (to ensure CV_8UC3 format)
161
+ if pil_img.mode != 'RGB':
162
+ pil_img = pil_img.convert('RGB')
163
+
164
  # Resize if needed (maintain aspect ratio)
165
  original_width, original_height = pil_img.size
166
  if original_width > max_width:
structured_ocr.py CHANGED
@@ -288,10 +288,59 @@ class StructuredOCR:
288
  logger.info(f"Processing image: {file_path}")
289
 
290
  try:
291
- # Read and encode the image file
292
- logger.info("Encoding image for API")
293
- encoded_image = base64.b64encode(file_path.read_bytes()).decode()
294
- base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
  # Process the image with OCR
297
  logger.info(f"Processing image with OCR using {OCR_MODEL}")
 
288
  logger.info(f"Processing image: {file_path}")
289
 
290
  try:
291
+ # Check file size
292
+ file_size_mb = file_path.stat().st_size / (1024 * 1024)
293
+ logger.info(f"Original image size: {file_size_mb:.2f} MB")
294
+
295
+ # If image is larger than 4MB, resize it to reduce API payload size
296
+ if file_size_mb > 4:
297
+ logger.info("Image is large, resizing before API submission")
298
+ try:
299
+ from PIL import Image
300
+ import io
301
+
302
+ # Open and resize the image
303
+ with Image.open(file_path) as img:
304
+ # Convert to RGB if not already (prevents mode errors)
305
+ if img.mode != 'RGB':
306
+ img = img.convert('RGB')
307
+
308
+ # Calculate new dimensions (maintain aspect ratio)
309
+ # Target around 2000-3000 pixels on longest side for good OCR quality
310
+ width, height = img.size
311
+ max_dimension = max(width, height)
312
+ target_dimension = 2500 # Good balance between quality and size
313
+
314
+ if max_dimension > target_dimension:
315
+ scale_factor = target_dimension / max_dimension
316
+ new_width = int(width * scale_factor)
317
+ new_height = int(height * scale_factor)
318
+ img = img.resize((new_width, new_height), Image.LANCZOS)
319
+
320
+ # Save to bytes with compression
321
+ buffer = io.BytesIO()
322
+ img.save(buffer, format="JPEG", quality=85, optimize=True)
323
+ buffer.seek(0)
324
+
325
+ # Get the base64
326
+ encoded_image = base64.b64encode(buffer.getvalue()).decode()
327
+ base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
328
+
329
+ # Log the new size
330
+ new_size_mb = len(buffer.getvalue()) / (1024 * 1024)
331
+ logger.info(f"Resized image to {new_size_mb:.2f} MB")
332
+ except ImportError:
333
+ logger.warning("PIL not available for resizing. Using original image.")
334
+ encoded_image = base64.b64encode(file_path.read_bytes()).decode()
335
+ base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
336
+ except Exception as e:
337
+ logger.warning(f"Image resize failed: {str(e)}. Using original image.")
338
+ encoded_image = base64.b64encode(file_path.read_bytes()).decode()
339
+ base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
340
+ else:
341
+ # For smaller images, use as-is
342
+ encoded_image = base64.b64encode(file_path.read_bytes()).decode()
343
+ base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
344
 
345
  # Process the image with OCR
346
  logger.info(f"Processing image with OCR using {OCR_MODEL}")