Spaces:
Running
Running
Fix image processing issues
Browse files
- app.py +16 -7
- ocr_utils.py +4 -0
- structured_ocr.py +53 -4
app.py
CHANGED
@@ -45,6 +45,9 @@ def preprocess_image(image_bytes, preprocessing_options):
|
|
45 |
"""Preprocess image with selected options"""
|
46 |
# Convert bytes to OpenCV format
|
47 |
image = Image.open(io.BytesIO(image_bytes))
|
|
|
|
|
|
|
48 |
img_array = np.array(image)
|
49 |
|
50 |
# Apply preprocessing based on selected options
|
@@ -60,6 +63,11 @@ def preprocess_image(image_bytes, preprocessing_options):
|
|
60 |
img_array = np.array(image)
|
61 |
|
62 |
if preprocessing_options.get("denoise", False):
|
|
|
|
|
|
|
|
|
|
|
63 |
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
|
64 |
|
65 |
if preprocessing_options.get("threshold", False):
|
@@ -146,18 +154,19 @@ def process_file(uploaded_file, use_vision=True, preprocessing_options=None):
|
|
146 |
# Get file size in MB
|
147 |
file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
|
148 |
|
149 |
-
# Check if file exceeds size limits (
|
150 |
-
|
151 |
-
|
|
|
152 |
return {
|
153 |
"file_name": uploaded_file.name,
|
154 |
"topics": ["Document"],
|
155 |
"languages": ["English"],
|
156 |
"confidence_score": 0.0,
|
157 |
-
"error": f"File size {file_size_mb:.2f} MB exceeds limit of
|
158 |
"ocr_contents": {
|
159 |
-
"error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds limit of
|
160 |
-
"partial_text": "Document could not be processed due to
|
161 |
}
|
162 |
}
|
163 |
|
@@ -203,7 +212,7 @@ with main_tab1:
|
|
203 |
|
204 |
Using the `mistral-ocr-latest` model for advanced document understanding.
|
205 |
""")
|
206 |
-
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"]
|
207 |
|
208 |
# Sidebar with options
|
209 |
with st.sidebar:
|
|
|
45 |
"""Preprocess image with selected options"""
|
46 |
# Convert bytes to OpenCV format
|
47 |
image = Image.open(io.BytesIO(image_bytes))
|
48 |
+
# Ensure image is in RGB mode for OpenCV processing
|
49 |
+
if image.mode != 'RGB':
|
50 |
+
image = image.convert('RGB')
|
51 |
img_array = np.array(image)
|
52 |
|
53 |
# Apply preprocessing based on selected options
|
|
|
63 |
img_array = np.array(image)
|
64 |
|
65 |
if preprocessing_options.get("denoise", False):
|
66 |
+
# Ensure the image is in the correct format for denoising (CV_8UC3)
|
67 |
+
if len(img_array.shape) != 3 or img_array.shape[2] != 3:
|
68 |
+
# Convert to RGB if it's not already a 3-channel color image
|
69 |
+
if len(img_array.shape) == 2: # Grayscale
|
70 |
+
img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)
|
71 |
img_array = cv2.fastNlMeansDenoisingColored(img_array, None, 10, 10, 7, 21)
|
72 |
|
73 |
if preprocessing_options.get("threshold", False):
|
|
|
154 |
# Get file size in MB
|
155 |
file_size_mb = os.path.getsize(temp_path) / (1024 * 1024)
|
156 |
|
157 |
+
# Check if file exceeds size limits (10 MB for API processing)
|
158 |
+
# This is a lower limit than the UI file size to ensure API requests don't fail
|
159 |
+
if file_size_mb > 10:
|
160 |
+
st.error(f"File too large ({file_size_mb:.1f} MB). Maximum file size for API processing is 10MB.")
|
161 |
return {
|
162 |
"file_name": uploaded_file.name,
|
163 |
"topics": ["Document"],
|
164 |
"languages": ["English"],
|
165 |
"confidence_score": 0.0,
|
166 |
+
"error": f"File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
|
167 |
"ocr_contents": {
|
168 |
+
"error": f"Failed to process file: File size {file_size_mb:.2f} MB exceeds API limit of 10 MB",
|
169 |
+
"partial_text": "Document could not be processed due to API limitations. Try reducing the file size or resolution."
|
170 |
}
|
171 |
}
|
172 |
|
|
|
212 |
|
213 |
Using the `mistral-ocr-latest` model for advanced document understanding.
|
214 |
""")
|
215 |
+
uploaded_file = st.file_uploader("Choose a file", type=["pdf", "png", "jpg", "jpeg"])
|
216 |
|
217 |
# Sidebar with options
|
218 |
with st.sidebar:
|
ocr_utils.py
CHANGED
@@ -157,6 +157,10 @@ def get_combined_markdown_compressed(ocr_response, max_width: int = 1200, qualit
|
|
157 |
# Open with PIL
|
158 |
pil_img = Image.open(io.BytesIO(img_bytes))
|
159 |
|
|
|
|
|
|
|
|
|
160 |
# Resize if needed (maintain aspect ratio)
|
161 |
original_width, original_height = pil_img.size
|
162 |
if original_width > max_width:
|
|
|
157 |
# Open with PIL
|
158 |
pil_img = Image.open(io.BytesIO(img_bytes))
|
159 |
|
160 |
+
# Convert to RGB if not already (to ensure CV_8UC3 format)
|
161 |
+
if pil_img.mode != 'RGB':
|
162 |
+
pil_img = pil_img.convert('RGB')
|
163 |
+
|
164 |
# Resize if needed (maintain aspect ratio)
|
165 |
original_width, original_height = pil_img.size
|
166 |
if original_width > max_width:
|
structured_ocr.py
CHANGED
@@ -288,10 +288,59 @@ class StructuredOCR:
|
|
288 |
logger.info(f"Processing image: {file_path}")
|
289 |
|
290 |
try:
|
291 |
-
#
|
292 |
-
|
293 |
-
|
294 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
|
296 |
# Process the image with OCR
|
297 |
logger.info(f"Processing image with OCR using {OCR_MODEL}")
|
|
|
288 |
logger.info(f"Processing image: {file_path}")
|
289 |
|
290 |
try:
|
291 |
+
# Check file size
|
292 |
+
file_size_mb = file_path.stat().st_size / (1024 * 1024)
|
293 |
+
logger.info(f"Original image size: {file_size_mb:.2f} MB")
|
294 |
+
|
295 |
+
# If image is larger than 4MB, resize it to reduce API payload size
|
296 |
+
if file_size_mb > 4:
|
297 |
+
logger.info("Image is large, resizing before API submission")
|
298 |
+
try:
|
299 |
+
from PIL import Image
|
300 |
+
import io
|
301 |
+
|
302 |
+
# Open and resize the image
|
303 |
+
with Image.open(file_path) as img:
|
304 |
+
# Convert to RGB if not already (prevents mode errors)
|
305 |
+
if img.mode != 'RGB':
|
306 |
+
img = img.convert('RGB')
|
307 |
+
|
308 |
+
# Calculate new dimensions (maintain aspect ratio)
|
309 |
+
# Target around 2000-3000 pixels on longest side for good OCR quality
|
310 |
+
width, height = img.size
|
311 |
+
max_dimension = max(width, height)
|
312 |
+
target_dimension = 2500 # Good balance between quality and size
|
313 |
+
|
314 |
+
if max_dimension > target_dimension:
|
315 |
+
scale_factor = target_dimension / max_dimension
|
316 |
+
new_width = int(width * scale_factor)
|
317 |
+
new_height = int(height * scale_factor)
|
318 |
+
img = img.resize((new_width, new_height), Image.LANCZOS)
|
319 |
+
|
320 |
+
# Save to bytes with compression
|
321 |
+
buffer = io.BytesIO()
|
322 |
+
img.save(buffer, format="JPEG", quality=85, optimize=True)
|
323 |
+
buffer.seek(0)
|
324 |
+
|
325 |
+
# Get the base64
|
326 |
+
encoded_image = base64.b64encode(buffer.getvalue()).decode()
|
327 |
+
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
328 |
+
|
329 |
+
# Log the new size
|
330 |
+
new_size_mb = len(buffer.getvalue()) / (1024 * 1024)
|
331 |
+
logger.info(f"Resized image to {new_size_mb:.2f} MB")
|
332 |
+
except ImportError:
|
333 |
+
logger.warning("PIL not available for resizing. Using original image.")
|
334 |
+
encoded_image = base64.b64encode(file_path.read_bytes()).decode()
|
335 |
+
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
336 |
+
except Exception as e:
|
337 |
+
logger.warning(f"Image resize failed: {str(e)}. Using original image.")
|
338 |
+
encoded_image = base64.b64encode(file_path.read_bytes()).decode()
|
339 |
+
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
340 |
+
else:
|
341 |
+
# For smaller images, use as-is
|
342 |
+
encoded_image = base64.b64encode(file_path.read_bytes()).decode()
|
343 |
+
base64_data_url = f"data:image/jpeg;base64,{encoded_image}"
|
344 |
|
345 |
# Process the image with OCR
|
346 |
logger.info(f"Processing image with OCR using {OCR_MODEL}")
|