# File size: 9,268 bytes. (The commit-hash and line-number gutters emitted by the web file viewer were removed; they are not part of the program.)
import easyocr
import numpy as np
from PIL import Image
from transformers import pipeline
import gradio as gr
import pdf2image
import PyPDF2
import io
import pandas as pd
import logging
from datetime import datetime
import os
import torch
# Process-wide runtime setup; runs once when the module is imported.

# Logging: INFO level on the root logger, plus this module's own logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# PyTorch tuning knobs (see the PyTorch CUDA notes for their exact semantics):
# an allocator option passed through the environment, and cuDNN autotuning.
# If GPU memory still runs short, calling gc.collect() followed by
# torch.cuda.empty_cache() at this point is a possible further mitigation.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
torch.backends.cudnn.benchmark = True

# Device index handed to the model constructors: 0 when CUDA is available,
# -1 otherwise (the value is also used below to decide the log message).
if torch.cuda.is_available():
    device = 0
else:
    device = -1
logger.info(f"Using device: {'CUDA' if device == 0 else 'CPU'}")
# Model construction: one OCR reader and two Hugging Face pipelines.
def init_models():
    """Build and return the OCR reader and the two classification pipelines.

    Reads the module-level ``device`` value: it is passed unchanged to both
    ``pipeline`` calls, and EasyOCR's ``gpu`` flag is true exactly when
    ``device == 0``.

    Returns:
        tuple: ``(reader, text_classifier, doc_classifier)`` in that order.

    Raises:
        Exception: whatever the constructors raise; the error is logged and
            then re-raised unchanged.
    """
    try:
        use_gpu = device == 0

        # EasyOCR reader for English; weights are stored under ./models and
        # downloading is enabled.
        ocr_reader = easyocr.Reader(
            ['en'],
            gpu=use_gpu,
            model_storage_directory='./models',
            download_enabled=True,
        )

        # Text-classification pipeline.
        text_clf = pipeline(
            "text-classification",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=device,
            model_kwargs={"low_cpu_mem_usage": True},
        )

        # Image-classification pipeline applied to the page image.
        image_clf = pipeline(
            "image-classification",
            model="microsoft/dit-base-finetuned-rvlcdip",
            device=device,
            model_kwargs={"low_cpu_mem_usage": True},
        )
    except Exception as e:
        logger.error(f"Error initializing models: {e}")
        raise
    return ocr_reader, text_clf, image_clf
# Load the models once at import time; a failure is logged and aborts start-up.
logger.info("Initializing models...")
try:
    reader, text_classifier, doc_classifier = init_models()
except Exception as e:
    logger.error(f"Failed to initialize models: {e}")
    raise
else:
    logger.info("Models initialized successfully")
def validate_insurance_claim(text):
    """Report whether *text* mentions any insurance-claim keyword.

    The check is a case-insensitive substring search, so a keyword also
    matches inside a longer word (for example ``claims``).

    Args:
        text: Text extracted from the document. ``None`` or an empty string
            is treated as "no keywords found".

    Returns:
        bool: ``True`` if at least one keyword occurs in *text*.
    """
    if not text:
        return False
    keywords = ('claim', 'policy', 'insurance', 'damage', 'loss', 'accident', 'coverage')
    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)
def process_document(file):
    """Run OCR, keyword validation and classification on one uploaded file.

    Only the first page of a PDF is processed. Every failure is reported as a
    message in the first element of the returned tuple; nothing is raised.

    Args:
        file: The value supplied by the upload component. Either a filesystem
            path (``str`` / ``os.PathLike``) or an object exposing the path as
            ``.name``; ``None`` when nothing is uploaded.

    Returns:
        tuple: ``(extracted_text, validation_summary, classification_summary)``
        on success, or ``(message, None, None)`` when the file is missing,
        unsupported, unreadable, or does not look like an insurance claim.
    """
    try:
        if file is None:
            return "Please upload an insurance claim document", None, None

        # Accept both a plain path and a tempfile-like wrapper, so the handler
        # does not depend on which form the upload component delivers.
        if isinstance(file, (str, os.PathLike)):
            file_path = os.fspath(file)
        else:
            file_path = file.name
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            try:
                # Read from the path rather than file.read(): a path string
                # has no read(), and a wrapper may not be positioned at 0.
                images = pdf2image.convert_from_path(file_path, first_page=1, last_page=1)
                if not images:
                    return "Failed to process insurance claim PDF", None, None
                image = images[0]
            except Exception as e:
                logger.error("PDF processing error: %s", e)
                return "Error processing PDF file", None, None
        elif file_extension in ('.png', '.jpg', '.jpeg'):
            try:
                # Image.open is lazy; converting inside the context manager
                # forces decoding here and releases the file handle.
                with Image.open(file_path) as opened:
                    image = opened.convert('RGB')
            except Exception as e:
                logger.error("Image processing error: %s", e)
                return "Error processing image file", None, None
        else:
            return "Unsupported file format. Please upload PDF or image files.", None, None

        # The OCR and classifier inputs are expected to be RGB.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        try:
            ocr_result = reader.readtext(np.array(image))
            # Each OCR entry carries its recognised string at index 1.
            fragments = [entry[1] for entry in ocr_result]
        except Exception as e:
            logger.error("Text extraction error: %s", e)
            return "Error extracting text from document", None, None
        text = ' '.join(fragments)

        # Validate first so non-claims skip formatting and classification.
        if not validate_insurance_claim(text):
            return "Document does not appear to be an insurance claim", None, None

        # format_insurance_claim works line by line, so give it one OCR
        # fragment per line; the space-joined text would be a single line.
        formatted_text = format_insurance_claim('\n'.join(fragments))

        # A classifier failure degrades to a neutral 0.5 score instead of
        # failing the whole request.
        try:
            text_analysis = text_classifier(text[:512])[0]
        except Exception as e:
            logger.error("Text classification error: %s", e)
            text_analysis = {'score': 0.5}

        try:
            doc_analysis = doc_classifier(image)[0]
        except Exception as e:
            logger.error("Document classification error: %s", e)
            doc_analysis = {'score': 0.5}

        # NOTE(review): the validity verdict is driven only by the text
        # classifier's score; confirm that model's score is a meaningful
        # proxy for claim validity.
        validation_result = analyze_claim_validity(text_analysis['score'])

        validation_summary = (
            f"Claim Status: {validation_result['status']}\n"
            f"Confidence Score: {text_analysis['score']:.2f}\n"
            f"Validation Notes: {validation_result['notes']}"
        )
        # NOTE(review): document/form type are fixed strings; only the
        # confidence comes from the document classifier.
        classification_summary = (
            "Document Type: Insurance Claim Form\n"
            "Form Type: NUCC Health Insurance Claim\n"
            f"Confidence: {doc_analysis['score']:.2f}"
        )
        return formatted_text, validation_summary, classification_summary
    except Exception as e:
        logger.error("General processing error: %s", e)
        return "Error processing document", None, None
def format_insurance_claim(text):
    """Keep only the lines of *text* that look like claim-form fields.

    A line containing ``HEALTH INSURANCE CLAIM FORM`` is emitted with a
    ``Document Type: `` prefix. Any other line containing ``Name:``,
    ``Policy``, ``Provider`` or ``Date`` is emitted stripped of surrounding
    whitespace. All remaining lines are dropped. Matching is case-sensitive.

    Args:
        text: Newline-separated extracted text. ``None`` or an empty string
            yields an empty result.

    Returns:
        str: The selected lines joined with ``\\n`` (possibly empty).
    """
    if not text:
        return ''

    field_markers = ('Name:', 'Policy', 'Provider', 'Date')
    formatted_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if 'HEALTH INSURANCE CLAIM FORM' in line:
            formatted_lines.append(f"Document Type: {line}")
        elif any(marker in line for marker in field_markers):
            formatted_lines.append(line)
    return '\n'.join(formatted_lines)
def analyze_claim_validity(score, high_threshold=0.9, review_threshold=0.7):
    """Map a confidence score to a claim status and an explanatory note.

    Args:
        score: Confidence value to grade. ``None`` is graded like a low score.
        high_threshold: Scores strictly above this are ``VALID``.
        review_threshold: Scores strictly above this (but not above
            *high_threshold*) are ``VALID - REVIEW RECOMMENDED``.

    Returns:
        dict: ``{'status': ..., 'notes': ...}``; anything not above
        *review_threshold* is ``NEEDS REVIEW``.
    """
    needs_review = {
        'status': 'NEEDS REVIEW',
        'notes': 'Low confidence score. Please review manually.'
    }
    # A missing score cannot be compared; treat it as the lowest band.
    if score is None:
        return needs_review
    if score > high_threshold:
        return {
            'status': 'VALID',
            'notes': 'High confidence in claim validity. All required fields present.'
        }
    if score > review_threshold:
        return {
            'status': 'VALID - REVIEW RECOMMENDED',
            'notes': 'Claim appears valid but manual review suggested.'
        }
    return needs_review
# Stylesheet handed to gr.Blocks(css=...) when the interface is built.
# The .file-upload and .output-div rules are attached to components through
# elem_classes, and .description is used by the intro HTML block; the other
# class selectors are not referenced elsewhere in the visible source.
custom_css = """
.gradio-container {
max-width: 900px !important;
margin: auto;
padding-top: 1.5rem;
padding-bottom: 1.5rem;
}
.main-div {
display: flex;
flex-direction: column;
gap: 20px;
}
.container {
border-radius: 10px;
background-color: #ffffff;
box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
margin-bottom: 20px;
padding: 20px;
}
.output-div {
min-height: 100px;
margin-bottom: 10px;
}
h1 {
color: #2a4365;
text-align: center;
font-size: 2.5rem;
margin-bottom: 1rem;
font-weight: bold;
}
.description {
text-align: center;
color: #4a5568;
margin-bottom: 2rem;
}
.file-upload {
border: 2px dashed #cbd5e0;
border-radius: 8px;
padding: 20px;
text-align: center;
transition: all 0.3s ease;
}
.file-upload:hover {
border-color: #4299e1;
}
.output-label {
font-weight: bold;
color: #2d3748;
margin-bottom: 0.5rem;
}
.output-text {
background-color: #f7fafc;
border-radius: 6px;
padding: 12px;
}
"""
# Gradio UI: a heading and description, a file-upload row, and a results row
# with three text boxes. Any change of the uploaded file runs
# process_document and fills the three boxes with its return values.
with gr.Blocks(css=custom_css) as iface:
    gr.HTML("<h1>π Automated Insurance Claim Validation System</h1>")
    gr.HTML("""
<div class="description">
Upload insurance claim documents (PDF or image) for automated validation and analysis.
Our AI system will process and validate your claims instantly.
</div>
""")

    # Upload row; extensions are given with their leading dot.
    with gr.Row(), gr.Column():
        file_input = gr.File(
            label="Upload Insurance Claim Document",
            file_types=[".pdf", ".png", ".jpg", ".jpeg"],
            elem_classes="file-upload",
        )

    # Results row: one box per element of process_document's return tuple.
    with gr.Row(), gr.Column():
        text_output = gr.Textbox(
            label="Extracted Claim Details",
            elem_classes="output-div",
            lines=5,
        )
        validation_output = gr.Textbox(
            label="Claim Validation Results",
            elem_classes="output-div",
        )
        classification_output = gr.Textbox(
            label="Document Classification",
            elem_classes="output-div",
        )

    file_input.change(
        fn=process_document,
        inputs=[file_input],
        outputs=[text_output, validation_output, classification_output],
    )
# Script entry point: serve the interface on every network interface
# (0.0.0.0) at port 7860 when the file is executed directly.
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )
# End of file (the stray "|" left by the web file viewer was removed).