|
import easyocr |
|
import numpy as np |
|
from PIL import Image |
|
from transformers import pipeline |
|
import gradio as gr |
|
import pdf2image |
|
import PyPDF2 |
|
import io |
|
import pandas as pd |
|
import logging |
|
from datetime import datetime |
|
import os |
|
import torch |
|
|
|
|
|
# Allocator split-size setting for PyTorch's CUDA allocator, plus cuDNN
# benchmark mode.
os.environ.update({'PYTORCH_CUDA_ALLOC_CONF': 'max_split_size_mb:512'})
torch.backends.cudnn.benchmark = True

# Root logging at INFO; this module reports through its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device index handed to the Hugging Face pipelines: 0 when CUDA is
# available, -1 otherwise.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
logger.info(f"Using device: {'CUDA' if device == 0 else 'CPU'}")
|
|
|
|
|
def init_models():
    """Build the OCR reader and the two Hugging Face pipelines used by the app.

    Returns:
        A ``(reader, text_classifier, doc_classifier)`` tuple: an EasyOCR
        English reader, a ``text-classification`` pipeline and an
        ``image-classification`` pipeline, in that order.

    Raises:
        Exception: Any failure while constructing a model is logged and then
            re-raised unchanged.
    """

    def load_pipeline(task, model_name):
        # A fresh ``model_kwargs`` dict per call so the pipelines never share it.
        return pipeline(
            task,
            model=model_name,
            device=device,
            model_kwargs={"low_cpu_mem_usage": True},
        )

    try:
        # OCR runs on the GPU only when the module-level device index is 0.
        ocr_reader = easyocr.Reader(
            ['en'],
            gpu=(device == 0),
            model_storage_directory='./models',
            download_enabled=True,
        )
        text_model = load_pipeline(
            "text-classification",
            "distilbert-base-uncased-finetuned-sst-2-english",
        )
        document_model = load_pipeline(
            "image-classification",
            "microsoft/dit-base-finetuned-rvlcdip",
        )
    except Exception as e:
        logger.error(f"Error initializing models: {str(e)}")
        raise

    return ocr_reader, text_model, document_model
|
|
|
# Load every model once at import time; the app cannot run without them, so a
# failure is logged and propagated.
try:
    logger.info("Initializing models...")
    reader, text_classifier, doc_classifier = init_models()
except Exception as e:
    logger.error(f"Failed to initialize models: {str(e)}")
    raise
else:
    logger.info("Models initialized successfully")
|
|
|
def validate_insurance_claim(text):
    """Report whether *text* mentions any insurance-claim keyword.

    The check is a case-insensitive substring match, so a keyword embedded in
    a longer word (for example "loss" inside "glossary") also counts.

    Args:
        text: Text to inspect, typically OCR output. ``None`` or an empty
            string is treated as containing no keywords.

    Returns:
        ``True`` if at least one keyword occurs in the text, else ``False``.
    """
    if not text:
        return False

    keywords = ('claim', 'policy', 'insurance', 'damage', 'loss', 'accident', 'coverage')
    # Lower-case once up front rather than once per keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)
|
|
|
def process_document(file):
    """Run OCR and both classifiers on an uploaded claim document.

    Only the first page of a PDF is analysed.

    Args:
        file: The upload handed over by the Gradio ``File`` component. Either
            a filesystem path (``str`` / ``os.PathLike``) or an object that
            exposes the path as ``.name``; ``None`` when nothing is uploaded.

    Returns:
        A 3-tuple ``(extracted_text, validation_summary,
        classification_summary)`` of strings. On any failure or rejection the
        first element is a user-facing message and the other two are ``None``.
        Exceptions are logged and never propagate to the caller.
    """
    try:
        if file is None:
            return "Please upload an insurance claim document", None, None

        # The upload may arrive as a plain path or as a temp-file wrapper,
        # depending on how the File component is configured; accept both.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name
        file_extension = os.path.splitext(path)[1].lower()

        if file_extension == '.pdf':
            try:
                with open(path, 'rb') as pdf_file:
                    pdf_bytes = pdf_file.read()
                images = pdf2image.convert_from_bytes(pdf_bytes, first_page=1, last_page=1)
                if not images:
                    return "Failed to process insurance claim PDF", None, None
                image = images[0]
            except Exception as e:
                logger.error(f"PDF processing error: {str(e)}")
                return "Error processing PDF file", None, None

        elif file_extension in ('.png', '.jpg', '.jpeg'):
            try:
                # convert() loads the pixel data and returns a detached image,
                # so the underlying file handle can be closed right away.
                with Image.open(path) as opened:
                    image = opened.convert('RGB')
            except Exception as e:
                logger.error(f"Image processing error: {str(e)}")
                return "Error processing image file", None, None
        else:
            return "Unsupported file format. Please upload PDF or image files.", None, None

        # Make sure the rendered PDF page is RGB as well before OCR.
        if image.mode != 'RGB':
            image = image.convert('RGB')

        try:
            result = reader.readtext(np.array(image))
            # Each OCR result carries its recognised text at index 1.
            fragments = [item[1] for item in result]
        except Exception as e:
            logger.error(f"Text extraction error: {str(e)}")
            return "Error extracting text from document", None, None

        text = ' '.join(fragments)

        if not validate_insurance_claim(text):
            return "Document does not appear to be an insurance claim", None, None

        # format_insurance_claim filters line by line, so give it one OCR
        # fragment per line instead of the space-joined blob.
        formatted_text = format_insurance_claim('\n'.join(fragments))

        try:
            text_analysis = text_classifier(text[:512])[0]
        except Exception as e:
            logger.error(f"Text classification error: {str(e)}")
            # Neutral fallback so the report can still be produced.
            text_analysis = {'score': 0.5}

        try:
            doc_analysis = doc_classifier(image)[0]
        except Exception as e:
            logger.error(f"Document classification error: {str(e)}")
            doc_analysis = {'score': 0.5}

        # NOTE(review): only the scores are used; the predicted labels of both
        # classifiers are ignored and the document type below is fixed text.
        validation_result = analyze_claim_validity(text_analysis['score'])

        validation_summary = (
            f"Claim Status: {validation_result['status']}\n"
            f"Confidence Score: {text_analysis['score']:.2f}\n"
            f"Validation Notes: {validation_result['notes']}"
        )
        classification_summary = (
            "Document Type: Insurance Claim Form\n"
            "Form Type: NUCC Health Insurance Claim\n"
            f"Confidence: {doc_analysis['score']:.2f}"
        )
        return formatted_text, validation_summary, classification_summary

    except Exception as e:
        logger.error(f"General processing error: {str(e)}")
        return "Error processing document", None, None
|
|
|
def format_insurance_claim(text):
    """Pick out the form header and key-field lines from extracted claim text.

    Args:
        text: Extracted text, one entry per line. ``None`` or an empty string
            yields an empty result.

    Returns:
        The kept lines, stripped and joined with newlines. A line containing
        ``HEALTH INSURANCE CLAIM FORM`` is prefixed with ``Document Type: ``;
        lines containing ``Name:``, ``Policy``, ``Provider`` or ``Date`` are
        kept as they are; every other line is dropped.
    """
    if not text:
        return ''

    # Case-sensitive substrings that mark a line as worth keeping.
    field_markers = ('Name:', 'Policy', 'Provider', 'Date')

    formatted_lines = []
    for line in text.split('\n'):
        if 'HEALTH INSURANCE CLAIM FORM' in line:
            formatted_lines.append(f"Document Type: {line.strip()}")
        elif any(marker in line for marker in field_markers):
            formatted_lines.append(line.strip())

    return '\n'.join(formatted_lines)
|
|
|
def analyze_claim_validity(score):
    """Translate a confidence score into a claim status and a reviewer note.

    Args:
        score: Confidence value, compared against the 0.9 and 0.7 cut-offs
            using strict ``>``.

    Returns:
        A new dict with ``'status'`` and ``'notes'`` string entries.
    """
    # Tiers ordered from strictest to loosest; the first one exceeded wins.
    tiers = (
        (0.9, 'VALID', 'High confidence in claim validity. All required fields present.'),
        (0.7, 'VALID - REVIEW RECOMMENDED', 'Claim appears valid but manual review suggested.'),
    )
    for cutoff, status, notes in tiers:
        if score > cutoff:
            return {'status': status, 'notes': notes}

    # Nothing exceeded: fall through to manual review.
    return {
        'status': 'NEEDS REVIEW',
        'notes': 'Low confidence score. Please review manually.',
    }
|
|
|
|
|
# Stylesheet handed to gr.Blocks(css=...): page container, upload drop zone,
# headings and output panels.
custom_css = """
.gradio-container {
    max-width: 900px !important;
    margin: auto;
    padding-top: 1.5rem;
    padding-bottom: 1.5rem;
}

.main-div {
    display: flex;
    flex-direction: column;
    gap: 20px;
}

.container {
    border-radius: 10px;
    background-color: #ffffff;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    margin-bottom: 20px;
    padding: 20px;
}

.output-div {
    min-height: 100px;
    margin-bottom: 10px;
}

h1 {
    color: #2a4365;
    text-align: center;
    font-size: 2.5rem;
    margin-bottom: 1rem;
    font-weight: bold;
}

.description {
    text-align: center;
    color: #4a5568;
    margin-bottom: 2rem;
}

.file-upload {
    border: 2px dashed #cbd5e0;
    border-radius: 8px;
    padding: 20px;
    text-align: center;
    transition: all 0.3s ease;
}

.file-upload:hover {
    border-color: #4299e1;
}

.output-label {
    font-weight: bold;
    color: #2d3748;
    margin-bottom: 0.5rem;
}

.output-text {
    background-color: #f7fafc;
    border-radius: 6px;
    padding: 12px;
}
"""
|
|
|
|
|
# Single-page UI: title and description, one upload widget, three result boxes.
with gr.Blocks(css=custom_css) as iface:
    # NOTE(review): the leading "π" looks like a mis-decoded character
    # (possibly an emoji) — confirm the intended glyph.
    gr.HTML("<h1>π Automated Insurance Claim Validation System</h1>")
    gr.HTML("""
        <div class="description">
            Upload insurance claim documents (PDF or image) for automated validation and analysis.
            Our AI system will process and validate your claims instantly.
        </div>
    """)

    # Upload area, restricted to the extensions process_document accepts.
    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Insurance Claim Document",
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                elem_classes="file-upload"
            )

    # Result area: one textbox per element of process_document's return tuple.
    with gr.Row():
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Claim Details",
                elem_classes="output-div",
                lines=5
            )
            validation_output = gr.Textbox(
                label="Claim Validation Results",
                elem_classes="output-div"
            )
            classification_output = gr.Textbox(
                label="Document Classification",
                elem_classes="output-div"
            )

    # Re-run the analysis whenever the upload widget's value changes.
    file_input.change(
        fn=process_document,
        inputs=[file_input],
        outputs=[text_output, validation_output, classification_output]
    )
|
|
|
if __name__ == "__main__":
    # Start the web server with host 0.0.0.0 on port 7860 when run as a script.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    iface.launch(**launch_options)
|
|