File size: 9,268 Bytes
38f4471
 
 
 
 
 
 
 
15503d7
 
 
3ecbba0
f1dff19
 
 
 
 
 
 
 
 
 
15503d7
0ca6a92
 
3ecbba0
38f4471
f1dff19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38f4471
54c68aa
 
 
 
 
0ca6a92
3ecbba0
0ca6a92
54c68aa
0ca6a92
9023327
 
 
0ca6a92
9023327
f1dff19
 
 
 
 
 
 
 
 
0ca6a92
9023327
f1dff19
 
 
 
 
db14a92
54c68aa
3ecbba0
f1dff19
 
 
38f4471
f1dff19
 
 
 
 
 
 
 
 
f1351ad
 
54c68aa
 
 
 
f1dff19
 
 
 
 
 
 
 
 
 
 
 
 
3ecbba0
f1dff19
f1351ad
54c68aa
3ecbba0
f1351ad
 
 
 
 
 
 
3ecbba0
bef72f9
38f4471
f1dff19
 
38f4471
f1351ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455f086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9023327
455f086
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38f4471
 
0ca6a92
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import easyocr
import numpy as np
from PIL import Image
from transformers import pipeline
import gradio as gr
import pdf2image
import PyPDF2
import io
import pandas as pd
import logging
from datetime import datetime
import os
import torch

# Runtime tuning, applied right after the imports and before any model is built:
# set the CUDA allocator option string and turn on cuDNN benchmark mode.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
torch.backends.cudnn.benchmark = True

# Optional manual cleanup if memory runs out -- uncomment to enable:
# import gc
# gc.collect()
# torch.cuda.empty_cache()

# Module-wide logger at INFO level.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Device index shared by every model below: 0 when CUDA is available, -1 for CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
logger.info(f"Using device: {'CUDA' if device == 0 else 'CPU'}")

# Model construction lives in one function so a failure is logged in one place.
def init_models():
    """Create the OCR reader and the two Hugging Face pipelines.

    The EasyOCR reader is built for English, uses the GPU only when the
    module-level ``device`` is 0, and stores/downloads its weights under
    ``./models``. Both pipelines are placed on ``device`` and loaded with
    ``low_cpu_mem_usage=True``.

    Returns:
        Tuple ``(reader, text_classifier, doc_classifier)``.

    Raises:
        Exception: Any error raised while building a model is logged and
            re-raised unchanged.
    """
    def build_pipeline(task, checkpoint):
        # Each call gets its own model_kwargs dict; nothing is shared between pipelines.
        return pipeline(
            task,
            model=checkpoint,
            device=device,
            model_kwargs={"low_cpu_mem_usage": True}
        )

    try:
        ocr_reader = easyocr.Reader(
            ['en'],
            gpu=bool(device == 0),
            model_storage_directory='./models',
            download_enabled=True
        )
        claim_text_model = build_pipeline(
            "text-classification",
            "distilbert-base-uncased-finetuned-sst-2-english"
        )
        page_image_model = build_pipeline(
            "image-classification",
            "microsoft/dit-base-finetuned-rvlcdip"
        )
        return ocr_reader, claim_text_model, page_image_model
    except Exception as exc:
        logger.error(f"Error initializing models: {str(exc)}")
        raise

# Load every model once at import time; the app cannot run without them,
# so a failure is logged and then allowed to abort start-up.
logger.info("Initializing models...")
try:
    reader, text_classifier, doc_classifier = init_models()
except Exception as e:
    logger.error(f"Failed to initialize models: {str(e)}")
    raise
else:
    logger.info("Models initialized successfully")

def validate_insurance_claim(text):
    """Return True when the text mentions at least one insurance keyword.

    Matching is a case-insensitive substring test, so a keyword embedded in a
    longer word also counts (e.g. ``"disclaimer"`` contains ``"claim"``).

    Args:
        text: Extracted document text. ``None`` or an empty string is treated
            as "no keywords found".

    Returns:
        bool: True if any keyword occurs in the text, otherwise False.
    """
    if not text:
        return False

    keywords = ('claim', 'policy', 'insurance', 'damage', 'loss', 'accident', 'coverage')
    # Lower-case once instead of once per keyword.
    lowered = text.lower()
    return any(keyword in lowered for keyword in keywords)

def process_document(file):
    """Run OCR, keyword validation and both classifiers on an uploaded claim.

    Only the first page of a PDF is analysed.

    Args:
        file: Value delivered by the Gradio ``File`` input. May be ``None``
            (nothing uploaded), a filesystem path string, or a file-like
            object that exposes its on-disk path as ``.name``.

    Returns:
        A 3-tuple ``(claim_details, validation_summary, classification_summary)``
        for the three output textboxes. On any failure or rejection the first
        element is a human-readable message and the other two are ``None``.
        No exception is propagated to the caller.
    """
    try:
        if file is None:
            return "Please upload an insurance claim document", None, None

        # Work from the on-disk path so that both upload representations
        # (path string, or wrapper object with ``.name``) are handled alike.
        file_path = getattr(file, 'name', file)
        file_extension = os.path.splitext(file_path)[1].lower()

        # Handle PDF files
        if file_extension == '.pdf':
            try:
                images = pdf2image.convert_from_path(file_path, first_page=1, last_page=1)
                if not images:
                    return "Failed to process insurance claim PDF", None, None
                image = images[0]
            except Exception as e:
                logger.error(f"PDF processing error: {str(e)}")
                return "Error processing PDF file", None, None

        # Handle image files
        elif file_extension in ('.png', '.jpg', '.jpeg'):
            try:
                image = Image.open(file_path)
                # Image.open is lazy; decode now so a corrupt file is reported
                # as an image error rather than surfacing later.
                image.load()
            except Exception as e:
                logger.error(f"Image processing error: {str(e)}")
                return "Error processing image file", None, None
        else:
            return "Unsupported file format. Please upload PDF or image files.", None, None

        # Convert image to RGB if necessary
        if image.mode != 'RGB':
            image = image.convert('RGB')

        # Extract text with error handling
        try:
            result = reader.readtext(np.array(image))
            # Keep one OCR fragment per line: format_insurance_claim() filters
            # line by line, so joining with spaces would collapse everything
            # into a single line and defeat that filter.
            text = '\n'.join(fragment[1] for fragment in result)
        except Exception as e:
            logger.error(f"Text extraction error: {str(e)}")
            return "Error extracting text from document", None, None

        # Reject non-claims before doing any further work.
        if not validate_insurance_claim(text):
            return "Document does not appear to be an insurance claim", None, None

        # Format the extracted text
        formatted_text = format_insurance_claim(text)

        # Classify text; fall back to a neutral score if the model fails.
        try:
            text_analysis = text_classifier(text[:512])[0]
        except Exception as e:
            logger.error(f"Text classification error: {str(e)}")
            text_analysis = {'score': 0.5}

        # Classify document; fall back to a neutral score if the model fails.
        try:
            doc_analysis = doc_classifier(image)[0]
        except Exception as e:
            logger.error(f"Document classification error: {str(e)}")
            doc_analysis = {'score': 0.5}

        # Generate validation results
        validation_result = analyze_claim_validity(text_analysis['score'])

        # NOTE(review): the "Document Type" / "Form Type" lines are fixed
        # strings; only the score of the document classifier is reported,
        # its predicted label is not used.
        return (
            formatted_text,
            f"Claim Status: {validation_result['status']}\n" +
            f"Confidence Score: {text_analysis['score']:.2f}\n" +
            f"Validation Notes: {validation_result['notes']}",
            f"Document Type: Insurance Claim Form\n" +
            f"Form Type: NUCC Health Insurance Claim\n" +
            f"Confidence: {doc_analysis['score']:.2f}"
        )

    except Exception as e:
        # Last-resort boundary so the UI always receives a 3-tuple.
        logger.error(f"General processing error: {str(e)}")
        return "Error processing document", None, None

def format_insurance_claim(text):
    """Keep only the recognisable claim-form lines of the extracted text.

    The text is split on newlines and each line is checked with
    case-sensitive substring tests:

    * a line containing ``HEALTH INSURANCE CLAIM FORM`` is emitted as
      ``Document Type: <stripped line>``;
    * otherwise a line containing ``Name:``, ``Policy``, ``Provider`` or
      ``Date`` is emitted stripped;
    * every other line is dropped.

    Args:
        text: Extracted document text; ``None`` or an empty string yields an
            empty result.

    Returns:
        str: The kept lines joined with newlines (empty if nothing matched).
    """
    if not text:
        return ''

    field_markers = ('Name:', 'Policy', 'Provider', 'Date')
    formatted_lines = []

    for line in text.split('\n'):
        if 'HEALTH INSURANCE CLAIM FORM' in line:
            formatted_lines.append(f"Document Type: {line.strip()}")
        elif any(marker in line for marker in field_markers):
            formatted_lines.append(line.strip())

    return '\n'.join(formatted_lines)

def analyze_claim_validity(score):
    """Translate a classifier confidence score into a status and a note.

    Args:
        score: Numeric confidence value.

    Returns:
        dict: ``{'status': ..., 'notes': ...}``. Scores strictly above 0.9 are
        ``VALID``, scores strictly above 0.7 are ``VALID - REVIEW RECOMMENDED``,
        and everything else is ``NEEDS REVIEW``.
    """
    # Tiers are checked from the highest threshold down; the first hit wins.
    tiers = (
        (0.9, 'VALID',
         'High confidence in claim validity. All required fields present.'),
        (0.7, 'VALID - REVIEW RECOMMENDED',
         'Claim appears valid but manual review suggested.'),
    )
    for threshold, status, notes in tiers:
        if score > threshold:
            return {'status': status, 'notes': notes}

    return {
        'status': 'NEEDS REVIEW',
        'notes': 'Low confidence score. Please review manually.'
    }

# Stylesheet handed to gr.Blocks(css=...) below.
# Within this file, `description` is used by the intro HTML snippet and
# `file-upload` / `output-div` are attached to components via elem_classes;
# the remaining class selectors are not referenced by the code visible here.
custom_css = """
.gradio-container {
    max-width: 900px !important;
    margin: auto;
    padding-top: 1.5rem;
    padding-bottom: 1.5rem;
}

.main-div {
    display: flex;
    flex-direction: column;
    gap: 20px;
}

.container {
    border-radius: 10px;
    background-color: #ffffff;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    margin-bottom: 20px;
    padding: 20px;
}

.output-div {
    min-height: 100px;
    margin-bottom: 10px;
}

h1 {
    color: #2a4365;
    text-align: center;
    font-size: 2.5rem;
    margin-bottom: 1rem;
    font-weight: bold;
}

.description {
    text-align: center;
    color: #4a5568;
    margin-bottom: 2rem;
}

.file-upload {
    border: 2px dashed #cbd5e0;
    border-radius: 8px;
    padding: 20px;
    text-align: center;
    transition: all 0.3s ease;
}

.file-upload:hover {
    border-color: #4299e1;
}

.output-label {
    font-weight: bold;
    color: #2d3748;
    margin-bottom: 0.5rem;
}

.output-text {
    background-color: #f7fafc;
    border-radius: 6px;
    padding: 12px;
}
"""

# Build the Gradio UI: one upload widget feeding three read-only text outputs.
with gr.Blocks(css=custom_css) as iface:
    # The magnifying-glass emoji (U+1F50D) is written as an escape sequence so
    # the heading cannot be corrupted by a wrong source-file encoding.
    gr.HTML("<h1>\U0001F50D Automated Insurance Claim Validation System</h1>")
    gr.HTML("""
        <div class="description">
            Upload insurance claim documents (PDF or image) for automated validation and analysis.
            Our AI system will process and validate your claims instantly.
        </div>
    """)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Insurance Claim Document",
                # Extensions are given with a leading dot.
                file_types=[".pdf", ".png", ".jpg", ".jpeg"],
                elem_classes="file-upload"
            )

    with gr.Row():
        with gr.Column():
            text_output = gr.Textbox(
                label="Extracted Claim Details",
                elem_classes="output-div",
                lines=5
            )
            validation_output = gr.Textbox(
                label="Claim Validation Results",
                elem_classes="output-div"
            )
            classification_output = gr.Textbox(
                label="Document Classification",
                elem_classes="output-div"
            )

    # Re-run the whole analysis whenever the uploaded file changes; the three
    # values returned by process_document map onto the three textboxes in order.
    file_input.change(
        fn=process_document,
        inputs=[file_input],
        outputs=[text_output, validation_output, classification_output]
    )

if __name__ == "__main__":
    # Listen on all network interfaces on port 7860.
    iface.launch(server_name="0.0.0.0", server_port=7860)