|
import os
import re

import pytesseract
import torch
from pdf2image import convert_from_path
from PIL import Image
from transformers import (
    LayoutLMv3FeatureExtractor,
    LayoutLMv3ForTokenClassification,
    LayoutLMv3Tokenizer,
    LayoutLMv3TokenizerFast,
)
|
|
|
|
|
|
|
|
|
|
|
# Tell pytesseract where the system Tesseract binary lives.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Base (not fine-tuned) checkpoint. NOTE(review): the 5-label token
# classification head is freshly initialized by from_pretrained here, so
# predictions are effectively random until the model is fine-tuned on
# labelled resume data.
model_name = "microsoft/layoutlmv3-base"

model = LayoutLMv3ForTokenClassification.from_pretrained(model_name, num_labels=5)
model.eval()  # inference-only script: disable dropout

feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained(model_name)

# Fast tokenizer so encodings expose word_ids() for token->word alignment.
tokenizer = LayoutLMv3TokenizerFast.from_pretrained(model_name)

# Entity label schema: id -> tag, plus the inverse map for training/config.
id2label = {0: "O", 1: "COMPANY", 2: "EDUCATION", 3: "POSITION", 4: "DATE"}
label2id = {v: k for k, v in id2label.items()}
|
|
|
def preprocess_document(file_path):
    """OCR a resume file into (image, words, boxes).

    Parameters
    ----------
    file_path : str
        Path to a PDF (first page only) or an image file.

    Returns
    -------
    tuple
        (PIL.Image, list[str], list[list[int]]) — the page image, the OCR'd
        words, and one pixel-coordinate box [x0, y0, x1, y1] per word,
        index-aligned with the words.
    """
    if file_path.lower().endswith('.pdf'):
        # NOTE(review): only the first rendered page is processed; pages 2+
        # of a multi-page resume are dropped — confirm this is intended.
        images = convert_from_path(file_path)
        image = images[0]
    else:
        # Normalize to RGB so downstream vision code need not handle
        # palette/alpha image modes.
        image = Image.open(file_path).convert("RGB")

    ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    # image_to_data also emits structural rows (pages/blocks/lines, which
    # carry conf == -1) whose text is empty or whitespace. The original code
    # kept them, producing empty "words" with meaningless boxes; keep only
    # rows that carry an actual word.
    words, boxes = [], []
    for text, x, y, w, h in zip(ocr_result['text'], ocr_result['left'],
                                ocr_result['top'], ocr_result['width'],
                                ocr_result['height']):
        if text.strip():
            words.append(text)
            boxes.append([x, y, x + w, y + h])

    return image, words, boxes
|
|
|
def process_resume(file_path):
    """Run LayoutLMv3 token classification over a resume file.

    Parameters
    ----------
    file_path : str
        Path to a PDF or image resume.

    Returns
    -------
    dict
        {"COMPANY": [...], "EDUCATION": [...], "POSITION": [...], "DATE": [...]}
        where each list holds strings built from contiguous runs of
        identically-labelled words.
    """
    image, words, boxes = preprocess_document(file_path)
    if not words:
        # OCR found nothing; avoid feeding an empty sequence to the model.
        return {"COMPANY": [], "EDUCATION": [], "POSITION": [], "DATE": []}

    # LayoutLMv3 expects word boxes normalized to a 0-1000 coordinate grid,
    # not raw pixel coordinates.
    width, height = image.size
    norm_boxes = [
        [
            max(0, min(1000, round(1000 * x0 / width))),
            max(0, min(1000, round(1000 * y0 / height))),
            max(0, min(1000, round(1000 * x1 / width))),
            max(0, min(1000, round(1000 * y1 / height))),
        ]
        for x0, y0, x1, y1 in boxes
    ]

    # The tokenizer — not the feature extractor — turns pre-OCR'd words into
    # input_ids/bbox. (The previous call handed words to the feature
    # extractor, which does not tokenize and by default re-runs its own OCR,
    # discarding the boxes computed above.)
    encoding = tokenizer(text=words, boxes=norm_boxes,
                         truncation=True, return_tensors="pt")

    # Only pixel_values are taken from the feature extractor; its built-in
    # OCR output (apply_ocr defaults to True) is ignored here.
    pixel_values = feature_extractor(image, return_tensors="pt")["pixel_values"]

    # LayoutLMv3 is RoBERTa-based and takes no token_type_ids.
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        outputs = model(
            input_ids=encoding["input_ids"],
            attention_mask=encoding["attention_mask"],
            bbox=encoding["bbox"],
            pixel_values=pixel_values,
        )

    token_predictions = outputs.logits.argmax(-1).squeeze(0).tolist()

    # Predictions are per-token (including special tokens and sub-word
    # splits); map them back to words, labelling each word by its first
    # sub-token. word_ids() requires a fast tokenizer.
    try:
        word_ids = encoding.word_ids()
    except ValueError:
        word_ids = None

    if word_ids is not None:
        first_label = {}
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is not None and word_idx not in first_label:
                first_label[word_idx] = token_predictions[token_idx]
        word_labels = [first_label.get(i, 0) for i in range(len(words))]
    else:
        # Slow-tokenizer fallback: positional alignment (skip the leading
        # <s> special token). Labels can drift once any word splits into
        # multiple sub-tokens.
        word_labels = token_predictions[1:len(words) + 1]

    return _group_entities(words, word_labels)


def _group_entities(words, label_ids):
    """Join contiguous runs of identically-labelled words into entity strings.

    A run that extends through the last word is also flushed (the previous
    inline loop silently dropped a trailing entity).
    """
    parsed_info = {"COMPANY": [], "EDUCATION": [], "POSITION": [], "DATE": []}
    current_entity = None
    current_words = []
    for word, label_id in zip(words, label_ids):
        label = id2label.get(label_id, "O")
        if label == "O":
            if current_entity:
                parsed_info[current_entity].append(" ".join(current_words))
            current_entity, current_words = None, []
        elif label == current_entity:
            current_words.append(word)
        else:
            if current_entity:
                parsed_info[current_entity].append(" ".join(current_words))
            current_entity, current_words = label, [word]
    if current_entity:
        parsed_info[current_entity].append(" ".join(current_words))
    return parsed_info
|
|
|
def main():
    """Prompt for a resume path, parse it, and print the extracted fields."""
    resume_path = input("Enter the path to your resume file (PDF or image): ")
    if not os.path.exists(resume_path):
        print("File not found. Please check the path and try again.")
        return

    parsed_info = process_resume(resume_path)

    # One line per entity category, values comma-joined.
    print("\nExtracted Information:")
    for heading, key in (
        ("Companies worked for:", "COMPANY"),
        ("Education:", "EDUCATION"),
        ("Positions held:", "POSITION"),
        ("Relevant dates:", "DATE"),
    ):
        print(heading, ", ".join(parsed_info[key]))


if __name__ == "__main__":
    main()