"""Parse a resume (PDF or image) with LayoutLMv3 token classification.

Pipeline: pytesseract OCR -> LayoutLMv3 token classification -> group
consecutive same-label words into entity strings.

NOTE(review): the classification head is randomly initialized here
(``num_labels=5`` on the base checkpoint), so predictions are only
meaningful after fine-tuning on a labeled resume dataset.
"""

import os

import torch
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor

# Ensure you have the necessary dependencies installed:
# pip install transformers torch Pillow pytesseract pdf2image

# Set up pytesseract path (adjust as needed)
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Load pre-trained model and processor.
model_name = "microsoft/layoutlmv3-base"
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name, num_labels=5)  # Adjust num_labels as needed
model.eval()  # inference only — disable dropout
# apply_ocr=False: we supply our own OCR words/boxes below. The processor's
# default (apply_ocr=True) refuses externally provided words, which is why
# calling the bare feature extractor with words/boxes fails.
processor = LayoutLMv3Processor.from_pretrained(model_name, apply_ocr=False)

# Define label mapping
id2label = {0: "O", 1: "COMPANY", 2: "EDUCATION", 3: "POSITION", 4: "DATE"}
label2id = {v: k for k, v in id2label.items()}


def _normalize_box(box, width, height):
    """Scale a pixel-space [x0, y0, x1, y1] box to LayoutLM's 0-1000 range."""
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]


def preprocess_document(file_path):
    """OCR a PDF (first page only) or image file.

    Args:
        file_path: path to a .pdf or image file.

    Returns:
        (image, words, boxes): the RGB PIL image, the non-empty OCR words,
        and their bounding boxes normalized to 0-1000 as LayoutLMv3 expects.
    """
    if file_path.lower().endswith('.pdf'):
        images = convert_from_path(file_path)
        image = images[0]  # Process only the first page for simplicity
    else:
        image = Image.open(file_path)
    image = image.convert("RGB")  # model expects 3-channel input

    # Perform OCR
    ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    width, height = image.size
    words, boxes = [], []
    for i, word in enumerate(ocr_result['text']):
        word = word.strip()
        if not word:
            # pytesseract emits many empty entries (layout rows); skip them —
            # empty words would pollute the model input.
            continue
        x = ocr_result['left'][i]
        y = ocr_result['top'][i]
        w = ocr_result['width'][i]
        h = ocr_result['height'][i]
        words.append(word)
        boxes.append(_normalize_box([x, y, x + w, y + h], width, height))
    return image, words, boxes


def _group_entities(words, word_labels):
    """Group consecutive words sharing a non-"O" label into entity strings.

    Args:
        words: OCR words.
        word_labels: parallel list of predicted label ids (0 == "O").

    Returns:
        dict mapping entity name -> list of extracted strings.
    """
    parsed_info = {"COMPANY": [], "EDUCATION": [], "POSITION": [], "DATE": []}
    current_entity = None
    current_words = []
    for word, label_id in zip(words, word_labels):
        label = id2label.get(label_id, "O")
        if label == "O":
            if current_entity:
                parsed_info[current_entity].append(" ".join(current_words))
            current_entity, current_words = None, []
        elif label == current_entity:
            current_words.append(word)
        else:
            if current_entity:
                parsed_info[current_entity].append(" ".join(current_words))
            current_entity, current_words = label, [word]
    # Flush a trailing entity — without this, an entity ending on the last
    # word was silently dropped.
    if current_entity:
        parsed_info[current_entity].append(" ".join(current_words))
    return parsed_info


def process_resume(file_path):
    """Run the full OCR + LayoutLMv3 pipeline on one resume file.

    Returns:
        dict mapping entity name ("COMPANY", "EDUCATION", "POSITION",
        "DATE") -> list of extracted strings.
    """
    image, words, boxes = preprocess_document(file_path)

    # The processor tokenizes each word into one or more sub-tokens and
    # prepares pixel values; truncate to the model's 512-token limit so
    # long resumes don't crash the forward pass.
    encoding = processor(image, words, boxes=boxes,
                         return_tensors="pt", truncation=True)

    with torch.no_grad():  # inference — no gradients needed
        outputs = model(**encoding)

    token_predictions = outputs.logits.argmax(-1).squeeze(0).tolist()

    # Map sub-token predictions back to words using word_ids(): take the
    # prediction of each word's FIRST sub-token. Zipping words directly
    # with token predictions misaligns as soon as any word is split (and
    # predictions also cover special tokens).
    word_ids = encoding.word_ids(batch_index=0)
    word_labels = [0] * len(words)
    seen = set()
    for idx, word_id in enumerate(word_ids):
        if word_id is None or word_id in seen:
            continue  # special token, or a continuation sub-token
        seen.add(word_id)
        word_labels[word_id] = token_predictions[idx]

    return _group_entities(words, word_labels)


def main():
    """Prompt for a resume path and print the extracted entities."""
    resume_path = input("Enter the path to your resume file (PDF or image): ")
    if not os.path.exists(resume_path):
        print("File not found. Please check the path and try again.")
        return
    parsed_info = process_resume(resume_path)
    print("\nExtracted Information:")
    print("Companies worked for:", ", ".join(parsed_info["COMPANY"]))
    print("Education:", ", ".join(parsed_info["EDUCATION"]))
    print("Positions held:", ", ".join(parsed_info["POSITION"]))
    print("Relevant dates:", ", ".join(parsed_info["DATE"]))


if __name__ == "__main__":
    main()