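"""Streamlit resume analyzer.

Extracts contact details, experience, companies, and schools from an uploaded
PDF/DOCX resume and generates a short summary with Google Gemini.
"""
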
import google.generativeai as genai
import fitz  # PyMuPDF for PDF text extraction
import streamlit as st
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from docx import Document
import re
import os
from typing import List, Dict
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
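
# NOTE: these model loads run on every Streamlit rerun; wrapping them in
# functions decorated with @st.cache_resource would avoid reloading.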

# Load SpaCy model for dependency parsing and NER
nlp_spacy = spacy.load('en_core_web_sm')

# Load the NER model
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def authenticate_gemini() -> genai.GenerativeModel:
    # Read the key from the environment (e.g. a Hugging Face Spaces secret)
    # rather than hard-coding it; the variable name GEMINI_API_KEY is a
    # convention here and should match however the secret is configured.
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
        return None
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-pro")
        st.success("Gemini API successfully configured.")
        return model
    except Exception as e:
        logger.error(f"Error configuring Gemini API: {e}")
        st.error(f"Error configuring Gemini API. Please check your API key and try again.")
        return None

def refine_org_entities(entities: List[str]) -> List[str]:
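    # Heuristic filter: keep entities that end in a corporate suffix or that
    # look like a run of capitalized words, after dropping a leading article.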
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.', 'Company', 'Group']
    
    for entity in entities:
        # Remove common prefixes that might interfere with company names
        entity = re.sub(r'^(The|A|An)\s+', '', entity).strip()
        
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        elif re.fullmatch(r'(?:[A-Z][a-z]+\s?)+', entity):  # whole entity is capitalized words
            refined_entities.add(entity)
    
    return list(refined_entities)

def extract_orgs(text: str) -> List[str]:
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(list(orgs))

def extract_text_from_pdf(pdf_file) -> str:
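    # PyMuPDF opens the upload from an in-memory stream; nothing is written to disk.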
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return ""

def extract_text_from_doc(doc_file) -> str:
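    # Note: python-docx reads only .docx; a legacy binary .doc upload raises
    # here and returns "" via the except below.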
    try:
        doc = Document(doc_file)
        text = '\n'.join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return ""

def generate_summary(text: str, model: genai.GenerativeModel) -> str:
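    # Ask Gemini for a ~100-word summary; response.text holds the plain-text reply.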
    prompt = f"Summarize the following resume in 100 words, highlighting key skills and experiences:\n\n{text}"
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        return "Error generating summary. Please try again."

def extract_experience(text: str) -> str:
    # Match the combined "X years and Y months" form first so those spans are
    # not double-counted by the simpler year-only pattern below.
    combined_pattern = r'(\d+)\s*(?:years?|yrs?)\s*(?:and\s*)?(\d+)\s*(?:months?|mos?)'
    years_pattern = r'(\d+)\s*(?:years?|yrs?)'

    total_years = 0
    for years, _months in re.findall(combined_pattern, text, re.IGNORECASE):
        total_years += int(years)
    # Strip the combined matches, then count the remaining year-only mentions.
    remaining = re.sub(combined_pattern, '', text, flags=re.IGNORECASE)
    for years in re.findall(years_pattern, remaining, re.IGNORECASE):
        total_years += int(years)

    # Months on their own are ignored; only whole years are reported.
    if total_years > 0:
        return f"{total_years} years"
    else:
        return "Experience not found"


def extract_phone(text: str) -> str:
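    # Matches common North American (NANP) formats such as (555) 123-4567 or
    # +1-555-123-4567; international numbers are not covered.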
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"

def extract_email(text: str) -> str:
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"

def extract_colleges(doc) -> List[str]:
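    # Expects a pre-parsed spaCy Doc; any ORG entity whose text contains an
    # education keyword counts as a school.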
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)

def extract_linkedin(text: str) -> str:
    linkedin_patterns = [
        r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?',
        r'linkedin\.com\/in\/[A-Za-z0-9_-]+',
        r'@[A-Za-z0-9_-]+\s+\(LinkedIn\)'
    ]
    for pattern in linkedin_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group()
    return "Not found"

def analyze_resume(text: str, model: genai.GenerativeModel) -> Dict:
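    # One spaCy parse serves the college lookup; the other extractors work on raw text.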
    doc = nlp_spacy(text)
    return {
        "companies": extract_orgs(text),
        "summary": generate_summary(text, model),
        "experience": extract_experience(text),
        "phone": extract_phone(text),
        "email": extract_email(text),
        "colleges": extract_colleges(doc),
        "linkedin": extract_linkedin(text)
    }

def main():
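    # App flow: configure Gemini, accept a single upload, extract fields, render results.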
    st.title("Enhanced Resume Analyzer")
    st.write("Upload a resume to extract information, generate a summary, and analyze details.")

    model = authenticate_gemini()
    if model is None:
        return

    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])

    if uploaded_file is not None:
        try:
            file_ext = uploaded_file.name.split('.')[-1].lower()
            if file_ext == 'pdf':
                resume_text = extract_text_from_pdf(uploaded_file)
            elif file_ext in ['docx', 'doc']:
                resume_text = extract_text_from_doc(uploaded_file)
            else:
                st.error("Unsupported file format.")
                return

            if not resume_text.strip():
                st.error("The resume appears to be empty or couldn't be read.")
                return

            with st.spinner("Analyzing resume..."):
                results = analyze_resume(resume_text, model)

            st.subheader("Extracted Information")
            st.write(f"Experience: {results['experience']}")
            st.write("Companies Worked For:")
            st.write(", ".join(results['companies']))
            st.write(f"Phone Number: {results['phone']}")
            st.write(f"Email ID: {results['email']}")
            st.write("Colleges Attended:")
            st.write(", ".join(results['colleges']))
            st.write(f"LinkedIn: {results['linkedin']}")

            st.subheader("Generated Summary")
            st.write(results['summary'])

        except Exception as e:
            logger.error(f"Error during resume analysis: {e}")
            st.error("An error occurred during resume analysis. Please try again or contact support if the issue persists.")

if __name__ == "__main__":
    main()