import google.generativeai as genai
import fitz # PyMuPDF for PDF text extraction
import streamlit as st
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from docx import Document
import re
import os
from typing import Dict, List, Optional
import logging
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
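
# Note: Streamlit re-executes this script on every interaction, so these
# module-level model loads can run repeatedly. Wrapping them in a loader
# function decorated with st.cache_resource is a common way to keep a single
# copy per process (a suggestion, not part of the original code).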
# Load SpaCy model for dependency parsing and NER
nlp_spacy = spacy.load('en_core_web_sm')
# Load the NER model
tokenizer = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
model = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")
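# aggregation_strategy="simple" merges word-piece tokens back into whole
# entity spans, so downstream code sees e.g. "Acme Corp" as one ORG entity.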
nlp_ner = pipeline('ner', model=model, tokenizer=tokenizer, aggregation_strategy="simple")
def authenticate_gemini() -> Optional[genai.GenerativeModel]:
    # Read the key from the environment instead of hardcoding it in source;
    # Hugging Face Spaces exposes secrets as environment variables. The name
    # GOOGLE_API_KEY is an assumption -- match it to the secret configured
    # for the Space.
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        st.error("Google Gemini API key not found. Please set it in the Hugging Face Spaces secrets.")
        return None
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name="gemini-pro")
        st.success("Gemini API successfully configured.")
        return model
    except Exception as e:
        logger.error(f"Error configuring Gemini API: {e}")
        st.error("Error configuring Gemini API. Please check your API key and try again.")
        return None
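
# Heuristic cleanup of raw NER output: keep entities that either end in a
# known company suffix or look like a capitalized multi-word name.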
def refine_org_entities(entities: List[str]) -> List[str]:
    refined_entities = set()
    company_suffixes = ['Inc', 'LLC', 'Corporation', 'Corp', 'Ltd', 'Co', 'GmbH', 'S.A.', 'Company', 'Group']
    for entity in entities:
        # Remove leading articles that might interfere with company names
        entity = re.sub(r'^(The|A|An)\s+', '', entity).strip()
        if any(entity.endswith(suffix) for suffix in company_suffixes):
            refined_entities.add(entity)
        # fullmatch (not match) so only strings made entirely of capitalized
        # words pass; re.match would accept any capitalized prefix.
        elif re.fullmatch(r'(?:[A-Z][a-z]+\s?)+', entity):
            refined_entities.add(entity)
    return list(refined_entities)
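
# Run the transformer NER pipeline and keep only ORG-typed spans.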
def extract_orgs(text: str) -> List[str]:
    ner_results = nlp_ner(text)
    orgs = set()
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            orgs.add(entity['word'])
    return refine_org_entities(list(orgs))
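
# PyMuPDF opens the upload from an in-memory stream, so nothing is written
# to disk.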
def extract_text_from_pdf(pdf_file) -> str:
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        text = ""
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {e}")
        return ""
def extract_text_from_doc(doc_file) -> str:
    try:
        doc = Document(doc_file)
        text = '\n'.join([para.text for para in doc.paragraphs])
        return text
    except Exception as e:
        logger.error(f"Error extracting text from DOCX: {e}")
        return ""
def generate_summary(text: str, model: genai.GenerativeModel) -> str:
    prompt = f"Summarize the following resume in 100 words, highlighting key skills and experiences:\n\n{text}"
    try:
        response = model.generate_content(prompt)
        return response.text
    except Exception as e:
        logger.error(f"Error generating summary: {e}")
        return "Error generating summary. Please try again."
def extract_experience(text: str) -> str:
    # Match durations such as "5 years", "2 yrs", or "2 years and 6 months".
    # Months are captured so combined phrases parse cleanly, but only whole
    # years are reported. Note: re.findall with a single capture group returns
    # strings, not tuples, so years-only and years-plus-months phrases are
    # handled by separate patterns to avoid double counting.
    combined_pattern = r'(\d+)\s*(?:years?|yrs?)\s*(?:and)?\s*(\d+)\s*(?:months?|mos?)'
    years_pattern = r'(\d+)\s*(?:years?|yrs?)'
    total_years = 0
    # First consume "X years and Y months" phrases, then strip them so the
    # years-only pattern does not count the same span twice.
    for years, _months in re.findall(combined_pattern, text, re.IGNORECASE):
        total_years += int(years)
    remaining = re.sub(combined_pattern, '', text, flags=re.IGNORECASE)
    for years in re.findall(years_pattern, remaining, re.IGNORECASE):
        total_years += int(years)
    if total_years > 0:
        return f"{total_years} years"
    return "Experience not found"
def extract_phone(text: str) -> str:
    phone_patterns = [
        r'\b(?:\+?1[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}\b',
        r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b'
    ]
    for pattern in phone_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()
    return "Not found"
def extract_email(text: str) -> str:
    # The TLD class is [A-Za-z]; a literal '|' inside a character class would
    # be treated as a matchable character, not as alternation.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    match = re.search(email_pattern, text)
    return match.group() if match else "Not found"
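
# SpaCy's ORG entities filtered by education keywords; the transformer NER
# model above has no dedicated label for schools, so SpaCy is used here.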
def extract_colleges(doc) -> List[str]:
    colleges = set()
    edu_keywords = ["university", "college", "institute", "school"]
    for ent in doc.ents:
        if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords):
            colleges.add(ent.text)
    return list(colleges)
def extract_linkedin(text: str) -> str:
    # [A-Za-z0-9_-] instead of [A-z0-9_-]: the A-z range also matches the
    # punctuation characters between 'Z' and 'a' in ASCII.
    linkedin_patterns = [
        r'(?:https?:)?\/\/(?:[\w]+\.)?linkedin\.com\/in\/[A-Za-z0-9_-]+\/?',
        r'linkedin\.com\/in\/[A-Za-z0-9_-]+',
        r'@[A-Za-z0-9_-]+\s+\(LinkedIn\)'
    ]
    for pattern in linkedin_patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group()
    return "Not found"
def analyze_resume(text: str, model: genai.GenerativeModel) -> Dict:
    doc = nlp_spacy(text)
    return {
        "companies": extract_orgs(text),
        "summary": generate_summary(text, model),
        "experience": extract_experience(text),
        "phone": extract_phone(text),
        "email": extract_email(text),
        "colleges": extract_colleges(doc),
        "linkedin": extract_linkedin(text)
    }
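
# --- Streamlit UI entry point ---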
def main():
    st.title("Enhanced Resume Analyzer")
    st.write("Upload a resume to extract information, generate a summary, and analyze details.")

    model = authenticate_gemini()
    if model is None:
        return

    uploaded_file = st.file_uploader("Choose a PDF or DOCX file", type=["pdf", "docx", "doc"])
    if uploaded_file is not None:
        try:
            file_ext = uploaded_file.name.split('.')[-1].lower()
            if file_ext == 'pdf':
                resume_text = extract_text_from_pdf(uploaded_file)
            elif file_ext in ['docx', 'doc']:
                # Note: python-docx only reads .docx; a legacy .doc file will
                # fail to parse and surface as the empty-text error below.
                resume_text = extract_text_from_doc(uploaded_file)
            else:
                st.error("Unsupported file format.")
                return

            if not resume_text.strip():
                st.error("The resume appears to be empty or couldn't be read.")
                return

            with st.spinner("Analyzing resume..."):
                results = analyze_resume(resume_text, model)

            st.subheader("Extracted Information")
            st.write(f"Experience: {results['experience']}")
            st.write("Companies Worked For:")
            st.write(", ".join(results['companies']))
            st.write(f"Phone Number: {results['phone']}")
            st.write(f"Email ID: {results['email']}")
            st.write("Colleges Attended:")
            st.write(", ".join(results['colleges']))
            st.write(f"LinkedIn: {results['linkedin']}")

            st.subheader("Generated Summary")
            st.write(results['summary'])
        except Exception as e:
            logger.error(f"Error during resume analysis: {e}")
            st.error("An error occurred during resume analysis. Please try again or contact support if the issue persists.")


if __name__ == "__main__":
    main()