Spaces:

gauravbox
/

TalentLensAI

Sleeping

TalentLensAI / utils /extractor_fixed.py

Johnny

updated resume_format > template, hide sidebar, download Spacy model with spacy_loader.py

102e49d about 1 year ago

11.1 kB

	import os, re, json, subprocess
	from utils.parser import extract_name # <= your helper
	from utils.spacy_loader import get_nlp, is_spacy_available
	from datetime import datetime
	from dateutil.parser import parse as date_parse

	# Load the shared spaCy pipeline. The code below treats a falsy `nlp` as
	# "no model available" (presumably what get_nlp() returns on failure --
	# verify against utils.spacy_loader).
	nlp = get_nlp()

	# ----------------------------- data lists -----------------------------
	BASE = os.path.dirname(__file__)
	_DEFAULT_SKILLS = ["python", "sql", "aws", "selenium"]


	def _load_json_list(relative_path, default):
	    """Return the parsed JSON at BASE/relative_path, or *default* if the file is missing."""
	    path = os.path.join(BASE, relative_path)
	    if not os.path.exists(path):
	        return default
	    # A context manager closes the handle; the bare open() used before leaked it.
	    with open(path, encoding="utf-8") as handle:
	        return json.load(handle)


	# Initialize spaCy matchers only if spaCy is available
	if nlp and is_spacy_available():
	    from spacy.matcher import PhraseMatcher, Matcher

	    SKILLS = _load_json_list("data/skills.json", list(_DEFAULT_SKILLS))
	    JOB_TITLES = _load_json_list("data/job_titles.json", [])

	    # Case-insensitive phrase lookup over the known skill vocabulary.
	    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
	    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

	    # "Bachelor of <Title Case ...>" -> EDU, "Certified <Title Case ...>" -> CERT.
	    edu_matcher = Matcher(nlp.vocab)
	    edu_matcher.add("EDU", [[{"LOWER": "bachelor"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}]])
	    edu_matcher.add("CERT", [[{"LOWER": "certified"}, {"IS_TITLE": True, "OP": "+"}]])
	else:
	    # Fallback: set matchers to None when spaCy is not available
	    skill_matcher = None
	    edu_matcher = None
	    SKILLS = list(_DEFAULT_SKILLS)
	    JOB_TITLES = []

	# ----------------------------- regex helpers --------------------------
	# "<Month> <Year>", optionally followed by "- Present" or "- <Month> <Year>"
	# (hyphen or en dash). Month names may be abbreviated or spelled out.
	_DATE_RANGE = (
	    r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}"
	    r"(?:\s*[–-]\s*(?:Present|\w+\s+\d{4}))?"
	)

	# Jonathan's format: Company | Location | Title | Date
	ROLE_FOUR_PARTS = re.compile(
	    r"^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*"
	    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
	    re.I,
	)

	# Original format: Title | Company | Date
	ROLE_ONE = re.compile(
	    r"^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*"
	    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
	    re.I,
	)

	# Also support the original comma/@ format for backward compatibility:
	# "Company, Title <dates>" or "Company @ Title <dates>"
	ROLE_ONE_COMMA = re.compile(
	    r"^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+"
	    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
	    re.I,
	)

	# A line consisting solely of a date or date range.
	DATE_LINE = re.compile(r"^" + _DATE_RANGE + r"\s*$", re.I)

	# Leading bullet marker (-, •, ·, * or ●) followed by at least one space.
	BULLET = re.compile(r"^\s*(?:[-•·]|\*|●)\s+")

	# A line that is nothing but a known section heading, optionally ending in ":".
	HEAD = re.compile(
	    r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?"
	    r"|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$",
	    re.I,
	)

	# ----------------------------- main -----------------------------------
	# E-mail-like "@x" fragment or a phone-like run of 3 + 3 digits.
	_CONTACT = re.compile(r"@\w|\d{3}[-.\s]?\d{3}")

	# Section headings searched for explicitly. A trailing colon is accepted so
	# these agree with HEAD, which also treats "Heading:" as a heading line.
	_TECH_SKILLS_HEADING = re.compile(r"^\s*technical\s+skills?[:\s]*$", re.I)
	_SUMMARY_HEADING = re.compile(r"^\s*(professional\s+)?summary[:\s]*$", re.I)
	_CERT_HEADING = re.compile(r"^\s*certifications?[:\s]*$", re.I)

	_COMMA_SPLIT = re.compile(r",\s*")
	_LEADING_BULLET = re.compile(r"^\s*[•·\-*●]\s*")
	_SUMMARY_PREFIX = re.compile(r"^(professional\s+)?summary[:,\s]+", re.I)

	# Regex fallback for education extraction when spaCy is unavailable.
	_EDU_FALLBACK_PATTERNS = (
	    re.compile(r"(?i)\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)"),
	    re.compile(r"(?i)\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)"),
	    re.compile(r"(?i)\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)"),
	)


	def _is_contact(s):
	    """Return True if *s* contains something that looks like an e-mail or phone number."""
	    return bool(_CONTACT.search(s))


	def _section_lines(lines, heading, stop_on_contact):
	    """Return the stripped, non-empty lines below the first line matching *heading*.

	    Collection stops at the next line matched by HEAD and, when
	    *stop_on_contact* is true, at the first contact-looking line.
	    Returns None when no line matches *heading*.
	    """
	    for start, line in enumerate(lines):
	        if not heading.match(line.strip()):
	            continue
	        body = []
	        for raw in lines[start + 1:]:
	            candidate = raw.strip()
	            if not candidate:
	                continue
	            if HEAD.match(candidate) or (stop_on_contact and _is_contact(candidate)):
	                break
	            body.append(candidate)
	        # Only the first matching heading is considered.
	        return body
	    return None


	def _skills_from_line(line):
	    """Split one line of the skills section into individual skill strings."""
	    if line.startswith("●"):
	        # "● Programming Languages: Python, Java" -> keep only what follows the colon.
	        _label, colon, listed = line[1:].strip().partition(":")
	        if not colon:
	            return []  # bullet lines without a "Category:" prefix are ignored
	        candidates = (part.strip() for part in _COMMA_SPLIT.split(listed.strip()))
	        # A trailing ")" means a parenthesised group was cut by the comma split.
	        return [s for s in candidates if len(s) > 1 and not s.endswith(")")]
	    # Non-bullet format: plain comma-separated list, possibly with stray bullet glyphs.
	    candidates = (_LEADING_BULLET.sub("", part.strip()) for part in _COMMA_SPLIT.split(line))
	    return [s for s in candidates if len(s) > 1]


	def _parse_role_header(lines, index):
	    """Try to read a role header at lines[index].

	    Returns (title, company, dates, lines_consumed), or None if the line
	    does not start a role in any supported format.
	    """
	    current = lines[index].strip()

	    # Four-part format first (Company | Location | Title | Date).
	    found = ROLE_FOUR_PARTS.match(current)
	    if found:
	        company, location, title, dates = found.group("company", "location", "title", "dates")
	        return title, f"{company}, {location}", dates, 1

	    # Pipe-separated format (Title | Company | Date).
	    found = ROLE_ONE.match(current)
	    if found:
	        title, company, dates = found.group("title", "company", "dates")
	        return title, company, dates, 1

	    # Comma/@-separated format (Company, Title Date).
	    found = ROLE_ONE_COMMA.match(current)
	    if found:
	        company, title, dates = found.group("company", "title", "dates")
	        return title, company, dates, 1

	    # Two-liner: "Title, Company" (or @ / | separated) followed by a pure date line.
	    if index + 1 < len(lines) and DATE_LINE.match(lines[index + 1].strip()):
	        parts = re.split(r"[,@|]\s*", current, maxsplit=1)
	        if len(parts) == 2:
	            title, company = parts[0].strip(), parts[1].strip()
	        else:
	            title, company = current, ""
	        return title, company, lines[index + 1].strip(), 2

	    return None


	def _ends_role(line):
	    """Return True if *line* (already stripped) terminates a responsibilities list."""
	    return bool(
	        not line
	        or HEAD.match(line)
	        or ROLE_FOUR_PARTS.match(line)
	        or ROLE_ONE.match(line)
	        or ROLE_ONE_COMMA.match(line)
	        or DATE_LINE.match(line)
	    )


	def extract_sections_spacy_fixed(text: str) -> dict:
	    """Extract structured resume sections from plain text.

	    Args:
	        text: Full resume text, one logical entry per line.

	    Returns:
	        A dict with the keys:
	          "Name"                  -- result of extract_name(text)
	          "Summary"               -- text under a (Professional) Summary heading,
	                                     else the first non-heading, non-contact paragraph
	          "Skills"                -- sorted, de-duplicated entries found under a
	                                     "Technical Skills" heading
	          "StructuredExperiences" -- list of dicts with "title", "company",
	                                     "date_range" and "responsibilities"
	          "Education"             -- degree strings (spaCy matcher or regex fallback)
	          "Training"              -- spaCy CERT matches plus Certifications-section entries
	    """
	    lines = [ln.rstrip() for ln in text.splitlines()]

	    # Only create a spaCy doc if nlp is available
	    spacy_ready = bool(nlp) and is_spacy_available()
	    doc = nlp(text) if spacy_ready else None

	    out = {
	        "Name": extract_name(text),
	        "Summary": "",
	        "Skills": [],
	        "StructuredExperiences": [],
	        "Education": [],
	        "Training": [],
	    }

	    # ---------- skills ----------
	    # Extract ONLY from the Technical Skills section to avoid spaCy noise.
	    skills = set()
	    for entry in _section_lines(lines, _TECH_SKILLS_HEADING, True) or []:
	        skills.update(_skills_from_line(entry))
	    out["Skills"] = sorted(skills)

	    # ---------- summary ----------
	    # NOTE(review): only the first summary heading is inspected; if it has no
	    # body the paragraph fallback below is used -- confirm intended nesting.
	    summary_lines = _section_lines(lines, _SUMMARY_HEADING, True)
	    if summary_lines:
	        out["Summary"] = " ".join(summary_lines)
	    else:
	        # Fallback: first paragraph that is neither a heading nor contact info.
	        for para in re.split(r"\n\s*\n", text):
	            p = para.strip()
	            if p and not HEAD.match(p) and not _is_contact(p):
	                out["Summary"] = _SUMMARY_PREFIX.sub("", p)
	                break

	    # ---------- experiences ----------
	    idx = 0
	    while idx < len(lines):
	        header = _parse_role_header(lines, idx)
	        if header is None:
	            idx += 1
	            continue
	        title, company, dates, consumed = header
	        idx += consumed

	        # Collect bullet lines until a blank line, heading, or the next role/date line.
	        responsibilities = []
	        while idx < len(lines):
	            nxt = lines[idx].strip()
	            if _ends_role(nxt):
	                break
	            if BULLET.match(nxt):
	                duty = BULLET.sub("", nxt).strip()
	                if duty:  # only add non-empty responsibilities
	                    responsibilities.append(duty)
	            idx += 1

	        out["StructuredExperiences"].append({
	            "title": title,
	            "company": company,
	            "date_range": dates,
	            "responsibilities": responsibilities,
	        })

	    # ---------- education / training / certifications ----------
	    # Use spaCy matchers if available, otherwise use the regex fallback.
	    if doc is not None and edu_matcher is not None:
	        for match_id, start, end in edu_matcher(doc):
	            bucket = "Education" if nlp.vocab.strings[match_id] == "EDU" else "Training"
	            out[bucket].append(doc[start:end].text)
	    else:
	        for pattern in _EDU_FALLBACK_PATTERNS:
	            for hit in pattern.findall(text):
	                hit = hit.strip()
	                if len(hit) > 3:
	                    out["Education"].append(hit)

	    # Also extract the Certifications section manually.
	    # NOTE(review): this runs regardless of which branch above was taken --
	    # confirm against the intended nesting.
	    for entry in _section_lines(lines, _CERT_HEADING, False) or []:
	        # Several certifications may share one comma-separated line.
	        for cert in _COMMA_SPLIT.split(entry):
	            cert = cert.strip()
	            if cert and not _is_contact(cert):
	                out["Training"].append(cert)

	    return out