Spaces:
Sleeping
Sleeping
Johnny
updated resume_format > template, hide sidebar, download Spacy model with spacy_loader.py
102e49d | import os, re, json, subprocess | |
| from utils.parser import extract_name # <= your helper | |
| from utils.spacy_loader import get_nlp, is_spacy_available | |
| from datetime import datetime | |
| from dateutil.parser import parse as date_parse | |
# Load the spaCy pipeline. The result may be falsy (no model available); the
# guard below then selects the no-spaCy fallback values.
nlp = get_nlp()

# ----------------------------- data lists -----------------------------
# Directory of this module; optional JSON data lists live under data/.
BASE = os.path.dirname(__file__)

# Built-in skill list used when data/skills.json is missing or spaCy is off.
_DEFAULT_SKILLS = ("python", "sql", "aws", "selenium")


def _load_json_data(relative_path, default):
    """Return the parsed JSON stored at ``BASE/relative_path``.

    ``default`` is returned when the file does not exist. The file is opened
    in a ``with`` block so the handle is always closed, and decoded as UTF-8
    rather than with the platform's locale encoding.
    """
    try:
        with open(os.path.join(BASE, relative_path), encoding="utf-8") as handle:
            return json.load(handle)
    except FileNotFoundError:
        return default


# Initialize spaCy matchers only if spaCy is available
if nlp and is_spacy_available():
    from spacy.matcher import PhraseMatcher, Matcher

    SKILLS = _load_json_data("data/skills.json", list(_DEFAULT_SKILLS))
    JOB_TITLES = _load_json_data("data/job_titles.json", [])

    # Case-insensitive phrase matcher over every known skill.
    skill_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    skill_matcher.add("SKILL", [nlp.make_doc(s) for s in SKILLS])

    # Token patterns: "bachelor of <Title-case…>" and "certified <Title-case…>".
    edu_matcher = Matcher(nlp.vocab)
    edu_matcher.add("EDU", [[{"LOWER": "bachelor"}, {"LOWER": "of"}, {"IS_TITLE": True, "OP": "+"}]])
    edu_matcher.add("CERT", [[{"LOWER": "certified"}, {"IS_TITLE": True, "OP": "+"}]])
else:
    # Fallback: set matchers to None when spaCy is not available
    skill_matcher = None
    edu_matcher = None
    SKILLS = list(_DEFAULT_SKILLS)
    JOB_TITLES = []
| # ----------------------------- regex helpers -------------------------- | |
# Shared verbose-mode fragment for "<Month> <YYYY>" optionally followed by
# "- Present" or "- <word> <YYYY>". The separator may be a hyphen, an en dash
# or an em dash. Every pattern that embeds this fragment must be compiled
# with re.X (whitespace in the fragment is layout only) and re.I.
_DATE_RANGE = r"""
    (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}
    (?:\s*[–—-]\s*(?:Present|\w+\s+\d{4}))?
"""

# Jonathan's format: Company | Location | Title | Date
ROLE_FOUR_PARTS = re.compile(
    r"^(?P<company>.+?)\s*\|\s*(?P<location>.+?)\s*\|\s*(?P<title>.+?)\s*\|\s*"
    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
    re.I | re.X,
)

# Original format: Title | Company | Date
ROLE_ONE = re.compile(
    r"^(?P<title>.+?)\s*\|\s*(?P<company>.+?)\s*\|\s*"
    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
    re.I | re.X,
)

# Also support the original comma/@ format for backward compatibility:
# "Company, Title <dates>" or "Company @ Title <dates>".
ROLE_ONE_COMMA = re.compile(
    r"^(?P<company>.+?)\s*[,@]\s*(?P<title>[^,@]+?)\s+"
    r"(?P<dates>" + _DATE_RANGE + r")\s*$",
    re.I | re.X,
)

# A line that consists of nothing but a date or date range.
DATE_LINE = re.compile(r"^" + _DATE_RANGE + r"\s*$", re.I | re.X)

# Leading bullet marker (-, •, ·, * or ●) followed by whitespace.
BULLET = re.compile(r"^\s*(?:[-•·]|\*|●)\s+")

# A line that is only a known section heading, optionally ending in ":".
HEAD = re.compile(r"^\s*(summary|skills?|technical\s+skills?|education|training|projects?|work\s+experience|experience|professional\s+experience|certifications?)[:\s]*$",re.I)
| # ----------------------------- main ----------------------------------- | |
def _is_contact(s):
    """Return True when ``s`` contains ``@`` followed by a word character or a phone-like digit run."""
    return bool(re.search(r"@\w|\d{3}[-.\s]?\d{3}", s))


def _extract_skills(lines):
    """Return the sorted, de-duplicated skills under the first "Technical Skills" heading."""
    found = set()
    for i, line in enumerate(lines):
        if not re.match(r"^\s*technical\s+skills?\s*$", line.strip(), re.I):
            continue
        for raw in lines[i + 1:]:
            entry = raw.strip()
            if not entry:
                continue
            # The section ends at the next heading or at contact details.
            if HEAD.match(entry) or _is_contact(entry):
                break
            if entry.startswith("●"):
                # "● Category: a, b, c" -> keep only what follows the colon;
                # a "●" line without a colon contributes nothing.
                body = entry[1:].strip()
                if ":" not in body:
                    continue
                for skill in re.split(r",\s*", body.split(":", 1)[1].strip()):
                    skill = skill.strip()
                    # Skip one-character entries and pieces ending in ")",
                    # which are treated as incomplete.
                    if len(skill) > 1 and not skill.endswith(")"):
                        found.add(skill)
            else:
                # Plain comma-separated line; strip any other bullet glyph.
                for skill in re.split(r",\s*", entry):
                    skill = re.sub(r"^\s*[•·\-\*●]\s*", "", skill.strip())
                    if len(skill) > 1:
                        found.add(skill)
        break  # only the first matching heading is used
    return sorted(found)


def _extract_summary(lines, text):
    """Return the summary text, or "" when none can be found.

    First choice is the content under a "Summary" / "Professional Summary"
    heading, joined into one line. Otherwise the first blank-line-separated
    paragraph that is neither a section heading nor contact info is used,
    with any leading "summary" label removed.
    """
    for i, line in enumerate(lines):
        if not re.match(r"^\s*(professional\s+)?summary\s*$", line.strip(), re.I):
            continue
        collected = []
        for raw in lines[i + 1:]:
            entry = raw.strip()
            if not entry:
                continue
            if HEAD.match(entry) or _is_contact(entry):
                break
            collected.append(entry)
        if collected:
            return " ".join(collected)
        break  # heading present but empty: fall through to the fallback

    for para in re.split(r"\n\s*\n", text):
        p = para.strip()
        if p and not HEAD.match(p) and not _is_contact(p):
            return re.sub(r"^(professional\s+)?summary[:,\s]+", "", p, flags=re.I)
    return ""


def _starts_two_line_role(lines, i):
    """Return True when ``lines[i]`` is a title line directly followed by a date-only line.

    Blank lines and section headings are never accepted as the title line;
    otherwise a heading or an empty line that happens to sit above a date
    would become an experience entry.
    """
    ln = lines[i].strip()
    if not ln or HEAD.match(ln):
        return False
    return i + 1 < len(lines) and bool(DATE_LINE.match(lines[i + 1].strip()))


def _match_role_header(lines, i):
    """Try to read a role header starting at ``lines[i]``.

    Returns ``(title, company, dates, consumed_line_count)`` or ``None``.
    Formats are tried in this order: ``Company | Location | Title | Date``,
    ``Title | Company | Date``, ``Company, Title Date`` and finally a title
    line followed by a date line.
    """
    ln = lines[i].strip()

    m = ROLE_FOUR_PARTS.match(ln)
    if m:
        company, location, title, dates = m.group("company", "location", "title", "dates")
        # Location is folded into the company field.
        return title, f"{company}, {location}", dates, 1

    m = ROLE_ONE.match(ln)
    if m:
        title, company, dates = m.group("title", "company", "dates")
        return title, company, dates, 1

    m = ROLE_ONE_COMMA.match(ln)
    if m:
        company, title, dates = m.group("company", "title", "dates")
        return title, company, dates, 1

    if _starts_two_line_role(lines, i):
        # Split "Title, Company" / "Title @ Company" / "Title | Company" once.
        # maxsplit is passed by keyword: the positional form is deprecated.
        parts = re.split(r"[,@|]\s*", ln, maxsplit=1)
        if len(parts) == 2:
            title, company = parts[0].strip(), parts[1].strip()
        else:
            title, company = ln, ""
        return title, company, lines[i + 1].strip(), 2

    return None


def _extract_experiences(lines):
    """Return one dict per detected role: title, company, date_range, responsibilities."""
    experiences = []
    i = 0
    while i < len(lines):
        header = _match_role_header(lines, i)
        if header is None:
            i += 1
            continue
        title, company, dates, consumed = header
        i += consumed

        exp = {
            "title": title,
            "company": company,
            "date_range": dates,
            "responsibilities": [],
        }
        # Collect bullet lines until a blank line, a heading, a bare date
        # line or the next role header.
        while i < len(lines):
            nxt = lines[i].strip()
            if (not nxt or HEAD.match(nxt) or ROLE_FOUR_PARTS.match(nxt)
                    or ROLE_ONE.match(nxt) or ROLE_ONE_COMMA.match(nxt)
                    or DATE_LINE.match(nxt)):
                break
            if BULLET.match(nxt):
                responsibility = BULLET.sub("", nxt).strip()
                if responsibility:  # Only add non-empty responsibilities
                    exp["responsibilities"].append(responsibility)
            elif _starts_two_line_role(lines, i):
                # A non-bullet line sitting right above a date line is the
                # title of the next two-line role; leave it for the outer
                # loop instead of skipping over it.
                break
            i += 1
        experiences.append(exp)
    return experiences


def _extract_education_and_training(text, doc):
    """Return ``(education, training)`` lists.

    With a spaCy ``doc`` and matcher available, "EDU" matches go to education
    and every other match to training. Otherwise a regex fallback fills
    education only.
    """
    education, training = [], []
    if doc and edu_matcher and is_spacy_available():
        for match_id, start, end in edu_matcher(doc):
            bucket = education if nlp.vocab.strings[match_id] == "EDU" else training
            bucket.append(doc[start:end].text)
        return education, training

    # Regex fallback for education extraction. Each pattern has at most one
    # capture group, so re.findall yields plain strings.
    edu_patterns = (
        r"(?i)\b(?:bachelor|master|phd|doctorate|associate).*(?:degree|of|in)\s+([^,\n]+)",
        r"(?i)\b(?:bs|ba|ms|ma|mba|phd)\s+(?:in\s+)?([^,\n]+)",
        r"(?i)\b(?:university|college|institute).*\n?.*(?:bachelor|master|phd|degree)",
    )
    for pattern in edu_patterns:
        for match in re.findall(pattern, text):
            if len(match.strip()) > 3:
                education.append(match.strip())
    return education, training


def _extract_certifications(lines):
    """Return the comma-separated entries listed under the first "Certifications" heading."""
    certs = []
    for i, line in enumerate(lines):
        if not re.match(r"^\s*certifications?\s*$", line.strip(), re.I):
            continue
        for raw in lines[i + 1:]:
            entry = raw.strip()
            if not entry:
                continue
            if HEAD.match(entry):  # Next section heading
                break
            # Several certifications may share one line.
            for cert in re.split(r",\s*", entry):
                cert = cert.strip()
                if cert and not _is_contact(cert):
                    certs.append(cert)
        break  # only the first matching heading is used
    return certs


def extract_sections_spacy_fixed(text: str) -> dict:
    """Split resume ``text`` into structured sections.

    Args:
        text: Full plain-text resume.

    Returns:
        A dict with the keys ``Name`` (result of ``extract_name``),
        ``Summary`` (str), ``Skills`` (sorted list of str, taken only from
        the "Technical Skills" section), ``StructuredExperiences`` (list of
        dicts with ``title``, ``company``, ``date_range`` and
        ``responsibilities``), ``Education`` (list of str) and ``Training``
        (list of str: matcher hits followed by the "Certifications" section
        entries).
    """
    lines = [ln.rstrip() for ln in text.splitlines()]

    # Only create spaCy doc if nlp is available
    doc = nlp(text) if nlp and is_spacy_available() else None

    out = {
        "Name": extract_name(text),
        "Summary": "",
        "Skills": [],
        "StructuredExperiences": [],
        "Education": [],
        "Training": [],
    }

    out["Skills"] = _extract_skills(lines)
    out["Summary"] = _extract_summary(lines, text)
    out["StructuredExperiences"] = _extract_experiences(lines)

    education, training = _extract_education_and_training(text, doc)
    out["Education"].extend(education)
    out["Training"].extend(training)
    # The certifications section is scanned in addition to the matchers.
    out["Training"].extend(_extract_certifications(lines))
    return out