| import pandas as pd |
| import re |
| from utils.text_processing import clean_text, tokenize_text |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.decomposition import TruncatedSVD |
|
|
def extract_sections(resume_text):
    """Split raw resume text into its well-known sections.

    A section starts at the first case-insensitive occurrence of its
    keyword (``experience``, ``education``, ``skills``, ``summary``) that is
    followed by at least one character, and runs lazily until the next
    occurrence of any *other* section keyword or the end of the text.  The
    captured text is passed through ``clean_text`` before being stored.

    Args:
        resume_text: Full resume text.  ``None`` or an empty string is
            treated as "no sections found" instead of raising.

    Returns:
        dict: Always has the keys ``'experience'``, ``'education'``,
        ``'skills'`` and ``'summary'``.  A section that was not found maps
        to ``''``.

    Note:
        Keywords are matched as plain substrings (the patterns carry no
        word boundaries or line anchors), so a keyword appearing inside
        another section's body ends that section at that point.
    """
    section_names = ('experience', 'education', 'skills', 'summary')
    sections = dict.fromkeys(section_names, '')

    # Nothing to search; also avoids the TypeError re.search raises on None.
    if not resume_text:
        return sections

    for name in section_names:
        # A section ends where any of the *other* headings begins.
        stop_words = '|'.join(
            other for other in section_names if other != name
        )
        pattern = rf'{name}(.+?)(?={stop_words}|$)'
        # DOTALL lets a section span several lines; without MULTILINE the
        # ``$`` alternative only matches at the very end of the text.
        match = re.search(pattern, resume_text, re.IGNORECASE | re.DOTALL)
        if match:
            sections[name] = clean_text(match.group(1))

    return sections
|
|
def reduce_dimensionality(texts, n_components=100, *, max_features=5000,
                          random_state=None):
    """Vectorize texts with TF-IDF and project them with truncated SVD.

    Args:
        texts: Iterable of raw text documents handed to
            ``TfidfVectorizer.fit_transform``.
        n_components: Requested number of SVD components.  If the fitted
            TF-IDF vocabulary has fewer features than this, the value is
            lowered to the feature count so small corpora do not make
            ``TruncatedSVD`` reject the request.
        max_features: Vocabulary size cap passed to ``TfidfVectorizer``.
        random_state: Seed passed to ``TruncatedSVD``; set it to get
            reproducible projections.  ``None`` keeps the previous unseeded
            behaviour.

    Returns:
        tuple: ``(reduced_matrix, svd, vectorizer)`` - the transformed
        matrix, the fitted ``TruncatedSVD`` and the fitted
        ``TfidfVectorizer``.  The fitted objects are returned so callers
        can apply the same transformation to new texts.
    """
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(texts)

    # The vocabulary size is only known after fitting, so the component
    # count can only be bounded here.
    n_features = tfidf_matrix.shape[1]
    effective_components = min(n_components, n_features)

    svd = TruncatedSVD(
        n_components=effective_components,
        random_state=random_state,
    )
    reduced_matrix = svd.fit_transform(tfidf_matrix)

    return reduced_matrix, svd, vectorizer