import re

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from utils.text_processing import clean_text, tokenize_text

# Heading regex for each section key. The plural / adjectival variants are
# listed explicitly so headings such as "Work Experiences" or
# "Educational Background" are still recognised once word boundaries are
# enforced.
_SECTION_HEADINGS = {
    'experience': r'experiences?',
    'education': r'education(?:al)?',
    'skills': r'skills',
    'summary': r'summary',
}


def _compile_section_pattern(name):
    """Compile the regex capturing the text that follows heading *name*."""
    stop_headings = '|'.join(
        heading
        for other, heading in _SECTION_HEADINGS.items()
        if other != name
    )
    # Word boundaries keep a keyword embedded in a longer word (e.g.
    # "experienced") from being mistaken for a heading. The lazy group stops
    # at the first heading of any *other* section, or at the end of the text.
    return re.compile(
        r'\b(?:{heading})\b(.+?)(?=\b(?:{stops})\b|$)'.format(
            heading=_SECTION_HEADINGS[name],
            stops=stop_headings,
        ),
        re.IGNORECASE | re.DOTALL,
    )


# Compiled once at import time; the patterns do not depend on the input.
_SECTION_PATTERNS = {
    name: _compile_section_pattern(name) for name in _SECTION_HEADINGS
}


def extract_sections(resume_text):
    """Split raw resume text into its main sections.

    For each of the keys ``experience``, ``education``, ``skills`` and
    ``summary`` the first whole-word, case-insensitive occurrence of the
    heading is located. The text that follows it, up to the next heading of
    a different section or the end of the input, is passed through
    ``clean_text`` and stored under that key.

    Args:
        resume_text: Full resume text. Any non-string value (for example
            ``None`` or a NaN coming from a DataFrame cell) is treated as
            an empty resume.

    Returns:
        dict: The four section keys mapped to the result of ``clean_text``
        for that section, or to ``''`` when the section was not found.
    """
    sections = {name: '' for name in _SECTION_HEADINGS}

    # re.search raises TypeError on non-string input; treat it as "no text".
    if not isinstance(resume_text, str):
        return sections

    for name, pattern in _SECTION_PATTERNS.items():
        match = pattern.search(resume_text)
        if match:
            sections[name] = clean_text(match.group(1))

    return sections


def reduce_dimensionality(texts, n_components=100):
    """Project texts onto a low-dimensional space via TF-IDF and SVD.

    Args:
        texts: Iterable of documents handed to
            ``TfidfVectorizer.fit_transform``.
        n_components: Number of components requested from ``TruncatedSVD``.
            NOTE(review): scikit-learn documents that this must not exceed
            the number of TF-IDF features, so a very small corpus may
            raise ``ValueError`` -- confirm against the installed version.

    Returns:
        tuple: ``(reduced_matrix, svd, vectorizer)`` -- the output of
        ``svd.fit_transform`` on the TF-IDF matrix, followed by the fitted
        ``TruncatedSVD`` and ``TfidfVectorizer`` objects.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)

    svd = TruncatedSVD(n_components=n_components)
    reduced_matrix = svd.fit_transform(tfidf_matrix)

    return reduced_matrix, svd, vectorizer