skillsync-cli / model /praproses /resume_preprocessing.py
Mr-Haseeb786
Clean deployment build
88da18c
Raw
History Blame Contribute Delete
1.51 kB
import pandas as pd
import re
from utils.text_processing import clean_text, tokenize_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
def extract_sections(resume_text):
    """Pull the experience, education, skills and summary parts out of a resume.

    For every section a case-insensitive regex is searched: the section
    keyword, then the shortest non-empty run of text (newlines included)
    that stops right before one of the other section keywords or at the end
    of the text. Only the first occurrence of each keyword is used.

    Note: keywords are matched as plain substrings anywhere in the text
    (no word-boundary or heading check), so a word such as "experienced"
    also starts the experience section.

    Args:
        resume_text: The resume content as a single string.

    Returns:
        A dict with the keys 'experience', 'education', 'skills' and
        'summary'. Each value is the captured text passed through
        ``clean_text``, or '' when the keyword is absent or nothing
        follows it.
    """
    section_names = ('experience', 'education', 'skills', 'summary')
    search_flags = re.IGNORECASE | re.DOTALL

    # Start with every section empty; matches below overwrite the default.
    sections = dict.fromkeys(section_names, '')

    for name in section_names:
        # A section ends where any *other* section keyword begins.
        stop_words = '|'.join(other for other in section_names if other != name)
        pattern = name + r'(.+?)(?=' + stop_words + r'|$)'
        found = re.search(pattern, resume_text, search_flags)
        if found is not None:
            sections[name] = clean_text(found.group(1))

    return sections
def reduce_dimensionality(texts, n_components=100, random_state=None):
    """Vectorize texts with TF-IDF and project them with truncated SVD.

    Args:
        texts: Iterable of raw text documents, passed to
            ``TfidfVectorizer.fit_transform``.
        n_components: Requested number of SVD components. If the fitted
            TF-IDF vocabulary has fewer features than this, the number of
            components is lowered to the feature count instead of letting
            ``TruncatedSVD`` reject the request.
        random_state: Optional seed forwarded to ``TruncatedSVD`` so results
            can be made reproducible. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        A tuple ``(reduced_matrix, svd, vectorizer)``: the transformed
        matrix, the fitted ``TruncatedSVD`` and the fitted
        ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)

    # TruncatedSVD cannot produce more components than there are input
    # features; small corpora easily have a vocabulary below the default of
    # 100, so cap the request at the actual feature count.
    n_features = tfidf_matrix.shape[1]
    effective_components = min(n_components, n_features)

    svd = TruncatedSVD(
        n_components=effective_components,
        random_state=random_state,
    )
    reduced_matrix = svd.fit_transform(tfidf_matrix)
    return reduced_matrix, svd, vectorizer