# Page header (from the file viewer): Spaces: Paused; File size: 1,514 Bytes; 56c7b6d
import pandas as pd
import re
from utils.text_processing import clean_text, tokenize_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
# Section keywords recognised in a resume; also the keys of the result dict.
_SECTION_NAMES = ('experience', 'education', 'skills', 'summary')


def _build_section_pattern(name):
    """Compile the regex that captures the text following keyword *name*."""
    # A section ends at the first occurrence of any *other* section keyword,
    # or at the end of the text if none follows.
    stops = '|'.join(other for other in _SECTION_NAMES if other != name)
    return re.compile(
        r'%s(.+?)(?=%s|$)' % (name, stops),
        re.IGNORECASE | re.DOTALL,
    )


# Compiled once at import time instead of on every call.
_SECTION_PATTERNS = {name: _build_section_pattern(name) for name in _SECTION_NAMES}


def extract_sections(resume_text):
    """Split a resume into its experience/education/skills/summary sections.

    Simple regex-based extraction: for each section keyword, the text after
    the first (case-insensitive) occurrence of that keyword is captured up to
    the next occurrence of any other section keyword, or to the end of the
    text. The captured text is passed through ``clean_text``.

    NOTE: keywords are matched anywhere in the text, not only as headings,
    so a keyword used mid-sentence (e.g. "5 years of experience") is treated
    as a section boundary.

    Args:
        resume_text: Raw resume text. ``None`` or any non-string value
            (for example a NaN from a missing DataFrame cell) is treated as
            "no text" rather than raising ``TypeError``.

    Returns:
        A dict with the keys ``'experience'``, ``'education'``, ``'skills'``
        and ``'summary'``. Sections that are not found map to ``''``.
    """
    sections = dict.fromkeys(_SECTION_NAMES, '')

    # Guard against missing values: re.search() raises TypeError on non-str.
    if not isinstance(resume_text, str):
        return sections

    for name, pattern in _SECTION_PATTERNS.items():
        match = pattern.search(resume_text)
        if match:
            sections[name] = clean_text(match.group(1))
    return sections
def reduce_dimensionality(texts, n_components=100, random_state=None):
    """Vectorize texts with TF-IDF and project them with truncated SVD.

    Args:
        texts: Iterable of text documents passed to
            ``TfidfVectorizer.fit_transform``.
        n_components: Desired number of output dimensions. If the fitted
            TF-IDF matrix has fewer features than this, the number of
            components is reduced to the feature count, so the returned
            matrix may have fewer than ``n_components`` columns.
        random_state: Optional seed forwarded to ``TruncatedSVD`` to make the
            decomposition reproducible. Defaults to ``None``, which keeps
            the previous behaviour.

    Returns:
        A tuple ``(reduced_matrix, svd, vectorizer)``: the transformed
        matrix, the fitted ``TruncatedSVD`` and the fitted
        ``TfidfVectorizer``.
    """
    vectorizer = TfidfVectorizer(max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(texts)

    # TruncatedSVD raises ValueError when asked for more components than the
    # matrix has features, which happens with small vocabularies. Clamp
    # instead of crashing.
    n_features = tfidf_matrix.shape[1]
    effective_components = min(n_components, n_features)

    svd = TruncatedSVD(n_components=effective_components, random_state=random_state)
    reduced_matrix = svd.fit_transform(tfidf_matrix)
    return reduced_matrix, svd, vectorizer