File size: 1,514 Bytes
56c7b6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import pandas as pd
import re
from utils.text_processing import clean_text, tokenize_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def extract_sections(resume_text):
    """Pull the experience, education, skills and summary parts out of a resume.

    Each section is located with a case-insensitive regex: the section keyword,
    followed by the shortest run of at least one character (newlines included)
    that is itself followed by one of the other section keywords or the end of
    the text. Keywords are matched as plain substrings, not as whole words or
    headings.

    Args:
        resume_text: Resume contents as a single string.

    Returns:
        A dict with the keys 'experience', 'education', 'skills' and
        'summary'. A section that was located holds the captured text after it
        has been passed through ``clean_text``; a section that was not located
        maps to ''.
    """
    flags = re.IGNORECASE | re.DOTALL
    section_patterns = (
        ('experience', r'experience(.+?)(?=education|skills|summary|$)'),
        ('education', r'education(.+?)(?=experience|skills|summary|$)'),
        ('skills', r'skills(.+?)(?=experience|education|summary|$)'),
        ('summary', r'summary(.+?)(?=experience|education|skills|$)'),
    )

    # Run every search first and clean afterwards, so the cleaning calls happen
    # in section order once all lookups are done.
    hits = [
        (name, re.search(pattern, resume_text, flags))
        for name, pattern in section_patterns
    ]

    return {
        name: clean_text(hit.group(1)) if hit else ''
        for name, hit in hits
    }

def reduce_dimensionality(texts, n_components=100):
    """Vectorize texts with TF-IDF and project them with truncated SVD.

    Args:
        texts: The documents handed to ``TfidfVectorizer.fit_transform``.
        n_components: Number of components requested from ``TruncatedSVD``
            (default 100).

    Returns:
        A 3-tuple ``(reduced_matrix, svd, vectorizer)``: the output of the
        SVD's ``fit_transform``, the fitted ``TruncatedSVD`` instance, and the
        fitted ``TfidfVectorizer`` instance, in that order.
    """
    # The vectorizer is capped at 5000 features via max_features.
    tfidf = TfidfVectorizer(max_features=5000)
    term_weights = tfidf.fit_transform(texts)

    reducer = TruncatedSVD(n_components=n_components)
    projected = reducer.fit_transform(term_weights)

    return projected, reducer, tfidf