| import pandas as pd |
| from utils.text_processing import clean_text, tokenize_text |
| from sklearn.model_selection import train_test_split |
|
|
def preprocess_jobcv_data(file_path):
    """Load a resume/job-description CSV, add derived columns, and split it.

    The CSV is read with ``pd.read_csv`` and must contain the columns
    ``resume``, ``job_description`` and ``match_label``.  The following
    columns are added to the loaded frame, in this order:

    * ``cleaned_resume`` / ``cleaned_jd`` -- ``clean_text`` applied to each
      ``resume`` / ``job_description`` value.
    * ``resume_tokens`` / ``jd_tokens`` -- ``tokenize_text`` applied to each
      cleaned value.
    * ``match_score`` -- 1 where ``match_label`` equals the exact string
      ``'match'``, 0 for every other value (including missing ones).

    Args:
        file_path: Location of the CSV; passed unchanged to ``pd.read_csv``.

    Returns:
        ``(train_df, test_df)``: the two DataFrames produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.

    Raises:
        KeyError: If any of the required columns is absent from the CSV.
    """
    df = pd.read_csv(file_path)

    # Fail early with a readable message instead of an opaque pandas KeyError
    # raised halfway through the feature derivation below.
    required_columns = ('resume', 'job_description', 'match_label')
    missing_columns = [
        column for column in required_columns if column not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"{file_path!r} is missing required column(s): {missing_columns}"
        )

    # NOTE(review): empty CSV cells are loaded by pandas as NaN (a float) and
    # are handed to clean_text as-is -- confirm the helper tolerates
    # non-string input, or drop/fill such rows before calling this function.
    df['cleaned_resume'] = df['resume'].apply(clean_text)
    df['cleaned_jd'] = df['job_description'].apply(clean_text)
    df['resume_tokens'] = df['cleaned_resume'].apply(tokenize_text)
    df['jd_tokens'] = df['cleaned_jd'].apply(tokenize_text)

    # Vectorized binary encoding of the label; the comparison is exact and
    # case-sensitive, and NaN compares unequal so it maps to 0.
    df['match_score'] = (df['match_label'] == 'match').astype('int64')

    # Fixed seed keeps the split reproducible between runs.  No ``stratify``
    # argument is passed, so class balance across the two parts is not
    # enforced.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    return train_df, test_df