Spaces:
Paused
Paused
| import pandas as pd | |
| from utils.text_processing import clean_text, tokenize_text | |
| from sklearn.model_selection import train_test_split | |
def preprocess_jobcv_data(file_path, *, test_size=0.2, random_state=42):
    """Load a resume/job-description CSV, derive features, and split it.

    The CSV must provide the columns ``resume``, ``job_description`` and
    ``match_label``. Five columns are added before the split, in this order:
    ``cleaned_resume`` and ``cleaned_jd`` (``clean_text`` applied to the raw
    text), ``resume_tokens`` and ``jd_tokens`` (``tokenize_text`` applied to
    the cleaned text), and ``match_score`` (1 where ``match_label`` equals
    the string ``'match'``, otherwise 0).

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first argument.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Forwarded to ``train_test_split``. Defaults to 42.

    Returns:
        A ``(train_df, test_df)`` tuple as produced by ``train_test_split``.

    Raises:
        KeyError: If one of the required columns is missing from the CSV.
    """
    df = pd.read_csv(file_path)

    # read_csv turns empty cells into NaN (a float). Substitute an empty
    # string so the text helpers always receive a str.
    # NOTE(review): assumes clean_text("") is valid -- confirm in
    # utils.text_processing.
    resume_text = df['resume'].fillna('')
    jd_text = df['job_description'].fillna('')

    # Clean first, then tokenize the cleaned text.
    df['cleaned_resume'] = resume_text.apply(clean_text)
    df['cleaned_jd'] = jd_text.apply(clean_text)
    df['resume_tokens'] = df['cleaned_resume'].apply(tokenize_text)
    df['jd_tokens'] = df['cleaned_jd'].apply(tokenize_text)

    # Binary label (simplified example): exact, case-sensitive comparison
    # against 'match'; every other value, including missing ones, maps to 0.
    df['match_score'] = (df['match_label'] == 'match').astype(int)

    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return train_df, test_df