skillsync-cli / model /praproses /jobcv_preprocessing.py
Mr-Haseeb786
Sanitized Production Build
56c7b6d
Raw
History Blame Contribute Delete
761 Bytes
import pandas as pd
from utils.text_processing import clean_text, tokenize_text
from sklearn.model_selection import train_test_split
def preprocess_jobcv_data(file_path):
    """Load a resume/job-description CSV and return train/test splits.

    The CSV must contain the columns ``resume``, ``job_description`` and
    ``match_label``. Five derived columns are added before splitting, so
    both returned frames carry the original and the derived columns:

    * ``cleaned_resume`` / ``cleaned_jd`` -- ``clean_text`` applied to the
      raw text columns.
    * ``resume_tokens`` / ``jd_tokens`` -- ``tokenize_text`` applied to the
      cleaned columns.
    * ``match_score`` -- 1 where ``match_label`` equals the string
      ``'match'`` exactly, otherwise 0.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the CSV file to load.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames produced by an 80/20
        ``train_test_split`` with ``random_state=42``.

    Raises:
        KeyError: If one or more of the required columns is missing. The
            message lists every missing column.
    """
    # Load data
    df = pd.read_csv(file_path)

    # Fail fast with a single message naming every missing column, instead
    # of a bare per-column KeyError after part of the work is already done.
    required_columns = ('resume', 'job_description', 'match_label')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            "Input file is missing required column(s): "
            + ", ".join(missing_columns)
        )

    # Clean and tokenize
    # NOTE(review): empty CSV cells are loaded as NaN (float) by pandas;
    # presumably clean_text must cope with non-string input -- confirm.
    df['cleaned_resume'] = df['resume'].apply(clean_text)
    df['cleaned_jd'] = df['job_description'].apply(clean_text)
    df['resume_tokens'] = df['cleaned_resume'].apply(tokenize_text)
    df['jd_tokens'] = df['cleaned_jd'].apply(tokenize_text)

    # Create labels (simplified example): vectorized comparison rather than
    # a row-wise lambda; anything other than the exact string 'match' is 0.
    df['match_score'] = (df['match_label'] == 'match').astype('int64')

    # Split data; the fixed seed keeps the split reproducible across runs.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    return train_df, test_df