import pandas as pd
from utils.text_processing import clean_text, tokenize_text
from sklearn.model_selection import train_test_split

def preprocess_jobcv_data(file_path, *, test_size=0.2, random_state=42):
    """Load a resume/job-description CSV, derive features, and split it.

    The CSV must contain the columns ``resume``, ``job_description`` and
    ``match_label``. The following columns are added before splitting:

    * ``cleaned_resume`` / ``cleaned_jd``: ``clean_text`` applied to the
      ``resume`` / ``job_description`` columns.
    * ``resume_tokens`` / ``jd_tokens``: ``tokenize_text`` applied to the
      cleaned columns.
    * ``match_score``: 1 where ``match_label`` equals the string
      ``'match'`` exactly (case-sensitive), otherwise 0.

    Args:
        file_path: Path (or anything else ``pandas.read_csv`` accepts)
            of the CSV file to load.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Forwarded to ``train_test_split`` so that the
            split is reproducible; defaults to 42.

    Returns:
        A ``(train_df, test_df)`` pair as produced by
        ``train_test_split``.

    Raises:
        KeyError: If any of the required columns is missing.
    """
    required_columns = ("resume", "job_description", "match_label")

    # Load data
    df = pd.read_csv(file_path)

    # Fail early with one message listing every missing column, rather
    # than a bare KeyError part-way through processing.
    missing = [col for col in required_columns if col not in df.columns]
    if missing:
        raise KeyError(
            f"Input CSV is missing required column(s): {', '.join(missing)}"
        )

    # read_csv represents empty cells as NaN (a float). Replace them with
    # an empty string so clean_text is always handed a str.
    resume_text = df["resume"].fillna("")
    jd_text = df["job_description"].fillna("")

    # Clean and tokenize
    df["cleaned_resume"] = resume_text.apply(clean_text)
    df["cleaned_jd"] = jd_text.apply(clean_text)
    df["resume_tokens"] = df["cleaned_resume"].apply(tokenize_text)
    df["jd_tokens"] = df["cleaned_jd"].apply(tokenize_text)

    # Create labels (simplified example): vectorized equivalent of
    # ``1 if x == 'match' else 0``; a missing label compares unequal and
    # therefore maps to 0.
    df["match_score"] = df["match_label"].eq("match").astype("int64")

    # Split data
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )

    return train_df, test_df