Spaces:

SkillSync-II
/

skillsync-cli

Paused

skillsync-cli / model /praproses /jobcv_preprocessing.py

Mr-Haseeb786

Sanitized Production Build

56c7b6d 2 days ago

761 Bytes

	import pandas as pd
	from utils.text_processing import clean_text, tokenize_text
	from sklearn.model_selection import train_test_split

	def preprocess_jobcv_data(file_path, *, test_size=0.2, random_state=42):
	    """Load a resume/job-description CSV, derive features, and split it.

	    The CSV must provide the columns ``resume``, ``job_description`` and
	    ``match_label``. The following columns are added to the loaded frame:

	    * ``cleaned_resume`` / ``cleaned_jd`` -- ``clean_text`` applied to each
	      ``resume`` / ``job_description`` value.
	    * ``resume_tokens`` / ``jd_tokens`` -- ``tokenize_text`` applied to each
	      cleaned value.
	    * ``match_score`` -- ``1`` where ``match_label`` equals the string
	      ``'match'`` exactly, ``0`` for every other value.

	    NOTE(review): ``clean_text`` and ``tokenize_text`` come from
	    ``utils.text_processing``; how they treat missing (NaN) cells is not
	    visible from this function -- confirm before feeding sparse data.

	    Args:
	        file_path: Location of the CSV, passed straight to ``pd.read_csv``.
	        test_size: Forwarded to ``train_test_split``; defaults to ``0.2``.
	        random_state: Forwarded to ``train_test_split``; defaults to ``42``
	            so repeated calls on the same file give the same split.

	    Returns:
	        A ``(train_df, test_df)`` tuple of DataFrames.

	    Raises:
	        KeyError: If one of the required columns is absent from the CSV.
	    """
	    df = pd.read_csv(file_path)

	    # Clean each free-text column first, then tokenize the cleaned text.
	    text_columns = (
	        ('resume', 'cleaned_resume', 'resume_tokens'),
	        ('job_description', 'cleaned_jd', 'jd_tokens'),
	    )
	    for _, cleaned_col, _ in text_columns:
	        pass  # placeholder loop removed below; kept order explicit
	    df['cleaned_resume'] = df['resume'].apply(clean_text)
	    df['cleaned_jd'] = df['job_description'].apply(clean_text)
	    df['resume_tokens'] = df['cleaned_resume'].apply(tokenize_text)
	    df['jd_tokens'] = df['cleaned_jd'].apply(tokenize_text)

	    # Binary label (simplified example): only the exact string 'match' maps
	    # to 1; anything else, including missing values, maps to 0.
	    df['match_score'] = df['match_label'].eq('match').astype('int64')

	    train_df, test_df = train_test_split(
	        df, test_size=test_size, random_state=random_state
	    )
	    return train_df, test_df