| | """ |
| | Train-Test Validation Suite - Deepchecks validation for train-test consistency |
| | |
| | This module implements comprehensive train-test validation checks using Deepchecks |
| | to ensure consistency and proper splitting between training and test datasets. |
| | |
| | Checks included: |
| | - Train-Test Feature Drift: Detects distribution changes between train and test |
| | - Train-Test Label Drift: Checks if label distribution differs |
| | - Train-Test Samples Mix: Validates no data leakage |
| | - Whole Dataset Drift: Overall distribution comparison |
| | - Feature Label Correlation Change: Checks if correlations change |
| | - New Label: Detects labels in test not present in train |
| | - New Category: Detects new categorical values in test |
| | - String Mismatch Comparison: Compares string inconsistencies |
| | - Date Train Test Leakage Duplicates: Checks for temporal leakage |
| | - Date Train Test Leakage Overlap: Validates proper temporal split |
| | """ |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import json |
| | from pathlib import Path |
| | from deepchecks.tabular import Dataset |
| | from deepchecks.tabular.suites import train_test_validation |
| |
|
| | from hopcroft_skill_classification_tool_competition.config import PROCESSED_DATA_DIR |
| |
|
| |
|
def load_train_test_data(use_cleaned=True):
    """
    Load the train/test feature matrices and label arrays from disk.

    Args:
        use_cleaned: If True (default), read the artifacts produced by the
            cleaning step (files carrying a ``_clean`` suffix); otherwise
            read the original artifacts.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as numpy arrays.
    """
    tfidf_dir = PROCESSED_DATA_DIR / "tfidf"

    # The two variants differ only by the "_clean" filename suffix.
    suffix = "_clean" if use_cleaned else ""
    data_type = "cleaned" if use_cleaned else "original"

    X_train = np.load(tfidf_dir / f"features_tfidf{suffix}.npy")
    y_train = np.load(tfidf_dir / f"labels_tfidf{suffix}.npy")
    X_test = np.load(tfidf_dir / f"X_test{suffix}.npy")
    y_test = np.load(tfidf_dir / f"Y_test{suffix}.npy")

    print(f"Loaded {data_type} data:")
    print(f"Training set shape: X={X_train.shape}, y={y_train.shape}")
    print(f"Test set shape: X={X_test.shape}, y={y_test.shape}")

    return X_train, y_train, X_test, y_test
| |
|
| |
|
def create_deepchecks_dataset(X, y, dataset_name="dataset"):
    """
    Build a Deepchecks ``Dataset`` from numpy feature/label arrays.

    Features are wrapped in a DataFrame with synthetic column names
    (``feature_0`` .. ``feature_{n-1}``). A 2-D multi-label matrix is
    reduced to a single label per row via argmax, since the tabular
    Deepchecks checks expect one label column.

    Args:
        X: Feature matrix (numpy array), one row per sample.
        y: Labels (numpy array) - 2D multi-label matrix or 1D label vector.
        dataset_name: Name identifier for the dataset. NOTE: currently
            accepted for interface compatibility but not used below.

    Returns:
        Dataset: Deepchecks Dataset object with a 'label' column and no
        categorical features.
    """
    columns = [f"feature_{i}" for i in range(X.shape[1])]
    frame = pd.DataFrame(X, columns=columns)

    if y.ndim > 1 and y.shape[1] > 1:
        # Multi-label matrix: keep only the highest-scoring label per row.
        frame['label'] = np.argmax(y, axis=1)
        print(f"Note: Converted multi-label ({y.shape[1]} labels) to single-label for Deepchecks")
    else:
        frame['label'] = y

    # All features are continuous TF-IDF values, hence cat_features=[].
    return Dataset(frame, label='label', cat_features=[])
| |
|
| |
|
def run_train_test_validation_suite(save_output=True, use_cleaned=True):
    """
    Run the built-in Deepchecks Train-Test Validation Suite.

    The suite bundles the standard train/test consistency checks
    (feature drift, label drift, samples mix, whole-dataset drift,
    feature-label correlation change, new labels/categories, and
    leakage checks — exact set depends on the Deepchecks version).

    Args:
        save_output: When True, write a JSON summary of the check results
            under ``reports/deepchecks``.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        SuiteResult: Results from the train-test validation suite.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    separator = "=" * 80
    print(separator)
    print(f"TRAIN-TEST VALIDATION SUITE - {data_type} DATA")
    print(separator)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")

    print("\nRunning Train-Test Validation checks...")
    result = train_test_validation().run(train_ds, test_ds)

    print("\nTrain-Test Validation Suite completed!")
    print(f"Total checks: {len(result.results)}")

    if save_output:
        output_dir = Path("reports/deepchecks")
        output_dir.mkdir(parents=True, exist_ok=True)

        suffix = "_clean" if use_cleaned else "_original"
        json_path = output_dir / f"train_test_validation_suite_results{suffix}.json"

        # hasattr guards keep this tolerant of Deepchecks API differences
        # between versions.
        json_results = {
            "suite_name": "Train-Test Validation Suite",
            "total_checks": len(result.results),
            "timestamp": pd.Timestamp.now().isoformat(),
            "checks": [
                {
                    "check_name": check_result.get_header(),
                    "passed": check_result.passed_conditions() if hasattr(check_result, 'passed_conditions') else None,
                    "display": str(check_result.display) if hasattr(check_result, 'display') else None,
                }
                for check_result in result.results
            ],
        }

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_results, f, indent=2, ensure_ascii=False)
        print(f"JSON results saved to: {json_path}")

    return result
| |
|
| |
|
def run_custom_train_test_checks(save_output=True, use_cleaned=True):
    """
    Run a hand-picked set of train-test validation checks for SkillScope.

    The checks target NLP/TF-IDF features and the project's multi-label
    classification setup.

    Args:
        save_output: Accepted for interface compatibility; this function
            does not currently write any report itself.
        use_cleaned: If True, use cleaned data instead of original.

    Returns:
        dict: Mapping of check key -> Deepchecks check result.
    """
    from deepchecks.tabular.checks import (
        TrainTestFeatureDrift,
        TrainTestLabelDrift,
        TrainTestSamplesMix,
        WholeDatasetDrift,
        FeatureLabelCorrelationChange,
    )

    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    separator = "=" * 80
    print(separator)
    print(f"CUSTOM TRAIN-TEST VALIDATION CHECKS - {data_type} DATA")
    print(separator)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)
    train_ds = create_deepchecks_dataset(X_train, y_train, "training")
    test_ds = create_deepchecks_dataset(X_test, y_test, "test")

    # (result key, progress message, check instance) — executed in order so
    # the console output keeps its numbered sequence.
    planned_checks = [
        ('feature_drift', "\n1. Checking for feature drift between train and test...", TrainTestFeatureDrift()),
        ('label_drift', "2. Checking for label drift between train and test...", TrainTestLabelDrift()),
        ('samples_mix', "3. Checking for data leakage (samples appearing in both sets)...", TrainTestSamplesMix()),
        ('dataset_drift', "4. Checking overall dataset drift...", WholeDatasetDrift()),
        ('correlation_change', "5. Checking for changes in feature-label correlation...", FeatureLabelCorrelationChange()),
    ]

    results = {}
    for key, message, check in planned_checks:
        print(message)
        results[key] = check.run(train_ds, test_ds)

    # NewLabel cannot be imported from this Deepchecks version, so it is
    # deliberately skipped rather than run.
    print("6. Skipping NewLabel check (not available in this Deepchecks version)")

    print("\nAll custom train-test checks completed!")

    return results
| |
|
| |
|
def compare_distributions(use_cleaned=True):
    """
    Print a statistical comparison of the train and test sets.

    Reports sample sizes, feature dimensions, label-set overlap, global
    mean/std of the feature matrices, and sparsity, flagging notable
    differences with WARNING lines.

    Args:
        use_cleaned: If True, compare cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    separator = "=" * 80
    print(separator)
    print(f"TRAIN-TEST DISTRIBUTION COMPARISON - {data_type} DATA")
    print(separator)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    n_train, n_test = X_train.shape[0], X_test.shape[0]
    print("\n1. SAMPLE SIZES:")
    print(f" Training: {n_train} samples")
    print(f" Test: {n_test} samples")
    print(f" Train/Test ratio: {n_train/n_test:.2f}")

    print("\n2. FEATURE DIMENSIONS:")
    print(f" Training features: {X_train.shape[1]}")
    print(f" Test features: {X_test.shape[1]}")
    if X_train.shape[1] == X_test.shape[1]:
        print(" ✓ Feature dimensions match")
    else:
        print(" WARNING: Feature dimensions don't match!")

    print("\n3. LABEL DISTRIBUTION:")
    uniq_train = np.unique(y_train)
    uniq_test = np.unique(y_test)

    print(f" Training unique labels: {len(uniq_train)}")
    print(f" Test unique labels: {len(uniq_test)}")

    # Labels seen only at test time would be impossible to learn.
    new_labels = set(uniq_test) - set(uniq_train)
    if new_labels:
        print(f" WARNING: {len(new_labels)} new labels in test set: {new_labels}")
    else:
        print(" No new labels in test set")

    # Train-only labels are merely informational (they just go unevaluated).
    missing_labels = set(uniq_train) - set(uniq_test)
    if missing_labels:
        print(f" INFO: {len(missing_labels)} labels only in train set")

    print("\n4. FEATURE STATISTICS COMPARISON:")
    print(f" Train - Mean: {X_train.mean():.4f}, Std: {X_train.std():.4f}")
    print(f" Test - Mean: {X_test.mean():.4f}, Std: {X_test.std():.4f}")

    mean_diff = abs(X_train.mean() - X_test.mean())
    std_diff = abs(X_train.std() - X_test.std())

    print(f" Mean difference: {mean_diff:.4f}")
    print(f" Std difference: {std_diff:.4f}")

    # 0.1 is a heuristic threshold for "noticeably different" global stats.
    if mean_diff > 0.1 or std_diff > 0.1:
        print(" WARNING: Significant statistical differences detected!")
    else:
        print(" Statistical distributions are similar")

    print("\n5. SPARSITY COMPARISON:")
    train_sparsity = (X_train == 0).sum() / X_train.size * 100
    test_sparsity = (X_test == 0).sum() / X_test.size * 100
    sparsity_gap = abs(train_sparsity - test_sparsity)
    print(f" Training sparsity: {train_sparsity:.2f}%")
    print(f" Test sparsity: {test_sparsity:.2f}%")
    print(f" Sparsity difference: {sparsity_gap:.2f}%")

    if sparsity_gap > 5:
        print(" WARNING: Significant sparsity difference!")
    else:
        print(" Sparsity levels are similar")
| |
|
| |
|
def validate_split_quality(use_cleaned=True):
    """
    Validate the quality of the train-test split.

    Checks that the test-set fraction falls in a sensible range and that
    the label distributions of train and test are statistically similar,
    using a chi-square goodness-of-fit test over the labels that appear
    in both sets.

    Args:
        use_cleaned: If True, validate cleaned data instead of original.
    """
    data_type = "CLEANED" if use_cleaned else "ORIGINAL"
    print("="*80)
    print(f"TRAIN-TEST SPLIT QUALITY VALIDATION - {data_type} DATA")
    print("="*80)

    X_train, y_train, X_test, y_test = load_train_test_data(use_cleaned=use_cleaned)

    total_samples = X_train.shape[0] + X_test.shape[0]
    test_ratio = X_test.shape[0] / total_samples

    print(f"\nTotal samples: {total_samples}")
    print(f"Test set ratio: {test_ratio:.2%}")

    # Conventional hold-out sizes; outside this range the estimate of test
    # performance (too small) or the training signal (too large) suffers.
    if 0.15 <= test_ratio <= 0.35:
        print(" Test set size is within recommended range (15-35%)")
    else:
        print(" WARNING: Test set size is outside recommended range")

    from scipy.stats import chisquare

    # Only labels present in BOTH sets can be compared.
    common_labels = np.intersect1d(np.unique(y_train), np.unique(y_test))

    if len(common_labels) > 0:
        train_dist = np.array([np.sum(y_train == label) for label in common_labels])
        test_dist = np.array([np.sum(y_test == label) for label in common_labels])

        # BUG FIX: scipy.stats.chisquare requires sum(f_obs) == sum(f_exp).
        # The previous code built expected counts as
        # (test_dist / len(y_test)) * len(y_train), whose total differs from
        # sum(train_dist) whenever either set contains labels outside
        # `common_labels`, making chisquare raise ValueError (or distort the
        # statistic). Normalize the test proportions over the common labels
        # and rescale them to the observed total instead.
        test_props = test_dist / test_dist.sum()
        expected = test_props * train_dist.sum()
        chi_stat, p_value = chisquare(train_dist, expected)

        print("\nLabel distribution similarity (chi-square test):")
        print(f" Chi-square statistic: {chi_stat:.4f}")
        print(f" P-value: {p_value:.4f}")

        if p_value > 0.05:
            print(" Label distributions are statistically similar (p > 0.05)")
        else:
            print(" WARNING: Label distributions differ significantly (p <= 0.05)")
    else:
        print(" WARNING: No common labels between train and test sets!")
| |
|
| |
|
if __name__ == "__main__":
    import sys

    # CLI: pass --original (or -o) to validate the pre-cleaning artifacts;
    # the cleaned artifacts are the default.
    use_cleaned = '--original' not in sys.argv and '-o' not in sys.argv

    if use_cleaned:
        print("Testing CLEANED data (from data_cleaning.py) - DEFAULT\n")
    else:
        print("Testing ORIGINAL data\n")
        print("Note: Using --original flag to test old data\n")

    # 1) Quick statistical comparison of the two splits.
    compare_distributions(use_cleaned=use_cleaned)

    # 2) Split-size and label-distribution sanity checks.
    print("\n")
    validate_split_quality(use_cleaned=use_cleaned)

    # 3) Full Deepchecks suite (also writes the JSON report).
    print("\n")
    suite_result = run_train_test_validation_suite(save_output=True, use_cleaned=use_cleaned)

    # 4) Targeted individual checks.
    print("\n")
    custom_results = run_custom_train_test_checks(save_output=True, use_cleaned=use_cleaned)

    print("\n" + "="*80)
    print("TRAIN-TEST VALIDATION COMPLETED")
    print("="*80)
    print("\nCheck the reports in the 'reports/deepchecks' directory")
| |
|