"""
Model Training Tests

These tests verify that the model training process works correctly:
- Training completes without errors
- A trained model performs clearly better than chance
- The model can overfit (memorize) a tiny dataset, confirming capacity
- Training works in different execution configurations (single core,
  multiple cores, and GPU availability is detected when present)
- Training is reproducible under a fixed random seed
- A trained model survives a save/load round trip

Based on the "Testing Models" section from the behavioral testing framework.
"""
| | import pytest |
| | import numpy as np |
| | import torch |
| | from sklearn.ensemble import RandomForestClassifier |
| | from sklearn.multioutput import MultiOutputClassifier |
| | from sklearn.model_selection import train_test_split |
| | from sklearn.metrics import f1_score |
| | from pathlib import Path |
| |
|
| | from hopcroft_skill_classification_tool_competition.config import DATA_PATHS, TRAINING_CONFIG |
| |
|
| |
|
@pytest.mark.training
class TestModelTraining:
    """Test suite for model training validation.

    Each test loads a small slice of the feature/label arrays referenced by
    ``DATA_PATHS`` so the suite stays fast. Common setup is factored into two
    private helpers:

    - ``_load_filtered_subset``: loads the first N samples and drops label
      columns that are all-zero within that slice.
    - ``_make_model``: builds the multi-label RandomForest under test.
    """

    @staticmethod
    def _load_filtered_subset(n_samples):
        """Load the first *n_samples* rows of features and labels.

        Label columns that contain no positive example in the slice are
        removed: sklearn classifiers cannot be fit on a target column with a
        single class, and small slices of a sparse multi-label matrix often
        contain such columns.

        Returns:
            tuple[np.ndarray, np.ndarray]: ``(X, Y)`` with filtered labels.
        """
        X = np.load(DATA_PATHS["features"])[:n_samples]
        Y = np.load(DATA_PATHS["labels"])[:n_samples]
        # Keep only label columns with at least one positive sample.
        Y = Y[:, Y.sum(axis=0) > 0]
        return X, Y

    @staticmethod
    def _make_model(n_estimators=10, max_depth=5, n_jobs=-1, min_samples_split=2):
        """Build the multi-label model under test.

        A RandomForest wrapped in ``MultiOutputClassifier`` (one forest per
        label column). ``random_state`` is pinned so every test is
        deterministic.
        """
        return MultiOutputClassifier(
            RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                random_state=42,
                n_jobs=n_jobs,
            )
        )

    def test_training_completes_without_errors(self):
        """
        Test that training completes without raising exceptions.

        Uses a small subset of data for fast testing.
        """
        X, Y = self._load_filtered_subset(100)

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        model = self._make_model(n_estimators=10, max_depth=5)

        try:
            model.fit(X_train, Y_train)
            predictions = model.predict(X_test)
            assert predictions.shape == Y_test.shape, "Prediction shape mismatch"
        except Exception as e:
            pytest.fail(f"Training failed with error: {e}")

    def test_decreasing_loss_after_training(self):
        """
        Test that training actually improved the model.

        RandomForest has no epoch-wise loss to inspect, so we verify the
        equivalent property: after fitting, the model performs clearly better
        than random on held-out data.
        """
        X, Y = self._load_filtered_subset(200)

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.2, random_state=42
        )

        model = self._make_model(n_estimators=20, max_depth=5)
        model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Micro-averaged F1 across all label columns; zero_division=0 guards
        # against columns the model never predicts positive for.
        f1 = f1_score(Y_test, Y_pred, average='micro', zero_division=0)

        print(f"\nF1 Score after training: {f1:.4f}")

        assert f1 > 0.1, (
            f"Model F1 score ({f1:.4f}) is too low, "
            "suggests training didn't improve performance"
        )

    def test_overfitting_on_single_batch(self):
        """
        Test that model can overfit on a single batch.

        A model should be able to memorize a small dataset (overfitting check).
        This verifies the model has sufficient capacity to learn.
        """
        X, Y = self._load_filtered_subset(20)

        # Unconstrained depth + more trees so the forest can memorize.
        model = self._make_model(
            n_estimators=50, max_depth=None, min_samples_split=2
        )
        model.fit(X, Y)

        Y_pred = model.predict(X)

        # Element-wise label accuracy on the training data itself.
        accuracy = (Y_pred == Y).mean()

        print(f"\nTraining accuracy (should overfit): {accuracy:.4f}")

        assert accuracy > 0.7, (
            f"Model cannot overfit on small dataset (accuracy: {accuracy:.4f}). "
            "This suggests the model lacks capacity to learn."
        )

    def test_training_on_cpu(self):
        """
        Test that training works on CPU.
        """
        X, Y = self._load_filtered_subset(50)

        # n_jobs=1 forces single-core execution.
        model = self._make_model(n_estimators=10, max_depth=5, n_jobs=1)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training on CPU successful")
        except Exception as e:
            pytest.fail(f"Training on CPU failed: {e}")

    def test_training_on_multiple_cores(self):
        """
        Test that training works with parallel processing (multiple CPU cores).
        """
        X, Y = self._load_filtered_subset(50)

        # n_jobs=-1 uses all available cores.
        model = self._make_model(n_estimators=10, max_depth=5, n_jobs=-1)

        try:
            model.fit(X, Y)
            predictions = model.predict(X)
            assert predictions.shape == Y.shape
            print("\n[PASS] Training with multiple CPU cores successful")
        except Exception as e:
            pytest.fail(f"Training with multiple cores failed: {e}")

    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
    def test_training_on_gpu(self):
        """
        Test that training works on GPU (if available).

        Note: RandomForest doesn't use GPU, but this test demonstrates
        the pattern for models that do (like neural networks).
        """
        assert torch.cuda.is_available(), "GPU should be available"
        print(f"\n[PASS] GPU detected: {torch.cuda.get_device_name(0)}")
        print("Note: RandomForest uses CPU. This test verifies GPU availability.")

    def test_reproducibility_with_random_seed(self):
        """
        Test that training is reproducible when using the same random seed.
        """
        X, Y = self._load_filtered_subset(50)

        # Two independently constructed models with identical seeds must
        # produce identical predictions.
        model1 = self._make_model(n_estimators=10, max_depth=5)
        model1.fit(X, Y)
        pred1 = model1.predict(X)

        model2 = self._make_model(n_estimators=10, max_depth=5)
        model2.fit(X, Y)
        pred2 = model2.predict(X)

        assert np.array_equal(pred1, pred2), (
            "Models with same random seed should produce identical predictions"
        )
        print("\n[PASS] Training is reproducible with random seed")

    def test_model_improves_with_more_data(self):
        """
        Test that model performance improves with more training data.
        """
        X_full, Y_full = self._load_filtered_subset(500)

        X_train_full, X_test, Y_train_full, Y_test = train_test_split(
            X_full, Y_full, test_size=0.2, random_state=42
        )

        # Small training set: first 50 training samples only.
        X_small = X_train_full[:50]
        Y_small = Y_train_full[:50]

        model_small = self._make_model(n_estimators=20, max_depth=5)
        model_small.fit(X_small, Y_small)
        pred_small = model_small.predict(X_test)
        f1_small = f1_score(Y_test, pred_small, average='micro', zero_division=0)

        # Full training set.
        model_large = self._make_model(n_estimators=20, max_depth=5)
        model_large.fit(X_train_full, Y_train_full)
        pred_large = model_large.predict(X_test)
        f1_large = f1_score(Y_test, pred_large, average='micro', zero_division=0)

        print(f"\nF1 with 50 samples: {f1_small:.4f}")
        print(f"F1 with {len(X_train_full)} samples: {f1_large:.4f}")

        # Allow a 10% tolerance: more data should never be *significantly*
        # worse, though small-sample noise can make it slightly lower.
        assert f1_large >= f1_small * 0.9, (
            f"Model with more data ({f1_large:.4f}) should not perform "
            f"significantly worse than with less data ({f1_small:.4f})"
        )

    def test_model_saves_and_loads_correctly(self, tmp_path):
        """
        Test that trained model can be saved and loaded without errors.
        """
        import joblib

        X, Y = self._load_filtered_subset(50)

        model = self._make_model(n_estimators=10, max_depth=5)
        model.fit(X, Y)
        pred_original = model.predict(X)

        # Round-trip through pytest's per-test temporary directory.
        model_path = tmp_path / "test_model.pkl"
        joblib.dump(model, model_path)

        loaded_model = joblib.load(model_path)
        pred_loaded = loaded_model.predict(X)

        assert np.array_equal(pred_original, pred_loaded), (
            "Loaded model should produce identical predictions"
        )
        print("\n[PASS] Model saves and loads correctly")
| |
|