| | """ |
| | Pytest configuration and fixtures for behavioral tests. |
| | """ |
| | import pytest |
| | import numpy as np |
| | import joblib |
| | from pathlib import Path |
| | from sklearn.feature_extraction.text import TfidfVectorizer |
| |
|
| | from hopcroft_skill_classification_tool_competition.config import DATA_PATHS |
| | from hopcroft_skill_classification_tool_competition.features import ( |
| | clean_github_text, |
| | get_label_columns, |
| | load_data_from_db |
| | ) |
| |
|
| |
|
@pytest.fixture(scope="session")
def trained_model():
    """
    Load the trained model once per test session.

    The SMOTE grid-search model is preferred; the plain grid-search model is
    the fallback. If neither file exists under ``DATA_PATHS["models_dir"]``,
    the requesting tests are skipped.

    Returns:
        Whatever object ``joblib.load`` deserializes from the selected file.
    """
    models_dir = Path(DATA_PATHS["models_dir"])
    # Ordered by preference: the first file that exists is the one loaded.
    candidate_files = (
        "random_forest_tfidf_gridsearch_smote.pkl",
        "random_forest_tfidf_gridsearch.pkl",
    )

    for file_name in candidate_files:
        model_path = models_dir / file_name
        if model_path.exists():
            # NOTE: joblib.load unpickles the file, so it must come from a trusted source.
            return joblib.load(model_path)

    # model_path still refers to the last candidate tried (the fallback file).
    pytest.skip(f"Model not found at {model_path}. Please train a model first.")
| |
|
| |
|
@pytest.fixture(scope="session")
def tfidf_vectorizer(trained_model):
    """
    Build a TF-IDF vectorizer by re-fitting on the data loaded from the database.

    Nothing is read back from ``trained_model`` or from disk: the vectorizer is
    produced by ``extract_tfidf_features(df, max_features=1000)`` every session.
    ``trained_model`` is requested but not used in the body.
    NOTE(review): presumably it is listed so that a missing model skips this
    fixture too, and ``max_features=1000`` is assumed to match the setting used
    when the model was trained -- confirm both.

    Skips the requesting tests when ``DATA_PATHS["features"]`` does not exist
    or when loading/fitting raises.

    Returns:
        The second element of the pair returned by ``extract_tfidf_features``.
    """
    feature_file = Path(DATA_PATHS["features"])
    if not feature_file.exists():
        pytest.skip(f"Features not found at {feature_file}. Please run feature extraction first.")

    # Imported lazily, only once the features file is known to exist.
    from hopcroft_skill_classification_tool_competition.features import extract_tfidf_features

    try:
        frame = load_data_from_db()
        fitted = extract_tfidf_features(frame, max_features=1000)
        _, vectorizer = fitted
    except Exception as exc:
        # Any failure while loading or fitting becomes a skip rather than an error.
        pytest.skip(f"Could not load vectorizer: {exc}")
    return vectorizer
| |
|
| |
|
@pytest.fixture(scope="session")
def label_names():
    """
    Provide the label columns of the dataset stored in the database.

    Returns:
        The result of ``get_label_columns`` applied to the frame returned by
        ``load_data_from_db``. The requesting tests are skipped if either
        call raises.
    """
    try:
        return get_label_columns(load_data_from_db())
    except Exception as exc:
        # A database or schema problem should skip the tests, not fail them.
        pytest.skip(f"Could not load label names: {exc}")
| |
|
| |
|
def _positive_class_column(proba, classes):
    """Pick the P(label == 1) column out of one label's probability array."""
    proba = np.asarray(proba)
    if classes is None:
        # No class information exposed: assume the conventional [0, 1] column order.
        return proba[:, 1]
    positive = np.flatnonzero(np.asarray(classes) == 1)
    if positive.size == 0:
        # Class 1 is absent from this label's classes, so there is no column for it.
        return np.zeros(proba.shape[0])
    return proba[:, positive[0]]


def _label_probabilities(model, features):
    """Return positive-class probabilities as (n_samples, n_labels), or None if unsupported."""
    predict_proba = getattr(model, "predict_proba", None)
    if predict_proba is None:
        return None

    raw = predict_proba(features)
    if not isinstance(raw, (list, tuple)):
        # The model already returned a single matrix; pass it through unchanged.
        return np.asarray(raw)

    # Multi-output models return one (n_samples, n_classes) array per label.
    classes = getattr(model, "classes_", None)
    if not isinstance(classes, (list, tuple)) or len(classes) != len(raw):
        classes = [None] * len(raw)
    return np.column_stack(
        [_positive_class_column(p, c) for p, c in zip(raw, classes)]
    )


@pytest.fixture
def predict_text(trained_model, tfidf_vectorizer):
    """
    Factory fixture that returns a function to predict skills from raw text.

    Returns:
        Function that takes text and returns predicted label indices
        (or probabilities when ``return_proba=True``).
    """
    def _predict(text: str, return_proba: bool = False):
        """
        Predict skills from raw text.

        Args:
            text: Raw GitHub issue text.
            return_proba: If True, return probabilities instead of label indices.

        Returns:
            If return_proba=False: indices of the labels predicted as 1
            (numpy array) for the single input text.
            If return_proba=True: positive-class probability matrix of shape
            (n_samples, n_labels), taken from the model's own ``predict_proba``.
            When the model has no ``predict_proba``, the raw output of
            ``trained_model.predict`` is returned instead.
        """
        cleaned = clean_github_text(text)
        features = tfidf_vectorizer.transform([cleaned]).toarray()

        if return_proba:
            # Ask the model itself for probabilities. Iterating ``estimators_``
            # and indexing ``[0][:, 1]`` either fails on per-label estimators
            # or yields per-tree values for the first label only.
            probas = _label_probabilities(trained_model, features)
            if probas is not None:
                return probas
            # Best-effort fallback for models that cannot produce probabilities.
            return trained_model.predict(features)

        # Single input text, so keep only the first (and only) row of predictions.
        predictions = trained_model.predict(features)[0]
        return np.where(predictions == 1)[0]

    return _predict
| |
|
| |
|
@pytest.fixture
def predict_with_labels(predict_text, label_names):
    """
    Factory fixture producing a text -> predicted label names function.

    Returns:
        Function that takes text and returns the names of the predicted labels.
    """
    def _predict(text: str):
        """
        Predict skill labels from raw text.

        Args:
            text: Raw GitHub issue text.

        Returns:
            List with one entry of ``label_names`` per index returned by
            ``predict_text``, in the same order.
        """
        names = []
        for position in predict_text(text):
            # Each predicted index selects the label at that position.
            names.append(label_names[position])
        return names

    return _predict
| |
|
| |
|
def pytest_configure(config):
    """
    Register the custom markers used by the behavioral test suite.

    Args:
        config: Object exposing ``addinivalue_line``; each marker description
            below is appended to its ``"markers"`` setting, in order.
    """
    marker_lines = (
        "invariance: Tests for invariance (changes should not affect predictions)",
        "directional: Tests for directional expectations (changes should affect predictions predictably)",
        "mft: Minimum Functionality Tests (basic examples with expected outputs)",
        "training: Tests for model training validation (loss, overfitting, devices)",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)
| |
|