"""
Train all three pg_plan_cache models:
1. SQL Cache Advisor (classification: low / medium / high)
2. Cache TTL Recommender (regression: seconds)
3. Query Complexity Estimator (regression: 1-100 score)

Saves trained models as joblib files in the ./trained/ directory.
"""
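
# Usage sketch (illustrative; paths relative to this directory): run
#   python train.py
# and load the resulting artifacts with joblib, e.g. for some SQL string `sql`:
#   advisor = joblib.load("trained/cache_advisor.joblib")
#   encoder = joblib.load("trained/label_encoder.joblib")
#   label = encoder.inverse_transform(advisor.predict([extract_features(sql)]))[0]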
|
|
import os
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
|
|
from features import extract_features, FEATURE_NAMES
from dataset import generate_dataset
|
|
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained")
|
|
|
|
def train():
    print("=" * 60)
    print(" pg_plan_cache - Model Training")
    print("=" * 60)

    print("\n[1/5] Generating synthetic training data...")
    queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42)
    print(f" Generated {len(queries)} samples")

    print("\n[2/5] Extracting features...")
    X = np.array([extract_features(q) for q in queries])
    print(f" Feature matrix: {X.shape}")

    le = LabelEncoder()
    y_benefit = le.fit_transform(benefits)
    y_ttl = np.array(ttls, dtype=float)
    y_complexity = np.array(complexities, dtype=float)
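    # Note: LabelEncoder assigns integer codes in sorted label order, so with the
    # low/medium/high labels described above the codes are not ordinal
    # (alphabetically: high, low, medium); consumers should map predictions back
    # with le.inverse_transform() using the encoder saved alongside the models.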
|
|
    X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \
        train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42)

    print(f" Train: {len(X_train)}, Test: {len(X_test)}")

    print("\n[3/5] Training SQL Cache Advisor...")
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    )
    clf.fit(X_train, yb_train)

    yb_pred = clf.predict(X_test)
    print("\n Classification Report:")
    report = classification_report(yb_test, yb_pred, target_names=le.classes_)
    print(" " + report.replace("\n", "\n "))

    cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy")
    print(f" Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
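    # For callers that want a confidence rather than a hard class, predict_proba()
    # can be used directly; with the encoded targets above its columns line up
    # with le.classes_. Illustrative only, not executed here:
    #   proba = clf.predict_proba(X_test[:1])[0]
    #   confidence_by_class = dict(zip(le.classes_, proba))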
|
|
    print("\n[4/5] Training Cache TTL Recommender...")
    reg_ttl = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        min_samples_split=5,
        random_state=42,
    )
    reg_ttl.fit(X_train, yt_train)

    yt_pred = reg_ttl.predict(X_test)
    mae_ttl = mean_absolute_error(yt_test, yt_pred)
    r2_ttl = r2_score(yt_test, yt_pred)
    print(f" MAE: {mae_ttl:.1f} seconds")
    print(f" R2: {r2_ttl:.3f}")

    print("\n[5/5] Training Query Complexity Estimator...")
    reg_cplx = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        min_samples_split=5,
        random_state=42,
    )
    reg_cplx.fit(X_train, yc_train)

    yc_pred = reg_cplx.predict(X_test)
    mae_cplx = mean_absolute_error(yc_test, yc_pred)
    r2_cplx = r2_score(yc_test, yc_pred)
    print(f" MAE: {mae_cplx:.1f} points")
    print(f" R2: {r2_cplx:.3f}")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib"))
    joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib"))
    joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib"))
    joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib"))

    importances = {
        "cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())),
        "ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())),
        "complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())),
    }
    with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f:
        json.dump(importances, f, indent=2)
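    # These are impurity-based importances (normalized to sum to 1 per model), so
    # treat them as a rough ranking over FEATURE_NAMES rather than exact
    # attributions; correlated features can split credit between them.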
|
|
    metadata = {
        "models": {
            "cache_advisor": {
                "type": "RandomForestClassifier",
                "task": "classification",
                "classes": le.classes_.tolist(),
                "accuracy_cv5": round(float(cv_scores.mean()), 4),
            },
            "ttl_recommender": {
                "type": "GradientBoostingRegressor",
                "task": "regression",
                "unit": "seconds",
                "mae": round(float(mae_ttl), 2),
                "r2": round(float(r2_ttl), 4),
            },
            "complexity_estimator": {
                "type": "GradientBoostingRegressor",
                "task": "regression",
                "unit": "score (1-100)",
                "mae": round(float(mae_cplx), 2),
                "r2": round(float(r2_cplx), 4),
            },
        },
        "features": FEATURE_NAMES,
        "n_features": len(FEATURE_NAMES),
        "training_samples": len(queries),
        "test_samples": len(X_test),
    }
    with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f:
        json.dump(metadata, f, indent=2)
|
|
    print(f"\n Models saved to {OUTPUT_DIR}/")
    print(" Files: cache_advisor.joblib, ttl_recommender.joblib,")
    print(" complexity_estimator.joblib, label_encoder.joblib,")
    print(" feature_importances.json, metadata.json")
    print("\nDone.")


if __name__ == "__main__":
    train()
|
|