#!/usr/bin/env python3
"""
Train all three pg_plan_cache models:
1. SQL Cache Advisor (classification: low / medium / high)
2. Cache TTL Recommender (regression: seconds)
3. Query Complexity Estimator (regression: 1-100 score)
Saves trained models as joblib files in the ./trained/ directory.
"""
import os
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
from features import extract_features, FEATURE_NAMES
from dataset import generate_dataset
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained")


def train():
print("=" * 60)
print(" pg_plan_cache β€” Model Training")
print("=" * 60)
# ── Generate data ─────────────────────────────────────────
print("\n[1/5] Generating synthetic training data...")
queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42)
print(f" Generated {len(queries)} samples")
# ── Extract features ──────────────────────────────────────
print("\n[2/5] Extracting features...")
X = np.array([extract_features(q) for q in queries])
print(f" Feature matrix: {X.shape}")
# ── Encode labels ─────────────────────────────────────────
le = LabelEncoder()
    y_benefit = le.fit_transform(benefits)  # alphabetical order: high=0, low=1, medium=2
    y_ttl = np.array(ttls, dtype=float)
    y_complexity = np.array(complexities, dtype=float)

    # ── Split ─────────────────────────────────────────────────
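    # One split shared across all three targets, so every model is evaluated
    # on the same held-out queries.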
    X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \
        train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42)
    print(f" Train: {len(X_train)}, Test: {len(X_test)}")
    # ── Model 1: Cache Advisor (classification) ───────────────
    print("\n[3/5] Training SQL Cache Advisor...")
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    )
    clf.fit(X_train, yb_train)
    yb_pred = clf.predict(X_test)
    print("\n Classification Report:")
    report = classification_report(yb_test, yb_pred, target_names=le.classes_)
    print(" " + report.replace("\n", "\n "))
    cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy")
    print(f" Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

    # ── Model 2: TTL Recommender (regression) ─────────────────
    print("\n[4/5] Training Cache TTL Recommender...")
    reg_ttl = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        min_samples_split=5,
        random_state=42,
    )
    reg_ttl.fit(X_train, yt_train)
    yt_pred = reg_ttl.predict(X_test)
    mae_ttl = mean_absolute_error(yt_test, yt_pred)
    r2_ttl = r2_score(yt_test, yt_pred)
    print(f" MAE: {mae_ttl:.1f} seconds")
    print(f" R2: {r2_ttl:.3f}")

    # ── Model 3: Complexity Estimator (regression) ────────────
    print("\n[5/5] Training Query Complexity Estimator...")
    reg_cplx = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        min_samples_split=5,
        random_state=42,
    )
    reg_cplx.fit(X_train, yc_train)
    yc_pred = reg_cplx.predict(X_test)
    mae_cplx = mean_absolute_error(yc_test, yc_pred)
    r2_cplx = r2_score(yc_test, yc_pred)
    print(f" MAE: {mae_cplx:.1f} points")
    print(f" R2: {r2_cplx:.3f}")

    # ── Save models ───────────────────────────────────────────
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib"))
    joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib"))
    joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib"))
    joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib"))

    # Feature importances
    importances = {
        "cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())),
        "ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())),
        "complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())),
    }
    with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f:
        json.dump(importances, f, indent=2)

    # Model metadata
    metadata = {
        "models": {
            "cache_advisor": {
                "type": "RandomForestClassifier",
                "task": "classification",
                "classes": le.classes_.tolist(),
                "accuracy_cv5": round(float(cv_scores.mean()), 4),
            },
            "ttl_recommender": {
                "type": "GradientBoostingRegressor",
                "task": "regression",
                "unit": "seconds",
                "mae": round(float(mae_ttl), 2),
                "r2": round(float(r2_ttl), 4),
            },
            "complexity_estimator": {
                "type": "GradientBoostingRegressor",
                "task": "regression",
                "unit": "score (1-100)",
                "mae": round(float(mae_cplx), 2),
                "r2": round(float(r2_cplx), 4),
            },
        },
        "features": FEATURE_NAMES,
        "n_features": len(FEATURE_NAMES),
"training_samples": len(queries),
"test_samples": len(X_test),
}
with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f:
json.dump(metadata, f, indent=2)
print(f"\n Models saved to {OUTPUT_DIR}/")
print(" Files: cache_advisor.joblib, ttl_recommender.joblib,")
print(" complexity_estimator.joblib, label_encoder.joblib,")
print(" feature_importances.json, metadata.json")
print("\nDone.")

if __name__ == "__main__":
    train()