#!/usr/bin/env python3
"""
Train all three pg_plan_cache models:
1. SQL Cache Advisor (classification: low / medium / high)
2. Cache TTL Recommender (regression: seconds)
3. Query Complexity Estimator (regression: 1-100 score)
Saves trained models as joblib files in the ./trained/ directory.
"""
import os
import json
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib
from features import extract_features, FEATURE_NAMES
from dataset import generate_dataset
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "trained")


def train():
print("=" * 60)
print(" pg_plan_cache β€” Model Training")
print("=" * 60)
# ── Generate data ─────────────────────────────────────────
print("\n[1/5] Generating synthetic training data...")
queries, benefits, ttls, complexities = generate_dataset(n=8000, seed=42)
print(f" Generated {len(queries)} samples")
# ── Extract features ──────────────────────────────────────
print("\n[2/5] Extracting features...")
X = np.array([extract_features(q) for q in queries])
print(f" Feature matrix: {X.shape}")
# ── Encode labels ─────────────────────────────────────────
le = LabelEncoder()
    y_benefit = le.fit_transform(benefits)  # alphabetical order: high=0, low=1, medium=2
    y_ttl = np.array(ttls, dtype=float)
    y_complexity = np.array(complexities, dtype=float)

    # ── Split ─────────────────────────────────────────────────
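    # One split shared across all three targets, so every model is evaluated
    # on the same held-out queries.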
    X_train, X_test, yb_train, yb_test, yt_train, yt_test, yc_train, yc_test = \
        train_test_split(X, y_benefit, y_ttl, y_complexity, test_size=0.2, random_state=42)
    print(f" Train: {len(X_train)}, Test: {len(X_test)}")
    # ── Model 1: Cache Advisor (classification) ───────────────
    print("\n[3/5] Training SQL Cache Advisor...")
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1,
    )
    clf.fit(X_train, yb_train)
    yb_pred = clf.predict(X_test)
    print("\n Classification Report:")
    report = classification_report(yb_test, yb_pred, target_names=le.classes_)
    print(" " + report.replace("\n", "\n "))
    cv_scores = cross_val_score(clf, X, y_benefit, cv=5, scoring="accuracy")
    print(f" Cross-val accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")

    # ── Model 2: TTL Recommender (regression) ─────────────────
    print("\n[4/5] Training Cache TTL Recommender...")
    reg_ttl = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        min_samples_split=5,
        random_state=42,
    )
    reg_ttl.fit(X_train, yt_train)
    yt_pred = reg_ttl.predict(X_test)
    mae_ttl = mean_absolute_error(yt_test, yt_pred)
    r2_ttl = r2_score(yt_test, yt_pred)
    print(f" MAE: {mae_ttl:.1f} seconds")
    print(f" R2: {r2_ttl:.3f}")

    # ── Model 3: Complexity Estimator (regression) ────────────
    print("\n[5/5] Training Query Complexity Estimator...")
    reg_cplx = GradientBoostingRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.1,
        min_samples_split=5,
        random_state=42,
    )
    reg_cplx.fit(X_train, yc_train)
    yc_pred = reg_cplx.predict(X_test)
    mae_cplx = mean_absolute_error(yc_test, yc_pred)
    r2_cplx = r2_score(yc_test, yc_pred)
    print(f" MAE: {mae_cplx:.1f} points")
    print(f" R2: {r2_cplx:.3f}")

    # ── Save models ───────────────────────────────────────────
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    joblib.dump(clf, os.path.join(OUTPUT_DIR, "cache_advisor.joblib"))
    joblib.dump(reg_ttl, os.path.join(OUTPUT_DIR, "ttl_recommender.joblib"))
    joblib.dump(reg_cplx, os.path.join(OUTPUT_DIR, "complexity_estimator.joblib"))
    joblib.dump(le, os.path.join(OUTPUT_DIR, "label_encoder.joblib"))

    # Feature importances
    importances = {
        "cache_advisor": dict(zip(FEATURE_NAMES, clf.feature_importances_.tolist())),
        "ttl_recommender": dict(zip(FEATURE_NAMES, reg_ttl.feature_importances_.tolist())),
        "complexity_estimator": dict(zip(FEATURE_NAMES, reg_cplx.feature_importances_.tolist())),
    }
    with open(os.path.join(OUTPUT_DIR, "feature_importances.json"), "w") as f:
        json.dump(importances, f, indent=2)

    # Model metadata
    metadata = {
        "models": {
            "cache_advisor": {
                "type": "RandomForestClassifier",
                "task": "classification",
                "classes": le.classes_.tolist(),
                "accuracy_cv5": round(float(cv_scores.mean()), 4),
            },
            "ttl_recommender": {
                "type": "GradientBoostingRegressor",
                "task": "regression",
                "unit": "seconds",
                "mae": round(float(mae_ttl), 2),
                "r2": round(float(r2_ttl), 4),
            },
            "complexity_estimator": {
                "type": "GradientBoostingRegressor",
                "task": "regression",
                "unit": "score (1-100)",
                "mae": round(float(mae_cplx), 2),
                "r2": round(float(r2_cplx), 4),
            },
        },
        "features": FEATURE_NAMES,
        "n_features": len(FEATURE_NAMES),
"training_samples": len(queries),
"test_samples": len(X_test),
}
with open(os.path.join(OUTPUT_DIR, "metadata.json"), "w") as f:
json.dump(metadata, f, indent=2)
print(f"\n Models saved to {OUTPUT_DIR}/")
print(" Files: cache_advisor.joblib, ttl_recommender.joblib,")
print(" complexity_estimator.joblib, label_encoder.joblib,")
print(" feature_importances.json, metadata.json")
print("\nDone.")

if __name__ == "__main__":
    train()