{ "model_name": "difficulty_model", "model_version": "difficulty_model_v2_baseline_001", "dataset_version": "2.0.0", "trained_at": "2026-05-21T05:59:09.943332+00:00", "seed": 42, "split_counts": { "train": 3912, "validation": 1033, "test": 875 }, "metrics": { "validation": { "mae": 0.3475, "r_squared": 0.5003, "per_bucket_mae": { "easy": 0.3058, "medium": 0.2934, "hard": 0.6563 } }, "test": { "mae": 0.3519, "r_squared": 0.4685, "per_bucket_mae": { "easy": 0.325, "medium": 0.2885, "hard": 0.6797 } } }, "limitations": [ "Trained on synthetic data only.", "difficulty_score distribution may not reflect real-world difficulty.", "OrdinalEncoder assumes an ordering that may not be meaningful for subject/question_type.", "Per-bucket MAE depends on the quality of the difficulty string labels." ] }