| """ |
| Incident evaluation endpoints — backward‑compatible Bayesian reroute. |
| |
| This module provides two incident‑related routes: |
| |
| * ``POST /api/v1/report_incident`` |
| Stores a ``ReliabilityEvent`` in an in‑memory history for auditing |
| and debugging. |
| * ``POST /api/v1/v1/incidents/evaluate`` **(deprecated)** |
| Former heuristic endpoint now **rerouted to the full Bayesian risk |
| engine**. All callers should migrate to |
| ``POST /api/v1/intents/evaluate``, which returns richer metadata |
| including CUDL uncertainty decomposition and decision traces. |
| |
| The local model duplicates (``ReliabilityEvent``, ``HealingAction``) |
| have been removed; all types are imported from the canonical ARF core |
| framework (``agentic_reliability_framework.core.models.event``). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import logging |
| import time |
| from typing import Optional |
|
|
| from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request |
|
|
| from agentic_reliability_framework.core.models.event import ( |
| HealingAction, |
| ReliabilityEvent, |
| ) |
|
|
| from app.causal_explainer import CausalExplainer |
| from app.core.usage_tracker import UsageRecord, enforce_quota, tracker |
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Router on which the endpoints defined in this module are registered.
router = APIRouter()


# In-memory store of reported incidents. ``report_incident`` appends one
# JSON-mode ``model_dump()`` dictionary per call. The list lives in process
# memory only, and nothing visible in this module trims or clears it.
incident_history: list[dict] = []
|
|
|
|
| |
| |
| |
@router.post("/report_incident")
async def report_incident(event: ReliabilityEvent) -> dict[str, str]:
    """
    Append a ``ReliabilityEvent`` to the in-memory incident history.

    The event is serialised with ``model_dump(mode="json")`` and the
    resulting dictionary is appended to the module-level
    ``incident_history`` list. The list is held in process memory only,
    so its contents do not survive a restart.

    Parameters
    ----------
    event : ReliabilityEvent
        The reliability event to record, parsed from the request body.

    Returns
    -------
    dict
        The fixed acknowledgement ``{"status": "recorded"}``.
    """
    # Serialise first so that only a plain, JSON-safe dict is stored.
    serialised_event = event.model_dump(mode="json")
    incident_history.append(serialised_event)

    acknowledgement = {"status": "recorded"}
    return acknowledgement
|
|
|
|
| |
| |
| |
@router.post("/v1/incidents/evaluate")
async def evaluate_incident(
    request: Request,
    event: ReliabilityEvent,
    background_tasks: BackgroundTasks,
    quota: dict = Depends(enforce_quota),
) -> dict:
    """
    Evaluate an incident using the **Bayesian risk engine**.

    .. deprecated:: 0.6.0
        Use ``POST /api/v1/intents/evaluate`` instead. Responses include
        a ``deprecation_notice`` field to assist migration.

    The following steps are performed:

    1. Build a minimal ``deploy_config`` intent dictionary for
       ``event.component`` and convert it with ``to_oss_intent``.
    2. Call ``evaluate_intent()`` with the engine stored on
       ``request.app.state.risk_engine`` to obtain a risk score.
    3. Pick a healing action: ``RESTART_CONTAINER`` when the risk score
       is above 0.5, otherwise ``NO_ACTION``.
    4. Run ``CausalExplainer.explain_healing_intent`` for the proposed
       action against the event's latency and error rate.
    5. Build the backward-compatible response envelope and record usage.

    Parameters
    ----------
    request : Request
        The request object; used to reach ``app.state.risk_engine``.
    event : ReliabilityEvent
        The incident event. This handler reads ``component``,
        ``latency_p99`` and ``error_rate``.
    background_tasks : BackgroundTasks
        Passed through to ``tracker.increment_usage_async``.
    quota : dict
        Injected by ``enforce_quota``. This handler reads the
        ``api_key`` and ``tier`` entries.

    Returns
    -------
    dict
        A dictionary with keys:

        * ``deprecation_notice`` (str) — migration guidance.
        * ``healing_intent`` (dict) — action, component, parameters,
          justification, confidence, risk score and advisory status.
        * ``causal_explanation`` (dict) — factual/counter-factual
          outcomes, effect, explanation text, model flag and warnings.
        * ``utility_decision`` (dict) — selected action, expected
          utility and a short explanation.

    Raises
    ------
    HTTPException
        Re-raised unchanged when raised inside the pipeline. Any other
        exception is logged, recorded as a failed usage entry and
        converted to a 500 whose ``detail`` is ``str(exc)``.
    """
    # Durations are measured on a monotonic clock: wall-clock time can be
    # adjusted while the request is in flight and skew the elapsed value.
    started = time.perf_counter()
    api_key: str = quota["api_key"]
    tier = quota["tier"]

    try:
        # Imported at call time, inside the ``try``, so that an import
        # failure is recorded and reported through the same error path.
        from app.services.intent_adapter import to_oss_intent
        from app.services.risk_service import evaluate_intent

        raw_intent = {
            "intent_type": "deploy_config",
            "environment": "prod",
            "service_name": event.component,
            "requester": "auto",
            "change_scope": "global",
            "deployment_target": "prod",
            "configuration": {},
            "provenance": {"source": "incident_evaluate"},
        }
        oss_intent = to_oss_intent(raw_intent)

        result = evaluate_intent(
            engine=request.app.state.risk_engine,
            intent=oss_intent,
            cost_estimate=None,
            policy_violations=[],
        )
        risk_score = result["risk_score"]

        # Heuristic mapping from risk score to action; the 0.5 cut-off is
        # also quoted verbatim in ``utility_decision.explanation`` below.
        optimal_action = (
            HealingAction.RESTART_CONTAINER
            if risk_score > 0.5
            else HealingAction.NO_ACTION
        )

        current_state = {
            "latency": event.latency_p99,
            "error_rate": event.error_rate,
            "last_action": {"action_type": "no_action"},
        }
        proposed_action = {"action_type": optimal_action.value, "params": {}}
        causal_exp = CausalExplainer().explain_healing_intent(
            proposed_action, current_state, "latency"
        )

        healing_intent = {
            "action": optimal_action.value,
            "component": event.component,
            "parameters": {},
            "justification": (
                f"Bayesian risk score: {risk_score:.3f}. "
                f"Causal: {causal_exp.explanation_text}"
            ),
            # A missing ``uncertainty`` entry is treated as zero uncertainty.
            "confidence": 1.0 - result.get("uncertainty", 0.0),
            "risk_score": risk_score,
            "status": "oss_advisory_only",
        }

        response_data = {
            "deprecation_notice": (
                "This endpoint is deprecated. Use POST /api/v1/intents/evaluate "
                "for the full Bayesian evaluation with CUDL decomposition."
            ),
            "healing_intent": healing_intent,
            "causal_explanation": {
                "factual_outcome": causal_exp.factual_outcome,
                "counterfactual_outcome": causal_exp.counterfactual_outcome,
                "effect": causal_exp.effect,
                "explanation_text": causal_exp.explanation_text,
                "is_model_based": causal_exp.is_model_based,
                "warnings": causal_exp.warnings,
            },
            "utility_decision": {
                "best_action": optimal_action.value,
                "expected_utility": 0.5,
                "explanation": (
                    "Decision based on Bayesian risk threshold > 0.5"
                ),
            },
        }

        await _track_incident_usage(
            api_key=api_key,
            tier=tier,
            event=event,
            background_tasks=background_tasks,
            started=started,
            response=response_data,
        )

        # Only the first eight characters of the key are written to the log.
        logger.warning(
            "Deprecated endpoint /v1/incidents/evaluate called by key %s",
            api_key[:8],
        )
        return response_data

    except HTTPException:
        # Deliberate HTTP errors pass through untouched.
        raise
    except Exception as exc:
        error_msg = str(exc)
        # Keep the traceback in the server log; the client only sees the
        # message below.
        logger.exception("Incident evaluation failed")
        await _track_incident_usage(
            api_key=api_key,
            tier=tier,
            event=event,
            background_tasks=background_tasks,
            started=started,
            error=error_msg,
        )
        # NOTE(review): ``detail`` exposes the raw exception text to the
        # caller. Kept for backward compatibility — confirm this is intended.
        raise HTTPException(status_code=500, detail=error_msg) from exc


async def _track_incident_usage(
    *,
    api_key: str,
    tier,
    event: ReliabilityEvent,
    background_tasks: BackgroundTasks,
    started: float,
    response: Optional[dict] = None,
    error: Optional[str] = None,
) -> None:
    """
    Record one usage entry for ``/v1/incidents/evaluate``.

    Does nothing when the module-level ``tracker`` is falsy.

    Parameters
    ----------
    api_key : str
        API key the request was made with.
    tier
        Tier value taken from the quota dictionary; passed through as-is.
    event : ReliabilityEvent
        The request payload; stored as its JSON-mode ``model_dump()``.
    background_tasks : BackgroundTasks
        Passed to ``tracker.increment_usage_async``.
    started : float
        ``time.perf_counter()`` reading taken when the request began.
    response : dict, optional
        Response envelope for a successful call.
    error : str, optional
        Error message for a failed call. When given, the record carries
        ``error`` and no ``response``.
    """
    if not tracker:
        return

    # Pass exactly one of ``response`` / ``error`` so the record's other
    # field keeps whatever default ``UsageRecord`` defines for it.
    outcome = {"error": error} if error is not None else {"response": response}
    record = UsageRecord(
        api_key=api_key,
        tier=tier,
        timestamp=time.time(),  # wall-clock time of the record itself
        endpoint="/v1/incidents/evaluate",
        request_body=event.model_dump(mode="json"),
        processing_ms=(time.perf_counter() - started) * 1000,
        **outcome,
    )
    await tracker.increment_usage_async(record, background_tasks)
|
|