Spaces:

A-R-F
/

Agentic-Reliability-Framework-API

Running

File size: 10,020 Bytes

afa4de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d521fd
afa4de7
2d521fd
afa4de7
 
 
2d521fd
afa4de7
2d521fd
afa4de7
 
 
 
2d521fd
afa4de7
 
2d521fd
afa4de7
2d521fd
 
 
afa4de7
 
 
 
2d521fd
afa4de7
 
 
 
2d521fd
afa4de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d521fd
 
 
afa4de7
 
 
2d521fd
 
 
 
 
afa4de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d521fd
afa4de7
2d521fd
afa4de7
 
2d521fd
 
afa4de7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d521fd
afa4de7
 
 
 
 
 
 
 
 
 
2d521fd
afa4de7
 
 
 
 
 
 
 
 
 
 
 
 
2d521fd
 
 
afa4de7
2d521fd
 
afa4de7
 
 
2d521fd
afa4de7
 
 
2d521fd
 
 
afa4de7
 
 
 
 
 
 
 
2d521fd
 
 
afa4de7
 
 
 
2d521fd
 
 
 
 
 
 
afa4de7
2d521fd
 
 
 
afa4de7
 
 
 
2d521fd
 
afa4de7
2d521fd
afa4de7
2d521fd
 
 
 
 
 
afa4de7
2d521fd
 
 
 
 
afa4de7
 
 
 
2d521fd
 
 
 
afa4de7
 
2d521fd
 
 
 
 
 
afa4de7
2d521fd

"""
Incident evaluation endpoints — backward‑compatible Bayesian reroute.

This module provides two incident‑related routes:

* ``POST /api/v1/report_incident``
    Stores a ``ReliabilityEvent`` in an in‑memory history for auditing
    and debugging.
* ``POST /api/v1/v1/incidents/evaluate``    **(deprecated)**
    Former heuristic endpoint now **rerouted to the full Bayesian risk
    engine**.  All callers should migrate to
    ``POST /api/v1/intents/evaluate``, which returns richer metadata
    including CUDL uncertainty decomposition and decision traces.

The local model duplicates (``ReliabilityEvent``, ``HealingAction``)
have been removed; all types are imported from the canonical ARF core
framework (``agentic_reliability_framework.core.models.event``).
"""

from __future__ import annotations

import logging
import time
from typing import Optional

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request

from agentic_reliability_framework.core.models.event import (
    HealingAction,
    ReliabilityEvent,
)

from app.causal_explainer import CausalExplainer
from app.core.usage_tracker import UsageRecord, enforce_quota, tracker

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router on which the incident routes declared below are registered.
router = APIRouter()

# ---------------------------------------------------------------------------
# In‑memory incident store  (for auditing / debugging only)
# ---------------------------------------------------------------------------
# Each entry is the JSON-mode dump of a ``ReliabilityEvent`` appended by
# ``report_incident``.  The list lives in process memory only.
# NOTE(review): nothing visible in this module trims the list, so it grows
# for the lifetime of the process — confirm whether a size cap is needed.
incident_history: list[dict] = []


# ---------------------------------------------------------------------------
# POST /api/v1/report_incident
# ---------------------------------------------------------------------------
@router.post("/report_incident")
async def report_incident(event: ReliabilityEvent) -> dict[str, str]:
    """
    Append a reliability event to the module's in‑memory incident log.

    The event is serialised with ``model_dump(mode="json")`` and the
    resulting dictionary is pushed onto ``incident_history``.  That log
    is a plain module-level list, so its contents do not survive a
    restart of the API process.

    Parameters
    ----------
    event : ReliabilityEvent
        The reliability event to record.

    Returns
    -------
    dict
        Always the acknowledgement ``{"status": "recorded"}``.
    """
    # Store a JSON-safe snapshot rather than the model instance itself.
    serialised_event = event.model_dump(mode="json")
    incident_history.append(serialised_event)

    acknowledgement = {"status": "recorded"}
    return acknowledgement


# ---------------------------------------------------------------------------
# POST /api/v1/v1/incidents/evaluate  (deprecated)
# ---------------------------------------------------------------------------
@router.post("/v1/incidents/evaluate")
async def evaluate_incident(
    request: Request,
    event: ReliabilityEvent,
    background_tasks: BackgroundTasks,
    quota: dict = Depends(enforce_quota),
) -> dict:
    """
    Evaluate an incident using the **Bayesian risk engine**.

    .. deprecated:: 0.6.0
        Use ``POST /api/v1/intents/evaluate`` instead.  This endpoint
        will be removed in a future release.  Responses include a
        ``deprecation_notice`` field to assist migration.

    The following steps are performed:

    1. Build a minimal ``deploy_config`` intent dictionary for
       ``event.component`` and convert it with
       ``intent_adapter.to_oss_intent``.
    2. Call ``risk_service.evaluate_intent()`` with the engine stored on
       ``request.app.state.risk_engine`` to obtain a risk score.
    3. Pick a healing action: ``RESTART_CONTAINER`` when the risk score
       is above 0.5, otherwise ``NO_ACTION``.
    4. Run the causal explainer for counter‑factual text.
    5. Build a backward‑compatible response envelope.
    6. Record usage through ``tracker`` (when it is truthy) and log a
       deprecation warning.

    Parameters
    ----------
    request : Request
        The Starlette request object; only ``request.app.state`` is read.
    event : ReliabilityEvent
        The incident event.  ``component``, ``latency_p99`` and
        ``error_rate`` are read from it.
    background_tasks : BackgroundTasks
        Handed to ``tracker.increment_usage_async`` together with the
        usage record.
    quota : dict
        Injected by ``enforce_quota``; ``api_key`` and ``tier`` are read
        from it.

    Returns
    -------
    dict
        A dictionary with keys:

        * ``deprecation_notice`` (str) — migration guidance.
        * ``healing_intent`` (dict) — action, component, parameters,
          justification, confidence, risk score, and advisory status.
        * ``causal_explanation`` (dict) — factual/counter‑factual
          outcomes, effect, explanation text, and warnings.
        * ``utility_decision`` (dict) — selected action and expected
          utility.

    Raises
    ------
    HTTPException
        Re-raised unchanged when raised inside the evaluation.  Any
        other exception is logged with its traceback, recorded as a
        failed usage entry, and converted to a 500 whose ``detail`` is
        ``str(exc)``.
    """
    # Elapsed time is measured with a monotonic clock so that system clock
    # adjustments cannot skew (or make negative) the reported processing_ms.
    started = time.perf_counter()
    api_key: str = quota["api_key"]
    tier = quota["tier"]

    try:
        # ------------------------------------------------------------------
        # Step 1 – Convert the event into an infrastructure intent
        # ------------------------------------------------------------------
        # Imported lazily, as in the original implementation.
        # NOTE(review): presumably to avoid an import cycle — confirm.
        from app.services.intent_adapter import to_oss_intent
        from app.services.risk_service import evaluate_intent

        raw_intent = {
            "intent_type": "deploy_config",
            "environment": "prod",
            "service_name": event.component,
            "requester": "auto",
            "change_scope": "global",
            "deployment_target": "prod",
            "configuration": {},
            "provenance": {"source": "incident_evaluate"},
        }
        oss_intent = to_oss_intent(raw_intent)

        # ------------------------------------------------------------------
        # Step 2 – Bayesian risk evaluation
        # ------------------------------------------------------------------
        risk_engine = request.app.state.risk_engine
        result = evaluate_intent(
            engine=risk_engine,
            intent=oss_intent,
            cost_estimate=None,
            policy_violations=[],
        )
        risk_score = result["risk_score"]

        # ------------------------------------------------------------------
        # Step 3 – Heuristic action selection based on risk threshold
        # ------------------------------------------------------------------
        optimal_action = (
            HealingAction.RESTART_CONTAINER
            if risk_score > 0.5
            else HealingAction.NO_ACTION
        )

        # ------------------------------------------------------------------
        # Step 4 – Causal explainer
        # ------------------------------------------------------------------
        causal_explainer = CausalExplainer()
        current_state = {
            "latency": event.latency_p99,
            "error_rate": event.error_rate,
            "last_action": {"action_type": "no_action"},
        }
        proposed_action = {"action_type": optimal_action.value, "params": {}}
        causal_exp = causal_explainer.explain_healing_intent(
            proposed_action, current_state, "latency"
        )

        # ------------------------------------------------------------------
        # Step 5 – Build response envelope
        # ------------------------------------------------------------------
        healing_intent = {
            "action": optimal_action.value,
            "component": event.component,
            "parameters": {},
            "justification": (
                f"Bayesian risk score: {risk_score:.3f}. "
                f"Causal: {causal_exp.explanation_text}"
            ),
            # A missing "uncertainty" key is treated as zero uncertainty.
            "confidence": 1.0 - result.get("uncertainty", 0.0),
            "risk_score": risk_score,
            "status": "oss_advisory_only",
        }

        response_data = {
            "deprecation_notice": (
                "This endpoint is deprecated. Use POST /api/v1/intents/evaluate "
                "for the full Bayesian evaluation with CUDL decomposition."
            ),
            "healing_intent": healing_intent,
            "causal_explanation": {
                "factual_outcome": causal_exp.factual_outcome,
                "counterfactual_outcome": causal_exp.counterfactual_outcome,
                "effect": causal_exp.effect,
                "explanation_text": causal_exp.explanation_text,
                "is_model_based": causal_exp.is_model_based,
                "warnings": causal_exp.warnings,
            },
            "utility_decision": {
                "best_action": optimal_action.value,
                # Fixed placeholder value; it is not derived from the result.
                "expected_utility": 0.5,
                "explanation": (
                    "Decision based on Bayesian risk threshold > 0.5"
                ),
            },
        }

        # ------------------------------------------------------------------
        # Asynchronous usage logging
        # ------------------------------------------------------------------
        await _record_incident_evaluation_usage(
            api_key=api_key,
            tier=tier,
            event=event,
            background_tasks=background_tasks,
            started=started,
            outcome={"response": response_data},
        )

        logger.warning(
            "Deprecated endpoint /v1/incidents/evaluate called by key %s",
            api_key[:8],
        )
        return response_data

    except HTTPException:
        # Already an HTTP-level error; let FastAPI render it as-is.
        raise
    except Exception as exc:
        error_msg = str(exc)
        # Keep the traceback in the server log; the client only gets the
        # message text below.
        logger.exception("Deprecated endpoint /v1/incidents/evaluate failed")
        await _record_incident_evaluation_usage(
            api_key=api_key,
            tier=tier,
            event=event,
            background_tasks=background_tasks,
            started=started,
            outcome={"error": error_msg},
        )
        # NOTE(review): ``detail`` echoes the raw exception message to the
        # caller (kept for backward compatibility) — confirm this is intended.
        raise HTTPException(status_code=500, detail=error_msg) from exc


async def _record_incident_evaluation_usage(
    *,
    api_key: str,
    tier,
    event: ReliabilityEvent,
    background_tasks: BackgroundTasks,
    started: float,
    outcome: dict,
) -> None:
    """
    Submit one usage record for ``/v1/incidents/evaluate`` to ``tracker``.

    Does nothing when ``tracker`` is falsy.  ``outcome`` carries exactly
    the extra ``UsageRecord`` field for this call (``{"response": ...}``
    on success, ``{"error": ...}`` on failure) so that the other field is
    left at whatever default ``UsageRecord`` defines.  ``started`` must be
    a ``time.perf_counter()`` reading taken when the request began.
    """
    if not tracker:
        return

    record = UsageRecord(
        api_key=api_key,
        tier=tier,
        timestamp=time.time(),
        endpoint="/v1/incidents/evaluate",
        request_body=event.model_dump(mode="json"),
        processing_ms=(time.perf_counter() - started) * 1000,
        **outcome,
    )
    await tracker.increment_usage_async(record, background_tasks)