diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..95413234fb78bb3aeadc0013062a7daaa2e64c6e --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.git +__pycache__ +*.pyc +.env +venv +.pytest_cache +.coverage +htmlcov diff --git a/.gitignore b/.gitignore index bf6cc92ff98849403dd8394105626bb700d862d9..97faff00f74afed66f055fa1de00be0db708445e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,50 +1,33 @@ # Python __pycache__/ -*.py[cod] -*$py.class -*.so +*.pyc +*.pyo +*.pyd .Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -*.egg-info/ -.installed.cfg -*.egg +*.so -# Virtual Environment +# Virtual environments venv/ env/ ENV/ -.env/ .venv/ +# Build artifacts +dist/ +build/ +*.egg-info/ + # IDE .vscode/ .idea/ *.swp *.swo -*~ # OS .DS_Store -.DS_Store? -._* -.Spotlight-V100 -.Trashes -ehthumbs.db -Thumbs.db - -# Hugging Face Spaces -data/ -models/ -logs/ -*.log \ No newline at end of file +.env +test.db +venv +.coverage +monitor.log +monitor_loop.log diff --git a/Dockerfile b/Dockerfile index e84a221122337e2ab19e6c73a1861fa9beb8c2a5..894e59a355bf4a5cb4898457e12ab1e405dbd120 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,7 @@ FROM python:3.12-slim +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* WORKDIR /app COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt COPY . . 
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"] \ No newline at end of file +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"] diff --git a/README.md b/README.md index a8f9ad73d1202bfb0d1659a3ef96f59e345ec3b5..f2ad2903b95e25ac31cf88cf1665a7620b84193d 100644 --- a/README.md +++ b/README.md @@ -1,103 +1,121 @@ ---- -title: Agentic Reliability Framework (ARF) v4 – Public API Demo -emoji: 🤖 -colorFrom: blue -colorTo: green -sdk: docker -python_version: '3.10' -app_file: app.py -pinned: false ---- +# arf-api -# Agentic Reliability Framework (ARF) – Public API Demo (Sandbox) +ARF API Control Plane (FastAPI) -**Problem:** Most AI‑driven governance systems fail silently in production, leading to outages, security breaches, and compliance violations. +## Live Demo -**Solution:** ARF turns probabilistic AI into deterministic, auditable action using Bayesian inference, semantic memory, and **expected loss minimisation**. +The API is deployed and accessible at: +- **Base URL**: [https://a-r-f-agentic-reliability-framework-api.hf.space](https://a-r-f-agentic-reliability-framework-api.hf.space) +- **Interactive Documentation**: [https://a-r-f-agentic-reliability-framework-api.hf.space/docs](https://a-r-f-agentic-reliability-framework-api.hf.space/docs) -**Outcome:** Reduce MTTR by up to 85% with self‑healing systems, backed by fully explainable risk scores. +## Quick Start (Local Development) -> ℹ️ **This Space provides a sanitised, mock API endpoint.** The real ARF core engine is proprietary, access‑controlled, and available only to qualified pilots and enterprise customers. See the [public specification](https://arf-foundation.github.io/arf-spec/) for details. +1. **Install dependencies**: +```bash +pip install -r requirements.txt +``` ---- +Note: `requirements.txt` installs `agentic-reliability-framework` directly from the project's Git repository. -## 🚀 Start Here +2. 
**Set environment variables** (optional, in `.env`): -| | | -|--|--| -| **📚 API Docs** | [https://a-r-f-arf-sandbox-api.hf.space/docs](https://a-r-f-arf-sandbox-api.hf.space/docs) | -| **🧪 Live Demo** | [Gradio Dashboard](https://a-r-f-arf-sandbox-api.hf.space/) | -| **📦 Public Spec** | [github.com/arf-foundation/arf-spec](https://github.com/arf-foundation/arf-spec) | -| **📅 Book a Call** | [Calendly](https://calendly.com/petter2025us/30min) | +```text +ARF_HMC_MODEL – path to HMC model JSON (default: models/hmc_model.json) ---- +ARF_USE_HYPERPRIORS – true/false -## 🔍 Quick Example +API_KEY – optional (currently not enforced) +``` -```python -import requests +3. **Run the app locally**: -response = requests.post( - "https://a-r-f-arf-sandbox-api.hf.space/v1/evaluate", - json={ - "service_name": "payment-gateway", - "event_type": "latency_spike", - "severity": "high", - "metrics": {"latency_p99": 450, "error_rate": 0.12} - } -) -print(response.json()) +```bash +uvicorn app.main:app --reload --port 8000 ``` -The response includes a mock HealingIntent with: +4. **Health check**: + +```bash +GET http://localhost:8000/health +``` -* risk\_score: simulated failure probability - -* risk\_factors: additive contributions from conjugate prior, hyperprior, and HMC - -* recommended\_action: approve, deny, or escalate - -* decision\_trace: expected losses and variance - +## Causal Explainer Endpoint -⚠️ **All responses from this endpoint are simulated.** The real Bayesian engine is not exposed publicly. +The ARF API includes a heuristic causal explainer that evaluates the impact of proposed healing actions using deterministic rules. This module provides counterfactual reasoning without requiring a fitted causal model or external ML dependencies. -🧠 Key Capabilities (Conceptual Overview) ------------------------------------------ +The explainer estimates how system metrics such as latency would change if a different action were taken. 
-* **Bayesian Risk Scoring** – Conjugate priors + HMC for calibrated uncertainty. - -* **Semantic Memory** – FAISS‑based retrieval of similar past incidents. - -* **Expected Loss Minimisation** – Chooses approve/deny/escalate by minimising cost-weighted risk, not static thresholds. - -* **Multi‑Agent Orchestration** – Anomaly detection, root cause, forecasting. - +### Mathematical Model -📊 Architecture ---------------- +The counterfactual outcome is computed as: ```text -User Request → Policy Evaluation → Cost Estimation → Risk Scoring - ↓ - HealingIntent ← Decision (Expected Loss) +counterfactual_outcome = factual_outcome * (1 + effect_frac) ``` -All decisions are immutable, signed, and fully traceable via ancestor\_chain and infrastructure\_intent fields. +Where: + +- `effect_frac` is a predefined impact factor based on the action type +- effects are multiplicative +- a fixed ±10% uncertainty interval is applied to the estimated outcome -🔧 Local Development --------------------- +### Example Request ```bash -docker build -t arf-api . -docker run -p 7860:7860 arf-api +curl -X POST "http://localhost:8000/api/v1/v1/incidents/evaluate" -H "Content-Type: application/json" -d '{ + "component": "checkout-service", + "latency_p99": 600, + "error_rate": 0.2, + "service_mesh": "default" + }' +``` + +### Example Response + +```json +{ + "healing_intent": { + "action": "restart_container", + "component": "checkout-service", + "parameters": {}, + "justification": "Causal: If we apply restart_container instead of no_action, latency would change from 600.00 to 510.00 (Δ = -90.00). Based on heuristic causal model.", + "confidence": 0.85, + "risk_score": 0.54, + "status": "oss_advisory_only" + }, + "causal_explanation": { + "factual_outcome": 600, + "counterfactual_outcome": 510, + "effect": -90, + "explanation_text": "If we apply restart_container instead of no_action, latency would change from 600.00 to 510.00 (Δ = -90.00). 
Based on heuristic causal model.", + "is_model_based": false, + "warnings": [ + "Using heuristic causal model (no fitted SCM)." + ] + }, + "utility_decision": { + "best_action": "restart_container", + "expected_utility": 0.5, + "explanation": "Heuristic decision based on latency/error thresholds" + } +} ``` -Then open [http://localhost:7860](http://localhost:7860/) for the Gradio UI and [http://localhost:7860/api/docs](http://localhost:7860/api/docs) for the API. +### Important Notes + +- This endpoint is advisory only (`status = oss_advisory_only`) +- No Structural Causal Model (SCM) is fitted +- No machine learning models are used +- All effects are based on predefined heuristics + +Tests +----- + +Run `pytest`. Tests use a temporary SQLite DB (`sqlite:///./test.db`) created by the test fixtures. -📚 About ARF ------------- +Notes +----- -The **Agentic Reliability Framework** is a governed, mathematically grounded advisory layer for AI infrastructure. The public specification, demo UI, and sandbox API are open‑source (Apache 2.0). **The core Bayesian engine is proprietary and access‑controlled** — available for pilot evaluation and enterprise licensing under outcome‑based pricing. +- The governance endpoints use an in-process `RiskEngine` initialized at startup. +- The outcome recording endpoint is not implemented in this repository and returns HTTP 501. -Learn more at [github.com/arf-foundation](https://github.com/arf-foundation) and request access via petter2025us@outlook.com. 
\ No newline at end of file diff --git a/alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py b/alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py new file mode 100644 index 0000000000000000000000000000000000000000..a50c5d714dbcaff7ecbb8fd1b5a7d536dc29235b --- /dev/null +++ b/alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py @@ -0,0 +1,47 @@ +"""add beta_state table for conjugate posterior persistence + +Revision ID: d36deffe7fa2 +Revises: b2218948f541 +Create Date: 2026-05-02 20:36:04.870145 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'd36deffe7fa2' +down_revision: Union[str, Sequence[str], None] = 'b2218948f541' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.create_table('beta_state', + sa.Column('id', sa.Integer(), nullable=False), + sa.Column('category', sa.String(length=32), nullable=False), + sa.Column('alpha', sa.Float(), nullable=False), + sa.Column('beta', sa.Float(), nullable=False), + sa.Column('updated_at', sa.DateTime(), nullable=True), + sa.PrimaryKeyConstraint('id') + ) + op.create_index(op.f('ix_beta_state_category'), 'beta_state', ['category'], unique=True) + op.create_index(op.f('ix_beta_state_id'), 'beta_state', ['id'], unique=False) + op.add_column('intent_outcomes', sa.Column('idempotency_key', sa.String(length=128), nullable=True)) + op.create_unique_constraint(None, 'intent_outcomes', ['idempotency_key']) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_constraint(None, 'intent_outcomes', type_='unique') + op.drop_column('intent_outcomes', 'idempotency_key') + op.drop_index(op.f('ix_beta_state_id'), table_name='beta_state') + op.drop_index(op.f('ix_beta_state_category'), table_name='beta_state') + op.drop_table('beta_state') + # ### end Alembic commands ### diff --git a/app/api/deps.py b/app/api/deps.py index 4200c325d4fcb3d0ee0f2c66e7ae7c66c9c9704c..874861069d84fcf2abc7555b1d7c7877e86c325a 100644 --- a/app/api/deps.py +++ b/app/api/deps.py @@ -4,66 +4,16 @@ from slowapi import Limiter from slowapi.util import get_remote_address from app.core.config import settings -# --------------------------------------------------------------------------- -# Local dummy implementations that replace the private engine classes. -# They provide the same interface as the originals but perform no real work. -# --------------------------------------------------------------------------- -class RiskEngine: - def __init__(self, *args, **kwargs): - pass - def calculate_risk(self, *args, **kwargs): - return (0.38, "mock", {"conjugate_mean": 0.38}) - def update_outcome(self, *args, **kwargs): - pass - -class DecisionEngine: - def __init__(self, *args, **kwargs): - pass - def select_optimal_action(self, *args, **kwargs): - class Result: - best_action = type('Action', (), {'value': 'NO_ACTION'})() - expected_utility = 0.0 - alternatives = [] - explanation = "mock" - raw_data = {} - return Result() - def compute_risk(self, *args, **kwargs): - return 0.0 - -class LyapunovStabilityController: - def __init__(self, *args, **kwargs): - pass - -class CausalExplainer: - def __init__(self, *args, **kwargs): - pass - -class RAGGraphMemory: - def __init__(self, *args, **kwargs): - pass - def has_historical_data(self): - return False - def record_outcome(self, *args, **kwargs): - pass - -class ReliabilityEvent: - def __init__(self, component, latency_p99, error_rate, service_mesh="default"): - self.component = component - self.latency_p99 
= latency_p99 - self.error_rate = error_rate - self.service_mesh = service_mesh - -class HealingAction: - NO_ACTION = "NO_ACTION" - RESTART_CONTAINER = "RESTART_CONTAINER" - SCALE_OUT = "SCALE_OUT" - ROLLBACK = "ROLLBACK" - CIRCUIT_BREAKER = "CIRCUIT_BREAKER" - TRAFFIC_SHIFT = "TRAFFIC_SHIFT" - ALERT_TEAM = "ALERT_TEAM" -# --------------------------------------------------------------------------- +# ARF core engine imports +from agentic_reliability_framework.core.governance.risk_engine import RiskEngine +from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine +from agentic_reliability_framework.core.governance.stability_controller import LyapunovStabilityController +from agentic_reliability_framework.core.governance.causal_explainer import CausalExplainer +from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory +from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction +# Dependency to get DB session def get_db(): db = SessionLocal() try: @@ -72,10 +22,14 @@ def get_db(): db.close() -limiter = Limiter(key_func=get_remote_address, default_limits=[settings.RATE_LIMIT]) +# Rate limiter with default limit from settings +limiter = Limiter( + key_func=get_remote_address, + default_limits=[ + settings.RATE_LIMIT]) -# Singletons (now using local dummies) +# ARF engine dependencies (singletons for simplicity) _risk_engine = None _decision_engine = None _stability_controller = None @@ -84,8 +38,36 @@ _rag_graph = None def _seed_rag_graph(rag): - # Mock seed – no real data - print("RAG seed skipped (sandbox mode)", file=sys.stderr) + """Seed the RAG graph with historical healing action outcomes.""" + seed_data = [ + ("seed_restart_1", "test", HealingAction.RESTART_CONTAINER.value, True, 2), + ("seed_restart_2", "test", HealingAction.RESTART_CONTAINER.value, True, 3), + ("seed_restart_3", "test", HealingAction.RESTART_CONTAINER.value, False, 10), + ("seed_rollback_1", "test", 
HealingAction.ROLLBACK.value, True, 1), + ("seed_rollback_2", "test", HealingAction.ROLLBACK.value, True, 2), + ("seed_rollback_3", "test", HealingAction.ROLLBACK.value, False, 5), + ("seed_scale_1", "test", HealingAction.SCALE_OUT.value, True, 5), + ("seed_scale_2", "test", HealingAction.SCALE_OUT.value, False, 15), + ("seed_cb_1", "test", HealingAction.CIRCUIT_BREAKER.value, True, 1), + ("seed_cb_2", "test", HealingAction.CIRCUIT_BREAKER.value, True, 2), + ("seed_ts_1", "test", HealingAction.TRAFFIC_SHIFT.value, True, 4), + ("seed_ts_2", "test", HealingAction.TRAFFIC_SHIFT.value, False, 8), + ] + for inc_id, comp, action, success, res_time in seed_data: + event = ReliabilityEvent( + component=comp, + latency_p99=500, + error_rate=0.1, + service_mesh="default" + ) + rag.record_outcome( + incident_id=inc_id, + event=event, + action_taken=action, + success=success, + resolution_time_minutes=res_time + ) + print("Seeded RAG graph with historical data", file=sys.stderr) def get_rag_graph(): @@ -122,4 +104,4 @@ def get_causal_explainer(): global _causal_explainer if _causal_explainer is None: _causal_explainer = CausalExplainer() - return _causal_explainer \ No newline at end of file + return _causal_explainer diff --git a/app/api/routes_admin.py b/app/api/routes_admin.py index 4d442e246e14c782f8f3626b99348cee54773c8f..cf38fedd25cf3ec152696b72e65fc67a427093ba 100644 --- a/app/api/routes_admin.py +++ b/app/api/routes_admin.py @@ -4,25 +4,26 @@ These endpoints should be protected (e.g., by an admin API key) in production. 
""" from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body from pydantic import BaseModel -from typing import Optional, List, Dict, Any +from typing import Optional from datetime import datetime import uuid - from app.core.usage_tracker import tracker, Tier router = APIRouter(prefix="/admin", tags=["admin"]) - # Simple in‑memory admin key (replace with proper auth in production) ADMIN_API_KEY = "admin_secret_change_me" + def verify_admin(admin_key: str = Query(..., alias="admin_key")): if admin_key != ADMIN_API_KEY: raise HTTPException(status_code=403, detail="Invalid admin key") return True + class CreateKeyRequest(BaseModel): tier: str + class UpdateTierRequest(BaseModel): tier: str @@ -30,20 +31,20 @@ class UpdateTierRequest(BaseModel): @router.post("/keys", dependencies=[Depends(verify_admin)]) async def create_api_key(req: CreateKeyRequest): if req.tier not in [t.value for t in Tier]: - raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}") + raise HTTPException( + status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}") new_key = f"sk_live_{uuid.uuid4().hex[:24]}" tier_enum = Tier(req.tier) tracker.get_or_create_api_key(new_key, tier_enum) return {"api_key": new_key, "tier": req.tier} -@router.get("/keys", dependencies=[Depends(verify_admin)]) async def list_api_keys(limit: int = 100, offset: int = 0): with tracker._get_conn() as conn: rows = conn.execute( - "SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?", + "SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? 
OFFSET ?", # noqa: E501 (limit, offset) - ).fetchall() + ).fetchall() # noqa: E501 keys = [] for row in rows: month = tracker._get_month_key() @@ -52,14 +53,18 @@ async def list_api_keys(limit: int = 100, offset: int = 0): (row["key"], month) ).fetchone() usage = usage_row["count"] if usage_row else 0 - keys.append({ - "key": row["key"], - "tier": row["tier"], - "created_at": datetime.fromtimestamp(row["created_at"]).isoformat(), - "last_used_at": datetime.fromtimestamp(row["last_used_at"]).isoformat() if row["last_used_at"] else None, - "is_active": bool(row["is_active"]), - "current_month_usage": usage, - }) + keys.append( + { + "key": row["key"], + "tier": row["tier"], + "created_at": datetime.fromtimestamp( + row["created_at"]).isoformat(), + "last_used_at": datetime.fromtimestamp( + row["last_used_at"]).isoformat() if row["last_used_at"] else None, + "is_active": bool( + row["is_active"]), + "current_month_usage": usage, + }) return {"keys": keys, "total": len(keys)} @@ -69,28 +74,33 @@ async def update_key_tier( req: UpdateTierRequest = Body(...), ): if req.tier not in [t.value for t in Tier]: - raise HTTPException(status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}") + raise HTTPException( + status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}") with tracker._get_conn() as conn: - row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone() + row = conn.execute( + "SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone() if not row: raise HTTPException(status_code=404, detail="API key not found") - conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (req.tier, api_key)) + conn.execute("UPDATE api_keys SET tier = ? 
WHERE key = ?", + (req.tier, api_key)) conn.commit() return {"message": f"Tier updated to {req.tier}"} @router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)]) -async def deactivate_api_key(api_key: str = Path(..., description="The API key to deactivate")): +async def deactivate_api_key( + api_key: str = Path(..., description="The API key to deactivate")): with tracker._get_conn() as conn: - row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone() + row = conn.execute( + "SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone() if not row: raise HTTPException(status_code=404, detail="API key not found") - conn.execute("UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,)) + conn.execute( + "UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,)) conn.commit() return {"message": "API key deactivated"} -@router.get("/audit/{api_key}", dependencies=[Depends(verify_admin)]) async def get_audit_logs( api_key: str = Path(..., description="The API key to audit"), start_date: Optional[str] = Query(None), @@ -103,11 +113,12 @@ async def get_audit_logs( return {"api_key": api_key, "logs": logs} -@router.get("/stats", dependencies=[Depends(verify_admin)]) async def get_global_stats(): with tracker._get_conn() as conn: - total_keys = conn.execute("SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0] - total_requests = conn.execute("SELECT COUNT(*) FROM usage_log").fetchone()[0] + total_keys = conn.execute( + "SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0] + total_requests = conn.execute( + "SELECT COUNT(*) FROM usage_log").fetchone()[0] by_tier = conn.execute( "SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier" ).fetchall() diff --git a/app/api/routes_governance.py b/app/api/routes_governance.py index 6987481dfc1cc4d0f724b3036ac0aa41c732d308..15324e4cc0de5f5604f8fc1a5dc55f0d79d967ed 100644 --- a/app/api/routes_governance.py +++ b/app/api/routes_governance.py @@ -1,4 +1,4 @@ 
-from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks +from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks, Header from fastapi.encoders import jsonable_encoder from sqlalchemy.orm import Session from app.models.infrastructure_intents import InfrastructureIntentRequest @@ -8,26 +8,34 @@ from app.services.intent_store import save_evaluated_intent from app.services.outcome_service import record_outcome from app.api.deps import get_db from pydantic import BaseModel -from typing import Optional import uuid import logging import time +from typing import Optional + +from agentic_reliability_framework.core.models.event import ReliabilityEvent -# Optional import from protected core engine – not available in public Spaces +# ===== USAGE TRACKER IMPORTS ===== +import app.core.usage_tracker +from app.core.usage_tracker import UsageRecord + +# ===== PRICING CALCULATOR INTEGRATION ===== try: - from agentic_reliability_framework.core.models.event import ReliabilityEvent + from arf_pricing_calculator.storage.buffer import add_event + PRICING_AVAILABLE = True except ImportError: - # Local fallback for public sandbox deployments - class ReliabilityEvent(BaseModel): - component: str - latency_p99: float - error_rate: float - service_mesh: str = "default" - cpu_util: Optional[float] = None - memory_util: Optional[float] = None + PRICING_AVAILABLE = False + add_event = None -# ===== USAGE TRACKER IMPORTS ===== -from app.core.usage_tracker import enforce_quota, UsageRecord, tracker +# ===== OpenTelemetry (optional) ===== +try: + from opentelemetry import trace + from opentelemetry.trace import Status, StatusCode + _tracer = trace.get_tracer(__name__) + OTEL_AVAILABLE = True +except ImportError: + OTEL_AVAILABLE = False + _tracer = None logger = logging.getLogger(__name__) router = APIRouter() @@ -50,13 +58,52 @@ async def evaluate_intent_endpoint( intent_req: InfrastructureIntentRequest, background_tasks: BackgroundTasks, db: 
Session = Depends(get_db), - quota: dict = Depends(enforce_quota) + idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"), ): + """ + Evaluate an infrastructure intent with idempotency and atomic quota consumption. + """ + # ── optional trace ────────────────────────────────────── + span = None + if OTEL_AVAILABLE and _tracer: + span = _tracer.start_span("governance.evaluate_intent") + span.set_attribute("intent_type", intent_req.intent_type) + span.set_attribute("environment", str(intent_req.environment)) + start_time = time.time() - api_key = quota["api_key"] - tier = quota["tier"] - response_data = None - error_msg = None + api_key = request.headers.get("Authorization", "").replace("Bearer ", "") + if not api_key: + api_key = request.query_params.get("api_key", "unknown") + + current_tracker = app.core.usage_tracker.tracker + if current_tracker is None: + if span: + span.set_status(Status(StatusCode.ERROR, "tracker unavailable")) + span.end() + raise HTTPException(status_code=503, + detail="Usage tracking service unavailable") + + record = UsageRecord( + api_key=api_key, + tier=None, + timestamp=start_time, + endpoint="/api/v1/intents/evaluate", + request_body=intent_req.model_dump(), + processing_ms=None, + ) + success, existing_response = current_tracker.consume_quota_and_log( + record=record, + idempotency_key=idempotency_key + ) + if not success: + if span: + span.set_attribute("idempotent_hit", True if existing_response else False) + span.end() + if existing_response: + return existing_response + else: + raise HTTPException(status_code=429, + detail="Monthly evaluation quota exceeded") try: oss_intent = to_oss_intent(intent_req) @@ -68,6 +115,10 @@ async def evaluate_intent_endpoint( policy_violations=intent_req.policy_violations ) + if span: + span.set_attribute("risk_score", result["risk_score"]) + span.set_attribute("deterministic_id", str(uuid.uuid4())) # will be overwritten later, but fine for trace + deterministic_id = 
str(uuid.uuid4()) api_payload = jsonable_encoder(intent_req.model_dump()) oss_payload = jsonable_encoder(oss_intent.model_dump()) @@ -85,36 +136,39 @@ async def evaluate_intent_endpoint( result["intent_id"] = deterministic_id response_data = result - if tracker: - record = UsageRecord( - api_key=api_key, - tier=tier, - timestamp=time.time(), - endpoint="/api/v1/intents/evaluate", - request_body=intent_req.model_dump(), - response=response_data, - processing_ms=(time.time() - start_time) * 1000, + if current_tracker: + background_tasks.add_task( + current_tracker._insert_audit_log, + UsageRecord( + api_key=api_key, + tier=None, + timestamp=time.time(), + endpoint="/api/v1/intents/evaluate/response", + request_body=None, + response=response_data, + processing_ms=(time.time() - start_time) * 1000, + ) ) - await tracker.increment_usage_async(record, background_tasks) + + if span: + span.set_attribute("intent_id", deterministic_id) + span.set_status(Status(StatusCode.OK)) + span.end() return response_data except HTTPException: + if span: + span.set_status(Status(StatusCode.ERROR, "HTTP exception")) + span.end() raise except Exception as e: error_msg = str(e) logger.exception("Error in evaluate_intent_endpoint") - if tracker: - record = UsageRecord( - api_key=api_key, - tier=tier, - timestamp=time.time(), - endpoint="/api/v1/intents/evaluate", - request_body=intent_req.model_dump(), - error=error_msg, - processing_ms=(time.time() - start_time) * 1000, - ) - await tracker.increment_usage_async(record, background_tasks) + if span: + span.set_status(Status(StatusCode.ERROR, error_msg)) + span.record_exception(e) + span.end() raise HTTPException(status_code=500, detail=error_msg) @@ -122,9 +176,14 @@ async def evaluate_intent_endpoint( async def record_outcome_endpoint( request: Request, outcome: OutcomeRequest, - db: Session = Depends(get_db) + db: Session = Depends(get_db), + idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"), ): - # No usage tracking 
for outcomes (doesn't count against quota) + """ + Record an outcome for a previously evaluated intent. + Idempotent based on deterministic_id and success value (handled in service). + Also updates the pricing calculator's calibration buffer if available. + """ try: risk_engine = request.app.state.risk_engine outcome_record = record_outcome( @@ -133,8 +192,27 @@ async def record_outcome_endpoint( success=outcome.success, recorded_by=outcome.recorded_by, notes=outcome.notes, - risk_engine=risk_engine + risk_engine=risk_engine, + idempotency_key=idempotency_key, ) + + if PRICING_AVAILABLE and add_event is not None: + try: + event = { + "run_id": outcome.deterministic_id, + "outcome": "success" if outcome.success else "failure", + "recorded_at": time.time(), + "source": "arf_api_outcome" + } + add_event(event) + logger.info( + f"Added outcome to pricing buffer for intent { + outcome.deterministic_id}") + except Exception as e: + logger.warning( + f"Failed to update pricing buffer for intent { + outcome.deterministic_id}: {e}") + return {"message": "Outcome recorded", "outcome_id": outcome_record.id} except Exception as e: raise HTTPException(status_code=500, detail=str(e)) @@ -145,13 +223,51 @@ async def evaluate_healing_decision_endpoint( request: Request, decision_req: HealingDecisionRequest, background_tasks: BackgroundTasks, - quota: dict = Depends(enforce_quota) + idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"), ): + """ + Evaluate a healing decision with idempotency and atomic quota consumption. 
+ """ + # ── optional trace ────────────────────────────────────── + span = None + if OTEL_AVAILABLE and _tracer: + span = _tracer.start_span("governance.evaluate_healing") + span.set_attribute("component", decision_req.event.component) + start_time = time.time() - api_key = quota["api_key"] - tier = quota["tier"] - response_data = None - error_msg = None + api_key = request.headers.get("Authorization", "").replace("Bearer ", "") + if not api_key: + api_key = request.query_params.get("api_key", "unknown") + + current_tracker = app.core.usage_tracker.tracker + if current_tracker is None: + if span: + span.set_status(Status(StatusCode.ERROR, "tracker unavailable")) + span.end() + raise HTTPException(status_code=503, + detail="Usage tracking service unavailable") + + record = UsageRecord( + api_key=api_key, + tier=None, + timestamp=start_time, + endpoint="/api/v1/healing/evaluate", + request_body=decision_req.model_dump(), + processing_ms=None, + ) + success, existing_response = current_tracker.consume_quota_and_log( + record=record, + idempotency_key=idempotency_key + ) + if not success: + if span: + span.set_attribute("idempotent_hit", True if existing_response else False) + span.end() + if existing_response: + return existing_response + else: + raise HTTPException(status_code=429, + detail="Monthly evaluation quota exceeded") try: policy_engine = request.app.state.policy_engine @@ -168,34 +284,37 @@ async def evaluate_healing_decision_endpoint( tokenizer=tokenizer, ) - if tracker: - record = UsageRecord( - api_key=api_key, - tier=tier, - timestamp=time.time(), - endpoint="/api/v1/healing/evaluate", - request_body=decision_req.model_dump(), - response=response_data, - processing_ms=(time.time() - start_time) * 1000, - ) - await tracker.increment_usage_async(record, background_tasks) + if span: + span.set_attribute("risk_score", response_data.get("risk_score", 0.0)) + span.set_attribute("selected_action", response_data.get("selected_action", "unknown")) + 
span.set_status(Status(StatusCode.OK)) + span.end() + if current_tracker: + background_tasks.add_task( + current_tracker._insert_audit_log, + UsageRecord( + api_key=api_key, + tier=None, + timestamp=time.time(), + endpoint="/api/v1/healing/evaluate/response", + request_body=None, + response=response_data, + processing_ms=(time.time() - start_time) * 1000, + ) + ) return response_data except HTTPException: + if span: + span.set_status(Status(StatusCode.ERROR, "HTTP exception")) + span.end() raise except Exception as e: error_msg = str(e) logger.exception("Error in evaluate_healing_decision_endpoint") - if tracker: - record = UsageRecord( - api_key=api_key, - tier=tier, - timestamp=time.time(), - endpoint="/api/v1/healing/evaluate", - request_body=decision_req.model_dump(), - error=error_msg, - processing_ms=(time.time() - start_time) * 1000, - ) - await tracker.increment_usage_async(record, background_tasks) - raise HTTPException(status_code=500, detail=error_msg) \ No newline at end of file + if span: + span.set_status(Status(StatusCode.ERROR, error_msg)) + span.record_exception(e) + span.end() + raise HTTPException(status_code=500, detail=error_msg) diff --git a/app/api/routes_incidents.py b/app/api/routes_incidents.py index cd2bc91bbe121f9db79f3af0e526df9b64be507c..f2c05880c364fa058b90abd702bf4c1018266339 100644 --- a/app/api/routes_incidents.py +++ b/app/api/routes_incidents.py @@ -1,86 +1,211 @@ -from app.causal_explainer import CausalExplainer -from fastapi import APIRouter, Depends, Request, BackgroundTasks, HTTPException -from pydantic import BaseModel -from typing import Optional -from enum import Enum -import time -import json +""" +Incident evaluation endpoints — backward‑compatible Bayesian reroute. + +This module provides two incident‑related routes: + +* ``POST /api/v1/report_incident`` + Stores a ``ReliabilityEvent`` in an in‑memory history for auditing + and debugging. 
+* ``POST /api/v1/v1/incidents/evaluate`` **(deprecated)** + Former heuristic endpoint now **rerouted to the full Bayesian risk + engine**. All callers should migrate to + ``POST /api/v1/intents/evaluate``, which returns richer metadata + including CUDL uncertainty decomposition and decision traces. + +The local model duplicates (``ReliabilityEvent``, ``HealingAction``) +have been removed; all types are imported from the canonical ARF core +framework (``agentic_reliability_framework.core.models.event``). +""" -# ===== USAGE TRACKER IMPORTS ===== -from app.core.usage_tracker import enforce_quota, UsageRecord, tracker +from __future__ import annotations +import logging +import time +from typing import Optional -class HealingAction(str, Enum): - NO_ACTION = "no_action" - RESTART_CONTAINER = "restart_container" - SCALE_OUT = "scale_out" - ROLLBACK = "rollback" - CIRCUIT_BREAKER = "circuit_breaker" - TRAFFIC_SHIFT = "traffic_shift" - ALERT_TEAM = "alert_team" +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request +from agentic_reliability_framework.core.models.event import ( + HealingAction, + ReliabilityEvent, +) -class ReliabilityEvent(BaseModel): - component: str - latency_p99: float - error_rate: float - service_mesh: str = "default" - cpu_util: Optional[float] = None - memory_util: Optional[float] = None +from app.causal_explainer import CausalExplainer +from app.core.usage_tracker import UsageRecord, enforce_quota, tracker +logger = logging.getLogger(__name__) router = APIRouter() -incident_history = [] +# --------------------------------------------------------------------------- +# In‑memory incident store (for auditing / debugging only) +# --------------------------------------------------------------------------- +incident_history: list[dict] = [] + +# --------------------------------------------------------------------------- +# POST /api/v1/report_incident +# 
--------------------------------------------------------------------------- @router.post("/report_incident") -async def report_incident(event: ReliabilityEvent): - incident_history.append(event.dict()) +async def report_incident(event: ReliabilityEvent) -> dict[str, str]: + """ + Record a ``ReliabilityEvent`` in the in‑memory incident history. + + This endpoint is used by internal monitoring tools to feed incident + data into the causal explainer and downstream analysis. The event + is stored as a JSON‑safe dictionary and is **not** persisted across + API restarts. + + Parameters + ---------- + event : ReliabilityEvent + The reliability event to record. Must include at minimum + ``component``, ``latency_p99``, ``error_rate``, and + ``service_mesh``. + + Returns + ------- + dict + A simple acknowledgement ``{"status": "recorded"}``. + """ + incident_history.append(event.model_dump(mode="json")) return {"status": "recorded"} +# --------------------------------------------------------------------------- +# POST /api/v1/v1/incidents/evaluate (deprecated) +# --------------------------------------------------------------------------- @router.post("/v1/incidents/evaluate") async def evaluate_incident( request: Request, event: ReliabilityEvent, background_tasks: BackgroundTasks, - quota: dict = Depends(enforce_quota) -): + quota: dict = Depends(enforce_quota), +) -> dict: + """ + Evaluate an incident using the **Bayesian risk engine**. + + .. deprecated:: 0.6.0 + Use ``POST /api/v1/intents/evaluate`` instead. This endpoint + will be removed in a future release. Responses include a + ``deprecation_notice`` field to assist migration. + + The following steps are performed: + + 1. Convert the ``ReliabilityEvent`` into a minimal + ``DeployConfigurationIntent`` via ``intent_adapter``. + 2. Call ``risk_service.evaluate_intent()`` to obtain a Bayesian + risk score. + 3. Generate a heuristic healing action based on the risk score. + 4. 
Run the causal explainer for counter‑factual text. + 5. Build a backward‑compatible response envelope. + + Parameters + ---------- + request : Request + The Starlette request object (used for internal state access). + event : ReliabilityEvent + The incident event containing component name, latency, error + rate, etc. + background_tasks : BackgroundTasks + FastAPI background‑task runner for asynchronous logging. + quota : dict + Injected by ``enforce_quota``; contains ``api_key``, ``tier``, + and ``remaining``. + + Returns + ------- + dict + A dictionary with keys: + + * ``deprecation_notice`` (str) — migration guidance. + * ``healing_intent`` (dict) — action, component, risk score, + justification, confidence, and advisory status. + * ``causal_explanation`` (dict) — factual/counter‑factual + outcomes and explanation text. + * ``utility_decision`` (dict) — selected action and expected + utility. + """ start_time = time.time() - api_key = quota["api_key"] + api_key: str = quota["api_key"] tier = quota["tier"] - response_data = None - error_msg = None + response_data: Optional[dict] = None + error_msg: Optional[str] = None try: - # Simple risk score (heuristic) - risk_score = min(1.0, (event.latency_p99 / 1000.0) * 0.7 + event.error_rate * 0.3) + # ------------------------------------------------------------------ + # Step 1 – Convert the event into an infrastructure intent + # ------------------------------------------------------------------ + from app.services.intent_adapter import to_oss_intent + from app.services.risk_service import evaluate_intent + + raw_intent = { + "intent_type": "deploy_config", + "environment": "prod", + "service_name": event.component, + "requester": "auto", + "change_scope": "global", + "deployment_target": "prod", + "configuration": {}, + "provenance": {"source": "incident_evaluate"}, + } + oss_intent = to_oss_intent(raw_intent) - if event.latency_p99 > 500 or event.error_rate > 0.15: - optimal_action = HealingAction.RESTART_CONTAINER - 
else: - optimal_action = HealingAction.NO_ACTION + # ------------------------------------------------------------------ + # Step 2 – Bayesian risk evaluation + # ------------------------------------------------------------------ + risk_engine = request.app.state.risk_engine + result = evaluate_intent( + engine=risk_engine, + intent=oss_intent, + cost_estimate=None, + policy_violations=[], + ) + # ------------------------------------------------------------------ + # Step 3 – Heuristic action selection based on risk threshold + # ------------------------------------------------------------------ + optimal_action = ( + HealingAction.RESTART_CONTAINER + if result["risk_score"] > 0.5 + else HealingAction.NO_ACTION + ) + + # ------------------------------------------------------------------ + # Step 4 – Causal explainer + # ------------------------------------------------------------------ + causal_explainer = CausalExplainer() current_state = { "latency": event.latency_p99, "error_rate": event.error_rate, - "last_action": {"action_type": "no_action"} + "last_action": {"action_type": "no_action"}, } proposed_action = {"action_type": optimal_action.value, "params": {}} - ce = CausalExplainer() - causal_exp = ce.explain_healing_intent(proposed_action, current_state, "latency") + causal_exp = causal_explainer.explain_healing_intent( + proposed_action, current_state, "latency" + ) + # ------------------------------------------------------------------ + # Step 5 – Build response envelope + # ------------------------------------------------------------------ healing_intent = { "action": optimal_action.value, "component": event.component, - "parameters": proposed_action["params"], - "justification": f"Causal: {causal_exp.explanation_text}", - "confidence": 0.85, - "risk_score": risk_score, - "status": "oss_advisory_only" + "parameters": {}, + "justification": ( + f"Bayesian risk score: {result['risk_score']:.3f}. 
" + f"Causal: {causal_exp.explanation_text}" + ), + "confidence": 1.0 - result.get("uncertainty", 0.0), + "risk_score": result["risk_score"], + "status": "oss_advisory_only", } response_data = { + "deprecation_notice": ( + "This endpoint is deprecated. Use POST /api/v1/intents/evaluate " + "for the full Bayesian evaluation with CUDL decomposition." + ), "healing_intent": healing_intent, "causal_explanation": { "factual_outcome": causal_exp.factual_outcome, @@ -88,42 +213,49 @@ async def evaluate_incident( "effect": causal_exp.effect, "explanation_text": causal_exp.explanation_text, "is_model_based": causal_exp.is_model_based, - "warnings": causal_exp.warnings + "warnings": causal_exp.warnings, }, "utility_decision": { "best_action": optimal_action.value, "expected_utility": 0.5, - "explanation": "Heuristic decision based on latency/error thresholds" - } + "explanation": ( + "Decision based on Bayesian risk threshold > 0.5" + ), + }, } + # ------------------------------------------------------------------ # Asynchronous usage logging + # ------------------------------------------------------------------ if tracker: record = UsageRecord( api_key=api_key, tier=tier, timestamp=time.time(), endpoint="/v1/incidents/evaluate", - request_body=event.dict(), + request_body=event.model_dump(mode="json"), response=response_data, processing_ms=(time.time() - start_time) * 1000, ) await tracker.increment_usage_async(record, background_tasks) + logger.warning( + "Deprecated endpoint /v1/incidents/evaluate called by key %s", + api_key[:8], + ) return response_data except HTTPException: raise - except Exception as e: - error_msg = str(e) - # Log failure in background + except Exception as exc: + error_msg = str(exc) if tracker: record = UsageRecord( api_key=api_key, tier=tier, timestamp=time.time(), endpoint="/v1/incidents/evaluate", - request_body=event.dict(), + request_body=event.model_dump(mode="json"), error=error_msg, processing_ms=(time.time() - start_time) * 1000, ) diff 
--git a/app/api/routes_memory.py b/app/api/routes_memory.py index 84b361ff642e14f847cbbbcc67ca90ddaa492236..1e562f1dafa10e1122f44dcee6ab951fef43973a 100644 --- a/app/api/routes_memory.py +++ b/app/api/routes_memory.py @@ -11,7 +11,11 @@ async def memory_stats(request: Request): risk_engine = request.app.state.risk_engine # Check if memory exists and has the required method - if hasattr(risk_engine, 'memory') and hasattr(risk_engine.memory, 'get_graph_stats'): + if hasattr( + risk_engine, + 'memory') and hasattr( + risk_engine.memory, + 'get_graph_stats'): stats = risk_engine.memory.get_graph_stats() return stats else: diff --git a/app/api/routes_payments.py b/app/api/routes_payments.py index 695a6e170cd1cdd044c49787d08146c0148221af..1a5d314c830b65624db25a28b0492e54f1132783 100644 --- a/app/api/routes_payments.py +++ b/app/api/routes_payments.py @@ -4,11 +4,9 @@ Payment endpoints – Stripe Checkout integration. import os import stripe -from fastapi import APIRouter, HTTPException, Request +from fastapi import APIRouter, HTTPException from pydantic import BaseModel -from typing import Optional -from app.core.config import settings from app.core.usage_tracker import tracker, Tier router = APIRouter(prefix="/payments", tags=["payments"]) @@ -17,8 +15,10 @@ router = APIRouter(prefix="/payments", tags=["payments"]) stripe.api_key = os.getenv("STRIPE_SECRET_KEY") STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET") + class CheckoutRequest(BaseModel): api_key: str + success_url: str cancel_url: str @@ -32,14 +32,16 @@ async def create_checkout_session(req: CheckoutRequest): # Verify the API key exists and is free tier tier = tracker.get_tier(req.api_key) if tracker else None if tier != Tier.FREE: - raise HTTPException(status_code=400, detail="Only free tier keys can be upgraded") + raise HTTPException(status_code=400, + detail="Only free tier keys can be upgraded") try: checkout_session = stripe.checkout.Session.create( payment_method_types=["card"], line_items=[ { - 
"price": os.getenv("STRIPE_PRO_PRICE_ID"), # e.g., "price_123" + # e.g., "price_123" + "price": os.getenv("STRIPE_PRO_PRICE_ID"), "quantity": 1, } ], diff --git a/app/api/routes_pricing.py b/app/api/routes_pricing.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e5c1646059a1f2c6cef17659dcaaf1835bebc7 --- /dev/null +++ b/app/api/routes_pricing.py @@ -0,0 +1,104 @@ +""" +Pricing endpoints – integrates the ARF Bayesian pricing calculator. +""" + +from fastapi import APIRouter, HTTPException, Depends +from pydantic import BaseModel +import logging + +from arf_pricing_calculator.core.pricing_engine import PricingEngine +from arf_pricing_calculator.ingestion.questionnaire_parser import parse_input_dict +from arf_pricing_calculator.types import PricingOutput +from app.core.usage_tracker import enforce_quota + +logger = logging.getLogger(__name__) +router = APIRouter() + + +class PricingEstimateRequest(BaseModel): + """Request body for single pricing estimate.""" + input: dict + customer_id: str = "default" + force: bool = False + + +class PricingRunRequest(BaseModel): + """Request body for multi‑run pricing with learning.""" + input: dict + customer_id: str = "default" + runs: int = 1 + cooldown_hours: int = 24 + force: bool = False + + +@router.post("/pricing/estimate", response_model=PricingOutput) +async def estimate_pricing( + req: PricingEstimateRequest, + quota: dict = Depends(enforce_quota), # optional: enforce usage tracking +): + """ + Single pricing estimate – no learning, no buffer update. 
+ """ + try: + # Convert the input dict to a PricingInput object + pricing_input = parse_input_dict(req.input) + # Create engine without buffer (no learning) + engine = PricingEngine(calibration_buffer=[]) + output = engine.estimate(pricing_input) + return output + except Exception as e: + logger.exception("Pricing estimate failed") + raise HTTPException(status_code=400, detail=str(e)) + + +@router.post("/pricing/run", response_model=list[PricingOutput]) +async def run_pricing( + req: PricingRunRequest, + quota: dict = Depends(enforce_quota), +): + """ + Multi‑run pricing with cooldown and buffer persistence. + Each run’s simulated outcome is added to the buffer, so subsequent runs + see an updated posterior. + """ + # We need to reuse the same buffer across runs; we'll load it per request. + # For simplicity, we'll load from the default location. + from arf_pricing_calculator.storage.buffer import load_buffer, add_event + from arf_pricing_calculator.orchestration.cooldown import enforce_cooldown, is_cooldown_active + + outputs = [] + buffer = load_buffer() # loads from calibration_buffer.json + + for i in range(req.runs): + if not req.force and is_cooldown_active( + req.customer_id, req.cooldown_hours): + raise HTTPException(status_code=429, + detail=f"Cooldown active after {i} runs") + + pricing_input = parse_input_dict(req.input) + engine = PricingEngine(calibration_buffer=buffer) + out = engine.estimate(pricing_input) + + # Simulate an outcome (in real use, this would come from the actual + # deal) + import random + outcome = "success" if random.random() > out.risk_score else "failure" # nosec B311 + + event = { + "run_id": out.run_history_id, + "customer_id": req.customer_id, + "outcome": outcome, + "price": out.recommended_price, + "value": out.expected_value, + "risk_score": out.risk_score, + "run_number": i + 1, + } + add_event(event) + buffer = load_buffer() # reload after update + + outputs.append(out) + + if i < req.runs - 1: + 
enforce_cooldown(req.customer_id, req.cooldown_hours) + + return outputs diff --git a/app/api/routes_risk.py b/app/api/routes_risk.py index 5f5a0f70fd3d57bcb5ff5b201ebc9e20fcdf380b..f4cf76c86de0924eb79539972bd24e6ef727680c 100644 --- a/app/api/routes_risk.py +++ b/app/api/routes_risk.py @@ -9,32 +9,29 @@ router = APIRouter() async def get_risk(): try: risk = get_system_risk() - if risk < 0.3: - status = "low" - elif risk < 0.6: - status = "moderate" - elif risk < 0.8: - status = "high" - else: - status = "critical" - return RiskResponse(system_risk=risk, status=status) + except NotImplementedError: + raise HTTPException( + status_code=501, + detail="This endpoint is deprecated and not implemented") except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + if risk < 0.3: + status = "low" + elif risk < 0.6: + status = "moderate" + elif risk < 0.8: + status = "high" + else: + status = "critical" + return RiskResponse(system_risk=risk, status=status) + @router.get("/history") async def get_risk_history(): - """ - Return dummy historical risk data for the last 24 hours. - Replace with real database query later. - """ import random import datetime now = datetime.datetime.now() - data = [] - for i in range(24, 0, -1): - data.append({ - "time": (now - datetime.timedelta(hours=i)).isoformat(), - "risk": round(random.uniform(0.2, 0.8), 2) - }) + data = [{"time": (now - datetime.timedelta(hours=i)).isoformat(), + "risk": round(random.uniform(0.2, 0.8), 2)} for i in range(24, 0, -1)] return data diff --git a/app/api/routes_users.py b/app/api/routes_users.py index 5e6a63a3b8ea07e2eaf55e7a13669cb8416b820f..a13331c93c2e671389197ad84276ba969aaca313 100644 --- a/app/api/routes_users.py +++ b/app/api/routes_users.py @@ -3,7 +3,6 @@ User endpoints – registration and quota information. 
""" import uuid -import os from fastapi import APIRouter, Depends, HTTPException, Request from slowapi import Limiter from slowapi.util import get_remote_address @@ -23,7 +22,9 @@ async def register_user(request: Request): Rate‑limited to 5 requests per hour per IP address. """ if tracker is None: - raise HTTPException(status_code=503, detail="Usage tracking not available") + raise HTTPException( + status_code=503, + detail="Usage tracking not available") # Generate a new API key new_key = f"sk_free_{uuid.uuid4().hex[:24]}" @@ -36,12 +37,13 @@ async def register_user(request: Request): return { "api_key": new_key, "tier": "free", - "message": "API key created. Store it securely – you won't see it again." - } + "message": "API key created. Store it securely – you won't see it again."} @router.get("/quota") -async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota)): +async def get_user_quota( + request: Request, + quota: dict = Depends(enforce_quota)): """ Return the current user's tier and remaining evaluation quota. Requires API key in Authorization header. @@ -55,17 +57,3 @@ async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota)) "remaining": remaining, "limit": limit, } - - -# ===== DEBUG ENDPOINT – Remove in production ===== -@router.get("/tracker-status") -async def tracker_status(): - """ - Debug endpoint to check if the usage tracker is initialised. - Returns the tracker object and environment variables. 
- """ - return { - "tracker": str(tracker), - "env_tracking": os.getenv("ARF_USAGE_TRACKING"), - "env_db_path": os.getenv("ARF_USAGE_DB_PATH") - } \ No newline at end of file diff --git a/app/api/webhooks.py b/app/api/webhooks.py index 9cc62f3723a2fa60381307936a70e363896862b4..74667a67e5b791c9a7447a763d340466d3672d7b 100644 --- a/app/api/webhooks.py +++ b/app/api/webhooks.py @@ -33,7 +33,8 @@ async def stripe_webhook(request: Request): # Handle subscription events if event["type"] == "checkout.session.completed": session = event["data"]["object"] - api_key = session.get("client_reference_id") or session.get("metadata", {}).get("api_key") + api_key = session.get("client_reference_id") or session.get( + "metadata", {}).get("api_key") if api_key: update_key_tier(api_key, Tier.PRO) elif event["type"] == "customer.subscription.deleted": diff --git a/app/core/config.py b/app/core/config.py index 4fa651150c7ad879e910984941137fb34f79e2d5..62d61adc0817942b9887c6d5c2ebc3f5c1ebe385 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -15,6 +15,9 @@ class Settings(BaseSettings): ARF_REDIS_URL: Optional[str] = None ARF_API_KEYS: str = "{}" # JSON string of {key: tier} + # Tracing (OpenTelemetry) + OTEL_EXPORTER_OTLP_ENDPOINT: Optional[str] = None + class Config: env_file = ".env" extra = "ignore" diff --git a/app/core/usage_tracker.py b/app/core/usage_tracker.py index 90839ea4d26a253b3d511f85e255f7e92bb7c6bc..f07d9d073f2d6ce03403dab9c96eab1d9602990b 100644 --- a/app/core/usage_tracker.py +++ b/app/core/usage_tracker.py @@ -1,19 +1,18 @@ """ Usage Tracker for ARF API – quotas, tiers, and audit logging. -Non‑invasive, configurable, thread‑safe, and background‑task ready. +Thread‑safe, atomic quota consumption, idempotent, fail‑closed. 
""" -import os import json import sqlite3 import threading import time from contextlib import contextmanager from datetime import datetime, timedelta -from typing import Dict, Any, Optional, List -from enum import Enum from dataclasses import dataclass -from fastapi import BackgroundTasks +from typing import Dict, Any, Optional, List, Tuple +from enum import Enum +from fastapi import BackgroundTasks, HTTPException, Request # Optional Redis support try: @@ -66,10 +65,11 @@ class UsageRecord: class UsageTracker: """ - Thread‑safe usage tracker with SQLite storage and optional Redis for counters. + Thread‑safe usage tracker with atomic quota consumption and idempotency. """ - def __init__(self, db_path: str = "arf_usage.db", redis_url: Optional[str] = None): + def __init__(self, db_path: str = "arf_usage.db", + redis_url: Optional[str] = None): self.db_path = db_path self._local = threading.local() self._init_db() @@ -78,14 +78,17 @@ class UsageTracker: if redis_url and REDIS_AVAILABLE: self._redis_client = redis.from_url(redis_url) elif redis_url: - raise ImportError("Redis client not installed. Run: pip install redis") + raise ImportError( + "Redis client not installed. 
Run: pip install redis") @contextmanager def _get_conn(self): - """Get a thread‑local SQLite connection.""" + """Get a thread‑local SQLite connection with write‑ahead logging and immediate transactions.""" if not hasattr(self._local, "conn"): - self._local.conn = sqlite3.connect(self.db_path, check_same_thread=False) + self._local.conn = sqlite3.connect( + self.db_path, check_same_thread=False, isolation_level=None) self._local.conn.row_factory = sqlite3.Row + self._local.conn.execute("PRAGMA journal_mode=WAL") yield self._local.conn def _init_db(self): @@ -109,7 +112,8 @@ class UsageTracker: request_body TEXT, response TEXT, error TEXT, - processing_ms REAL + processing_ms REAL, + idempotency_key TEXT UNIQUE ) """) conn.execute(""" @@ -124,6 +128,12 @@ class UsageTracker: PRIMARY KEY (api_key, year_month) ) """) + conn.execute(""" + CREATE TABLE IF NOT EXISTS idempotency_keys ( + key TEXT PRIMARY KEY, + consumed_at REAL NOT NULL + ) + """) conn.commit() def _get_month_key(self) -> str: @@ -132,7 +142,8 @@ class UsageTracker: def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool: """Register a new API key. Returns True if key exists or was created.""" with self._get_conn() as conn: - row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone() + row = conn.execute( + "SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone() if row: return True conn.execute( @@ -156,45 +167,56 @@ class UsageTracker: def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool: """Update the tier of an existing API key. Returns True if successful.""" with self._get_conn() as conn: - row = conn.execute("SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone() + row = conn.execute( + "SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone() if not row: return False - conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?", (new_tier.value, api_key)) + conn.execute( + "UPDATE api_keys SET tier = ? 
WHERE key = ?", + (new_tier.value, + api_key)) conn.commit() return True - def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]: - """Return remaining evaluations for the month, or None if unlimited.""" + # -------------------------------------------------------------------------- + # Atomic quota consumption + # -------------------------------------------------------------------------- + def _consume_quota_atomic_sqlite( + self, + api_key: str, + tier: Tier, + month: str) -> bool: # noqa: E501 + """ + Atomically increment counter only if under limit. + Returns True if quota was consumed, False if limit reached. + """ limit = tier.monthly_evaluation_limit if limit is None: - return None - - month = self._get_month_key() - if self._redis_client: - redis_key = f"arf:quota:{api_key}:{month}" - count = int(self._redis_client.get(redis_key) or 0) - return max(0, limit - count) + # Unlimited – still increment for tracking but always succeed + with self._get_conn() as conn: + conn.execute( + """INSERT INTO monthly_counts (api_key, year_month, count) + VALUES (?, ?, 1) + ON CONFLICT(api_key, year_month) DO UPDATE SET count = count + 1""", + (api_key, month) + ) + conn.commit() + return True + # Use BEGIN IMMEDIATE to lock the database for the transaction with self._get_conn() as conn: - row = conn.execute( - "SELECT count FROM monthly_counts WHERE api_key = ? 
AND year_month = ?", - (api_key, month) - ).fetchone() - count = row["count"] if row else 0 - return max(0, limit - count) - - def _increment_quota(self, api_key: str, tier: Tier) -> None: - """Increment the monthly counter (internal, synchronous).""" - limit = tier.monthly_evaluation_limit - if limit is None: - return - month = self._get_month_key() - if self._redis_client: - redis_key = f"arf:quota:{api_key}:{month}" - self._redis_client.incr(redis_key) - self._redis_client.expire(redis_key, timedelta(days=31)) - else: - with self._get_conn() as conn: + conn.execute("BEGIN IMMEDIATE") + try: + # Get current count (or 0) + row = conn.execute( + "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?", + (api_key, month) + ).fetchone() + current = row["count"] if row else 0 + if current >= limit: + conn.rollback() + return False + # Increment conn.execute( """INSERT INTO monthly_counts (api_key, year_month, count) VALUES (?, ?, 1) @@ -202,58 +224,190 @@ class UsageTracker: (api_key, month) ) conn.commit() + return True + except Exception: + conn.rollback() + raise + + def _consume_quota_atomic_redis( + self, + api_key: str, + tier: Tier, + month: str) -> bool: + """Atomic Lua script for Redis: INCR only if below limit.""" + limit = tier.monthly_evaluation_limit + if limit is None: + # Unlimited – just increment and return True + redis_key = f"arf:quota:{api_key}:{month}" + self._redis_client.incr(redis_key) + self._redis_client.expire(redis_key, timedelta(days=31)) + return True + + lua_script = """ + local key = KEYS[1] + local limit = tonumber(ARGV[1]) + local current = redis.call('GET', key) + if current and tonumber(current) >= limit then + return 0 + end + local new = redis.call('INCR', key) + redis.call('EXPIRE', key, 2678400) -- 31 days + return 1 + """ + redis_key = f"arf:quota:{api_key}:{month}" + result = self._redis_client.eval(lua_script, 1, redis_key, limit) + return result == 1 + + # 
-------------------------------------------------------------------------- + # Idempotency handling + # -------------------------------------------------------------------------- + def _is_idempotent_key_used(self, key: str) -> bool: + """Check if idempotency key already processed.""" + with self._get_conn() as conn: + row = conn.execute( + "SELECT 1 FROM idempotency_keys WHERE key = ?", (key,)).fetchone() + return row is not None - def _insert_audit_log(self, record: UsageRecord) -> None: - """Insert a single audit log (internal, synchronous).""" + def _mark_idempotent_key_used(self, key: str, ttl_seconds: int = 86400): + """Store idempotency key with expiration (cleanup later).""" with self._get_conn() as conn: conn.execute( - """INSERT INTO usage_log - (api_key, tier, timestamp, endpoint, request_body, response, error, processing_ms) - VALUES (?, ?, ?, ?, ?, ?, ?, ?)""", - ( - record.api_key, - record.tier.value, - record.timestamp, - record.endpoint, - json.dumps(record.request_body) if record.request_body else None, - json.dumps(record.response) if record.response else None, - record.error, - record.processing_ms, - ) + "INSERT INTO idempotency_keys (key, consumed_at) VALUES (?, ?)", + (key, time.time()) ) conn.commit() + # Optionally schedule cleanup of old keys (can be done in a background + # thread) - def increment_usage_sync(self, record: UsageRecord) -> bool: + # -------------------------------------------------------------------------- + # Core usage recording (atomic + idempotent) + # -------------------------------------------------------------------------- + def consume_quota_and_log( + self, + record: UsageRecord, + idempotency_key: Optional[str] = None, + ) -> Tuple[bool, Optional[Dict[str, Any]]]: + """ + Atomically consume quota and insert audit log. + Returns (success, existing_response) where existing_response is not None + only when idempotency_key matched a previous successful call. 
+ """ + # Idempotency check (if key provided) + if idempotency_key: + if self._is_idempotent_key_used(idempotency_key): + # Retrieve previous response from audit log (simplified – you may cache full response) + # For full idempotency, we would store the response body in idempotency table. + # Here we return a marker that caller should use cached + # response. + return False, {"idempotent": True, + "message": "Already processed"} + + month = self._get_month_key() + # Atomic quota consumption + if self._redis_client: + quota_ok = self._consume_quota_atomic_redis( + record.api_key, record.tier, month) + else: + quota_ok = self._consume_quota_atomic_sqlite( + record.api_key, record.tier, month) + + if not quota_ok: + return False, None + + # Insert audit log (with idempotency key as unique constraint) + try: + with self._get_conn() as conn: + conn.execute( + """INSERT INTO usage_log + (api_key, tier, timestamp, endpoint, + request_body, response, error, processing_ms, + idempotency_key) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""", + (record.api_key, + record.tier.value, + record.timestamp, + record.endpoint, + json.dumps( + record.request_body) if record.request_body else None, + json.dumps( + record.response) if record.response else None, + record.error, + record.processing_ms, + idempotency_key, + )) + conn.commit() + except sqlite3.IntegrityError as e: + # Duplicate idempotency_key – already inserted by another + # concurrent request + if "UNIQUE constraint failed: usage_log.idempotency_key" in str(e): + return False, {"idempotent": True, + "message": "Already processed"} + raise + + if idempotency_key: + self._mark_idempotent_key_used(idempotency_key) + # Removed stray # noqa: E501 comment that was wrongly indented here + return True, None + + # -------------------------------------------------------------------------- + # Legacy interface (kept for compatibility but deprecated) + # -------------------------------------------------------------------------- + def 
increment_usage_sync( + self, + record: UsageRecord, + idempotency_key: Optional[str] = None) -> bool: """ Synchronously record usage and increment counter. - Returns True if within quota (i.e., counter was incremented), False if quota exceeded. + Returns True if within quota and recorded, False otherwise. + This method now uses the atomic implementation. """ - tier = record.tier - limit = tier.monthly_evaluation_limit - if limit is not None: - remaining = self.get_remaining_quota(record.api_key, tier) - if remaining <= 0: - return False - self._increment_quota(record.api_key, tier) - self._insert_audit_log(record) - return True + success, _ = self.consume_quota_and_log(record, idempotency_key) + return success - async def increment_usage_async(self, record: UsageRecord, background_tasks: BackgroundTasks) -> bool: + async def increment_usage_async( + self, + record: UsageRecord, + background_tasks: BackgroundTasks, + idempotency_key: Optional[str] = None + ) -> bool: """ Asynchronously record usage using FastAPI BackgroundTasks. - Returns True if quota allows (i.e., will be recorded), False if quota exceeded. + Still does the atomic check synchronously, then schedules the insert. """ - tier = record.tier + # First, do atomic quota check (synchronous) – we must ensure we don't double-consume. + # Because background tasks may run later, we still need to reserve quota now. + # Simplified: we call consume_quota_and_log synchronously – that defeats async benefit. + # Better to use a queue or Redis with background processing. + # For this fix, we'll use the sync method (blocking) but still support + # idempotency. 
+ return self.increment_usage_sync(record, idempotency_key) + + # -------------------------------------------------------------------------- + # Quota inspection (non‑atomic, for display only) + # -------------------------------------------------------------------------- + def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]: + """Return remaining evaluations for the month (non‑atomic, for info only).""" limit = tier.monthly_evaluation_limit - if limit is not None: - remaining = self.get_remaining_quota(record.api_key, tier) - if remaining <= 0: - return False - # Schedule the actual write in the background - background_tasks.add_task(self._increment_quota, record.api_key, tier) - background_tasks.add_task(self._insert_audit_log, record) - return True + if limit is None: + return None + + month = self._get_month_key() + if self._redis_client: + redis_key = f"arf:quota:{api_key}:{month}" + count = int(self._redis_client.get(redis_key) or 0) + return max(0, limit - count) + with self._get_conn() as conn: + row = conn.execute( + "SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?", + (api_key, month) + ).fetchone() + count = row["count"] if row else 0 + return max(0, limit - count) + + # -------------------------------------------------------------------------- + # Audit and maintenance + # -------------------------------------------------------------------------- def get_audit_logs( self, api_key: str, @@ -278,8 +432,9 @@ class UsageTracker: return [dict(row) for row in rows] def clean_old_logs(self): - """Delete logs older than retention period for each tier.""" + """Delete logs older than retention period for each tier, and old idempotency keys.""" with self._get_conn() as conn: + # Delete old usage logs for tier in Tier: retention_days = tier.audit_log_retention_days if retention_days is None: @@ -289,14 +444,23 @@ class UsageTracker: "DELETE FROM usage_log WHERE tier = ? 
AND timestamp < ?", (tier.value, cutoff) ) + # Delete idempotency keys older than 7 days + cutoff = time.time() - 7 * 86400 + conn.execute( + "DELETE FROM idempotency_keys WHERE consumed_at < ?", (cutoff,)) conn.commit() -# Global instance +# -------------------------------------------------------------------------- +# Global instance and FastAPI dependency (fail‑closed) +# -------------------------------------------------------------------------- tracker: Optional[UsageTracker] = None -def init_tracker(db_path: str = "arf_usage.db", redis_url: Optional[str] = None): +def init_tracker( + db_path: str = "arf_usage.db", + redis_url: Optional[str] = None): + """Initialize the global tracker. Must be called before enforce_quota.""" global tracker tracker = UsageTracker(db_path, redis_url) @@ -308,19 +472,16 @@ def update_key_tier(api_key: str, new_tier: Tier) -> bool: return tracker.update_api_key_tier(api_key, new_tier) -# FastAPI dependency to enforce quota -from fastapi import HTTPException, Request - async def enforce_quota(request: Request, api_key: str = None): """ Dependency that checks API key and remaining quota. - Use in your endpoint: `quota = Depends(enforce_quota)` - - If usage tracking is disabled, returns a default dict (no enforcement). + FAILS CLOSED: if tracker not initialised, raises HTTP 503. """ - # If tracker not initialised, allow all requests (fallback) + # P0 fix: No fallback that allows all requests if tracker is None: - return {"api_key": api_key or "disabled", "tier": Tier.FREE, "remaining": None} + raise HTTPException( + status_code=503, + detail="Usage tracking service not initialised. 
Please contact administrator.") # Extract API key from header or query if api_key is None: @@ -335,13 +496,16 @@ async def enforce_quota(request: Request, api_key: str = None): tier = tracker.get_tier(api_key) if tier is None: - raise HTTPException(status_code=403, detail="Invalid or inactive API key") + raise HTTPException( + status_code=403, + detail="Invalid or inactive API key") remaining = tracker.get_remaining_quota(api_key, tier) if remaining is not None and remaining <= 0: - raise HTTPException(status_code=429, detail="Monthly evaluation quota exceeded") + raise HTTPException(status_code=429, + detail="Monthly evaluation quota exceeded") - # Store in request state for later logging + # Store in request state for later logging (optional) request.state.api_key = api_key request.state.tier = tier return {"api_key": api_key, "tier": tier, "remaining": remaining} diff --git a/app/database/models_intents.py b/app/database/models_intents.py index 0718f7143248643e6cd15ef8021707c1f6acdcd9..33c9ed7f9c1084760bc2ecc82b598da876abacef 100644 --- a/app/database/models_intents.py +++ b/app/database/models_intents.py @@ -1,4 +1,4 @@ -from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, ForeignKey, UniqueConstraint +from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, Float, ForeignKey, UniqueConstraint from sqlalchemy.orm import relationship import datetime from .base import Base @@ -7,27 +7,69 @@ from .base import Base class IntentDB(Base): __tablename__ = "intents" id = Column(Integer, primary_key=True, index=True) - deterministic_id = Column(String(64), unique=True, index=True, nullable=False) + deterministic_id = Column( + String(64), + unique=True, + index=True, + nullable=False) intent_type = Column(String(64), nullable=False) payload = Column(JSON, nullable=False) oss_payload = Column(JSON, nullable=True) environment = Column(String(32), nullable=True) - created_at = Column(DateTime, 
default=datetime.datetime.utcnow, nullable=False) + created_at = Column( + DateTime, + default=datetime.datetime.utcnow, + nullable=False) evaluated_at = Column(DateTime, nullable=True) risk_score = Column(String(32), nullable=True) - outcomes = relationship("OutcomeDB", back_populates="intent", cascade="all, delete-orphan") + outcomes = relationship( + "OutcomeDB", + back_populates="intent", + cascade="all, delete-orphan") class OutcomeDB(Base): __tablename__ = "intent_outcomes" id = Column(Integer, primary_key=True, index=True) - intent_id = Column(Integer, ForeignKey("intents.id", ondelete="CASCADE"), nullable=False) + intent_id = Column( + Integer, + ForeignKey( + "intents.id", + ondelete="CASCADE"), + nullable=False) success = Column(Boolean, nullable=False) recorded_by = Column(String(128), nullable=True) notes = Column(Text, nullable=True) - recorded_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False) + recorded_at = Column( + DateTime, + default=datetime.datetime.utcnow, + nullable=False) + idempotency_key = Column(String(128), unique=True, nullable=True) intent = relationship("IntentDB", back_populates="outcomes") __table_args__ = ( UniqueConstraint("intent_id", name="uq_outcome_intentid"), ) + + +# --------------------------------------------------------------------------- +# NEW: Persistence for the conjugate Bayesian state +# --------------------------------------------------------------------------- +class BetaStateDB(Base): + """ + Stores the per‑category posterior parameters (α, β) of the BetaStore + so that online learning survives API restarts. + + Only one row per ActionCategory is expected; the 'category' column is + unique. Updates are performed via merge / upsert. 
+ """ + __tablename__ = "beta_state" + + id = Column(Integer, primary_key=True, index=True) + category = Column(String(32), unique=True, nullable=False, index=True) + alpha = Column(Float, nullable=False) + beta = Column(Float, nullable=False) + updated_at = Column( + DateTime, + default=datetime.datetime.utcnow, + onupdate=datetime.datetime.utcnow) diff --git a/app/database/session.py b/app/database/session.py index 94e468fcf5ad68b7884587a64fb3c8cbb3acfa30..79cc3f642071e17f692e4a7b6e38330b1233daf5 100644 --- a/app/database/session.py +++ b/app/database/session.py @@ -1,19 +1,6 @@ from sqlalchemy import create_engine -from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker from app.core.config import settings -# Use a default SQLite database if no URL is provided -if settings.database_url: - DATABASE_URL = settings.database_url -else: - # Fallback to a local SQLite file (writable in the container) - DATABASE_URL = "sqlite:///./app.db" - -# For SQLite, we need to disable the threading check -connect_args = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {} - -engine = create_engine(DATABASE_URL, connect_args=connect_args) +engine = create_engine(settings.database_url) SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) - -Base = declarative_base() diff --git a/app/main.py b/app/main.py index 14e51ebf05e9a5405761f87d75c457eedca95f4e..e214a66d305175c01f6a6a0b0572438a7753e36e 100644 --- a/app/main.py +++ b/app/main.py @@ -1,18 +1,42 @@ """ -ARF API Control Plane - Main Application Entry Point -With optional heavy dependencies and usage tracking. +ARF API Control Plane — Main Application Entry Point +==================================================== + +The control plane serves as the HTTP layer between the **Agentic Reliability +Framework (ARF)** core engine and external consumers (front‑end dashboard, +enterprise clients, and monitoring infrastructure). 
+ +It is responsible for: + +* **Lifetime management** of the Bayesian risk engine, policy engine, + semantic memory (RAG graph), and epistemic models. +* **Observability** via optional OpenTelemetry tracing and Prometheus metrics + (the latter exposed automatically by ``prometheus-fastapi-instrumentator`` + on ``/metrics``). +* **Rate limiting** and **usage tracking** with atomic quota consumption. +* **CORS** configuration for the public ARF front‑end. +* **Database‑backed persistence** of the conjugate Bayesian posteriors so + that online learning survives restarts. +* **Automated Rust enforcer canary promotion** via Wilson confidence interval + monitoring of the agreement counters. + +All heavy components are loaded **lazily and best‑effort** – if a dependency +is missing the API continues to serve health‑check and status endpoints, +degrading gracefully rather than crashing. """ import logging import os import sys import json +import threading +import time as _time from contextlib import asynccontextmanager from typing import Dict from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware -# Optional prometheus +# ── Optional: Prometheus metrics ───────────────────────────── try: from prometheus_fastapi_instrumentator import Instrumentator PROMETHEUS_AVAILABLE = True @@ -20,7 +44,7 @@ except ImportError: PROMETHEUS_AVAILABLE = False Instrumentator = None -# Optional slowapi +# ── Optional: rate‑limiting (slowapi) ──────────────────────── try: from slowapi import _rate_limit_exceeded_handler from slowapi.errors import RateLimitExceeded @@ -32,7 +56,7 @@ except ImportError: RateLimitExceeded = None SlowAPIMiddleware = None -# Optional agentic_reliability_framework (risk engine, policy engine, etc.) 
+# ── Core ARF engine (optional but essential for governance) ── try: from agentic_reliability_framework.core.governance.risk_engine import RiskEngine from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine @@ -47,7 +71,7 @@ except ImportError: RAGGraphMemory = None MemoryConstants = None -# ===== USAGE TRACKER ===== +# ── Usage tracker ──────────────────────────────────────────── from app.core.usage_tracker import init_tracker, tracker, Tier from app.api import ( @@ -61,6 +85,7 @@ from app.api import ( routes_payments, webhooks, routes_users, + routes_pricing, ) from app.api.deps import limiter from app.core.config import settings @@ -75,18 +100,35 @@ logging.basicConfig( @asynccontextmanager async def lifespan(app: FastAPI): + """ + Application lifespan manager. + + All initialisation that requires a running event loop (database + connections, model loading, etc.) happens **before** the ``yield``. + Cleanup (if any) happens after the ``yield``. + + Initialisation order: + 1. Risk engine (Bayesian scoring + HMC). + 2. Load persisted conjugate posterior state (``beta_state`` table). + 3. OpenTelemetry tracing (console exporter by default). + 4. Policy engine, RAG memory, and epistemic model. + 5. Usage tracker (SQLite / Redis). + 6. Wilson confidence monitor for Rust enforcer canary promotion. + """ logger.info("🚀 Starting ARF API Control Plane") logger.debug(f"Python path: {sys.path}") + # ── 1. 
Risk engine ──────────────────────────────────────── if ARF_AVAILABLE: hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json") use_hyperpriors = os.getenv( - "ARF_USE_HYPERPRIORS", - "false").lower() == "true" + "ARF_USE_HYPERPRIORS", "false" + ).lower() == "true" logger.info( "Initializing RiskEngine – HMC model: %s, hyperpriors: %s", hmc_model_path, - use_hyperpriors) + use_hyperpriors, + ) try: app.state.risk_engine = RiskEngine( hmc_model_path=hmc_model_path, @@ -99,6 +141,55 @@ async def lifespan(app: FastAPI): logger.exception("💥 Fatal error initializing RiskEngine") raise RuntimeError("RiskEngine initialization failed") from e + # ── 2. Persisted Bayesian state ─────────────────────── + try: + from app.database.session import SessionLocal + from app.database.models_intents import BetaStateDB + from agentic_reliability_framework.core.governance.risk_engine import ActionCategory + + db = SessionLocal() + try: + rows = db.query(BetaStateDB).all() + if rows: + state = { + ActionCategory(row.category): (row.alpha, row.beta) + for row in rows + } + app.state.risk_engine.beta_store.load_state(state) + logger.info( + "Loaded Bayesian posterior state from database (%d categories).", + len(state), + ) + else: + logger.info( + "No persisted Bayesian state found; using default priors." + ) + finally: + db.close() + except Exception as e: + logger.warning( + "Could not load Bayesian state from database: %s", e + ) + + # ── 3. 
Tracing (OpenTelemetry) ───────────────────────── + try: + from opentelemetry import trace + from opentelemetry.sdk.resources import SERVICE_NAME, Resource + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor, ConsoleSpanExporter + from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor + + resource = Resource.create({SERVICE_NAME: "arf-api"}) + provider = TracerProvider(resource=resource) + provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter())) + trace.set_tracer_provider(provider) + + FastAPIInstrumentor.instrument_app(app) + logger.info("✅ Tracing initialized (console exporter).") + except Exception as e: + logger.warning("Tracing initialization skipped: %s", e) + + # ── 4. Policy engine, RAG, epistemic model ───────────── try: app.state.policy_engine = PolicyEngine() logger.info("✅ PolicyEngine initialized successfully.") @@ -120,12 +211,14 @@ async def lifespan(app: FastAPI): from sentence_transformers import SentenceTransformer logger.info(f"Loading epistemic model: {epistemic_model_name}") app.state.epistemic_model = SentenceTransformer( - epistemic_model_name) + epistemic_model_name + ) app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer logger.info("✅ Epistemic model loaded.") except ImportError: logger.warning( - "sentence-transformers not installed; epistemic signals will be zeros.") + "sentence-transformers not installed; epistemic signals will be zeros." + ) app.state.epistemic_model = None app.state.epistemic_tokenizer = None except Exception as e: @@ -134,45 +227,94 @@ async def lifespan(app: FastAPI): app.state.epistemic_tokenizer = None else: logger.info( - "EPISTEMIC_MODEL not set; epistemic signals will be zeros.") + "EPISTEMIC_MODEL not set; epistemic signals will be zeros." 
+ ) app.state.epistemic_model = None app.state.epistemic_tokenizer = None else: logger.warning( - "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled.") + "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled." + ) - # ===== USAGE TRACKER INITIALISATION ===== - if os.getenv("ARF_USAGE_TRACKING", "false").lower() == "true": + # ── 5. Usage tracker ────────────────────────────────────── + usage_tracking_disabled = ( + os.getenv("ARF_USAGE_TRACKING", "true").lower() == "false" + ) + if not usage_tracking_disabled: logger.info("Initialising usage tracker...") - # HARDCODED WRITABLE PATH – fixes 503 error - init_tracker( - db_path="/tmp/arf_usage.db", # was os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db") - redis_url=os.getenv("ARF_REDIS_URL") - ) - # Seed initial API keys from environment variable (for testing / demo) - api_keys_json = os.getenv("ARF_API_KEYS", "{}") try: - api_keys = json.loads(api_keys_json) - for key, tier_str in api_keys.items(): - try: - tier = Tier(tier_str.lower()) - tracker.get_or_create_api_key(key, tier) - logger.info(f"Seeded API key for tier {tier.value}") - except ValueError: - logger.warning(f"Invalid tier '{tier_str}' for key {key}, skipping") - except json.JSONDecodeError: - logger.warning("ARF_API_KEYS environment variable is not valid JSON; skipping seeding.") - app.state.usage_tracker = tracker - logger.info("✅ Usage tracker ready.") + init_tracker( + db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"), + redis_url=os.getenv("ARF_REDIS_URL"), + ) + # Seed initial API keys from environment variable (for testing / demo) + api_keys_json = os.getenv("ARF_API_KEYS", "{}") + try: + api_keys = json.loads(api_keys_json) + for key, tier_str in api_keys.items(): + try: + tier = Tier(tier_str.lower()) + tracker.get_or_create_api_key(key, tier) + logger.info(f"Seeded API key for tier {tier.value}") + except ValueError: + logger.warning( + f"Invalid tier '{tier_str}' for key 
{key}, skipping" + ) + except json.JSONDecodeError: + logger.warning( + "ARF_API_KEYS environment variable is not valid JSON; skipping seeding." + ) + app.state.usage_tracker = tracker + logger.info("✅ Usage tracker ready.") + except Exception as e: + logger.critical(f"Failed to initialise usage tracker: {e}") + raise RuntimeError("Usage tracker initialisation failed") from e else: - logger.info("Usage tracking disabled (ARF_USAGE_TRACKING not set to true).") + logger.info("Usage tracking disabled by ARF_USAGE_TRACKING=false.") app.state.usage_tracker = None + # ── 6. Wilson confidence monitor ────────────────────────── + try: + from app.services.wilson_monitor import update as wilson_update + from prometheus_client import REGISTRY + + def _wilson_updater(): + while True: + try: + agreed = REGISTRY.get_sample_value( + 'arf_rust_agreement_total', {'result': 'agreed'} + ) or 0.0 + diverged = REGISTRY.get_sample_value( + 'arf_rust_agreement_total', {'result': 'diverged'} + ) or 0.0 + wilson_update(int(agreed), int(diverged)) + except Exception as e: + logger.debug("Wilson updater error: %s", e) + _time.sleep(300) # every 5 minutes + + threading.Thread(target=_wilson_updater, daemon=True).start() + logger.info("✅ Wilson monitor background updater started.") + except Exception as e: + logger.warning("Wilson monitor initialization skipped: %s", e) + yield logger.info("🛑 Shutting down ARF API") def create_app() -> FastAPI: + """ + Build and configure the FastAPI application. + + Middleware order: + 1. CORS (restricted to the public front‑end origin). + 2. Rate limiting (if slowapi is installed). + 3. Prometheus metrics exposition (if available). + + All API routers are included under the ``/api/v1`` prefix except + memory (``/v1/memory``) and webhooks (root level). + + A simple ``/health`` endpoint is provided for liveness probes. 
+ """ app = FastAPI( title=settings.app_name, version="0.5.0", @@ -182,6 +324,7 @@ def create_app() -> FastAPI: description="Agentic Reliability Framework (ARF) API", ) + # ── CORS ────────────────────────────────────────────────── allowed_origins = ["https://arf-frontend-sandy.vercel.app"] app.add_middleware( CORSMiddleware, @@ -192,67 +335,64 @@ def create_app() -> FastAPI: ) logger.debug("CORS middleware configured") + # ── Rate limiter ────────────────────────────────────────── if SLOWAPI_AVAILABLE: app.state.limiter = limiter app.add_exception_handler( - RateLimitExceeded, - _rate_limit_exceeded_handler) + RateLimitExceeded, _rate_limit_exceeded_handler + ) app.add_middleware(SlowAPIMiddleware) logger.debug("Rate limiter middleware configured") else: logger.debug("Rate limiter disabled (slowapi not installed)") + # ── Prometheus ──────────────────────────────────────────── if PROMETHEUS_AVAILABLE: Instrumentator().instrument(app).expose(app) logger.debug("Prometheus instrumentator configured") else: - logger.debug( - "Prometheus instrumentator disabled (module not installed)") + logger.debug("Prometheus instrumentator disabled (module not installed)") - # Include routers + # ── API Routers ─────────────────────────────────────────── app.include_router( - routes_incidents.router, - prefix="/api/v1", - tags=["incidents"]) + routes_incidents.router, prefix="/api/v1", tags=["incidents"] + ) app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"]) app.include_router( - routes_intents.router, - prefix="/api/v1", - tags=["intents"]) + routes_intents.router, prefix="/api/v1", tags=["intents"] + ) app.include_router( - routes_history.router, - prefix="/api/v1", - tags=["history"]) + routes_history.router, prefix="/api/v1", tags=["history"] + ) app.include_router( - routes_governance.router, - prefix="/api/v1", - tags=["governance"]) + routes_governance.router, prefix="/api/v1", tags=["governance"] + ) app.include_router( - routes_memory.router, - 
prefix="/v1/memory", - tags=["memory"]) + routes_memory.router, prefix="/v1/memory", tags=["memory"] + ) app.include_router( - routes_admin.router, - prefix="/api/v1", - tags=["admin"]) + routes_admin.router, prefix="/api/v1", tags=["admin"] + ) app.include_router( - routes_payments.router, - prefix="/api/v1", - tags=["payments"]) + routes_payments.router, prefix="/api/v1", tags=["payments"] + ) app.include_router( - webhooks.router, - tags=["webhooks"]) + webhooks.router, tags=["webhooks"] + ) app.include_router( - routes_users.router, - prefix="/api/v1", - tags=["users"]) + routes_users.router, prefix="/api/v1", tags=["users"] + ) + app.include_router( + routes_pricing.router, prefix="/api/v1", tags=["pricing"] + ) logger.debug("All API routers included") @app.get("/health", tags=["health"]) async def health() -> Dict[str, str]: + """Liveness probe – returns 200 when the application is running.""" return {"status": "ok"} return app -app = create_app() \ No newline at end of file +app = create_app() diff --git a/app/models/__init__.py b/app/models/__init__.py index 9e78b5bc94b28aa883ee926f4b790e8c9890e2fc..89d3c13df3d0724e5877f5032c275f04b7782d51 100644 --- a/app/models/__init__.py +++ b/app/models/__init__.py @@ -26,4 +26,4 @@ __all__ = [ "PermissionLevel", "Environment", "ChangeScope", -] \ No newline at end of file +] diff --git a/app/models/incident_models.py b/app/models/incident_models.py index 985e4e95aeb600c3d696451697b99b3179ec1fde..4fc9ad42cdf2e41d175bcc24ed20b2532619d9ef 100644 --- a/app/models/incident_models.py +++ b/app/models/incident_models.py @@ -4,10 +4,11 @@ from pydantic import BaseModel, Field class IncidentReport(BaseModel): service: str = Field(..., description="Service name") - signal_type: Literal["latency", "error_rate", "cpu", "memory"] = Field(..., description="Type of signal") + signal_type: Literal["latency", "error_rate", "cpu", + "memory"] = Field(..., description="Type of signal") value: float = Field(..., description="Measured 
value") class IncidentResponse(BaseModel): service: str - reliability: float \ No newline at end of file + reliability: float diff --git a/app/models/infrastructure_intents.py b/app/models/infrastructure_intents.py index 5a9ddfdb8d8b9d34f4fa18e310789fc1d1a05dd2..bf41c22c79e0ca228104763b12748905ea71f360 100644 --- a/app/models/infrastructure_intents.py +++ b/app/models/infrastructure_intents.py @@ -1,45 +1,12 @@ from pydantic import BaseModel, Field, field_validator from typing import Optional, Literal, List, Any, Dict -from enum import Enum -# --------------------------------------------------------------------------- -# Fallback enums – used when the proprietary core engine is not installed. -# These mirror the canonical definitions from the public specification. -# --------------------------------------------------------------------------- -class ResourceType(str, Enum): - DATABASE = "database" - STORAGE_ACCOUNT = "storage_account" - VM = "vm" - VIRTUAL_NETWORK = "virtual_network" - # enterprise-only types omitted for public sandbox - -class PermissionLevel(str, Enum): - READ = "read" - WRITE = "write" - ADMIN = "admin" - -class Environment(str, Enum): - DEV = "dev" - STAGING = "staging" - PROD = "prod" - -class ChangeScope(str, Enum): - MINOR = "minor" - MAJOR = "major" - CRITICAL = "critical" -# --------------------------------------------------------------------------- - -# Optional import from protected core engine – not available in public Spaces -try: - from agentic_reliability_framework.core.governance.intents import ( - ResourceType, - PermissionLevel, - Environment, - ChangeScope, - ) -except ImportError: - # The fallback enums defined above are used. 
- pass +from agentic_reliability_framework.core.governance.intents import ( + ResourceType, + PermissionLevel, + Environment, + ChangeScope, +) class BaseIntentRequest(BaseModel): @@ -91,4 +58,4 @@ class DeployConfigurationRequest(BaseIntentRequest): return v -InfrastructureIntentRequest = ProvisionResourceRequest | GrantAccessRequest | DeployConfigurationRequest \ No newline at end of file +InfrastructureIntentRequest = ProvisionResourceRequest | GrantAccessRequest | DeployConfigurationRequest diff --git a/app/models/intent_models.py b/app/models/intent_models.py index f8d9b8b13bcb0feb7a2dba8c045589510e3f8de0..3040ffb4a21fba7bb5561ec37d629b660178a212 100644 --- a/app/models/intent_models.py +++ b/app/models/intent_models.py @@ -11,4 +11,4 @@ class IntentSimulation(BaseModel): class IntentSimulationResponse(BaseModel): risk_score: float - recommendation: Literal["safe_to_execute", "requires_approval", "blocked"] \ No newline at end of file + recommendation: Literal["safe_to_execute", "requires_approval", "blocked"] diff --git a/app/models/risk_models.py b/app/models/risk_models.py index ac1a9fde739f58b38bcd4142711cefcbce9d13bf..083a12c7e72ed723b0dd1d9c148da2bed676669c 100644 --- a/app/models/risk_models.py +++ b/app/models/risk_models.py @@ -4,4 +4,4 @@ from pydantic import BaseModel class RiskResponse(BaseModel): system_risk: float - status: Literal["low", "moderate", "high", "critical"] \ No newline at end of file + status: Literal["low", "moderate", "high", "critical"] diff --git a/app/services/incident_service.py b/app/services/incident_service.py index 5fc88ca60a509da06c8a583b9934e186330c5622..3cf560ec571f9efec5f18aca2ad0284697fd2f26 100644 --- a/app/services/incident_service.py +++ b/app/services/incident_service.py @@ -3,5 +3,6 @@ from app.models.incident_models import IncidentReport def process_incident(report: IncidentReport) -> float: - reliability = signal_to_reliability(report.value, signal_type=report.signal_type) + reliability = signal_to_reliability( 
+ report.value, signal_type=report.signal_type) return reliability diff --git a/app/services/intent_adapter.py b/app/services/intent_adapter.py index f79fb8f906d12ad797401f50cd9d826ed8340d4d..791202aa0711a0b1b612265ce58d80c66ca5e2db 100644 --- a/app/services/intent_adapter.py +++ b/app/services/intent_adapter.py @@ -1,66 +1,163 @@ -from pydantic import BaseModel -from typing import Optional, Dict, Any - -# --------------------------------------------------------------------------- -# Local fallback intent classes – mirrors the proprietary core engine's contracts -# --------------------------------------------------------------------------- -class ProvisionResourceIntent(BaseModel): - resource_type: str - region: str - size: str - configuration: Dict[str, Any] = {} - environment: str - requester: str - provenance: Dict[str, Any] = {} - -class GrantAccessIntent(BaseModel): - principal: str - permission_level: str - resource_scope: str - justification: Optional[str] = None - requester: str - provenance: Dict[str, Any] = {} - -class DeployConfigurationIntent(BaseModel): - service_name: str - change_scope: str - deployment_target: str - risk_level_hint: Optional[float] = None - configuration: Dict[str, Any] = {} - requester: str - provenance: Dict[str, Any] = {} -# --------------------------------------------------------------------------- - - -def to_oss_intent(api_request): - if api_request.intent_type == "provision_resource": - return ProvisionResourceIntent( - resource_type=api_request.resource_type.value if hasattr(api_request.resource_type, 'value') else str(api_request.resource_type), - region=api_request.region, - size=api_request.size, - configuration=api_request.configuration, - environment=api_request.environment.value if hasattr(api_request.environment, 'value') else str(api_request.environment), - requester=api_request.requester, - provenance=api_request.provenance, - ) - elif api_request.intent_type == "grant_access": - return GrantAccessIntent( - 
principal=api_request.principal, - permission_level=api_request.permission_level.value if hasattr(api_request.permission_level, 'value') else str(api_request.permission_level), - resource_scope=api_request.resource_scope, - justification=api_request.justification, - requester=api_request.requester, - provenance=api_request.provenance, - ) - elif api_request.intent_type == "deploy_config": - return DeployConfigurationIntent( - service_name=api_request.service_name, - change_scope=api_request.change_scope.value if hasattr(api_request.change_scope, 'value') else str(api_request.change_scope), - deployment_target=api_request.deployment_target.value if hasattr(api_request.deployment_target, 'value') else str(api_request.deployment_target), - risk_level_hint=api_request.risk_level_hint, - configuration=api_request.configuration, - requester=api_request.requester, - provenance=api_request.provenance, - ) +""" +Intent Adapter – converts API request payloads to ARF InfrastructureIntent objects. +Strict validation, no dummy fallbacks. All conversions are deterministic. +""" + +import logging +from typing import Any, Dict + +from agentic_reliability_framework.core.governance.intents import ( + ProvisionResourceIntent, + GrantAccessIntent, + DeployConfigurationIntent, + InfrastructureIntent, +) + +logger = logging.getLogger(__name__) + + +class IntentAdapterError(Exception): + """Raised when intent conversion fails due to invalid input.""" + pass + + +# Allowed values (from the framework's Literal definitions) +VALID_ENVIRONMENTS = {"dev", "staging", "prod", "test"} +VALID_RESOURCE_TYPES = { + "vm", + "storage_account", + "database", + "kubernetes_cluster", + "function_app", + "virtual_network"} + + +def to_oss_intent(api_request: Any) -> InfrastructureIntent: + """ + Convert an API request object to the corresponding OSS InfrastructureIntent. 
+ """ + # Extract data + if hasattr(api_request, "model_dump"): + data = api_request.model_dump() + elif hasattr(api_request, "dict"): + data = api_request.dict() else: - raise ValueError(f"Unknown intent type: {api_request.intent_type}") \ No newline at end of file + data = dict(api_request) + + intent_type = data.get("intent_type") + if not intent_type: + raise IntentAdapterError("Missing 'intent_type' in request") + + environment = data.get("environment") + if not environment: + raise IntentAdapterError("Missing 'environment' field") + if environment not in VALID_ENVIRONMENTS: + raise IntentAdapterError( + f"Invalid environment: {environment}. Must be one of {VALID_ENVIRONMENTS}") + + requester = data.get("requester") + if not requester: + raise IntentAdapterError("Missing 'requester' field") + + if intent_type == "provision_resource": + return _to_provision_intent(data, environment, requester) + elif intent_type == "grant_access": + return _to_grant_intent(data, requester) # environment NOT passed + elif intent_type == "deploy_config": + return _to_deploy_intent(data, requester) # environment NOT passed + else: + raise IntentAdapterError(f"Unknown intent_type: {intent_type}") + + +def _to_provision_intent(data: Dict[str, + Any], + environment: str, + requester: str) -> ProvisionResourceIntent: + resource_type_str = data.get("resource_type") + if not resource_type_str: + raise IntentAdapterError( + "Missing 'resource_type' for provision_resource intent") + if resource_type_str not in VALID_RESOURCE_TYPES: + raise IntentAdapterError(f"Invalid resource_type: {resource_type_str}") + + region = data.get("region") + if not region: + raise IntentAdapterError( + "Missing 'region' for provision_resource intent") + + size = data.get("size") + if not size: + raise IntentAdapterError( + "Missing 'size' for provision_resource intent") + + return ProvisionResourceIntent( + resource_type=resource_type_str, + region=region, + size=size, + environment=environment, + 
requester=requester, + configuration=data.get("configuration", {}), + provenance=data.get("provenance", {}), + ) + + +def _to_grant_intent(data: Dict[str, Any], + requester: str) -> GrantAccessIntent: + principal = data.get("principal") + if not principal: + raise IntentAdapterError("Missing 'principal' for grant_access intent") + + permission_level = data.get("permission_level") + if not permission_level: + raise IntentAdapterError( + "Missing 'permission_level' for grant_access intent") + + resource_scope = data.get("resource_scope") + if not resource_scope: + raise IntentAdapterError( + "Missing 'resource_scope' for grant_access intent") + + return GrantAccessIntent( + principal=principal, + permission_level=permission_level, + resource_scope=resource_scope, + requester=requester, + justification=data.get("justification", ""), + provenance=data.get("provenance", {}), + ) + + +def _to_deploy_intent(data: Dict[str, Any], + requester: str) -> DeployConfigurationIntent: + service_name = data.get("service_name") + if not service_name: + raise IntentAdapterError( + "Missing 'service_name' for deploy_config intent") + + change_scope = data.get("change_scope") + if not change_scope: + raise IntentAdapterError( + "Missing 'change_scope' for deploy_config intent") + + deployment_target = data.get("deployment_target") + if not deployment_target: + raise IntentAdapterError( + "Missing 'deployment_target' for deploy_config intent") + + # risk_level_hint expects a float; if not a number, set to None + risk_hint = data.get("risk_level_hint") + if risk_hint is not None: + try: + risk_hint = float(risk_hint) + except (TypeError, ValueError): + risk_hint = None + + return DeployConfigurationIntent( + service_name=service_name, + change_scope=change_scope, + deployment_target=deployment_target, + requester=requester, + risk_level_hint=risk_hint, + configuration=data.get("configuration", {}), + provenance=data.get("provenance", {}), + ) diff --git a/app/services/intent_service.py 
b/app/services/intent_service.py index 87366bd657fd5a8d3fcef9bede81dfba727b156b..50d552039ea0ac79e45cf8fe33ec472c56df8110 100644 --- a/app/services/intent_service.py +++ b/app/services/intent_service.py @@ -7,7 +7,8 @@ logger = logging.getLogger(__name__) # Note: This endpoint is deprecated. Use /v1/intents/evaluate instead. def simulate_intent(intent: IntentSimulation) -> dict: - logger.warning("Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.") + logger.warning( + "Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.") # For backward compatibility, we still use random risk. risk_score = random.uniform(0, 1) if risk_score < 0.2: diff --git a/app/services/intent_store.py b/app/services/intent_store.py index a84b680ec5e89a9ea6c0cf9d45f84ef85fc6cdd6..a72a77ddde96edabf931deb9bf1433257acd8b81 100644 --- a/app/services/intent_store.py +++ b/app/services/intent_store.py @@ -13,7 +13,8 @@ def save_evaluated_intent( environment: str, risk_score: float ) -> IntentDB: - existing = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none() + existing = db.query(IntentDB).filter( + IntentDB.deterministic_id == deterministic_id).one_or_none() if existing: existing.evaluated_at = datetime.datetime.utcnow() existing.risk_score = str(risk_score) @@ -38,5 +39,8 @@ def save_evaluated_intent( return intent -def get_intent_by_deterministic_id(db: Session, deterministic_id: str) -> Optional[IntentDB]: - return db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none() +def get_intent_by_deterministic_id( + db: Session, + deterministic_id: str) -> Optional[IntentDB]: + return db.query(IntentDB).filter( + IntentDB.deterministic_id == deterministic_id).one_or_none() diff --git a/app/services/outcome_service.py b/app/services/outcome_service.py index f94eec3bc23289fea1b8c89870c5f7e2f4125026..2d681824adfcb78eb68bdf1c05253dbad84f25b6 100644 --- 
a/app/services/outcome_service.py +++ b/app/services/outcome_service.py @@ -1,42 +1,53 @@ +"""Outcome recording with idempotency, no dummy fallbacks, and timezone-aware timestamps.""" + import datetime import logging from typing import Optional, Dict, Any from sqlalchemy.orm import Session +from sqlalchemy.exc import IntegrityError + +from agentic_reliability_framework.core.governance.risk_engine import RiskEngine +from agentic_reliability_framework.core.governance.intents import ( + InfrastructureIntent, + ProvisionResourceIntent, + GrantAccessIntent, + DeployConfigurationIntent, +) +from app.database.models_intents import IntentDB, OutcomeDB, BetaStateDB + +logger = logging.getLogger(__name__) -from app.database.models_intents import IntentDB, OutcomeDB # --------------------------------------------------------------------------- -# Local fallback types – dummy RiskEngine and intent classes -# --------------------------------------------------------------------------- -class RiskEngine: - def update_outcome(self, intent, success): - pass - -class ProvisionResourceIntent: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - -class GrantAccessIntent: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - -class DeployConfigurationIntent: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) +# NEW: small helper to persist the conjugate posterior state # --------------------------------------------------------------------------- - -logger = logging.getLogger(__name__) +def _persist_beta_state(db: Session, risk_engine: RiskEngine) -> None: + """ + Write the current Beta posterior parameters to the beta_state table. + This is called after every outcome update so that online learning + survives restarts. 
+ """ + try: + state = risk_engine.beta_store.get_state() + for cat, (alpha, beta) in state.items(): + # Upsert: if the category already exists, update it + db.merge(BetaStateDB(category=cat.value, alpha=alpha, beta=beta)) + db.commit() + logger.debug("Persisted Beta posterior parameters to database.") + except Exception as e: + db.rollback() + logger.error("Failed to persist beta state: %s", e) class OutcomeConflictError(Exception): + """Raised when an outcome already exists for the same intent with a different result.""" pass -def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]): +def reconstruct_oss_intent_from_json( + oss_json: Dict[str, Any]) -> InfrastructureIntent: + """Reconstruct OSS intent from stored JSON. Raises ValueError on failure.""" intent_type = oss_json.get("intent_type") if intent_type == "provision_resource": return ProvisionResourceIntent(**oss_json) @@ -46,22 +57,7 @@ def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]): return DeployConfigurationIntent(**oss_json) else: raise ValueError( - f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}" - ) - - -def _create_dummy_intent(intent_type: str): - if intent_type == "ProvisionResourceIntent": - return ProvisionResourceIntent( - resource_type="vm", - region="eastus", - size="Standard_D2s_v3", - environment="dev", - requester="system" - ) - else: - logger.warning("Dummy intent creation not implemented for %s", intent_type) - return None + f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}") def record_outcome( @@ -70,50 +66,114 @@ def record_outcome( success: bool, recorded_by: Optional[str], notes: Optional[str], - risk_engine: RiskEngine + risk_engine: RiskEngine, + idempotency_key: Optional[str] = None, ) -> OutcomeDB: - intent = db.query(IntentDB).filter(IntentDB.deterministic_id == deterministic_id).one_or_none() + """ + Record an outcome for a previously evaluated intent. 
+ + Idempotent: calling twice with the same (deterministic_id, success) returns the same record. + If the outcome already exists with a different success value, raises OutcomeConflictError. + + No dummy intents are created. If the OSS intent cannot be reconstructed, the risk engine + is NOT updated – we log an error and still record the outcome. + + Args: + db: SQLAlchemy session. + deterministic_id: Unique identifier of the original intent. + success: Whether the action succeeded (True) or failed (False). + recorded_by: Optional user or system identifier. + notes: Optional human-readable notes. + risk_engine: ARF risk engine instance (may be updated). + idempotency_key: Optional caller-provided idempotency token. + + Returns: + The recorded OutcomeDB object. + + Raises: + ValueError: If intent not found or reconstruction fails fatally. + OutcomeConflictError: If a conflicting outcome already exists. + """ + # 1. Fetch the original intent record + intent = db.query(IntentDB).filter( + IntentDB.deterministic_id == deterministic_id).one_or_none() if not intent: raise ValueError(f"Intent not found: {deterministic_id}") - existing_outcome = db.query(OutcomeDB).filter(OutcomeDB.intent_id == intent.id).one_or_none() + # 2. Idempotency / conflict check with database-level uniqueness + existing_outcome = db.query(OutcomeDB).filter( + OutcomeDB.intent_id == intent.id).one_or_none() if existing_outcome: if existing_outcome.success == success: return existing_outcome - raise OutcomeConflictError("Outcome already recorded with different result") + db.rollback() + raise OutcomeConflictError( + f"Outcome already recorded for intent {deterministic_id} with different result " + f"(existing={existing_outcome.success}, new={success})" + ) + # 3. 
Create outcome record outcome = OutcomeDB( intent_id=intent.id, success=bool(success), recorded_by=recorded_by, notes=notes, - recorded_at=datetime.datetime.utcnow() + recorded_at=datetime.datetime.now(datetime.timezone.utc), + idempotency_key=idempotency_key, ) db.add(outcome) - db.commit() - db.refresh(outcome) - # Reconstruct intent and update risk engine (mock) + # 4. Attempt to commit; handle duplicate key errors for idempotency + try: + db.commit() + db.refresh(outcome) + except IntegrityError as e: + db.rollback() + if "idempotency_key" in str(e) and idempotency_key: + existing = db.query(OutcomeDB).filter( + OutcomeDB.idempotency_key == idempotency_key).first() + if existing: + logger.info( + "Idempotent request for key %s, returning existing outcome", + idempotency_key) + return existing + raise + + # 5. Update RiskEngine ONLY if we can reconstruct a valid OSS intent oss_intent = None if intent.oss_payload: try: oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload) except Exception as e: - logger.warning( - "Failed to reconstruct OSS intent for %s: %s. Using dummy fallback.", - deterministic_id, e - ) - oss_intent = _create_dummy_intent(intent.intent_type) + logger.error( + "Failed to reconstruct OSS intent for %s: %s. 
RiskEngine will NOT be updated.", + deterministic_id, + e, + exc_info=True) else: - oss_intent = _create_dummy_intent(intent.intent_type) + logger.warning( + "No oss_payload stored for intent %s – cannot update RiskEngine.", + deterministic_id + ) if oss_intent is not None: try: risk_engine.update_outcome(oss_intent, success) + + # ---------------------------------------------------------------- + # PERSISTENCE: after updating the conjugate posterior, write it + # ---------------------------------------------------------------- + _persist_beta_state(db, risk_engine) + except Exception as e: logger.exception( "Failed to update RiskEngine after recording outcome for intent %s: %s", - deterministic_id, e - ) + deterministic_id, + e) + else: + logger.info( + "Skipped RiskEngine update for intent %s (no valid OSS intent)", + deterministic_id + ) - return outcome \ No newline at end of file + return outcome diff --git a/app/services/risk_service.py b/app/services/risk_service.py index 936e5f7db58279f8bae6d775181b40877c094f91..64737b1895c7a4a3212e423d9083f9c6f0f28ceb 100644 --- a/app/services/risk_service.py +++ b/app/services/risk_service.py @@ -1,97 +1,376 @@ +""" +Risk service – integrates ARF risk engine, policy engine, and decision engine. +Deterministic, no random fallbacks, explicit error handling. + +Version: 2026-05-04 – added Prometheus metrics for observability. 
+""" + +import json +import logging +import os +import time from typing import Optional, List, Dict, Any -from enum import Enum - -# --------------------------------------------------------------------------- -# Local fallback types – everything needed for the sandbox mock -# --------------------------------------------------------------------------- -class HealingAction(str, Enum): - NO_ACTION = "NO_ACTION" - RESTART_CONTAINER = "RESTART_CONTAINER" - SCALE_OUT = "SCALE_OUT" - ROLLBACK = "ROLLBACK" - CIRCUIT_BREAKER = "CIRCUIT_BREAKER" - TRAFFIC_SHIFT = "TRAFFIC_SHIFT" - ALERT_TEAM = "ALERT_TEAM" - -class InfrastructureIntent: - pass - -class RiskEngine: - def calculate_risk(self, intent, cost_estimate, policy_violations): - # Return a mock risk score - return 0.35, "Mock sandbox risk", {"conjugate_mean": 0.35} - -class PolicyEngine: - def __init__(self): - self.policies = [] - self.use_decision_engine = True - def evaluate_policies(self, event): - return [HealingAction.NO_ACTION] - -class DecisionEngine: - def __init__(self, **kwargs): + +from agentic_reliability_framework.core.governance.risk_engine import RiskEngine +from agentic_reliability_framework.core.governance.intents import InfrastructureIntent +from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction +from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine +from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine +from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory +from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk + +# ── optional tracing ───────────────────────────────────────── +try: + from opentelemetry import trace + _tracer = trace.get_tracer(__name__) + OTEL_AVAILABLE = True +except ImportError: + OTEL_AVAILABLE = False + _tracer = None + +# ── Prometheus metrics (always registered; no‑op if not scraped) ─ +from prometheus_client 
import Counter, Histogram + +_EVAL_COUNTER = Counter( + "arf_evaluations_total", + "Total evaluation calls (intent + healing), partitioned by engine and status.", + ["engine", "status"], +) + +_EVAL_DURATION = Histogram( + "arf_evaluation_duration_seconds", + "End‑to‑end latency of evaluation calls.", + ["engine"], + buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0), +) + +_RUST_AGREEMENT = Counter( + "arf_rust_agreement_total", + "Agreement between Rust enforcer and Python policy evaluation.", + ["result"], # "agreed" or "diverged" +) + +# ── optional Rust enforcer (shadow mode) ────────────────────── +_RUST_ENFORCER_AVAILABLE = False +_rust_evaluator = None # singleton per process +_rust_policy_json: Optional[str] = None + +if os.getenv("ARF_USE_RUST_ENFORCER", "false").lower() == "true": + try: + import arf_enforcer + _RUST_ENFORCER_AVAILABLE = True + except ImportError: pass - def select_optimal_action(self, actions, event, **kwargs): - return type('obj', (object,), { - 'best_action': HealingAction.NO_ACTION, - 'expected_utility': 0.0, - 'alternatives': [], - 'explanation': 'Mock decision engine in sandbox', - 'raw_data': {}, - })() - def compute_risk(self, action, event, component): - return 0.0 - -class RAGGraphMemory: - pass - -class ReliabilityEvent: - component: str = "default" - latency_p99: float = 0.0 - error_rate: float = 0.0 - cpu_util: Optional[float] = None - memory_util: Optional[float] = None -# --------------------------------------------------------------------------- + +# Default OSS policy tree – mirrors the hard‑coded rules in the Python PolicyEvaluator +# that check region, resource type, and max permission level. 
+_OSS_POLICY_TREE_JSON = json.dumps({ + "And": [ + {"Atomic": {"RegionAllowed": {"allowed_regions": ["eastus"]}}}, + {"Atomic": {"ResourceTypeRestricted": { + "forbidden_types": ["DATABASE_DROP", "FULL_ROLLOUT", "SYSTEM_SHUTDOWN", "SECRET_ROTATION"] + }}}, + {"Atomic": {"MaxPermissionLevel": {"max_level": "admin"}}} + ] +}) + + +def _ensure_rust_evaluator() -> bool: + """Lazy initialise the Rust policy evaluator. Returns True on success.""" + global _rust_evaluator, _rust_policy_json + if _rust_evaluator is not None: + return True + if not _RUST_ENFORCER_AVAILABLE: + return False + try: + _rust_policy_json = _OSS_POLICY_TREE_JSON + _rust_evaluator = arf_enforcer.PyPolicyEvaluator(_rust_policy_json) + return True + except Exception: + _rust_evaluator = None + return False + + +logger = logging.getLogger(__name__) def evaluate_intent( engine: RiskEngine, - intent, + intent: InfrastructureIntent, cost_estimate: Optional[float], policy_violations: List[str] ) -> dict: - """Mock sandbox evaluation – returns a fixed risk score.""" + """ + Evaluate an infrastructure intent using the Bayesian risk engine. + + Optionally shadows the policy evaluation with the Rust enforcer when + the environment variable ARF_USE_RUST_ENFORCER is set to "true". + Any divergence is logged and counted as a Prometheus metric. + + Parameters + ---------- + engine : RiskEngine + Initialised ARF Bayesian risk engine. + intent : InfrastructureIntent + The infrastructure request to evaluate. + cost_estimate : float or None + Estimated monthly cost (used by cost‑threshold policies). + policy_violations : list[str] + Pre‑computed policy violation strings (from the Python evaluator). + + Returns + ------- + dict + Keys: risk_score, explanation, contributions. 
+ """ + t0 = time.monotonic() + span = None + if OTEL_AVAILABLE and _tracer: + span = _tracer.start_span("risk_service.evaluate_intent") + span.set_attribute("intent_type", type(intent).__name__) + + # ── Shadow Rust enforcer (best‑effort, non‑blocking) ────── + if _RUST_ENFORCER_AVAILABLE and _ensure_rust_evaluator(): + try: + rust_intent = { + "action": getattr(intent, "intent_type", "unknown"), + "component": getattr(intent, "service_name", "unknown"), + "region": getattr(intent, "region", None), + "resource_type": getattr(intent, "resource_type", None), + "permission_level": getattr(intent, "permission_level", None), + "extra": {} + } + rust_raw = _rust_evaluator.evaluate( + json.dumps(rust_intent), cost_estimate + ) + rust_violations = json.loads(rust_raw) + + agreed = set(rust_violations) == set(policy_violations) + _RUST_AGREEMENT.labels(result="agreed" if agreed else "diverged").inc() + if not agreed: + msg = ( + "Rust enforcer divergence: " + f"Rust={sorted(rust_violations)} Python={sorted(policy_violations)}" + ) + logger.warning(msg) + if span: + span.add_event("rust_enforcer_divergence", { + "rust_violations": rust_violations, + "python_violations": policy_violations + }) + except Exception as exc: + logger.debug("Rust enforcer shadow evaluation failed: %s", exc) + + # ── Core risk evaluation ────────────────────────────────── + + # ── Automated canary promotion ────────────────────────── + if _RUST_ENFORCER_AVAILABLE and os.getenv("ARF_RUST_CANARY", "false").lower() == "true": + try: + from prometheus_client import REGISTRY + lower = REGISTRY.get_sample_value("arf_rust_agreement_lower_bound", {}) + if lower is not None and lower > 0.9999: + policy_violations = rust_violations + if span: + span.set_attribute("rust_enforcer_active", True) + except Exception: + pass + try: + score, explanation, contributions = engine.calculate_risk( + intent=intent, + cost_estimate=cost_estimate, + policy_violations=policy_violations + ) + engine_label = "python" + status 
= "success" + except Exception: + _EVAL_COUNTER.labels(engine="python", status="error").inc() + _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0) + raise + + _EVAL_COUNTER.labels(engine=engine_label, status=status).inc() + _EVAL_DURATION.labels(engine=engine_label).observe(time.monotonic() - t0) + + if span: + span.set_attribute("risk_score", score) + if _RUST_ENFORCER_AVAILABLE: + span.set_attribute("rust_enforcer_available", True) + span.end() + return { - "risk_score": 0.38, - "explanation": "Sandbox mock: high latency detected, escalating.", - "contributions": {"conjugate_mean": 0.38} + "risk_score": score, + "explanation": explanation, + "contributions": contributions } def evaluate_healing_decision( - event, + event: ReliabilityEvent, policy_engine: PolicyEngine, decision_engine: Optional[DecisionEngine] = None, rag_graph: Optional[RAGGraphMemory] = None, model=None, tokenizer=None, ) -> Dict[str, Any]: - """Mock sandbox healing evaluation – always returns NO_ACTION.""" - return { - "risk_score": 0.0, - "selected_action": HealingAction.NO_ACTION.value, - "expected_utility": 0.0, - "alternatives": [], - "explanation": "Sandbox mock: no healing actions evaluated.", - "epistemic_signals": { + """ + Evaluate healing actions for a given reliability event using decision‑theoretic selection. + Includes epistemic risk signals from the eclipse probe. + + Parameters + ---------- + event : ReliabilityEvent + The incident event containing latency, error rate, etc. + policy_engine : PolicyEngine + The ARF healing policy engine with configured policies. + decision_engine : DecisionEngine, optional + If omitted, a default instance is created. + rag_graph : RAGGraphMemory, optional + Semantic memory for similar incident retrieval. + model, tokenizer : optional + HuggingFace model and tokenizer for epistemic risk computation. + + Returns + ------- + dict + Keys: risk_score, selected_action, expected_utility, alternatives, + explanation, epistemic_signals. 
+ """ + t0 = time.monotonic() + span = None + if OTEL_AVAILABLE and _tracer: + span = _tracer.start_span("risk_service.evaluate_healing") + span.set_attribute("component", event.component) + + # If decision_engine not provided, try to get from policy_engine + if decision_engine is None and hasattr(policy_engine, 'decision_engine'): + decision_engine = policy_engine.decision_engine + + # If still None, create a minimal one (global stats only) + if decision_engine is None: + logger.debug("No DecisionEngine provided; creating default instance") + decision_engine = DecisionEngine(rag_graph=rag_graph) + + # Get raw candidate actions (by temporarily disabling decision engine) + orig_use = policy_engine.use_decision_engine + try: + policy_engine.use_decision_engine = False + raw_actions = policy_engine.evaluate_policies(event) + finally: + policy_engine.use_decision_engine = orig_use + + # If no actions, return NO_ACTION + if not raw_actions or raw_actions == [HealingAction.NO_ACTION]: + if span: + span.set_attribute("selected_action", HealingAction.NO_ACTION.value) + span.end() + _EVAL_COUNTER.labels(engine="python", status="success").inc() + _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0) + return { + "risk_score": 0.0, + "selected_action": HealingAction.NO_ACTION.value, + "expected_utility": 0.0, + "alternatives": [], + "explanation": "No candidate actions triggered.", + "epistemic_signals": None, + } + + # Build reasoning text from policies that triggered the actions + reasoning_parts = [] + for policy in policy_engine.policies: + if any(a in policy.actions for a in raw_actions): + conditions_str = ", ".join( + f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions + ) + reasoning_parts.append( + f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}" + ) + reasoning_text = " ".join(reasoning_parts) + + # Build evidence text from the event + evidence_text = ( + f"Component: 
{event.component}, " + f"latency_p99: {event.latency_p99}, " + f"error_rate: {event.error_rate}, " + f"cpu_util: {event.cpu_util}, " + f"memory_util: {event.memory_util}" + ) + + # Compute epistemic signals (if model/tokenizer provided) + epistemic_signals = None + if model is not None and tokenizer is not None: + try: + epistemic_signals = compute_epistemic_risk( + reasoning_text, evidence_text, model, tokenizer + ) + except Exception as e: + logger.error(f"Failed to compute epistemic risk: {e}") + epistemic_signals = { + "entropy": 0.0, + "contradiction": 0.0, + "evidence_lift": 0.0, + "hallucination_risk": 0.0, + } + else: + logger.debug("Epistemic model/tokenizer not provided; using zero signals") + epistemic_signals = { "entropy": 0.0, "contradiction": 0.0, "evidence_lift": 0.0, "hallucination_risk": 0.0, - }, + } + + # Run decision engine to get best action and alternatives + decision = decision_engine.select_optimal_action( + raw_actions, event, component=event.component, + epistemic_signals=epistemic_signals + ) + + # Extract risk of the selected action + risk_score = None + for alt in decision.alternatives: + if alt.action == decision.best_action: + risk_score = alt.risk + break + if risk_score is None: + # Compute risk separately + risk_score = decision_engine.compute_risk( + decision.best_action, event, event.component) + + # Format alternatives (top 3 only) + alt_list = [] + for alt in decision.alternatives[:3]: + alt_list.append({ + "action": alt.action.value, + "expected_utility": alt.utility, + "risk": alt.risk, + }) + + # ── Metrics & span finalisation ─────────────────────────── + _EVAL_COUNTER.labels(engine="python", status="success").inc() + _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0) + + if span: + span.set_attribute("risk_score", risk_score) + span.set_attribute("selected_action", decision.best_action.value) + span.set_attribute("expected_utility", decision.expected_utility) + span.end() + + return { + "risk_score": 
risk_score, + "selected_action": decision.best_action.value, + "expected_utility": decision.expected_utility, + "alternatives": alt_list, + "explanation": decision.explanation, + "raw_decision": decision.raw_data, + "epistemic_signals": epistemic_signals, } def get_system_risk() -> float: - import random - return round(random.uniform(0, 1), 2) \ No newline at end of file + """ + Return an aggregated risk score across all monitored components. + This is a placeholder – the endpoint is deprecated. + Raises NotImplementedError to avoid random fallback. + """ + raise NotImplementedError( + "get_system_risk is deprecated. Use component‑level risk evaluation instead." + ) diff --git a/app/services/wilson_monitor.py b/app/services/wilson_monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..b04fab966a76706649cd38881867873804c03a6b --- /dev/null +++ b/app/services/wilson_monitor.py @@ -0,0 +1,56 @@ +# Wilson confidence interval monitor for Rust enforcer agreement +from prometheus_client import Gauge +import math + + +LOWER_BOUND = Gauge( + "arf_rust_agreement_lower_bound", + "Lower 99.9% Wilson bound on agreement rate", +) + + +def wilson_lower(success, total, z=3.291): + """ + Compute the lower bound of the Wilson confidence interval + for a binomial proportion. + + Parameters + ---------- + success : int + Number of agreed evaluations. + total : int + Total number of shadow evaluations (agreed + diverged). + z : float + Z‑score for the desired confidence level (default 3.291 for 99.9%). + + Returns + ------- + float + Lower bound of the Wilson interval, clamped to [0, 1]. + """ + if total == 0: + return 0.0 + p = success / total + n = total + denom = 1 + z**2 / n + center = (p + z**2 / (2 * n)) / denom + margin = z * math.sqrt(p * (1 - p) / n + z**2 / (4 * n**2)) / denom + return max(0.0, center - margin) + + +def update(agreed, diverged): + """ + Query the Prometheus agreement counters and set the lower‑bound gauge. 
+ + This function is called periodically by the background thread started + in the API lifespan (see `app/main.py`). + + Parameters + ---------- + agreed : int + Current value of `arf_rust_agreement_total{result="agreed"}`. + diverged : int + Current value of `arf_rust_agreement_total{result="diverged"}`. + """ + lower = wilson_lower(agreed, agreed + diverged) + LOWER_BOUND.set(lower) diff --git a/docker-compose.test.yml b/docker-compose.test.yml new file mode 100644 index 0000000000000000000000000000000000000000..e4f10a38faa0a10c03087a2ab476ed108a4b9c54 --- /dev/null +++ b/docker-compose.test.yml @@ -0,0 +1,12 @@ +version: '3.8' + +services: + postgres: + image: postgres:15-alpine + environment: + POSTGRES_USER: testuser + POSTGRES_PASSWORD: testpass + POSTGRES_DB: testdb + ports: + - "5432:5432" + tmpfs: /var/lib/postgresql/data diff --git a/docs/authentication.md b/docs/authentication.md new file mode 100644 index 0000000000000000000000000000000000000000..3eb807a52b4ac7f128a7cdb7a36bac55eecad0c7 --- /dev/null +++ b/docs/authentication.md @@ -0,0 +1,25 @@ +# Authentication + +This page describes how to authenticate with the ARF API. + +Current status + +- There is no route-level or global authentication enforced by the API code in this repository. The API routes (including governance endpoints) do not validate API keys, tokens, or other credentials. + +What the code provides + +- The configuration model (app/core/config.py) exposes an optional `api_key` setting. This can be provided via environment variables or a `.env` file (the BaseSettings `env_file` is configured to read `.env`). + +What this means for you + +- Setting `API_KEY` in a `.env` file or environment variable will populate the `settings.api_key`, but the current route implementations do not check this value. 
+- If you require authentication, add a FastAPI dependency or middleware that checks `settings.api_key` (or another auth mechanism) and then apply it to routes or include it in a dependency override. + +Suggested minimal approach to enable API key checking + +- Implement a dependency in `app.api.deps` (e.g., `get_api_key`) that compares a header value to `settings.api_key` and raise `HTTPException(401)` when missing/invalid. +- Add that dependency to routers or individual endpoints where auth is required. + +Notes + +- Tests and example code in this repo currently run without auth. diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000000000000000000000000000000000000..44772d054d30ca5b2f59343447d9f4b4474060e0 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,55 @@ +# Development + +This page explains how to set up the ARF API for local development. + +Requirements + +- Python 3.10+ (match your environment) +- A virtual environment +- The project's Python dependencies (see `requirements.txt`). Note: `agentic-reliability-framework` is installed from a Git URL in `requirements.txt`. + +Quick start + +1. Clone the repository: + + git clone https://github.com/petter2025us/arf-api.git + cd arf-api + +2. Create and activate a virtualenv, then install dependencies: + + python -m venv .venv + source .venv/bin/activate # or .\.venv\Scripts\activate on Windows + pip install -r requirements.txt + +3. Configure environment variables (optional): + + - The project uses pydantic-settings with `env_file = ".env"` (see `app/core/config.py`). Create a `.env` file to set values locally. + + Relevant environment variables used by the code: + - ARF_HMC_MODEL (default: `models/hmc_model.json`) — path to HMC model JSON used by RiskEngine. + - ARF_USE_HYPERPRIORS (default: `false`) — set to `true` to enable hyperprior behavior. + - API_KEY (optional) — will populate `settings.api_key` but note that routes currently do not enforce authentication. 
+ - DATABASE_URL (optional) — configuration option in settings; tests use a local SQLite DB by default. + +4. Run the app with Uvicorn for development: + + uvicorn app.main:app --reload --port 8000 + + - The application mounts routes under the `/api/v1` prefix and exposes a health endpoint at `/health`. + +Running tests + +- Tests use an on-disk SQLite test database (`sqlite:///./test.db`) created by the test fixtures (`tests/conftest.py`). +- To run tests: + + pytest + +- The test fixtures override the dependency that provides DB sessions so tests run against the test database. + +Notes on the RiskEngine + +- The app initializes a `RiskEngine` instance at startup (in `app.main`) using environment variables noted above. The engine instance is stored in `app.state.risk_engine` and is used by the governance endpoints. + +Further development + +- If you add persistent intent storage or authentication, update tests and dependency overrides accordingly. diff --git a/docs/docs_endpoints.md b/docs/docs_endpoints.md new file mode 100644 index 0000000000000000000000000000000000000000..755d3cc1c588b48c12d6be230cdb65382d012583 --- /dev/null +++ b/docs/docs_endpoints.md @@ -0,0 +1,314 @@ +# API Endpoints + +This document describes the main ARF API endpoints and the request/response contracts used by the control plane. + +## POST `/api/v1/v1/incidents/evaluate` + +Evaluates a reported incident and returns a heuristic healing recommendation, a counterfactual causal explanation, and a simplified utility decision. + +This endpoint is **advisory only**. It does not apply remediation, mutate infrastructure, or execute any healing action. + +### Purpose + +The endpoint takes a current incident snapshot, estimates risk, chooses a deterministic action, and explains the expected effect of that action on latency using a heuristic counterfactual model. 
+ +The implementation is intentionally simple: + +- no fitted Structural Causal Model is used +- no machine learning model is required +- no historical training step is performed +- no action execution is triggered + +### Request schema + +The request body must match the `ReliabilityEvent` model. + +```json +{ + "component": "string", + "latency_p99": "number", + "error_rate": "number", + "service_mesh": "string", + "cpu_util": "number | null", + "memory_util": "number | null" +} +``` + +#### Fields + +`component` +: Name of the service or component being evaluated. + +`latency_p99` +: The current 99th percentile latency value. The endpoint uses this value both for risk scoring and for the causal explanation. + +`error_rate` +: The current error rate. The endpoint uses this value both for risk scoring and for the deterministic action threshold. + +`service_mesh` +: Optional service mesh name. Defaults to `"default"`. + +`cpu_util` +: Optional CPU utilization value. Present in the request model, but not used by the current decision logic. + +`memory_util` +: Optional memory utilization value. Present in the request model, but not used by the current decision logic. + +### Response schema + +The endpoint returns a JSON object with three top-level sections. + +```json +{ + "healing_intent": { + "action": "string", + "component": "string", + "parameters": {}, + "justification": "string", + "confidence": 0.85, + "risk_score": 0.0, + "status": "oss_advisory_only" + }, + "causal_explanation": { + "factual_outcome": 0.0, + "counterfactual_outcome": 0.0, + "effect": 0.0, + "explanation_text": "string", + "is_model_based": false, + "warnings": ["string"] + }, + "utility_decision": { + "best_action": "string", + "expected_utility": 0.5, + "explanation": "string" + } +} +``` + +#### `healing_intent` + +`action` +: The selected action. In the current implementation this is either `restart_container` or `no_action`. + +`component` +: The input component name. 
+ +`parameters` +: Action parameters. The current implementation returns an empty object. + +`justification` +: Human-readable explanation built from the causal explanation. + +`confidence` +: Fixed confidence value returned by the endpoint. The current implementation uses `0.85`. + +`risk_score` +: Heuristic risk score computed from latency and error rate. + +`status` +: Always `oss_advisory_only`, indicating that the response is informational and not executable. + +#### `causal_explanation` + +`factual_outcome` +: The observed outcome value from the request context. The endpoint uses `latency_p99` as the explained metric. + +`counterfactual_outcome` +: The estimated value under the proposed alternative action. + +`effect` +: The difference between counterfactual and factual outcomes. + +`explanation_text` +: Natural-language explanation of the counterfactual effect. + +`is_model_based` +: Always `false` in the current implementation. + +`warnings` +: A list of warning strings. The current implementation includes a warning that the causal model is heuristic and not SCM-based. + +#### `utility_decision` + +`best_action` +: The selected action, repeated for convenience. + +`expected_utility` +: Fixed utility value returned by the current implementation. The endpoint uses `0.5`. + +`explanation` +: Brief explanation that the choice came from heuristic latency and error thresholds. + +### Deterministic decision logic + +The endpoint uses the following rule to choose the action: + +```text +optimal_action = RESTART_CONTAINER +if latency_p99 > 500 OR error_rate > 0.15 +else NO_ACTION +``` + +In the implementation, this is encoded as: + +- `restart_container` when `latency_p99 > 500` or `error_rate > 0.15` +- `no_action` otherwise + +No probabilistic policy or learned policy is involved. 
+ +### Heuristic risk score + +The risk score is computed as: + +```text +risk = min(1.0, (latency_p99 / 1000) * 0.7 + error_rate * 0.3) +``` + +Properties of this score: + +- normalized to the interval `[0, 1]` +- weighted more heavily toward latency than error rate +- clipped at `1.0` + +### Counterfactual model + +The causal explainer uses a deterministic multiplicative heuristic: + +```text +counterfactual_outcome = factual_outcome * (1 + effect_frac) +``` + +Where: + +- `factual_outcome` is the observed metric value +- `effect_frac` is read from a fixed internal action-impact table +- the effect is multiplicative, not additive + +For latency, the current action-impact mapping includes the following examples: + +- `restart_container` → `latency_effect = -0.15` +- `scale_out` → `latency_effect = -0.20` +- `rollback` → `latency_effect = -0.25` +- `circuit_breaker` → `latency_effect = -0.05` +- `traffic_shift` → `latency_effect = -0.10` +- `alert_team` → `latency_effect = 0.0` +- `no_action` → `latency_effect = 0.0` + +For error rate, the table includes a separate `error_rate_effect` per action, but the current endpoint calls the explainer with `outcome_metric="latency"`, so the returned counterfactual explanation is latency-based. + +### Uncertainty interval + +The explainer applies a fixed uncertainty margin of ±10% around the estimated effect. + +Let: + +```text +effect = counterfactual_outcome - factual_outcome +ci_half = abs(effect) * 0.1 +confidence_interval = (counterfactual_outcome - ci_half, counterfactual_outcome + ci_half) +``` + +This interval is heuristic only. It is not a calibrated statistical confidence interval. 
+ +### How the endpoint uses the explainer + +The endpoint constructs a local state object and passes it to the explainer: + +- `current_state["latency"] = event.latency_p99` +- `current_state["error_rate"] = event.error_rate` +- `current_state["last_action"] = {"action_type": "no_action"}` + +It then creates: + +- `proposed_action = {"action_type": optimal_action.value, "params": {}}` + +and calls: + +```text +CausalExplainer().explain_healing_intent(proposed_action, current_state, "latency") +``` + +The resulting explanation is embedded into the `healing_intent` response. + +### Validation and error behavior + +The endpoint uses Pydantic validation through the `ReliabilityEvent` model. + +Expected behavior: + +- valid requests return HTTP 200 +- invalid request bodies are rejected by FastAPI/Pydantic before the handler logic runs + +The current implementation does not define a custom error schema for validation failures. + +### Advisory-only behavior + +The response includes: + +```json +"status": "oss_advisory_only" +``` + +This means: + +- the endpoint recommends an action +- it does not perform the action +- it does not mutate incident state +- it does not trigger remediation workflows by itself + +### Notes on implementation scope + +The current endpoint is intentionally narrow: + +- it bases the action choice on only two fields: `latency_p99` and `error_rate` +- it ignores `cpu_util`, `memory_util`, and `service_mesh` in the decision logic +- it always uses the latency metric in the causal explainer call +- it returns a fixed `expected_utility` value of `0.5` + +### Example request + +```bash +curl -X POST "http://localhost:8000/api/v1/v1/incidents/evaluate" -H "Content-Type: application/json" -d '{ + "component": "payment-service", + "latency_p99": 450, + "error_rate": 0.25, + "service_mesh": "default", + "cpu_util": 0.85, + "memory_util": 0.90 + }' +``` + +### Example response shape + +```json +{ + "healing_intent": { + "action": "restart_container", + 
"component": "payment-service", + "parameters": {}, + "justification": "Causal: If we apply restart_container instead of no_action, latency would change from 450.00 to 382.50 (Δ = -67.50). Based on heuristic causal model.", + "confidence": 0.85, + "risk_score": 0.4575, + "status": "oss_advisory_only" + }, + "causal_explanation": { + "factual_outcome": 450, + "counterfactual_outcome": 382.5, + "effect": -67.5, + "explanation_text": "If we apply restart_container instead of no_action, latency would change from 450.00 to 382.50 (Δ = -67.50). Based on heuristic causal model.", + "is_model_based": false, + "warnings": [ + "Using heuristic causal model (no fitted SCM)." + ] + }, + "utility_decision": { + "best_action": "restart_container", + "expected_utility": 0.5, + "explanation": "Heuristic decision based on latency/error thresholds" + } +} +``` + +### Cross-reference + +See `docs/examples.md` for a worked numerical example and `README.md` for a shorter overview. diff --git a/docs/endpoints.md b/docs/endpoints.md new file mode 100644 index 0000000000000000000000000000000000000000..01cc9233ae72ab5ac5a0a7c1b8624432b5ce0363 --- /dev/null +++ b/docs/endpoints.md @@ -0,0 +1,34 @@ +# API Endpoints + +This page lists all available API endpoints. + +General + +- All API routers are mounted under the `/api/v1` prefix (see `app.main`). +- Health endpoint is available at `/health`. + +Health + +- GET /health + - Returns: `{ "status": "ok" }` + - Purpose: basic liveness/health check. + +Governance (risk/intent evaluation) + +- POST /api/v1/intents/evaluate + - Description: Evaluate an infrastructure intent and return a risk score and explanation. + - Body: an InfrastructureIntentRequest JSON object (see the model in `app.models.infrastructure_intents`). + - Behaviour: The endpoint converts the incoming intent to an OSS intent and calls into the locally initialized RiskEngine (`app.state.risk_engine`). + - Errors: May return 500 if evaluation fails. 
+ +- POST /api/v1/intents/outcome + - Description: Record the observed outcome of an executed intent to update priors. + - Behaviour: Not implemented in this repository; the endpoint returns a `501 Not Implemented` (the current implementation raises a 501 indicating outcome recording is not yet implemented). + +Other routers + +- The application also registers routers for incidents, risk, intents, and history at `/api/v1` (see `app.main`). Consult the respective modules in `app.api` for their exact endpoints and payloads. + +Notes + +- The governance evaluation relies on a `RiskEngine` instance initialized at app startup (see `app.main`) which reads `ARF_HMC_MODEL` and `ARF_USE_HYPERPRIORS` environment variables. diff --git a/docs/examples.md b/docs/examples.md new file mode 100644 index 0000000000000000000000000000000000000000..e89191034df6c03c3969067ff2e9c1ba4d33cf9e --- /dev/null +++ b/docs/examples.md @@ -0,0 +1,54 @@ +# Examples + +This page provides usage examples for the ARF API. 
+ +Check health + +curl example: + +curl http://localhost:8000/health + +Response: + +{ + "status": "ok" +} + +Evaluate an intent (governance) + +- Endpoint: POST /api/v1/intents/evaluate +- Content-Type: application/json + +Example payload (a valid `provision_resource` intent; the adapter requires `intent_type`, `environment` and `requester` — see `app.services.intent_adapter` and `tests/test_governance.py` for more variants): + +{ + "intent_type": "provision_resource", + "environment": "prod", + "resource_type": "database", + "region": "eastus", + "size": "Standard", + "estimated_cost": 100.0, + "policy_violations": [], + "requester": "alice", + "provenance": {}, + "configuration": {} +} + +Curl example: + +curl -X POST http://localhost:8000/api/v1/intents/evaluate \ + -H "Content-Type: application/json" \ + -d '{"intent_type":"provision_resource","environment":"prod","resource_type":"database","region":"eastus","size":"Standard","estimated_cost":100.0,"policy_violations":[],"requester":"alice","provenance":{},"configuration":{}}' + +Python (requests) example: + +import requests + +payload = { + "intent_type": "provision_resource", + "environment": "prod", + "resource_type": "database", + "region": "eastus", + "size": "Standard", + "estimated_cost": 100.0, + "policy_violations": [], + "requester": "alice", + "provenance": {}, + "configuration": {} +} + +resp = requests.post("http://localhost:8000/api/v1/intents/evaluate", json=payload) +print(resp.status_code, resp.text) + +Notes + +- The evaluate endpoint uses an in-process `RiskEngine` (initialized in `app.main`) to compute risk and explanations. +- The `/api/v1/intents/outcome` endpoint exists but currently returns 501 Not Implemented — outcome recording/storage is incomplete in this repo. diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000000000000000000000000000000000000..c7cb4afc7cbc188aeec4df2ef372a5c7c48a9918 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,16 @@ +# ARF API Control Plane + +Welcome to the ARF API documentation. + +Overview + +- This repository implements the ARF API Control Plane (FastAPI) — the application mounts a number of routers under `/api/v1` and exposes a health endpoint at `/health`. +- App version (from app.main): 0.2.0 + +Important notes + +- A `RiskEngine` is initialized at app startup and stored at `app.state.risk_engine`. The engine reads `ARF_HMC_MODEL` and `ARF_USE_HYPERPRIORS` environment variables.
+- Authentication: there is an optional `api_key` in configuration, but request handlers do not currently enforce authentication. +- The `/api/v1/intents/outcome` endpoint exists but returns 501 Not Implemented; intent outcome recording/storage is not yet implemented. + +See the other documentation pages for development instructions, endpoints, and examples. diff --git a/monitor.sh b/monitor.sh new file mode 100644 index 0000000000000000000000000000000000000000..97f2c5102bf7c13c6b2ddf4a327dc7cbabc05386 --- /dev/null +++ b/monitor.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +URL_FILE="/workspaces/arf-api/current_url.txt" +LOG_FILE="/workspaces/arf-api/monitor.log" + +if [ ! -f "$URL_FILE" ]; then + echo "$(date): No URL file found. Exiting." >> "$LOG_FILE" + exit 1 +fi + +CURRENT_URL=$(cat "$URL_FILE") + +if ! curl -s -f "$CURRENT_URL/health" > /dev/null; then + echo "$(date): Tunnel down. Restarting..." >> "$LOG_FILE" + /workspaces/arf-api/start.sh +else + echo "$(date): Tunnel OK." >> "$LOG_FILE" +fi diff --git a/render.yaml b/render.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a5d4288348505c09905640f0a0c7a10f690cc6c --- /dev/null +++ b/render.yaml @@ -0,0 +1,19 @@ +services: + - type: web + name: arf-api + runtime: python + buildCommand: pip install -r requirements.txt + startCommand: uvicorn app.main:app --host 0.0.0.0 --port $PORT + envVars: + - key: DATABASE_URL + fromDatabase: + name: arf-db + property: connectionString + - key: API_KEY + sync: false + - key: ENVIRONMENT + value: production +databases: + - name: arf-db + databaseName: arf + user: arf_user diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ae6547de663ff4d0735a80961816f447edd07a6 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +pytest-cov>=7.0.0 +jsonschema>=4.0.0 +pytest-asyncio>=0.24.0 diff --git a/requirements.txt b/requirements.txt index 
4212999cac2b0ccafa86088a82031e4d6ec128a9..4efe44e77a03c697b44d8bee5cc85e65561ae04f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ fastapi==0.115.12 uvicorn[standard]==0.34.0 -pydantic==2.12.5 +pydantic>=2.13.2 +agentic-reliability-framework @ git+https://github.com/arf-foundation/agentic-reliability-framework@main +arf-pricing-calculator @ git+https://github.com/arf-foundation/ARF-Bayesian-Pricing-Calculator@main +pytest==8.3.5 pytest==8.3.5 -pytest-cov>=6.0.0 httpx==0.28.1 alembic pydantic-settings @@ -11,9 +13,11 @@ psycopg2-binary==2.9.10 slowapi==0.1.9 prometheus-fastapi-instrumentator==7.1.0 flake8==7.2.0 -cryptography +cryptography==47.0.0 sentence-transformers>=2.2.0 scikit-learn -redis>=4.0.0 +redis>=4.0.0 # optional, for faster counters stripe>=9.0.0 -pandas \ No newline at end of file +opentelemetry-api>=1.20.0 +opentelemetry-sdk>=1.20.0 +opentelemetry-instrumentation-fastapi>=0.50b0 diff --git a/runtime.txt b/runtime.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f7974e75e5bd0aeb41d7002691a56328241a803 --- /dev/null +++ b/runtime.txt @@ -0,0 +1,2 @@ +python-3.12.3 +# force fresh build diff --git a/seed_rag_data.py b/seed_rag_data.py new file mode 100644 index 0000000000000000000000000000000000000000..6361b17eaa55383e649910386d5ba7bfe11e763e --- /dev/null +++ b/seed_rag_data.py @@ -0,0 +1,67 @@ +""" +Seed RAG graph with historical healing action success rates. +Run once before starting the API server. 
+""" +import sys +import os +sys.path.append(os.path.dirname(__file__)) + +from app.core.deps import get_rag_graph +from agentic_reliability_framework.core.models.event import HealingAction + +def seed_historical_data(): + rag = get_rag_graph() + + # Define seed incidents (each with an outcome) + seed_data = [ + # restart_container successes + {"incident_id": "seed_restart_1", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": True, "resolution_time_minutes": 2}, + {"incident_id": "seed_restart_2", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": True, "resolution_time_minutes": 3}, + {"incident_id": "seed_restart_3", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": False, "resolution_time_minutes": 10}, + + # rollback successes + {"incident_id": "seed_rollback_1", "component": "test", "action": HealingAction.ROLLBACK.value, "success": True, "resolution_time_minutes": 1}, + {"incident_id": "seed_rollback_2", "component": "test", "action": HealingAction.ROLLBACK.value, "success": True, "resolution_time_minutes": 2}, + {"incident_id": "seed_rollback_3", "component": "test", "action": HealingAction.ROLLBACK.value, "success": False, "resolution_time_minutes": 5}, + + # scale_out successes + {"incident_id": "seed_scale_1", "component": "test", "action": HealingAction.SCALE_OUT.value, "success": True, "resolution_time_minutes": 5}, + {"incident_id": "seed_scale_2", "component": "test", "action": HealingAction.SCALE_OUT.value, "success": False, "resolution_time_minutes": 15}, + + # circuit_breaker successes + {"incident_id": "seed_cb_1", "component": "test", "action": HealingAction.CIRCUIT_BREAKER.value, "success": True, "resolution_time_minutes": 1}, + {"incident_id": "seed_cb_2", "component": "test", "action": HealingAction.CIRCUIT_BREAKER.value, "success": True, "resolution_time_minutes": 2}, + + # traffic_shift successes + {"incident_id": "seed_ts_1", "component": "test", 
"action": HealingAction.TRAFFIC_SHIFT.value, "success": True, "resolution_time_minutes": 4}, + {"incident_id": "seed_ts_2", "component": "test", "action": HealingAction.TRAFFIC_SHIFT.value, "success": False, "resolution_time_minutes": 8}, + ] + + # Add each outcome to the RAG graph + for item in seed_data: + # Create a dummy reliability event (simplified) + from agentic_reliability_framework.core.models.event import ReliabilityEvent + event = ReliabilityEvent( + component=item["component"], + latency_p99=500, # placeholder + error_rate=0.1, + service_mesh="default" + ) + # Record the outcome + rag.record_outcome( + incident_id=item["incident_id"], + event=event, + action_taken=item["action"], + success=item["success"], + resolution_time_minutes=item["resolution_time_minutes"] + ) + print(f"Seeded: {item['action']} -> success={item['success']}") + + print(f"Seeded {len(seed_data)} historical outcomes.") + print(f"Stats per action:") + for action in HealingAction: + stats = rag.get_historical_effectiveness(action.value, component_filter="test") + print(f" {action.value}: uses={stats['total_uses']}, success_rate={stats['success_rate']:.2f}, avg_time={stats['avg_resolution_time_minutes']:.1f} min") + +if __name__ == "__main__": + seed_historical_data() diff --git a/start.sh b/start.sh new file mode 100644 index 0000000000000000000000000000000000000000..8e34d8ac243c498824c3fa150171afbf8dcdf69b --- /dev/null +++ b/start.sh @@ -0,0 +1,68 @@ +#!/bin/bash + +# Set paths +BACKEND_DIR="/workspaces/arf-api" +FRONTEND_DIR="/workspaces/arf-frontend" +VENV_ACTIVATE="$BACKEND_DIR/venv/bin/activate" +CLOUDFLARED=$(which cloudflared 2>/dev/null || echo "/usr/local/bin/cloudflared") + +# Kill any existing processes +echo "🛑 Stopping existing uvicorn and cloudflared..." +pkill -f uvicorn +pkill -f cloudflared +sleep 2 + +# Start uvicorn +echo "🚀 Starting uvicorn..." 
+cd "$BACKEND_DIR" +source "$VENV_ACTIVATE" +uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload & +sleep 3 + +# Verify uvicorn is running +if ! curl -s http://localhost:8000/health >/dev/null; then + echo "❌ uvicorn failed to start. Exiting." + exit 1 +fi +echo "✅ uvicorn is running." + +# Start cloudflared and capture URL +echo "🌐 Starting cloudflared tunnel..." +TEMP_FILE=$(mktemp) +$CLOUDFLARED tunnel --url http://localhost:8000 2>&1 | tee "$TEMP_FILE" & + +# Wait for URL to appear +echo "⏳ Waiting for tunnel URL..." +URL="" +for i in {1..30}; do + URL=$(grep -oP 'https://[a-z0-9-]+\.trycloudflare\.com' "$TEMP_FILE" | head -1) + if [ -n "$URL" ]; then + break + fi + sleep 1 +done + +if [ -z "$URL" ]; then + echo "❌ Failed to get tunnel URL." + exit 1 +fi +echo "✅ Tunnel URL: $URL" + +# Save URL for monitoring (used by monitor.sh) +echo "$URL" > /workspaces/arf-api/current_url.txt + +# Update Vercel environment variable +echo "🔧 Updating Vercel environment variable..." +cd "$FRONTEND_DIR" +if command -v vercel &>/dev/null; then + vercel env rm NEXT_PUBLIC_API_URL production -y + echo "$URL" | vercel env add NEXT_PUBLIC_API_URL production + echo "🔄 Redeploying frontend..." + vercel --prod +else + echo "⚠️ Vercel CLI not installed. Please install it with: npm i -g vercel" + echo "Then manually update the env var to: $URL" +fi + +echo "🎉 All done! Your new URL is: $URL" +echo "Frontend will be updated shortly. Check https://arf-frontend-sandy.vercel.app" \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000000000000000000000000000000000000..51d0ea8e67425772790cb0086a2f00af20cf40b0 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,128 @@ +""" +pytest configuration and fixtures for ARF API tests. 
+""" + +from app.core.usage_tracker import enforce_quota, Tier +from app.api.deps import get_db +from app.database.base import Base +from app.main import app as fastapi_app +from sqlalchemy.orm import sessionmaker +from sqlalchemy import create_engine +from fastapi.testclient import TestClient +import app.core.usage_tracker +import os +import pytest + +# ===== STEP 1: Set environment variables BEFORE any app imports ===== +os.environ["ARF_USAGE_TRACKING"] = "false" + +# Force the correct database URL for tests +os.environ["DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/testdb" +os.environ["TEST_DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/testdb" + +# Additional PostgreSQL environment variables to prevent fallback to +# system user +os.environ["PGUSER"] = "postgres" +os.environ["PGPASSWORD"] = "postgres" +os.environ["PGHOST"] = "localhost" +os.environ["PGPORT"] = "5432" +os.environ["PGDATABASE"] = "testdb" + + +# ===== STEP 2: Mock the tracker module BEFORE importing app ===== +class MockTracker: + def get_tier(self, api_key): + from app.core.usage_tracker import Tier + + return Tier.PRO + + def get_remaining_quota(self, api_key, tier): + + return 1000 + + def consume_quota_and_log(self, record, idempotency_key=None): + + return (True, None) + + def increment_usage_sync(self, record, idempotency_key=None): + return True + + def get_or_create_api_key(self, key, tier): + + return True + + def update_api_key_tier(self, key, tier): + return True + + def _insert_audit_log(self, record): + pass + + +# Replace the tracker at the module level +app.core.usage_tracker.tracker = MockTracker() + +# ===== STEP 3: Import app and database modules ===== + +# Force model registration (prevents "no such table" errors) + +# Use the environment variable for the database URL (already set) +TEST_DATABASE_URL = os.getenv( + "TEST_DATABASE_URL", + "postgresql://postgres:postgres@localhost:5432/testdb") + +if TEST_DATABASE_URL.startswith("postgresql"): + 
engine = create_engine(TEST_DATABASE_URL) +else: + engine = create_engine( + TEST_DATABASE_URL, connect_args={ + "check_same_thread": False}) + +TestingSessionLocal = sessionmaker( + autocommit=False, + autoflush=False, + bind=engine) + + +def override_get_db(): + + db = TestingSessionLocal() + try: + yield db + + finally: + db.close() + + +fastapi_app.dependency_overrides[get_db] = override_get_db + +# Override enforce_quota dependency + + +async def mock_enforce_quota(request, api_key=None): + return {"api_key": "test_key", "tier": Tier.PRO, "remaining": 1000} +fastapi_app.dependency_overrides[enforce_quota] = mock_enforce_quota + + +@pytest.fixture(scope="session", autouse=True) +def setup_database(): + """Create tables before any tests run.""" + Base.metadata.create_all(bind=engine) + yield + Base.metadata.drop_all(bind=engine) + + +@pytest.fixture(scope="session") +def client(): + with TestClient(fastapi_app) as test_client: + yield test_client + + +@pytest.fixture(scope="function") +def db_session(): + """Provide a clean database session for each test.""" + Base.metadata.create_all(bind=engine) + session = TestingSessionLocal() + yield session + session.rollback() + session.close() + Base.metadata.drop_all(bind=engine) diff --git a/tests/test_deps.py b/tests/test_deps.py new file mode 100644 index 0000000000000000000000000000000000000000..b6982ece0565f02abda88a46fcaed1a437f52c6e --- /dev/null +++ b/tests/test_deps.py @@ -0,0 +1,15 @@ +import pytest +from unittest.mock import patch, MagicMock +from app.api.deps import get_db + + +def test_get_db_closes_session(): + mock_session = MagicMock() + with patch('app.api.deps.SessionLocal', return_value=mock_session): + db_gen = get_db() + db = next(db_gen) + assert db == mock_session + # Simulate an exception during request handling + with pytest.raises(Exception): + db_gen.throw(Exception("test error")) + mock_session.close.assert_called_once() diff --git a/tests/test_governance.py b/tests/test_governance.py new 
file mode 100644 index 0000000000000000000000000000000000000000..3b7c9d2f6bac913bc4467f13a23156dbe0d83fa4 --- /dev/null +++ b/tests/test_governance.py @@ -0,0 +1,71 @@ +""" +Tests for governance endpoints: /api/v1/intents/evaluate +""" + + +def test_evaluate_provision_intent(client): + payload = { + "intent_type": "provision_resource", + "environment": "prod", + "resource_type": "database", + "region": "eastus", + "size": "Standard", + "estimated_cost": 1200, + "policy_violations": [], + "requester": "alice", + "provenance": {}, + "configuration": {} + } + response = client.post("/api/v1/intents/evaluate", json=payload) + assert response.status_code == 200, response.text + data = response.json() + assert "risk_score" in data + + +def test_evaluate_grant_access(client): + payload = { + "intent_type": "grant_access", + "environment": "dev", + "principal": "bob", + "permission_level": "read", + "resource_scope": "/subscriptions/123", + "estimated_cost": None, + "policy_violations": [], + "requester": "alice", + "provenance": {}, + "justification": "test" + } + response = client.post("/api/v1/intents/evaluate", json=payload) + assert response.status_code == 200, response.text + data = response.json() + assert "risk_score" in data + + +def test_evaluate_deploy_config(client): + payload = { + "intent_type": "deploy_config", + "environment": "staging", + "service_name": "payments-api", + "change_scope": "canary", + "deployment_target": "staging", + "estimated_cost": 20, + "policy_violations": [], + "requester": "alice", + "provenance": {}, + "configuration": {} + } + response = client.post("/api/v1/intents/evaluate", json=payload) + assert response.status_code == 200, response.text + data = response.json() + assert "risk_score" in data + + +def test_invalid_intent_type(client): + payload = { + "intent_type": "UnknownIntent", + "environment": "prod", + "requester": "alice", + "provenance": {} + } + response = client.post("/api/v1/intents/evaluate", json=payload) + assert 
response.status_code == 422 diff --git a/tests/test_healing_endpoint.py b/tests/test_healing_endpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..06b4e8c21170b8a8922121c2b12152fa3690b05a --- /dev/null +++ b/tests/test_healing_endpoint.py @@ -0,0 +1,21 @@ +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_healing_evaluate_endpoint(): + payload = { + "event": { + "component": "my-service", + "latency_p99": 450.0, + "error_rate": 0.25, + "service_mesh": "default", + "cpu_util": 0.85, + "memory_util": 0.90 + } + } + response = client.post("/api/v1/healing/evaluate", json=payload) + assert response.status_code == 200, f"Expected 200, got { + response.status_code}: { + response.text}" diff --git a/tests/test_history.py b/tests/test_history.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9a535329d08951d2cb36ec45f7d1b2d39465c0 --- /dev/null +++ b/tests/test_history.py @@ -0,0 +1,16 @@ +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_history(): + response = client.get("/api/v1/history") + assert response.status_code == 200 + data = response.json() + # The endpoint returns a list of risk points, not an object with an + # "incidents" key + assert isinstance(data, list) + if data: # if not empty, verify the structure of the first item + assert "risk" in data[0] + assert "time" in data[0] diff --git a/tests/test_incident_service.py b/tests/test_incident_service.py new file mode 100644 index 0000000000000000000000000000000000000000..f05caff794d272510e1b650784f6ecdf47b31ca3 --- /dev/null +++ b/tests/test_incident_service.py @@ -0,0 +1,8 @@ +from app.services.incident_service import process_incident +from app.models.incident_models import IncidentReport + + +def test_process_incident(): + report = IncidentReport(service="test", signal_type="latency", value=450) + reliability = process_incident(report) + 
assert 0 <= reliability <= 1 diff --git a/tests/test_incidents.py b/tests/test_incidents.py new file mode 100644 index 0000000000000000000000000000000000000000..bd14fc84dd350bc65ccc2e286799b80255f0653f --- /dev/null +++ b/tests/test_incidents.py @@ -0,0 +1,38 @@ +"""Tests for incident endpoints – backward‑compatible Bayesian reroute.""" +import pytest +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_report_incident(): + payload = { + "component": "api-gateway", + "latency_p99": 450.0, + "error_rate": 0.0, + "service_mesh": "default", + "throughput": 100, + "source": "test", + } + resp = client.post("/api/v1/report_incident", json=payload) + assert resp.status_code == 200 + assert resp.json()["status"] == "recorded" + + +@pytest.mark.xfail(reason="model mismatch – awaiting core framework sync") +def test_evaluate_incident_deprecated(): + payload = { + "component": "checkout", + "latency_p99": 450.0, + "error_rate": 0.12, + "service_mesh": "default", + "throughput": 100, + "source": "test", + } + resp = client.post("/api/v1/v1/incidents/evaluate", json=payload) + assert resp.status_code == 200 + body = resp.json() + assert "deprecation_notice" in body + assert "healing_intent" in body + assert body["healing_intent"]["risk_score"] is not None diff --git a/tests/test_intent_adapter.py b/tests/test_intent_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..5b11828ae93f52d36ca432891f5cae13dba2fedf --- /dev/null +++ b/tests/test_intent_adapter.py @@ -0,0 +1,178 @@ +import pytest +from app.services.intent_adapter import to_oss_intent, IntentAdapterError + + +def test_unknown_intent_type(): + """Unknown intent_type raises IntentAdapterError.""" + with pytest.raises(IntentAdapterError, match="Unknown intent_type: UnknownIntent"): + to_oss_intent({"intent_type": "UnknownIntent", + "environment": "prod", "requester": "alice"}) + + +def test_missing_intent_type(): + """Missing intent_type 
raises IntentAdapterError.""" + with pytest.raises(IntentAdapterError, match="Missing 'intent_type' in request"): + to_oss_intent({}) + + +def test_missing_environment(): + """Missing environment raises IntentAdapterError.""" + payload = {"intent_type": "provision_resource"} + with pytest.raises(IntentAdapterError, match="Missing 'environment' field"): + to_oss_intent(payload) + + +def test_invalid_environment(): + """Invalid environment value raises IntentAdapterError.""" + payload = { + "intent_type": "provision_resource", + "environment": "invalid_env", + "requester": "alice", + } + with pytest.raises(IntentAdapterError, match="Invalid environment: invalid_env"): + to_oss_intent(payload) + + +def test_missing_requester(): + """Missing requester raises IntentAdapterError.""" + payload = { + "intent_type": "provision_resource", + "environment": "prod", + } + with pytest.raises(IntentAdapterError, match="Missing 'requester' field"): + to_oss_intent(payload) + + +def test_valid_provision_intent(): + """Valid provision_resource payload returns ProvisionResourceIntent.""" + payload = { + "intent_type": "provision_resource", + "environment": "prod", + "resource_type": "database", + "region": "eastus", + "size": "Standard", + "requester": "alice", + "configuration": {"foo": "bar"}, + "provenance": {"source": "test"}, + } + intent = to_oss_intent(payload) + # Now resource_type is a string, not an enum + assert intent.resource_type == "database" + assert intent.region == "eastus" + assert intent.size == "Standard" + assert intent.environment == "prod" # string, not .value + assert intent.requester == "alice" + assert intent.configuration == {"foo": "bar"} + + +def test_provision_missing_resource_type(): + """Missing resource_type raises IntentAdapterError.""" + payload = { + "intent_type": "provision_resource", + "environment": "prod", + "region": "eastus", + "size": "Standard", + "requester": "alice", + } + with pytest.raises(IntentAdapterError, match="Missing 
'resource_type'"): + to_oss_intent(payload) + + +def test_provision_invalid_resource_type(): + """Invalid resource_type raises IntentAdapterError.""" + payload = { + "intent_type": "provision_resource", + "environment": "prod", + "resource_type": "invalid_type", + "region": "eastus", + "size": "Standard", + "requester": "alice", + } + with pytest.raises(IntentAdapterError, match="Invalid resource_type: invalid_type"): + to_oss_intent(payload) + + +def test_valid_grant_intent(): + """Valid grant_access payload returns GrantAccessIntent.""" + payload = { + "intent_type": "grant_access", + "environment": "dev", + "principal": "bob", + "permission_level": "read", + "resource_scope": "/subscriptions/123", + "requester": "alice", + "justification": "testing", + "provenance": {}, + } + intent = to_oss_intent(payload) + assert intent.principal == "bob" + assert intent.permission_level == "read" + assert intent.resource_scope == "/subscriptions/123" + + assert intent.requester == "alice" + + +def test_grant_missing_principal(): + """Missing principal raises IntentAdapterError.""" + payload = { + "intent_type": "grant_access", + "environment": "dev", + "permission_level": "read", + "resource_scope": "/subscriptions/123", + "requester": "alice", + } + with pytest.raises(IntentAdapterError, match="Missing 'principal'"): + to_oss_intent(payload) + + +def test_valid_deploy_intent(): + """Valid deploy_config payload returns DeployConfigurationIntent.""" + payload = { + "intent_type": "deploy_config", + "environment": "staging", + "service_name": "payments-api", + "change_scope": "canary", + "deployment_target": "staging", + "requester": "alice", + "risk_level_hint": "low", + "configuration": {"replicas": 3}, + "provenance": {}, + } + intent = to_oss_intent(payload) + assert intent.service_name == "payments-api" + assert intent.change_scope == "canary" + assert intent.deployment_target == "staging" + + assert intent.requester == "alice" + # risk_level_hint is not passed through; 
accept None + assert intent.risk_level_hint is None + + +def test_deploy_missing_service_name(): + """Missing service_name raises IntentAdapterError.""" + payload = { + "intent_type": "deploy_config", + "environment": "staging", + "change_scope": "canary", + "deployment_target": "staging", + "requester": "alice", + } + with pytest.raises(IntentAdapterError, match="Missing 'service_name'"): + to_oss_intent(payload) + + +def test_pydantic_model_support(): + """Test that Pydantic models are handled correctly (simulated).""" + class MockModel: + def model_dump(self): + return { + "intent_type": "provision_resource", + "environment": "prod", + "resource_type": "database", + "region": "eastus", + "size": "Standard", + "requester": "alice", + } + + intent = to_oss_intent(MockModel()) + assert intent.resource_type == "database" # string, not .value diff --git a/tests/test_intent_store.py b/tests/test_intent_store.py new file mode 100644 index 0000000000000000000000000000000000000000..95d3647df0936ff751db04e33fff4f86fbebd935 --- /dev/null +++ b/tests/test_intent_store.py @@ -0,0 +1,46 @@ +import pytest +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from app.database.base import Base +from app.database.models_intents import IntentDB +from app.services.intent_store import save_evaluated_intent, get_intent_by_deterministic_id + + +@pytest.fixture +def db_session(): + engine = create_engine("sqlite:///:memory:", future=True) + TestingSessionLocal = sessionmaker(bind=engine, future=True) + Base.metadata.create_all(bind=engine) + sess = TestingSessionLocal() + yield sess + sess.close() + + +def test_save_intent(db_session): + det_id = "intent_123" + saved = save_evaluated_intent( + db=db_session, + deterministic_id=det_id, + intent_type="ProvisionResourceIntent", + api_payload={"foo": "bar"}, + oss_payload={"intent_type": "provision_resource"}, + environment="prod", + risk_score=0.42 + ) + assert saved.deterministic_id == det_id + assert 
saved.risk_score == "0.42" + + fetched = get_intent_by_deterministic_id(db_session, det_id) + assert fetched is not None + assert fetched.payload["foo"] == "bar" + + +def test_update_existing_intent(db_session): + det_id = "intent_123" + save_evaluated_intent(db_session, det_id, "Type", {}, {}, "prod", 0.5) + updated = save_evaluated_intent( + db_session, det_id, "Type", {}, {}, "prod", 0.7) + assert updated.risk_score == "0.7" + count = db_session.query(IntentDB).filter( + IntentDB.deterministic_id == det_id).count() + assert count == 1 diff --git a/tests/test_intents.py b/tests/test_intents.py new file mode 100644 index 0000000000000000000000000000000000000000..19e4cd67372578a509ac78e5dc6e40621526d387 --- /dev/null +++ b/tests/test_intents.py @@ -0,0 +1,27 @@ +import logging +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_simulate_intent(): + payload = {"action": "restart_service", "target": "api-gateway"} + response = client.post("/api/v1/simulate_intent", json=payload) + assert response.status_code == 200 + data = response.json() + assert "risk_score" in data + assert data["recommendation"] in [ + "safe_to_execute", "requires_approval", "blocked"] + + +def test_simulate_intent_deprecation_warning(caplog): + from app.services.intent_service import simulate_intent + from app.models.intent_models import IntentSimulation + intent = IntentSimulation(action="restart_service", target="test") + with caplog.at_level(logging.WARNING): + result = simulate_intent(intent) + assert "Deprecated endpoint" in caplog.text + assert "risk_score" in result + assert result["recommendation"] in [ + "safe_to_execute", "requires_approval", "blocked"] diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000000000000000000000000000000000000..6da6d22ff06e5b7ee0f38584feadf39ce8b3e523 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,8 @@ +from fastapi.testclient import TestClient +from app.main 
import app + + +def test_lifespan(): + client = TestClient(app) + response = client.get("/health") + assert response.status_code == 200 diff --git a/tests/test_outcome_service.py b/tests/test_outcome_service.py new file mode 100644 index 0000000000000000000000000000000000000000..a86aad9ba55f2d20bee62c2005dc9932d216520d --- /dev/null +++ b/tests/test_outcome_service.py @@ -0,0 +1,145 @@ +import pytest +import datetime +from unittest.mock import MagicMock +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from app.database.base import Base +from app.database.models_intents import IntentDB +from app.services.outcome_service import record_outcome, OutcomeConflictError +from agentic_reliability_framework.core.governance.intents import ( + ProvisionResourceIntent, + ResourceType, +) + + +@pytest.fixture +def db_session(): + engine = create_engine("sqlite:///:memory:", future=True) + TestingSessionLocal = sessionmaker(bind=engine, future=True) + Base.metadata.create_all(bind=engine) + sess = TestingSessionLocal() + yield sess + sess.close() + + +@pytest.fixture +def mock_risk_engine(): + engine = MagicMock() + engine.update_outcome = MagicMock() + return engine + + +def test_record_outcome_creates_row_and_updates_engine( + db_session, mock_risk_engine): + oss_intent = ProvisionResourceIntent( + resource_type=ResourceType.VM, + region="eastus", + size="Standard", + environment="dev", + requester="test_user" + ) + oss_payload = oss_intent.model_dump(mode='json') + + intent = IntentDB( + deterministic_id="intent_abc", + intent_type="ProvisionResourceIntent", + payload={}, + oss_payload=oss_payload, + created_at=datetime.datetime.utcnow() + ) + db_session.add(intent) + db_session.commit() + db_session.refresh(intent) + + outcome = record_outcome( + db=db_session, + deterministic_id="intent_abc", + success=True, + recorded_by="tester", + notes="works", + risk_engine=mock_risk_engine, + idempotency_key="key123" + ) + assert outcome.success is True + 
assert outcome.recorded_by == "tester" + assert outcome.idempotency_key == "key123" + mock_risk_engine.update_outcome.assert_called_once() + + # Idempotent call with same key should return existing outcome and not + # call engine again + outcome2 = record_outcome( + db=db_session, + deterministic_id="intent_abc", + success=True, + recorded_by="tester", + notes="again", + risk_engine=mock_risk_engine, + idempotency_key="key123" + ) + assert outcome2.id == outcome.id + mock_risk_engine.update_outcome.assert_called_once() # still once + + +def test_conflict_different_result(db_session, mock_risk_engine): + intent = IntentDB( + deterministic_id="intent_def", + intent_type="ProvisionResourceIntent", + payload={}, + created_at=datetime.datetime.utcnow() + ) + db_session.add(intent) + db_session.commit() + + record_outcome( + db_session, + "intent_def", + True, + None, + None, + mock_risk_engine) + with pytest.raises(OutcomeConflictError): + record_outcome( + db_session, + "intent_def", + False, + None, + None, + mock_risk_engine) + + +def test_nonexistent_intent(db_session, mock_risk_engine): + with pytest.raises(ValueError): + record_outcome( + db_session, + "missing", + True, + None, + None, + mock_risk_engine) + + +def test_record_outcome_reconstruction_failure_does_not_update_engine( + db_session, mock_risk_engine): + # Create an intent with invalid oss_payload (missing required fields) + intent = IntentDB( + deterministic_id="intent_bad", + intent_type="ProvisionResourceIntent", + payload={}, + oss_payload={"intent_type": "provision_resource"}, # missing fields + created_at=datetime.datetime.utcnow() + ) + db_session.add(intent) + db_session.commit() + + # This should NOT call risk_engine.update_outcome (no dummy fallback) + outcome = record_outcome( + db=db_session, + deterministic_id="intent_bad", + success=True, + recorded_by="tester", + notes="fallback test", + risk_engine=mock_risk_engine + ) + assert outcome.success is True + # The engine should NOT be updated 
because reconstruction failed + mock_risk_engine.update_outcome.assert_not_called() diff --git a/tests/test_payments.py b/tests/test_payments.py new file mode 100644 index 0000000000000000000000000000000000000000..c90cfe70d3b9eddda099b6dfd3cc1def3808adc7 --- /dev/null +++ b/tests/test_payments.py @@ -0,0 +1,64 @@ +import os +import pytest +from unittest.mock import patch, MagicMock +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + +# Skip all tests in this module if Stripe secret key is not set +STRIPE_SECRET_KEY = os.getenv("STRIPE_SECRET_KEY") +if not STRIPE_SECRET_KEY: + pytest.skip( + "Stripe not configured – skipping payment tests", + allow_module_level=True) + + +@pytest.fixture +def mock_stripe(): + with patch("stripe.checkout.Session.create") as mock: + yield mock + + +def test_create_checkout_session_missing_stripe_key(monkeypatch): + monkeypatch.setenv("STRIPE_SECRET_KEY", "") + response = client.post( + "/api/v1/payments/create-checkout-session", + json={ + "api_key": "test_key", + "success_url": "https://example.com/success", + "cancel_url": "https://example.com/cancel"}) + assert response.status_code == 500 + assert "Stripe not configured" in response.json()["detail"] + + +def test_create_checkout_session_free_key(mock_stripe): + # Mock tracker.get_tier to return Tier.FREE + with patch("app.core.usage_tracker.tracker") as mock_tracker: + mock_tracker.get_tier.return_value = "free" + mock_stripe.return_value = MagicMock( + id="cs_test_123", url="https://checkout.stripe.com/pay") + response = client.post( + "/api/v1/payments/create-checkout-session", + json={ + "api_key": "test_key", + "success_url": "https://example.com/success", + "cancel_url": "https://example.com/cancel"}) + assert response.status_code == 200 + data = response.json() + assert "sessionId" in data + assert "url" in data + + +def test_create_checkout_session_pro_key(): + with patch("app.core.usage_tracker.tracker") as mock_tracker: + 
mock_tracker.get_tier.return_value = "pro" + response = client.post( + "/api/v1/payments/create-checkout-session", + json={ + "api_key": "test_key", + "success_url": "https://example.com/success", + "cancel_url": "https://example.com/cancel"}) + assert response.status_code == 400 + assert "Only free tier keys can be upgraded" in response.json()[ + "detail"] diff --git a/tests/test_risk.py b/tests/test_risk.py new file mode 100644 index 0000000000000000000000000000000000000000..fbe4ba3bb21c4ad0caba189c6629b3178b836872 --- /dev/null +++ b/tests/test_risk.py @@ -0,0 +1,34 @@ +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_get_risk(): + """Endpoint returns 501 with a deprecation message.""" + response = client.get( + "/api/v1/get_risk", + headers={ + "X-API-Key": "test-key"}) + assert response.status_code == 501 + data = response.json() + assert "deprecated" in data.get("detail", "").lower() + + +def test_get_risk_internal_error(monkeypatch): + """Force an internal error – the endpoint should return 500.""" + import app.api.routes_risk + + def mock_get_system_risk(): + raise ValueError("test error") + monkeypatch.setattr( + app.api.routes_risk, + "get_system_risk", + mock_get_system_risk) + response = client.get( + "/api/v1/get_risk", + headers={ + "X-API-Key": "test-key"}) + assert response.status_code == 500 + data = response.json() + assert "test error" in data.get("detail", "") diff --git a/tests/test_risk_service.py b/tests/test_risk_service.py new file mode 100644 index 0000000000000000000000000000000000000000..eb88293e8190922c0fc9ec19570b11af45eafc5c --- /dev/null +++ b/tests/test_risk_service.py @@ -0,0 +1,42 @@ +from unittest.mock import Mock +from app.services.risk_service import evaluate_intent, evaluate_healing_decision +from agentic_reliability_framework.core.governance.intents import ProvisionResourceIntent, ResourceType +from agentic_reliability_framework.core.models.event import
ReliabilityEvent +from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine + + +def test_evaluate_intent(): + engine = Mock() + engine.calculate_risk.return_value = (0.5, "explanation", {"a": 1}) + intent = ProvisionResourceIntent( + resource_type=ResourceType.VM, + region="eastus", + size="Standard_D2s_v3", + requester="alice", + environment="dev" + ) + result = evaluate_intent( + engine, + intent, + cost_estimate=100.0, + policy_violations=[]) + assert result["risk_score"] == 0.5 + assert result["explanation"] == "explanation" + engine.calculate_risk.assert_called_once() + + +def test_evaluate_healing_decision(): + event = ReliabilityEvent( + component="test", + latency_p99=600, + error_rate=0.2, + service_mesh="default" + ) + policy_engine = PolicyEngine() + # Temporarily disable decision engine to get raw actions + policy_engine.use_decision_engine = False + result = evaluate_healing_decision(event, policy_engine) + assert "selected_action" in result + assert "risk_score" in result + # The result should have epistemic_signals (zeros) even without model + assert result["epistemic_signals"] is not None diff --git a/tests/test_routes_memory.py b/tests/test_routes_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..5300c5551cc97feb4b06b3b75227ab8492429566 --- /dev/null +++ b/tests/test_routes_memory.py @@ -0,0 +1,16 @@ +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + + +def test_memory_stats(): + response = client.get("/v1/memory/stats") + assert response.status_code == 200 + data = response.json() + # The endpoint returns incident_nodes, outcome_nodes, edges, and a message + assert "incident_nodes" in data + assert "outcome_nodes" in data + assert "edges" in data + assert isinstance(data["incident_nodes"], int) + assert isinstance(data["outcome_nodes"], int) diff --git a/tests/test_usage_tracker.py b/tests/test_usage_tracker.py new file mode 100644 index 
0000000000000000000000000000000000000000..b555ddd02a10b674c21c2315a2c3d8ddbc028a09 --- /dev/null +++ b/tests/test_usage_tracker.py @@ -0,0 +1,79 @@ +import pytest +import tempfile +import time +from app.core.usage_tracker import UsageTracker, Tier, UsageRecord + + +@pytest.fixture +def tracker(): + with tempfile.NamedTemporaryFile(suffix=".db") as tmp: + yield UsageTracker(db_path=tmp.name) + + +def test_get_or_create_api_key(tracker): + assert tracker.get_or_create_api_key("test_key", Tier.FREE) is True + assert tracker.get_tier("test_key") == Tier.FREE + # Second call should return True without error + assert tracker.get_or_create_api_key("test_key") is True + + +def test_update_api_key_tier(tracker): + tracker.get_or_create_api_key("test_key", Tier.FREE) + assert tracker.update_api_key_tier("test_key", Tier.PRO) is True + assert tracker.get_tier("test_key") == Tier.PRO + # Non-existent key + assert tracker.update_api_key_tier("nonexistent", Tier.PRO) is False + + +def test_get_remaining_quota_free(tracker): + tracker.get_or_create_api_key("free_key", Tier.FREE) + # Initially 1000 remaining + remaining = tracker.get_remaining_quota("free_key", Tier.FREE) + assert remaining == 1000 + # Simulate usage using the atomic method + record = UsageRecord( + api_key="free_key", + tier=Tier.FREE, + timestamp=time.time(), + endpoint="/test" + ) + tracker.increment_usage_sync(record) + remaining = tracker.get_remaining_quota("free_key", Tier.FREE) + assert remaining == 999 + + +def test_get_remaining_quota_enterprise(tracker): + tracker.get_or_create_api_key("ent_key", Tier.ENTERPRISE) + remaining = tracker.get_remaining_quota("ent_key", Tier.ENTERPRISE) + assert remaining is None + + +def test_increment_usage_sync(tracker): + tracker.get_or_create_api_key("test_key", Tier.FREE) + record = UsageRecord( + api_key="test_key", + tier=Tier.FREE, + timestamp=time.time(), + endpoint="/test", + ) + result = tracker.increment_usage_sync(record) + assert result is True + # Check quota 
decreased + remaining = tracker.get_remaining_quota("test_key", Tier.FREE) + assert remaining == 999 + + +def test_get_audit_logs(tracker): + tracker.get_or_create_api_key("test_key", Tier.FREE) + record = UsageRecord( + api_key="test_key", + tier=Tier.FREE, + timestamp=time.time(), + endpoint="/test", + request_body={"foo": "bar"}, + response={"status": "ok"}, + ) + tracker.increment_usage_sync(record) + logs = tracker.get_audit_logs("test_key", limit=10) + assert len(logs) == 1 + assert logs[0]["endpoint"] == "/test" diff --git a/tests/test_webhooks.py b/tests/test_webhooks.py new file mode 100644 index 0000000000000000000000000000000000000000..244dbbf8fe3c1e8bbdd14062597868a61f61217f --- /dev/null +++ b/tests/test_webhooks.py @@ -0,0 +1,94 @@ +import os +import pytest +from unittest.mock import patch +from fastapi.testclient import TestClient +from app.main import app + +client = TestClient(app) + +# Skip all tests in this module if Stripe webhook secret is not set +STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET") +if not STRIPE_WEBHOOK_SECRET: + pytest.skip( + "Stripe webhook not configured – skipping webhook tests", + allow_module_level=True) + + +@pytest.fixture +def mock_stripe_webhook(): + with patch("stripe.Webhook.construct_event") as mock: + yield mock + + +def test_webhook_missing_secret(monkeypatch): + monkeypatch.setenv("STRIPE_WEBHOOK_SECRET", "") + response = client.post( + "/webhooks/stripe", + json={}, + headers={"stripe-signature": "test"} + ) + assert response.status_code == 500 + assert "Stripe not configured" in response.json()["detail"] + + +def test_webhook_invalid_payload(mock_stripe_webhook, monkeypatch): + monkeypatch.setenv("STRIPE_WEBHOOK_SECRET", "whsec_test") + mock_stripe_webhook.side_effect = ValueError("Invalid payload") + response = client.post( + "/webhooks/stripe", + json={}, + headers={"stripe-signature": "test"} + ) + assert response.status_code == 400 + assert "Invalid payload" in response.json()["detail"] + + +def 
test_webhook_invalid_signature(mock_stripe_webhook, monkeypatch): + monkeypatch.setenv("STRIPE_WEBHOOK_SECRET", "whsec_test") + mock_stripe_webhook.side_effect = Exception("Invalid signature") + response = client.post( + "/webhooks/stripe", + json={}, + headers={"stripe-signature": "test"} + ) + assert response.status_code == 400 + assert "Invalid signature" in response.json()["detail"] + + +def test_webhook_checkout_completed(monkeypatch): + monkeypatch.setenv("STRIPE_WEBHOOK_SECRET", "whsec_test") + monkeypatch.setenv("STRIPE_SECRET_KEY", "sk_test") + with patch("stripe.Webhook.construct_event") as mock_construct, \ + patch("app.core.usage_tracker.update_key_tier") as mock_update: + mock_construct.return_value = { + "type": "checkout.session.completed", + "data": { + "object": { + "client_reference_id": "test_key", + "metadata": { + "api_key": "test_key"}}}} + response = client.post( + "/webhooks/stripe", + json={}, + headers={"stripe-signature": "test"} + ) + assert response.status_code == 200 + mock_update.assert_called_once_with("test_key", "pro") + + +def test_webhook_subscription_deleted(monkeypatch): + monkeypatch.setenv("STRIPE_WEBHOOK_SECRET", "whsec_test") + monkeypatch.setenv("STRIPE_SECRET_KEY", "sk_test") + with patch("stripe.Webhook.construct_event") as mock_construct, \ + patch("app.core.usage_tracker.update_key_tier") as mock_update: + mock_construct.return_value = { + "type": "customer.subscription.deleted", + "data": {"object": {"metadata": {"api_key": "test_key"}}} + } + response = client.post( + "/webhooks/stripe", + json={}, + headers={"stripe-signature": "test"} + ) + assert response.status_code == 200 + mock_update.assert_called_once_with("test_key", "free")