| """ |
| ARF API Control Plane — Main Application Entry Point |
| ==================================================== |
| |
| The control plane serves as the HTTP layer between the **Agentic Reliability |
| Framework (ARF)** core engine and external consumers (front-end dashboard, |
| enterprise clients, and monitoring infrastructure). |
| |
| It is responsible for: |
| |
| * **Lifetime management** of the Bayesian risk engine, policy engine, |
| semantic memory (RAG graph), and epistemic models. |
| * **Observability** via optional OpenTelemetry tracing and Prometheus metrics |
| (the latter exposed automatically by ``prometheus-fastapi-instrumentator`` |
| on ``/metrics``). |
| * **Rate limiting** and **usage tracking** with atomic quota consumption. |
| * **CORS** configuration for the public ARF front-end. |
| * **Database-backed persistence** of the conjugate Bayesian posteriors so |
| that online learning survives restarts. |
| * **Automated Rust enforcer canary promotion** via Wilson confidence interval |
| monitoring of the agreement counters. |
| |
| All heavy components are loaded **lazily and best-effort** — if a dependency |
| is missing the API continues to serve health-check and status endpoints, |
| degrading gracefully rather than crashing. |
| """ |
| import logging |
| import os |
| import sys |
| import json |
| import threading |
| import time as _time |
| from contextlib import asynccontextmanager |
| from typing import Dict |
|
|
| from fastapi import FastAPI |
| from fastapi.middleware.cors import CORSMiddleware |
|
|
| |
| try: |
| from prometheus_fastapi_instrumentator import Instrumentator |
| PROMETHEUS_AVAILABLE = True |
| except ImportError: |
| PROMETHEUS_AVAILABLE = False |
| Instrumentator = None |
|
|
| |
| try: |
| from slowapi import _rate_limit_exceeded_handler |
| from slowapi.errors import RateLimitExceeded |
| from slowapi.middleware import SlowAPIMiddleware |
| SLOWAPI_AVAILABLE = True |
| except ImportError: |
| SLOWAPI_AVAILABLE = False |
| _rate_limit_exceeded_handler = None |
| RateLimitExceeded = None |
| SlowAPIMiddleware = None |
|
|
| |
| try: |
| from agentic_reliability_framework.core.governance.risk_engine import RiskEngine |
| from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine |
| from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory |
| from agentic_reliability_framework.runtime.memory.constants import MemoryConstants |
| ARF_AVAILABLE = True |
| except ImportError: |
| ARF_AVAILABLE = False |
| RiskEngine = None |
| PolicyEngine = None |
| create_faiss_index = None |
| RAGGraphMemory = None |
| MemoryConstants = None |
|
|
| |
| from app.core.usage_tracker import init_tracker, tracker, Tier |
|
|
| from app.api import ( |
| routes_governance, |
| routes_history, |
| routes_incidents, |
| routes_intents, |
| routes_risk, |
| routes_memory, |
| routes_admin, |
| routes_payments, |
| webhooks, |
| routes_users, |
| routes_pricing, |
| ) |
| from app.api.deps import limiter |
| from app.core.config import settings |
|
|
# Send every log record at INFO and above to stdout using one shared format,
# then obtain the module-wide logger used throughout this file.
logging.basicConfig(
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("arf.api")
|
|
|
|
# Seconds between refreshes of the Wilson confidence monitor.
_WILSON_REFRESH_SECONDS = 300

# NOTE(review): the pictographs and dashes in the log messages below were
# reconstructed from mis-decoded bytes; confirm them against the upstream file.


def _mask_secret(secret) -> str:
    """Return a log-safe rendering of *secret* (at most its last 4 characters)."""
    text = str(secret)
    return f"...{text[-4:]}" if len(text) > 8 else "****"


def _init_risk_engine(app: FastAPI) -> None:
    """Build the RiskEngine from environment settings and store it on ``app.state``.

    Reads ``ARF_HMC_MODEL`` (default ``models/hmc_model.json``) and
    ``ARF_USE_HYPERPRIORS`` (default ``false``).

    Raises:
        RuntimeError: if the engine cannot be constructed; startup is aborted.
    """
    hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
    use_hyperpriors = os.getenv("ARF_USE_HYPERPRIORS", "false").lower() == "true"
    logger.info(
        "Initializing RiskEngine — HMC model: %s, hyperpriors: %s",
        hmc_model_path,
        use_hyperpriors,
    )
    try:
        app.state.risk_engine = RiskEngine(
            hmc_model_path=hmc_model_path,
            use_hyperpriors=use_hyperpriors,
            n0=1000,
            hyperprior_weight=0.3,
        )
    except Exception as exc:
        logger.exception("💥 Fatal error initializing RiskEngine")
        raise RuntimeError("RiskEngine initialization failed") from exc
    logger.info("✅ RiskEngine initialized successfully.")


def _restore_posteriors(app: FastAPI) -> None:
    """Load persisted (alpha, beta) rows into the risk engine's beta store.

    Best-effort: any failure (missing modules, database error, unknown
    category) is logged as a warning and the engine keeps its current priors.
    """
    try:
        from app.database.session import SessionLocal
        from app.database.models_intents import BetaStateDB
        from agentic_reliability_framework.core.governance.risk_engine import (
            ActionCategory,
        )

        db = SessionLocal()
        try:
            rows = db.query(BetaStateDB).all()
            if not rows:
                logger.info(
                    "No persisted Bayesian state found; using default priors."
                )
                return
            state = {
                ActionCategory(row.category): (row.alpha, row.beta)
                for row in rows
            }
            app.state.risk_engine.beta_store.load_state(state)
            logger.info(
                "Loaded Bayesian posterior state from database (%d categories).",
                len(state),
            )
        finally:
            # Always release the session, including on the early return above.
            db.close()
    except Exception as exc:
        logger.warning("Could not load Bayesian state from database: %s", exc)


def _init_tracing(app: FastAPI) -> None:
    """Install an OpenTelemetry tracer with a console exporter (best-effort)."""
    try:
        from opentelemetry import trace
        from opentelemetry.sdk.resources import SERVICE_NAME, Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import (
            ConsoleSpanExporter,
            SimpleSpanProcessor,
        )
        from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

        provider = TracerProvider(
            resource=Resource.create({SERVICE_NAME: "arf-api"})
        )
        provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
        trace.set_tracer_provider(provider)

        FastAPIInstrumentor.instrument_app(app)
        logger.info("✅ Tracing initialized (console exporter).")
    except Exception as exc:
        logger.warning("Tracing initialization skipped: %s", exc)


def _init_policy_engine(app: FastAPI) -> None:
    """Create the PolicyEngine; on failure store ``None`` and continue."""
    try:
        app.state.policy_engine = PolicyEngine()
        logger.info("✅ PolicyEngine initialized successfully.")
    except Exception as exc:
        logger.warning("PolicyEngine initialization failed: %s", exc)
        app.state.policy_engine = None


def _init_rag_memory(app: FastAPI) -> None:
    """Create the FAISS-backed RAG graph memory; on failure store ``None``."""
    try:
        faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
        app.state.rag_graph = RAGGraphMemory(faiss_index)
        logger.info("✅ RAGGraphMemory initialized successfully.")
    except Exception as exc:
        logger.warning("RAGGraphMemory initialization failed: %s", exc)
        app.state.rag_graph = None


def _init_epistemic_model(app: FastAPI) -> None:
    """Load the sentence-transformer named by ``EPISTEMIC_MODEL``, if any.

    ``app.state.epistemic_model`` and ``app.state.epistemic_tokenizer`` are
    always set; both are ``None`` when the variable is unset, the package is
    missing, or loading fails.
    """
    model = tokenizer = None
    model_name = os.getenv("EPISTEMIC_MODEL", "")
    if not model_name:
        logger.info("EPISTEMIC_MODEL not set; epistemic signals will be zeros.")
    else:
        try:
            from sentence_transformers import SentenceTransformer

            logger.info("Loading epistemic model: %s", model_name)
            model = SentenceTransformer(model_name)
            tokenizer = model.tokenizer
            logger.info("✅ Epistemic model loaded.")
        except ImportError:
            logger.warning(
                "sentence-transformers not installed; epistemic signals will be zeros."
            )
            model = tokenizer = None
        except Exception as exc:
            logger.warning("Failed to load epistemic model: %s", exc)
            # Never publish a half-initialised model/tokenizer pair.
            model = tokenizer = None
    app.state.epistemic_model = model
    app.state.epistemic_tokenizer = tokenizer


def _seed_api_keys(active_tracker, api_keys_json: str) -> None:
    """Register the API keys listed in *api_keys_json* with *active_tracker*.

    *api_keys_json* is expected to be a JSON object mapping API key to tier
    name. Malformed JSON, a non-object payload, or an invalid tier is logged
    and skipped rather than aborting startup. Keys are masked in log output.
    """
    try:
        api_keys = json.loads(api_keys_json)
    except json.JSONDecodeError:
        logger.warning(
            "ARF_API_KEYS environment variable is not valid JSON; skipping seeding."
        )
        return
    if not isinstance(api_keys, dict):
        logger.warning(
            "ARF_API_KEYS must be a JSON object mapping key to tier; skipping seeding."
        )
        return

    for key, tier_str in api_keys.items():
        if not isinstance(tier_str, str):
            # e.g. a JSON number or null: treat like any other invalid tier.
            logger.warning(
                "Invalid tier '%s' for key %s, skipping",
                tier_str,
                _mask_secret(key),
            )
            continue
        try:
            tier = Tier(tier_str.lower())
            active_tracker.get_or_create_api_key(key, tier)
            logger.info("Seeded API key for tier %s", tier.value)
        except ValueError:
            logger.warning(
                "Invalid tier '%s' for key %s, skipping",
                tier_str,
                _mask_secret(key),
            )


def _init_usage_tracker(app: FastAPI) -> None:
    """Initialise the usage tracker unless ``ARF_USAGE_TRACKING=false``.

    Raises:
        RuntimeError: if tracker initialisation fails; startup is aborted.
    """
    if os.getenv("ARF_USAGE_TRACKING", "true").lower() == "false":
        logger.info("Usage tracking disabled by ARF_USAGE_TRACKING=false.")
        app.state.usage_tracker = None
        return

    logger.info("Initialising usage tracker...")
    try:
        init_tracker(
            db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
            redis_url=os.getenv("ARF_REDIS_URL"),
        )
        # init_tracker() may rebind the module-level ``tracker``; the name
        # imported at the top of this file would then be stale, so re-read it
        # from the module (a no-op if the object is configured in place).
        # NOTE(review): confirm against app.core.usage_tracker.
        from app.core import usage_tracker as _usage_tracker_module

        live_tracker = getattr(_usage_tracker_module, "tracker", None)
        active_tracker = live_tracker if live_tracker is not None else tracker

        _seed_api_keys(active_tracker, os.getenv("ARF_API_KEYS", "{}"))
        app.state.usage_tracker = active_tracker
        logger.info("✅ Usage tracker ready.")
    except Exception as exc:
        logger.critical(
            "Failed to initialise usage tracker: %s", exc, exc_info=True
        )
        raise RuntimeError("Usage tracker initialisation failed") from exc


def _start_wilson_monitor(stop_event: threading.Event) -> None:
    """Start the daemon thread that feeds agreement counters to the Wilson monitor.

    Every ``_WILSON_REFRESH_SECONDS`` the thread reads the
    ``arf_rust_agreement_total`` samples (``result=agreed`` / ``diverged``)
    from the default Prometheus registry and passes them to
    ``wilson_monitor.update``. The loop exits once *stop_event* is set.
    Best-effort: if the monitor cannot be started a warning is logged.
    """
    try:
        from app.services.wilson_monitor import update as wilson_update
        from prometheus_client import REGISTRY

        def _wilson_updater() -> None:
            while not stop_event.is_set():
                try:
                    agreed = REGISTRY.get_sample_value(
                        "arf_rust_agreement_total", {"result": "agreed"}
                    ) or 0.0
                    diverged = REGISTRY.get_sample_value(
                        "arf_rust_agreement_total", {"result": "diverged"}
                    ) or 0.0
                    wilson_update(int(agreed), int(diverged))
                except Exception as exc:
                    logger.debug("Wilson updater error: %s", exc)
                # Interruptible sleep: returns early when shutdown is signalled.
                stop_event.wait(_WILSON_REFRESH_SECONDS)

        threading.Thread(
            target=_wilson_updater, name="wilson-updater", daemon=True
        ).start()
        logger.info("✅ Wilson monitor background updater started.")
    except Exception as exc:
        logger.warning("Wilson monitor initialization skipped: %s", exc)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager.

    Everything before the ``yield`` runs at startup; everything after it runs
    at shutdown.

    Startup order:
        1. Risk engine (failure aborts startup with ``RuntimeError``).
        2. Persisted conjugate posterior state (``BetaStateDB`` rows).
        3. OpenTelemetry tracing (console exporter).
        4. Policy engine, RAG memory, and epistemic model.
        5. Usage tracker (failure aborts startup with ``RuntimeError``).
        6. Wilson confidence monitor background updater.

    Steps 1-4 only run when ``agentic_reliability_framework`` is importable.
    Steps 2-4 and 6 are best-effort: failures are logged and startup continues.

    Shutdown signals the Wilson updater thread to stop, so repeated lifespan
    cycles (e.g. in tests) do not accumulate background threads.
    """
    logger.info("🚀 Starting ARF API Control Plane")
    logger.debug("Python path: %s", sys.path)

    if ARF_AVAILABLE:
        _init_risk_engine(app)
        _restore_posteriors(app)
        # NOTE(review): tracing is only set up when ARF is installed, matching
        # the original control flow — confirm that coupling is intended.
        _init_tracing(app)
        _init_policy_engine(app)
        _init_rag_memory(app)
        _init_epistemic_model(app)
    else:
        logger.warning(
            "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled."
        )
        # These attributes already use ``None`` to mean "unavailable" on the
        # ARF-installed path; define them here too so reading them does not
        # raise AttributeError. ``risk_engine`` is deliberately left unset
        # because it has no ``None`` contract elsewhere in this file.
        app.state.policy_engine = None
        app.state.rag_graph = None
        app.state.epistemic_model = None
        app.state.epistemic_tokenizer = None

    _init_usage_tracker(app)

    stop_wilson = threading.Event()
    _start_wilson_monitor(stop_wilson)

    try:
        yield
    finally:
        stop_wilson.set()
        logger.info("🛑 Shutting down ARF API")
|
|
|
|
def create_app() -> FastAPI:
    """
    Build and configure the FastAPI application.

    Middleware is registered in this order:
        1. CORS (restricted to the public front-end origin).
        2. Rate limiting (only if slowapi is installed).
        3. Prometheus instrumentation and exposition (only if available).

    All API routers are included under the ``/api/v1`` prefix except
    memory (``/v1/memory``) and webhooks (no prefix).

    A simple ``/health`` endpoint is provided for liveness probes.

    Returns:
        The configured application, using ``lifespan`` for startup/shutdown.
    """
    app = FastAPI(
        title=settings.app_name,
        version="0.5.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        description="Agentic Reliability Framework (ARF) API",
    )

    # NOTE(review): Starlette runs the most recently added middleware
    # outermost, so responses produced by the rate limiter may not pass
    # through CORS — confirm whether 429 responses need CORS headers.
    allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    logger.debug("CORS middleware configured")

    if SLOWAPI_AVAILABLE:
        app.state.limiter = limiter
        app.add_exception_handler(
            RateLimitExceeded, _rate_limit_exceeded_handler
        )
        app.add_middleware(SlowAPIMiddleware)
        logger.debug("Rate limiter middleware configured")
    else:
        logger.debug("Rate limiter disabled (slowapi not installed)")

    if PROMETHEUS_AVAILABLE:
        Instrumentator().instrument(app).expose(app)
        logger.debug("Prometheus instrumentator configured")
    else:
        logger.debug("Prometheus instrumentator disabled (module not installed)")

    # (router, prefix, tag) in registration order; the order is preserved
    # because it determines route-matching precedence.
    router_table = (
        (routes_incidents.router, "/api/v1", "incidents"),
        (routes_risk.router, "/api/v1", "risk"),
        (routes_intents.router, "/api/v1", "intents"),
        (routes_history.router, "/api/v1", "history"),
        (routes_governance.router, "/api/v1", "governance"),
        (routes_memory.router, "/v1/memory", "memory"),
        (routes_admin.router, "/api/v1", "admin"),
        (routes_payments.router, "/api/v1", "payments"),
        (webhooks.router, "", "webhooks"),  # mounted at the root, no prefix
        (routes_users.router, "/api/v1", "users"),
        (routes_pricing.router, "/api/v1", "pricing"),
    )
    for router, prefix, tag in router_table:
        app.include_router(router, prefix=prefix, tags=[tag])
    logger.debug("All API routers included")

    @app.get("/health", tags=["health"])
    async def health() -> Dict[str, str]:
        """Liveness probe — returns 200 when the application is running."""
        return {"status": "ok"}

    return app


# Module-level ASGI application object, built once at import time.
app = create_app()
|
|