Spaces:
Build error
Build error
| """ | |
| ARF API Control Plane — Main Application Entry Point | |
| ==================================================== | |
| The control plane serves as the HTTP layer between the **Agentic Reliability | |
| Framework (ARF)** core engine and external consumers (front-end dashboard, | |
| enterprise clients, and monitoring infrastructure). | |
| It is responsible for: | |
| * **Lifetime management** of the Bayesian risk engine, policy engine, | |
| semantic memory (RAG graph), and epistemic models. | |
| * **Observability** via optional OpenTelemetry tracing and Prometheus metrics | |
| (the latter exposed automatically by ``prometheus-fastapi-instrumentator`` | |
| on ``/metrics``). | |
| * **Rate limiting** and **usage tracking** with atomic quota consumption. | |
| * **CORS** configuration for the public ARF front-end. | |
| * **Database-backed persistence** of the conjugate Bayesian posteriors so | |
| that online learning survives restarts. | |
| * **Automated Rust enforcer canary promotion** via Wilson confidence interval | |
| monitoring of the agreement counters. | |
| All heavy components are loaded **lazily and best-effort** — if a dependency | |
| is missing the API continues to serve health-check and status endpoints, | |
| degrading gracefully rather than crashing. | |
| """ | |
| import logging | |
| import os | |
| import sys | |
| import json | |
| import threading | |
| import time as _time | |
| from contextlib import asynccontextmanager | |
| from typing import Dict | |
| from fastapi import FastAPI | |
| from fastapi.middleware.cors import CORSMiddleware | |
| # ── Optional: Prometheus metrics ───────────────────────────── | |
| try: | |
| from prometheus_fastapi_instrumentator import Instrumentator | |
| PROMETHEUS_AVAILABLE = True | |
| except ImportError: | |
| PROMETHEUS_AVAILABLE = False | |
| Instrumentator = None | |
| # ── Optional: rate-limiting (slowapi) ──────────────────────── | |
| try: | |
| from slowapi import _rate_limit_exceeded_handler | |
| from slowapi.errors import RateLimitExceeded | |
| from slowapi.middleware import SlowAPIMiddleware | |
| SLOWAPI_AVAILABLE = True | |
| except ImportError: | |
| SLOWAPI_AVAILABLE = False | |
| _rate_limit_exceeded_handler = None | |
| RateLimitExceeded = None | |
| SlowAPIMiddleware = None | |
| # ── Core ARF engine (optional but essential for governance) ── | |
| try: | |
| from agentic_reliability_framework.core.governance.risk_engine import RiskEngine | |
| from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine | |
| from agentic_reliability_framework.runtime.memory import create_faiss_index, RAGGraphMemory | |
| from agentic_reliability_framework.runtime.memory.constants import MemoryConstants | |
| ARF_AVAILABLE = True | |
| except ImportError: | |
| ARF_AVAILABLE = False | |
| RiskEngine = None | |
| PolicyEngine = None | |
| create_faiss_index = None | |
| RAGGraphMemory = None | |
| MemoryConstants = None | |
| # ── Usage tracker ──────────────────────────────────────────── | |
| from app.core.usage_tracker import init_tracker, tracker, Tier | |
| from app.api import ( | |
| routes_governance, | |
| routes_history, | |
| routes_incidents, | |
| routes_intents, | |
| routes_risk, | |
| routes_memory, | |
| routes_admin, | |
| routes_payments, | |
| webhooks, | |
| routes_users, | |
| routes_pricing, | |
| ) | |
| from app.api.deps import limiter | |
| from app.core.config import settings | |
# Named logger used by every function in this module.
logger = logging.getLogger("arf.api")

# Configure the root logger once at import time: INFO and above, a
# timestamped "time - name - level - message" layout, written to stdout.
# ``basicConfig`` is a no-op if the root logger already has handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
# Seconds between two refreshes of the Wilson confidence monitor.
_WILSON_UPDATE_INTERVAL_SECONDS = 300


def _mask_api_key(key: object) -> str:
    """Return a log-safe rendering of an API key (last four characters only)."""
    text = str(key)
    return f"...{text[-4:]}" if len(text) > 8 else "***"


def _init_risk_engine(app: FastAPI) -> None:
    """Build the ``RiskEngine`` from environment settings; failure is fatal."""
    hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
    use_hyperpriors = os.getenv("ARF_USE_HYPERPRIORS", "false").lower() == "true"
    logger.info(
        "Initializing RiskEngine — HMC model: %s, hyperpriors: %s",
        hmc_model_path,
        use_hyperpriors,
    )
    try:
        app.state.risk_engine = RiskEngine(
            hmc_model_path=hmc_model_path,
            use_hyperpriors=use_hyperpriors,
            n0=1000,
            hyperprior_weight=0.3,
        )
        logger.info("✅ RiskEngine initialized successfully.")
    except Exception as e:
        logger.exception("💥 Fatal error initializing RiskEngine")
        raise RuntimeError("RiskEngine initialization failed") from e


def _load_tenant_states(app: FastAPI) -> None:
    """Load persisted per-tenant ``(alpha, beta)`` rows into the risk engine.

    Best-effort: any failure (missing database module, query error, unknown
    category value) is logged as a warning and start-up continues.
    """
    try:
        from app.database.session import SessionLocal
        from app.database.models_intents import BetaStateDB, TenantDB
        from agentic_reliability_framework.core.governance.risk_engine import ActionCategory

        db = SessionLocal()
        try:
            tenant_rows = db.query(TenantDB.id).all()
            # With no tenant rows at all, fall back to a single default tenant.
            tenant_ids = [tid for (tid,) in tenant_rows] if tenant_rows else ["__default__"]
            for tid in tenant_ids:
                rows = db.query(BetaStateDB).filter(BetaStateDB.tenant_id == tid).all()
                if rows:
                    state = {ActionCategory(row.category): (row.alpha, row.beta) for row in rows}
                    app.state.risk_engine.load_tenant_state(tid, state)
                    logger.info("Loaded Bayesian state for tenant %s: %s categories.", tid, len(state))
                else:
                    app.state.risk_engine._ensure_tenant(tid)
                    logger.info("No persisted state for tenant %s; using default priors.", tid)
        finally:
            db.close()
    except Exception as e:
        logger.warning("Could not load tenant Beta states: %s", e)


def _init_tracing(app: FastAPI) -> None:
    """Install an OpenTelemetry tracer with a console exporter (best-effort)."""
    try:
        from opentelemetry import trace
        from opentelemetry.sdk.resources import SERVICE_NAME, Resource
        from opentelemetry.sdk.trace import TracerProvider
        from opentelemetry.sdk.trace.export import SimpleSpanProcessor, ConsoleSpanExporter
        from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

        resource = Resource.create({SERVICE_NAME: "arf-api"})
        provider = TracerProvider(resource=resource)
        provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
        trace.set_tracer_provider(provider)
        # NOTE(review): this runs inside the lifespan, i.e. after the app has
        # started; instrumenting this late may be rejected or have no effect
        # depending on the Starlette version — confirm tracing actually works.
        FastAPIInstrumentor.instrument_app(app)
        logger.info("✅ Tracing initialized (console exporter).")
    except Exception as e:
        logger.warning("Tracing initialization skipped: %s", e)


def _init_policy_and_memory(app: FastAPI) -> None:
    """Create the policy engine and RAG memory; each falls back to ``None``."""
    try:
        app.state.policy_engine = PolicyEngine()
        logger.info("✅ PolicyEngine initialized successfully.")
    except Exception as e:
        logger.warning("PolicyEngine initialization failed: %s", e)
        app.state.policy_engine = None

    try:
        faiss_index = create_faiss_index(dim=MemoryConstants.VECTOR_DIM)
        app.state.rag_graph = RAGGraphMemory(faiss_index)
        logger.info("✅ RAGGraphMemory initialized successfully.")
    except Exception as e:
        logger.warning("RAGGraphMemory initialization failed: %s", e)
        app.state.rag_graph = None


def _init_epistemic_model(app: FastAPI) -> None:
    """Load the sentence-transformer named by ``EPISTEMIC_MODEL``, if any."""
    # Start from the "disabled" state so every exit path leaves both set.
    app.state.epistemic_model = None
    app.state.epistemic_tokenizer = None

    epistemic_model_name = os.getenv("EPISTEMIC_MODEL", "")
    if not epistemic_model_name:
        logger.info("EPISTEMIC_MODEL not set; epistemic signals will be zeros.")
        return

    try:
        from sentence_transformers import SentenceTransformer

        logger.info("Loading epistemic model: %s", epistemic_model_name)
        model = SentenceTransformer(epistemic_model_name)
        tokenizer = model.tokenizer
    except ImportError:
        logger.warning(
            "sentence-transformers not installed; epistemic signals will be zeros."
        )
        return
    except Exception as e:
        logger.warning("Failed to load epistemic model: %s", e)
        return

    app.state.epistemic_model = model
    app.state.epistemic_tokenizer = tokenizer
    logger.info("✅ Epistemic model loaded.")


def _seed_api_keys() -> None:
    """Seed API keys from the ``ARF_API_KEYS`` JSON object (``{key: tier}``).

    Malformed JSON, a non-object payload, or an unknown tier is logged and
    skipped rather than aborting start-up.
    """
    try:
        api_keys = json.loads(os.getenv("ARF_API_KEYS", "{}"))
    except json.JSONDecodeError:
        logger.warning(
            "ARF_API_KEYS environment variable is not valid JSON; skipping seeding."
        )
        return

    if not isinstance(api_keys, dict):
        logger.warning(
            "ARF_API_KEYS must be a JSON object mapping API key to tier; skipping seeding."
        )
        return

    for key, tier_str in api_keys.items():
        try:
            # str() so a non-string tier (e.g. a JSON number) is reported as
            # an invalid tier instead of raising AttributeError on .lower().
            tier = Tier(str(tier_str).lower())
            tracker.get_or_create_api_key(key, tier)
            logger.info("Seeded API key for tier %s", tier.value)
        except ValueError:
            # Never write the raw key to the logs.
            logger.warning(
                "Invalid tier '%s' for key %s, skipping",
                tier_str,
                _mask_api_key(key),
            )


def _init_usage_tracker(app: FastAPI) -> None:
    """Initialise the usage tracker unless ``ARF_USAGE_TRACKING=false``.

    Raises:
        RuntimeError: if tracking is enabled and initialisation fails.
    """
    if os.getenv("ARF_USAGE_TRACKING", "true").lower() == "false":
        logger.info("Usage tracking disabled by ARF_USAGE_TRACKING=false.")
        app.state.usage_tracker = None
        return

    logger.info("Initialising usage tracker...")
    try:
        init_tracker(
            db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
            redis_url=os.getenv("ARF_REDIS_URL"),
        )
        # NOTE(review): ``tracker`` was bound by ``from ... import tracker`` at
        # module import. If ``init_tracker`` rebinds the module global instead
        # of mutating the object, this name is stale — verify in usage_tracker.
        _seed_api_keys()
        app.state.usage_tracker = tracker
        logger.info("✅ Usage tracker ready.")
    except Exception as e:
        logger.critical("Failed to initialise usage tracker: %s", e)
        raise RuntimeError("Usage tracker initialisation failed") from e


def _start_wilson_monitor(stop_event: threading.Event) -> None:
    """Start the daemon thread feeding agreement counters to the Wilson monitor.

    The thread reads ``arf_rust_agreement_total`` for the ``agreed`` and
    ``diverged`` labels from the Prometheus registry, passes both to
    ``wilson_update``, then waits ``_WILSON_UPDATE_INTERVAL_SECONDS``. It exits
    as soon as *stop_event* is set. Start-up failures are logged and ignored.
    """
    try:
        from app.services.wilson_monitor import update as wilson_update
        from prometheus_client import REGISTRY

        def _wilson_updater() -> None:
            while not stop_event.is_set():
                try:
                    agreed = REGISTRY.get_sample_value(
                        'arf_rust_agreement_total', {'result': 'agreed'}
                    ) or 0.0
                    diverged = REGISTRY.get_sample_value(
                        'arf_rust_agreement_total', {'result': 'diverged'}
                    ) or 0.0
                    wilson_update(int(agreed), int(diverged))
                except Exception as e:
                    logger.debug("Wilson updater error: %s", e)
                # wait() instead of sleep() so shutdown interrupts the pause.
                stop_event.wait(_WILSON_UPDATE_INTERVAL_SECONDS)

        threading.Thread(target=_wilson_updater, daemon=True).start()
        logger.info("✅ Wilson monitor background updater started.")
    except Exception as e:
        logger.warning("Wilson monitor initialization skipped: %s", e)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    Application lifespan manager.

    Everything before the ``yield`` runs at start-up; everything after it
    runs at shutdown.

    Initialisation order:

    1. Risk engine (Bayesian scoring + HMC).
    2. Load persisted conjugate posterior state per tenant.
    3. OpenTelemetry tracing (console exporter by default).
    4. Policy engine, RAG memory, and epistemic model.
    5. Usage tracker (SQLite / Redis).
    6. Wilson confidence monitor for Rust enforcer canary promotion.

    Steps 1-4 run only when ``agentic_reliability_framework`` is importable;
    otherwise the corresponding ``app.state`` attributes are set to ``None``.

    Raises:
        RuntimeError: if the risk engine or the (enabled) usage tracker
            cannot be initialised.
    """
    logger.info("🚀 Starting ARF API Control Plane")
    logger.debug("Python path: %s", sys.path)

    if ARF_AVAILABLE:
        _init_risk_engine(app)
        _load_tenant_states(app)
        _init_tracing(app)
        _init_policy_and_memory(app)
        _init_epistemic_model(app)
    else:
        logger.warning(
            "agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled."
        )
        # Give request handlers a defined "disabled" value instead of an
        # AttributeError on ``app.state``.
        app.state.risk_engine = None
        app.state.policy_engine = None
        app.state.rag_graph = None
        app.state.epistemic_model = None
        app.state.epistemic_tokenizer = None

    _init_usage_tracker(app)

    wilson_stop = threading.Event()
    _start_wilson_monitor(wilson_stop)

    try:
        yield
    finally:
        # Let the background updater exit instead of looping until process end.
        wilson_stop.set()
        logger.info("🛑 Shutting down ARF API")
def create_app() -> FastAPI:
    """
    Build and configure the FastAPI application.

    Middleware is registered in this order:

    1. CORS (restricted to the public front-end origin).
    2. Rate limiting (if slowapi is installed).
    3. Prometheus metrics exposition (if available).

    All API routers are included under the ``/api/v1`` prefix except
    memory (``/v1/memory``) and webhooks (root level).
    A simple ``/health`` endpoint is provided for liveness probes.

    Returns:
        The configured application, with ``lifespan`` attached.
    """
    app = FastAPI(
        title=settings.app_name,
        version="0.5.0",
        lifespan=lifespan,
        docs_url="/docs",
        redoc_url="/redoc",
        description="Agentic Reliability Framework (ARF) API",
    )

    # ── CORS ──────────────────────────────────────────────────
    allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
    app.add_middleware(
        CORSMiddleware,
        allow_origins=allowed_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    logger.debug("CORS middleware configured")

    # ── Rate limiter ──────────────────────────────────────────
    if SLOWAPI_AVAILABLE:
        app.state.limiter = limiter
        app.add_exception_handler(
            RateLimitExceeded, _rate_limit_exceeded_handler
        )
        app.add_middleware(SlowAPIMiddleware)
        logger.debug("Rate limiter middleware configured")
    else:
        logger.debug("Rate limiter disabled (slowapi not installed)")

    # ── Prometheus ────────────────────────────────────────────
    if PROMETHEUS_AVAILABLE:
        Instrumentator().instrument(app).expose(app)
        logger.debug("Prometheus instrumentator configured")
    else:
        logger.debug("Prometheus instrumentator disabled (module not installed)")

    # ── API Routers ───────────────────────────────────────────
    # (router, URL prefix, OpenAPI tag), in registration order. An empty
    # prefix mounts the router at the root, which is include_router's default.
    router_table = (
        (routes_incidents.router, "/api/v1", "incidents"),
        (routes_risk.router, "/api/v1", "risk"),
        (routes_intents.router, "/api/v1", "intents"),
        (routes_history.router, "/api/v1", "history"),
        (routes_governance.router, "/api/v1", "governance"),
        (routes_memory.router, "/v1/memory", "memory"),
        (routes_admin.router, "/api/v1", "admin"),
        (routes_payments.router, "/api/v1", "payments"),
        (webhooks.router, "", "webhooks"),
        (routes_users.router, "/api/v1", "users"),
        (routes_pricing.router, "/api/v1", "pricing"),
    )
    for router, prefix, tag in router_table:
        app.include_router(router, prefix=prefix, tags=[tag])
    logger.debug("All API routers included")

    # Register the probe on the app; without the route decorator the handler
    # is defined but unreachable.
    @app.get("/health")
    async def health() -> Dict[str, str]:
        """Liveness probe — returns 200 when the application is running."""
        return {"status": "ok"}

    return app
# Module-level application instance, built once when this module is imported.
# NOTE(review): presumably the object the ASGI server is pointed at — confirm
# against the deployment command.
app = create_app()