Spaces:

ARF-AI
/

Agentic-Reliability-Framework-API

Build error

App Files Files Community

Agentic-Reliability-Framework-API / app /services /risk_service.py

petter2025

Upload folder using huggingface_hub (#3)

6d20eab about 1 month ago

raw

history blame contribute delete

14.1 kB

	"""
	Risk service – integrates ARF risk engine, policy engine, and decision engine.
	Deterministic, no random fallbacks, explicit error handling.

	Version: 2026-05-04 – added Prometheus metrics for observability.
	"""

	import json
	import logging
	import os
	import time
	from typing import Optional, List, Dict, Any

	from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
	from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
	from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
	from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
	from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
	from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
	from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk

	# ── optional tracing ─────────────────────────────────────────
	# Bind a module-level tracer when the OpenTelemetry API can be imported;
	# otherwise leave it as None so callers can skip span creation.
	try:
	    from opentelemetry import trace
	    _tracer = trace.get_tracer(__name__)
	except ImportError:
	    _tracer = None

	# Tracing counts as available only when a tracer object was obtained.
	OTEL_AVAILABLE = _tracer is not None

	# ── Prometheus metrics (always registered; no‑op if not scraped) ─
	from prometheus_client import Counter, Histogram

	# Number of evaluation calls, labelled by serving engine and outcome.
	_EVAL_COUNTER = Counter(
	    name="arf_evaluations_total",
	    documentation="Total evaluation calls (intent + healing), partitioned by engine and status.",
	    labelnames=["engine", "status"],
	)

	# Wall-clock latency of evaluation calls, labelled by serving engine.
	_EVAL_DURATION = Histogram(
	    name="arf_evaluation_duration_seconds",
	    documentation="End‑to‑end latency of evaluation calls.",
	    labelnames=["engine"],
	    buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0),
	)

	# Outcome of comparing Rust and Python policy results; the "result" label
	# takes the value "agreed" or "diverged".
	_RUST_AGREEMENT = Counter(
	    name="arf_rust_agreement_total",
	    documentation="Agreement between Rust enforcer and Python policy evaluation.",
	    labelnames=["result"],
	)

	# ── optional Rust enforcer (shadow mode) ──────────────────────
	# True only when the operator opted in via ARF_USE_RUST_ENFORCER=true AND the
	# arf_enforcer extension module could be imported.
	_RUST_ENFORCER_AVAILABLE = False
	_rust_evaluator = None  # singleton per process, created lazily
	_rust_policy_json: Optional[str] = None  # policy JSON handed to the evaluator

	if os.getenv("ARF_USE_RUST_ENFORCER", "false").lower() == "true":
	    try:
	        import arf_enforcer
	        _RUST_ENFORCER_AVAILABLE = True
	    except ImportError as exc:
	        # The operator explicitly asked for the enforcer, so report why it
	        # stays disabled instead of failing silently. The module-level
	        # ``logger`` is bound later in this file, hence the direct lookup.
	        logging.getLogger(__name__).warning(
	            "ARF_USE_RUST_ENFORCER is enabled but arf_enforcer could not be imported: %s",
	            exc,
	        )

	# Default OSS policy tree: a conjunction of an allowed-region rule, a
	# forbidden-resource-type rule, and a maximum-permission-level rule.
	# NOTE(review): intended to mirror the hard-coded rules of the Python
	# PolicyEvaluator - keep the two in sync when either changes.
	_OSS_POLICY_TREE_JSON = json.dumps({
	    "And": [
	        {"Atomic": {"RegionAllowed": {"allowed_regions": ["eastus"]}}},
	        {"Atomic": {"ResourceTypeRestricted": {
	            "forbidden_types": ["DATABASE_DROP", "FULL_ROLLOUT", "SYSTEM_SHUTDOWN", "SECRET_ROTATION"]
	        }}},
	        {"Atomic": {"MaxPermissionLevel": {"max_level": "admin"}}}
	    ]
	})


	def _ensure_rust_evaluator() -> bool:
	    """
	    Lazily initialise the process-wide Rust policy evaluator.

	    The evaluator is built from ``_OSS_POLICY_TREE_JSON`` on first use and
	    cached in the module global ``_rust_evaluator``. A failed construction is
	    retried on the next call.

	    Returns
	    -------
	    bool
	        True when an evaluator is ready to use; False when the enforcer is
	        not available or its construction failed.
	    """
	    global _rust_evaluator, _rust_policy_json
	    if _rust_evaluator is not None:
	        return True
	    if not _RUST_ENFORCER_AVAILABLE:
	        return False
	    try:
	        _rust_policy_json = _OSS_POLICY_TREE_JSON
	        _rust_evaluator = arf_enforcer.PyPolicyEvaluator(_rust_policy_json)
	        return True
	    except Exception:
	        _rust_evaluator = None
	        # Surface the failure instead of swallowing it. Because the call is
	        # retried on every evaluation, only the first failure is reported at
	        # WARNING (with traceback); repeats go to DEBUG to avoid log spam.
	        if getattr(_ensure_rust_evaluator, "_failure_logged", False):
	            logger.debug("Rust policy evaluator initialisation failed again", exc_info=True)
	        else:
	            _ensure_rust_evaluator._failure_logged = True
	            logger.warning(
	                "Rust policy evaluator initialisation failed; shadow evaluation disabled",
	                exc_info=True,
	            )
	        return False


	# Module-level logger named after this module; used by the evaluation
	# functions below for divergence warnings and debug/error diagnostics.
	logger = logging.getLogger(__name__)


	def evaluate_intent(
	    engine: RiskEngine,
	    intent: InfrastructureIntent,
	    cost_estimate: Optional[float],
	    policy_violations: List[str]
	) -> dict:
	    """
	    Evaluate an infrastructure intent using the Bayesian risk engine.

	    When the Rust enforcer is available, the policy evaluation is also run
	    through it in shadow mode: the Rust result is compared with
	    ``policy_violations``, the outcome is counted in the
	    ``arf_rust_agreement_total`` metric, and any divergence is logged.
	    Shadow failures never affect the returned result.

	    When additionally ``ARF_RUST_CANARY`` is "true" and the registry sample
	    ``arf_rust_agreement_lower_bound`` exceeds 0.9999, the Rust violations
	    replace ``policy_violations`` for the risk calculation.

	    Parameters
	    ----------
	    engine : RiskEngine
	        Initialised ARF Bayesian risk engine.
	    intent : InfrastructureIntent
	        The infrastructure request to evaluate.
	    cost_estimate : float or None
	        Estimated cost forwarded to the risk engine and the Rust evaluator.
	    policy_violations : list[str]
	        Pre-computed policy violation strings (from the Python evaluator).

	    Returns
	    -------
	    dict
	        Keys: risk_score, explanation, contributions.

	    Raises
	    ------
	    Exception
	        Whatever ``engine.calculate_risk`` raises is propagated unchanged
	        after the call has been counted with ``status="error"``.
	    """
	    t0 = time.monotonic()
	    span = None
	    if OTEL_AVAILABLE and _tracer:
	        span = _tracer.start_span("risk_service.evaluate_intent")
	        span.set_attribute("intent_type", type(intent).__name__)

	    # Flipped to "success" only once the risk engine has returned, so the
	    # finally-block records the right outcome on every exit path.
	    status = "error"
	    try:
	        # -- Shadow Rust enforcer (best-effort, non-blocking) ----------
	        # Stays None unless the Rust evaluator produced a parsed result.
	        rust_violations = None
	        if _RUST_ENFORCER_AVAILABLE and _ensure_rust_evaluator():
	            try:
	                rust_intent = {
	                    "action": getattr(intent, "intent_type", "unknown"),
	                    "component": getattr(intent, "service_name", "unknown"),
	                    "region": getattr(intent, "region", None),
	                    "resource_type": getattr(intent, "resource_type", None),
	                    "permission_level": getattr(intent, "permission_level", None),
	                    "extra": {}
	                }
	                rust_raw = _rust_evaluator.evaluate(
	                    json.dumps(rust_intent), cost_estimate
	                )
	                rust_violations = json.loads(rust_raw)

	                # Order-insensitive comparison of the two violation sets.
	                agreed = set(rust_violations) == set(policy_violations)
	                _RUST_AGREEMENT.labels(result="agreed" if agreed else "diverged").inc()
	                if not agreed:
	                    msg = (
	                        "Rust enforcer divergence: "
	                        f"Rust={sorted(rust_violations)} Python={sorted(policy_violations)}"
	                    )
	                    logger.warning(msg)
	                    if span:
	                        span.add_event("rust_enforcer_divergence", {
	                            "rust_violations": rust_violations,
	                            "python_violations": policy_violations
	                        })
	            except Exception as exc:
	                logger.debug("Rust enforcer shadow evaluation failed: %s", exc)

	        # -- Automated canary promotion --------------------------------
	        if _RUST_ENFORCER_AVAILABLE and os.getenv("ARF_RUST_CANARY", "false").lower() == "true":
	            if not isinstance(rust_violations, list):
	                # No usable Rust result for this call (shadow run skipped,
	                # failed, or returned a non-list): keep the Python result
	                # rather than relying on a swallowed NameError.
	                logger.debug("Rust canary skipped: no usable Rust result for this call")
	            else:
	                try:
	                    from prometheus_client import REGISTRY
	                    lower = REGISTRY.get_sample_value("arf_rust_agreement_lower_bound", {})
	                    if lower is not None and lower > 0.9999:
	                        policy_violations = rust_violations
	                        if span:
	                            span.set_attribute("rust_enforcer_active", True)
	                except Exception as exc:
	                    logger.debug("Rust canary promotion check failed: %s", exc)

	        # -- Core risk evaluation --------------------------------------
	        score, explanation, contributions = engine.calculate_risk(
	            intent=intent,
	            cost_estimate=cost_estimate,
	            policy_violations=policy_violations
	        )
	        status = "success"

	        if span:
	            span.set_attribute("risk_score", score)
	            if _RUST_ENFORCER_AVAILABLE:
	                span.set_attribute("rust_enforcer_available", True)

	        return {
	            "risk_score": score,
	            "explanation": explanation,
	            "contributions": contributions
	        }
	    finally:
	        # Runs on success and on error alike: record metrics and always
	        # close the span so a failing risk engine cannot leak it.
	        _EVAL_COUNTER.labels(engine="python", status=status).inc()
	        _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
	        if span:
	            span.end()


	def evaluate_healing_decision(
	    event: ReliabilityEvent,
	    policy_engine: PolicyEngine,
	    decision_engine: Optional[DecisionEngine] = None,
	    rag_graph: Optional[RAGGraphMemory] = None,
	    model=None,
	    tokenizer=None,
	) -> Dict[str, Any]:
	    """
	    Evaluate healing actions for a reliability event using decision-theoretic selection.

	    Candidate actions come from ``policy_engine.evaluate_policies`` with its
	    decision engine temporarily switched off; the decision engine then picks
	    the best candidate. Epistemic risk signals from the eclipse probe are
	    passed to the decision engine when a model and tokenizer are supplied,
	    otherwise all-zero signals are used.

	    Parameters
	    ----------
	    event : ReliabilityEvent
	        The incident event containing latency, error rate, etc.
	    policy_engine : PolicyEngine
	        The ARF healing policy engine with configured policies.
	    decision_engine : DecisionEngine, optional
	        If omitted, ``policy_engine.decision_engine`` is used when present;
	        failing that, a default instance is created with ``rag_graph``.
	    rag_graph : RAGGraphMemory, optional
	        Passed to the default DecisionEngine when one has to be created.
	    model, tokenizer : optional
	        Model and tokenizer handed to ``compute_epistemic_risk``.

	    Returns
	    -------
	    dict
	        Keys: risk_score, selected_action, expected_utility, alternatives
	        (at most three), explanation, epistemic_signals. The key
	        raw_decision is present only when candidate actions were evaluated;
	        the "no candidate actions" result omits it and carries
	        ``epistemic_signals=None``.

	    Raises
	    ------
	    Exception
	        Errors from the policy or decision engine propagate unchanged after
	        the call has been counted with ``status="error"``.
	    """
	    t0 = time.monotonic()
	    span = None
	    if OTEL_AVAILABLE and _tracer:
	        span = _tracer.start_span("risk_service.evaluate_healing")

	    # Flipped to "success" right before each return, so the finally-block
	    # records the correct outcome on every exit path.
	    status = "error"
	    try:
	        if span:
	            span.set_attribute("component", event.component)

	        # Resolve the decision engine: explicit argument first, then the one
	        # attached to the policy engine, then a minimal default instance.
	        if decision_engine is None:
	            decision_engine = getattr(policy_engine, "decision_engine", None)
	        if decision_engine is None:
	            logger.debug("No DecisionEngine provided; creating default instance")
	            decision_engine = DecisionEngine(rag_graph=rag_graph)

	        # Get raw candidate actions by temporarily disabling the policy
	        # engine's own decision engine; the flag is always restored.
	        orig_use = policy_engine.use_decision_engine
	        try:
	            policy_engine.use_decision_engine = False
	            raw_actions = policy_engine.evaluate_policies(event)
	        finally:
	            policy_engine.use_decision_engine = orig_use

	        # Nothing triggered: short-circuit with an explicit NO_ACTION result.
	        if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
	            if span:
	                span.set_attribute("selected_action", HealingAction.NO_ACTION.value)
	            no_action_result = {
	                "risk_score": 0.0,
	                "selected_action": HealingAction.NO_ACTION.value,
	                "expected_utility": 0.0,
	                "alternatives": [],
	                "explanation": "No candidate actions triggered.",
	                "epistemic_signals": None,
	            }
	            status = "success"
	            return no_action_result

	        # Build reasoning text from every policy sharing an action with the
	        # candidate set.
	        reasoning_parts = []
	        for policy in policy_engine.policies:
	            if any(a in policy.actions for a in raw_actions):
	                conditions_str = ", ".join(
	                    f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
	                )
	                reasoning_parts.append(
	                    f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
	                )
	        reasoning_text = " ".join(reasoning_parts)

	        # Build evidence text from the event's observed metrics.
	        evidence_text = (
	            f"Component: {event.component}, "
	            f"latency_p99: {event.latency_p99}, "
	            f"error_rate: {event.error_rate}, "
	            f"cpu_util: {event.cpu_util}, "
	            f"memory_util: {event.memory_util}"
	        )

	        # Epistemic signals default to all zeros; they are replaced only when
	        # a model and tokenizer are supplied and the probe succeeds.
	        epistemic_signals = {
	            "entropy": 0.0,
	            "contradiction": 0.0,
	            "evidence_lift": 0.0,
	            "hallucination_risk": 0.0,
	        }
	        if model is not None and tokenizer is not None:
	            try:
	                epistemic_signals = compute_epistemic_risk(
	                    reasoning_text, evidence_text, model, tokenizer
	                )
	            except Exception as e:
	                # Best-effort: a failing probe falls back to zero signals.
	                logger.error("Failed to compute epistemic risk: %s", e)
	        else:
	            logger.debug("Epistemic model/tokenizer not provided; using zero signals")

	        # Run the decision engine to get the best action and alternatives.
	        decision = decision_engine.select_optimal_action(
	            raw_actions, event, component=event.component,
	            epistemic_signals=epistemic_signals
	        )

	        # Risk of the selected action: taken from the first alternative that
	        # matches it, otherwise computed separately.
	        risk_score = next(
	            (alt.risk for alt in decision.alternatives
	             if alt.action == decision.best_action),
	            None,
	        )
	        if risk_score is None:
	            risk_score = decision_engine.compute_risk(
	                decision.best_action, event, event.component)

	        # Format alternatives (top 3 only).
	        alt_list = [
	            {
	                "action": alt.action.value,
	                "expected_utility": alt.utility,
	                "risk": alt.risk,
	            }
	            for alt in decision.alternatives[:3]
	        ]

	        if span:
	            span.set_attribute("risk_score", risk_score)
	            span.set_attribute("selected_action", decision.best_action.value)
	            span.set_attribute("expected_utility", decision.expected_utility)

	        result = {
	            "risk_score": risk_score,
	            "selected_action": decision.best_action.value,
	            "expected_utility": decision.expected_utility,
	            "alternatives": alt_list,
	            "explanation": decision.explanation,
	            "raw_decision": decision.raw_data,
	            "epistemic_signals": epistemic_signals,
	        }
	        status = "success"
	        return result
	    finally:
	        # Metrics and span finalisation run on success and on error alike,
	        # so failures are counted and the span is never leaked.
	        _EVAL_COUNTER.labels(engine="python", status=status).inc()
	        _EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
	        if span:
	            span.end()


	def get_system_risk() -> float:
	    """
	    Deprecated placeholder for a system-wide aggregated risk score.

	    No aggregate is computed: the function raises on every call so that
	    callers never receive a made-up or random value.

	    Raises
	    ------
	    NotImplementedError
	        Unconditionally.
	    """
	    deprecation_notice = (
	        "get_system_risk is deprecated. "
	        "Use component‑level risk evaluation instead."
	    )
	    raise NotImplementedError(deprecation_notice)