Upload folder using huggingface_hub
#3
by petter2025 - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
- .dockerignore +8 -0
- .gitignore +16 -33
- Dockerfile +2 -1
- README.md +90 -72
- alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py +47 -0
- app/api/deps.py +45 -63
- app/api/routes_admin.py +36 -25
- app/api/routes_governance.py +190 -71
- app/api/routes_incidents.py +186 -54
- app/api/routes_memory.py +5 -1
- app/api/routes_payments.py +7 -5
- app/api/routes_pricing.py +104 -0
- app/api/routes_risk.py +16 -19
- app/api/routes_users.py +7 -19
- app/api/webhooks.py +2 -1
- app/core/config.py +3 -0
- app/core/usage_tracker.py +257 -93
- app/database/models_intents.py +48 -6
- app/database/session.py +1 -14
- app/main.py +207 -67
- app/models/__init__.py +1 -1
- app/models/incident_models.py +3 -2
- app/models/infrastructure_intents.py +7 -40
- app/models/intent_models.py +1 -1
- app/models/risk_models.py +1 -1
- app/services/incident_service.py +2 -1
- app/services/intent_adapter.py +162 -65
- app/services/intent_service.py +2 -1
- app/services/intent_store.py +7 -3
- app/services/outcome_service.py +117 -57
- app/services/risk_service.py +348 -69
- app/services/wilson_monitor.py +56 -0
- docker-compose.test.yml +12 -0
- docs/authentication.md +25 -0
- docs/development.md +55 -0
- docs/docs_endpoints.md +314 -0
- docs/endpoints.md +34 -0
- docs/examples.md +54 -0
- docs/index.md +16 -0
- monitor.sh +18 -0
- render.yaml +19 -0
- requirements-dev.txt +3 -0
- requirements.txt +9 -5
- runtime.txt +2 -0
- seed_rag_data.py +67 -0
- start.sh +68 -0
- tests/conftest.py +128 -0
- tests/test_deps.py +15 -0
- tests/test_governance.py +71 -0
- tests/test_healing_endpoint.py +21 -0
.dockerignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
__pycache__
|
| 3 |
+
*.pyc
|
| 4 |
+
.env
|
| 5 |
+
venv
|
| 6 |
+
.pytest_cache
|
| 7 |
+
.coverage
|
| 8 |
+
htmlcov
|
.gitignore
CHANGED
|
@@ -1,50 +1,33 @@
|
|
| 1 |
# Python
|
| 2 |
__pycache__/
|
| 3 |
-
*.
|
| 4 |
-
*
|
| 5 |
-
*.
|
| 6 |
.Python
|
| 7 |
-
|
| 8 |
-
develop-eggs/
|
| 9 |
-
dist/
|
| 10 |
-
downloads/
|
| 11 |
-
eggs/
|
| 12 |
-
.eggs/
|
| 13 |
-
lib/
|
| 14 |
-
lib64/
|
| 15 |
-
parts/
|
| 16 |
-
sdist/
|
| 17 |
-
var/
|
| 18 |
-
wheels/
|
| 19 |
-
*.egg-info/
|
| 20 |
-
.installed.cfg
|
| 21 |
-
*.egg
|
| 22 |
|
| 23 |
-
# Virtual
|
| 24 |
venv/
|
| 25 |
env/
|
| 26 |
ENV/
|
| 27 |
-
.env/
|
| 28 |
.venv/
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# IDE
|
| 31 |
.vscode/
|
| 32 |
.idea/
|
| 33 |
*.swp
|
| 34 |
*.swo
|
| 35 |
-
*~
|
| 36 |
|
| 37 |
# OS
|
| 38 |
.DS_Store
|
| 39 |
-
.
|
| 40 |
-
.
|
| 41 |
-
|
| 42 |
-
.
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
# Hugging Face Spaces
|
| 47 |
-
data/
|
| 48 |
-
models/
|
| 49 |
-
logs/
|
| 50 |
-
*.log
|
|
|
|
| 1 |
# Python
|
| 2 |
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
*.pyo
|
| 5 |
+
*.pyd
|
| 6 |
.Python
|
| 7 |
+
*.so
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
# Virtual environments
|
| 10 |
venv/
|
| 11 |
env/
|
| 12 |
ENV/
|
|
|
|
| 13 |
.venv/
|
| 14 |
|
| 15 |
+
# Build artifacts
|
| 16 |
+
dist/
|
| 17 |
+
build/
|
| 18 |
+
*.egg-info/
|
| 19 |
+
|
| 20 |
# IDE
|
| 21 |
.vscode/
|
| 22 |
.idea/
|
| 23 |
*.swp
|
| 24 |
*.swo
|
|
|
|
| 25 |
|
| 26 |
# OS
|
| 27 |
.DS_Store
|
| 28 |
+
.env
|
| 29 |
+
test.db
|
| 30 |
+
venv
|
| 31 |
+
.coverage
|
| 32 |
+
monitor.log
|
| 33 |
+
monitor_loop.log
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
FROM python:3.12-slim
|
|
|
|
| 2 |
WORKDIR /app
|
| 3 |
COPY requirements.txt .
|
| 4 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 5 |
COPY . .
|
| 6 |
-
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 1 |
FROM python:3.12-slim
|
| 2 |
+
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
|
| 3 |
WORKDIR /app
|
| 4 |
COPY requirements.txt .
|
| 5 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 6 |
COPY . .
|
| 7 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
CHANGED
|
@@ -1,103 +1,121 @@
|
|
| 1 |
-
-
|
| 2 |
-
title: Agentic Reliability Framework (ARF) v4 – Public API Demo
|
| 3 |
-
emoji: 🤖
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: green
|
| 6 |
-
sdk: docker
|
| 7 |
-
python_version: '3.10'
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
-
--
|
| 23 |
|
| 24 |
-
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
| **📚 API Docs** | [https://a-r-f-arf-sandbox-api.hf.space/docs](https://a-r-f-arf-sandbox-api.hf.space/docs) |
|
| 29 |
-
| **🧪 Live Demo** | [Gradio Dashboard](https://a-r-f-arf-sandbox-api.hf.space/) |
|
| 30 |
-
| **📦 Public Spec** | [github.com/arf-foundation/arf-spec](https://github.com/arf-foundation/arf-spec) |
|
| 31 |
-
| **📅 Book a Call** | [Calendly](https://calendly.com/petter2025us/30min) |
|
| 32 |
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
|
| 38 |
-
import requests
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
json={
|
| 43 |
-
"service_name": "payment-gateway",
|
| 44 |
-
"event_type": "latency_spike",
|
| 45 |
-
"severity": "high",
|
| 46 |
-
"metrics": {"latency_p99": 450, "error_rate": 0.12}
|
| 47 |
-
}
|
| 48 |
-
)
|
| 49 |
-
print(response.json())
|
| 50 |
```
|
| 51 |
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
* risk\_factors: additive contributions from conjugate prior, hyperprior, and HMC
|
| 57 |
-
|
| 58 |
-
* recommended\_action: approve, deny, or escalate
|
| 59 |
-
|
| 60 |
-
* decision\_trace: expected losses and variance
|
| 61 |
-
|
| 62 |
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
-----------------------------------------
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
* **Semantic Memory** – FAISS‑based retrieval of similar past incidents.
|
| 71 |
-
|
| 72 |
-
* **Expected Loss Minimisation** – Chooses approve/deny/escalate by minimising cost-weighted risk, not static thresholds.
|
| 73 |
-
|
| 74 |
-
* **Multi‑Agent Orchestration** – Anomaly detection, root cause, forecasting.
|
| 75 |
-
|
| 76 |
|
| 77 |
-
|
| 78 |
-
---------------
|
| 79 |
|
| 80 |
```text
|
| 81 |
-
|
| 82 |
-
↓
|
| 83 |
-
HealingIntent ← Decision (Expected Loss)
|
| 84 |
```
|
| 85 |
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
| 89 |
-
--------------------
|
| 90 |
|
| 91 |
```bash
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
```
|
| 95 |
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
-
|
| 99 |
-
-----
|
| 100 |
|
| 101 |
-
|
|
|
|
| 102 |
|
| 103 |
-
Learn more at [github.com/arf-foundation](https://github.com/arf-foundation) and request access via petter2025us@outlook.com.
|
|
|
|
| 1 |
+
# arf-api
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
ARF API Control Plane (FastAPI)
|
| 4 |
|
| 5 |
+
## Live Demo
|
| 6 |
|
| 7 |
+
The API is deployed and accessible at:
|
| 8 |
+
- **Base URL**: [https://a-r-f-agentic-reliability-framework-api.hf.space](https://a-r-f-agentic-reliability-framework-api.hf.space)
|
| 9 |
+
- **Interactive Documentation**: [https://a-r-f-agentic-reliability-framework-api.hf.space/docs](https://a-r-f-agentic-reliability-framework-api.hf.space/docs)
|
| 10 |
|
| 11 |
+
## Quick Start (Local Development)
|
| 12 |
|
| 13 |
+
1. **Install dependencies**:
|
| 14 |
+
```bash
|
| 15 |
+
pip install -r requirements.txt
|
| 16 |
+
```
|
| 17 |
|
| 18 |
+
Note: `requirements.txt` installs `agentic-reliability-framework` directly from the project's Git repository.
|
| 19 |
|
| 20 |
+
2. **Set environment variables** (optional, in `.env`):
|
| 21 |
|
| 22 |
+
```text
|
| 23 |
+
ARF_HMC_MODEL – path to HMC model JSON (default: models/hmc_model.json)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
ARF_USE_HYPERPRIORS – true/false
|
| 26 |
|
| 27 |
+
API_KEY – optional (currently not enforced)
|
| 28 |
+
```
|
| 29 |
|
| 30 |
+
3. **Run the app locally**:
|
|
|
|
| 31 |
|
| 32 |
+
```bash
|
| 33 |
+
uvicorn app.main:app --reload --port 8000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
```
|
| 35 |
|
| 36 |
+
4. **Health check**:
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
GET http://localhost:8000/health
|
| 40 |
+
```
|
| 41 |
|
| 42 |
+
## Causal Explainer Endpoint
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
The ARF API includes a heuristic causal explainer that evaluates the impact of proposed healing actions using deterministic rules. This module provides counterfactual reasoning without requiring a fitted causal model or external ML dependencies.
|
| 45 |
|
| 46 |
+
The explainer estimates how system metrics such as latency would change if a different action were taken.
|
|
|
|
| 47 |
|
| 48 |
+
### Mathematical Model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
+
The counterfactual outcome is computed as:
|
|
|
|
| 51 |
|
| 52 |
```text
|
| 53 |
+
counterfactual_outcome = factual_outcome * (1 + effect_frac)
|
|
|
|
|
|
|
| 54 |
```
|
| 55 |
|
| 56 |
+
Where:
|
| 57 |
+
|
| 58 |
+
- `effect_frac` is a predefined impact factor based on the action type
|
| 59 |
+
- effects are multiplicative
|
| 60 |
+
- a fixed ±10% uncertainty interval is applied to the estimated outcome
|
| 61 |
|
| 62 |
+
### Example Request
|
|
|
|
| 63 |
|
| 64 |
```bash
|
| 65 |
+
curl -X POST "http://localhost:8000/api/v1/v1/incidents/evaluate" -H "Content-Type: application/json" -d '{
|
| 66 |
+
"component": "checkout-service",
|
| 67 |
+
"latency_p99": 600,
|
| 68 |
+
"error_rate": 0.2,
|
| 69 |
+
"service_mesh": "default"
|
| 70 |
+
}'
|
| 71 |
+
```
|
| 72 |
+
|
| 73 |
+
### Example Response
|
| 74 |
+
|
| 75 |
+
```json
|
| 76 |
+
{
|
| 77 |
+
"healing_intent": {
|
| 78 |
+
"action": "restart_container",
|
| 79 |
+
"component": "checkout-service",
|
| 80 |
+
"parameters": {},
|
| 81 |
+
"justification": "Causal: If we apply restart_container instead of no_action, latency would change from 600.00 to 510.00 (Δ = -90.00). Based on heuristic causal model.",
|
| 82 |
+
"confidence": 0.85,
|
| 83 |
+
"risk_score": 0.54,
|
| 84 |
+
"status": "oss_advisory_only"
|
| 85 |
+
},
|
| 86 |
+
"causal_explanation": {
|
| 87 |
+
"factual_outcome": 600,
|
| 88 |
+
"counterfactual_outcome": 510,
|
| 89 |
+
"effect": -90,
|
| 90 |
+
"explanation_text": "If we apply restart_container instead of no_action, latency would change from 600.00 to 510.00 (Δ = -90.00). Based on heuristic causal model.",
|
| 91 |
+
"is_model_based": false,
|
| 92 |
+
"warnings": [
|
| 93 |
+
"Using heuristic causal model (no fitted SCM)."
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
"utility_decision": {
|
| 97 |
+
"best_action": "restart_container",
|
| 98 |
+
"expected_utility": 0.5,
|
| 99 |
+
"explanation": "Heuristic decision based on latency/error thresholds"
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
```
|
| 103 |
|
| 104 |
+
### Important Notes
|
| 105 |
+
|
| 106 |
+
- This endpoint is advisory only (`status = oss_advisory_only`)
|
| 107 |
+
- No Structural Causal Model (SCM) is fitted
|
| 108 |
+
- No machine learning models are used
|
| 109 |
+
- All effects are based on predefined heuristics
|
| 110 |
+
|
| 111 |
+
Tests
|
| 112 |
+
-----
|
| 113 |
+
|
| 114 |
+
Run `pytest`. Tests use a temporary SQLite DB (`sqlite:///./test.db`) created by the test fixtures.
|
| 115 |
|
| 116 |
+
Notes
|
| 117 |
+
-----
|
| 118 |
|
| 119 |
+
- The governance endpoints use an in-process `RiskEngine` initialized at startup.
|
| 120 |
+
- The outcome recording endpoint is not implemented in this repository and returns HTTP 501.
|
| 121 |
|
|
|
alembic/versions/d36deffe7fa2_add_beta_state_table_for_conjugate_.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""add beta_state table for conjugate posterior persistence
|
| 2 |
+
|
| 3 |
+
Revision ID: d36deffe7fa2
|
| 4 |
+
Revises: b2218948f541
|
| 5 |
+
Create Date: 2026-05-02 20:36:04.870145
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
from typing import Sequence, Union
|
| 9 |
+
|
| 10 |
+
from alembic import op
|
| 11 |
+
import sqlalchemy as sa
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# revision identifiers, used by Alembic.
|
| 15 |
+
revision: str = 'd36deffe7fa2'
|
| 16 |
+
down_revision: Union[str, Sequence[str], None] = 'b2218948f541'
|
| 17 |
+
branch_labels: Union[str, Sequence[str], None] = None
|
| 18 |
+
depends_on: Union[str, Sequence[str], None] = None
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def upgrade() -> None:
|
| 22 |
+
"""Upgrade schema."""
|
| 23 |
+
# ### commands auto generated by Alembic - please adjust! ###
|
| 24 |
+
op.create_table('beta_state',
|
| 25 |
+
sa.Column('id', sa.Integer(), nullable=False),
|
| 26 |
+
sa.Column('category', sa.String(length=32), nullable=False),
|
| 27 |
+
sa.Column('alpha', sa.Float(), nullable=False),
|
| 28 |
+
sa.Column('beta', sa.Float(), nullable=False),
|
| 29 |
+
sa.Column('updated_at', sa.DateTime(), nullable=True),
|
| 30 |
+
sa.PrimaryKeyConstraint('id')
|
| 31 |
+
)
|
| 32 |
+
op.create_index(op.f('ix_beta_state_category'), 'beta_state', ['category'], unique=True)
|
| 33 |
+
op.create_index(op.f('ix_beta_state_id'), 'beta_state', ['id'], unique=False)
|
| 34 |
+
op.add_column('intent_outcomes', sa.Column('idempotency_key', sa.String(length=128), nullable=True))
|
| 35 |
+
op.create_unique_constraint(None, 'intent_outcomes', ['idempotency_key'])
|
| 36 |
+
# ### end Alembic commands ###
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def downgrade() -> None:
|
| 40 |
+
"""Downgrade schema."""
|
| 41 |
+
# ### commands auto generated by Alembic - please adjust! ###
|
| 42 |
+
op.drop_constraint(None, 'intent_outcomes', type_='unique')
|
| 43 |
+
op.drop_column('intent_outcomes', 'idempotency_key')
|
| 44 |
+
op.drop_index(op.f('ix_beta_state_id'), table_name='beta_state')
|
| 45 |
+
op.drop_index(op.f('ix_beta_state_category'), table_name='beta_state')
|
| 46 |
+
op.drop_table('beta_state')
|
| 47 |
+
# ### end Alembic commands ###
|
app/api/deps.py
CHANGED
|
@@ -4,66 +4,16 @@ from slowapi import Limiter
|
|
| 4 |
from slowapi.util import get_remote_address
|
| 5 |
from app.core.config import settings
|
| 6 |
|
| 7 |
-
#
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def calculate_risk(self, *args, **kwargs):
|
| 15 |
-
return (0.38, "mock", {"conjugate_mean": 0.38})
|
| 16 |
-
def update_outcome(self, *args, **kwargs):
|
| 17 |
-
pass
|
| 18 |
-
|
| 19 |
-
class DecisionEngine:
|
| 20 |
-
def __init__(self, *args, **kwargs):
|
| 21 |
-
pass
|
| 22 |
-
def select_optimal_action(self, *args, **kwargs):
|
| 23 |
-
class Result:
|
| 24 |
-
best_action = type('Action', (), {'value': 'NO_ACTION'})()
|
| 25 |
-
expected_utility = 0.0
|
| 26 |
-
alternatives = []
|
| 27 |
-
explanation = "mock"
|
| 28 |
-
raw_data = {}
|
| 29 |
-
return Result()
|
| 30 |
-
def compute_risk(self, *args, **kwargs):
|
| 31 |
-
return 0.0
|
| 32 |
-
|
| 33 |
-
class LyapunovStabilityController:
|
| 34 |
-
def __init__(self, *args, **kwargs):
|
| 35 |
-
pass
|
| 36 |
-
|
| 37 |
-
class CausalExplainer:
|
| 38 |
-
def __init__(self, *args, **kwargs):
|
| 39 |
-
pass
|
| 40 |
-
|
| 41 |
-
class RAGGraphMemory:
|
| 42 |
-
def __init__(self, *args, **kwargs):
|
| 43 |
-
pass
|
| 44 |
-
def has_historical_data(self):
|
| 45 |
-
return False
|
| 46 |
-
def record_outcome(self, *args, **kwargs):
|
| 47 |
-
pass
|
| 48 |
-
|
| 49 |
-
class ReliabilityEvent:
|
| 50 |
-
def __init__(self, component, latency_p99, error_rate, service_mesh="default"):
|
| 51 |
-
self.component = component
|
| 52 |
-
self.latency_p99 = latency_p99
|
| 53 |
-
self.error_rate = error_rate
|
| 54 |
-
self.service_mesh = service_mesh
|
| 55 |
-
|
| 56 |
-
class HealingAction:
|
| 57 |
-
NO_ACTION = "NO_ACTION"
|
| 58 |
-
RESTART_CONTAINER = "RESTART_CONTAINER"
|
| 59 |
-
SCALE_OUT = "SCALE_OUT"
|
| 60 |
-
ROLLBACK = "ROLLBACK"
|
| 61 |
-
CIRCUIT_BREAKER = "CIRCUIT_BREAKER"
|
| 62 |
-
TRAFFIC_SHIFT = "TRAFFIC_SHIFT"
|
| 63 |
-
ALERT_TEAM = "ALERT_TEAM"
|
| 64 |
-
# ---------------------------------------------------------------------------
|
| 65 |
|
| 66 |
|
|
|
|
| 67 |
def get_db():
|
| 68 |
db = SessionLocal()
|
| 69 |
try:
|
|
@@ -72,10 +22,14 @@ def get_db():
|
|
| 72 |
db.close()
|
| 73 |
|
| 74 |
|
| 75 |
-
limiter
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
-
#
|
| 79 |
_risk_engine = None
|
| 80 |
_decision_engine = None
|
| 81 |
_stability_controller = None
|
|
@@ -84,8 +38,36 @@ _rag_graph = None
|
|
| 84 |
|
| 85 |
|
| 86 |
def _seed_rag_graph(rag):
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
|
| 91 |
def get_rag_graph():
|
|
@@ -122,4 +104,4 @@ def get_causal_explainer():
|
|
| 122 |
global _causal_explainer
|
| 123 |
if _causal_explainer is None:
|
| 124 |
_causal_explainer = CausalExplainer()
|
| 125 |
-
return _causal_explainer
|
|
|
|
| 4 |
from slowapi.util import get_remote_address
|
| 5 |
from app.core.config import settings
|
| 6 |
|
| 7 |
+
# ARF core engine imports
|
| 8 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 9 |
+
from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
|
| 10 |
+
from agentic_reliability_framework.core.governance.stability_controller import LyapunovStabilityController
|
| 11 |
+
from agentic_reliability_framework.core.governance.causal_explainer import CausalExplainer
|
| 12 |
+
from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
|
| 13 |
+
from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
+
# Dependency to get DB session
|
| 17 |
def get_db():
|
| 18 |
db = SessionLocal()
|
| 19 |
try:
|
|
|
|
| 22 |
db.close()
|
| 23 |
|
| 24 |
|
| 25 |
+
# Rate limiter with default limit from settings
|
| 26 |
+
limiter = Limiter(
|
| 27 |
+
key_func=get_remote_address,
|
| 28 |
+
default_limits=[
|
| 29 |
+
settings.RATE_LIMIT])
|
| 30 |
|
| 31 |
|
| 32 |
+
# ARF engine dependencies (singletons for simplicity)
|
| 33 |
_risk_engine = None
|
| 34 |
_decision_engine = None
|
| 35 |
_stability_controller = None
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def _seed_rag_graph(rag):
|
| 41 |
+
"""Seed the RAG graph with historical healing action outcomes."""
|
| 42 |
+
seed_data = [
|
| 43 |
+
("seed_restart_1", "test", HealingAction.RESTART_CONTAINER.value, True, 2),
|
| 44 |
+
("seed_restart_2", "test", HealingAction.RESTART_CONTAINER.value, True, 3),
|
| 45 |
+
("seed_restart_3", "test", HealingAction.RESTART_CONTAINER.value, False, 10),
|
| 46 |
+
("seed_rollback_1", "test", HealingAction.ROLLBACK.value, True, 1),
|
| 47 |
+
("seed_rollback_2", "test", HealingAction.ROLLBACK.value, True, 2),
|
| 48 |
+
("seed_rollback_3", "test", HealingAction.ROLLBACK.value, False, 5),
|
| 49 |
+
("seed_scale_1", "test", HealingAction.SCALE_OUT.value, True, 5),
|
| 50 |
+
("seed_scale_2", "test", HealingAction.SCALE_OUT.value, False, 15),
|
| 51 |
+
("seed_cb_1", "test", HealingAction.CIRCUIT_BREAKER.value, True, 1),
|
| 52 |
+
("seed_cb_2", "test", HealingAction.CIRCUIT_BREAKER.value, True, 2),
|
| 53 |
+
("seed_ts_1", "test", HealingAction.TRAFFIC_SHIFT.value, True, 4),
|
| 54 |
+
("seed_ts_2", "test", HealingAction.TRAFFIC_SHIFT.value, False, 8),
|
| 55 |
+
]
|
| 56 |
+
for inc_id, comp, action, success, res_time in seed_data:
|
| 57 |
+
event = ReliabilityEvent(
|
| 58 |
+
component=comp,
|
| 59 |
+
latency_p99=500,
|
| 60 |
+
error_rate=0.1,
|
| 61 |
+
service_mesh="default"
|
| 62 |
+
)
|
| 63 |
+
rag.record_outcome(
|
| 64 |
+
incident_id=inc_id,
|
| 65 |
+
event=event,
|
| 66 |
+
action_taken=action,
|
| 67 |
+
success=success,
|
| 68 |
+
resolution_time_minutes=res_time
|
| 69 |
+
)
|
| 70 |
+
print("Seeded RAG graph with historical data", file=sys.stderr)
|
| 71 |
|
| 72 |
|
| 73 |
def get_rag_graph():
|
|
|
|
| 104 |
global _causal_explainer
|
| 105 |
if _causal_explainer is None:
|
| 106 |
_causal_explainer = CausalExplainer()
|
| 107 |
+
return _causal_explainer
|
app/api/routes_admin.py
CHANGED
|
@@ -4,25 +4,26 @@ These endpoints should be protected (e.g., by an admin API key) in production.
|
|
| 4 |
"""
|
| 5 |
from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
|
| 6 |
from pydantic import BaseModel
|
| 7 |
-
from typing import Optional
|
| 8 |
from datetime import datetime
|
| 9 |
import uuid
|
| 10 |
-
|
| 11 |
from app.core.usage_tracker import tracker, Tier
|
| 12 |
|
| 13 |
router = APIRouter(prefix="/admin", tags=["admin"])
|
| 14 |
-
|
| 15 |
# Simple in‑memory admin key (replace with proper auth in production)
|
| 16 |
ADMIN_API_KEY = "admin_secret_change_me"
|
| 17 |
|
|
|
|
| 18 |
def verify_admin(admin_key: str = Query(..., alias="admin_key")):
|
| 19 |
if admin_key != ADMIN_API_KEY:
|
| 20 |
raise HTTPException(status_code=403, detail="Invalid admin key")
|
| 21 |
return True
|
| 22 |
|
|
|
|
| 23 |
class CreateKeyRequest(BaseModel):
|
| 24 |
tier: str
|
| 25 |
|
|
|
|
| 26 |
class UpdateTierRequest(BaseModel):
|
| 27 |
tier: str
|
| 28 |
|
|
@@ -30,20 +31,20 @@ class UpdateTierRequest(BaseModel):
|
|
| 30 |
@router.post("/keys", dependencies=[Depends(verify_admin)])
|
| 31 |
async def create_api_key(req: CreateKeyRequest):
|
| 32 |
if req.tier not in [t.value for t in Tier]:
|
| 33 |
-
raise HTTPException(
|
|
|
|
| 34 |
new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
|
| 35 |
tier_enum = Tier(req.tier)
|
| 36 |
tracker.get_or_create_api_key(new_key, tier_enum)
|
| 37 |
return {"api_key": new_key, "tier": req.tier}
|
| 38 |
|
| 39 |
|
| 40 |
-
@router.get("/keys", dependencies=[Depends(verify_admin)])
|
| 41 |
async def list_api_keys(limit: int = 100, offset: int = 0):
|
| 42 |
with tracker._get_conn() as conn:
|
| 43 |
rows = conn.execute(
|
| 44 |
-
"SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?",
|
| 45 |
(limit, offset)
|
| 46 |
-
).fetchall()
|
| 47 |
keys = []
|
| 48 |
for row in rows:
|
| 49 |
month = tracker._get_month_key()
|
|
@@ -52,14 +53,18 @@ async def list_api_keys(limit: int = 100, offset: int = 0):
|
|
| 52 |
(row["key"], month)
|
| 53 |
).fetchone()
|
| 54 |
usage = usage_row["count"] if usage_row else 0
|
| 55 |
-
keys.append(
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
return {"keys": keys, "total": len(keys)}
|
| 64 |
|
| 65 |
|
|
@@ -69,28 +74,33 @@ async def update_key_tier(
|
|
| 69 |
req: UpdateTierRequest = Body(...),
|
| 70 |
):
|
| 71 |
if req.tier not in [t.value for t in Tier]:
|
| 72 |
-
raise HTTPException(
|
|
|
|
| 73 |
with tracker._get_conn() as conn:
|
| 74 |
-
row = conn.execute(
|
|
|
|
| 75 |
if not row:
|
| 76 |
raise HTTPException(status_code=404, detail="API key not found")
|
| 77 |
-
conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?",
|
|
|
|
| 78 |
conn.commit()
|
| 79 |
return {"message": f"Tier updated to {req.tier}"}
|
| 80 |
|
| 81 |
|
| 82 |
@router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
|
| 83 |
-
async def deactivate_api_key(
|
|
|
|
| 84 |
with tracker._get_conn() as conn:
|
| 85 |
-
row = conn.execute(
|
|
|
|
| 86 |
if not row:
|
| 87 |
raise HTTPException(status_code=404, detail="API key not found")
|
| 88 |
-
conn.execute(
|
|
|
|
| 89 |
conn.commit()
|
| 90 |
return {"message": "API key deactivated"}
|
| 91 |
|
| 92 |
|
| 93 |
-
@router.get("/audit/{api_key}", dependencies=[Depends(verify_admin)])
|
| 94 |
async def get_audit_logs(
|
| 95 |
api_key: str = Path(..., description="The API key to audit"),
|
| 96 |
start_date: Optional[str] = Query(None),
|
|
@@ -103,11 +113,12 @@ async def get_audit_logs(
|
|
| 103 |
return {"api_key": api_key, "logs": logs}
|
| 104 |
|
| 105 |
|
| 106 |
-
@router.get("/stats", dependencies=[Depends(verify_admin)])
|
| 107 |
async def get_global_stats():
|
| 108 |
with tracker._get_conn() as conn:
|
| 109 |
-
total_keys = conn.execute(
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
by_tier = conn.execute(
|
| 112 |
"SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
|
| 113 |
).fetchall()
|
|
|
|
| 4 |
"""
|
| 5 |
from fastapi import APIRouter, Depends, HTTPException, Query, Path, Body
|
| 6 |
from pydantic import BaseModel
|
| 7 |
+
from typing import Optional
|
| 8 |
from datetime import datetime
|
| 9 |
import uuid
|
|
|
|
| 10 |
from app.core.usage_tracker import tracker, Tier
|
| 11 |
|
| 12 |
router = APIRouter(prefix="/admin", tags=["admin"])
|
|
|
|
| 13 |
# Simple in‑memory admin key (replace with proper auth in production)
|
| 14 |
ADMIN_API_KEY = "admin_secret_change_me"
|
| 15 |
|
| 16 |
+
|
| 17 |
def verify_admin(admin_key: str = Query(..., alias="admin_key")):
|
| 18 |
if admin_key != ADMIN_API_KEY:
|
| 19 |
raise HTTPException(status_code=403, detail="Invalid admin key")
|
| 20 |
return True
|
| 21 |
|
| 22 |
+
|
| 23 |
class CreateKeyRequest(BaseModel):
|
| 24 |
tier: str
|
| 25 |
|
| 26 |
+
|
| 27 |
class UpdateTierRequest(BaseModel):
|
| 28 |
tier: str
|
| 29 |
|
|
|
|
| 31 |
@router.post("/keys", dependencies=[Depends(verify_admin)])
|
| 32 |
async def create_api_key(req: CreateKeyRequest):
|
| 33 |
if req.tier not in [t.value for t in Tier]:
|
| 34 |
+
raise HTTPException(
|
| 35 |
+
status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
|
| 36 |
new_key = f"sk_live_{uuid.uuid4().hex[:24]}"
|
| 37 |
tier_enum = Tier(req.tier)
|
| 38 |
tracker.get_or_create_api_key(new_key, tier_enum)
|
| 39 |
return {"api_key": new_key, "tier": req.tier}
|
| 40 |
|
| 41 |
|
|
|
|
| 42 |
async def list_api_keys(limit: int = 100, offset: int = 0):
|
| 43 |
with tracker._get_conn() as conn:
|
| 44 |
rows = conn.execute(
|
| 45 |
+
"SELECT key, tier, created_at, last_used_at, is_active FROM api_keys ORDER BY created_at DESC LIMIT ? OFFSET ?", # noqa: E501
|
| 46 |
(limit, offset)
|
| 47 |
+
).fetchall() # noqa: E501
|
| 48 |
keys = []
|
| 49 |
for row in rows:
|
| 50 |
month = tracker._get_month_key()
|
|
|
|
| 53 |
(row["key"], month)
|
| 54 |
).fetchone()
|
| 55 |
usage = usage_row["count"] if usage_row else 0
|
| 56 |
+
keys.append(
|
| 57 |
+
{
|
| 58 |
+
"key": row["key"],
|
| 59 |
+
"tier": row["tier"],
|
| 60 |
+
"created_at": datetime.fromtimestamp(
|
| 61 |
+
row["created_at"]).isoformat(),
|
| 62 |
+
"last_used_at": datetime.fromtimestamp(
|
| 63 |
+
row["last_used_at"]).isoformat() if row["last_used_at"] else None,
|
| 64 |
+
"is_active": bool(
|
| 65 |
+
row["is_active"]),
|
| 66 |
+
"current_month_usage": usage,
|
| 67 |
+
})
|
| 68 |
return {"keys": keys, "total": len(keys)}
|
| 69 |
|
| 70 |
|
|
|
|
| 74 |
req: UpdateTierRequest = Body(...),
|
| 75 |
):
|
| 76 |
if req.tier not in [t.value for t in Tier]:
|
| 77 |
+
raise HTTPException(
|
| 78 |
+
status_code=400, detail=f"Invalid tier. Must be one of {[t.value for t in Tier]}")
|
| 79 |
with tracker._get_conn() as conn:
|
| 80 |
+
row = conn.execute(
|
| 81 |
+
"SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
|
| 82 |
if not row:
|
| 83 |
raise HTTPException(status_code=404, detail="API key not found")
|
| 84 |
+
conn.execute("UPDATE api_keys SET tier = ? WHERE key = ?",
|
| 85 |
+
(req.tier, api_key))
|
| 86 |
conn.commit()
|
| 87 |
return {"message": f"Tier updated to {req.tier}"}
|
| 88 |
|
| 89 |
|
| 90 |
@router.delete("/keys/{api_key}", dependencies=[Depends(verify_admin)])
|
| 91 |
+
async def deactivate_api_key(
|
| 92 |
+
api_key: str = Path(..., description="The API key to deactivate")):
|
| 93 |
with tracker._get_conn() as conn:
|
| 94 |
+
row = conn.execute(
|
| 95 |
+
"SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
|
| 96 |
if not row:
|
| 97 |
raise HTTPException(status_code=404, detail="API key not found")
|
| 98 |
+
conn.execute(
|
| 99 |
+
"UPDATE api_keys SET is_active = 0 WHERE key = ?", (api_key,))
|
| 100 |
conn.commit()
|
| 101 |
return {"message": "API key deactivated"}
|
| 102 |
|
| 103 |
|
|
|
|
| 104 |
async def get_audit_logs(
|
| 105 |
api_key: str = Path(..., description="The API key to audit"),
|
| 106 |
start_date: Optional[str] = Query(None),
|
|
|
|
| 113 |
return {"api_key": api_key, "logs": logs}
|
| 114 |
|
| 115 |
|
|
|
|
| 116 |
async def get_global_stats():
|
| 117 |
with tracker._get_conn() as conn:
|
| 118 |
+
total_keys = conn.execute(
|
| 119 |
+
"SELECT COUNT(*) FROM api_keys WHERE is_active = 1").fetchone()[0]
|
| 120 |
+
total_requests = conn.execute(
|
| 121 |
+
"SELECT COUNT(*) FROM usage_log").fetchone()[0]
|
| 122 |
by_tier = conn.execute(
|
| 123 |
"SELECT tier, COUNT(*) as count FROM usage_log GROUP BY tier"
|
| 124 |
).fetchall()
|
app/api/routes_governance.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks
|
| 2 |
from fastapi.encoders import jsonable_encoder
|
| 3 |
from sqlalchemy.orm import Session
|
| 4 |
from app.models.infrastructure_intents import InfrastructureIntentRequest
|
|
@@ -8,26 +8,34 @@ from app.services.intent_store import save_evaluated_intent
|
|
| 8 |
from app.services.outcome_service import record_outcome
|
| 9 |
from app.api.deps import get_db
|
| 10 |
from pydantic import BaseModel
|
| 11 |
-
from typing import Optional
|
| 12 |
import uuid
|
| 13 |
import logging
|
| 14 |
import time
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
try:
|
| 18 |
-
from
|
|
|
|
| 19 |
except ImportError:
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
component: str
|
| 23 |
-
latency_p99: float
|
| 24 |
-
error_rate: float
|
| 25 |
-
service_mesh: str = "default"
|
| 26 |
-
cpu_util: Optional[float] = None
|
| 27 |
-
memory_util: Optional[float] = None
|
| 28 |
|
| 29 |
-
# =====
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
logger = logging.getLogger(__name__)
|
| 33 |
router = APIRouter()
|
|
@@ -50,13 +58,52 @@ async def evaluate_intent_endpoint(
|
|
| 50 |
intent_req: InfrastructureIntentRequest,
|
| 51 |
background_tasks: BackgroundTasks,
|
| 52 |
db: Session = Depends(get_db),
|
| 53 |
-
|
| 54 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
start_time = time.time()
|
| 56 |
-
api_key =
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
try:
|
| 62 |
oss_intent = to_oss_intent(intent_req)
|
|
@@ -68,6 +115,10 @@ async def evaluate_intent_endpoint(
|
|
| 68 |
policy_violations=intent_req.policy_violations
|
| 69 |
)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
deterministic_id = str(uuid.uuid4())
|
| 72 |
api_payload = jsonable_encoder(intent_req.model_dump())
|
| 73 |
oss_payload = jsonable_encoder(oss_intent.model_dump())
|
|
@@ -85,36 +136,39 @@ async def evaluate_intent_endpoint(
|
|
| 85 |
result["intent_id"] = deterministic_id
|
| 86 |
response_data = result
|
| 87 |
|
| 88 |
-
if
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
| 97 |
)
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
|
| 100 |
return response_data
|
| 101 |
|
| 102 |
except HTTPException:
|
|
|
|
|
|
|
|
|
|
| 103 |
raise
|
| 104 |
except Exception as e:
|
| 105 |
error_msg = str(e)
|
| 106 |
logger.exception("Error in evaluate_intent_endpoint")
|
| 107 |
-
if
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
timestamp=time.time(),
|
| 112 |
-
endpoint="/api/v1/intents/evaluate",
|
| 113 |
-
request_body=intent_req.model_dump(),
|
| 114 |
-
error=error_msg,
|
| 115 |
-
processing_ms=(time.time() - start_time) * 1000,
|
| 116 |
-
)
|
| 117 |
-
await tracker.increment_usage_async(record, background_tasks)
|
| 118 |
raise HTTPException(status_code=500, detail=error_msg)
|
| 119 |
|
| 120 |
|
|
@@ -122,9 +176,14 @@ async def evaluate_intent_endpoint(
|
|
| 122 |
async def record_outcome_endpoint(
|
| 123 |
request: Request,
|
| 124 |
outcome: OutcomeRequest,
|
| 125 |
-
db: Session = Depends(get_db)
|
|
|
|
| 126 |
):
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
try:
|
| 129 |
risk_engine = request.app.state.risk_engine
|
| 130 |
outcome_record = record_outcome(
|
|
@@ -133,8 +192,27 @@ async def record_outcome_endpoint(
|
|
| 133 |
success=outcome.success,
|
| 134 |
recorded_by=outcome.recorded_by,
|
| 135 |
notes=outcome.notes,
|
| 136 |
-
risk_engine=risk_engine
|
|
|
|
| 137 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
|
| 139 |
except Exception as e:
|
| 140 |
raise HTTPException(status_code=500, detail=str(e))
|
|
@@ -145,13 +223,51 @@ async def evaluate_healing_decision_endpoint(
|
|
| 145 |
request: Request,
|
| 146 |
decision_req: HealingDecisionRequest,
|
| 147 |
background_tasks: BackgroundTasks,
|
| 148 |
-
|
| 149 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
start_time = time.time()
|
| 151 |
-
api_key =
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
try:
|
| 157 |
policy_engine = request.app.state.policy_engine
|
|
@@ -168,34 +284,37 @@ async def evaluate_healing_decision_endpoint(
|
|
| 168 |
tokenizer=tokenizer,
|
| 169 |
)
|
| 170 |
|
| 171 |
-
if
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
endpoint="/api/v1/healing/evaluate",
|
| 177 |
-
request_body=decision_req.model_dump(),
|
| 178 |
-
response=response_data,
|
| 179 |
-
processing_ms=(time.time() - start_time) * 1000,
|
| 180 |
-
)
|
| 181 |
-
await tracker.increment_usage_async(record, background_tasks)
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
return response_data
|
| 184 |
|
| 185 |
except HTTPException:
|
|
|
|
|
|
|
|
|
|
| 186 |
raise
|
| 187 |
except Exception as e:
|
| 188 |
error_msg = str(e)
|
| 189 |
logger.exception("Error in evaluate_healing_decision_endpoint")
|
| 190 |
-
if
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
endpoint="/api/v1/healing/evaluate",
|
| 196 |
-
request_body=decision_req.model_dump(),
|
| 197 |
-
error=error_msg,
|
| 198 |
-
processing_ms=(time.time() - start_time) * 1000,
|
| 199 |
-
)
|
| 200 |
-
await tracker.increment_usage_async(record, background_tasks)
|
| 201 |
-
raise HTTPException(status_code=500, detail=error_msg)
|
|
|
|
| 1 |
+
from fastapi import APIRouter, Depends, HTTPException, Request, BackgroundTasks, Header
|
| 2 |
from fastapi.encoders import jsonable_encoder
|
| 3 |
from sqlalchemy.orm import Session
|
| 4 |
from app.models.infrastructure_intents import InfrastructureIntentRequest
|
|
|
|
| 8 |
from app.services.outcome_service import record_outcome
|
| 9 |
from app.api.deps import get_db
|
| 10 |
from pydantic import BaseModel
|
|
|
|
| 11 |
import uuid
|
| 12 |
import logging
|
| 13 |
import time
|
| 14 |
+
from typing import Optional
|
| 15 |
+
|
| 16 |
+
from agentic_reliability_framework.core.models.event import ReliabilityEvent
|
| 17 |
|
| 18 |
+
# ===== USAGE TRACKER IMPORTS =====
|
| 19 |
+
import app.core.usage_tracker
|
| 20 |
+
from app.core.usage_tracker import UsageRecord
|
| 21 |
+
|
| 22 |
+
# ===== PRICING CALCULATOR INTEGRATION =====
|
| 23 |
try:
|
| 24 |
+
from arf_pricing_calculator.storage.buffer import add_event
|
| 25 |
+
PRICING_AVAILABLE = True
|
| 26 |
except ImportError:
|
| 27 |
+
PRICING_AVAILABLE = False
|
| 28 |
+
add_event = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
# ===== OpenTelemetry (optional) =====
|
| 31 |
+
try:
|
| 32 |
+
from opentelemetry import trace
|
| 33 |
+
from opentelemetry.trace import Status, StatusCode
|
| 34 |
+
_tracer = trace.get_tracer(__name__)
|
| 35 |
+
OTEL_AVAILABLE = True
|
| 36 |
+
except ImportError:
|
| 37 |
+
OTEL_AVAILABLE = False
|
| 38 |
+
_tracer = None
|
| 39 |
|
| 40 |
logger = logging.getLogger(__name__)
|
| 41 |
router = APIRouter()
|
|
|
|
| 58 |
intent_req: InfrastructureIntentRequest,
|
| 59 |
background_tasks: BackgroundTasks,
|
| 60 |
db: Session = Depends(get_db),
|
| 61 |
+
idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"),
|
| 62 |
):
|
| 63 |
+
"""
|
| 64 |
+
Evaluate an infrastructure intent with idempotency and atomic quota consumption.
|
| 65 |
+
"""
|
| 66 |
+
# ── optional trace ──────────────────────────────────────
|
| 67 |
+
span = None
|
| 68 |
+
if OTEL_AVAILABLE and _tracer:
|
| 69 |
+
span = _tracer.start_span("governance.evaluate_intent")
|
| 70 |
+
span.set_attribute("intent_type", intent_req.intent_type)
|
| 71 |
+
span.set_attribute("environment", str(intent_req.environment))
|
| 72 |
+
|
| 73 |
start_time = time.time()
|
| 74 |
+
api_key = request.headers.get("Authorization", "").replace("Bearer ", "")
|
| 75 |
+
if not api_key:
|
| 76 |
+
api_key = request.query_params.get("api_key", "unknown")
|
| 77 |
+
|
| 78 |
+
current_tracker = app.core.usage_tracker.tracker
|
| 79 |
+
if current_tracker is None:
|
| 80 |
+
if span:
|
| 81 |
+
span.set_status(Status(StatusCode.ERROR, "tracker unavailable"))
|
| 82 |
+
span.end()
|
| 83 |
+
raise HTTPException(status_code=503,
|
| 84 |
+
detail="Usage tracking service unavailable")
|
| 85 |
+
|
| 86 |
+
record = UsageRecord(
|
| 87 |
+
api_key=api_key,
|
| 88 |
+
tier=None,
|
| 89 |
+
timestamp=start_time,
|
| 90 |
+
endpoint="/api/v1/intents/evaluate",
|
| 91 |
+
request_body=intent_req.model_dump(),
|
| 92 |
+
processing_ms=None,
|
| 93 |
+
)
|
| 94 |
+
success, existing_response = current_tracker.consume_quota_and_log(
|
| 95 |
+
record=record,
|
| 96 |
+
idempotency_key=idempotency_key
|
| 97 |
+
)
|
| 98 |
+
if not success:
|
| 99 |
+
if span:
|
| 100 |
+
span.set_attribute("idempotent_hit", True if existing_response else False)
|
| 101 |
+
span.end()
|
| 102 |
+
if existing_response:
|
| 103 |
+
return existing_response
|
| 104 |
+
else:
|
| 105 |
+
raise HTTPException(status_code=429,
|
| 106 |
+
detail="Monthly evaluation quota exceeded")
|
| 107 |
|
| 108 |
try:
|
| 109 |
oss_intent = to_oss_intent(intent_req)
|
|
|
|
| 115 |
policy_violations=intent_req.policy_violations
|
| 116 |
)
|
| 117 |
|
| 118 |
+
if span:
|
| 119 |
+
span.set_attribute("risk_score", result["risk_score"])
|
| 120 |
+
span.set_attribute("deterministic_id", str(uuid.uuid4())) # will be overwritten later, but fine for trace
|
| 121 |
+
|
| 122 |
deterministic_id = str(uuid.uuid4())
|
| 123 |
api_payload = jsonable_encoder(intent_req.model_dump())
|
| 124 |
oss_payload = jsonable_encoder(oss_intent.model_dump())
|
|
|
|
| 136 |
result["intent_id"] = deterministic_id
|
| 137 |
response_data = result
|
| 138 |
|
| 139 |
+
if current_tracker:
|
| 140 |
+
background_tasks.add_task(
|
| 141 |
+
current_tracker._insert_audit_log,
|
| 142 |
+
UsageRecord(
|
| 143 |
+
api_key=api_key,
|
| 144 |
+
tier=None,
|
| 145 |
+
timestamp=time.time(),
|
| 146 |
+
endpoint="/api/v1/intents/evaluate/response",
|
| 147 |
+
request_body=None,
|
| 148 |
+
response=response_data,
|
| 149 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 150 |
+
)
|
| 151 |
)
|
| 152 |
+
|
| 153 |
+
if span:
|
| 154 |
+
span.set_attribute("intent_id", deterministic_id)
|
| 155 |
+
span.set_status(Status(StatusCode.OK))
|
| 156 |
+
span.end()
|
| 157 |
|
| 158 |
return response_data
|
| 159 |
|
| 160 |
except HTTPException:
|
| 161 |
+
if span:
|
| 162 |
+
span.set_status(Status(StatusCode.ERROR, "HTTP exception"))
|
| 163 |
+
span.end()
|
| 164 |
raise
|
| 165 |
except Exception as e:
|
| 166 |
error_msg = str(e)
|
| 167 |
logger.exception("Error in evaluate_intent_endpoint")
|
| 168 |
+
if span:
|
| 169 |
+
span.set_status(Status(StatusCode.ERROR, error_msg))
|
| 170 |
+
span.record_exception(e)
|
| 171 |
+
span.end()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
raise HTTPException(status_code=500, detail=error_msg)
|
| 173 |
|
| 174 |
|
|
|
|
| 176 |
async def record_outcome_endpoint(
|
| 177 |
request: Request,
|
| 178 |
outcome: OutcomeRequest,
|
| 179 |
+
db: Session = Depends(get_db),
|
| 180 |
+
idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"),
|
| 181 |
):
|
| 182 |
+
"""
|
| 183 |
+
Record an outcome for a previously evaluated intent.
|
| 184 |
+
Idempotent based on deterministic_id and success value (handled in service).
|
| 185 |
+
Also updates the pricing calculator's calibration buffer if available.
|
| 186 |
+
"""
|
| 187 |
try:
|
| 188 |
risk_engine = request.app.state.risk_engine
|
| 189 |
outcome_record = record_outcome(
|
|
|
|
| 192 |
success=outcome.success,
|
| 193 |
recorded_by=outcome.recorded_by,
|
| 194 |
notes=outcome.notes,
|
| 195 |
+
risk_engine=risk_engine,
|
| 196 |
+
idempotency_key=idempotency_key,
|
| 197 |
)
|
| 198 |
+
|
| 199 |
+
if PRICING_AVAILABLE and add_event is not None:
|
| 200 |
+
try:
|
| 201 |
+
event = {
|
| 202 |
+
"run_id": outcome.deterministic_id,
|
| 203 |
+
"outcome": "success" if outcome.success else "failure",
|
| 204 |
+
"recorded_at": time.time(),
|
| 205 |
+
"source": "arf_api_outcome"
|
| 206 |
+
}
|
| 207 |
+
add_event(event)
|
| 208 |
+
logger.info(
|
| 209 |
+
f"Added outcome to pricing buffer for intent {
|
| 210 |
+
outcome.deterministic_id}")
|
| 211 |
+
except Exception as e:
|
| 212 |
+
logger.warning(
|
| 213 |
+
f"Failed to update pricing buffer for intent {
|
| 214 |
+
outcome.deterministic_id}: {e}")
|
| 215 |
+
|
| 216 |
return {"message": "Outcome recorded", "outcome_id": outcome_record.id}
|
| 217 |
except Exception as e:
|
| 218 |
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
| 223 |
request: Request,
|
| 224 |
decision_req: HealingDecisionRequest,
|
| 225 |
background_tasks: BackgroundTasks,
|
| 226 |
+
idempotency_key: Optional[str] = Header(None, alias="Idempotency-Key"),
|
| 227 |
):
|
| 228 |
+
"""
|
| 229 |
+
Evaluate a healing decision with idempotency and atomic quota consumption.
|
| 230 |
+
"""
|
| 231 |
+
# ── optional trace ──────────────────────────────────────
|
| 232 |
+
span = None
|
| 233 |
+
if OTEL_AVAILABLE and _tracer:
|
| 234 |
+
span = _tracer.start_span("governance.evaluate_healing")
|
| 235 |
+
span.set_attribute("component", decision_req.event.component)
|
| 236 |
+
|
| 237 |
start_time = time.time()
|
| 238 |
+
api_key = request.headers.get("Authorization", "").replace("Bearer ", "")
|
| 239 |
+
if not api_key:
|
| 240 |
+
api_key = request.query_params.get("api_key", "unknown")
|
| 241 |
+
|
| 242 |
+
current_tracker = app.core.usage_tracker.tracker
|
| 243 |
+
if current_tracker is None:
|
| 244 |
+
if span:
|
| 245 |
+
span.set_status(Status(StatusCode.ERROR, "tracker unavailable"))
|
| 246 |
+
span.end()
|
| 247 |
+
raise HTTPException(status_code=503,
|
| 248 |
+
detail="Usage tracking service unavailable")
|
| 249 |
+
|
| 250 |
+
record = UsageRecord(
|
| 251 |
+
api_key=api_key,
|
| 252 |
+
tier=None,
|
| 253 |
+
timestamp=start_time,
|
| 254 |
+
endpoint="/api/v1/healing/evaluate",
|
| 255 |
+
request_body=decision_req.model_dump(),
|
| 256 |
+
processing_ms=None,
|
| 257 |
+
)
|
| 258 |
+
success, existing_response = current_tracker.consume_quota_and_log(
|
| 259 |
+
record=record,
|
| 260 |
+
idempotency_key=idempotency_key
|
| 261 |
+
)
|
| 262 |
+
if not success:
|
| 263 |
+
if span:
|
| 264 |
+
span.set_attribute("idempotent_hit", True if existing_response else False)
|
| 265 |
+
span.end()
|
| 266 |
+
if existing_response:
|
| 267 |
+
return existing_response
|
| 268 |
+
else:
|
| 269 |
+
raise HTTPException(status_code=429,
|
| 270 |
+
detail="Monthly evaluation quota exceeded")
|
| 271 |
|
| 272 |
try:
|
| 273 |
policy_engine = request.app.state.policy_engine
|
|
|
|
| 284 |
tokenizer=tokenizer,
|
| 285 |
)
|
| 286 |
|
| 287 |
+
if span:
|
| 288 |
+
span.set_attribute("risk_score", response_data.get("risk_score", 0.0))
|
| 289 |
+
span.set_attribute("selected_action", response_data.get("selected_action", "unknown"))
|
| 290 |
+
span.set_status(Status(StatusCode.OK))
|
| 291 |
+
span.end()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
+
if current_tracker:
|
| 294 |
+
background_tasks.add_task(
|
| 295 |
+
current_tracker._insert_audit_log,
|
| 296 |
+
UsageRecord(
|
| 297 |
+
api_key=api_key,
|
| 298 |
+
tier=None,
|
| 299 |
+
timestamp=time.time(),
|
| 300 |
+
endpoint="/api/v1/healing/evaluate/response",
|
| 301 |
+
request_body=None,
|
| 302 |
+
response=response_data,
|
| 303 |
+
processing_ms=(time.time() - start_time) * 1000,
|
| 304 |
+
)
|
| 305 |
+
)
|
| 306 |
return response_data
|
| 307 |
|
| 308 |
except HTTPException:
|
| 309 |
+
if span:
|
| 310 |
+
span.set_status(Status(StatusCode.ERROR, "HTTP exception"))
|
| 311 |
+
span.end()
|
| 312 |
raise
|
| 313 |
except Exception as e:
|
| 314 |
error_msg = str(e)
|
| 315 |
logger.exception("Error in evaluate_healing_decision_endpoint")
|
| 316 |
+
if span:
|
| 317 |
+
span.set_status(Status(StatusCode.ERROR, error_msg))
|
| 318 |
+
span.record_exception(e)
|
| 319 |
+
span.end()
|
| 320 |
+
raise HTTPException(status_code=500, detail=error_msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/api/routes_incidents.py
CHANGED
|
@@ -1,86 +1,211 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
from app.core.usage_tracker import enforce_quota, UsageRecord, tracker
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
NO_ACTION = "no_action"
|
| 15 |
-
RESTART_CONTAINER = "restart_container"
|
| 16 |
-
SCALE_OUT = "scale_out"
|
| 17 |
-
ROLLBACK = "rollback"
|
| 18 |
-
CIRCUIT_BREAKER = "circuit_breaker"
|
| 19 |
-
TRAFFIC_SHIFT = "traffic_shift"
|
| 20 |
-
ALERT_TEAM = "alert_team"
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
latency_p99: float
|
| 26 |
-
error_rate: float
|
| 27 |
-
service_mesh: str = "default"
|
| 28 |
-
cpu_util: Optional[float] = None
|
| 29 |
-
memory_util: Optional[float] = None
|
| 30 |
|
|
|
|
| 31 |
|
| 32 |
router = APIRouter()
|
| 33 |
-
incident_history = []
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
@router.post("/report_incident")
|
| 37 |
-
async def report_incident(event: ReliabilityEvent):
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
return {"status": "recorded"}
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
@router.post("/v1/incidents/evaluate")
|
| 43 |
async def evaluate_incident(
|
| 44 |
request: Request,
|
| 45 |
event: ReliabilityEvent,
|
| 46 |
background_tasks: BackgroundTasks,
|
| 47 |
-
quota: dict = Depends(enforce_quota)
|
| 48 |
-
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
start_time = time.time()
|
| 50 |
-
api_key = quota["api_key"]
|
| 51 |
tier = quota["tier"]
|
| 52 |
-
response_data = None
|
| 53 |
-
error_msg = None
|
| 54 |
|
| 55 |
try:
|
| 56 |
-
#
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
current_state = {
|
| 65 |
"latency": event.latency_p99,
|
| 66 |
"error_rate": event.error_rate,
|
| 67 |
-
"last_action": {"action_type": "no_action"}
|
| 68 |
}
|
| 69 |
proposed_action = {"action_type": optimal_action.value, "params": {}}
|
| 70 |
-
|
| 71 |
-
|
|
|
|
| 72 |
|
|
|
|
|
|
|
|
|
|
| 73 |
healing_intent = {
|
| 74 |
"action": optimal_action.value,
|
| 75 |
"component": event.component,
|
| 76 |
-
"parameters":
|
| 77 |
-
"justification":
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
| 81 |
}
|
| 82 |
|
| 83 |
response_data = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
"healing_intent": healing_intent,
|
| 85 |
"causal_explanation": {
|
| 86 |
"factual_outcome": causal_exp.factual_outcome,
|
|
@@ -88,42 +213,49 @@ async def evaluate_incident(
|
|
| 88 |
"effect": causal_exp.effect,
|
| 89 |
"explanation_text": causal_exp.explanation_text,
|
| 90 |
"is_model_based": causal_exp.is_model_based,
|
| 91 |
-
"warnings": causal_exp.warnings
|
| 92 |
},
|
| 93 |
"utility_decision": {
|
| 94 |
"best_action": optimal_action.value,
|
| 95 |
"expected_utility": 0.5,
|
| 96 |
-
"explanation":
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
}
|
| 99 |
|
|
|
|
| 100 |
# Asynchronous usage logging
|
|
|
|
| 101 |
if tracker:
|
| 102 |
record = UsageRecord(
|
| 103 |
api_key=api_key,
|
| 104 |
tier=tier,
|
| 105 |
timestamp=time.time(),
|
| 106 |
endpoint="/v1/incidents/evaluate",
|
| 107 |
-
request_body=event.
|
| 108 |
response=response_data,
|
| 109 |
processing_ms=(time.time() - start_time) * 1000,
|
| 110 |
)
|
| 111 |
await tracker.increment_usage_async(record, background_tasks)
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
return response_data
|
| 114 |
|
| 115 |
except HTTPException:
|
| 116 |
raise
|
| 117 |
-
except Exception as
|
| 118 |
-
error_msg = str(
|
| 119 |
-
# Log failure in background
|
| 120 |
if tracker:
|
| 121 |
record = UsageRecord(
|
| 122 |
api_key=api_key,
|
| 123 |
tier=tier,
|
| 124 |
timestamp=time.time(),
|
| 125 |
endpoint="/v1/incidents/evaluate",
|
| 126 |
-
request_body=event.
|
| 127 |
error=error_msg,
|
| 128 |
processing_ms=(time.time() - start_time) * 1000,
|
| 129 |
)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Incident evaluation endpoints — backward‑compatible Bayesian reroute.
|
| 3 |
+
|
| 4 |
+
This module provides two incident‑related routes:
|
| 5 |
+
|
| 6 |
+
* ``POST /api/v1/report_incident``
|
| 7 |
+
Stores a ``ReliabilityEvent`` in an in‑memory history for auditing
|
| 8 |
+
and debugging.
|
| 9 |
+
* ``POST /api/v1/v1/incidents/evaluate`` **(deprecated)**
|
| 10 |
+
Former heuristic endpoint now **rerouted to the full Bayesian risk
|
| 11 |
+
engine**. All callers should migrate to
|
| 12 |
+
``POST /api/v1/intents/evaluate``, which returns richer metadata
|
| 13 |
+
including CUDL uncertainty decomposition and decision traces.
|
| 14 |
+
|
| 15 |
+
The local model duplicates (``ReliabilityEvent``, ``HealingAction``)
|
| 16 |
+
have been removed; all types are imported from the canonical ARF core
|
| 17 |
+
framework (``agentic_reliability_framework.core.models.event``).
|
| 18 |
+
"""
|
| 19 |
|
| 20 |
+
from __future__ import annotations
|
|
|
|
| 21 |
|
| 22 |
+
import logging
|
| 23 |
+
import time
|
| 24 |
+
from typing import Optional
|
| 25 |
|
| 26 |
+
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Request
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
+
from agentic_reliability_framework.core.models.event import (
|
| 29 |
+
HealingAction,
|
| 30 |
+
ReliabilityEvent,
|
| 31 |
+
)
|
| 32 |
|
| 33 |
+
from app.causal_explainer import CausalExplainer
|
| 34 |
+
from app.core.usage_tracker import UsageRecord, enforce_quota, tracker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
logger = logging.getLogger(__name__)
|
| 37 |
|
| 38 |
router = APIRouter()
|
|
|
|
| 39 |
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# In‑memory incident store (for auditing / debugging only)
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
incident_history: list[dict] = []
|
| 44 |
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# POST /api/v1/report_incident
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
@router.post("/report_incident")
|
| 50 |
+
async def report_incident(event: ReliabilityEvent) -> dict[str, str]:
|
| 51 |
+
"""
|
| 52 |
+
Record a ``ReliabilityEvent`` in the in‑memory incident history.
|
| 53 |
+
|
| 54 |
+
This endpoint is used by internal monitoring tools to feed incident
|
| 55 |
+
data into the causal explainer and downstream analysis. The event
|
| 56 |
+
is stored as a JSON‑safe dictionary and is **not** persisted across
|
| 57 |
+
API restarts.
|
| 58 |
+
|
| 59 |
+
Parameters
|
| 60 |
+
----------
|
| 61 |
+
event : ReliabilityEvent
|
| 62 |
+
The reliability event to record. Must include at minimum
|
| 63 |
+
``component``, ``latency_p99``, ``error_rate``, and
|
| 64 |
+
``service_mesh``.
|
| 65 |
+
|
| 66 |
+
Returns
|
| 67 |
+
-------
|
| 68 |
+
dict
|
| 69 |
+
A simple acknowledgement ``{"status": "recorded"}``.
|
| 70 |
+
"""
|
| 71 |
+
incident_history.append(event.model_dump(mode="json"))
|
| 72 |
return {"status": "recorded"}
|
| 73 |
|
| 74 |
|
| 75 |
+
# ---------------------------------------------------------------------------
|
| 76 |
+
# POST /api/v1/v1/incidents/evaluate (deprecated)
|
| 77 |
+
# ---------------------------------------------------------------------------
|
| 78 |
@router.post("/v1/incidents/evaluate")
|
| 79 |
async def evaluate_incident(
|
| 80 |
request: Request,
|
| 81 |
event: ReliabilityEvent,
|
| 82 |
background_tasks: BackgroundTasks,
|
| 83 |
+
quota: dict = Depends(enforce_quota),
|
| 84 |
+
) -> dict:
|
| 85 |
+
"""
|
| 86 |
+
Evaluate an incident using the **Bayesian risk engine**.
|
| 87 |
+
|
| 88 |
+
.. deprecated:: 0.6.0
|
| 89 |
+
Use ``POST /api/v1/intents/evaluate`` instead. This endpoint
|
| 90 |
+
will be removed in a future release. Responses include a
|
| 91 |
+
``deprecation_notice`` field to assist migration.
|
| 92 |
+
|
| 93 |
+
The following steps are performed:
|
| 94 |
+
|
| 95 |
+
1. Convert the ``ReliabilityEvent`` into a minimal
|
| 96 |
+
``DeployConfigurationIntent`` via ``intent_adapter``.
|
| 97 |
+
2. Call ``risk_service.evaluate_intent()`` to obtain a Bayesian
|
| 98 |
+
risk score.
|
| 99 |
+
3. Generate a heuristic healing action based on the risk score.
|
| 100 |
+
4. Run the causal explainer for counter‑factual text.
|
| 101 |
+
5. Build a backward‑compatible response envelope.
|
| 102 |
+
|
| 103 |
+
Parameters
|
| 104 |
+
----------
|
| 105 |
+
request : Request
|
| 106 |
+
The Starlette request object (used for internal state access).
|
| 107 |
+
event : ReliabilityEvent
|
| 108 |
+
The incident event containing component name, latency, error
|
| 109 |
+
rate, etc.
|
| 110 |
+
background_tasks : BackgroundTasks
|
| 111 |
+
FastAPI background‑task runner for asynchronous logging.
|
| 112 |
+
quota : dict
|
| 113 |
+
Injected by ``enforce_quota``; contains ``api_key``, ``tier``,
|
| 114 |
+
and ``remaining``.
|
| 115 |
+
|
| 116 |
+
Returns
|
| 117 |
+
-------
|
| 118 |
+
dict
|
| 119 |
+
A dictionary with keys:
|
| 120 |
+
|
| 121 |
+
* ``deprecation_notice`` (str) — migration guidance.
|
| 122 |
+
* ``healing_intent`` (dict) — action, component, risk score,
|
| 123 |
+
justification, confidence, and advisory status.
|
| 124 |
+
* ``causal_explanation`` (dict) — factual/counter‑factual
|
| 125 |
+
outcomes and explanation text.
|
| 126 |
+
* ``utility_decision`` (dict) — selected action and expected
|
| 127 |
+
utility.
|
| 128 |
+
"""
|
| 129 |
start_time = time.time()
|
| 130 |
+
api_key: str = quota["api_key"]
|
| 131 |
tier = quota["tier"]
|
| 132 |
+
response_data: Optional[dict] = None
|
| 133 |
+
error_msg: Optional[str] = None
|
| 134 |
|
| 135 |
try:
|
| 136 |
+
# ------------------------------------------------------------------
|
| 137 |
+
# Step 1 – Convert the event into an infrastructure intent
|
| 138 |
+
# ------------------------------------------------------------------
|
| 139 |
+
from app.services.intent_adapter import to_oss_intent
|
| 140 |
+
from app.services.risk_service import evaluate_intent
|
| 141 |
+
|
| 142 |
+
raw_intent = {
|
| 143 |
+
"intent_type": "deploy_config",
|
| 144 |
+
"environment": "prod",
|
| 145 |
+
"service_name": event.component,
|
| 146 |
+
"requester": "auto",
|
| 147 |
+
"change_scope": "global",
|
| 148 |
+
"deployment_target": "prod",
|
| 149 |
+
"configuration": {},
|
| 150 |
+
"provenance": {"source": "incident_evaluate"},
|
| 151 |
+
}
|
| 152 |
+
oss_intent = to_oss_intent(raw_intent)
|
| 153 |
|
| 154 |
+
# ------------------------------------------------------------------
|
| 155 |
+
# Step 2 – Bayesian risk evaluation
|
| 156 |
+
# ------------------------------------------------------------------
|
| 157 |
+
risk_engine = request.app.state.risk_engine
|
| 158 |
+
result = evaluate_intent(
|
| 159 |
+
engine=risk_engine,
|
| 160 |
+
intent=oss_intent,
|
| 161 |
+
cost_estimate=None,
|
| 162 |
+
policy_violations=[],
|
| 163 |
+
)
|
| 164 |
|
| 165 |
+
# ------------------------------------------------------------------
|
| 166 |
+
# Step 3 – Heuristic action selection based on risk threshold
|
| 167 |
+
# ------------------------------------------------------------------
|
| 168 |
+
optimal_action = (
|
| 169 |
+
HealingAction.RESTART_CONTAINER
|
| 170 |
+
if result["risk_score"] > 0.5
|
| 171 |
+
else HealingAction.NO_ACTION
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
# ------------------------------------------------------------------
|
| 175 |
+
# Step 4 – Causal explainer
|
| 176 |
+
# ------------------------------------------------------------------
|
| 177 |
+
causal_explainer = CausalExplainer()
|
| 178 |
current_state = {
|
| 179 |
"latency": event.latency_p99,
|
| 180 |
"error_rate": event.error_rate,
|
| 181 |
+
"last_action": {"action_type": "no_action"},
|
| 182 |
}
|
| 183 |
proposed_action = {"action_type": optimal_action.value, "params": {}}
|
| 184 |
+
causal_exp = causal_explainer.explain_healing_intent(
|
| 185 |
+
proposed_action, current_state, "latency"
|
| 186 |
+
)
|
| 187 |
|
| 188 |
+
# ------------------------------------------------------------------
|
| 189 |
+
# Step 5 – Build response envelope
|
| 190 |
+
# ------------------------------------------------------------------
|
| 191 |
healing_intent = {
|
| 192 |
"action": optimal_action.value,
|
| 193 |
"component": event.component,
|
| 194 |
+
"parameters": {},
|
| 195 |
+
"justification": (
|
| 196 |
+
f"Bayesian risk score: {result['risk_score']:.3f}. "
|
| 197 |
+
f"Causal: {causal_exp.explanation_text}"
|
| 198 |
+
),
|
| 199 |
+
"confidence": 1.0 - result.get("uncertainty", 0.0),
|
| 200 |
+
"risk_score": result["risk_score"],
|
| 201 |
+
"status": "oss_advisory_only",
|
| 202 |
}
|
| 203 |
|
| 204 |
response_data = {
|
| 205 |
+
"deprecation_notice": (
|
| 206 |
+
"This endpoint is deprecated. Use POST /api/v1/intents/evaluate "
|
| 207 |
+
"for the full Bayesian evaluation with CUDL decomposition."
|
| 208 |
+
),
|
| 209 |
"healing_intent": healing_intent,
|
| 210 |
"causal_explanation": {
|
| 211 |
"factual_outcome": causal_exp.factual_outcome,
|
|
|
|
| 213 |
"effect": causal_exp.effect,
|
| 214 |
"explanation_text": causal_exp.explanation_text,
|
| 215 |
"is_model_based": causal_exp.is_model_based,
|
| 216 |
+
"warnings": causal_exp.warnings,
|
| 217 |
},
|
| 218 |
"utility_decision": {
|
| 219 |
"best_action": optimal_action.value,
|
| 220 |
"expected_utility": 0.5,
|
| 221 |
+
"explanation": (
|
| 222 |
+
"Decision based on Bayesian risk threshold > 0.5"
|
| 223 |
+
),
|
| 224 |
+
},
|
| 225 |
}
|
| 226 |
|
| 227 |
+
# ------------------------------------------------------------------
|
| 228 |
# Asynchronous usage logging
|
| 229 |
+
# ------------------------------------------------------------------
|
| 230 |
if tracker:
|
| 231 |
record = UsageRecord(
|
| 232 |
api_key=api_key,
|
| 233 |
tier=tier,
|
| 234 |
timestamp=time.time(),
|
| 235 |
endpoint="/v1/incidents/evaluate",
|
| 236 |
+
request_body=event.model_dump(mode="json"),
|
| 237 |
response=response_data,
|
| 238 |
processing_ms=(time.time() - start_time) * 1000,
|
| 239 |
)
|
| 240 |
await tracker.increment_usage_async(record, background_tasks)
|
| 241 |
|
| 242 |
+
logger.warning(
|
| 243 |
+
"Deprecated endpoint /v1/incidents/evaluate called by key %s",
|
| 244 |
+
api_key[:8],
|
| 245 |
+
)
|
| 246 |
return response_data
|
| 247 |
|
| 248 |
except HTTPException:
|
| 249 |
raise
|
| 250 |
+
except Exception as exc:
|
| 251 |
+
error_msg = str(exc)
|
|
|
|
| 252 |
if tracker:
|
| 253 |
record = UsageRecord(
|
| 254 |
api_key=api_key,
|
| 255 |
tier=tier,
|
| 256 |
timestamp=time.time(),
|
| 257 |
endpoint="/v1/incidents/evaluate",
|
| 258 |
+
request_body=event.model_dump(mode="json"),
|
| 259 |
error=error_msg,
|
| 260 |
processing_ms=(time.time() - start_time) * 1000,
|
| 261 |
)
|
app/api/routes_memory.py
CHANGED
|
@@ -11,7 +11,11 @@ async def memory_stats(request: Request):
|
|
| 11 |
risk_engine = request.app.state.risk_engine
|
| 12 |
|
| 13 |
# Check if memory exists and has the required method
|
| 14 |
-
if hasattr(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
stats = risk_engine.memory.get_graph_stats()
|
| 16 |
return stats
|
| 17 |
else:
|
|
|
|
| 11 |
risk_engine = request.app.state.risk_engine
|
| 12 |
|
| 13 |
# Check if memory exists and has the required method
|
| 14 |
+
if hasattr(
|
| 15 |
+
risk_engine,
|
| 16 |
+
'memory') and hasattr(
|
| 17 |
+
risk_engine.memory,
|
| 18 |
+
'get_graph_stats'):
|
| 19 |
stats = risk_engine.memory.get_graph_stats()
|
| 20 |
return stats
|
| 21 |
else:
|
app/api/routes_payments.py
CHANGED
|
@@ -4,11 +4,9 @@ Payment endpoints – Stripe Checkout integration.
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
import stripe
|
| 7 |
-
from fastapi import APIRouter, HTTPException
|
| 8 |
from pydantic import BaseModel
|
| 9 |
-
from typing import Optional
|
| 10 |
|
| 11 |
-
from app.core.config import settings
|
| 12 |
from app.core.usage_tracker import tracker, Tier
|
| 13 |
|
| 14 |
router = APIRouter(prefix="/payments", tags=["payments"])
|
|
@@ -17,8 +15,10 @@ router = APIRouter(prefix="/payments", tags=["payments"])
|
|
| 17 |
stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
|
| 18 |
STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
|
| 19 |
|
|
|
|
| 20 |
class CheckoutRequest(BaseModel):
|
| 21 |
api_key: str
|
|
|
|
| 22 |
success_url: str
|
| 23 |
cancel_url: str
|
| 24 |
|
|
@@ -32,14 +32,16 @@ async def create_checkout_session(req: CheckoutRequest):
|
|
| 32 |
# Verify the API key exists and is free tier
|
| 33 |
tier = tracker.get_tier(req.api_key) if tracker else None
|
| 34 |
if tier != Tier.FREE:
|
| 35 |
-
raise HTTPException(status_code=400,
|
|
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
checkout_session = stripe.checkout.Session.create(
|
| 39 |
payment_method_types=["card"],
|
| 40 |
line_items=[
|
| 41 |
{
|
| 42 |
-
|
|
|
|
| 43 |
"quantity": 1,
|
| 44 |
}
|
| 45 |
],
|
|
|
|
| 4 |
|
| 5 |
import os
|
| 6 |
import stripe
|
| 7 |
+
from fastapi import APIRouter, HTTPException
|
| 8 |
from pydantic import BaseModel
|
|
|
|
| 9 |
|
|
|
|
| 10 |
from app.core.usage_tracker import tracker, Tier
|
| 11 |
|
| 12 |
router = APIRouter(prefix="/payments", tags=["payments"])
|
|
|
|
| 15 |
stripe.api_key = os.getenv("STRIPE_SECRET_KEY")
|
| 16 |
STRIPE_WEBHOOK_SECRET = os.getenv("STRIPE_WEBHOOK_SECRET")
|
| 17 |
|
| 18 |
+
|
| 19 |
class CheckoutRequest(BaseModel):
|
| 20 |
api_key: str
|
| 21 |
+
|
| 22 |
success_url: str
|
| 23 |
cancel_url: str
|
| 24 |
|
|
|
|
| 32 |
# Verify the API key exists and is free tier
|
| 33 |
tier = tracker.get_tier(req.api_key) if tracker else None
|
| 34 |
if tier != Tier.FREE:
|
| 35 |
+
raise HTTPException(status_code=400,
|
| 36 |
+
detail="Only free tier keys can be upgraded")
|
| 37 |
|
| 38 |
try:
|
| 39 |
checkout_session = stripe.checkout.Session.create(
|
| 40 |
payment_method_types=["card"],
|
| 41 |
line_items=[
|
| 42 |
{
|
| 43 |
+
# e.g., "price_123"
|
| 44 |
+
"price": os.getenv("STRIPE_PRO_PRICE_ID"),
|
| 45 |
"quantity": 1,
|
| 46 |
}
|
| 47 |
],
|
app/api/routes_pricing.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pricing endpoints – integrates the ARF Bayesian pricing calculator.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from fastapi import APIRouter, HTTPException, Depends
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
import logging
|
| 8 |
+
|
| 9 |
+
from arf_pricing_calculator.core.pricing_engine import PricingEngine
|
| 10 |
+
from arf_pricing_calculator.ingestion.questionnaire_parser import parse_input_dict
|
| 11 |
+
from arf_pricing_calculator.types import PricingOutput
|
| 12 |
+
from app.core.usage_tracker import enforce_quota
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
router = APIRouter()
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class PricingEstimateRequest(BaseModel):
|
| 19 |
+
"""Request body for single pricing estimate."""
|
| 20 |
+
input: dict
|
| 21 |
+
customer_id: str = "default"
|
| 22 |
+
force: bool = False
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class PricingRunRequest(BaseModel):
|
| 26 |
+
"""Request body for multi‑run pricing with learning."""
|
| 27 |
+
input: dict
|
| 28 |
+
customer_id: str = "default"
|
| 29 |
+
runs: int = 1
|
| 30 |
+
cooldown_hours: int = 24
|
| 31 |
+
force: bool = False
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
@router.post("/pricing/estimate", response_model=PricingOutput)
|
| 35 |
+
async def estimate_pricing(
|
| 36 |
+
req: PricingEstimateRequest,
|
| 37 |
+
quota: dict = Depends(enforce_quota), # optional: enforce usage tracking
|
| 38 |
+
):
|
| 39 |
+
"""
|
| 40 |
+
Single pricing estimate – no learning, no buffer update.
|
| 41 |
+
"""
|
| 42 |
+
try:
|
| 43 |
+
# Convert the input dict to a PricingInput object
|
| 44 |
+
pricing_input = parse_input_dict(req.input)
|
| 45 |
+
# Create engine without buffer (no learning)
|
| 46 |
+
engine = PricingEngine(calibration_buffer=[])
|
| 47 |
+
output = engine.estimate(pricing_input)
|
| 48 |
+
return output
|
| 49 |
+
except Exception as e:
|
| 50 |
+
logger.exception("Pricing estimate failed")
|
| 51 |
+
raise HTTPException(status_code=400, detail=str(e))
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@router.post("/pricing/run", response_model=list[PricingOutput])
|
| 55 |
+
async def run_pricing(
|
| 56 |
+
req: PricingRunRequest,
|
| 57 |
+
quota: dict = Depends(enforce_quota),
|
| 58 |
+
):
|
| 59 |
+
"""
|
| 60 |
+
Multi‑run pricing with cooldown and buffer persistence.
|
| 61 |
+
Each run’s simulated outcome is added to the buffer, so subsequent runs
|
| 62 |
+
see an updated posterior.
|
| 63 |
+
"""
|
| 64 |
+
# We need to reuse the same buffer across runs; we'll load it per request.
|
| 65 |
+
# For simplicity, we'll load from the default location.
|
| 66 |
+
from arf_pricing_calculator.storage.buffer import load_buffer, add_event
|
| 67 |
+
from arf_pricing_calculator.orchestration.cooldown import enforce_cooldown, is_cooldown_active
|
| 68 |
+
|
| 69 |
+
outputs = []
|
| 70 |
+
buffer = load_buffer() # loads from calibration_buffer.json
|
| 71 |
+
|
| 72 |
+
for i in range(req.runs):
|
| 73 |
+
if not req.force and is_cooldown_active(
|
| 74 |
+
req.customer_id, req.cooldown_hours):
|
| 75 |
+
raise HTTPException(status_code=429,
|
| 76 |
+
detail=f"Cooldown active after {i} runs")
|
| 77 |
+
|
| 78 |
+
pricing_input = parse_input_dict(req.input)
|
| 79 |
+
engine = PricingEngine(calibration_buffer=buffer)
|
| 80 |
+
out = engine.estimate(pricing_input)
|
| 81 |
+
|
| 82 |
+
# Simulate an outcome (in real use, this would come from the actual
|
| 83 |
+
# deal)
|
| 84 |
+
import random
|
| 85 |
+
outcome = "success" if random.random() > out.risk_score else "failure" # nosec B311
|
| 86 |
+
|
| 87 |
+
event = {
|
| 88 |
+
"run_id": out.run_history_id,
|
| 89 |
+
"customer_id": req.customer_id,
|
| 90 |
+
"outcome": outcome,
|
| 91 |
+
"price": out.recommended_price,
|
| 92 |
+
"value": out.expected_value,
|
| 93 |
+
"risk_score": out.risk_score,
|
| 94 |
+
"run_number": i + 1,
|
| 95 |
+
}
|
| 96 |
+
add_event(event)
|
| 97 |
+
buffer = load_buffer() # reload after update
|
| 98 |
+
|
| 99 |
+
outputs.append(out)
|
| 100 |
+
|
| 101 |
+
if i < req.runs - 1:
|
| 102 |
+
enforce_cooldown(req.customer_id, req.cooldown_hours)
|
| 103 |
+
|
| 104 |
+
return outputs
|
app/api/routes_risk.py
CHANGED
|
@@ -9,32 +9,29 @@ router = APIRouter()
|
|
| 9 |
async def get_risk():
|
| 10 |
try:
|
| 11 |
risk = get_system_risk()
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
elif risk < 0.8:
|
| 17 |
-
status = "high"
|
| 18 |
-
else:
|
| 19 |
-
status = "critical"
|
| 20 |
-
return RiskResponse(system_risk=risk, status=status)
|
| 21 |
except Exception as e:
|
| 22 |
raise HTTPException(status_code=500, detail=str(e))
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
@router.get("/history")
|
| 26 |
async def get_risk_history():
|
| 27 |
-
"""
|
| 28 |
-
Return dummy historical risk data for the last 24 hours.
|
| 29 |
-
Replace with real database query later.
|
| 30 |
-
"""
|
| 31 |
import random
|
| 32 |
import datetime
|
| 33 |
now = datetime.datetime.now()
|
| 34 |
-
data = [
|
| 35 |
-
|
| 36 |
-
data.append({
|
| 37 |
-
"time": (now - datetime.timedelta(hours=i)).isoformat(),
|
| 38 |
-
"risk": round(random.uniform(0.2, 0.8), 2)
|
| 39 |
-
})
|
| 40 |
return data
|
|
|
|
| 9 |
async def get_risk():
|
| 10 |
try:
|
| 11 |
risk = get_system_risk()
|
| 12 |
+
except NotImplementedError:
|
| 13 |
+
raise HTTPException(
|
| 14 |
+
status_code=501,
|
| 15 |
+
detail="This endpoint is deprecated and not implemented")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
except Exception as e:
|
| 17 |
raise HTTPException(status_code=500, detail=str(e))
|
| 18 |
|
| 19 |
+
if risk < 0.3:
|
| 20 |
+
status = "low"
|
| 21 |
+
elif risk < 0.6:
|
| 22 |
+
status = "moderate"
|
| 23 |
+
elif risk < 0.8:
|
| 24 |
+
status = "high"
|
| 25 |
+
else:
|
| 26 |
+
status = "critical"
|
| 27 |
+
return RiskResponse(system_risk=risk, status=status)
|
| 28 |
+
|
| 29 |
|
| 30 |
@router.get("/history")
|
| 31 |
async def get_risk_history():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
import random
|
| 33 |
import datetime
|
| 34 |
now = datetime.datetime.now()
|
| 35 |
+
data = [{"time": (now - datetime.timedelta(hours=i)).isoformat(),
|
| 36 |
+
"risk": round(random.uniform(0.2, 0.8), 2)} for i in range(24, 0, -1)]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
return data
|
app/api/routes_users.py
CHANGED
|
@@ -3,7 +3,6 @@ User endpoints – registration and quota information.
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import uuid
|
| 6 |
-
import os
|
| 7 |
from fastapi import APIRouter, Depends, HTTPException, Request
|
| 8 |
from slowapi import Limiter
|
| 9 |
from slowapi.util import get_remote_address
|
|
@@ -23,7 +22,9 @@ async def register_user(request: Request):
|
|
| 23 |
Rate‑limited to 5 requests per hour per IP address.
|
| 24 |
"""
|
| 25 |
if tracker is None:
|
| 26 |
-
raise HTTPException(
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Generate a new API key
|
| 29 |
new_key = f"sk_free_{uuid.uuid4().hex[:24]}"
|
|
@@ -36,12 +37,13 @@ async def register_user(request: Request):
|
|
| 36 |
return {
|
| 37 |
"api_key": new_key,
|
| 38 |
"tier": "free",
|
| 39 |
-
"message": "API key created. Store it securely – you won't see it again."
|
| 40 |
-
}
|
| 41 |
|
| 42 |
|
| 43 |
@router.get("/quota")
|
| 44 |
-
async def get_user_quota(
|
|
|
|
|
|
|
| 45 |
"""
|
| 46 |
Return the current user's tier and remaining evaluation quota.
|
| 47 |
Requires API key in Authorization header.
|
|
@@ -55,17 +57,3 @@ async def get_user_quota(request: Request, quota: dict = Depends(enforce_quota))
|
|
| 55 |
"remaining": remaining,
|
| 56 |
"limit": limit,
|
| 57 |
}
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
# ===== DEBUG ENDPOINT – Remove in production =====
|
| 61 |
-
@router.get("/tracker-status")
|
| 62 |
-
async def tracker_status():
|
| 63 |
-
"""
|
| 64 |
-
Debug endpoint to check if the usage tracker is initialised.
|
| 65 |
-
Returns the tracker object and environment variables.
|
| 66 |
-
"""
|
| 67 |
-
return {
|
| 68 |
-
"tracker": str(tracker),
|
| 69 |
-
"env_tracking": os.getenv("ARF_USAGE_TRACKING"),
|
| 70 |
-
"env_db_path": os.getenv("ARF_USAGE_DB_PATH")
|
| 71 |
-
}
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import uuid
|
|
|
|
| 6 |
from fastapi import APIRouter, Depends, HTTPException, Request
|
| 7 |
from slowapi import Limiter
|
| 8 |
from slowapi.util import get_remote_address
|
|
|
|
| 22 |
Rate‑limited to 5 requests per hour per IP address.
|
| 23 |
"""
|
| 24 |
if tracker is None:
|
| 25 |
+
raise HTTPException(
|
| 26 |
+
status_code=503,
|
| 27 |
+
detail="Usage tracking not available")
|
| 28 |
|
| 29 |
# Generate a new API key
|
| 30 |
new_key = f"sk_free_{uuid.uuid4().hex[:24]}"
|
|
|
|
| 37 |
return {
|
| 38 |
"api_key": new_key,
|
| 39 |
"tier": "free",
|
| 40 |
+
"message": "API key created. Store it securely – you won't see it again."}
|
|
|
|
| 41 |
|
| 42 |
|
| 43 |
@router.get("/quota")
|
| 44 |
+
async def get_user_quota(
|
| 45 |
+
request: Request,
|
| 46 |
+
quota: dict = Depends(enforce_quota)):
|
| 47 |
"""
|
| 48 |
Return the current user's tier and remaining evaluation quota.
|
| 49 |
Requires API key in Authorization header.
|
|
|
|
| 57 |
"remaining": remaining,
|
| 58 |
"limit": limit,
|
| 59 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app/api/webhooks.py
CHANGED
|
@@ -33,7 +33,8 @@ async def stripe_webhook(request: Request):
|
|
| 33 |
# Handle subscription events
|
| 34 |
if event["type"] == "checkout.session.completed":
|
| 35 |
session = event["data"]["object"]
|
| 36 |
-
api_key = session.get("client_reference_id") or session.get(
|
|
|
|
| 37 |
if api_key:
|
| 38 |
update_key_tier(api_key, Tier.PRO)
|
| 39 |
elif event["type"] == "customer.subscription.deleted":
|
|
|
|
| 33 |
# Handle subscription events
|
| 34 |
if event["type"] == "checkout.session.completed":
|
| 35 |
session = event["data"]["object"]
|
| 36 |
+
api_key = session.get("client_reference_id") or session.get(
|
| 37 |
+
"metadata", {}).get("api_key")
|
| 38 |
if api_key:
|
| 39 |
update_key_tier(api_key, Tier.PRO)
|
| 40 |
elif event["type"] == "customer.subscription.deleted":
|
app/core/config.py
CHANGED
|
@@ -15,6 +15,9 @@ class Settings(BaseSettings):
|
|
| 15 |
ARF_REDIS_URL: Optional[str] = None
|
| 16 |
ARF_API_KEYS: str = "{}" # JSON string of {key: tier}
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
class Config:
|
| 19 |
env_file = ".env"
|
| 20 |
extra = "ignore"
|
|
|
|
| 15 |
ARF_REDIS_URL: Optional[str] = None
|
| 16 |
ARF_API_KEYS: str = "{}" # JSON string of {key: tier}
|
| 17 |
|
| 18 |
+
# Tracing (OpenTelemetry)
|
| 19 |
+
OTEL_EXPORTER_OTLP_ENDPOINT: Optional[str] = None
|
| 20 |
+
|
| 21 |
class Config:
|
| 22 |
env_file = ".env"
|
| 23 |
extra = "ignore"
|
app/core/usage_tracker.py
CHANGED
|
@@ -1,19 +1,18 @@
|
|
| 1 |
"""
|
| 2 |
Usage Tracker for ARF API – quotas, tiers, and audit logging.
|
| 3 |
-
|
| 4 |
"""
|
| 5 |
|
| 6 |
-
import os
|
| 7 |
import json
|
| 8 |
import sqlite3
|
| 9 |
import threading
|
| 10 |
import time
|
| 11 |
from contextlib import contextmanager
|
| 12 |
from datetime import datetime, timedelta
|
| 13 |
-
from typing import Dict, Any, Optional, List
|
| 14 |
-
from enum import Enum
|
| 15 |
from dataclasses import dataclass
|
| 16 |
-
from
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Optional Redis support
|
| 19 |
try:
|
|
@@ -66,10 +65,11 @@ class UsageRecord:
|
|
| 66 |
|
| 67 |
class UsageTracker:
|
| 68 |
"""
|
| 69 |
-
Thread‑safe usage tracker with
|
| 70 |
"""
|
| 71 |
|
| 72 |
-
def __init__(self, db_path: str = "arf_usage.db",
|
|
|
|
| 73 |
self.db_path = db_path
|
| 74 |
self._local = threading.local()
|
| 75 |
self._init_db()
|
|
@@ -78,14 +78,17 @@ class UsageTracker:
|
|
| 78 |
if redis_url and REDIS_AVAILABLE:
|
| 79 |
self._redis_client = redis.from_url(redis_url)
|
| 80 |
elif redis_url:
|
| 81 |
-
raise ImportError(
|
|
|
|
| 82 |
|
| 83 |
@contextmanager
|
| 84 |
def _get_conn(self):
|
| 85 |
-
"""Get a thread‑local SQLite connection."""
|
| 86 |
if not hasattr(self._local, "conn"):
|
| 87 |
-
self._local.conn = sqlite3.connect(
|
|
|
|
| 88 |
self._local.conn.row_factory = sqlite3.Row
|
|
|
|
| 89 |
yield self._local.conn
|
| 90 |
|
| 91 |
def _init_db(self):
|
|
@@ -109,7 +112,8 @@ class UsageTracker:
|
|
| 109 |
request_body TEXT,
|
| 110 |
response TEXT,
|
| 111 |
error TEXT,
|
| 112 |
-
processing_ms REAL
|
|
|
|
| 113 |
)
|
| 114 |
""")
|
| 115 |
conn.execute("""
|
|
@@ -124,6 +128,12 @@ class UsageTracker:
|
|
| 124 |
PRIMARY KEY (api_key, year_month)
|
| 125 |
)
|
| 126 |
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
conn.commit()
|
| 128 |
|
| 129 |
def _get_month_key(self) -> str:
|
|
@@ -132,7 +142,8 @@ class UsageTracker:
|
|
| 132 |
def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
|
| 133 |
"""Register a new API key. Returns True if key exists or was created."""
|
| 134 |
with self._get_conn() as conn:
|
| 135 |
-
row = conn.execute(
|
|
|
|
| 136 |
if row:
|
| 137 |
return True
|
| 138 |
conn.execute(
|
|
@@ -156,45 +167,56 @@ class UsageTracker:
|
|
| 156 |
def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
|
| 157 |
"""Update the tier of an existing API key. Returns True if successful."""
|
| 158 |
with self._get_conn() as conn:
|
| 159 |
-
row = conn.execute(
|
|
|
|
| 160 |
if not row:
|
| 161 |
return False
|
| 162 |
-
conn.execute(
|
|
|
|
|
|
|
|
|
|
| 163 |
conn.commit()
|
| 164 |
return True
|
| 165 |
|
| 166 |
-
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
limit = tier.monthly_evaluation_limit
|
| 169 |
if limit is None:
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
|
|
|
|
|
|
|
|
|
| 177 |
|
|
|
|
| 178 |
with self._get_conn() as conn:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
(
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
month = self._get_month_key()
|
| 192 |
-
if self._redis_client:
|
| 193 |
-
redis_key = f"arf:quota:{api_key}:{month}"
|
| 194 |
-
self._redis_client.incr(redis_key)
|
| 195 |
-
self._redis_client.expire(redis_key, timedelta(days=31))
|
| 196 |
-
else:
|
| 197 |
-
with self._get_conn() as conn:
|
| 198 |
conn.execute(
|
| 199 |
"""INSERT INTO monthly_counts (api_key, year_month, count)
|
| 200 |
VALUES (?, ?, 1)
|
|
@@ -202,58 +224,190 @@ class UsageTracker:
|
|
| 202 |
(api_key, month)
|
| 203 |
)
|
| 204 |
conn.commit()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
-
def
|
| 207 |
-
"""
|
| 208 |
with self._get_conn() as conn:
|
| 209 |
conn.execute(
|
| 210 |
-
"
|
| 211 |
-
|
| 212 |
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 213 |
-
(
|
| 214 |
-
record.api_key,
|
| 215 |
-
record.tier.value,
|
| 216 |
-
record.timestamp,
|
| 217 |
-
record.endpoint,
|
| 218 |
-
json.dumps(record.request_body) if record.request_body else None,
|
| 219 |
-
json.dumps(record.response) if record.response else None,
|
| 220 |
-
record.error,
|
| 221 |
-
record.processing_ms,
|
| 222 |
-
)
|
| 223 |
)
|
| 224 |
conn.commit()
|
|
|
|
|
|
|
| 225 |
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
"""
|
| 228 |
Synchronously record usage and increment counter.
|
| 229 |
-
Returns True if within quota
|
|
|
|
| 230 |
"""
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
if limit is not None:
|
| 234 |
-
remaining = self.get_remaining_quota(record.api_key, tier)
|
| 235 |
-
if remaining <= 0:
|
| 236 |
-
return False
|
| 237 |
-
self._increment_quota(record.api_key, tier)
|
| 238 |
-
self._insert_audit_log(record)
|
| 239 |
-
return True
|
| 240 |
|
| 241 |
-
async def increment_usage_async(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
"""
|
| 243 |
Asynchronously record usage using FastAPI BackgroundTasks.
|
| 244 |
-
|
| 245 |
"""
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
limit = tier.monthly_evaluation_limit
|
| 248 |
-
if limit is
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
def get_audit_logs(
|
| 258 |
self,
|
| 259 |
api_key: str,
|
|
@@ -278,8 +432,9 @@ class UsageTracker:
|
|
| 278 |
return [dict(row) for row in rows]
|
| 279 |
|
| 280 |
def clean_old_logs(self):
|
| 281 |
-
"""Delete logs older than retention period for each tier."""
|
| 282 |
with self._get_conn() as conn:
|
|
|
|
| 283 |
for tier in Tier:
|
| 284 |
retention_days = tier.audit_log_retention_days
|
| 285 |
if retention_days is None:
|
|
@@ -289,14 +444,23 @@ class UsageTracker:
|
|
| 289 |
"DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
|
| 290 |
(tier.value, cutoff)
|
| 291 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
conn.commit()
|
| 293 |
|
| 294 |
|
| 295 |
-
#
|
|
|
|
|
|
|
| 296 |
tracker: Optional[UsageTracker] = None
|
| 297 |
|
| 298 |
|
| 299 |
-
def init_tracker(
|
|
|
|
|
|
|
|
|
|
| 300 |
global tracker
|
| 301 |
tracker = UsageTracker(db_path, redis_url)
|
| 302 |
|
|
@@ -308,19 +472,16 @@ def update_key_tier(api_key: str, new_tier: Tier) -> bool:
|
|
| 308 |
return tracker.update_api_key_tier(api_key, new_tier)
|
| 309 |
|
| 310 |
|
| 311 |
-
# FastAPI dependency to enforce quota
|
| 312 |
-
from fastapi import HTTPException, Request
|
| 313 |
-
|
| 314 |
async def enforce_quota(request: Request, api_key: str = None):
|
| 315 |
"""
|
| 316 |
Dependency that checks API key and remaining quota.
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
If usage tracking is disabled, returns a default dict (no enforcement).
|
| 320 |
"""
|
| 321 |
-
#
|
| 322 |
if tracker is None:
|
| 323 |
-
|
|
|
|
|
|
|
| 324 |
|
| 325 |
# Extract API key from header or query
|
| 326 |
if api_key is None:
|
|
@@ -335,13 +496,16 @@ async def enforce_quota(request: Request, api_key: str = None):
|
|
| 335 |
|
| 336 |
tier = tracker.get_tier(api_key)
|
| 337 |
if tier is None:
|
| 338 |
-
raise HTTPException(
|
|
|
|
|
|
|
| 339 |
|
| 340 |
remaining = tracker.get_remaining_quota(api_key, tier)
|
| 341 |
if remaining is not None and remaining <= 0:
|
| 342 |
-
raise HTTPException(status_code=429,
|
|
|
|
| 343 |
|
| 344 |
-
# Store in request state for later logging
|
| 345 |
request.state.api_key = api_key
|
| 346 |
request.state.tier = tier
|
| 347 |
return {"api_key": api_key, "tier": tier, "remaining": remaining}
|
|
|
|
| 1 |
"""
|
| 2 |
Usage Tracker for ARF API – quotas, tiers, and audit logging.
|
| 3 |
+
Thread‑safe, atomic quota consumption, idempotent, fail‑closed.
|
| 4 |
"""
|
| 5 |
|
|
|
|
| 6 |
import json
|
| 7 |
import sqlite3
|
| 8 |
import threading
|
| 9 |
import time
|
| 10 |
from contextlib import contextmanager
|
| 11 |
from datetime import datetime, timedelta
|
|
|
|
|
|
|
| 12 |
from dataclasses import dataclass
|
| 13 |
+
from typing import Dict, Any, Optional, List, Tuple
|
| 14 |
+
from enum import Enum
|
| 15 |
+
from fastapi import BackgroundTasks, HTTPException, Request
|
| 16 |
|
| 17 |
# Optional Redis support
|
| 18 |
try:
|
|
|
|
| 65 |
|
| 66 |
class UsageTracker:
|
| 67 |
"""
|
| 68 |
+
Thread‑safe usage tracker with atomic quota consumption and idempotency.
|
| 69 |
"""
|
| 70 |
|
| 71 |
+
def __init__(self, db_path: str = "arf_usage.db",
|
| 72 |
+
redis_url: Optional[str] = None):
|
| 73 |
self.db_path = db_path
|
| 74 |
self._local = threading.local()
|
| 75 |
self._init_db()
|
|
|
|
| 78 |
if redis_url and REDIS_AVAILABLE:
|
| 79 |
self._redis_client = redis.from_url(redis_url)
|
| 80 |
elif redis_url:
|
| 81 |
+
raise ImportError(
|
| 82 |
+
"Redis client not installed. Run: pip install redis")
|
| 83 |
|
| 84 |
@contextmanager
|
| 85 |
def _get_conn(self):
|
| 86 |
+
"""Get a thread‑local SQLite connection with write‑ahead logging and immediate transactions."""
|
| 87 |
if not hasattr(self._local, "conn"):
|
| 88 |
+
self._local.conn = sqlite3.connect(
|
| 89 |
+
self.db_path, check_same_thread=False, isolation_level=None)
|
| 90 |
self._local.conn.row_factory = sqlite3.Row
|
| 91 |
+
self._local.conn.execute("PRAGMA journal_mode=WAL")
|
| 92 |
yield self._local.conn
|
| 93 |
|
| 94 |
def _init_db(self):
|
|
|
|
| 112 |
request_body TEXT,
|
| 113 |
response TEXT,
|
| 114 |
error TEXT,
|
| 115 |
+
processing_ms REAL,
|
| 116 |
+
idempotency_key TEXT UNIQUE
|
| 117 |
)
|
| 118 |
""")
|
| 119 |
conn.execute("""
|
|
|
|
| 128 |
PRIMARY KEY (api_key, year_month)
|
| 129 |
)
|
| 130 |
""")
|
| 131 |
+
conn.execute("""
|
| 132 |
+
CREATE TABLE IF NOT EXISTS idempotency_keys (
|
| 133 |
+
key TEXT PRIMARY KEY,
|
| 134 |
+
consumed_at REAL NOT NULL
|
| 135 |
+
)
|
| 136 |
+
""")
|
| 137 |
conn.commit()
|
| 138 |
|
| 139 |
def _get_month_key(self) -> str:
|
|
|
|
| 142 |
def get_or_create_api_key(self, key: str, tier: Tier = Tier.FREE) -> bool:
|
| 143 |
"""Register a new API key. Returns True if key exists or was created."""
|
| 144 |
with self._get_conn() as conn:
|
| 145 |
+
row = conn.execute(
|
| 146 |
+
"SELECT key FROM api_keys WHERE key = ?", (key,)).fetchone()
|
| 147 |
if row:
|
| 148 |
return True
|
| 149 |
conn.execute(
|
|
|
|
| 167 |
def update_api_key_tier(self, api_key: str, new_tier: Tier) -> bool:
|
| 168 |
"""Update the tier of an existing API key. Returns True if successful."""
|
| 169 |
with self._get_conn() as conn:
|
| 170 |
+
row = conn.execute(
|
| 171 |
+
"SELECT key FROM api_keys WHERE key = ?", (api_key,)).fetchone()
|
| 172 |
if not row:
|
| 173 |
return False
|
| 174 |
+
conn.execute(
|
| 175 |
+
"UPDATE api_keys SET tier = ? WHERE key = ?",
|
| 176 |
+
(new_tier.value,
|
| 177 |
+
api_key))
|
| 178 |
conn.commit()
|
| 179 |
return True
|
| 180 |
|
| 181 |
+
# --------------------------------------------------------------------------
|
| 182 |
+
# Atomic quota consumption
|
| 183 |
+
# --------------------------------------------------------------------------
|
| 184 |
+
def _consume_quota_atomic_sqlite(
|
| 185 |
+
self,
|
| 186 |
+
api_key: str,
|
| 187 |
+
tier: Tier,
|
| 188 |
+
month: str) -> bool: # noqa: E501
|
| 189 |
+
"""
|
| 190 |
+
Atomically increment counter only if under limit.
|
| 191 |
+
Returns True if quota was consumed, False if limit reached.
|
| 192 |
+
"""
|
| 193 |
limit = tier.monthly_evaluation_limit
|
| 194 |
if limit is None:
|
| 195 |
+
# Unlimited – still increment for tracking but always succeed
|
| 196 |
+
with self._get_conn() as conn:
|
| 197 |
+
conn.execute(
|
| 198 |
+
"""INSERT INTO monthly_counts (api_key, year_month, count)
|
| 199 |
+
VALUES (?, ?, 1)
|
| 200 |
+
ON CONFLICT(api_key, year_month) DO UPDATE SET count = count + 1""",
|
| 201 |
+
(api_key, month)
|
| 202 |
+
)
|
| 203 |
+
conn.commit()
|
| 204 |
+
return True
|
| 205 |
|
| 206 |
+
# Use BEGIN IMMEDIATE to lock the database for the transaction
|
| 207 |
with self._get_conn() as conn:
|
| 208 |
+
conn.execute("BEGIN IMMEDIATE")
|
| 209 |
+
try:
|
| 210 |
+
# Get current count (or 0)
|
| 211 |
+
row = conn.execute(
|
| 212 |
+
"SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
|
| 213 |
+
(api_key, month)
|
| 214 |
+
).fetchone()
|
| 215 |
+
current = row["count"] if row else 0
|
| 216 |
+
if current >= limit:
|
| 217 |
+
conn.rollback()
|
| 218 |
+
return False
|
| 219 |
+
# Increment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
conn.execute(
|
| 221 |
"""INSERT INTO monthly_counts (api_key, year_month, count)
|
| 222 |
VALUES (?, ?, 1)
|
|
|
|
| 224 |
(api_key, month)
|
| 225 |
)
|
| 226 |
conn.commit()
|
| 227 |
+
return True
|
| 228 |
+
except Exception:
|
| 229 |
+
conn.rollback()
|
| 230 |
+
raise
|
| 231 |
+
|
| 232 |
+
def _consume_quota_atomic_redis(
|
| 233 |
+
self,
|
| 234 |
+
api_key: str,
|
| 235 |
+
tier: Tier,
|
| 236 |
+
month: str) -> bool:
|
| 237 |
+
"""Atomic Lua script for Redis: INCR only if below limit."""
|
| 238 |
+
limit = tier.monthly_evaluation_limit
|
| 239 |
+
if limit is None:
|
| 240 |
+
# Unlimited – just increment and return True
|
| 241 |
+
redis_key = f"arf:quota:{api_key}:{month}"
|
| 242 |
+
self._redis_client.incr(redis_key)
|
| 243 |
+
self._redis_client.expire(redis_key, timedelta(days=31))
|
| 244 |
+
return True
|
| 245 |
+
|
| 246 |
+
lua_script = """
|
| 247 |
+
local key = KEYS[1]
|
| 248 |
+
local limit = tonumber(ARGV[1])
|
| 249 |
+
local current = redis.call('GET', key)
|
| 250 |
+
if current and tonumber(current) >= limit then
|
| 251 |
+
return 0
|
| 252 |
+
end
|
| 253 |
+
local new = redis.call('INCR', key)
|
| 254 |
+
redis.call('EXPIRE', key, 2678400) -- 31 days
|
| 255 |
+
return 1
|
| 256 |
+
"""
|
| 257 |
+
redis_key = f"arf:quota:{api_key}:{month}"
|
| 258 |
+
result = self._redis_client.eval(lua_script, 1, redis_key, limit)
|
| 259 |
+
return result == 1
|
| 260 |
+
|
| 261 |
+
# --------------------------------------------------------------------------
|
| 262 |
+
# Idempotency handling
|
| 263 |
+
# --------------------------------------------------------------------------
|
| 264 |
+
def _is_idempotent_key_used(self, key: str) -> bool:
|
| 265 |
+
"""Check if idempotency key already processed."""
|
| 266 |
+
with self._get_conn() as conn:
|
| 267 |
+
row = conn.execute(
|
| 268 |
+
"SELECT 1 FROM idempotency_keys WHERE key = ?", (key,)).fetchone()
|
| 269 |
+
return row is not None
|
| 270 |
|
| 271 |
+
def _mark_idempotent_key_used(self, key: str, ttl_seconds: int = 86400):
|
| 272 |
+
"""Store idempotency key with expiration (cleanup later)."""
|
| 273 |
with self._get_conn() as conn:
|
| 274 |
conn.execute(
|
| 275 |
+
"INSERT INTO idempotency_keys (key, consumed_at) VALUES (?, ?)",
|
| 276 |
+
(key, time.time())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
)
|
| 278 |
conn.commit()
|
| 279 |
+
# Optionally schedule cleanup of old keys (can be done in a background
|
| 280 |
+
# thread)
|
| 281 |
|
| 282 |
+
# --------------------------------------------------------------------------
|
| 283 |
+
# Core usage recording (atomic + idempotent)
|
| 284 |
+
# --------------------------------------------------------------------------
|
| 285 |
+
def consume_quota_and_log(
|
| 286 |
+
self,
|
| 287 |
+
record: UsageRecord,
|
| 288 |
+
idempotency_key: Optional[str] = None,
|
| 289 |
+
) -> Tuple[bool, Optional[Dict[str, Any]]]:
|
| 290 |
+
"""
|
| 291 |
+
Atomically consume quota and insert audit log.
|
| 292 |
+
Returns (success, existing_response) where existing_response is not None
|
| 293 |
+
only when idempotency_key matched a previous successful call.
|
| 294 |
+
"""
|
| 295 |
+
# Idempotency check (if key provided)
|
| 296 |
+
if idempotency_key:
|
| 297 |
+
if self._is_idempotent_key_used(idempotency_key):
|
| 298 |
+
# Retrieve previous response from audit log (simplified – you may cache full response)
|
| 299 |
+
# For full idempotency, we would store the response body in idempotency table.
|
| 300 |
+
# Here we return a marker that caller should use cached
|
| 301 |
+
# response.
|
| 302 |
+
return False, {"idempotent": True,
|
| 303 |
+
"message": "Already processed"}
|
| 304 |
+
|
| 305 |
+
month = self._get_month_key()
|
| 306 |
+
# Atomic quota consumption
|
| 307 |
+
if self._redis_client:
|
| 308 |
+
quota_ok = self._consume_quota_atomic_redis(
|
| 309 |
+
record.api_key, record.tier, month)
|
| 310 |
+
else:
|
| 311 |
+
quota_ok = self._consume_quota_atomic_sqlite(
|
| 312 |
+
record.api_key, record.tier, month)
|
| 313 |
+
|
| 314 |
+
if not quota_ok:
|
| 315 |
+
return False, None
|
| 316 |
+
|
| 317 |
+
# Insert audit log (with idempotency key as unique constraint)
|
| 318 |
+
try:
|
| 319 |
+
with self._get_conn() as conn:
|
| 320 |
+
conn.execute(
|
| 321 |
+
"""INSERT INTO usage_log
|
| 322 |
+
(api_key, tier, timestamp, endpoint,
|
| 323 |
+
request_body, response, error, processing_ms,
|
| 324 |
+
idempotency_key)
|
| 325 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 326 |
+
(record.api_key,
|
| 327 |
+
record.tier.value,
|
| 328 |
+
record.timestamp,
|
| 329 |
+
record.endpoint,
|
| 330 |
+
json.dumps(
|
| 331 |
+
record.request_body) if record.request_body else None,
|
| 332 |
+
json.dumps(
|
| 333 |
+
record.response) if record.response else None,
|
| 334 |
+
record.error,
|
| 335 |
+
record.processing_ms,
|
| 336 |
+
idempotency_key,
|
| 337 |
+
))
|
| 338 |
+
conn.commit()
|
| 339 |
+
except sqlite3.IntegrityError as e:
|
| 340 |
+
# Duplicate idempotency_key – already inserted by another
|
| 341 |
+
# concurrent request
|
| 342 |
+
if "UNIQUE constraint failed: usage_log.idempotency_key" in str(e):
|
| 343 |
+
return False, {"idempotent": True,
|
| 344 |
+
"message": "Already processed"}
|
| 345 |
+
raise
|
| 346 |
+
|
| 347 |
+
if idempotency_key:
|
| 348 |
+
self._mark_idempotent_key_used(idempotency_key)
|
| 349 |
+
# Removed stray # noqa: E501 comment that was wrongly indented here
|
| 350 |
+
return True, None
|
| 351 |
+
|
| 352 |
+
# --------------------------------------------------------------------------
|
| 353 |
+
# Legacy interface (kept for compatibility but deprecated)
|
| 354 |
+
# --------------------------------------------------------------------------
|
| 355 |
+
def increment_usage_sync(
|
| 356 |
+
self,
|
| 357 |
+
record: UsageRecord,
|
| 358 |
+
idempotency_key: Optional[str] = None) -> bool:
|
| 359 |
"""
|
| 360 |
Synchronously record usage and increment counter.
|
| 361 |
+
Returns True if within quota and recorded, False otherwise.
|
| 362 |
+
This method now uses the atomic implementation.
|
| 363 |
"""
|
| 364 |
+
success, _ = self.consume_quota_and_log(record, idempotency_key)
|
| 365 |
+
return success
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
|
| 367 |
+
async def increment_usage_async(
|
| 368 |
+
self,
|
| 369 |
+
record: UsageRecord,
|
| 370 |
+
background_tasks: BackgroundTasks,
|
| 371 |
+
idempotency_key: Optional[str] = None
|
| 372 |
+
) -> bool:
|
| 373 |
"""
|
| 374 |
Asynchronously record usage using FastAPI BackgroundTasks.
|
| 375 |
+
Still does the atomic check synchronously, then schedules the insert.
|
| 376 |
"""
|
| 377 |
+
# First, do atomic quota check (synchronous) – we must ensure we don't double-consume.
|
| 378 |
+
# Because background tasks may run later, we still need to reserve quota now.
|
| 379 |
+
# Simplified: we call consume_quota_and_log synchronously – that defeats async benefit.
|
| 380 |
+
# Better to use a queue or Redis with background processing.
|
| 381 |
+
# For this fix, we'll use the sync method (blocking) but still support
|
| 382 |
+
# idempotency.
|
| 383 |
+
return self.increment_usage_sync(record, idempotency_key)
|
| 384 |
+
|
| 385 |
+
# --------------------------------------------------------------------------
|
| 386 |
+
# Quota inspection (non‑atomic, for display only)
|
| 387 |
+
# --------------------------------------------------------------------------
|
| 388 |
+
def get_remaining_quota(self, api_key: str, tier: Tier) -> Optional[int]:
|
| 389 |
+
"""Return remaining evaluations for the month (non‑atomic, for info only)."""
|
| 390 |
limit = tier.monthly_evaluation_limit
|
| 391 |
+
if limit is None:
|
| 392 |
+
return None
|
| 393 |
+
|
| 394 |
+
month = self._get_month_key()
|
| 395 |
+
if self._redis_client:
|
| 396 |
+
redis_key = f"arf:quota:{api_key}:{month}"
|
| 397 |
+
count = int(self._redis_client.get(redis_key) or 0)
|
| 398 |
+
return max(0, limit - count)
|
| 399 |
|
| 400 |
+
with self._get_conn() as conn:
|
| 401 |
+
row = conn.execute(
|
| 402 |
+
"SELECT count FROM monthly_counts WHERE api_key = ? AND year_month = ?",
|
| 403 |
+
(api_key, month)
|
| 404 |
+
).fetchone()
|
| 405 |
+
count = row["count"] if row else 0
|
| 406 |
+
return max(0, limit - count)
|
| 407 |
+
|
| 408 |
+
# --------------------------------------------------------------------------
|
| 409 |
+
# Audit and maintenance
|
| 410 |
+
# --------------------------------------------------------------------------
|
| 411 |
def get_audit_logs(
|
| 412 |
self,
|
| 413 |
api_key: str,
|
|
|
|
| 432 |
return [dict(row) for row in rows]
|
| 433 |
|
| 434 |
def clean_old_logs(self):
|
| 435 |
+
"""Delete logs older than retention period for each tier, and old idempotency keys."""
|
| 436 |
with self._get_conn() as conn:
|
| 437 |
+
# Delete old usage logs
|
| 438 |
for tier in Tier:
|
| 439 |
retention_days = tier.audit_log_retention_days
|
| 440 |
if retention_days is None:
|
|
|
|
| 444 |
"DELETE FROM usage_log WHERE tier = ? AND timestamp < ?",
|
| 445 |
(tier.value, cutoff)
|
| 446 |
)
|
| 447 |
+
# Delete idempotency keys older than 7 days
|
| 448 |
+
cutoff = time.time() - 7 * 86400
|
| 449 |
+
conn.execute(
|
| 450 |
+
"DELETE FROM idempotency_keys WHERE consumed_at < ?", (cutoff,))
|
| 451 |
conn.commit()
|
| 452 |
|
| 453 |
|
| 454 |
+
# --------------------------------------------------------------------------
|
| 455 |
+
# Global instance and FastAPI dependency (fail‑closed)
|
| 456 |
+
# --------------------------------------------------------------------------
|
| 457 |
tracker: Optional[UsageTracker] = None
|
| 458 |
|
| 459 |
|
| 460 |
+
def init_tracker(
|
| 461 |
+
db_path: str = "arf_usage.db",
|
| 462 |
+
redis_url: Optional[str] = None):
|
| 463 |
+
"""Initialize the global tracker. Must be called before enforce_quota."""
|
| 464 |
global tracker
|
| 465 |
tracker = UsageTracker(db_path, redis_url)
|
| 466 |
|
|
|
|
| 472 |
return tracker.update_api_key_tier(api_key, new_tier)
|
| 473 |
|
| 474 |
|
|
|
|
|
|
|
|
|
|
| 475 |
async def enforce_quota(request: Request, api_key: str = None):
|
| 476 |
"""
|
| 477 |
Dependency that checks API key and remaining quota.
|
| 478 |
+
FAILS CLOSED: if tracker not initialised, raises HTTP 503.
|
|
|
|
|
|
|
| 479 |
"""
|
| 480 |
+
# P0 fix: No fallback that allows all requests
|
| 481 |
if tracker is None:
|
| 482 |
+
raise HTTPException(
|
| 483 |
+
status_code=503,
|
| 484 |
+
detail="Usage tracking service not initialised. Please contact administrator.")
|
| 485 |
|
| 486 |
# Extract API key from header or query
|
| 487 |
if api_key is None:
|
|
|
|
| 496 |
|
| 497 |
tier = tracker.get_tier(api_key)
|
| 498 |
if tier is None:
|
| 499 |
+
raise HTTPException(
|
| 500 |
+
status_code=403,
|
| 501 |
+
detail="Invalid or inactive API key")
|
| 502 |
|
| 503 |
remaining = tracker.get_remaining_quota(api_key, tier)
|
| 504 |
if remaining is not None and remaining <= 0:
|
| 505 |
+
raise HTTPException(status_code=429,
|
| 506 |
+
detail="Monthly evaluation quota exceeded")
|
| 507 |
|
| 508 |
+
# Store in request state for later logging (optional)
|
| 509 |
request.state.api_key = api_key
|
| 510 |
request.state.tier = tier
|
| 511 |
return {"api_key": api_key, "tier": tier, "remaining": remaining}
|
app/database/models_intents.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, ForeignKey, UniqueConstraint
|
| 2 |
from sqlalchemy.orm import relationship
|
| 3 |
import datetime
|
| 4 |
from .base import Base
|
|
@@ -7,27 +7,69 @@ from .base import Base
|
|
| 7 |
class IntentDB(Base):
|
| 8 |
__tablename__ = "intents"
|
| 9 |
id = Column(Integer, primary_key=True, index=True)
|
| 10 |
-
deterministic_id = Column(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
intent_type = Column(String(64), nullable=False)
|
| 12 |
payload = Column(JSON, nullable=False)
|
| 13 |
oss_payload = Column(JSON, nullable=True)
|
| 14 |
environment = Column(String(32), nullable=True)
|
| 15 |
-
created_at = Column(
|
|
|
|
|
|
|
|
|
|
| 16 |
evaluated_at = Column(DateTime, nullable=True)
|
| 17 |
risk_score = Column(String(32), nullable=True)
|
| 18 |
-
outcomes = relationship(
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
class OutcomeDB(Base):
|
| 22 |
__tablename__ = "intent_outcomes"
|
| 23 |
id = Column(Integer, primary_key=True, index=True)
|
| 24 |
-
intent_id = Column(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
success = Column(Boolean, nullable=False)
|
| 26 |
recorded_by = Column(String(128), nullable=True)
|
| 27 |
notes = Column(Text, nullable=True)
|
| 28 |
-
recorded_at = Column(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
intent = relationship("IntentDB", back_populates="outcomes")
|
| 30 |
|
| 31 |
__table_args__ = (
|
| 32 |
UniqueConstraint("intent_id", name="uq_outcome_intentid"),
|
| 33 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sqlalchemy import Column, Integer, String, DateTime, Boolean, Text, JSON, Float, ForeignKey, UniqueConstraint
|
| 2 |
from sqlalchemy.orm import relationship
|
| 3 |
import datetime
|
| 4 |
from .base import Base
|
|
|
|
| 7 |
class IntentDB(Base):
|
| 8 |
__tablename__ = "intents"
|
| 9 |
id = Column(Integer, primary_key=True, index=True)
|
| 10 |
+
deterministic_id = Column(
|
| 11 |
+
String(64),
|
| 12 |
+
unique=True,
|
| 13 |
+
index=True,
|
| 14 |
+
nullable=False)
|
| 15 |
intent_type = Column(String(64), nullable=False)
|
| 16 |
payload = Column(JSON, nullable=False)
|
| 17 |
oss_payload = Column(JSON, nullable=True)
|
| 18 |
environment = Column(String(32), nullable=True)
|
| 19 |
+
created_at = Column(
|
| 20 |
+
DateTime,
|
| 21 |
+
default=datetime.datetime.utcnow,
|
| 22 |
+
nullable=False)
|
| 23 |
evaluated_at = Column(DateTime, nullable=True)
|
| 24 |
risk_score = Column(String(32), nullable=True)
|
| 25 |
+
outcomes = relationship(
|
| 26 |
+
"OutcomeDB",
|
| 27 |
+
back_populates="intent",
|
| 28 |
+
cascade="all, delete-orphan")
|
| 29 |
|
| 30 |
|
| 31 |
class OutcomeDB(Base):
|
| 32 |
__tablename__ = "intent_outcomes"
|
| 33 |
id = Column(Integer, primary_key=True, index=True)
|
| 34 |
+
intent_id = Column(
|
| 35 |
+
Integer,
|
| 36 |
+
ForeignKey(
|
| 37 |
+
"intents.id",
|
| 38 |
+
ondelete="CASCADE"),
|
| 39 |
+
nullable=False)
|
| 40 |
success = Column(Boolean, nullable=False)
|
| 41 |
recorded_by = Column(String(128), nullable=True)
|
| 42 |
notes = Column(Text, nullable=True)
|
| 43 |
+
recorded_at = Column(
|
| 44 |
+
DateTime,
|
| 45 |
+
default=datetime.datetime.utcnow,
|
| 46 |
+
nullable=False)
|
| 47 |
+
idempotency_key = Column(String(128), unique=True, nullable=True)
|
| 48 |
intent = relationship("IntentDB", back_populates="outcomes")
|
| 49 |
|
| 50 |
__table_args__ = (
|
| 51 |
UniqueConstraint("intent_id", name="uq_outcome_intentid"),
|
| 52 |
)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# NEW: Persistence for the conjugate Bayesian state
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
class BetaStateDB(Base):
|
| 59 |
+
"""
|
| 60 |
+
Stores the per‑category posterior parameters (α, β) of the BetaStore
|
| 61 |
+
so that online learning survives API restarts.
|
| 62 |
+
|
| 63 |
+
Only one row per ActionCategory is expected; the 'category' column is
|
| 64 |
+
unique. Updates are performed via merge / upsert.
|
| 65 |
+
"""
|
| 66 |
+
__tablename__ = "beta_state"
|
| 67 |
+
|
| 68 |
+
id = Column(Integer, primary_key=True, index=True)
|
| 69 |
+
category = Column(String(32), unique=True, nullable=False, index=True)
|
| 70 |
+
alpha = Column(Float, nullable=False)
|
| 71 |
+
beta = Column(Float, nullable=False)
|
| 72 |
+
updated_at = Column(
|
| 73 |
+
DateTime,
|
| 74 |
+
default=datetime.datetime.utcnow,
|
| 75 |
+
onupdate=datetime.datetime.utcnow)
|
app/database/session.py
CHANGED
|
@@ -1,19 +1,6 @@
|
|
| 1 |
from sqlalchemy import create_engine
|
| 2 |
-
from sqlalchemy.ext.declarative import declarative_base
|
| 3 |
from sqlalchemy.orm import sessionmaker
|
| 4 |
from app.core.config import settings
|
| 5 |
|
| 6 |
-
|
| 7 |
-
if settings.database_url:
|
| 8 |
-
DATABASE_URL = settings.database_url
|
| 9 |
-
else:
|
| 10 |
-
# Fallback to a local SQLite file (writable in the container)
|
| 11 |
-
DATABASE_URL = "sqlite:///./app.db"
|
| 12 |
-
|
| 13 |
-
# For SQLite, we need to disable the threading check
|
| 14 |
-
connect_args = {"check_same_thread": False} if DATABASE_URL.startswith("sqlite") else {}
|
| 15 |
-
|
| 16 |
-
engine = create_engine(DATABASE_URL, connect_args=connect_args)
|
| 17 |
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
| 18 |
-
|
| 19 |
-
Base = declarative_base()
|
|
|
|
| 1 |
from sqlalchemy import create_engine
|
|
|
|
| 2 |
from sqlalchemy.orm import sessionmaker
|
| 3 |
from app.core.config import settings
|
| 4 |
|
| 5 |
+
engine = create_engine(settings.database_url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
|
|
|
|
|
|
app/main.py
CHANGED
|
@@ -1,18 +1,42 @@
|
|
| 1 |
"""
|
| 2 |
-
ARF API Control Plane
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
import logging
|
| 6 |
import os
|
| 7 |
import sys
|
| 8 |
import json
|
|
|
|
|
|
|
| 9 |
from contextlib import asynccontextmanager
|
| 10 |
from typing import Dict
|
| 11 |
|
| 12 |
from fastapi import FastAPI
|
| 13 |
from fastapi.middleware.cors import CORSMiddleware
|
| 14 |
|
| 15 |
-
# Optional
|
| 16 |
try:
|
| 17 |
from prometheus_fastapi_instrumentator import Instrumentator
|
| 18 |
PROMETHEUS_AVAILABLE = True
|
|
@@ -20,7 +44,7 @@ except ImportError:
|
|
| 20 |
PROMETHEUS_AVAILABLE = False
|
| 21 |
Instrumentator = None
|
| 22 |
|
| 23 |
-
# Optional slowapi
|
| 24 |
try:
|
| 25 |
from slowapi import _rate_limit_exceeded_handler
|
| 26 |
from slowapi.errors import RateLimitExceeded
|
|
@@ -32,7 +56,7 @@ except ImportError:
|
|
| 32 |
RateLimitExceeded = None
|
| 33 |
SlowAPIMiddleware = None
|
| 34 |
|
| 35 |
-
#
|
| 36 |
try:
|
| 37 |
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 38 |
from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
|
|
@@ -47,7 +71,7 @@ except ImportError:
|
|
| 47 |
RAGGraphMemory = None
|
| 48 |
MemoryConstants = None
|
| 49 |
|
| 50 |
-
#
|
| 51 |
from app.core.usage_tracker import init_tracker, tracker, Tier
|
| 52 |
|
| 53 |
from app.api import (
|
|
@@ -61,6 +85,7 @@ from app.api import (
|
|
| 61 |
routes_payments,
|
| 62 |
webhooks,
|
| 63 |
routes_users,
|
|
|
|
| 64 |
)
|
| 65 |
from app.api.deps import limiter
|
| 66 |
from app.core.config import settings
|
|
@@ -75,18 +100,35 @@ logging.basicConfig(
|
|
| 75 |
|
| 76 |
@asynccontextmanager
|
| 77 |
async def lifespan(app: FastAPI):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
logger.info("🚀 Starting ARF API Control Plane")
|
| 79 |
logger.debug(f"Python path: {sys.path}")
|
| 80 |
|
|
|
|
| 81 |
if ARF_AVAILABLE:
|
| 82 |
hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
|
| 83 |
use_hyperpriors = os.getenv(
|
| 84 |
-
"ARF_USE_HYPERPRIORS",
|
| 85 |
-
|
| 86 |
logger.info(
|
| 87 |
"Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
|
| 88 |
hmc_model_path,
|
| 89 |
-
use_hyperpriors
|
|
|
|
| 90 |
try:
|
| 91 |
app.state.risk_engine = RiskEngine(
|
| 92 |
hmc_model_path=hmc_model_path,
|
|
@@ -99,6 +141,55 @@ async def lifespan(app: FastAPI):
|
|
| 99 |
logger.exception("💥 Fatal error initializing RiskEngine")
|
| 100 |
raise RuntimeError("RiskEngine initialization failed") from e
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
try:
|
| 103 |
app.state.policy_engine = PolicyEngine()
|
| 104 |
logger.info("✅ PolicyEngine initialized successfully.")
|
|
@@ -120,12 +211,14 @@ async def lifespan(app: FastAPI):
|
|
| 120 |
from sentence_transformers import SentenceTransformer
|
| 121 |
logger.info(f"Loading epistemic model: {epistemic_model_name}")
|
| 122 |
app.state.epistemic_model = SentenceTransformer(
|
| 123 |
-
epistemic_model_name
|
|
|
|
| 124 |
app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
|
| 125 |
logger.info("✅ Epistemic model loaded.")
|
| 126 |
except ImportError:
|
| 127 |
logger.warning(
|
| 128 |
-
"sentence-transformers not installed; epistemic signals will be zeros."
|
|
|
|
| 129 |
app.state.epistemic_model = None
|
| 130 |
app.state.epistemic_tokenizer = None
|
| 131 |
except Exception as e:
|
|
@@ -134,45 +227,94 @@ async def lifespan(app: FastAPI):
|
|
| 134 |
app.state.epistemic_tokenizer = None
|
| 135 |
else:
|
| 136 |
logger.info(
|
| 137 |
-
"EPISTEMIC_MODEL not set; epistemic signals will be zeros."
|
|
|
|
| 138 |
app.state.epistemic_model = None
|
| 139 |
app.state.epistemic_tokenizer = None
|
| 140 |
else:
|
| 141 |
logger.warning(
|
| 142 |
-
"agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled."
|
|
|
|
| 143 |
|
| 144 |
-
#
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
| 146 |
logger.info("Initialising usage tracker...")
|
| 147 |
-
# HARDCODED WRITABLE PATH – fixes 503 error
|
| 148 |
-
init_tracker(
|
| 149 |
-
db_path="/tmp/arf_usage.db", # was os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db")
|
| 150 |
-
redis_url=os.getenv("ARF_REDIS_URL")
|
| 151 |
-
)
|
| 152 |
-
# Seed initial API keys from environment variable (for testing / demo)
|
| 153 |
-
api_keys_json = os.getenv("ARF_API_KEYS", "{}")
|
| 154 |
try:
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
else:
|
| 168 |
-
logger.info("Usage tracking disabled
|
| 169 |
app.state.usage_tracker = None
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
yield
|
| 172 |
logger.info("🛑 Shutting down ARF API")
|
| 173 |
|
| 174 |
|
| 175 |
def create_app() -> FastAPI:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
app = FastAPI(
|
| 177 |
title=settings.app_name,
|
| 178 |
version="0.5.0",
|
|
@@ -182,6 +324,7 @@ def create_app() -> FastAPI:
|
|
| 182 |
description="Agentic Reliability Framework (ARF) API",
|
| 183 |
)
|
| 184 |
|
|
|
|
| 185 |
allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
|
| 186 |
app.add_middleware(
|
| 187 |
CORSMiddleware,
|
|
@@ -192,67 +335,64 @@ def create_app() -> FastAPI:
|
|
| 192 |
)
|
| 193 |
logger.debug("CORS middleware configured")
|
| 194 |
|
|
|
|
| 195 |
if SLOWAPI_AVAILABLE:
|
| 196 |
app.state.limiter = limiter
|
| 197 |
app.add_exception_handler(
|
| 198 |
-
RateLimitExceeded,
|
| 199 |
-
|
| 200 |
app.add_middleware(SlowAPIMiddleware)
|
| 201 |
logger.debug("Rate limiter middleware configured")
|
| 202 |
else:
|
| 203 |
logger.debug("Rate limiter disabled (slowapi not installed)")
|
| 204 |
|
|
|
|
| 205 |
if PROMETHEUS_AVAILABLE:
|
| 206 |
Instrumentator().instrument(app).expose(app)
|
| 207 |
logger.debug("Prometheus instrumentator configured")
|
| 208 |
else:
|
| 209 |
-
logger.debug(
|
| 210 |
-
"Prometheus instrumentator disabled (module not installed)")
|
| 211 |
|
| 212 |
-
#
|
| 213 |
app.include_router(
|
| 214 |
-
routes_incidents.router,
|
| 215 |
-
|
| 216 |
-
tags=["incidents"])
|
| 217 |
app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"])
|
| 218 |
app.include_router(
|
| 219 |
-
routes_intents.router,
|
| 220 |
-
|
| 221 |
-
tags=["intents"])
|
| 222 |
app.include_router(
|
| 223 |
-
routes_history.router,
|
| 224 |
-
|
| 225 |
-
tags=["history"])
|
| 226 |
app.include_router(
|
| 227 |
-
routes_governance.router,
|
| 228 |
-
|
| 229 |
-
tags=["governance"])
|
| 230 |
app.include_router(
|
| 231 |
-
routes_memory.router,
|
| 232 |
-
|
| 233 |
-
tags=["memory"])
|
| 234 |
app.include_router(
|
| 235 |
-
routes_admin.router,
|
| 236 |
-
|
| 237 |
-
tags=["admin"])
|
| 238 |
app.include_router(
|
| 239 |
-
routes_payments.router,
|
| 240 |
-
|
| 241 |
-
tags=["payments"])
|
| 242 |
app.include_router(
|
| 243 |
-
webhooks.router,
|
| 244 |
-
|
| 245 |
app.include_router(
|
| 246 |
-
routes_users.router,
|
| 247 |
-
|
| 248 |
-
|
|
|
|
|
|
|
| 249 |
logger.debug("All API routers included")
|
| 250 |
|
| 251 |
@app.get("/health", tags=["health"])
|
| 252 |
async def health() -> Dict[str, str]:
|
|
|
|
| 253 |
return {"status": "ok"}
|
| 254 |
|
| 255 |
return app
|
| 256 |
|
| 257 |
|
| 258 |
-
app = create_app()
|
|
|
|
| 1 |
"""
|
| 2 |
+
ARF API Control Plane — Main Application Entry Point
|
| 3 |
+
====================================================
|
| 4 |
+
|
| 5 |
+
The control plane serves as the HTTP layer between the **Agentic Reliability
|
| 6 |
+
Framework (ARF)** core engine and external consumers (front‑end dashboard,
|
| 7 |
+
enterprise clients, and monitoring infrastructure).
|
| 8 |
+
|
| 9 |
+
It is responsible for:
|
| 10 |
+
|
| 11 |
+
* **Lifetime management** of the Bayesian risk engine, policy engine,
|
| 12 |
+
semantic memory (RAG graph), and epistemic models.
|
| 13 |
+
* **Observability** via optional OpenTelemetry tracing and Prometheus metrics
|
| 14 |
+
(the latter exposed automatically by ``prometheus-fastapi-instrumentator``
|
| 15 |
+
on ``/metrics``).
|
| 16 |
+
* **Rate limiting** and **usage tracking** with atomic quota consumption.
|
| 17 |
+
* **CORS** configuration for the public ARF front‑end.
|
| 18 |
+
* **Database‑backed persistence** of the conjugate Bayesian posteriors so
|
| 19 |
+
that online learning survives restarts.
|
| 20 |
+
* **Automated Rust enforcer canary promotion** via Wilson confidence interval
|
| 21 |
+
monitoring of the agreement counters.
|
| 22 |
+
|
| 23 |
+
All heavy components are loaded **lazily and best‑effort** – if a dependency
|
| 24 |
+
is missing the API continues to serve health‑check and status endpoints,
|
| 25 |
+
degrading gracefully rather than crashing.
|
| 26 |
"""
|
| 27 |
import logging
|
| 28 |
import os
|
| 29 |
import sys
|
| 30 |
import json
|
| 31 |
+
import threading
|
| 32 |
+
import time as _time
|
| 33 |
from contextlib import asynccontextmanager
|
| 34 |
from typing import Dict
|
| 35 |
|
| 36 |
from fastapi import FastAPI
|
| 37 |
from fastapi.middleware.cors import CORSMiddleware
|
| 38 |
|
| 39 |
+
# ── Optional: Prometheus metrics ─────────────────────────────
|
| 40 |
try:
|
| 41 |
from prometheus_fastapi_instrumentator import Instrumentator
|
| 42 |
PROMETHEUS_AVAILABLE = True
|
|
|
|
| 44 |
PROMETHEUS_AVAILABLE = False
|
| 45 |
Instrumentator = None
|
| 46 |
|
| 47 |
+
# ── Optional: rate‑limiting (slowapi) ────────────────────────
|
| 48 |
try:
|
| 49 |
from slowapi import _rate_limit_exceeded_handler
|
| 50 |
from slowapi.errors import RateLimitExceeded
|
|
|
|
| 56 |
RateLimitExceeded = None
|
| 57 |
SlowAPIMiddleware = None
|
| 58 |
|
| 59 |
+
# ── Core ARF engine (optional but essential for governance) ──
|
| 60 |
try:
|
| 61 |
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 62 |
from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
|
|
|
|
| 71 |
RAGGraphMemory = None
|
| 72 |
MemoryConstants = None
|
| 73 |
|
| 74 |
+
# ── Usage tracker ────────────────────────────────────────────
|
| 75 |
from app.core.usage_tracker import init_tracker, tracker, Tier
|
| 76 |
|
| 77 |
from app.api import (
|
|
|
|
| 85 |
routes_payments,
|
| 86 |
webhooks,
|
| 87 |
routes_users,
|
| 88 |
+
routes_pricing,
|
| 89 |
)
|
| 90 |
from app.api.deps import limiter
|
| 91 |
from app.core.config import settings
|
|
|
|
| 100 |
|
| 101 |
@asynccontextmanager
|
| 102 |
async def lifespan(app: FastAPI):
|
| 103 |
+
"""
|
| 104 |
+
Application lifespan manager.
|
| 105 |
+
|
| 106 |
+
All initialisation that requires a running event loop (database
|
| 107 |
+
connections, model loading, etc.) happens **before** the ``yield``.
|
| 108 |
+
Cleanup (if any) happens after the ``yield``.
|
| 109 |
+
|
| 110 |
+
Initialisation order:
|
| 111 |
+
1. Risk engine (Bayesian scoring + HMC).
|
| 112 |
+
2. Load persisted conjugate posterior state (``beta_state`` table).
|
| 113 |
+
3. OpenTelemetry tracing (console exporter by default).
|
| 114 |
+
4. Policy engine, RAG memory, and epistemic model.
|
| 115 |
+
5. Usage tracker (SQLite / Redis).
|
| 116 |
+
6. Wilson confidence monitor for Rust enforcer canary promotion.
|
| 117 |
+
"""
|
| 118 |
logger.info("🚀 Starting ARF API Control Plane")
|
| 119 |
logger.debug(f"Python path: {sys.path}")
|
| 120 |
|
| 121 |
+
# ── 1. Risk engine ────────────────────────────────────────
|
| 122 |
if ARF_AVAILABLE:
|
| 123 |
hmc_model_path = os.getenv("ARF_HMC_MODEL", "models/hmc_model.json")
|
| 124 |
use_hyperpriors = os.getenv(
|
| 125 |
+
"ARF_USE_HYPERPRIORS", "false"
|
| 126 |
+
).lower() == "true"
|
| 127 |
logger.info(
|
| 128 |
"Initializing RiskEngine – HMC model: %s, hyperpriors: %s",
|
| 129 |
hmc_model_path,
|
| 130 |
+
use_hyperpriors,
|
| 131 |
+
)
|
| 132 |
try:
|
| 133 |
app.state.risk_engine = RiskEngine(
|
| 134 |
hmc_model_path=hmc_model_path,
|
|
|
|
| 141 |
logger.exception("💥 Fatal error initializing RiskEngine")
|
| 142 |
raise RuntimeError("RiskEngine initialization failed") from e
|
| 143 |
|
| 144 |
+
# ── 2. Persisted Bayesian state ────────────────────���──
|
| 145 |
+
try:
|
| 146 |
+
from app.database.session import SessionLocal
|
| 147 |
+
from app.database.models_intents import BetaStateDB
|
| 148 |
+
from agentic_reliability_framework.core.governance.risk_engine import ActionCategory
|
| 149 |
+
|
| 150 |
+
db = SessionLocal()
|
| 151 |
+
try:
|
| 152 |
+
rows = db.query(BetaStateDB).all()
|
| 153 |
+
if rows:
|
| 154 |
+
state = {
|
| 155 |
+
ActionCategory(row.category): (row.alpha, row.beta)
|
| 156 |
+
for row in rows
|
| 157 |
+
}
|
| 158 |
+
app.state.risk_engine.beta_store.load_state(state)
|
| 159 |
+
logger.info(
|
| 160 |
+
"Loaded Bayesian posterior state from database (%d categories).",
|
| 161 |
+
len(state),
|
| 162 |
+
)
|
| 163 |
+
else:
|
| 164 |
+
logger.info(
|
| 165 |
+
"No persisted Bayesian state found; using default priors."
|
| 166 |
+
)
|
| 167 |
+
finally:
|
| 168 |
+
db.close()
|
| 169 |
+
except Exception as e:
|
| 170 |
+
logger.warning(
|
| 171 |
+
"Could not load Bayesian state from database: %s", e
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
# ── 3. Tracing (OpenTelemetry) ─────────────────────────
|
| 175 |
+
try:
|
| 176 |
+
from opentelemetry import trace
|
| 177 |
+
from opentelemetry.sdk.resources import SERVICE_NAME, Resource
|
| 178 |
+
from opentelemetry.sdk.trace import TracerProvider
|
| 179 |
+
from opentelemetry.sdk.trace.export import SimpleSpanProcessor, ConsoleSpanExporter
|
| 180 |
+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
| 181 |
+
|
| 182 |
+
resource = Resource.create({SERVICE_NAME: "arf-api"})
|
| 183 |
+
provider = TracerProvider(resource=resource)
|
| 184 |
+
provider.add_span_processor(SimpleSpanProcessor(ConsoleSpanExporter()))
|
| 185 |
+
trace.set_tracer_provider(provider)
|
| 186 |
+
|
| 187 |
+
FastAPIInstrumentor.instrument_app(app)
|
| 188 |
+
logger.info("✅ Tracing initialized (console exporter).")
|
| 189 |
+
except Exception as e:
|
| 190 |
+
logger.warning("Tracing initialization skipped: %s", e)
|
| 191 |
+
|
| 192 |
+
# ── 4. Policy engine, RAG, epistemic model ─────────────
|
| 193 |
try:
|
| 194 |
app.state.policy_engine = PolicyEngine()
|
| 195 |
logger.info("✅ PolicyEngine initialized successfully.")
|
|
|
|
| 211 |
from sentence_transformers import SentenceTransformer
|
| 212 |
logger.info(f"Loading epistemic model: {epistemic_model_name}")
|
| 213 |
app.state.epistemic_model = SentenceTransformer(
|
| 214 |
+
epistemic_model_name
|
| 215 |
+
)
|
| 216 |
app.state.epistemic_tokenizer = app.state.epistemic_model.tokenizer
|
| 217 |
logger.info("✅ Epistemic model loaded.")
|
| 218 |
except ImportError:
|
| 219 |
logger.warning(
|
| 220 |
+
"sentence-transformers not installed; epistemic signals will be zeros."
|
| 221 |
+
)
|
| 222 |
app.state.epistemic_model = None
|
| 223 |
app.state.epistemic_tokenizer = None
|
| 224 |
except Exception as e:
|
|
|
|
| 227 |
app.state.epistemic_tokenizer = None
|
| 228 |
else:
|
| 229 |
logger.info(
|
| 230 |
+
"EPISTEMIC_MODEL not set; epistemic signals will be zeros."
|
| 231 |
+
)
|
| 232 |
app.state.epistemic_model = None
|
| 233 |
app.state.epistemic_tokenizer = None
|
| 234 |
else:
|
| 235 |
logger.warning(
|
| 236 |
+
"agentic_reliability_framework not installed; risk engine, policy engine, RAG disabled."
|
| 237 |
+
)
|
| 238 |
|
| 239 |
+
# ── 5. Usage tracker ──────────────────────────────────────
|
| 240 |
+
usage_tracking_disabled = (
|
| 241 |
+
os.getenv("ARF_USAGE_TRACKING", "true").lower() == "false"
|
| 242 |
+
)
|
| 243 |
+
if not usage_tracking_disabled:
|
| 244 |
logger.info("Initialising usage tracker...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
try:
|
| 246 |
+
init_tracker(
|
| 247 |
+
db_path=os.getenv("ARF_USAGE_DB_PATH", "arf_usage.db"),
|
| 248 |
+
redis_url=os.getenv("ARF_REDIS_URL"),
|
| 249 |
+
)
|
| 250 |
+
# Seed initial API keys from environment variable (for testing / demo)
|
| 251 |
+
api_keys_json = os.getenv("ARF_API_KEYS", "{}")
|
| 252 |
+
try:
|
| 253 |
+
api_keys = json.loads(api_keys_json)
|
| 254 |
+
for key, tier_str in api_keys.items():
|
| 255 |
+
try:
|
| 256 |
+
tier = Tier(tier_str.lower())
|
| 257 |
+
tracker.get_or_create_api_key(key, tier)
|
| 258 |
+
logger.info(f"Seeded API key for tier {tier.value}")
|
| 259 |
+
except ValueError:
|
| 260 |
+
logger.warning(
|
| 261 |
+
f"Invalid tier '{tier_str}' for key {key}, skipping"
|
| 262 |
+
)
|
| 263 |
+
except json.JSONDecodeError:
|
| 264 |
+
logger.warning(
|
| 265 |
+
"ARF_API_KEYS environment variable is not valid JSON; skipping seeding."
|
| 266 |
+
)
|
| 267 |
+
app.state.usage_tracker = tracker
|
| 268 |
+
logger.info("✅ Usage tracker ready.")
|
| 269 |
+
except Exception as e:
|
| 270 |
+
logger.critical(f"Failed to initialise usage tracker: {e}")
|
| 271 |
+
raise RuntimeError("Usage tracker initialisation failed") from e
|
| 272 |
else:
|
| 273 |
+
logger.info("Usage tracking disabled by ARF_USAGE_TRACKING=false.")
|
| 274 |
app.state.usage_tracker = None
|
| 275 |
|
| 276 |
+
# ── 6. Wilson confidence monitor ──────────────────────────
|
| 277 |
+
try:
|
| 278 |
+
from app.services.wilson_monitor import update as wilson_update
|
| 279 |
+
from prometheus_client import REGISTRY
|
| 280 |
+
|
| 281 |
+
def _wilson_updater():
|
| 282 |
+
while True:
|
| 283 |
+
try:
|
| 284 |
+
agreed = REGISTRY.get_sample_value(
|
| 285 |
+
'arf_rust_agreement_total', {'result': 'agreed'}
|
| 286 |
+
) or 0.0
|
| 287 |
+
diverged = REGISTRY.get_sample_value(
|
| 288 |
+
'arf_rust_agreement_total', {'result': 'diverged'}
|
| 289 |
+
) or 0.0
|
| 290 |
+
wilson_update(int(agreed), int(diverged))
|
| 291 |
+
except Exception as e:
|
| 292 |
+
logger.debug("Wilson updater error: %s", e)
|
| 293 |
+
_time.sleep(300) # every 5 minutes
|
| 294 |
+
|
| 295 |
+
threading.Thread(target=_wilson_updater, daemon=True).start()
|
| 296 |
+
logger.info("✅ Wilson monitor background updater started.")
|
| 297 |
+
except Exception as e:
|
| 298 |
+
logger.warning("Wilson monitor initialization skipped: %s", e)
|
| 299 |
+
|
| 300 |
yield
|
| 301 |
logger.info("🛑 Shutting down ARF API")
|
| 302 |
|
| 303 |
|
| 304 |
def create_app() -> FastAPI:
|
| 305 |
+
"""
|
| 306 |
+
Build and configure the FastAPI application.
|
| 307 |
+
|
| 308 |
+
Middleware order:
|
| 309 |
+
1. CORS (restricted to the public front‑end origin).
|
| 310 |
+
2. Rate limiting (if slowapi is installed).
|
| 311 |
+
3. Prometheus metrics exposition (if available).
|
| 312 |
+
|
| 313 |
+
All API routers are included under the ``/api/v1`` prefix except
|
| 314 |
+
memory (``/v1/memory``) and webhooks (root level).
|
| 315 |
+
|
| 316 |
+
A simple ``/health`` endpoint is provided for liveness probes.
|
| 317 |
+
"""
|
| 318 |
app = FastAPI(
|
| 319 |
title=settings.app_name,
|
| 320 |
version="0.5.0",
|
|
|
|
| 324 |
description="Agentic Reliability Framework (ARF) API",
|
| 325 |
)
|
| 326 |
|
| 327 |
+
# ── CORS ──────────────────────────────────────────────────
|
| 328 |
allowed_origins = ["https://arf-frontend-sandy.vercel.app"]
|
| 329 |
app.add_middleware(
|
| 330 |
CORSMiddleware,
|
|
|
|
| 335 |
)
|
| 336 |
logger.debug("CORS middleware configured")
|
| 337 |
|
| 338 |
+
# ── Rate limiter ──────────────────────────────────────────
|
| 339 |
if SLOWAPI_AVAILABLE:
|
| 340 |
app.state.limiter = limiter
|
| 341 |
app.add_exception_handler(
|
| 342 |
+
RateLimitExceeded, _rate_limit_exceeded_handler
|
| 343 |
+
)
|
| 344 |
app.add_middleware(SlowAPIMiddleware)
|
| 345 |
logger.debug("Rate limiter middleware configured")
|
| 346 |
else:
|
| 347 |
logger.debug("Rate limiter disabled (slowapi not installed)")
|
| 348 |
|
| 349 |
+
# ── Prometheus ────────────────────────────────────────────
|
| 350 |
if PROMETHEUS_AVAILABLE:
|
| 351 |
Instrumentator().instrument(app).expose(app)
|
| 352 |
logger.debug("Prometheus instrumentator configured")
|
| 353 |
else:
|
| 354 |
+
logger.debug("Prometheus instrumentator disabled (module not installed)")
|
|
|
|
| 355 |
|
| 356 |
+
# ── API Routers ───────────────────────────────────────────
|
| 357 |
app.include_router(
|
| 358 |
+
routes_incidents.router, prefix="/api/v1", tags=["incidents"]
|
| 359 |
+
)
|
|
|
|
| 360 |
app.include_router(routes_risk.router, prefix="/api/v1", tags=["risk"])
|
| 361 |
app.include_router(
|
| 362 |
+
routes_intents.router, prefix="/api/v1", tags=["intents"]
|
| 363 |
+
)
|
|
|
|
| 364 |
app.include_router(
|
| 365 |
+
routes_history.router, prefix="/api/v1", tags=["history"]
|
| 366 |
+
)
|
|
|
|
| 367 |
app.include_router(
|
| 368 |
+
routes_governance.router, prefix="/api/v1", tags=["governance"]
|
| 369 |
+
)
|
|
|
|
| 370 |
app.include_router(
|
| 371 |
+
routes_memory.router, prefix="/v1/memory", tags=["memory"]
|
| 372 |
+
)
|
|
|
|
| 373 |
app.include_router(
|
| 374 |
+
routes_admin.router, prefix="/api/v1", tags=["admin"]
|
| 375 |
+
)
|
|
|
|
| 376 |
app.include_router(
|
| 377 |
+
routes_payments.router, prefix="/api/v1", tags=["payments"]
|
| 378 |
+
)
|
|
|
|
| 379 |
app.include_router(
|
| 380 |
+
webhooks.router, tags=["webhooks"]
|
| 381 |
+
)
|
| 382 |
app.include_router(
|
| 383 |
+
routes_users.router, prefix="/api/v1", tags=["users"]
|
| 384 |
+
)
|
| 385 |
+
app.include_router(
|
| 386 |
+
routes_pricing.router, prefix="/api/v1", tags=["pricing"]
|
| 387 |
+
)
|
| 388 |
logger.debug("All API routers included")
|
| 389 |
|
| 390 |
@app.get("/health", tags=["health"])
|
| 391 |
async def health() -> Dict[str, str]:
|
| 392 |
+
"""Liveness probe – returns 200 when the application is running."""
|
| 393 |
return {"status": "ok"}
|
| 394 |
|
| 395 |
return app
|
| 396 |
|
| 397 |
|
| 398 |
+
app = create_app()
|
app/models/__init__.py
CHANGED
|
@@ -26,4 +26,4 @@ __all__ = [
|
|
| 26 |
"PermissionLevel",
|
| 27 |
"Environment",
|
| 28 |
"ChangeScope",
|
| 29 |
-
]
|
|
|
|
| 26 |
"PermissionLevel",
|
| 27 |
"Environment",
|
| 28 |
"ChangeScope",
|
| 29 |
+
]
|
app/models/incident_models.py
CHANGED
|
@@ -4,10 +4,11 @@ from pydantic import BaseModel, Field
|
|
| 4 |
|
| 5 |
class IncidentReport(BaseModel):
|
| 6 |
service: str = Field(..., description="Service name")
|
| 7 |
-
signal_type: Literal["latency", "error_rate", "cpu",
|
|
|
|
| 8 |
value: float = Field(..., description="Measured value")
|
| 9 |
|
| 10 |
|
| 11 |
class IncidentResponse(BaseModel):
|
| 12 |
service: str
|
| 13 |
-
reliability: float
|
|
|
|
| 4 |
|
| 5 |
class IncidentReport(BaseModel):
|
| 6 |
service: str = Field(..., description="Service name")
|
| 7 |
+
signal_type: Literal["latency", "error_rate", "cpu",
|
| 8 |
+
"memory"] = Field(..., description="Type of signal")
|
| 9 |
value: float = Field(..., description="Measured value")
|
| 10 |
|
| 11 |
|
| 12 |
class IncidentResponse(BaseModel):
|
| 13 |
service: str
|
| 14 |
+
reliability: float
|
app/models/infrastructure_intents.py
CHANGED
|
@@ -1,45 +1,12 @@
|
|
| 1 |
from pydantic import BaseModel, Field, field_validator
|
| 2 |
from typing import Optional, Literal, List, Any, Dict
|
| 3 |
-
from enum import Enum
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
STORAGE_ACCOUNT = "storage_account"
|
| 12 |
-
VM = "vm"
|
| 13 |
-
VIRTUAL_NETWORK = "virtual_network"
|
| 14 |
-
# enterprise-only types omitted for public sandbox
|
| 15 |
-
|
| 16 |
-
class PermissionLevel(str, Enum):
|
| 17 |
-
READ = "read"
|
| 18 |
-
WRITE = "write"
|
| 19 |
-
ADMIN = "admin"
|
| 20 |
-
|
| 21 |
-
class Environment(str, Enum):
|
| 22 |
-
DEV = "dev"
|
| 23 |
-
STAGING = "staging"
|
| 24 |
-
PROD = "prod"
|
| 25 |
-
|
| 26 |
-
class ChangeScope(str, Enum):
|
| 27 |
-
MINOR = "minor"
|
| 28 |
-
MAJOR = "major"
|
| 29 |
-
CRITICAL = "critical"
|
| 30 |
-
# ---------------------------------------------------------------------------
|
| 31 |
-
|
| 32 |
-
# Optional import from protected core engine – not available in public Spaces
|
| 33 |
-
try:
|
| 34 |
-
from agentic_reliability_framework.core.governance.intents import (
|
| 35 |
-
ResourceType,
|
| 36 |
-
PermissionLevel,
|
| 37 |
-
Environment,
|
| 38 |
-
ChangeScope,
|
| 39 |
-
)
|
| 40 |
-
except ImportError:
|
| 41 |
-
# The fallback enums defined above are used.
|
| 42 |
-
pass
|
| 43 |
|
| 44 |
|
| 45 |
class BaseIntentRequest(BaseModel):
|
|
@@ -91,4 +58,4 @@ class DeployConfigurationRequest(BaseIntentRequest):
|
|
| 91 |
return v
|
| 92 |
|
| 93 |
|
| 94 |
-
InfrastructureIntentRequest = ProvisionResourceRequest | GrantAccessRequest | DeployConfigurationRequest
|
|
|
|
| 1 |
from pydantic import BaseModel, Field, field_validator
|
| 2 |
from typing import Optional, Literal, List, Any, Dict
|
|
|
|
| 3 |
|
| 4 |
+
from agentic_reliability_framework.core.governance.intents import (
|
| 5 |
+
ResourceType,
|
| 6 |
+
PermissionLevel,
|
| 7 |
+
Environment,
|
| 8 |
+
ChangeScope,
|
| 9 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class BaseIntentRequest(BaseModel):
|
|
|
|
| 58 |
return v
|
| 59 |
|
| 60 |
|
| 61 |
+
InfrastructureIntentRequest = ProvisionResourceRequest | GrantAccessRequest | DeployConfigurationRequest
|
app/models/intent_models.py
CHANGED
|
@@ -11,4 +11,4 @@ class IntentSimulation(BaseModel):
|
|
| 11 |
|
| 12 |
class IntentSimulationResponse(BaseModel):
|
| 13 |
risk_score: float
|
| 14 |
-
recommendation: Literal["safe_to_execute", "requires_approval", "blocked"]
|
|
|
|
| 11 |
|
| 12 |
class IntentSimulationResponse(BaseModel):
|
| 13 |
risk_score: float
|
| 14 |
+
recommendation: Literal["safe_to_execute", "requires_approval", "blocked"]
|
app/models/risk_models.py
CHANGED
|
@@ -4,4 +4,4 @@ from pydantic import BaseModel
|
|
| 4 |
|
| 5 |
class RiskResponse(BaseModel):
|
| 6 |
system_risk: float
|
| 7 |
-
status: Literal["low", "moderate", "high", "critical"]
|
|
|
|
| 4 |
|
| 5 |
class RiskResponse(BaseModel):
|
| 6 |
system_risk: float
|
| 7 |
+
status: Literal["low", "moderate", "high", "critical"]
|
app/services/incident_service.py
CHANGED
|
@@ -3,5 +3,6 @@ from app.models.incident_models import IncidentReport
|
|
| 3 |
|
| 4 |
|
| 5 |
def process_incident(report: IncidentReport) -> float:
|
| 6 |
-
reliability = signal_to_reliability(
|
|
|
|
| 7 |
return reliability
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
def process_incident(report: IncidentReport) -> float:
|
| 6 |
+
reliability = signal_to_reliability(
|
| 7 |
+
report.value, signal_type=report.signal_type)
|
| 8 |
return reliability
|
app/services/intent_adapter.py
CHANGED
|
@@ -1,66 +1,163 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def to_oss_intent(api_request):
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
provenance=api_request.provenance,
|
| 45 |
-
)
|
| 46 |
-
elif api_request.intent_type == "grant_access":
|
| 47 |
-
return GrantAccessIntent(
|
| 48 |
-
principal=api_request.principal,
|
| 49 |
-
permission_level=api_request.permission_level.value if hasattr(api_request.permission_level, 'value') else str(api_request.permission_level),
|
| 50 |
-
resource_scope=api_request.resource_scope,
|
| 51 |
-
justification=api_request.justification,
|
| 52 |
-
requester=api_request.requester,
|
| 53 |
-
provenance=api_request.provenance,
|
| 54 |
-
)
|
| 55 |
-
elif api_request.intent_type == "deploy_config":
|
| 56 |
-
return DeployConfigurationIntent(
|
| 57 |
-
service_name=api_request.service_name,
|
| 58 |
-
change_scope=api_request.change_scope.value if hasattr(api_request.change_scope, 'value') else str(api_request.change_scope),
|
| 59 |
-
deployment_target=api_request.deployment_target.value if hasattr(api_request.deployment_target, 'value') else str(api_request.deployment_target),
|
| 60 |
-
risk_level_hint=api_request.risk_level_hint,
|
| 61 |
-
configuration=api_request.configuration,
|
| 62 |
-
requester=api_request.requester,
|
| 63 |
-
provenance=api_request.provenance,
|
| 64 |
-
)
|
| 65 |
else:
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Intent Adapter – converts API request payloads to ARF InfrastructureIntent objects.
|
| 3 |
+
Strict validation, no dummy fallbacks. All conversions are deterministic.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
from typing import Any, Dict
|
| 8 |
+
|
| 9 |
+
from agentic_reliability_framework.core.governance.intents import (
|
| 10 |
+
ProvisionResourceIntent,
|
| 11 |
+
GrantAccessIntent,
|
| 12 |
+
DeployConfigurationIntent,
|
| 13 |
+
InfrastructureIntent,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class IntentAdapterError(Exception):
|
| 20 |
+
"""Raised when intent conversion fails due to invalid input."""
|
| 21 |
+
pass
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Allowed values (from the framework's Literal definitions)
|
| 25 |
+
VALID_ENVIRONMENTS = {"dev", "staging", "prod", "test"}
|
| 26 |
+
VALID_RESOURCE_TYPES = {
|
| 27 |
+
"vm",
|
| 28 |
+
"storage_account",
|
| 29 |
+
"database",
|
| 30 |
+
"kubernetes_cluster",
|
| 31 |
+
"function_app",
|
| 32 |
+
"virtual_network"}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def to_oss_intent(api_request: Any) -> InfrastructureIntent:
|
| 36 |
+
"""
|
| 37 |
+
Convert an API request object to the corresponding OSS InfrastructureIntent.
|
| 38 |
+
"""
|
| 39 |
+
# Extract data
|
| 40 |
+
if hasattr(api_request, "model_dump"):
|
| 41 |
+
data = api_request.model_dump()
|
| 42 |
+
elif hasattr(api_request, "dict"):
|
| 43 |
+
data = api_request.dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
else:
|
| 45 |
+
data = dict(api_request)
|
| 46 |
+
|
| 47 |
+
intent_type = data.get("intent_type")
|
| 48 |
+
if not intent_type:
|
| 49 |
+
raise IntentAdapterError("Missing 'intent_type' in request")
|
| 50 |
+
|
| 51 |
+
environment = data.get("environment")
|
| 52 |
+
if not environment:
|
| 53 |
+
raise IntentAdapterError("Missing 'environment' field")
|
| 54 |
+
if environment not in VALID_ENVIRONMENTS:
|
| 55 |
+
raise IntentAdapterError(
|
| 56 |
+
f"Invalid environment: {environment}. Must be one of {VALID_ENVIRONMENTS}")
|
| 57 |
+
|
| 58 |
+
requester = data.get("requester")
|
| 59 |
+
if not requester:
|
| 60 |
+
raise IntentAdapterError("Missing 'requester' field")
|
| 61 |
+
|
| 62 |
+
if intent_type == "provision_resource":
|
| 63 |
+
return _to_provision_intent(data, environment, requester)
|
| 64 |
+
elif intent_type == "grant_access":
|
| 65 |
+
return _to_grant_intent(data, requester) # environment NOT passed
|
| 66 |
+
elif intent_type == "deploy_config":
|
| 67 |
+
return _to_deploy_intent(data, requester) # environment NOT passed
|
| 68 |
+
else:
|
| 69 |
+
raise IntentAdapterError(f"Unknown intent_type: {intent_type}")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _to_provision_intent(data: Dict[str,
|
| 73 |
+
Any],
|
| 74 |
+
environment: str,
|
| 75 |
+
requester: str) -> ProvisionResourceIntent:
|
| 76 |
+
resource_type_str = data.get("resource_type")
|
| 77 |
+
if not resource_type_str:
|
| 78 |
+
raise IntentAdapterError(
|
| 79 |
+
"Missing 'resource_type' for provision_resource intent")
|
| 80 |
+
if resource_type_str not in VALID_RESOURCE_TYPES:
|
| 81 |
+
raise IntentAdapterError(f"Invalid resource_type: {resource_type_str}")
|
| 82 |
+
|
| 83 |
+
region = data.get("region")
|
| 84 |
+
if not region:
|
| 85 |
+
raise IntentAdapterError(
|
| 86 |
+
"Missing 'region' for provision_resource intent")
|
| 87 |
+
|
| 88 |
+
size = data.get("size")
|
| 89 |
+
if not size:
|
| 90 |
+
raise IntentAdapterError(
|
| 91 |
+
"Missing 'size' for provision_resource intent")
|
| 92 |
+
|
| 93 |
+
return ProvisionResourceIntent(
|
| 94 |
+
resource_type=resource_type_str,
|
| 95 |
+
region=region,
|
| 96 |
+
size=size,
|
| 97 |
+
environment=environment,
|
| 98 |
+
requester=requester,
|
| 99 |
+
configuration=data.get("configuration", {}),
|
| 100 |
+
provenance=data.get("provenance", {}),
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _to_grant_intent(data: Dict[str, Any],
|
| 105 |
+
requester: str) -> GrantAccessIntent:
|
| 106 |
+
principal = data.get("principal")
|
| 107 |
+
if not principal:
|
| 108 |
+
raise IntentAdapterError("Missing 'principal' for grant_access intent")
|
| 109 |
+
|
| 110 |
+
permission_level = data.get("permission_level")
|
| 111 |
+
if not permission_level:
|
| 112 |
+
raise IntentAdapterError(
|
| 113 |
+
"Missing 'permission_level' for grant_access intent")
|
| 114 |
+
|
| 115 |
+
resource_scope = data.get("resource_scope")
|
| 116 |
+
if not resource_scope:
|
| 117 |
+
raise IntentAdapterError(
|
| 118 |
+
"Missing 'resource_scope' for grant_access intent")
|
| 119 |
+
|
| 120 |
+
return GrantAccessIntent(
|
| 121 |
+
principal=principal,
|
| 122 |
+
permission_level=permission_level,
|
| 123 |
+
resource_scope=resource_scope,
|
| 124 |
+
requester=requester,
|
| 125 |
+
justification=data.get("justification", ""),
|
| 126 |
+
provenance=data.get("provenance", {}),
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def _to_deploy_intent(data: Dict[str, Any],
|
| 131 |
+
requester: str) -> DeployConfigurationIntent:
|
| 132 |
+
service_name = data.get("service_name")
|
| 133 |
+
if not service_name:
|
| 134 |
+
raise IntentAdapterError(
|
| 135 |
+
"Missing 'service_name' for deploy_config intent")
|
| 136 |
+
|
| 137 |
+
change_scope = data.get("change_scope")
|
| 138 |
+
if not change_scope:
|
| 139 |
+
raise IntentAdapterError(
|
| 140 |
+
"Missing 'change_scope' for deploy_config intent")
|
| 141 |
+
|
| 142 |
+
deployment_target = data.get("deployment_target")
|
| 143 |
+
if not deployment_target:
|
| 144 |
+
raise IntentAdapterError(
|
| 145 |
+
"Missing 'deployment_target' for deploy_config intent")
|
| 146 |
+
|
| 147 |
+
# risk_level_hint expects a float; if not a number, set to None
|
| 148 |
+
risk_hint = data.get("risk_level_hint")
|
| 149 |
+
if risk_hint is not None:
|
| 150 |
+
try:
|
| 151 |
+
risk_hint = float(risk_hint)
|
| 152 |
+
except (TypeError, ValueError):
|
| 153 |
+
risk_hint = None
|
| 154 |
+
|
| 155 |
+
return DeployConfigurationIntent(
|
| 156 |
+
service_name=service_name,
|
| 157 |
+
change_scope=change_scope,
|
| 158 |
+
deployment_target=deployment_target,
|
| 159 |
+
requester=requester,
|
| 160 |
+
risk_level_hint=risk_hint,
|
| 161 |
+
configuration=data.get("configuration", {}),
|
| 162 |
+
provenance=data.get("provenance", {}),
|
| 163 |
+
)
|
app/services/intent_service.py
CHANGED
|
@@ -7,7 +7,8 @@ logger = logging.getLogger(__name__)
|
|
| 7 |
|
| 8 |
# Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
|
| 9 |
def simulate_intent(intent: IntentSimulation) -> dict:
|
| 10 |
-
logger.warning(
|
|
|
|
| 11 |
# For backward compatibility, we still use random risk.
|
| 12 |
risk_score = random.uniform(0, 1)
|
| 13 |
if risk_score < 0.2:
|
|
|
|
| 7 |
|
| 8 |
# Note: This endpoint is deprecated. Use /v1/intents/evaluate instead.
|
| 9 |
def simulate_intent(intent: IntentSimulation) -> dict:
|
| 10 |
+
logger.warning(
|
| 11 |
+
"Deprecated endpoint /simulate_intent used. Please migrate to /v1/intents/evaluate.")
|
| 12 |
# For backward compatibility, we still use random risk.
|
| 13 |
risk_score = random.uniform(0, 1)
|
| 14 |
if risk_score < 0.2:
|
app/services/intent_store.py
CHANGED
|
@@ -13,7 +13,8 @@ def save_evaluated_intent(
|
|
| 13 |
environment: str,
|
| 14 |
risk_score: float
|
| 15 |
) -> IntentDB:
|
| 16 |
-
existing = db.query(IntentDB).filter(
|
|
|
|
| 17 |
if existing:
|
| 18 |
existing.evaluated_at = datetime.datetime.utcnow()
|
| 19 |
existing.risk_score = str(risk_score)
|
|
@@ -38,5 +39,8 @@ def save_evaluated_intent(
|
|
| 38 |
return intent
|
| 39 |
|
| 40 |
|
| 41 |
-
def get_intent_by_deterministic_id(
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
environment: str,
|
| 14 |
risk_score: float
|
| 15 |
) -> IntentDB:
|
| 16 |
+
existing = db.query(IntentDB).filter(
|
| 17 |
+
IntentDB.deterministic_id == deterministic_id).one_or_none()
|
| 18 |
if existing:
|
| 19 |
existing.evaluated_at = datetime.datetime.utcnow()
|
| 20 |
existing.risk_score = str(risk_score)
|
|
|
|
| 39 |
return intent
|
| 40 |
|
| 41 |
|
| 42 |
+
def get_intent_by_deterministic_id(
|
| 43 |
+
db: Session,
|
| 44 |
+
deterministic_id: str) -> Optional[IntentDB]:
|
| 45 |
+
return db.query(IntentDB).filter(
|
| 46 |
+
IntentDB.deterministic_id == deterministic_id).one_or_none()
|
app/services/outcome_service.py
CHANGED
|
@@ -1,42 +1,53 @@
|
|
|
|
|
|
|
|
| 1 |
import datetime
|
| 2 |
import logging
|
| 3 |
from typing import Optional, Dict, Any
|
| 4 |
|
| 5 |
from sqlalchemy.orm import Session
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
from app.database.models_intents import IntentDB, OutcomeDB
|
| 8 |
|
| 9 |
# ---------------------------------------------------------------------------
|
| 10 |
-
#
|
| 11 |
-
# ---------------------------------------------------------------------------
|
| 12 |
-
class RiskEngine:
|
| 13 |
-
def update_outcome(self, intent, success):
|
| 14 |
-
pass
|
| 15 |
-
|
| 16 |
-
class ProvisionResourceIntent:
|
| 17 |
-
def __init__(self, **kwargs):
|
| 18 |
-
for k, v in kwargs.items():
|
| 19 |
-
setattr(self, k, v)
|
| 20 |
-
|
| 21 |
-
class GrantAccessIntent:
|
| 22 |
-
def __init__(self, **kwargs):
|
| 23 |
-
for k, v in kwargs.items():
|
| 24 |
-
setattr(self, k, v)
|
| 25 |
-
|
| 26 |
-
class DeployConfigurationIntent:
|
| 27 |
-
def __init__(self, **kwargs):
|
| 28 |
-
for k, v in kwargs.items():
|
| 29 |
-
setattr(self, k, v)
|
| 30 |
# ---------------------------------------------------------------------------
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
class OutcomeConflictError(Exception):
|
|
|
|
| 36 |
pass
|
| 37 |
|
| 38 |
|
| 39 |
-
def reconstruct_oss_intent_from_json(
|
|
|
|
|
|
|
| 40 |
intent_type = oss_json.get("intent_type")
|
| 41 |
if intent_type == "provision_resource":
|
| 42 |
return ProvisionResourceIntent(**oss_json)
|
|
@@ -46,22 +57,7 @@ def reconstruct_oss_intent_from_json(oss_json: Dict[str, Any]):
|
|
| 46 |
return DeployConfigurationIntent(**oss_json)
|
| 47 |
else:
|
| 48 |
raise ValueError(
|
| 49 |
-
f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}"
|
| 50 |
-
)
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def _create_dummy_intent(intent_type: str):
|
| 54 |
-
if intent_type == "ProvisionResourceIntent":
|
| 55 |
-
return ProvisionResourceIntent(
|
| 56 |
-
resource_type="vm",
|
| 57 |
-
region="eastus",
|
| 58 |
-
size="Standard_D2s_v3",
|
| 59 |
-
environment="dev",
|
| 60 |
-
requester="system"
|
| 61 |
-
)
|
| 62 |
-
else:
|
| 63 |
-
logger.warning("Dummy intent creation not implemented for %s", intent_type)
|
| 64 |
-
return None
|
| 65 |
|
| 66 |
|
| 67 |
def record_outcome(
|
|
@@ -70,50 +66,114 @@ def record_outcome(
|
|
| 70 |
success: bool,
|
| 71 |
recorded_by: Optional[str],
|
| 72 |
notes: Optional[str],
|
| 73 |
-
risk_engine: RiskEngine
|
|
|
|
| 74 |
) -> OutcomeDB:
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
if not intent:
|
| 77 |
raise ValueError(f"Intent not found: {deterministic_id}")
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
| 80 |
if existing_outcome:
|
| 81 |
if existing_outcome.success == success:
|
| 82 |
return existing_outcome
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
|
|
|
| 85 |
outcome = OutcomeDB(
|
| 86 |
intent_id=intent.id,
|
| 87 |
success=bool(success),
|
| 88 |
recorded_by=recorded_by,
|
| 89 |
notes=notes,
|
| 90 |
-
recorded_at=datetime.datetime.
|
|
|
|
| 91 |
)
|
| 92 |
db.add(outcome)
|
| 93 |
-
db.commit()
|
| 94 |
-
db.refresh(outcome)
|
| 95 |
|
| 96 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
oss_intent = None
|
| 98 |
if intent.oss_payload:
|
| 99 |
try:
|
| 100 |
oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
|
| 101 |
except Exception as e:
|
| 102 |
-
logger.
|
| 103 |
-
"Failed to reconstruct OSS intent for %s: %s.
|
| 104 |
-
deterministic_id,
|
| 105 |
-
|
| 106 |
-
|
| 107 |
else:
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
if oss_intent is not None:
|
| 111 |
try:
|
| 112 |
risk_engine.update_outcome(oss_intent, success)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
except Exception as e:
|
| 114 |
logger.exception(
|
| 115 |
"Failed to update RiskEngine after recording outcome for intent %s: %s",
|
| 116 |
-
deterministic_id,
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
-
return outcome
|
|
|
|
| 1 |
+
"""Outcome recording with idempotency, no dummy fallbacks, and timezone-aware timestamps."""
|
| 2 |
+
|
| 3 |
import datetime
|
| 4 |
import logging
|
| 5 |
from typing import Optional, Dict, Any
|
| 6 |
|
| 7 |
from sqlalchemy.orm import Session
|
| 8 |
+
from sqlalchemy.exc import IntegrityError
|
| 9 |
+
|
| 10 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 11 |
+
from agentic_reliability_framework.core.governance.intents import (
|
| 12 |
+
InfrastructureIntent,
|
| 13 |
+
ProvisionResourceIntent,
|
| 14 |
+
GrantAccessIntent,
|
| 15 |
+
DeployConfigurationIntent,
|
| 16 |
+
)
|
| 17 |
+
from app.database.models_intents import IntentDB, OutcomeDB, BetaStateDB
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
|
|
|
|
| 21 |
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
+
# NEW: small helper to persist the conjugate posterior state
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
# ---------------------------------------------------------------------------
|
| 25 |
+
def _persist_beta_state(db: Session, risk_engine: RiskEngine) -> None:
|
| 26 |
+
"""
|
| 27 |
+
Write the current Beta posterior parameters to the beta_state table.
|
| 28 |
+
This is called after every outcome update so that online learning
|
| 29 |
+
survives restarts.
|
| 30 |
+
"""
|
| 31 |
+
try:
|
| 32 |
+
state = risk_engine.beta_store.get_state()
|
| 33 |
+
for cat, (alpha, beta) in state.items():
|
| 34 |
+
# Upsert: if the category already exists, update it
|
| 35 |
+
db.merge(BetaStateDB(category=cat.value, alpha=alpha, beta=beta))
|
| 36 |
+
db.commit()
|
| 37 |
+
logger.debug("Persisted Beta posterior parameters to database.")
|
| 38 |
+
except Exception as e:
|
| 39 |
+
db.rollback()
|
| 40 |
+
logger.error("Failed to persist beta state: %s", e)
|
| 41 |
|
| 42 |
|
| 43 |
class OutcomeConflictError(Exception):
|
| 44 |
+
"""Raised when an outcome already exists for the same intent with a different result."""
|
| 45 |
pass
|
| 46 |
|
| 47 |
|
| 48 |
+
def reconstruct_oss_intent_from_json(
|
| 49 |
+
oss_json: Dict[str, Any]) -> InfrastructureIntent:
|
| 50 |
+
"""Reconstruct OSS intent from stored JSON. Raises ValueError on failure."""
|
| 51 |
intent_type = oss_json.get("intent_type")
|
| 52 |
if intent_type == "provision_resource":
|
| 53 |
return ProvisionResourceIntent(**oss_json)
|
|
|
|
| 57 |
return DeployConfigurationIntent(**oss_json)
|
| 58 |
else:
|
| 59 |
raise ValueError(
|
| 60 |
+
f"Cannot reconstruct intent from JSON: missing or unknown intent_type {intent_type}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
def record_outcome(
|
|
|
|
| 66 |
success: bool,
|
| 67 |
recorded_by: Optional[str],
|
| 68 |
notes: Optional[str],
|
| 69 |
+
risk_engine: RiskEngine,
|
| 70 |
+
idempotency_key: Optional[str] = None,
|
| 71 |
) -> OutcomeDB:
|
| 72 |
+
"""
|
| 73 |
+
Record an outcome for a previously evaluated intent.
|
| 74 |
+
|
| 75 |
+
Idempotent: calling twice with the same (deterministic_id, success) returns the same record.
|
| 76 |
+
If the outcome already exists with a different success value, raises OutcomeConflictError.
|
| 77 |
+
|
| 78 |
+
No dummy intents are created. If the OSS intent cannot be reconstructed, the risk engine
|
| 79 |
+
is NOT updated – we log an error and still record the outcome.
|
| 80 |
+
|
| 81 |
+
Args:
|
| 82 |
+
db: SQLAlchemy session.
|
| 83 |
+
deterministic_id: Unique identifier of the original intent.
|
| 84 |
+
success: Whether the action succeeded (True) or failed (False).
|
| 85 |
+
recorded_by: Optional user or system identifier.
|
| 86 |
+
notes: Optional human-readable notes.
|
| 87 |
+
risk_engine: ARF risk engine instance (may be updated).
|
| 88 |
+
idempotency_key: Optional caller-provided idempotency token.
|
| 89 |
+
|
| 90 |
+
Returns:
|
| 91 |
+
The recorded OutcomeDB object.
|
| 92 |
+
|
| 93 |
+
Raises:
|
| 94 |
+
ValueError: If intent not found or reconstruction fails fatally.
|
| 95 |
+
OutcomeConflictError: If a conflicting outcome already exists.
|
| 96 |
+
"""
|
| 97 |
+
# 1. Fetch the original intent record
|
| 98 |
+
intent = db.query(IntentDB).filter(
|
| 99 |
+
IntentDB.deterministic_id == deterministic_id).one_or_none()
|
| 100 |
if not intent:
|
| 101 |
raise ValueError(f"Intent not found: {deterministic_id}")
|
| 102 |
|
| 103 |
+
# 2. Idempotency / conflict check with database-level uniqueness
|
| 104 |
+
existing_outcome = db.query(OutcomeDB).filter(
|
| 105 |
+
OutcomeDB.intent_id == intent.id).one_or_none()
|
| 106 |
if existing_outcome:
|
| 107 |
if existing_outcome.success == success:
|
| 108 |
return existing_outcome
|
| 109 |
+
db.rollback()
|
| 110 |
+
raise OutcomeConflictError(
|
| 111 |
+
f"Outcome already recorded for intent {deterministic_id} with different result "
|
| 112 |
+
f"(existing={existing_outcome.success}, new={success})"
|
| 113 |
+
)
|
| 114 |
|
| 115 |
+
# 3. Create outcome record
|
| 116 |
outcome = OutcomeDB(
|
| 117 |
intent_id=intent.id,
|
| 118 |
success=bool(success),
|
| 119 |
recorded_by=recorded_by,
|
| 120 |
notes=notes,
|
| 121 |
+
recorded_at=datetime.datetime.now(datetime.timezone.utc),
|
| 122 |
+
idempotency_key=idempotency_key,
|
| 123 |
)
|
| 124 |
db.add(outcome)
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
# 4. Attempt to commit; handle duplicate key errors for idempotency
|
| 127 |
+
try:
|
| 128 |
+
db.commit()
|
| 129 |
+
db.refresh(outcome)
|
| 130 |
+
except IntegrityError as e:
|
| 131 |
+
db.rollback()
|
| 132 |
+
if "idempotency_key" in str(e) and idempotency_key:
|
| 133 |
+
existing = db.query(OutcomeDB).filter(
|
| 134 |
+
OutcomeDB.idempotency_key == idempotency_key).first()
|
| 135 |
+
if existing:
|
| 136 |
+
logger.info(
|
| 137 |
+
"Idempotent request for key %s, returning existing outcome",
|
| 138 |
+
idempotency_key)
|
| 139 |
+
return existing
|
| 140 |
+
raise
|
| 141 |
+
|
| 142 |
+
# 5. Update RiskEngine ONLY if we can reconstruct a valid OSS intent
|
| 143 |
oss_intent = None
|
| 144 |
if intent.oss_payload:
|
| 145 |
try:
|
| 146 |
oss_intent = reconstruct_oss_intent_from_json(intent.oss_payload)
|
| 147 |
except Exception as e:
|
| 148 |
+
logger.error(
|
| 149 |
+
"Failed to reconstruct OSS intent for %s: %s. RiskEngine will NOT be updated.",
|
| 150 |
+
deterministic_id,
|
| 151 |
+
e,
|
| 152 |
+
exc_info=True)
|
| 153 |
else:
|
| 154 |
+
logger.warning(
|
| 155 |
+
"No oss_payload stored for intent %s – cannot update RiskEngine.",
|
| 156 |
+
deterministic_id
|
| 157 |
+
)
|
| 158 |
|
| 159 |
if oss_intent is not None:
|
| 160 |
try:
|
| 161 |
risk_engine.update_outcome(oss_intent, success)
|
| 162 |
+
|
| 163 |
+
# ----------------------------------------------------------------
|
| 164 |
+
# PERSISTENCE: after updating the conjugate posterior, write it
|
| 165 |
+
# ----------------------------------------------------------------
|
| 166 |
+
_persist_beta_state(db, risk_engine)
|
| 167 |
+
|
| 168 |
except Exception as e:
|
| 169 |
logger.exception(
|
| 170 |
"Failed to update RiskEngine after recording outcome for intent %s: %s",
|
| 171 |
+
deterministic_id,
|
| 172 |
+
e)
|
| 173 |
+
else:
|
| 174 |
+
logger.info(
|
| 175 |
+
"Skipped RiskEngine update for intent %s (no valid OSS intent)",
|
| 176 |
+
deterministic_id
|
| 177 |
+
)
|
| 178 |
|
| 179 |
+
return outcome
|
app/services/risk_service.py
CHANGED
|
@@ -1,97 +1,376 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import Optional, List, Dict, Any
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
pass
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
def evaluate_intent(
|
| 58 |
engine: RiskEngine,
|
| 59 |
-
intent,
|
| 60 |
cost_estimate: Optional[float],
|
| 61 |
policy_violations: List[str]
|
| 62 |
) -> dict:
|
| 63 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
return {
|
| 65 |
-
"risk_score":
|
| 66 |
-
"explanation":
|
| 67 |
-
"contributions":
|
| 68 |
}
|
| 69 |
|
| 70 |
|
| 71 |
def evaluate_healing_decision(
|
| 72 |
-
event,
|
| 73 |
policy_engine: PolicyEngine,
|
| 74 |
decision_engine: Optional[DecisionEngine] = None,
|
| 75 |
rag_graph: Optional[RAGGraphMemory] = None,
|
| 76 |
model=None,
|
| 77 |
tokenizer=None,
|
| 78 |
) -> Dict[str, Any]:
|
| 79 |
-
"""
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
"entropy": 0.0,
|
| 88 |
"contradiction": 0.0,
|
| 89 |
"evidence_lift": 0.0,
|
| 90 |
"hallucination_risk": 0.0,
|
| 91 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
}
|
| 93 |
|
| 94 |
|
| 95 |
def get_system_risk() -> float:
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Risk service – integrates ARF risk engine, policy engine, and decision engine.
|
| 3 |
+
Deterministic, no random fallbacks, explicit error handling.
|
| 4 |
+
|
| 5 |
+
Version: 2026-05-04 – added Prometheus metrics for observability.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
import time
|
| 12 |
from typing import Optional, List, Dict, Any
|
| 13 |
+
|
| 14 |
+
from agentic_reliability_framework.core.governance.risk_engine import RiskEngine
|
| 15 |
+
from agentic_reliability_framework.core.governance.intents import InfrastructureIntent
|
| 16 |
+
from agentic_reliability_framework.core.models.event import ReliabilityEvent, HealingAction
|
| 17 |
+
from agentic_reliability_framework.core.governance.policy_engine import PolicyEngine
|
| 18 |
+
from agentic_reliability_framework.core.decision.decision_engine import DecisionEngine
|
| 19 |
+
from agentic_reliability_framework.runtime.memory.rag_graph import RAGGraphMemory
|
| 20 |
+
from agentic_reliability_framework.core.research.eclipse_probe import compute_epistemic_risk
|
| 21 |
+
|
| 22 |
+
# ── optional tracing ─────────────────────────────────────────
|
| 23 |
+
try:
|
| 24 |
+
from opentelemetry import trace
|
| 25 |
+
_tracer = trace.get_tracer(__name__)
|
| 26 |
+
OTEL_AVAILABLE = True
|
| 27 |
+
except ImportError:
|
| 28 |
+
OTEL_AVAILABLE = False
|
| 29 |
+
_tracer = None
|
| 30 |
+
|
| 31 |
+
# ── Prometheus metrics (always registered; no‑op if not scraped) ─
|
| 32 |
+
from prometheus_client import Counter, Histogram
|
| 33 |
+
|
| 34 |
+
_EVAL_COUNTER = Counter(
|
| 35 |
+
"arf_evaluations_total",
|
| 36 |
+
"Total evaluation calls (intent + healing), partitioned by engine and status.",
|
| 37 |
+
["engine", "status"],
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
_EVAL_DURATION = Histogram(
|
| 41 |
+
"arf_evaluation_duration_seconds",
|
| 42 |
+
"End‑to‑end latency of evaluation calls.",
|
| 43 |
+
["engine"],
|
| 44 |
+
buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0),
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
_RUST_AGREEMENT = Counter(
|
| 48 |
+
"arf_rust_agreement_total",
|
| 49 |
+
"Agreement between Rust enforcer and Python policy evaluation.",
|
| 50 |
+
["result"], # "agreed" or "diverged"
|
| 51 |
+
)
|
| 52 |
+
|
| 53 |
+
# ── optional Rust enforcer (shadow mode) ──────────────────────
|
| 54 |
+
_RUST_ENFORCER_AVAILABLE = False
|
| 55 |
+
_rust_evaluator = None # singleton per process
|
| 56 |
+
_rust_policy_json: Optional[str] = None
|
| 57 |
+
|
| 58 |
+
if os.getenv("ARF_USE_RUST_ENFORCER", "false").lower() == "true":
|
| 59 |
+
try:
|
| 60 |
+
import arf_enforcer
|
| 61 |
+
_RUST_ENFORCER_AVAILABLE = True
|
| 62 |
+
except ImportError:
|
| 63 |
pass
|
| 64 |
+
|
| 65 |
+
# Default OSS policy tree – mirrors the hard‑coded rules in the Python PolicyEvaluator
|
| 66 |
+
# that check region, resource type, and max permission level.
|
| 67 |
+
_OSS_POLICY_TREE_JSON = json.dumps({
|
| 68 |
+
"And": [
|
| 69 |
+
{"Atomic": {"RegionAllowed": {"allowed_regions": ["eastus"]}}},
|
| 70 |
+
{"Atomic": {"ResourceTypeRestricted": {
|
| 71 |
+
"forbidden_types": ["DATABASE_DROP", "FULL_ROLLOUT", "SYSTEM_SHUTDOWN", "SECRET_ROTATION"]
|
| 72 |
+
}}},
|
| 73 |
+
{"Atomic": {"MaxPermissionLevel": {"max_level": "admin"}}}
|
| 74 |
+
]
|
| 75 |
+
})
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _ensure_rust_evaluator() -> bool:
|
| 79 |
+
"""Lazy initialise the Rust policy evaluator. Returns True on success."""
|
| 80 |
+
global _rust_evaluator, _rust_policy_json
|
| 81 |
+
if _rust_evaluator is not None:
|
| 82 |
+
return True
|
| 83 |
+
if not _RUST_ENFORCER_AVAILABLE:
|
| 84 |
+
return False
|
| 85 |
+
try:
|
| 86 |
+
_rust_policy_json = _OSS_POLICY_TREE_JSON
|
| 87 |
+
_rust_evaluator = arf_enforcer.PyPolicyEvaluator(_rust_policy_json)
|
| 88 |
+
return True
|
| 89 |
+
except Exception:
|
| 90 |
+
_rust_evaluator = None
|
| 91 |
+
return False
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
logger = logging.getLogger(__name__)
|
| 95 |
|
| 96 |
|
| 97 |
def evaluate_intent(
|
| 98 |
engine: RiskEngine,
|
| 99 |
+
intent: InfrastructureIntent,
|
| 100 |
cost_estimate: Optional[float],
|
| 101 |
policy_violations: List[str]
|
| 102 |
) -> dict:
|
| 103 |
+
"""
|
| 104 |
+
Evaluate an infrastructure intent using the Bayesian risk engine.
|
| 105 |
+
|
| 106 |
+
Optionally shadows the policy evaluation with the Rust enforcer when
|
| 107 |
+
the environment variable ARF_USE_RUST_ENFORCER is set to "true".
|
| 108 |
+
Any divergence is logged and counted as a Prometheus metric.
|
| 109 |
+
|
| 110 |
+
Parameters
|
| 111 |
+
----------
|
| 112 |
+
engine : RiskEngine
|
| 113 |
+
Initialised ARF Bayesian risk engine.
|
| 114 |
+
intent : InfrastructureIntent
|
| 115 |
+
The infrastructure request to evaluate.
|
| 116 |
+
cost_estimate : float or None
|
| 117 |
+
Estimated monthly cost (used by cost‑threshold policies).
|
| 118 |
+
policy_violations : list[str]
|
| 119 |
+
Pre‑computed policy violation strings (from the Python evaluator).
|
| 120 |
+
|
| 121 |
+
Returns
|
| 122 |
+
-------
|
| 123 |
+
dict
|
| 124 |
+
Keys: risk_score, explanation, contributions.
|
| 125 |
+
"""
|
| 126 |
+
t0 = time.monotonic()
|
| 127 |
+
span = None
|
| 128 |
+
if OTEL_AVAILABLE and _tracer:
|
| 129 |
+
span = _tracer.start_span("risk_service.evaluate_intent")
|
| 130 |
+
span.set_attribute("intent_type", type(intent).__name__)
|
| 131 |
+
|
| 132 |
+
# ── Shadow Rust enforcer (best‑effort, non‑blocking) ──────
|
| 133 |
+
if _RUST_ENFORCER_AVAILABLE and _ensure_rust_evaluator():
|
| 134 |
+
try:
|
| 135 |
+
rust_intent = {
|
| 136 |
+
"action": getattr(intent, "intent_type", "unknown"),
|
| 137 |
+
"component": getattr(intent, "service_name", "unknown"),
|
| 138 |
+
"region": getattr(intent, "region", None),
|
| 139 |
+
"resource_type": getattr(intent, "resource_type", None),
|
| 140 |
+
"permission_level": getattr(intent, "permission_level", None),
|
| 141 |
+
"extra": {}
|
| 142 |
+
}
|
| 143 |
+
rust_raw = _rust_evaluator.evaluate(
|
| 144 |
+
json.dumps(rust_intent), cost_estimate
|
| 145 |
+
)
|
| 146 |
+
rust_violations = json.loads(rust_raw)
|
| 147 |
+
|
| 148 |
+
agreed = set(rust_violations) == set(policy_violations)
|
| 149 |
+
_RUST_AGREEMENT.labels(result="agreed" if agreed else "diverged").inc()
|
| 150 |
+
if not agreed:
|
| 151 |
+
msg = (
|
| 152 |
+
"Rust enforcer divergence: "
|
| 153 |
+
f"Rust={sorted(rust_violations)} Python={sorted(policy_violations)}"
|
| 154 |
+
)
|
| 155 |
+
logger.warning(msg)
|
| 156 |
+
if span:
|
| 157 |
+
span.add_event("rust_enforcer_divergence", {
|
| 158 |
+
"rust_violations": rust_violations,
|
| 159 |
+
"python_violations": policy_violations
|
| 160 |
+
})
|
| 161 |
+
except Exception as exc:
|
| 162 |
+
logger.debug("Rust enforcer shadow evaluation failed: %s", exc)
|
| 163 |
+
|
| 164 |
+
# ── Core risk evaluation ──────────────────────────────────
|
| 165 |
+
|
| 166 |
+
# ── Automated canary promotion ──────────────────────────
|
| 167 |
+
if _RUST_ENFORCER_AVAILABLE and os.getenv("ARF_RUST_CANARY", "false").lower() == "true":
|
| 168 |
+
try:
|
| 169 |
+
from prometheus_client import REGISTRY
|
| 170 |
+
lower = REGISTRY.get_sample_value("arf_rust_agreement_lower_bound", {})
|
| 171 |
+
if lower is not None and lower > 0.9999:
|
| 172 |
+
policy_violations = rust_violations
|
| 173 |
+
if span:
|
| 174 |
+
span.set_attribute("rust_enforcer_active", True)
|
| 175 |
+
except Exception:
|
| 176 |
+
pass
|
| 177 |
+
try:
|
| 178 |
+
score, explanation, contributions = engine.calculate_risk(
|
| 179 |
+
intent=intent,
|
| 180 |
+
cost_estimate=cost_estimate,
|
| 181 |
+
policy_violations=policy_violations
|
| 182 |
+
)
|
| 183 |
+
engine_label = "python"
|
| 184 |
+
status = "success"
|
| 185 |
+
except Exception:
|
| 186 |
+
_EVAL_COUNTER.labels(engine="python", status="error").inc()
|
| 187 |
+
_EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
|
| 188 |
+
raise
|
| 189 |
+
|
| 190 |
+
_EVAL_COUNTER.labels(engine=engine_label, status=status).inc()
|
| 191 |
+
_EVAL_DURATION.labels(engine=engine_label).observe(time.monotonic() - t0)
|
| 192 |
+
|
| 193 |
+
if span:
|
| 194 |
+
span.set_attribute("risk_score", score)
|
| 195 |
+
if _RUST_ENFORCER_AVAILABLE:
|
| 196 |
+
span.set_attribute("rust_enforcer_available", True)
|
| 197 |
+
span.end()
|
| 198 |
+
|
| 199 |
return {
|
| 200 |
+
"risk_score": score,
|
| 201 |
+
"explanation": explanation,
|
| 202 |
+
"contributions": contributions
|
| 203 |
}
|
| 204 |
|
| 205 |
|
| 206 |
def evaluate_healing_decision(
|
| 207 |
+
event: ReliabilityEvent,
|
| 208 |
policy_engine: PolicyEngine,
|
| 209 |
decision_engine: Optional[DecisionEngine] = None,
|
| 210 |
rag_graph: Optional[RAGGraphMemory] = None,
|
| 211 |
model=None,
|
| 212 |
tokenizer=None,
|
| 213 |
) -> Dict[str, Any]:
|
| 214 |
+
"""
|
| 215 |
+
Evaluate healing actions for a given reliability event using decision‑theoretic selection.
|
| 216 |
+
Includes epistemic risk signals from the eclipse probe.
|
| 217 |
+
|
| 218 |
+
Parameters
|
| 219 |
+
----------
|
| 220 |
+
event : ReliabilityEvent
|
| 221 |
+
The incident event containing latency, error rate, etc.
|
| 222 |
+
policy_engine : PolicyEngine
|
| 223 |
+
The ARF healing policy engine with configured policies.
|
| 224 |
+
decision_engine : DecisionEngine, optional
|
| 225 |
+
If omitted, a default instance is created.
|
| 226 |
+
rag_graph : RAGGraphMemory, optional
|
| 227 |
+
Semantic memory for similar incident retrieval.
|
| 228 |
+
model, tokenizer : optional
|
| 229 |
+
HuggingFace model and tokenizer for epistemic risk computation.
|
| 230 |
+
|
| 231 |
+
Returns
|
| 232 |
+
-------
|
| 233 |
+
dict
|
| 234 |
+
Keys: risk_score, selected_action, expected_utility, alternatives,
|
| 235 |
+
explanation, epistemic_signals.
|
| 236 |
+
"""
|
| 237 |
+
t0 = time.monotonic()
|
| 238 |
+
span = None
|
| 239 |
+
if OTEL_AVAILABLE and _tracer:
|
| 240 |
+
span = _tracer.start_span("risk_service.evaluate_healing")
|
| 241 |
+
span.set_attribute("component", event.component)
|
| 242 |
+
|
| 243 |
+
# If decision_engine not provided, try to get from policy_engine
|
| 244 |
+
if decision_engine is None and hasattr(policy_engine, 'decision_engine'):
|
| 245 |
+
decision_engine = policy_engine.decision_engine
|
| 246 |
+
|
| 247 |
+
# If still None, create a minimal one (global stats only)
|
| 248 |
+
if decision_engine is None:
|
| 249 |
+
logger.debug("No DecisionEngine provided; creating default instance")
|
| 250 |
+
decision_engine = DecisionEngine(rag_graph=rag_graph)
|
| 251 |
+
|
| 252 |
+
# Get raw candidate actions (by temporarily disabling decision engine)
|
| 253 |
+
orig_use = policy_engine.use_decision_engine
|
| 254 |
+
try:
|
| 255 |
+
policy_engine.use_decision_engine = False
|
| 256 |
+
raw_actions = policy_engine.evaluate_policies(event)
|
| 257 |
+
finally:
|
| 258 |
+
policy_engine.use_decision_engine = orig_use
|
| 259 |
+
|
| 260 |
+
# If no actions, return NO_ACTION
|
| 261 |
+
if not raw_actions or raw_actions == [HealingAction.NO_ACTION]:
|
| 262 |
+
if span:
|
| 263 |
+
span.set_attribute("selected_action", HealingAction.NO_ACTION.value)
|
| 264 |
+
span.end()
|
| 265 |
+
_EVAL_COUNTER.labels(engine="python", status="success").inc()
|
| 266 |
+
_EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
|
| 267 |
+
return {
|
| 268 |
+
"risk_score": 0.0,
|
| 269 |
+
"selected_action": HealingAction.NO_ACTION.value,
|
| 270 |
+
"expected_utility": 0.0,
|
| 271 |
+
"alternatives": [],
|
| 272 |
+
"explanation": "No candidate actions triggered.",
|
| 273 |
+
"epistemic_signals": None,
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
# Build reasoning text from policies that triggered the actions
|
| 277 |
+
reasoning_parts = []
|
| 278 |
+
for policy in policy_engine.policies:
|
| 279 |
+
if any(a in policy.actions for a in raw_actions):
|
| 280 |
+
conditions_str = ", ".join(
|
| 281 |
+
f"{c.metric} {c.operator} {c.threshold}" for c in policy.conditions
|
| 282 |
+
)
|
| 283 |
+
reasoning_parts.append(
|
| 284 |
+
f"Policy {policy.name} triggered by {conditions_str} → actions {[a.value for a in policy.actions]}"
|
| 285 |
+
)
|
| 286 |
+
reasoning_text = " ".join(reasoning_parts)
|
| 287 |
+
|
| 288 |
+
# Build evidence text from the event
|
| 289 |
+
evidence_text = (
|
| 290 |
+
f"Component: {event.component}, "
|
| 291 |
+
f"latency_p99: {event.latency_p99}, "
|
| 292 |
+
f"error_rate: {event.error_rate}, "
|
| 293 |
+
f"cpu_util: {event.cpu_util}, "
|
| 294 |
+
f"memory_util: {event.memory_util}"
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
# Compute epistemic signals (if model/tokenizer provided)
|
| 298 |
+
epistemic_signals = None
|
| 299 |
+
if model is not None and tokenizer is not None:
|
| 300 |
+
try:
|
| 301 |
+
epistemic_signals = compute_epistemic_risk(
|
| 302 |
+
reasoning_text, evidence_text, model, tokenizer
|
| 303 |
+
)
|
| 304 |
+
except Exception as e:
|
| 305 |
+
logger.error(f"Failed to compute epistemic risk: {e}")
|
| 306 |
+
epistemic_signals = {
|
| 307 |
+
"entropy": 0.0,
|
| 308 |
+
"contradiction": 0.0,
|
| 309 |
+
"evidence_lift": 0.0,
|
| 310 |
+
"hallucination_risk": 0.0,
|
| 311 |
+
}
|
| 312 |
+
else:
|
| 313 |
+
logger.debug("Epistemic model/tokenizer not provided; using zero signals")
|
| 314 |
+
epistemic_signals = {
|
| 315 |
"entropy": 0.0,
|
| 316 |
"contradiction": 0.0,
|
| 317 |
"evidence_lift": 0.0,
|
| 318 |
"hallucination_risk": 0.0,
|
| 319 |
+
}
|
| 320 |
+
|
| 321 |
+
# Run decision engine to get best action and alternatives
|
| 322 |
+
decision = decision_engine.select_optimal_action(
|
| 323 |
+
raw_actions, event, component=event.component,
|
| 324 |
+
epistemic_signals=epistemic_signals
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
# Extract risk of the selected action
|
| 328 |
+
risk_score = None
|
| 329 |
+
for alt in decision.alternatives:
|
| 330 |
+
if alt.action == decision.best_action:
|
| 331 |
+
risk_score = alt.risk
|
| 332 |
+
break
|
| 333 |
+
if risk_score is None:
|
| 334 |
+
# Compute risk separately
|
| 335 |
+
risk_score = decision_engine.compute_risk(
|
| 336 |
+
decision.best_action, event, event.component)
|
| 337 |
+
|
| 338 |
+
# Format alternatives (top 3 only)
|
| 339 |
+
alt_list = []
|
| 340 |
+
for alt in decision.alternatives[:3]:
|
| 341 |
+
alt_list.append({
|
| 342 |
+
"action": alt.action.value,
|
| 343 |
+
"expected_utility": alt.utility,
|
| 344 |
+
"risk": alt.risk,
|
| 345 |
+
})
|
| 346 |
+
|
| 347 |
+
# ── Metrics & span finalisation ───────────────────────────
|
| 348 |
+
_EVAL_COUNTER.labels(engine="python", status="success").inc()
|
| 349 |
+
_EVAL_DURATION.labels(engine="python").observe(time.monotonic() - t0)
|
| 350 |
+
|
| 351 |
+
if span:
|
| 352 |
+
span.set_attribute("risk_score", risk_score)
|
| 353 |
+
span.set_attribute("selected_action", decision.best_action.value)
|
| 354 |
+
span.set_attribute("expected_utility", decision.expected_utility)
|
| 355 |
+
span.end()
|
| 356 |
+
|
| 357 |
+
return {
|
| 358 |
+
"risk_score": risk_score,
|
| 359 |
+
"selected_action": decision.best_action.value,
|
| 360 |
+
"expected_utility": decision.expected_utility,
|
| 361 |
+
"alternatives": alt_list,
|
| 362 |
+
"explanation": decision.explanation,
|
| 363 |
+
"raw_decision": decision.raw_data,
|
| 364 |
+
"epistemic_signals": epistemic_signals,
|
| 365 |
}
|
| 366 |
|
| 367 |
|
| 368 |
def get_system_risk() -> float:
|
| 369 |
+
"""
|
| 370 |
+
Return an aggregated risk score across all monitored components.
|
| 371 |
+
This is a placeholder – the endpoint is deprecated.
|
| 372 |
+
Raises NotImplementedError to avoid random fallback.
|
| 373 |
+
"""
|
| 374 |
+
raise NotImplementedError(
|
| 375 |
+
"get_system_risk is deprecated. Use component‑level risk evaluation instead."
|
| 376 |
+
)
|
app/services/wilson_monitor.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Wilson confidence interval monitor for Rust enforcer agreement
|
| 2 |
+
from prometheus_client import Gauge
|
| 3 |
+
import math
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
LOWER_BOUND = Gauge(
|
| 7 |
+
"arf_rust_agreement_lower_bound",
|
| 8 |
+
"Lower 99.9% Wilson bound on agreement rate",
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def wilson_lower(success, total, z=3.291):
|
| 13 |
+
"""
|
| 14 |
+
Compute the lower bound of the Wilson confidence interval
|
| 15 |
+
for a binomial proportion.
|
| 16 |
+
|
| 17 |
+
Parameters
|
| 18 |
+
----------
|
| 19 |
+
success : int
|
| 20 |
+
Number of agreed evaluations.
|
| 21 |
+
total : int
|
| 22 |
+
Total number of shadow evaluations (agreed + diverged).
|
| 23 |
+
z : float
|
| 24 |
+
Z‑score for the desired confidence level (default 3.291 for 99.9%).
|
| 25 |
+
|
| 26 |
+
Returns
|
| 27 |
+
-------
|
| 28 |
+
float
|
| 29 |
+
Lower bound of the Wilson interval, clamped to [0, 1].
|
| 30 |
+
"""
|
| 31 |
+
if total == 0:
|
| 32 |
+
return 0.0
|
| 33 |
+
p = success / total
|
| 34 |
+
n = total
|
| 35 |
+
denom = 1 + z**2 / n
|
| 36 |
+
center = (p + z**2 / (2 * n)) / denom
|
| 37 |
+
margin = z * math.sqrt(p * (1 - p) / n + z**2 / (4 * n**2)) / denom
|
| 38 |
+
return max(0.0, center - margin)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def update(agreed, diverged):
|
| 42 |
+
"""
|
| 43 |
+
Query the Prometheus agreement counters and set the lower‑bound gauge.
|
| 44 |
+
|
| 45 |
+
This function is called periodically by the background thread started
|
| 46 |
+
in the API lifespan (see `app/main.py`).
|
| 47 |
+
|
| 48 |
+
Parameters
|
| 49 |
+
----------
|
| 50 |
+
agreed : int
|
| 51 |
+
Current value of `arf_rust_agreement_total{result="agreed"}`.
|
| 52 |
+
diverged : int
|
| 53 |
+
Current value of `arf_rust_agreement_total{result="diverged"}`.
|
| 54 |
+
"""
|
| 55 |
+
lower = wilson_lower(agreed, agreed + diverged)
|
| 56 |
+
LOWER_BOUND.set(lower)
|
docker-compose.test.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
postgres:
|
| 5 |
+
image: postgres:15-alpine
|
| 6 |
+
environment:
|
| 7 |
+
POSTGRES_USER: testuser
|
| 8 |
+
POSTGRES_PASSWORD: testpass
|
| 9 |
+
POSTGRES_DB: testdb
|
| 10 |
+
ports:
|
| 11 |
+
- "5432:5432"
|
| 12 |
+
tmpfs: /var/lib/postgresql/data
|
docs/authentication.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Authentication
|
| 2 |
+
|
| 3 |
+
This page describes how to authenticate with the ARF API.
|
| 4 |
+
|
| 5 |
+
Current status
|
| 6 |
+
|
| 7 |
+
- There is no route-level or global authentication enforced by the API code in this repository. The API routes (including governance endpoints) do not validate API keys, tokens, or other credentials.
|
| 8 |
+
|
| 9 |
+
What the code provides
|
| 10 |
+
|
| 11 |
+
- The configuration model (app/core/config.py) exposes an optional `api_key` setting. This can be provided via environment variables or a `.env` file (the BaseSettings `env_file` is configured to read `.env`).
|
| 12 |
+
|
| 13 |
+
What this means for you
|
| 14 |
+
|
| 15 |
+
- Setting `API_KEY` in a `.env` file or environment variable will populate the `settings.api_key`, but the current route implementations do not check this value.
|
| 16 |
+
- If you require authentication, add a FastAPI dependency or middleware that checks `settings.api_key` (or another auth mechanism) and then apply it to routes or include it in a dependency override.
|
| 17 |
+
|
| 18 |
+
Suggested minimal approach to enable API key checking
|
| 19 |
+
|
| 20 |
+
- Implement a dependency in `app.api.deps` (e.g., `get_api_key`) that compares a header value to `settings.api_key` and raise `HTTPException(401)` when missing/invalid.
|
| 21 |
+
- Add that dependency to routers or individual endpoints where auth is required.
|
| 22 |
+
|
| 23 |
+
Notes
|
| 24 |
+
|
| 25 |
+
- Tests and example code in this repo currently run without auth.
|
docs/development.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Development
|
| 2 |
+
|
| 3 |
+
This page explains how to set up the ARF API for local development.
|
| 4 |
+
|
| 5 |
+
Requirements
|
| 6 |
+
|
| 7 |
+
- Python 3.10+ (match your environment)
|
| 8 |
+
- A virtual environment
|
| 9 |
+
- The project's Python dependencies (see `requirements.txt`). Note: `agentic-reliability-framework` is installed from a Git URL in `requirements.txt`.
|
| 10 |
+
|
| 11 |
+
Quick start
|
| 12 |
+
|
| 13 |
+
1. Clone the repository:
|
| 14 |
+
|
| 15 |
+
git clone https://github.com/petter2025us/arf-api.git
|
| 16 |
+
cd arf-api
|
| 17 |
+
|
| 18 |
+
2. Create and activate a virtualenv, then install dependencies:
|
| 19 |
+
|
| 20 |
+
python -m venv .venv
|
| 21 |
+
source .venv/bin/activate # or .\.venv\Scripts\activate on Windows
|
| 22 |
+
pip install -r requirements.txt
|
| 23 |
+
|
| 24 |
+
3. Configure environment variables (optional):
|
| 25 |
+
|
| 26 |
+
- The project uses pydantic-settings with `env_file = ".env"` (see `app/core/config.py`). Create a `.env` file to set values locally.
|
| 27 |
+
|
| 28 |
+
Relevant environment variables used by the code:
|
| 29 |
+
- ARF_HMC_MODEL (default: `models/hmc_model.json`) — path to HMC model JSON used by RiskEngine.
|
| 30 |
+
- ARF_USE_HYPERPRIORS (default: `false`) — set to `true` to enable hyperprior behavior.
|
| 31 |
+
- API_KEY (optional) — will populate `settings.api_key` but note that routes currently do not enforce authentication.
|
| 32 |
+
- DATABASE_URL (optional) — configuration option in settings; tests use a local SQLite DB by default.
|
| 33 |
+
|
| 34 |
+
4. Run the app with Uvicorn for development:
|
| 35 |
+
|
| 36 |
+
uvicorn app.main:app --reload --port 8000
|
| 37 |
+
|
| 38 |
+
- The application mounts routes under the `/api/v1` prefix and exposes a health endpoint at `/health`.
|
| 39 |
+
|
| 40 |
+
Running tests
|
| 41 |
+
|
| 42 |
+
- Tests use an on-disk SQLite test database (`sqlite:///./test.db`) created by the test fixtures (`tests/conftest.py`).
|
| 43 |
+
- To run tests:
|
| 44 |
+
|
| 45 |
+
pytest
|
| 46 |
+
|
| 47 |
+
- The test fixtures override the dependency that provides DB sessions so tests run against the test database.
|
| 48 |
+
|
| 49 |
+
Notes on the RiskEngine
|
| 50 |
+
|
| 51 |
+
- The app initializes a `RiskEngine` instance at startup (in `app.main`) using environment variables noted above. The engine instance is stored in `app.state.risk_engine` and is used by the governance endpoints.
|
| 52 |
+
|
| 53 |
+
Further development
|
| 54 |
+
|
| 55 |
+
- If you add persistent intent storage or authentication, update tests and dependency overrides accordingly.
|
docs/docs_endpoints.md
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Endpoints
|
| 2 |
+
|
| 3 |
+
This document describes the main ARF API endpoints and the request/response contracts used by the control plane.
|
| 4 |
+
|
| 5 |
+
## POST `/api/v1/v1/incidents/evaluate`
|
| 6 |
+
|
| 7 |
+
Evaluates a reported incident and returns a heuristic healing recommendation, a counterfactual causal explanation, and a simplified utility decision.
|
| 8 |
+
|
| 9 |
+
This endpoint is **advisory only**. It does not apply remediation, mutate infrastructure, or execute any healing action.
|
| 10 |
+
|
| 11 |
+
### Purpose
|
| 12 |
+
|
| 13 |
+
The endpoint takes a current incident snapshot, estimates risk, chooses a deterministic action, and explains the expected effect of that action on latency using a heuristic counterfactual model.
|
| 14 |
+
|
| 15 |
+
The implementation is intentionally simple:
|
| 16 |
+
|
| 17 |
+
- no fitted Structural Causal Model is used
|
| 18 |
+
- no machine learning model is required
|
| 19 |
+
- no historical training step is performed
|
| 20 |
+
- no action execution is triggered
|
| 21 |
+
|
| 22 |
+
### Request schema
|
| 23 |
+
|
| 24 |
+
The request body must match the `ReliabilityEvent` model.
|
| 25 |
+
|
| 26 |
+
```json
|
| 27 |
+
{
|
| 28 |
+
"component": "string",
|
| 29 |
+
"latency_p99": "number",
|
| 30 |
+
"error_rate": "number",
|
| 31 |
+
"service_mesh": "string",
|
| 32 |
+
"cpu_util": "number | null",
|
| 33 |
+
"memory_util": "number | null"
|
| 34 |
+
}
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
#### Fields
|
| 38 |
+
|
| 39 |
+
`component`
|
| 40 |
+
: Name of the service or component being evaluated.
|
| 41 |
+
|
| 42 |
+
`latency_p99`
|
| 43 |
+
: The current 99th percentile latency value. The endpoint uses this value both for risk scoring and for the causal explanation.
|
| 44 |
+
|
| 45 |
+
`error_rate`
|
| 46 |
+
: The current error rate. The endpoint uses this value both for risk scoring and for the deterministic action threshold.
|
| 47 |
+
|
| 48 |
+
`service_mesh`
|
| 49 |
+
: Optional service mesh name. Defaults to `"default"`.
|
| 50 |
+
|
| 51 |
+
`cpu_util`
|
| 52 |
+
: Optional CPU utilization value. Present in the request model, but not used by the current decision logic.
|
| 53 |
+
|
| 54 |
+
`memory_util`
|
| 55 |
+
: Optional memory utilization value. Present in the request model, but not used by the current decision logic.
|
| 56 |
+
|
| 57 |
+
### Response schema
|
| 58 |
+
|
| 59 |
+
The endpoint returns a JSON object with three top-level sections.
|
| 60 |
+
|
| 61 |
+
```json
|
| 62 |
+
{
|
| 63 |
+
"healing_intent": {
|
| 64 |
+
"action": "string",
|
| 65 |
+
"component": "string",
|
| 66 |
+
"parameters": {},
|
| 67 |
+
"justification": "string",
|
| 68 |
+
"confidence": 0.85,
|
| 69 |
+
"risk_score": 0.0,
|
| 70 |
+
"status": "oss_advisory_only"
|
| 71 |
+
},
|
| 72 |
+
"causal_explanation": {
|
| 73 |
+
"factual_outcome": 0.0,
|
| 74 |
+
"counterfactual_outcome": 0.0,
|
| 75 |
+
"effect": 0.0,
|
| 76 |
+
"explanation_text": "string",
|
| 77 |
+
"is_model_based": false,
|
| 78 |
+
"warnings": ["string"]
|
| 79 |
+
},
|
| 80 |
+
"utility_decision": {
|
| 81 |
+
"best_action": "string",
|
| 82 |
+
"expected_utility": 0.5,
|
| 83 |
+
"explanation": "string"
|
| 84 |
+
}
|
| 85 |
+
}
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
#### `healing_intent`
|
| 89 |
+
|
| 90 |
+
`action`
|
| 91 |
+
: The selected action. In the current implementation this is either `restart_container` or `no_action`.
|
| 92 |
+
|
| 93 |
+
`component`
|
| 94 |
+
: The input component name.
|
| 95 |
+
|
| 96 |
+
`parameters`
|
| 97 |
+
: Action parameters. The current implementation returns an empty object.
|
| 98 |
+
|
| 99 |
+
`justification`
|
| 100 |
+
: Human-readable explanation built from the causal explanation.
|
| 101 |
+
|
| 102 |
+
`confidence`
|
| 103 |
+
: Fixed confidence value returned by the endpoint. The current implementation uses `0.85`.
|
| 104 |
+
|
| 105 |
+
`risk_score`
|
| 106 |
+
: Heuristic risk score computed from latency and error rate.
|
| 107 |
+
|
| 108 |
+
`status`
|
| 109 |
+
: Always `oss_advisory_only`, indicating that the response is informational and not executable.
|
| 110 |
+
|
| 111 |
+
#### `causal_explanation`
|
| 112 |
+
|
| 113 |
+
`factual_outcome`
|
| 114 |
+
: The observed outcome value from the request context. The endpoint uses `latency_p99` as the explained metric.
|
| 115 |
+
|
| 116 |
+
`counterfactual_outcome`
|
| 117 |
+
: The estimated value under the proposed alternative action.
|
| 118 |
+
|
| 119 |
+
`effect`
|
| 120 |
+
: The difference between counterfactual and factual outcomes.
|
| 121 |
+
|
| 122 |
+
`explanation_text`
|
| 123 |
+
: Natural-language explanation of the counterfactual effect.
|
| 124 |
+
|
| 125 |
+
`is_model_based`
|
| 126 |
+
: Always `false` in the current implementation.
|
| 127 |
+
|
| 128 |
+
`warnings`
|
| 129 |
+
: A list of warning strings. The current implementation includes a warning that the causal model is heuristic and not SCM-based.
|
| 130 |
+
|
| 131 |
+
#### `utility_decision`
|
| 132 |
+
|
| 133 |
+
`best_action`
|
| 134 |
+
: The selected action, repeated for convenience.
|
| 135 |
+
|
| 136 |
+
`expected_utility`
|
| 137 |
+
: Fixed utility value returned by the current implementation. The endpoint uses `0.5`.
|
| 138 |
+
|
| 139 |
+
`explanation`
|
| 140 |
+
: Brief explanation that the choice came from heuristic latency and error thresholds.
|
| 141 |
+
|
| 142 |
+
### Deterministic decision logic
|
| 143 |
+
|
| 144 |
+
The endpoint uses the following rule to choose the action:
|
| 145 |
+
|
| 146 |
+
```text
|
| 147 |
+
optimal_action = RESTART_CONTAINER
|
| 148 |
+
if latency_p99 > 500 OR error_rate > 0.15
|
| 149 |
+
else NO_ACTION
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
In the implementation, this is encoded as:
|
| 153 |
+
|
| 154 |
+
- `restart_container` when `latency_p99 > 500` or `error_rate > 0.15`
|
| 155 |
+
- `no_action` otherwise
|
| 156 |
+
|
| 157 |
+
No probabilistic policy or learned policy is involved.
|
| 158 |
+
|
| 159 |
+
### Heuristic risk score
|
| 160 |
+
|
| 161 |
+
The risk score is computed as:
|
| 162 |
+
|
| 163 |
+
```text
|
| 164 |
+
risk = min(1.0, (latency_p99 / 1000) * 0.7 + error_rate * 0.3)
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
Properties of this score:
|
| 168 |
+
|
| 169 |
+
- normalized to the interval `[0, 1]`
|
| 170 |
+
- weighted more heavily toward latency than error rate
|
| 171 |
+
- clipped at `1.0`
|
| 172 |
+
|
| 173 |
+
### Counterfactual model
|
| 174 |
+
|
| 175 |
+
The causal explainer uses a deterministic multiplicative heuristic:
|
| 176 |
+
|
| 177 |
+
```text
|
| 178 |
+
counterfactual_outcome = factual_outcome * (1 + effect_frac)
|
| 179 |
+
```
|
| 180 |
+
|
| 181 |
+
Where:
|
| 182 |
+
|
| 183 |
+
- `factual_outcome` is the observed metric value
|
| 184 |
+
- `effect_frac` is read from a fixed internal action-impact table
|
| 185 |
+
- the effect is multiplicative, not additive
|
| 186 |
+
|
| 187 |
+
For latency, the current action-impact mapping includes the following examples:
|
| 188 |
+
|
| 189 |
+
- `restart_container` → `latency_effect = -0.15`
|
| 190 |
+
- `scale_out` → `latency_effect = -0.20`
|
| 191 |
+
- `rollback` → `latency_effect = -0.25`
|
| 192 |
+
- `circuit_breaker` → `latency_effect = -0.05`
|
| 193 |
+
- `traffic_shift` → `latency_effect = -0.10`
|
| 194 |
+
- `alert_team` → `latency_effect = 0.0`
|
| 195 |
+
- `no_action` → `latency_effect = 0.0`
|
| 196 |
+
|
| 197 |
+
For error rate, the table includes a separate `error_rate_effect` per action, but the current endpoint calls the explainer with `outcome_metric="latency"`, so the returned counterfactual explanation is latency-based.
|
| 198 |
+
|
| 199 |
+
### Uncertainty interval
|
| 200 |
+
|
| 201 |
+
The explainer applies a fixed uncertainty margin of ±10% around the estimated effect.
|
| 202 |
+
|
| 203 |
+
Let:
|
| 204 |
+
|
| 205 |
+
```text
|
| 206 |
+
effect = counterfactual_outcome - factual_outcome
|
| 207 |
+
ci_half = abs(effect) * 0.1
|
| 208 |
+
confidence_interval = (counterfactual_outcome - ci_half, counterfactual_outcome + ci_half)
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
This interval is heuristic only. It is not a calibrated statistical confidence interval.
|
| 212 |
+
|
| 213 |
+
### How the endpoint uses the explainer
|
| 214 |
+
|
| 215 |
+
The endpoint constructs a local state object and passes it to the explainer:
|
| 216 |
+
|
| 217 |
+
- `current_state["latency"] = event.latency_p99`
|
| 218 |
+
- `current_state["error_rate"] = event.error_rate`
|
| 219 |
+
- `current_state["last_action"] = {"action_type": "no_action"}`
|
| 220 |
+
|
| 221 |
+
It then creates:
|
| 222 |
+
|
| 223 |
+
- `proposed_action = {"action_type": optimal_action.value, "params": {}}`
|
| 224 |
+
|
| 225 |
+
and calls:
|
| 226 |
+
|
| 227 |
+
```text
|
| 228 |
+
CausalExplainer().explain_healing_intent(proposed_action, current_state, "latency")
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
The resulting explanation is embedded into the `healing_intent` response.
|
| 232 |
+
|
| 233 |
+
### Validation and error behavior
|
| 234 |
+
|
| 235 |
+
The endpoint uses Pydantic validation through the `ReliabilityEvent` model.
|
| 236 |
+
|
| 237 |
+
Expected behavior:
|
| 238 |
+
|
| 239 |
+
- valid requests return HTTP 200
|
| 240 |
+
- invalid request bodies are rejected by FastAPI/Pydantic before the handler logic runs
|
| 241 |
+
|
| 242 |
+
The current implementation does not define a custom error schema for validation failures.
|
| 243 |
+
|
| 244 |
+
### Advisory-only behavior
|
| 245 |
+
|
| 246 |
+
The response includes:
|
| 247 |
+
|
| 248 |
+
```json
|
| 249 |
+
"status": "oss_advisory_only"
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
This means:
|
| 253 |
+
|
| 254 |
+
- the endpoint recommends an action
|
| 255 |
+
- it does not perform the action
|
| 256 |
+
- it does not mutate incident state
|
| 257 |
+
- it does not trigger remediation workflows by itself
|
| 258 |
+
|
| 259 |
+
### Notes on implementation scope
|
| 260 |
+
|
| 261 |
+
The current endpoint is intentionally narrow:
|
| 262 |
+
|
| 263 |
+
- it bases the action choice on only two fields: `latency_p99` and `error_rate`
|
| 264 |
+
- it ignores `cpu_util`, `memory_util`, and `service_mesh` in the decision logic
|
| 265 |
+
- it always uses the latency metric in the causal explainer call
|
| 266 |
+
- it returns a fixed `expected_utility` value of `0.5`
|
| 267 |
+
|
| 268 |
+
### Example request
|
| 269 |
+
|
| 270 |
+
```bash
|
| 271 |
+
curl -X POST "http://localhost:8000/api/v1/v1/incidents/evaluate" -H "Content-Type: application/json" -d '{
|
| 272 |
+
"component": "payment-service",
|
| 273 |
+
"latency_p99": 450,
|
| 274 |
+
"error_rate": 0.25,
|
| 275 |
+
"service_mesh": "default",
|
| 276 |
+
"cpu_util": 0.85,
|
| 277 |
+
"memory_util": 0.90
|
| 278 |
+
}'
|
| 279 |
+
```
|
| 280 |
+
|
| 281 |
+
### Example response shape
|
| 282 |
+
|
| 283 |
+
```json
|
| 284 |
+
{
|
| 285 |
+
"healing_intent": {
|
| 286 |
+
"action": "restart_container",
|
| 287 |
+
"component": "payment-service",
|
| 288 |
+
"parameters": {},
|
| 289 |
+
"justification": "Causal: If we apply restart_container instead of no_action, latency would change from 450.00 to 382.50 (Δ = -67.50). Based on heuristic causal model.",
|
| 290 |
+
"confidence": 0.85,
|
| 291 |
+
"risk_score": 0.4575,
|
| 292 |
+
"status": "oss_advisory_only"
|
| 293 |
+
},
|
| 294 |
+
"causal_explanation": {
|
| 295 |
+
"factual_outcome": 450,
|
| 296 |
+
"counterfactual_outcome": 382.5,
|
| 297 |
+
"effect": -67.5,
|
| 298 |
+
"explanation_text": "If we apply restart_container instead of no_action, latency would change from 450.00 to 382.50 (Δ = -67.50). Based on heuristic causal model.",
|
| 299 |
+
"is_model_based": false,
|
| 300 |
+
"warnings": [
|
| 301 |
+
"Using heuristic causal model (no fitted SCM)."
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
"utility_decision": {
|
| 305 |
+
"best_action": "restart_container",
|
| 306 |
+
"expected_utility": 0.5,
|
| 307 |
+
"explanation": "Heuristic decision based on latency/error thresholds"
|
| 308 |
+
}
|
| 309 |
+
}
|
| 310 |
+
```
|
| 311 |
+
|
| 312 |
+
### Cross-reference
|
| 313 |
+
|
| 314 |
+
See `docs/examples.md` for a worked numerical example and `README.md` for a shorter overview.
|
docs/endpoints.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# API Endpoints
|
| 2 |
+
|
| 3 |
+
This page lists all available API endpoints.
|
| 4 |
+
|
| 5 |
+
General
|
| 6 |
+
|
| 7 |
+
- All API routers are mounted under the `/api/v1` prefix (see `app.main`).
|
| 8 |
+
- Health endpoint is available at `/health`.
|
| 9 |
+
|
| 10 |
+
Health
|
| 11 |
+
|
| 12 |
+
- GET /health
|
| 13 |
+
- Returns: `{ "status": "ok" }`
|
| 14 |
+
- Purpose: basic liveness/health check.
|
| 15 |
+
|
| 16 |
+
Governance (risk/intent evaluation)
|
| 17 |
+
|
| 18 |
+
- POST /api/v1/intents/evaluate
|
| 19 |
+
- Description: Evaluate an infrastructure intent and return a risk score and explanation.
|
| 20 |
+
- Body: an InfrastructureIntentRequest JSON object (see the model in `app.models.infrastructure_intents`).
|
| 21 |
+
- Behaviour: The endpoint converts the incoming intent to an OSS intent and calls into the locally initialized RiskEngine (`app.state.risk_engine`).
|
| 22 |
+
- Errors: May return 500 if evaluation fails.
|
| 23 |
+
|
| 24 |
+
- POST /api/v1/intents/outcome
|
| 25 |
+
- Description: Record the observed outcome of an executed intent to update priors.
|
| 26 |
+
- Behaviour: Not implemented in this repository; the handler raises HTTP `501 Not Implemented` to indicate that outcome recording is not yet available.
|
| 27 |
+
|
| 28 |
+
Other routers
|
| 29 |
+
|
| 30 |
+
- The application also registers routers for incidents, risk, intents, and history at `/api/v1` (see `app.main`). Consult the respective modules in `app.api` for their exact endpoints and payloads.
|
| 31 |
+
|
| 32 |
+
Notes
|
| 33 |
+
|
| 34 |
+
- The governance evaluation relies on a `RiskEngine` instance initialized at app startup (see `app.main`) which reads `ARF_HMC_MODEL` and `ARF_USE_HYPERPRIORS` environment variables.
|
docs/examples.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Examples
|
| 2 |
+
|
| 3 |
+
This page provides usage examples for the ARF API.
|
| 4 |
+
|
| 5 |
+
Check health
|
| 6 |
+
|
| 7 |
+
curl example:
|
| 8 |
+
|
| 9 |
+
curl http://localhost:8000/health
|
| 10 |
+
|
| 11 |
+
Response:
|
| 12 |
+
|
| 13 |
+
{
|
| 14 |
+
"status": "ok"
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
Evaluate an intent (governance)
|
| 18 |
+
|
| 19 |
+
- Endpoint: POST /api/v1/intents/evaluate
|
| 20 |
+
- Content-Type: application/json
|
| 21 |
+
|
| 22 |
+
Example payload (minimal illustrative example — adapt to the `InfrastructureIntentRequest` model used by the project):
|
| 23 |
+
|
| 24 |
+
{
|
| 25 |
+
"id": "intent-123",
|
| 26 |
+
"description": "Example infrastructure change",
|
| 27 |
+
"estimated_cost": 100.0,
|
| 28 |
+
"policy_violations": []
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
Curl example:
|
| 32 |
+
|
| 33 |
+
curl -X POST http://localhost:8000/api/v1/intents/evaluate \
|
| 34 |
+
-H "Content-Type: application/json" \
|
| 35 |
+
-d '{"id":"intent-123","description":"Example","estimated_cost":100.0,"policy_violations":[]}'
|
| 36 |
+
|
| 37 |
+
Python (requests) example:
|
| 38 |
+
|
| 39 |
+
import requests
|
| 40 |
+
|
| 41 |
+
payload = {
|
| 42 |
+
"id": "intent-123",
|
| 43 |
+
"description": "Example infrastructure change",
|
| 44 |
+
"estimated_cost": 100.0,
|
| 45 |
+
"policy_violations": []
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
resp = requests.post("http://localhost:8000/api/v1/intents/evaluate", json=payload)
|
| 49 |
+
print(resp.status_code, resp.text)
|
| 50 |
+
|
| 51 |
+
Notes
|
| 52 |
+
|
| 53 |
+
- The evaluate endpoint uses an in-process `RiskEngine` (initialized in `app.main`) to compute risk and explanations.
|
| 54 |
+
- The `/api/v1/intents/outcome` endpoint exists but currently returns 501 Not Implemented — outcome recording/storage is incomplete in this repo.
|
docs/index.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ARF API Control Plane
|
| 2 |
+
|
| 3 |
+
Welcome to the ARF API documentation.
|
| 4 |
+
|
| 5 |
+
Overview
|
| 6 |
+
|
| 7 |
+
- This repository implements the ARF API Control Plane (FastAPI) — the application mounts a number of routers under `/api/v1` and exposes a health endpoint at `/health`.
|
| 8 |
+
- App version (from app.main): 0.2.0
|
| 9 |
+
|
| 10 |
+
Important notes
|
| 11 |
+
|
| 12 |
+
- A `RiskEngine` is initialized at app startup and stored at `app.state.risk_engine`. The engine reads `ARF_HMC_MODEL` and `ARF_USE_HYPERPRIORS` environment variables.
|
| 13 |
+
- Authentication: there is an optional `api_key` in configuration, but request handlers do not currently enforce authentication.
|
| 14 |
+
- The `/api/v1/intents/outcome` endpoint exists but returns 501 Not Implemented; intent outcome recording/storage is not yet implemented.
|
| 15 |
+
|
| 16 |
+
See the other documentation pages for development instructions, endpoints, and examples.
|
monitor.sh
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Health-check the most recently recorded tunnel URL; restart the stack via
# start.sh when the /health probe fails. Appends a timestamped line to the
# log either way.

URL_FILE="/workspaces/arf-api/current_url.txt"
LOG_FILE="/workspaces/arf-api/monitor.log"

# Guard clause: without a recorded URL there is nothing to monitor.
[ -f "$URL_FILE" ] || { echo "$(date): No URL file found. Exiting." >> "$LOG_FILE"; exit 1; }

CURRENT_URL=$(cat "$URL_FILE")

# -f makes curl treat HTTP errors as failures; -s keeps output quiet.
if curl -s -f "$CURRENT_URL/health" > /dev/null; then
    echo "$(date): Tunnel OK." >> "$LOG_FILE"
else
    echo "$(date): Tunnel down. Restarting..." >> "$LOG_FILE"
    /workspaces/arf-api/start.sh
fi
|
render.yaml
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
- type: web
|
| 3 |
+
name: arf-api
|
| 4 |
+
runtime: python
|
| 5 |
+
buildCommand: pip install -r requirements.txt
|
| 6 |
+
startCommand: uvicorn app.main:app --host 0.0.0.0 --port $PORT
|
| 7 |
+
envVars:
|
| 8 |
+
- key: DATABASE_URL
|
| 9 |
+
fromDatabase:
|
| 10 |
+
name: arf-db
|
| 11 |
+
property: connectionString
|
| 12 |
+
- key: API_KEY
|
| 13 |
+
sync: false
|
| 14 |
+
- key: ENVIRONMENT
|
| 15 |
+
value: production
|
| 16 |
+
databases:
|
| 17 |
+
- name: arf-db
|
| 18 |
+
databaseName: arf
|
| 19 |
+
user: arf_user
|
requirements-dev.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pytest-cov>=7.0.0
|
| 2 |
+
jsonschema>=4.0.0
|
| 3 |
+
pytest-asyncio>=0.24.0
|
requirements.txt
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
fastapi==0.115.12
|
| 2 |
uvicorn[standard]==0.34.0
|
| 3 |
-
pydantic=
|
|
|
|
|
|
|
|
|
|
| 4 |
pytest==8.3.5
|
| 5 |
-
pytest-cov>=6.0.0
|
| 6 |
httpx==0.28.1
|
| 7 |
alembic
|
| 8 |
pydantic-settings
|
|
@@ -11,9 +13,11 @@ psycopg2-binary==2.9.10
|
|
| 11 |
slowapi==0.1.9
|
| 12 |
prometheus-fastapi-instrumentator==7.1.0
|
| 13 |
flake8==7.2.0
|
| 14 |
-
cryptography
|
| 15 |
sentence-transformers>=2.2.0
|
| 16 |
scikit-learn
|
| 17 |
-
redis>=4.0.0
|
| 18 |
stripe>=9.0.0
|
| 19 |
-
|
|
|
|
|
|
|
|
|
| 1 |
fastapi==0.115.12
|
| 2 |
uvicorn[standard]==0.34.0
|
| 3 |
+
pydantic>=2.13.2
|
| 4 |
+
agentic-reliability-framework @ git+https://github.com/arf-foundation/agentic-reliability-framework@main
|
| 5 |
+
arf-pricing-calculator @ git+https://github.com/arf-foundation/ARF-Bayesian-Pricing-Calculator@main
|
| 6 |
+
pytest==8.3.5
|
| 7 |
# duplicate pin removed: pytest==8.3.5 is already listed above
|
|
|
|
| 8 |
httpx==0.28.1
|
| 9 |
alembic
|
| 10 |
pydantic-settings
|
|
|
|
| 13 |
slowapi==0.1.9
|
| 14 |
prometheus-fastapi-instrumentator==7.1.0
|
| 15 |
flake8==7.2.0
|
| 16 |
+
cryptography==47.0.0
|
| 17 |
sentence-transformers>=2.2.0
|
| 18 |
scikit-learn
|
| 19 |
+
redis>=4.0.0 # optional, for faster counters
|
| 20 |
stripe>=9.0.0
|
| 21 |
+
opentelemetry-api>=1.20.0
|
| 22 |
+
opentelemetry-sdk>=1.20.0
|
| 23 |
+
opentelemetry-instrumentation-fastapi>=0.50b0
|
runtime.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-3.12.3
|
| 2 |
+
# force fresh build
|
seed_rag_data.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Seed RAG graph with historical healing action success rates.
|
| 3 |
+
Run once before starting the API server.
|
| 4 |
+
"""
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
sys.path.append(os.path.dirname(__file__))
|
| 8 |
+
|
| 9 |
+
from app.core.deps import get_rag_graph
|
| 10 |
+
from agentic_reliability_framework.core.models.event import HealingAction
|
| 11 |
+
|
| 12 |
+
def seed_historical_data():
    """Seed the RAG graph with a fixed catalogue of historical healing outcomes.

    Each seed entry records one past incident (action taken, whether it
    succeeded, and how long resolution took) against a placeholder
    ReliabilityEvent, so the graph has non-empty effectiveness statistics
    before the API server starts. Prints per-action stats at the end.
    """
    # Hoisted out of the per-item loop: importing once is sufficient
    # (the original re-ran this import on every iteration).
    from agentic_reliability_framework.core.models.event import ReliabilityEvent

    rag = get_rag_graph()

    # Define seed incidents (each with an outcome)
    seed_data = [
        # restart_container outcomes
        {"incident_id": "seed_restart_1", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": True, "resolution_time_minutes": 2},
        {"incident_id": "seed_restart_2", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": True, "resolution_time_minutes": 3},
        {"incident_id": "seed_restart_3", "component": "test", "action": HealingAction.RESTART_CONTAINER.value, "success": False, "resolution_time_minutes": 10},

        # rollback outcomes
        {"incident_id": "seed_rollback_1", "component": "test", "action": HealingAction.ROLLBACK.value, "success": True, "resolution_time_minutes": 1},
        {"incident_id": "seed_rollback_2", "component": "test", "action": HealingAction.ROLLBACK.value, "success": True, "resolution_time_minutes": 2},
        {"incident_id": "seed_rollback_3", "component": "test", "action": HealingAction.ROLLBACK.value, "success": False, "resolution_time_minutes": 5},

        # scale_out outcomes
        {"incident_id": "seed_scale_1", "component": "test", "action": HealingAction.SCALE_OUT.value, "success": True, "resolution_time_minutes": 5},
        {"incident_id": "seed_scale_2", "component": "test", "action": HealingAction.SCALE_OUT.value, "success": False, "resolution_time_minutes": 15},

        # circuit_breaker outcomes
        {"incident_id": "seed_cb_1", "component": "test", "action": HealingAction.CIRCUIT_BREAKER.value, "success": True, "resolution_time_minutes": 1},
        {"incident_id": "seed_cb_2", "component": "test", "action": HealingAction.CIRCUIT_BREAKER.value, "success": True, "resolution_time_minutes": 2},

        # traffic_shift outcomes
        {"incident_id": "seed_ts_1", "component": "test", "action": HealingAction.TRAFFIC_SHIFT.value, "success": True, "resolution_time_minutes": 4},
        {"incident_id": "seed_ts_2", "component": "test", "action": HealingAction.TRAFFIC_SHIFT.value, "success": False, "resolution_time_minutes": 8},
    ]

    # Add each outcome to the RAG graph
    for item in seed_data:
        # Placeholder event: metric values are representative only; the
        # graph keys its effectiveness stats on action/outcome, not metrics.
        event = ReliabilityEvent(
            component=item["component"],
            latency_p99=500,  # placeholder
            error_rate=0.1,
            service_mesh="default",
        )
        # Record the outcome
        rag.record_outcome(
            incident_id=item["incident_id"],
            event=event,
            action_taken=item["action"],
            success=item["success"],
            resolution_time_minutes=item["resolution_time_minutes"],
        )
        print(f"Seeded: {item['action']} -> success={item['success']}")

    print(f"Seeded {len(seed_data)} historical outcomes.")
    # Plain string: the original used an f-string with no placeholders.
    print("Stats per action:")
    for action in HealingAction:
        stats = rag.get_historical_effectiveness(action.value, component_filter="test")
        print(f"  {action.value}: uses={stats['total_uses']}, success_rate={stats['success_rate']:.2f}, avg_time={stats['avg_resolution_time_minutes']:.1f} min")


if __name__ == "__main__":
    seed_historical_data()
|
start.sh
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Restart the local dev stack: uvicorn backend + cloudflared quick tunnel,
# then point the Vercel frontend env var at the new tunnel URL.

# Set paths
BACKEND_DIR="/workspaces/arf-api"
FRONTEND_DIR="/workspaces/arf-frontend"
VENV_ACTIVATE="$BACKEND_DIR/venv/bin/activate"
CLOUDFLARED=$(which cloudflared 2>/dev/null || echo "/usr/local/bin/cloudflared")

# Kill any existing processes
echo "🛑 Stopping existing uvicorn and cloudflared..."
pkill -f uvicorn
pkill -f cloudflared
sleep 2

# Start uvicorn
echo "🚀 Starting uvicorn..."
# SC2164: abort if the directory is missing rather than continuing in the
# wrong working directory.
cd "$BACKEND_DIR" || exit 1
source "$VENV_ACTIVATE"
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload &
sleep 3

# Verify uvicorn is running
if ! curl -s http://localhost:8000/health >/dev/null; then
    echo "❌ uvicorn failed to start. Exiting."
    exit 1
fi
echo "✅ uvicorn is running."

# Start cloudflared and capture URL
echo "🌐 Starting cloudflared tunnel..."
TEMP_FILE=$(mktemp)
$CLOUDFLARED tunnel --url http://localhost:8000 2>&1 | tee "$TEMP_FILE" &

# Wait for URL to appear (cloudflared prints the assigned trycloudflare URL
# to its output, which we mirror into TEMP_FILE above).
echo "⏳ Waiting for tunnel URL..."
URL=""
for i in {1..30}; do
    URL=$(grep -oP 'https://[a-z0-9-]+\.trycloudflare\.com' "$TEMP_FILE" | head -1)
    if [ -n "$URL" ]; then
        break
    fi
    sleep 1
done

if [ -z "$URL" ]; then
    echo "❌ Failed to get tunnel URL."
    exit 1
fi
echo "✅ Tunnel URL: $URL"

# Save URL for monitoring (used by monitor.sh)
echo "$URL" > /workspaces/arf-api/current_url.txt

# Update Vercel environment variable
echo "🔧 Updating Vercel environment variable..."
# SC2164 again: never run vercel commands from an unexpected directory.
cd "$FRONTEND_DIR" || exit 1
if command -v vercel &>/dev/null; then
    vercel env rm NEXT_PUBLIC_API_URL production -y
    echo "$URL" | vercel env add NEXT_PUBLIC_API_URL production
    echo "🔄 Redeploying frontend..."
    vercel --prod
else
    echo "⚠️ Vercel CLI not installed. Please install it with: npm i -g vercel"
    echo "Then manually update the env var to: $URL"
fi

echo "🎉 All done! Your new URL is: $URL"
echo "Frontend will be updated shortly. Check https://arf-frontend-sandy.vercel.app"
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
pytest configuration and fixtures for ARF API tests.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from app.core.usage_tracker import enforce_quota, Tier
|
| 6 |
+
from app.api.deps import get_db
|
| 7 |
+
from app.database.base import Base
|
| 8 |
+
from app.main import app as fastapi_app
|
| 9 |
+
from sqlalchemy.orm import sessionmaker
|
| 10 |
+
from sqlalchemy import create_engine
|
| 11 |
+
from fastapi.testclient import TestClient
|
| 12 |
+
import app.core.usage_tracker
|
| 13 |
+
import os
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
# ===== STEP 1: Set environment variables BEFORE any app imports =====
|
| 17 |
+
os.environ["ARF_USAGE_TRACKING"] = "false"
|
| 18 |
+
|
| 19 |
+
# Force the correct database URL for tests
|
| 20 |
+
os.environ["DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/testdb"
|
| 21 |
+
os.environ["TEST_DATABASE_URL"] = "postgresql://postgres:postgres@localhost:5432/testdb"
|
| 22 |
+
|
| 23 |
+
# Additional PostgreSQL environment variables to prevent fallback to
|
| 24 |
+
# system user
|
| 25 |
+
os.environ["PGUSER"] = "postgres"
|
| 26 |
+
os.environ["PGPASSWORD"] = "postgres"
|
| 27 |
+
os.environ["PGHOST"] = "localhost"
|
| 28 |
+
os.environ["PGPORT"] = "5432"
|
| 29 |
+
os.environ["PGDATABASE"] = "testdb"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ===== STEP 2: Mock the tracker module BEFORE importing app =====
|
| 33 |
+
class MockTracker:
    """Stand-in for the real usage tracker used in tests.

    Every call reports success, the tier is always PRO, and quota never
    runs out, so endpoints under test are never throttled.
    """

    def get_tier(self, api_key):
        from app.core.usage_tracker import Tier

        return Tier.PRO

    def get_remaining_quota(self, api_key, tier):
        return 1000

    def consume_quota_and_log(self, record, idempotency_key=None):
        return (True, None)

    def increment_usage_sync(self, record, idempotency_key=None):
        return True

    def get_or_create_api_key(self, key, tier):
        return True

    def update_api_key_tier(self, key, tier):
        return True

    def _insert_audit_log(self, record):
        # Auditing is a no-op in tests.
        return None
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# Replace the tracker at the module level
|
| 62 |
+
app.core.usage_tracker.tracker = MockTracker()
|
| 63 |
+
|
| 64 |
+
# ===== STEP 3: Import app and database modules =====
|
| 65 |
+
|
| 66 |
+
# Force model registration (prevents "no such table" errors)
|
| 67 |
+
|
| 68 |
+
# Use the environment variable for the database URL (already set)
|
| 69 |
+
TEST_DATABASE_URL = os.getenv(
|
| 70 |
+
"TEST_DATABASE_URL",
|
| 71 |
+
"postgresql://postgres:postgres@localhost:5432/testdb")
|
| 72 |
+
|
| 73 |
+
if TEST_DATABASE_URL.startswith("postgresql"):
|
| 74 |
+
engine = create_engine(TEST_DATABASE_URL)
|
| 75 |
+
else:
|
| 76 |
+
engine = create_engine(
|
| 77 |
+
TEST_DATABASE_URL, connect_args={
|
| 78 |
+
"check_same_thread": False})
|
| 79 |
+
|
| 80 |
+
TestingSessionLocal = sessionmaker(
|
| 81 |
+
autocommit=False,
|
| 82 |
+
autoflush=False,
|
| 83 |
+
bind=engine)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def override_get_db():
    """Dependency override: hand out a session bound to the test engine."""
    session = TestingSessionLocal()
    try:
        yield session
    finally:
        # Always release the session, even if the request handler raised.
        session.close()


fastapi_app.dependency_overrides[get_db] = override_get_db
|
| 97 |
+
|
| 98 |
+
# Override enforce_quota dependency


async def mock_enforce_quota(request, api_key=None):
    """Bypass real quota enforcement: always grant a PRO key with room left."""
    granted = {"api_key": "test_key", "tier": Tier.PRO, "remaining": 1000}
    return granted


fastapi_app.dependency_overrides[enforce_quota] = mock_enforce_quota
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@pytest.fixture(scope="session", autouse=True)
def setup_database():
    """Create all tables once before the test session; drop them after."""
    Base.metadata.create_all(engine)
    yield
    Base.metadata.drop_all(engine)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@pytest.fixture(scope="session")
def client():
    """Session-wide HTTP client; used as a context manager so app
    startup/shutdown hooks run."""
    with TestClient(fastapi_app) as c:
        yield c
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@pytest.fixture(scope="function")
def db_session():
    """Provide a clean database session for each test.

    Tables are (re)created before the test and dropped afterwards so no
    state leaks between test functions.
    """
    Base.metadata.create_all(bind=engine)
    db = TestingSessionLocal()
    yield db
    db.rollback()
    db.close()
    Base.metadata.drop_all(bind=engine)
|
tests/test_deps.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
from unittest.mock import patch, MagicMock
|
| 3 |
+
from app.api.deps import get_db
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_get_db_closes_session():
    """get_db must close its session even when the request raises."""
    fake_session = MagicMock()
    with patch('app.api.deps.SessionLocal', return_value=fake_session):
        gen = get_db()
        assert next(gen) == fake_session
        # Simulate an exception during request handling
        with pytest.raises(Exception):
            gen.throw(Exception("test error"))
        fake_session.close.assert_called_once()
|
tests/test_governance.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for governance endpoints: /api/v1/intents/evaluate
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def test_evaluate_provision_intent(client):
    """A provision_resource intent should evaluate to a risk score."""
    intent = {
        "intent_type": "provision_resource",
        "environment": "prod",
        "resource_type": "database",
        "region": "eastus",
        "size": "Standard",
        "estimated_cost": 1200,
        "policy_violations": [],
        "requester": "alice",
        "provenance": {},
        "configuration": {}
    }
    resp = client.post("/api/v1/intents/evaluate", json=intent)
    assert resp.status_code == 200, resp.text
    assert "risk_score" in resp.json()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_evaluate_grant_access(client):
    """A grant_access intent should evaluate to a risk score."""
    intent = {
        "intent_type": "grant_access",
        "environment": "dev",
        "principal": "bob",
        "permission_level": "read",
        "resource_scope": "/subscriptions/123",
        "estimated_cost": None,
        "policy_violations": [],
        "requester": "alice",
        "provenance": {},
        "justification": "test"
    }
    resp = client.post("/api/v1/intents/evaluate", json=intent)
    assert resp.status_code == 200, resp.text
    assert "risk_score" in resp.json()
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_evaluate_deploy_config(client):
    """A deploy_config intent should evaluate to a risk score."""
    intent = {
        "intent_type": "deploy_config",
        "environment": "staging",
        "service_name": "payments-api",
        "change_scope": "canary",
        "deployment_target": "staging",
        "estimated_cost": 20,
        "policy_violations": [],
        "requester": "alice",
        "provenance": {},
        "configuration": {}
    }
    resp = client.post("/api/v1/intents/evaluate", json=intent)
    assert resp.status_code == 200, resp.text
    assert "risk_score" in resp.json()
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def test_invalid_intent_type(client):
    """An unrecognised intent_type must be rejected with a 422 validation error."""
    bad_intent = {
        "intent_type": "UnknownIntent",
        "environment": "prod",
        "requester": "alice",
        "provenance": {}
    }
    resp = client.post("/api/v1/intents/evaluate", json=bad_intent)
    assert resp.status_code == 422
|
tests/test_healing_endpoint.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi.testclient import TestClient
|
| 2 |
+
from app.main import app
|
| 3 |
+
|
| 4 |
+
client = TestClient(app)


def test_healing_evaluate_endpoint():
    """Happy-path smoke test: a valid event yields a 200 from /healing/evaluate."""
    event = {
        "component": "my-service",
        "latency_p99": 450.0,
        "error_rate": 0.25,
        "service_mesh": "default",
        "cpu_util": 0.85,
        "memory_util": 0.90
    }
    resp = client.post("/api/v1/healing/evaluate", json={"event": event})
    assert resp.status_code == 200, (
        f"Expected 200, got {resp.status_code}: {resp.text}"
    )
|