"""
Baseline agent for the PayOps OpenEnv environment.

A rule-based agent that decides an action based on observable risk signals.
Use this as a sanity-check to prove the environment works end-to-end and
as a lower-bound performance reference before training a learned agent.

Usage
-----
    # Run directly
    python scripts/baseline_agent.py

    # Or via the /baseline API endpoint
    curl -X POST http://localhost:8000/baseline

Policy
------
1. Any sanctioned / high-risk flag present   → reject
2. velocity_1h >= 10 (velocity burst)        → reject
3. kyc_status in (failed, none)              → escalate
4. risk_score >= 0.80                        → reject
5. risk_score >= 0.60                        → escalate
6. risk_score >= 0.35 OR any flag present    → flag
7. Otherwise                                 → approve
"""

from __future__ import annotations

import asyncio
import sys
import os

# Allow running from the project root or the scripts/ directory
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from payops_env.environment import PayOpsEnvironment
from payops_env.grader import grade_episode
from payops_env.models import PayOpsAction
from payops_env.tasks import TASKS


# ---------------------------------------------------------------------------
# Policy
# ---------------------------------------------------------------------------

# Flags that rule_based_policy treats as an automatic "reject", whatever the
# risk score says. Entries are kept in alphabetical order.
DANGER_FLAGS = {
    "app_scam_indicator",
    "ctr_threshold_avoidance",
    "mule_account_pattern",
    "sanctioned_country",
    "structuring_pattern",
}


def rule_based_policy(obs) -> str:
    """
    Choose an action for one observation using fixed, ordered rules.

    The checks run in priority order so that explicit danger signals win
    even when the ML risk score is low (adversarial/poisoned score inputs).

    Parameters
    ----------
    obs
        Observation exposing ``flags``, ``velocity_1h``, ``kyc_status`` and
        ``risk_score``. ``flags`` and ``velocity_1h`` may be ``None``; that
        is treated as "no flags" and "velocity unknown" respectively.

    Returns
    -------
    str
        One of ``"reject"``, ``"escalate"``, ``"flag"`` or ``"approve"``.
    """
    # A missing flag list is handled like an empty one, mirroring the None
    # guard on velocity_1h below, so the policy never crashes on it.
    flags = obs.flags or ()

    # Priority 1: known fraud or sanctioned patterns regardless of risk score
    if any(flag in DANGER_FLAGS for flag in flags):
        return "reject"

    # Priority 2: transaction velocity burst (potential account takeover)
    if obs.velocity_1h is not None and obs.velocity_1h >= 10:
        return "reject"

    # Priority 3: identity not confirmed
    if obs.kyc_status in ("failed", "none"):
        return "escalate"

    # Priority 4–6: risk score tiers, highest threshold first
    if obs.risk_score >= 0.80:
        return "reject"
    if obs.risk_score >= 0.60:
        return "escalate"
    # Any remaining (non-danger) flag is still worth a soft "flag".
    if obs.risk_score >= 0.35 or flags:
        return "flag"

    # Priority 7: nothing suspicious was observed
    return "approve"


# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------

async def run():
    """
    Play one full episode with the rule-based policy and print a report.

    Creates a ``PayOpsEnvironment``, resets it, then repeatedly asks
    ``rule_based_policy`` for an action and steps the environment until the
    observation reports ``done``. Each step and its reward are printed. The
    list of actions taken is then passed to ``grade_episode`` together with
    ``TASKS`` and a summary with a per-task breakdown is printed.

    The environment is always closed, even if a step raises.

    Returns
    -------
    The result object produced by ``grade_episode``.
    """
    env = PayOpsEnvironment()
    step = 0
    actions_taken = []

    try:
        obs = await env.reset_async()

        print("=" * 60)
        print("  PayOps Baseline Agent — Rule-Based Policy")
        print("=" * 60)
        print(f"  Tasks in episode : {len(TASKS)}")
        print("=" * 60)

        while not obs.done:
            action_type = rule_based_policy(obs)
            action = PayOpsAction(
                action_type=action_type,
                transaction_id=obs.transaction_id,
            )

            # Only a missing velocity is shown as "n/a"; a real reading of 0
            # is a valid value and must be printed as such.
            velocity = "n/a" if obs.velocity_1h is None else obs.velocity_1h

            print(f"\nStep {step + 1}  [{obs.task_difficulty.upper()}]  {obs.task_id}")
            print(f"  TXN        : {obs.transaction_id}")
            print(f"  Amount     : {obs.amount:,.2f} {obs.currency}")
            print(f"  Sender     : {obs.sender}")
            print(f"  Receiver   : {obs.receiver}")
            print(f"  Risk score : {obs.risk_score:.2f}")
            print(f"  KYC        : {obs.kyc_status or 'n/a'}  |  "
                  f"Country risk: {obs.country_risk or 'n/a'}  |  "
                  f"Velocity 1h: {velocity}")
            print(f"  Flags      : {obs.flags or '[]'}")
            print(f"  → Agent    : {action_type}")

            # The observation returned by the step carries the reward and
            # info for the action just taken.
            obs = await env.step_async(action)
            actions_taken.append(action_type)
            step += 1

            print(f"  ✓ Reward   : {obs.reward:+.2f}  "
                  f"(correct: {obs.info.get('correct_action', '?')})")
    finally:
        # Release the environment on both the success and the error path.
        env.close()

    # Grade the episode; the summary below reports the grader's totals.
    result = grade_episode(actions_taken, list(TASKS))

    print("\n" + "=" * 60)
    print("  Episode Summary")
    print("=" * 60)
    print(f"  Steps             : {step}")
    print(f"  Total reward      : {result.total_reward:+.2f}")
    print(f"  Max possible      : {result.max_possible_reward:.2f}")
    print(f"  Normalised score  : {result.normalised_score:.4f}")
    print(f"  Passed (≥0.5)     : {'YES ✓' if result.passed else 'NO ✗'}")

    print("\n  Per-task breakdown:")
    for t in result.per_task_rewards:
        mark = "✓" if t["correct"] else "✗"
        print(
            f"    {mark}  {t['task_id']:12s}  [{t['difficulty']:6s}]  "
            f"action={t['terminal_action']:10s}  "
            f"correct={t['correct_action']:10s}  "
            f"reward={t['weighted_reward']:+.2f}"
        )

    print("=" * 60)
    return result


# Script entry point: when executed directly (not imported), run a single
# baseline episode to completion via asyncio.
if __name__ == "__main__":
    asyncio.run(run())