"""
Baseline inference script for the API Integration Debugging Environment.

This script demonstrates two baseline agents interacting with the environment:
a simple rule-based agent (the default) and an LLM-powered agent that uses the
OpenAI API. It runs all three tasks (easy, medium, hard) and reports baseline
scores.

Usage:
    # Rule-based baseline (no API key required)
    python scripts/baseline_inference.py

    # LLM-powered baseline
    export OPENAI_API_KEY=your-key-here
    python scripts/baseline_inference.py --mode llm

    # Run a single task only
    python scripts/baseline_inference.py --task easy
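
Programmatic use (a minimal sketch; the import path assumes the repository
root is on sys.path and that scripts/ is importable as a package; adjust to
your layout):

    from scripts.baseline_inference import run_rule_based_baseline
    result = run_rule_based_baseline("easy")
    print(result["score"])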
"""
import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional

# Add the parent directory (repository root) to the path so the local
# models, scenarios, and server packages resolve when this script is run
# directly.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from models import ApiDebugAction, ApiDebugObservation
from scenarios import get_all_task_ids, get_scenario
from server.api_debug_env_environment import ApiDebugEnvironment


def run_rule_based_baseline(task_id: str) -> Dict[str, Any]:
    """
    Run a simple rule-based baseline agent (no LLM needed).

    Strategy:
    1. Inspect all logs
    2. Inspect all configs
    3. Test all endpoints

    (Does not attempt fixes; tests the reward signal for exploration-only
    behavior. See the fix-action sketch after this function.)
    """
    env = ApiDebugEnvironment(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0

    # Phase 1: Inspect all logs
    for service in obs.available_targets:
        if obs.done:
            break
        obs = env.step(ApiDebugAction(action_type="inspect_logs", target=service))
        total_reward += obs.reward

    # Phase 2: Inspect all configs
    for service in obs.available_targets:
        if obs.done:
            break
        obs = env.step(ApiDebugAction(action_type="inspect_config", target=service))
        total_reward += obs.reward

    # Phase 3: Test all endpoints
    for service in obs.available_targets:
        if obs.done:
            break
        obs = env.step(ApiDebugAction(action_type="inspect_endpoint", target=service))
        total_reward += obs.reward

    score = env.grade()
    return {
        "task_id": task_id,
        "score": score,
        "total_reward": round(total_reward, 4),
        "steps_used": env._state.step_count,
        "issues_found": len(env._issues_found),
        "issues_fixed": len(env._issues_fixed),
        "issues_total": len(env._scenario.issues) if env._scenario else 0,
    }
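

# For reference, the fix action this baseline deliberately never submits looks
# like the sketch below. The target name and payload key/value are
# illustrative placeholders, not values from a real scenario:
#
#     ApiDebugAction(
#         action_type="submit_fix",
#         target="auth-service",             # hypothetical service name
#         fix_payload={"timeout_ms": 5000},  # hypothetical config key/value
#     )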


def run_llm_baseline(task_id: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Run an LLM-powered baseline agent using the OpenAI API.

    The LLM reads observations and decides what to do next.
    """
    try:
        from openai import OpenAI
    except ImportError:
        print("OpenAI package not installed. Running rule-based baseline instead.")
        return run_rule_based_baseline(task_id)

    key = api_key or os.environ.get("OPENAI_API_KEY")
    if not key:
        print("No OPENAI_API_KEY set. Running rule-based baseline instead.")
        return run_rule_based_baseline(task_id)

    client = OpenAI(api_key=key)
    env = ApiDebugEnvironment(task_id=task_id)
    obs = env.reset()
    total_reward = 0.0

    system_prompt = f"""You are an API debugging agent. Your task: {obs.task_description}
Available actions:
- inspect_logs: Read error logs for a service
- inspect_config: See the configuration of a service
- inspect_endpoint: Test-call an endpoint
- submit_fix: Submit a config fix (requires fix_payload dict)
Available targets: {obs.available_targets}
Total issues to fix: {obs.issues_total}
Respond with JSON: {{"action_type": "...", "target": "...", "fix_payload": {{...}} }}
Only include fix_payload when action_type is "submit_fix"."""
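
    # A well-formed reply from the model would look like this (the target and
    # payload are illustrative placeholders, not real scenario values):
    #     {"action_type": "submit_fix", "target": "auth-service",
    #      "fix_payload": {"timeout_ms": 5000}}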

    messages = [{"role": "system", "content": system_prompt}]

    while not obs.done:
        # Build the observation message for this step
        obs_text = f"""Step {env._state.step_count}/{env._scenario.max_steps if env._scenario else '?'}
Remaining steps: {obs.remaining_steps}
Issues found: {obs.issues_found}/{obs.issues_total}
Issues fixed: {obs.issues_fixed}/{obs.issues_total}
Last action result: {obs.action_result}"""
        if obs.logs:
            obs_text += "\nLogs:\n" + "\n".join(obs.logs)
        if obs.config_snapshot:
            obs_text += f"\nConfig: {json.dumps(obs.config_snapshot, indent=2)}"
        if obs.api_response:
            obs_text += f"\nAPI Response: {json.dumps(obs.api_response, indent=2)}"
        if obs.hints:
            obs_text += f"\nHints: {'; '.join(obs.hints)}"
        messages.append({"role": "user", "content": obs_text})

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
                temperature=0.2,
                max_tokens=500,
                response_format={"type": "json_object"},
            )
            action_json = json.loads(response.choices[0].message.content)
            messages.append({"role": "assistant", "content": json.dumps(action_json)})
            action = ApiDebugAction(
                action_type=action_json.get("action_type", "inspect_logs"),
                target=action_json.get("target", obs.available_targets[0] if obs.available_targets else ""),
                fix_payload=action_json.get("fix_payload"),
            )
        except Exception as e:
            # On any API or parsing failure, fall back to a safe exploratory action
            print(f" LLM error: {e}. Falling back to inspect_logs.")
            action = ApiDebugAction(
                action_type="inspect_logs",
                target=obs.available_targets[0] if obs.available_targets else "",
            )

        obs = env.step(action)
        total_reward += obs.reward

    score = env.grade()
    return {
        "task_id": task_id,
        "score": score,
        "total_reward": round(total_reward, 4),
        "steps_used": env._state.step_count,
        "issues_found": len(env._issues_found),
        "issues_fixed": len(env._issues_fixed),
        "issues_total": len(env._scenario.issues) if env._scenario else 0,
    }


def main():
    parser = argparse.ArgumentParser(description="Baseline inference for API Debug Env")
    parser.add_argument("--mode", choices=["rule", "llm"], default="rule",
                        help="Baseline mode: 'rule' for rule-based, 'llm' for LLM-powered")
    parser.add_argument("--api-key", type=str, default=None,
                        help="OpenAI API key (or set OPENAI_API_KEY env var)")
    parser.add_argument("--task", type=str, default=None,
                        help="Run a specific task only (easy/medium/hard)")
    args = parser.parse_args()

    print("=" * 60)
    print("API Integration Debugging — Baseline Inference")
    print("=" * 60)

    task_ids = [args.task] if args.task else get_all_task_ids()
    all_results = {}

    for task_id in task_ids:
        print(f"\n{'─' * 40}")
        print(f"Task: {task_id}")
        print(f"{'─' * 40}")
        if args.mode == "llm":
            result = run_llm_baseline(task_id, args.api_key)
        else:
            result = run_rule_based_baseline(task_id)
        all_results[task_id] = result
        print(f" Score: {result['score']}")
        print(f" Reward: {result['total_reward']}")
        print(f" Steps: {result['steps_used']}")
        print(f" Issues found: {result['issues_found']}/{result['issues_total']}")
        print(f" Issues fixed: {result['issues_fixed']}/{result['issues_total']}")

    print(f"\n{'=' * 60}")
    print("Summary")
    print(f"{'=' * 60}")
    for tid, res in all_results.items():
        print(f" {tid:8s} score={res['score']:.4f} fixed={res['issues_fixed']}/{res['issues_total']}")

    avg_score = sum(r["score"] for r in all_results.values()) / len(all_results)
    print(f"\n Average score: {avg_score:.4f}")

    return all_results


if __name__ == "__main__":
    main()