import json
import argparse
import os
import requests
from typing import Any, Optional
from commitguard_env.parse_action import parse_action
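
# Evaluation harness for the CommitGuard HTTP environment: each episode resets the
# environment, steps it with model-generated (or mocked) <action> strings, and uses
# the reward returned at the verdict step as the sample's score.
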
def run_episode(env_url: str, sample_id: str, model_client: Any = None) -> float:
"""
Runs a full 5-step episode for a single sample.
"""
# 1. Reset
# In a real evaluate, we'd need a reset_to_id endpoint or just loop reset until ID matches.
# For now, we assume reset gives us a random sample and we track it.
r = requests.post(f"{env_url}/reset")
data = r.json()
obs = data["observation"]
total_reward = 0.0
done = False
step_count = 0
while not done and step_count < 5:
# Prompt model (Simplified for script)
if model_client:
action_str = model_client.generate(obs['diff'], obs['available_files'])
else:
# Mock: straight to verdict for evaluation baseline
action_str = f"<action><action_type>verdict</action_type><is_vulnerable>true</is_vulnerable></action>"
r = requests.post(f"{env_url}/step", json={"action": action_str})
res = r.json()
obs = res["observation"]
total_reward = res["reward"] # Environment returns cumulative or step reward?
# In CommitGuard, reward at verdict includes the outcome.
done = res["done"]
step_count += 1
return total_reward
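
# The only interface this script assumes of model_client is a
# generate(diff, available_files) method returning an <action> string.
# A minimal stand-in (illustrative only, not part of CommitGuard) could be:
#
#     class EchoVerdictClient:
#         def generate(self, diff: str, available_files: list) -> str:
#             return ("<action><action_type>verdict</action_type>"
#                     "<is_vulnerable>false</is_vulnerable></action>")
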
def evaluate(env_url: str, test_file: str, adapter_path: Optional[str] = None):
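    """
    Runs one episode per test sample and reports the average reward,
    writing per-sample results to eval_results.json.
    """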
    with open(test_file, "r") as f:
        test_samples = [json.loads(line) for line in f]

    # Load the model if an adapter is provided
    model_client = None
    if adapter_path:
        print(f"Loading LoRA adapter from {adapter_path}...")
        # (Integration with Unsloth/PEFT would go here)
        pass

    results = []
    print(f"Starting evaluation on {len(test_samples)} samples...")
    for sample in test_samples:
        reward = run_episode(env_url, sample["commit_id"], model_client)
        results.append({
            "commit_id": sample["commit_id"],
            "reward": reward,
            "cwe": sample.get("cwe_type"),
        })

    avg_reward = sum(r["reward"] for r in results) / len(results)
    print(f"Evaluation Complete. Average Reward: {avg_reward:.4f}")

    with open("eval_results.json", "w") as f:
        json.dump(results, f, indent=2)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-url", default="http://localhost:8000")
    parser.add_argument("--test-file", default="data/devign_test.jsonl")
    parser.add_argument("--adapter-path", default=None)
    args = parser.parse_args()

    evaluate(args.env_url, args.test_file, args.adapter_path)
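
# Example invocation (assuming this file is saved as evaluate.py and the CommitGuard
# environment server is already running at the given URL):
#   python evaluate.py --env-url http://localhost:8000 --test-file data/devign_test.jsonl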