# ghostexec/tests/test_phase4.py
# (header reconstructed from upload-page residue: "modelbuilderhq — Upload folder
# using huggingface_hub — commit ee21104 verified")
"""Phase 4: reward sub-scores, aggregation, logging, schema drift."""
import json
import random
import statistics
from pathlib import Path
import pytest
from ghostexec.models import GhostexecAction
from ghostexec.server import reward as reward_mod
from ghostexec.server.reward import aggregate_scores
from ghostexec.server.ghostexec_environment import GhostexecEnvironment
# Fixture paths resolved relative to the directory one level above tests/
# (presumably the package root containing scenarios/ — confirmed by usage below).
ROOT = Path(__file__).resolve().parents[1]
SCENARIO = ROOT / "scenarios" / "phase2_core.json"
DRIFT = ROOT / "scenarios" / "schema_drift_test.json"
def test_reward_weights_and_aggregator_helpers():
    """aggregate_scores scales the weighted inner sum by WEIGHTED_OUTPUT_SCALE."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    conflict, rel, task = 1.0, -1.0, 2.5
    inner = (
        reward_mod.W_CONFLICT * conflict
        + reward_mod.W_REL * rel
        + reward_mod.W_TASK * task
    )
    breakdown = aggregate_scores(
        conflict,
        rel,
        task,
        conflict_raw=conflict,
        critical_queue_bonus=0.0,
        weighted_inner=inner,
        weighted_base_only=inner,
        shaping_synergy=0.0,
        shaping_tradeoff=0.0,
        shaping_potential=0.0,
        shaping_scaffold=0.0,
        shaping_quality=0.0,
        action_ok=True,
        episode_done=False,
        world_after=world,
    )
    expected = reward_mod.WEIGHTED_OUTPUT_SCALE * inner
    assert breakdown.weighted_base == pytest.approx(expected)
def test_catastrophic_and_completion_bonuses_only_when_episode_done():
    """Completion bonus and catastrophic penalty fire only on terminal steps."""
    base = GhostexecEnvironment.load_world_from_json(SCENARIO)
    before = base.model_copy(deep=True)
    before.stress = 30
    after = before.model_copy(deep=True)
    noop = GhostexecAction(action_type="do_nothing")

    # Non-terminal step: neither end-of-episode term should appear.
    mid = reward_mod.compute_step_reward(before, after, noop, action_ok=True, episode_done=False)
    assert mid.episode_completion_bonus == 0.0
    assert mid.catastrophic_penalty == 0.0

    # Terminal step with a furious contact: both terms fire.
    furious_world = before.model_copy(deep=True)
    for idx, contact in enumerate(furious_world.contacts):
        if contact.name == "Marcus Webb":
            furious_world.contacts[idx] = contact.model_copy(update={"mood": "furious"})
            break
    end = reward_mod.compute_step_reward(before, furious_world, noop, action_ok=True, episode_done=True)
    assert end.episode_completion_bonus == pytest.approx(10.0)
    assert end.catastrophic_penalty == pytest.approx(-15.0)
def test_invalid_step_matches_do_nothing_subscores_plus_invalid_addon():
    """An invalid step's final reward tracks the noop's, shifted by the add-on delta."""
    world = GhostexecEnvironment.load_world_from_json(SCENARIO)
    idle = GhostexecAction(action_type="do_nothing")
    invalid = GhostexecAction(action_type="reply_email", email_id="missing", message_body="x")
    bd_idle = reward_mod.compute_step_reward(world, world, idle, action_ok=True, episode_done=False)
    bd_invalid = reward_mod.compute_step_reward(world, world, invalid, action_ok=False, episode_done=False)
    assert bd_invalid.invalid_step_adjustment == pytest.approx(-0.25)
    # do_nothing carries an additional strict additive floor (-0.15) not applied to invalid non-idle actions.
    assert bd_invalid.final == pytest.approx(bd_idle.final - (0.25 - 0.15))
def test_scripted_episode_reward_direction_and_log(tmp_path, monkeypatch):
    """A conflict-resolving step should out-reward a noop, and both get logged."""
    log_path = tmp_path / "rewards.jsonl"
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    monkeypatch.setattr(env, "_reward_log_path", log_path)

    reschedule = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    obs_resolve = env.step(reschedule)
    obs_idle = env.step(GhostexecAction(action_type="do_nothing"))

    assert obs_resolve.metadata.get("step_ok") is True
    assert obs_idle.metadata.get("step_ok") is True
    assert (obs_resolve.reward or 0) > (obs_idle.reward or 0)

    # One JSONL row per step, carrying the expected reward-telemetry fields.
    assert log_path.is_file()
    rows = log_path.read_text(encoding="utf-8").strip().splitlines()
    assert len(rows) >= 2
    first = json.loads(rows[0])
    assert "reward" in first and "episode_id" in first
    assert first.get("action_type") == "reschedule_meeting"
    assert "conflict_raw" in first and "step_ok" in first
    assert "shaping_total" in first and "shaping_to_base_ratio" in first
    assert "shaping_scaffold" in first
    assert first.get("reward_mode") == "full"
def test_reward_mode_base_turns_off_shaping_terms():
    """With reward_mode='base' every shaping sub-score must be zero."""
    env = GhostexecEnvironment(SCENARIO, reward_mode="base")
    env.reset()
    action = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    obs = env.step(action)
    breakdown = (obs.metadata or {}).get("reward_breakdown") or {}
    for key in ("shaping_synergy", "shaping_tradeoff", "shaping_potential"):
        assert float(breakdown.get(key) or 0.0) == pytest.approx(0.0)
def test_schema_drift_events_mutate_world():
    """Scheduled drift events should rewrite contacts, tasks, and suppression state."""
    env = GhostexecEnvironment(SCENARIO, schema_drift_events_path=DRIFT)
    env.reset()

    # Step 1: drift shifts something and records it in the action log.
    first = env.step(GhostexecAction(action_type="do_nothing"))
    assert first.metadata.get("step_ok") is True
    assert any("schema drift: shifted" in entry for entry in env.world.action_log)

    # Step 2: Sarah Chen's preferred channel flips to text.
    env.step(GhostexecAction(action_type="do_nothing"))
    sarah = env.get_contact("Sarah Chen")
    assert sarah is not None
    assert sarah.communication_preference == "text"

    # Step 3: task t02's deadline moves and Marcus Webb becomes suppressed.
    env.step(GhostexecAction(action_type="do_nothing"))
    t02 = next(t for t in env.world.tasks if t.id == "t02")
    assert t02.deadline == "2026-04-21T07:00:00"
    assert "Marcus Webb" in env._reply_relationship_suppressed  # noqa: SLF001
def test_rewards_differ_between_helpful_and_idle_steps():
    """A useful reschedule and an idle step must not score identically."""
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    helpful = GhostexecAction(
        action_type="reschedule_meeting",
        meeting_id="m02",
        new_time="2026-04-21T18:00:00",
    )
    reward_helpful = env.step(helpful).reward
    reward_idle = env.step(GhostexecAction(action_type="do_nothing")).reward
    assert reward_helpful is not None and reward_idle is not None
    assert reward_helpful != reward_idle
# Whitelisted reschedules (known non-overlapping targets for phase2_core at 08:00).
# Each (meeting_id, new_time) pair is consumed round-robin by the stochastic
# policy test, so every entry must be safe to apply in any order.
_SAFE_RESCHEDULES: list[tuple[str, str]] = [
    ("m02", "2026-04-21T18:00:00"),
    ("m03", "2026-04-21T18:30:00"),
    ("m06", "2026-04-21T20:00:00"),
    ("m09", "2026-04-21T21:00:00"),
]
def test_seeded_stochastic_policy_reward_spread():
    """A seeded mixed policy should yield non-degenerate reward variance."""
    random.seed(1234)
    n_steps = 80
    archive_ids = [f"e{i:02d}" for i in range(1, 31)]
    contacts = ["Jordan Lee", "Jamie Liu", "Marcus Webb", "Sarah Chen"]
    env = GhostexecEnvironment(SCENARIO)
    env.reset()
    rewards: list[float] = []
    archive_idx = 0
    resched_idx = 0
    for _ in range(n_steps):
        roll = random.random()
        if roll < 0.32:
            obs = env.step(GhostexecAction(action_type="do_nothing"))
        elif roll < 0.58:
            email_id = archive_ids[archive_idx % len(archive_ids)]
            archive_idx += 1
            obs = env.step(GhostexecAction(action_type="archive_email", email_id=email_id))
        elif roll < 0.78:
            meeting_id, new_time = _SAFE_RESCHEDULES[resched_idx % len(_SAFE_RESCHEDULES)]
            resched_idx += 1
            obs = env.step(
                GhostexecAction(action_type="reschedule_meeting", meeting_id=meeting_id, new_time=new_time)
            )
        else:
            # Deliberately shares the archive counter to cycle contacts,
            # matching the original policy's index reuse.
            contact = contacts[archive_idx % len(contacts)]
            archive_idx += 1
            obs = env.step(
                GhostexecAction(
                    action_type="send_message",
                    contact_name=contact,
                    message_body="Quick sync on priorities.",
                )
            )
        assert obs.reward is not None
        rewards.append(float(obs.reward))

    spread = statistics.pstdev(rewards)
    ordered = sorted(rewards)
    last = len(ordered) - 1
    p5 = ordered[max(0, int(0.05 * last))]
    p95 = ordered[min(last, int(0.95 * last))]
    assert spread > 0.06
    assert (p95 - p5) > 0.09
def test_good_script_beats_do_nothing_spam_on_mean_reward():
    """A short productive script should beat a do-nothing baseline by a margin.

    Fix: assert rewards are present before averaging, so a missing reward
    fails with a clear assertion instead of a TypeError inside float(None)
    (matching the non-None checks the other tests in this file perform).
    """
    good = GhostexecEnvironment(SCENARIO)
    good.reset()
    good_actions = [
        GhostexecAction(
            action_type="reschedule_meeting",
            meeting_id="m02",
            new_time="2026-04-21T18:00:00",
        ),
        GhostexecAction(action_type="reply_email", email_id="e01", message_body="Drafting revised figures now."),
        GhostexecAction(action_type="archive_email", email_id="e09"),
        GhostexecAction(
            action_type="send_message",
            contact_name="Jordan Lee",
            message_body="Standup notes attached.",
        ),
        GhostexecAction(action_type="complete_task", task_id="t06"),
    ]
    g_rewards = [good.step(a).reward for a in good_actions]
    assert all(r is not None for r in g_rewards)
    g_mean = sum(float(x) for x in g_rewards) / len(g_rewards)

    bad = GhostexecEnvironment(SCENARIO)
    bad.reset()
    b_rewards = [bad.step(GhostexecAction(action_type="do_nothing")).reward for _ in range(5)]
    assert all(r is not None for r in b_rewards)
    b_mean = sum(float(x) for x in b_rewards) / len(b_rewards)

    # Require a real margin over the baseline, not a tie-breaking epsilon.
    assert g_mean > b_mean + 0.2