# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Client for the deployed opencode_env server.
The server exposes a single MCP tool ``run_rollout`` that runs one OpenCode
rollout in an E2B sandbox and returns a JSON-serialized :class:`RolloutResult`.
Example::
import os
from opencode_env import OpenCodeEnv
with OpenCodeEnv(base_url="https://adithya-sk-opencode-env.hf.space") as env:
env.reset()
result = env.run_rollout(
base_url="https://api.openai.com/v1",
api_key=os.environ["OPENAI_API_KEY"],
model="gpt-4o-mini",
instruction="Create binary_search.py exposing def binary_search(arr, target) -> int...",
setup=[],
verify=["python /home/user/test.py"],
task_id="binary_search_v1",
)
print(result.reward, len(result.proxy_turns))
"""
from __future__ import annotations
import json
from typing import Any
from openenv.core.mcp_client import MCPToolClient
try:
from .models import RolloutResult
except ImportError: # pragma: no cover
from models import RolloutResult # type: ignore
class OpenCodeEnv(MCPToolClient):
    """Typed client for the opencode_env MCP server.

    All transport plumbing — ``reset`` / ``call_tool`` / ``list_tools`` /
    ``from_docker_image`` and the context-manager protocol — comes from
    :class:`MCPToolClient`; this subclass only adds a typed wrapper around
    the server's single ``run_rollout`` tool.
    """

    def run_rollout(
        self,
        *,
        # Endpoint — pass either the shorthand selector OR explicit fields.
        endpoint: str = "",  # "vllm" | "openai" | "hf_router"
        base_url: str = "",
        api_key: str = "",
        model: str = "",
        # Task — the "list of bash commands" shape
        instruction: str,
        setup: list[str] | None = None,
        verify: list[str] | None = None,
        # Bookkeeping / tunables
        task_id: str = "",
        mode: str = "transparent_proxy",
        disable_thinking: bool | None = None,
        max_tokens_cap: int = 4096,
        top_logprobs: int = 5,
        agent_timeout_s: float = 600.0,
        template: str = "",
    ) -> RolloutResult:
        """Execute a single OpenCode rollout and return its typed result.

        Args:
            endpoint: Shorthand endpoint selector (``"vllm"`` | ``"openai"``
                | ``"hf_router"``) — an alternative to spelling out the
                explicit ``base_url`` / ``api_key`` / ``model`` fields.
            base_url: OpenAI-compatible LLM endpoint (with trailing /v1).
            api_key: Bearer token for the LLM. Use ``"intercepted"`` for vLLM
                if it doesn't enforce auth.
            model: Model id understood by the LLM endpoint
                (e.g. ``"gpt-4o-mini"``, ``"Qwen/Qwen3.5-4B"``,
                ``"Qwen/Qwen3-4B-Instruct-2507:nscale"``).
            instruction: Prompt passed to ``opencode run``.
            setup: Bash commands run sequentially **before** the agent starts.
                Each command runs in the sandbox; non-zero exit aborts setup.
            verify: Bash commands run sequentially **after** the agent exits.
                Reward = ``passed_count / total`` unless any command writes a
                float to ``/home/user/logs/verifier/reward.txt`` (override).
            task_id: Echoed back in the result for traceability.
            mode: ``"transparent_proxy"`` (captures per-token logprobs via
                an in-sandbox FastAPI proxy) or ``"black_box"`` (no proxy).
            disable_thinking: Inject
                ``chat_template_kwargs.enable_thinking=false`` on forwarded
                requests. Needed for Qwen3.5 vLLM; harmless on Instruct
                variants; rejected by OpenAI direct.
            max_tokens_cap: Clamp on per-turn ``max_tokens``. OpenCode asks
                for ~32k by default; gpt-4o-mini caps at 16k.
            top_logprobs: Top-k logprobs requested upstream. HF Router caps
                at 5; OpenAI accepts up to 20; vLLM is unbounded.
            agent_timeout_s: Hard wall-clock budget for one ``opencode run``.
            template: E2B template name (e.g. ``"opencode-rl"``). Empty
                string uses the default (slow) base image.

        Returns:
            A :class:`RolloutResult` with reward, per-turn logprobs, file
            outputs, setup/verify results, and diagnostic tails.
        """
        # Assemble the tool arguments once so the call site stays flat;
        # list() copies guard against callers mutating their lists later.
        payload: dict[str, Any] = {
            "endpoint": endpoint,
            "base_url": base_url,
            "api_key": api_key,
            "model": model,
            "instruction": instruction,
            "setup": [] if setup is None else list(setup),
            "verify": [] if verify is None else list(verify),
            "task_id": task_id,
            "mode": mode,
            "disable_thinking": disable_thinking,
            "max_tokens_cap": max_tokens_cap,
            "top_logprobs": top_logprobs,
            "agent_timeout_s": agent_timeout_s,
            "template": template,
        }
        raw = self.call_tool("run_rollout", **payload)
        # The MCP layer may hand back several shapes; _extract_text
        # normalizes them to the JSON string the model was serialized as.
        return RolloutResult.model_validate_json(_extract_text(raw))
def _extract_text(result: Any) -> str:
"""Pull the JSON text out of whatever shape the MCP layer returns.
Handles the three shapes :meth:`MCPToolClient.call_tool` may surface:
a raw string, a ``CallToolObservation``-like object with
``.result.content[0].text``, or a dict with ``content[0]["text"]``.
"""
if isinstance(result, str):
return result
inner = getattr(result, "result", None)
if inner is not None:
content = getattr(inner, "content", None)
if content:
first = content[0]
text = getattr(first, "text", None)
if isinstance(text, str):
return text
if isinstance(first, dict) and "text" in first:
return first["text"]
if isinstance(result, dict):
content = result.get("content")
if isinstance(content, list) and content:
first = content[0]
if isinstance(first, dict) and "text" in first:
return first["text"]
nested = result.get("result")
if isinstance(nested, dict):
content = nested.get("content")
if isinstance(content, list) and content:
first = content[0]
if isinstance(first, dict) and "text" in first:
return first["text"]
return json.dumps(result, default=str)
content = getattr(result, "content", None)
if content:
first = content[0]
text = getattr(first, "text", None)
if isinstance(text, str):
return text
return str(result)