Spaces:

openenv
/

repl_env-0.2.2

Running on CPU Upgrade

App Files Files Community

repl_env-0.2.2 / server /repl_environment.py

burtenshaw HF Staff

Upload folder using huggingface_hub

5a88917 verified 1 day ago

raw

history blame contribute delete

24.8 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the BSD-style license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	REPL Environment Implementation.

	A Python REPL environment for training language models on code execution tasks,
	based on the Recursive Language Models (RLM) paradigm.

	References:
	- RLM Paper: https://arxiv.org/abs/2512.24601
	- Prime Intellect Blog: https://www.primeintellect.ai/blog/rlm
	- Alex Zhang Blog: https://alexzhang13.github.io/blog/2025/rlm/
	"""

	import os
	import re
	from collections.abc import Callable
	from typing import Any, List, Optional
	from uuid import uuid4

	try:
	from openenv.core.env_server.interfaces import Environment
	from openenv.core.env_server.types import EnvironmentMetadata
	except ImportError:
	from openenv.core.env_server.interfaces import Environment
	from openenv.core.env_server.types import EnvironmentMetadata

	try:
	from ..models import CodeBlockResult, REPLAction, REPLObservation, REPLState
	except ImportError:
	try:
	from repl_env.models import CodeBlockResult, REPLAction, REPLObservation, REPLState
	except ImportError:
	from models import CodeBlockResult, REPLAction, REPLObservation, REPLState

	try:
	from ..recursive_controller import create_server_recursive_controller
	from ..rubrics import REPLRubric
	from .python_executor import PythonExecutor
	except ImportError:
	try:
	from repl_env.recursive_controller import create_server_recursive_controller
	from repl_env.rubrics import REPLRubric
	from .python_executor import PythonExecutor
	except ImportError:
	from .python_executor import PythonExecutor
	from recursive_controller import create_server_recursive_controller
	from rubrics import REPLRubric


	class REPLEnvironment(Environment):
	"""
	A REPL environment for training language models to use code execution.

	Based on the Recursive Language Models (RLM) paradigm, this environment allows
	language models to:
	- Execute Python code in a sandboxed REPL
	- Work with large contexts loaded as variables
	- Finalize answers via FINAL(), FINAL_VAR(), or answer dict pattern
	- Optionally make recursive LLM calls via llm_query() / llm_query_batched()

	Supports two finalization patterns:
	1. RLM-style: print('FINAL(answer)') or print('FINAL_VAR(var_name)')
	2. Prime Intellect style: answer = {"content": "...", "ready": True}

	An episode also ends when the action sets ``is_final`` or when the
	iteration counter reaches ``max_iterations``.

	Example:
	>>> env = REPLEnvironment(context="Hello World", task_prompt="Count chars")
	>>> obs = env.reset()
	>>> print(obs.context_preview) # "Hello World"
	>>>
	>>> obs = env.step(REPLAction(code="result = len(context)"))
	>>> print(obs.result.success) # True
	>>> print(obs.available_variables) # ["context", "result", "answer"]
	>>>
	>>> obs = env.step(REPLAction(code="print(f'FINAL({result})')"))
	>>> print(obs.done) # True
	>>> print(obs.metadata["final_answer"]) # "11"
	"""

	# Class-level flag; presumably read by the OpenEnv server to decide whether
	# several sessions may use this environment class at once -- confirm
	# against the Environment base class.
	SUPPORTS_CONCURRENT_SESSIONS = True

	def __init__(
	self,
	context: Optional[str] = None,
	task_prompt: Optional[str] = None,
	max_iterations: int = 30,
	max_output_length: int = 8192,
	context_preview_length: int = 500,
	rubric: Optional[REPLRubric] = None,
	llm_query_fn: Optional[Callable[[str], str]] = None,
	llm_batch_fn: Optional[Callable[[List[str]], List[str]]] = None,
	subcall_fn: Optional[Callable[[str, Optional[str]], str]] = None,
	subcall_batch_fn: Optional[
	Callable[[List[str], Optional[str]], List[str]]
	] = None,
	rlm_max_depth: int = 1,
	rlm_max_iterations: int \| None = None,
	):
	"""Initialize the REPL environment.

	Args:
	context: Initial context to load (can also be set via REPL_CONTEXT env var)
	task_prompt: Task description (can also be set via REPL_TASK_PROMPT env var)
	max_iterations: Maximum steps per episode (default 30, env var REPL_MAX_ITERATIONS)
	max_output_length: Max chars for stdout/stderr per turn (default 8192)
	context_preview_length: Chars to show in context preview (default 500)
	rubric: Optional REPLRubric for reward computation (default: REPLRubric())
	llm_query_fn: Optional function for llm_query() support
	llm_batch_fn: Optional function for llm_query_batched() support
	subcall_fn: Optional function for recursive rlm_query() support
	subcall_batch_fn: Optional function for recursive rlm_query_batched() support
	rlm_max_depth: Max recursion depth for server-backed rlm_query()
	rlm_max_iterations: Max iterations for recursive child runners
	"""
	self.initial_context = context or os.environ.get("REPL_CONTEXT", "")
	self.initial_task_prompt = task_prompt or os.environ.get("REPL_TASK_PROMPT", "")
	self.max_iterations = int(os.environ.get("REPL_MAX_ITERATIONS", max_iterations))
	self.max_output_length = max_output_length
	self.context_preview_length = context_preview_length

	# Rubric for reward computation (OpenEnv RFC 004)
	self.rubric = rubric or REPLRubric()

	# Optional LLM functions for recursive calls
	self.llm_query_fn = llm_query_fn
	self.llm_batch_fn = llm_batch_fn
	self.subcall_fn = subcall_fn
	self.subcall_batch_fn = subcall_batch_fn
	self.rlm_max_depth = rlm_max_depth
	self.rlm_max_iterations = rlm_max_iterations or max_iterations

	# State (initialized on reset)
	self._state: Optional[REPLState] = None
	self._executor: Optional[PythonExecutor] = None
	self._runtime_controller = None
	self._runtime_controller_chat_fn: Optional[Callable[..., str]] = None

	@staticmethod
	def _build_hf_chat_fn(
	hf_token: Optional[str] = None,
	llm_model: Optional[str] = None,
	) -> Callable[..., str]:
	try:
	from huggingface_hub import InferenceClient, InferenceTimeoutError
	except ImportError:
	raise RuntimeError("huggingface_hub is required for HF-backed recursion")

	default_model = llm_model or os.environ.get("LLM_MODEL", "Qwen/Qwen3.5-9B")
	client = InferenceClient(model=default_model, token=hf_token, timeout=300)

	def chat_fn(messages: list[dict[str, str]], model: str \| None = None) -> str:
	try:
	response = client.chat.completions.create(
	model=model or default_model,
	messages=messages,
	max_tokens=2048,
	# Qwen3.5 non-thinking mode for precise coding tasks (from model card)
	temperature=0.6,
	top_p=0.95,
	presence_penalty=0.0,
	extra_body={
	"top_k": 20,
	"min_p": 0.0,
	"repetition_penalty": 1.0,
	"chat_template_kwargs": {"enable_thinking": False},
	},
	)
	return response.choices[0].message.content or ""
	except InferenceTimeoutError:
	return "Error: LLM inference timed out"
	except Exception as e:
	return f"Error: {e}"

	return chat_fn

	def _create_llm_functions(
	    self,
	    hf_token: Optional[str],
	    llm_model: Optional[str] = None,
	) -> None:
	    """Wire up LLM and recursive-subcall callables from a client token.

	    Builds an HF chat callable, wraps it in a server recursive controller
	    configured with the current ``rlm_max_depth`` / ``rlm_max_iterations``,
	    and points the four query attributes at that controller. If the chat
	    callable cannot be built (RuntimeError), nothing is changed.

	    Args:
	        hf_token: Token forwarded to ``_build_hf_chat_fn``.
	        llm_model: Optional model name forwarded to ``_build_hf_chat_fn``.
	    """
	    try:
	        hf_chat = self._build_hf_chat_fn(hf_token, llm_model)
	    except RuntimeError:
	        # HF backend unavailable: leave the environment without LLM helpers.
	        return None

	    # Keep the chat callable so the controller can be rebuilt later
	    # without asking for credentials again.
	    self._runtime_controller_chat_fn = hf_chat

	    controller = create_server_recursive_controller(
	        hf_chat,
	        max_depth=self.rlm_max_depth,
	        max_iterations=self.rlm_max_iterations,
	    )
	    self._runtime_controller = controller

	    self.llm_query_fn = controller.llm_query_fn
	    self.llm_batch_fn = controller.llm_batch_fn
	    self.subcall_fn = controller.rlm_query_fn
	    self.subcall_batch_fn = controller.rlm_batch_fn

	def reset(
	    self,
	    seed: Optional[int] = None,
	    episode_id: Optional[str] = None,
	    context: Optional[str] = None,
	    task_prompt: Optional[str] = None,
	    hf_token: Optional[str] = None,
	    llm_model: Optional[str] = None,
	    **kwargs: Any,
	) -> REPLObservation:
	    """Reset the environment with optional new context.

	    Creates a fresh REPLState and PythonExecutor, seeds the namespace with
	    the ``answer`` dict and (if present) the context, and injects the
	    helper functions FINAL, FINAL_VAR, SHOW_VARS and any available LLM
	    helpers.

	    Args:
	        seed: Accepted for interface compatibility; not used by this method.
	        episode_id: Optional episode identifier (if not provided, one is generated)
	        context: Context to load (falls back to initial_context when falsy)
	        task_prompt: Task description (falls back to initial_task_prompt when falsy)
	        hf_token: Optional HuggingFace token for llm_query/llm_query_batched.
	            Only consulted when no llm_query_fn is configured yet; it is
	            passed to _create_llm_functions and is not written to REPLState.
	        llm_model: Optional model name for LLM functions (default: from env or Qwen3.5-9B)
	        **kwargs: Additional reset parameters including:
	            expected_answer: Ground truth for rubric-based reward scoring
	            rlm_max_depth: Override max recursion depth
	            rlm_max_iterations: Override max iterations for recursive child runners

	    Returns:
	        Initial REPLObservation with environment ready message
	    """
	    effective_context = context or self.initial_context
	    effective_task_prompt = task_prompt or self.initial_task_prompt

	    # Set expected answer for rubric-based reward computation
	    expected_answer = kwargs.get("expected_answer")
	    self.rubric.reset()
	    if expected_answer is not None:
	        self.rubric.set_expected(expected_answer)

	    runtime_rlm_max_depth = kwargs.get("rlm_max_depth")
	    if runtime_rlm_max_depth is None:
	        runtime_rlm_max_depth = self.rlm_max_depth
	    runtime_rlm_max_depth = int(runtime_rlm_max_depth)

	    runtime_rlm_max_iterations = kwargs.get("rlm_max_iterations")
	    if runtime_rlm_max_iterations is None:
	        runtime_rlm_max_iterations = self.rlm_max_iterations
	    runtime_rlm_max_iterations = int(runtime_rlm_max_iterations)

	    # Detect if recursion config changed — controller must be rebuilt
	    depth_changed = (
	        runtime_rlm_max_depth != self.rlm_max_depth
	        or runtime_rlm_max_iterations != self.rlm_max_iterations
	    )
	    self.rlm_max_depth = runtime_rlm_max_depth
	    self.rlm_max_iterations = runtime_rlm_max_iterations

	    # Create or rebuild LLM functions when needed.
	    # Token resolution: explicit hf_token > HF_TOKEN env var > cached HF login.
	    if not self.llm_query_fn:
	        effective_token = (
	            hf_token if hf_token is not None else os.environ.get("HF_TOKEN")
	        )
	        self._create_llm_functions(effective_token, llm_model)
	    elif depth_changed and self._runtime_controller is not None:
	        # Rebuild controller with new depth/iteration config but reuse
	        # the existing chat_fn — don't require re-providing credentials.
	        self._runtime_controller.close()
	        self._runtime_controller = create_server_recursive_controller(
	            self._runtime_controller_chat_fn,
	            max_depth=self.rlm_max_depth,
	            max_iterations=self.rlm_max_iterations,
	        )
	        self.llm_query_fn = self._runtime_controller.llm_query_fn
	        self.llm_batch_fn = self._runtime_controller.llm_batch_fn
	        self.subcall_fn = self._runtime_controller.rlm_query_fn
	        self.subcall_batch_fn = self._runtime_controller.rlm_batch_fn

	    # Initialize state
	    self._state = REPLState(
	        episode_id=episode_id or str(uuid4()),
	        step_count=0,
	        context=effective_context,
	        task_prompt=effective_task_prompt,
	        iteration=0,
	        max_iterations=self.max_iterations,
	        namespace_keys=[],
	        final_answer=None,
	        total_execution_time=0.0,
	    )

	    # Initialize executor
	    self._executor = PythonExecutor(max_output_length=self.max_output_length)

	    # Initialize answer dict (Prime Intellect style)
	    self._executor.set_variable("answer", {"content": "", "ready": False})

	    # Load context into namespace if provided
	    if effective_context:
	        self._executor.set_context(effective_context)

	    # The wrappers below read self.<fn> at call time, so they follow any
	    # later replacement of the underlying callables.
	    def _call_single_query(prompt: str, model: str | None = None) -> str:
	        if not self.llm_query_fn:
	            raise RuntimeError("llm_query is not configured")
	        try:
	            return self.llm_query_fn(prompt, model)  # type: ignore[misc]
	        except TypeError:
	            # Fallback for callables that only accept the prompt.
	            # NOTE(review): a TypeError raised inside the callable also
	            # lands here and triggers a second call.
	            return self.llm_query_fn(prompt)  # type: ignore[misc]

	    def _call_batched_query(
	        prompts: List[str], model: str | None = None
	    ) -> List[str]:
	        if not self.llm_batch_fn:
	            raise RuntimeError("llm_query_batched is not configured")
	        try:
	            return self.llm_batch_fn(prompts, model)  # type: ignore[misc]
	        except TypeError:
	            # Same single-argument fallback as _call_single_query.
	            return self.llm_batch_fn(prompts)  # type: ignore[misc]

	    def _call_recursive_query(prompt: str, model: str | None = None) -> str:
	        # Without a recursive subcall function, degrade to a plain query.
	        if self.subcall_fn is None:
	            return _call_single_query(prompt, model)
	        return self.subcall_fn(prompt, model)

	    def _call_recursive_batched(
	        prompts: List[str], model: str | None = None
	    ) -> List[str]:
	        if not prompts:
	            return []
	        if self.subcall_batch_fn is not None:
	            return self.subcall_batch_fn(prompts, model)
	        return _call_batched_query(prompts, model)

	    # Inject LLM functions if provided
	    # Names: llm_query (single), llm_query_batched (official RLM), llm_batch (alias)
	    if self.llm_query_fn:
	        self._executor.inject_function("llm_query", _call_single_query)
	    if self.llm_batch_fn:
	        self._executor.inject_function(
	            "llm_query_batched", _call_batched_query
	        )  # Official name
	        self._executor.inject_function("llm_batch", _call_batched_query)  # Alias
	    if self.llm_query_fn or self.subcall_fn:
	        self._executor.inject_function("rlm_query", _call_recursive_query)
	    if self.llm_batch_fn or self.subcall_batch_fn:
	        self._executor.inject_function("rlm_query_batched", _call_recursive_batched)

	    # Inject FINAL helper function so both FINAL(x) and print(f'FINAL({x})') work
	    # Returns the FINAL pattern as a string so it appears in stdout for detection
	    def final_helper(value):
	        """Helper that returns FINAL(value) string for detection."""
	        return f"FINAL({value})"

	    self._executor.inject_function("FINAL", final_helper)

	    # Inject FINAL_VAR helper that looks up variable and returns FINAL(value)
	    # This matches official RLM behavior - strips quotes from var_name and looks up in namespace
	    executor = self._executor  # Capture for closure

	    def final_var_helper(var_name: str):
	        """Look up variable by name and return FINAL(value) for detection."""
	        # Strip quotes if present (handles both FINAL_VAR("x") and FINAL_VAR(x))
	        var_name_clean = str(var_name).strip().strip("\"'")
	        # Look up variable in executor namespace
	        value = executor.get_variable(var_name_clean)
	        if value is not None:
	            return f"FINAL({value})"
	        return f"FINAL_VAR({var_name_clean})"  # Fallback for regex detection

	    self._executor.inject_function("FINAL_VAR", final_var_helper)

	    def show_vars_helper():
	        """Return the sorted result of the executor's list_variables()."""
	        return sorted(executor.list_variables())

	    self._executor.inject_function("SHOW_VARS", show_vars_helper)

	    # Update namespace keys
	    self._state.namespace_keys = self._executor.list_variables()

	    # Build initial message
	    message_parts = ["REPL environment initialized."]
	    if effective_context:
	        message_parts.append(
	            f"Context loaded ({len(effective_context)} chars). Use 'context' variable to access it."
	        )
	    if effective_task_prompt:
	        message_parts.append(f"Task: {effective_task_prompt}")
	    message_parts.append(
	        "Use answer['content'] to store your answer, and set answer['ready'] = True when done."
	    )

	    return REPLObservation(
	        result=CodeBlockResult(
	            stdout="\n".join(message_parts),
	            stderr="",
	            locals_snapshot={},
	            execution_time=0.0,
	            success=True,
	            exception=None,
	        ),
	        context_preview=(
	            effective_context[: self.context_preview_length]
	            if effective_context
	            else None
	        ),
	        context_length=len(effective_context) if effective_context else 0,
	        available_variables=self._state.namespace_keys,
	        iteration=0,
	        max_iterations=self.max_iterations,
	        done=False,
	        metadata={
	            "task_prompt": effective_task_prompt,
	            "message": "Environment ready.",
	        },
	    )

	def step(
	self,
	action: REPLAction,
	timeout_s: Optional[float] = None,
	**kwargs: Any,
	) -> REPLObservation:
	"""Execute code and return observation.

	Args:
	action: REPLAction containing code to execute
	timeout_s: Optional timeout in seconds (not currently used)
	**kwargs: Additional step parameters

	Returns:
	REPLObservation with execution results
	"""
	if self._state is None or self._executor is None:
	raise RuntimeError("Environment not initialized. Call reset() first.")

	self._state.step_count += 1
	self._state.iteration += 1

	# Check if agent explicitly signals final answer
	if action.is_final:
	self._state.final_answer = action.final_answer or ""
	obs = self._create_final_observation(
	success=True,
	message="Final answer submitted.",
	)
	obs.reward = self._apply_rubric(action, obs)
	return obs

	# Check iteration limit
	if self._state.iteration >= self.max_iterations:
	# Check if there's a partial answer in the answer dict
	answer_var = self._executor.get_variable("answer")
	if isinstance(answer_var, dict) and answer_var.get("content"):
	self._state.final_answer = str(answer_var.get("content", ""))
	obs = self._create_final_observation(
	success=False,
	message=f"Maximum iterations ({self.max_iterations}) reached.",
	)
	obs.reward = self._apply_rubric(action, obs)
	return obs

	# Execute code
	result = self._executor.execute(action.code)
	self._state.total_execution_time += result["execution_time"]
	self._state.namespace_keys = self._executor.list_variables()

	# Check for final answer patterns
	final_answer = self._extract_final_answer(result["stdout"])
	done = final_answer is not None

	if done:
	self._state.final_answer = final_answer

	obs = REPLObservation(
	result=CodeBlockResult(
	stdout=result["stdout"],
	stderr=result["stderr"],
	locals_snapshot=result["locals_snapshot"],
	execution_time=result["execution_time"],
	success=result["success"],
	exception=result["exception"],
	),
	context_preview=(
	self._state.context[: self.context_preview_length]
	if self._state.context
	else None
	),
	context_length=len(self._state.context) if self._state.context else 0,
	available_variables=self._state.namespace_keys,
	iteration=self._state.iteration,
	max_iterations=self.max_iterations,
	done=done,
	metadata={
	"task_prompt": self._state.task_prompt,
	"final_answer": final_answer,
	"execution_time": result["execution_time"],
	},
	)
	obs.reward = self._apply_rubric(action, obs)
	return obs

	def _extract_final_answer(self, stdout: str) -> Optional[str]:
	"""Extract final answer from output.

	Supports multiple patterns:
	1. RLM-style: FINAL(answer) in stdout
	2. RLM-style: FINAL_VAR(variable_name) in stdout
	3. Prime Intellect style: answer = {"content": "...", "ready": True} in namespace

	Args:
	stdout: Standard output from code execution

	Returns:
	Final answer string or None if not found
	"""
	# Pattern 1: RLM-style FINAL(answer)
	final_match = re.search(r"FINAL\((.*?)\)", stdout, re.DOTALL)
	if final_match:
	return final_match.group(1).strip()

	# Pattern 2: RLM-style FINAL_VAR(variable_name)
	final_var_match = re.search(r"FINAL_VAR\((\w+)\)", stdout)
	if final_var_match and self._executor:
	var_name = final_var_match.group(1)
	value = self._executor.get_variable(var_name)
	if value is not None:
	return str(value)

	# Pattern 3: Prime Intellect style answer dict
	if self._executor:
	answer_var = self._executor.get_variable("answer")
	if isinstance(answer_var, dict):
	if answer_var.get("ready", False):
	return str(answer_var.get("content", ""))

	return None

	def _create_final_observation(self, success: bool, message: str) -> REPLObservation:
	    """Build the terminal observation that closes an episode.

	    The observation carries ``message`` as stdout, has ``done=True`` and
	    summarises the episode (final answer, total execution time, iteration
	    count) in its metadata. Missing state is reported as None / 0.

	    Args:
	        success: Whether the episode ended successfully
	        message: Termination message

	    Returns:
	        Final REPLObservation with done=True (reward set by rubric)
	    """
	    current = self._state
	    if current:
	        iterations = current.iteration
	        answer = current.final_answer
	        elapsed_total = current.total_execution_time
	    else:
	        iterations = 0
	        answer = None
	        elapsed_total = 0

	    terminal_result = CodeBlockResult(
	        stdout=message,
	        stderr="",
	        locals_snapshot={},
	        execution_time=0.0,
	        success=success,
	        exception=None,
	    )
	    summary = {
	        "final_answer": answer,
	        "total_execution_time": elapsed_total,
	        "total_iterations": iterations,
	    }
	    return REPLObservation(
	        result=terminal_result,
	        context_preview=None,
	        context_length=0,
	        available_variables=[],
	        iteration=iterations,
	        max_iterations=self.max_iterations,
	        done=True,
	        metadata=summary,
	    )

	@property
	def state(self) -> REPLState:
	"""Get the current environment state.

	Returns:
	Current REPLState

	Raises:
	RuntimeError: If environment not initialized
	"""
	if self._state is None:
	raise RuntimeError("Environment not initialized. Call reset() first.")
	return self._state

	def close(self) -> None:
	"""Cleanup resources."""
	if self._runtime_controller is not None:
	self._runtime_controller.close()
	self._runtime_controller = None
	self._executor = None
	self._state = None

	def get_metadata(self) -> EnvironmentMetadata:
	    """Describe this environment.

	    Returns:
	        EnvironmentMetadata carrying the fixed name, description and
	        version strings of the REPL environment
	    """
	    info = {
	        "name": "repl_env",
	        "description": "Python REPL environment for RLM-style code execution",
	        "version": "0.1.0",
	    }
	    return EnvironmentMetadata(**info)