databoysu committed
Commit b4f37fd · 1 Parent(s): beb93a1

fixing state machine
README.md CHANGED
@@ -2,7 +2,6 @@
  title: TraceFix-RL
  emoji: 🧑‍💻
  colorFrom: blue
- colorTo: indigo
  sdk: docker
  pinned: false
  app_port: 7860
@@ -109,12 +108,12 @@ Server endpoints:
  - `--easy`: run episode using easy-tier curriculum sampling.
  - `--medium`: run episode using medium-tier curriculum sampling.
  - `--hard`: run episode using hard-tier curriculum sampling.
- - `--debug`: print raw model response snippets for troubleshooting.
+ - `--thought`: include model thought traces in internal prompt history.

  Example:

  ```bash
- python inference.py --medium --debug
+ python inference.py --medium --thought
  ```

  The script also enforces a model-thinking/output cap:
 
__pycache__/models.cpython-312.pyc CHANGED
Binary files a/__pycache__/models.cpython-312.pyc and b/__pycache__/models.cpython-312.pyc differ
 
__pycache__/tasks.cpython-312.pyc CHANGED
Binary files a/__pycache__/tasks.cpython-312.pyc and b/__pycache__/tasks.cpython-312.pyc differ
 
inference.py CHANGED
@@ -47,8 +47,7 @@ BENCHMARK = os.getenv("BENCHMARK", "tracefix_rl")
  MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
  THINKING_TOKEN_LIMIT = int(os.getenv("THINKING_TOKEN_LIMIT", "512"))
- # Approximation used for hard truncation before sending to server.
- THINKING_CHAR_LIMIT = THINKING_TOKEN_LIMIT * 4
+ MAX_PARSE_RETRIES = 3

  SYSTEM_PROMPT = (
      "You are controlling a Python debugging RL environment. "
@@ -100,7 +99,7 @@ def _extract_json(text: str) -> dict[str, Any]:
      except json.JSONDecodeError:
          pass

-     return {"action_type": "RUN_TESTS"}
+     raise ValueError("Invalid JSON response.")


  def _build_observation_text(observation: Any) -> str:
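The `_extract_json` hunk is the heart of the fail-loud change: malformed model output now raises instead of silently degrading to `RUN_TESTS`. A minimal sketch of that contract, assuming a simple brace-scan fallback (the parsing steps above the `except` are not shown in this hunk, so the helper below is illustrative, not the repo's code):

```python
import json
import re
from typing import Any


def extract_json_strict(text: str) -> dict[str, Any]:
    """Illustrative fail-loud extractor; only the final raise mirrors the diff."""
    candidates = [text]
    match = re.search(r"\{.*\}", text, re.DOTALL)  # assumed fallback: first {...} span
    if match:
        candidates.append(match.group(0))
    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
            if isinstance(parsed, dict):
                return parsed
        except json.JSONDecodeError:
            pass
    # Previously this fell through to {"action_type": "RUN_TESTS"}; now the
    # caller is expected to catch the error, log feedback, and retry.
    raise ValueError("Invalid JSON response.")
```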
@@ -116,31 +115,26 @@ def _build_observation_text(observation: Any) -> str:


  def _get_model_action(
-     client: OpenAI, observation: Any, history: list[str], debug: bool = False
- ) -> dict[str, Any]:
+     client: OpenAI, observation: Any, history: list[str]
+ ) -> tuple[dict[str, Any], str]:
      obs_text = _build_observation_text(observation)
      user_prompt = (
          "Pick the single best next action and return only JSON.\n\n"
          f"{obs_text}\n\n"
          f"history:\n{chr(10).join(history[-5:]) if history else 'none'}"
      )
-     try:
-         completion = client.chat.completions.create(
-             model=MODEL_NAME,
-             messages=[
-                 {"role": "system", "content": SYSTEM_PROMPT},
-                 {"role": "user", "content": user_prompt},
-             ],
-             temperature=0.0,
-             max_tokens=THINKING_TOKEN_LIMIT,
-             stream=False,
-         )
-         response_text = (completion.choices[0].message.content or "").strip()
-         if debug:
-             print(f"[DEBUG] raw_model_response={response_text[:500]}", flush=True)
-         action = _extract_json(response_text)
-     except Exception:
-         action = {"action_type": "RUN_TESTS"}
+     completion = client.chat.completions.create(
+         model=MODEL_NAME,
+         messages=[
+             {"role": "system", "content": SYSTEM_PROMPT},
+             {"role": "user", "content": user_prompt},
+         ],
+         temperature=0.0,
+         max_tokens=THINKING_TOKEN_LIMIT,
+         stream=False,
+     )
+     response_text = (completion.choices[0].message.content or "").strip()
+     action = _extract_json(response_text)

      if action.get("action_type") not in {
          "VIEW_CODE",
@@ -150,27 +144,20 @@ def _get_model_action(
          "RESET_TO_ORIGINAL",
          "SUBMIT",
      }:
-         action = {"action_type": "RUN_TESTS"}
+         raise ValueError("Invalid action_type in model response.")

-     return action
+     return action, response_text


  def _to_code_action(action_dict: dict[str, Any]) -> CodeAction:
-     thought = action_dict.get("thought")
-     if isinstance(thought, str):
-         thought = thought[:THINKING_CHAR_LIMIT]
-
      payload = {
-         "action_type": action_dict.get("action_type", "RUN_TESTS"),
-         "thought": thought,
+         "action_type": action_dict.get("action_type"),
+         "thought": action_dict.get("thought"),
          "start_line": action_dict.get("start_line"),
          "end_line": action_dict.get("end_line"),
          "new_code_block": action_dict.get("new_code_block"),
      }
-     try:
-         return CodeAction(**payload)
-     except Exception:
-         return CodeAction(action_type="RUN_TESTS")
+     return CodeAction(**payload)


  def _compute_score(step_result: Any, rewards: list[float]) -> float:
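`_get_model_action` now returns a `(action_dict, response_text)` tuple and `_to_code_action` no longer swallows bad payloads, so the caller can both retry and log what the model actually said. A small sketch of that caller contract, using a hypothetical stub in place of the real OpenAI call:

```python
from typing import Any


def get_model_action_stub(history: list[str]) -> tuple[dict[str, Any], str]:
    # Hypothetical stand-in for _get_model_action: returns the parsed dict and the raw text.
    raw = '{"thought": "inspect the failing test", "action_type": "VIEW_CODE"}'
    return {"thought": "inspect the failing test", "action_type": "VIEW_CODE"}, raw


history: list[str] = []
model_response = ""  # initialised first, as in the diff, so a failed call still logs safely
try:
    action_dict, model_response = get_model_action_stub(history)
    # _to_code_action(action_dict) would run here and may raise on a schema violation
except Exception as exc:
    history.append(f"parse_failure cause={exc}")
    if model_response:
        history.append(f"raw_response={model_response[:500]}")
```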
@@ -184,7 +171,7 @@ def _compute_score(step_result: Any, rewards: list[float]) -> float:
      return max(0.0, min(1.0, float(raw)))


- async def run(difficulty: Optional[str] = None, debug: bool = False) -> None:
+ async def run(difficulty: Optional[str] = None, show_thought: bool = False) -> None:
      client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)

      env: Optional[TraceFixRLEnv] = None
@@ -214,8 +201,34 @@ async def run(difficulty: Optional[str] = None, debug: bool = False) -> None:
              if result.done:
                  break

-             action_dict = _get_model_action(client, result.observation, history, debug=debug)
-             action = _to_code_action(action_dict)
+             action: Optional[CodeAction] = None
+             model_response = ""
+
+             for attempt in range(1, MAX_PARSE_RETRIES + 1):
+                 try:
+                     action_dict, model_response = _get_model_action(client, result.observation, history)
+                     action = _to_code_action(action_dict)
+                     if show_thought:
+                         history.append(f"thought={action.thought}")
+                     break
+                 except Exception as exc:
+                     cause = str(exc).replace("\n", " ")
+                     history.append(
+                         (
+                             f"parse_failure attempt={attempt} cause={cause}. "
+                             "Error: Invalid JSON or schema. Return a complete valid JSON object "
+                             "with fields: thought, action_type, start_line, end_line, new_code_block."
+                         )
+                     )
+                     if model_response:
+                         history.append(f"raw_response={model_response[:500]}")
+
+             if action is None:
+                 action = CodeAction(
+                     action_type="RUN_TESTS",
+                     thought="Fallback after repeated invalid JSON/schema responses.",
+                 )
+
              result = await env.step(action)

              reward = float(result.reward or 0.0)
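This hunk replaces the old single-shot call, which silently fell back to `RUN_TESTS`, with a bounded retry loop that feeds parse failures back into the prompt history. A self-contained sketch of that control flow, with a stub model in place of the real client, shows why the feedback lands in `history` before the next attempt:

```python
from typing import Any, Callable

MAX_PARSE_RETRIES = 3  # mirrors the constant added near the top of inference.py


def choose_action_with_retries(
    ask_model: Callable[[list[str]], dict[str, Any]],
    history: list[str],
) -> str:
    """Sketch only: ask_model stands in for _get_model_action/_to_code_action."""
    for attempt in range(1, MAX_PARSE_RETRIES + 1):
        try:
            return ask_model(history)["action_type"]  # may raise on bad JSON/schema
        except Exception as exc:
            # The failure message becomes part of the next prompt's history.
            history.append(f"parse_failure attempt={attempt} cause={exc}")
    return "RUN_TESTS"  # safe fallback only after every retry failed


calls = {"n": 0}


def flaky_model(history: list[str]) -> dict[str, Any]:
    # Fails twice with the same error the new _extract_json raises, then succeeds.
    calls["n"] += 1
    if calls["n"] < 3:
        raise ValueError("Invalid JSON response.")
    return {"thought": "run the suite", "action_type": "VIEW_CODE"}


hist: list[str] = []
print(choose_action_with_retries(flaky_model, hist))  # VIEW_CODE
print(len(hist))  # 2 parse_failure entries were fed back before the success
```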
@@ -229,7 +242,7 @@ async def run(difficulty: Optional[str] = None, debug: bool = False) -> None:

              rewards.append(reward)
              steps_taken = step
-             history.append(f"step={step} action={action_str} reward={reward:.2f}")
+             history.append(f"step={step} action={action_str} reward={reward:.2f} error={error or 'null'}")
              log_step(step=step, action=action_str, reward=reward, done=done, error=error)

              if done:
@@ -243,10 +256,6 @@ async def run(difficulty: Optional[str] = None, debug: bool = False) -> None:
              log_start(task=TASK_NAME, env=BENCHMARK, model=MODEL_NAME)
              started = True
          msg = str(exc).replace("\n", " ")
-         if steps_taken == 0:
-             log_step(step=1, action="RUN_TESTS", reward=0.0, done=False, error=msg)
-             steps_taken = 1
-             rewards.append(0.0)
          score = 0.0
          success = False
      finally:
@@ -264,7 +273,7 @@ if __name__ == "__main__":
      group.add_argument("--easy", action="store_true", help="Run on easy curriculum tier.")
      group.add_argument("--medium", action="store_true", help="Run on medium curriculum tier.")
      group.add_argument("--hard", action="store_true", help="Run on hard curriculum tier.")
-     parser.add_argument("--debug", action="store_true", help="Print debug model output snippets.")
+     parser.add_argument("--thought", action="store_true", help="Include model thought traces in internal history.")
      args = parser.parse_args()

      difficulty: Optional[str] = None
@@ -275,4 +284,4 @@ if __name__ == "__main__":
      elif args.hard:
          difficulty = "hard"

-     asyncio.run(run(difficulty=difficulty, debug=args.debug))
+     asyncio.run(run(difficulty=difficulty, show_thought=args.thought))
 
models.py CHANGED
@@ -21,9 +21,9 @@ ActionType = Literal[
  class CodeAction(Action):
      """Structured action consumed by the environment."""

-     thought: Optional[str] = Field(
-         default=None,
-         description="Optional reasoning string for debugging/traceability.",
+     thought: str = Field(
+         ...,
+         description="Mandatory reasoning string before selecting an action.",
      )
      action_type: ActionType = Field(
          ...,
 
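`thought` goes from optional to required, so a response without reasoning now fails model validation instead of being silently accepted, and that validation error is exactly what the new retry loop in `inference.py` catches. A minimal Pydantic v2 sketch, with a plain `BaseModel` standing in for the environment's `Action` base class and a reduced `Literal` in place of the full `ActionType`:

```python
from typing import Literal, Optional

from pydantic import BaseModel, Field, ValidationError

ActionTypeSketch = Literal["VIEW_CODE", "RUN_TESTS", "SUBMIT"]  # reduced stand-in


class CodeActionSketch(BaseModel):
    thought: str = Field(..., description="Mandatory reasoning string before selecting an action.")
    action_type: ActionTypeSketch
    start_line: Optional[int] = None
    end_line: Optional[int] = None
    new_code_block: Optional[str] = None


CodeActionSketch(thought="run the suite first", action_type="RUN_TESTS")  # valid

try:
    CodeActionSketch(action_type="RUN_TESTS")  # thought omitted
except ValidationError as exc:
    print(exc.error_count(), "validation error for field:", exc.errors()[0]["loc"])
```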
requirements.txt CHANGED
@@ -1,5 +1,5 @@
  fastapi==0.111.0
  uvicorn[standard]==0.30.1
- pydantic==1.10.17
+ pydantic>=2.0.0
  websockets==12.0
  openai>=1.30.0
 
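The pin moves from Pydantic v1 to v2, which matches the stricter `CodeAction` model and the `ValidationError` behaviour the retry loop depends on. A hypothetical startup guard (not present in the repo) that fails fast if an old v1 install is still on the path:

```python
import pydantic

# pydantic.VERSION is available in both major versions, so the check itself is safe to run.
major = int(pydantic.VERSION.split(".")[0])
if major < 2:
    raise RuntimeError(f"pydantic>=2.0.0 is required, found {pydantic.VERSION}")
```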
tasks.py CHANGED
@@ -609,10 +609,10 @@ TASK_MI_STRICT_OVERLAP = _t(
          "    intervals.sort()",
          "    merged = []",
          "    for interval in intervals:",
-         "        if not merged or merged[-1][1] <= interval[0]:",
+         "        if not merged or merged[-1][1] < interval[0]:",
          "            merged.append(list(interval))",
          "        else:",
-         "            merged[-1][1] = min(merged[-1][1], interval[1])",
+         "            merged[-1][1] = max(merged[-1][1], interval[1])",
          "    return merged",
      ],
      tests=[_tmi_1, _tmi_2, _tmi_3],
@@ -680,4 +680,4 @@ TASKS_BY_DIFFICULTY: Dict[str, List[Dict]] = {
  # Flat list — used for random sampling when training_step is not set
  ALL_TASKS: List[Dict] = [
      t for bucket in TASKS_BY_DIFFICULTY.values() for t in bucket
- ]
+ ]
 
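This hunk edits the code block stored for the strict-overlap merge task, toggling the boundary comparison and swapping `min` for `max` when extending the previous interval. Without presuming which variant the task intends as the planted bug, the sketch below (not taken from the repo) shows how those two operators change behaviour on touching intervals such as `[1, 3]` and `[3, 5]`:

```python
def merge_intervals(intervals, strict=True):
    """strict=True keeps intervals that merely touch at an endpoint separate;
    strict=False merges them. In both cases max(), not min(), extends the end."""
    merged = []
    for start, end in sorted(intervals):
        starts_new = not merged or (
            merged[-1][1] <= start if strict else merged[-1][1] < start
        )
        if starts_new:
            merged.append([start, end])
        else:
            merged[-1][1] = max(merged[-1][1], end)  # min() here would shrink the interval
    return merged


print(merge_intervals([[1, 3], [3, 5]], strict=True))   # [[1, 3], [3, 5]]
print(merge_intervals([[1, 3], [3, 5]], strict=False))  # [[1, 5]]
print(merge_intervals([[1, 4], [2, 3]]))                # [[1, 4]]
```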