Spaces:

SolusOps
/

tracefix_rl

Sleeping

App Files Community

databoysu committed on Apr 8

Commit

f814100

1 Parent(s): 34c61c4

improving cot

Browse files

Files changed (3) hide show

__pycache__/environment.cpython-312.pyc +0 -0
environment.py +11 -4
inference.py +20 -29

__pycache__/environment.cpython-312.pyc CHANGED Viewed

Binary files a/__pycache__/environment.cpython-312.pyc and b/__pycache__/environment.cpython-312.pyc differ

environment.py CHANGED Viewed

@@ -139,6 +139,7 @@ class TraceFixRLGym:
         self._prev_pass_count: int = 0
         self._last_test_results: List[TestResult] = []
         self._last_output: str = ""
         self._last_edited_line: Optional[int] = None
         self._episode_id: str = ""
         self._done: bool = False
@@ -209,6 +210,7 @@ class TraceFixRLGym:
         self._prev_pass_count  = 0
         self._last_test_results = []
         self._last_output      = ""
         self._last_edited_line = None   # no edits yet — localized_context will be empty
         self._episode_id       = str(uuid.uuid4())[:8]
         self._done             = False
@@ -347,7 +349,7 @@ class TraceFixRLGym:
             self._last_run_all_passed = (current_pass == total_tests)
         if current_pass == total_tests and not syntax_err:
-            self._last_output = (
                 f"Tests Passed: {total_tests}/{total_tests}.\n\n"
                 "SUCCESS: ALL TESTS PASSED! You are finished. You MUST now use the SUBMIT action."
             )
@@ -359,10 +361,11 @@ class TraceFixRLGym:
             error_traceback = (output or "").strip() or "\n\n".join(failing_messages).strip()
             if not error_traceback:
                 error_traceback = "No traceback available."
-            self._last_output = (
                 f"Tests Passed: {current_pass}/{total_tests}.\n\n"
                 f"Traceback:\n{error_traceback}"
             )
         return reward
@@ -507,7 +510,7 @@ class TraceFixRLGym:
         context_anchor = self._last_edited_line
         if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
-            extracted_line = extract_error_line(self._last_output)
             if extracted_line is not None:
                 context_anchor = extracted_line
@@ -518,7 +521,11 @@ class TraceFixRLGym:
                 idx + 1: line for idx, line in enumerate(self._code_lines)
             },
             localized_context     = localized,
-            last_execution_output = self._last_output,
             syntax_error          = not syntax_valid,
             test_results          = list(self._last_test_results),
             step_count            = self._step_count,

         self._prev_pass_count: int = 0
         self._last_test_results: List[TestResult] = []
         self._last_output: str = ""
+        self._last_execution_output: str = ""
         self._last_edited_line: Optional[int] = None
         self._episode_id: str = ""
         self._done: bool = False
         self._prev_pass_count  = 0
         self._last_test_results = []
         self._last_output      = ""
+        self._last_execution_output = ""
         self._last_edited_line = None   # no edits yet — localized_context will be empty
         self._episode_id       = str(uuid.uuid4())[:8]
         self._done             = False
             self._last_run_all_passed = (current_pass == total_tests)
         if current_pass == total_tests and not syntax_err:
+            self._last_execution_output = (
                 f"Tests Passed: {total_tests}/{total_tests}.\n\n"
                 "SUCCESS: ALL TESTS PASSED! You are finished. You MUST now use the SUBMIT action."
             )
             error_traceback = (output or "").strip() or "\n\n".join(failing_messages).strip()
             if not error_traceback:
                 error_traceback = "No traceback available."
+            self._last_execution_output = (
                 f"Tests Passed: {current_pass}/{total_tests}.\n\n"
                 f"Traceback:\n{error_traceback}"
             )
+        self._last_output = self._last_execution_output
         return reward
         context_anchor = self._last_edited_line
         if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
+            extracted_line = extract_error_line(self._last_execution_output)
             if extracted_line is not None:
                 context_anchor = extracted_line
                 idx + 1: line for idx, line in enumerate(self._code_lines)
             },
             localized_context     = localized,
+            last_execution_output = (
+                self._last_execution_output
+                if self._last_action == "RUN_TESTS"
+                else self._last_output
+            ),
             syntax_error          = not syntax_valid,
             test_results          = list(self._last_test_results),
             step_count            = self._step_count,

inference.py CHANGED Viewed

@@ -47,35 +47,26 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
 SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
 SYSTEM_PROMPT = """\
-You are an autonomous Software Engineering RL Agent.
-You are strictly evaluated on your ability to reason deeply before taking action.
-OPERATING CONTRACT:
-1. Output exactly one CodeAction object per turn.
-2. You MUST read your conversation history. If you just tried an edit and the tests still fail, DO NOT repeat the same edit.
-3. PARSE_ERROR means your last output was invalid. Fix your formatting immediately.
-HOW TO THINK (The 'thought' field is mandatory):
-Before choosing an action_type, your 'thought' MUST contain these exact 3 sentences:
-1. "Observation: [State what you see in the test output or traceback]"
-2. "Diagnosis: [Explain exactly which line is causing the bug and why]"
-3. "Plan: [State exactly what tool you will use next to fix it]"
-ACTION POLICY:
-- VIEW_CODE: Read the code mapping.
-- RUN_TESTS: Execute tests to get the traceback.
-- REPLACE_LINES: Apply a focused fix. Use EXACT line numbers from code_dict.
-- UNDO_EDIT: Revert if the last edit caused a SyntaxError.
-- SUBMIT: Use this ONLY when the last_execution_output explicitly confirms all tests pass.
-- RESET_TO_ORIGINAL: Use this if wanting to reset the code file to try again or with a different strategy.
-VALID JSON EXAMPLES (Follow this exact thought depth):
-Example 1 (Planning an edit):
-{"thought":"Observation: The last_execution_output shows an IndexError on line 12 because 'i+1' is out of bounds. Diagnosis: The loop condition 'for i in range(len(arr))' goes to the end of the array, so 'arr[i+1]' fails on the last iteration. Plan: I will use REPLACE_LINES on line 10 to change the loop to 'range(len(arr)-1)'.","action_type":"REPLACE_LINES","start_line":10,"end_line":10,"new_code_block":"    for i in range(len(arr) - 1):"}
-Example 2 (Testing):
-{"thought":"Observation: I just replaced line 10 with the corrected loop condition. Diagnosis: I need to verify if this change fixed the IndexError and didn't break other boundary tests. Plan: I will use RUN_TESTS to get fresh evidence.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
 """

 SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
 SYSTEM_PROMPT = """\
+You are a debugging policy agent. Output exactly one CodeAction JSON object per turn.
+Use Action Trajectory on every turn. If an action repeats without progress, change strategy.
+PARSE_ERROR means your previous output was invalid; fix formatting immediately.
+Mandatory thought format (exactly 3 sentences):
+Observation: what you see in localized_context or last_execution_output.
+Diagnosis: root cause and exact line(s) to change.
+Plan: the next action_type and why.
+Action policy:
+- VIEW_CODE to inspect full line mapping.
+- RUN_TESTS to get fresh traceback evidence.
+- REPLACE_LINES for focused fixes using exact code_dict keys.
+- UNDO_EDIT if the latest edit made things worse.
+- RESET_TO_ORIGINAL as last-resort recovery.
+- SUBMIT ONLY when last_execution_output explicitly contains the success signal that all tests passed.
+Return only JSON keys: thought, action_type, start_line, end_line, new_code_block.
+No markdown. No extra keys.
 """