Spaces:
Sleeping
Sleeping
databoysu committed on
Commit ·
f814100
1
Parent(s): 34c61c4
improving cot
Browse files
- __pycache__/environment.cpython-312.pyc +0 -0
- environment.py +11 -4
- inference.py +20 -29
__pycache__/environment.cpython-312.pyc
CHANGED
|
Binary files a/__pycache__/environment.cpython-312.pyc and b/__pycache__/environment.cpython-312.pyc differ
|
|
|
environment.py
CHANGED
|
@@ -139,6 +139,7 @@ class TraceFixRLGym:
|
|
| 139 |
self._prev_pass_count: int = 0
|
| 140 |
self._last_test_results: List[TestResult] = []
|
| 141 |
self._last_output: str = ""
|
|
|
|
| 142 |
self._last_edited_line: Optional[int] = None
|
| 143 |
self._episode_id: str = ""
|
| 144 |
self._done: bool = False
|
|
@@ -209,6 +210,7 @@ class TraceFixRLGym:
|
|
| 209 |
self._prev_pass_count = 0
|
| 210 |
self._last_test_results = []
|
| 211 |
self._last_output = ""
|
|
|
|
| 212 |
self._last_edited_line = None # no edits yet — localized_context will be empty
|
| 213 |
self._episode_id = str(uuid.uuid4())[:8]
|
| 214 |
self._done = False
|
|
@@ -347,7 +349,7 @@ class TraceFixRLGym:
|
|
| 347 |
self._last_run_all_passed = (current_pass == total_tests)
|
| 348 |
|
| 349 |
if current_pass == total_tests and not syntax_err:
|
| 350 |
-
self._last_output = (
|
| 351 |
f"Tests Passed: {total_tests}/{total_tests}.\n\n"
|
| 352 |
"SUCCESS: ALL TESTS PASSED! You are finished. You MUST now use the SUBMIT action."
|
| 353 |
)
|
|
@@ -359,10 +361,11 @@ class TraceFixRLGym:
|
|
| 359 |
error_traceback = (output or "").strip() or "\n\n".join(failing_messages).strip()
|
| 360 |
if not error_traceback:
|
| 361 |
error_traceback = "No traceback available."
|
| 362 |
-
self._last_output = (
|
| 363 |
f"Tests Passed: {current_pass}/{total_tests}.\n\n"
|
| 364 |
f"Traceback:\n{error_traceback}"
|
| 365 |
)
|
|
|
|
| 366 |
|
| 367 |
return reward
|
| 368 |
|
|
@@ -507,7 +510,7 @@ class TraceFixRLGym:
|
|
| 507 |
|
| 508 |
context_anchor = self._last_edited_line
|
| 509 |
if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
|
| 510 |
-
extracted_line = extract_error_line(self._last_output)
|
| 511 |
if extracted_line is not None:
|
| 512 |
context_anchor = extracted_line
|
| 513 |
|
|
@@ -518,7 +521,11 @@ class TraceFixRLGym:
|
|
| 518 |
idx + 1: line for idx, line in enumerate(self._code_lines)
|
| 519 |
},
|
| 520 |
localized_context = localized,
|
| 521 |
-
last_execution_output = self._last_output,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
syntax_error = not syntax_valid,
|
| 523 |
test_results = list(self._last_test_results),
|
| 524 |
step_count = self._step_count,
|
|
|
|
| 139 |
self._prev_pass_count: int = 0
|
| 140 |
self._last_test_results: List[TestResult] = []
|
| 141 |
self._last_output: str = ""
|
| 142 |
+
self._last_execution_output: str = ""
|
| 143 |
self._last_edited_line: Optional[int] = None
|
| 144 |
self._episode_id: str = ""
|
| 145 |
self._done: bool = False
|
|
|
|
| 210 |
self._prev_pass_count = 0
|
| 211 |
self._last_test_results = []
|
| 212 |
self._last_output = ""
|
| 213 |
+
self._last_execution_output = ""
|
| 214 |
self._last_edited_line = None # no edits yet — localized_context will be empty
|
| 215 |
self._episode_id = str(uuid.uuid4())[:8]
|
| 216 |
self._done = False
|
|
|
|
| 349 |
self._last_run_all_passed = (current_pass == total_tests)
|
| 350 |
|
| 351 |
if current_pass == total_tests and not syntax_err:
|
| 352 |
+
self._last_execution_output = (
|
| 353 |
f"Tests Passed: {total_tests}/{total_tests}.\n\n"
|
| 354 |
"SUCCESS: ALL TESTS PASSED! You are finished. You MUST now use the SUBMIT action."
|
| 355 |
)
|
|
|
|
| 361 |
error_traceback = (output or "").strip() or "\n\n".join(failing_messages).strip()
|
| 362 |
if not error_traceback:
|
| 363 |
error_traceback = "No traceback available."
|
| 364 |
+
self._last_execution_output = (
|
| 365 |
f"Tests Passed: {current_pass}/{total_tests}.\n\n"
|
| 366 |
f"Traceback:\n{error_traceback}"
|
| 367 |
)
|
| 368 |
+
self._last_output = self._last_execution_output
|
| 369 |
|
| 370 |
return reward
|
| 371 |
|
|
|
|
| 510 |
|
| 511 |
context_anchor = self._last_edited_line
|
| 512 |
if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
|
| 513 |
+
extracted_line = extract_error_line(self._last_execution_output)
|
| 514 |
if extracted_line is not None:
|
| 515 |
context_anchor = extracted_line
|
| 516 |
|
|
|
|
| 521 |
idx + 1: line for idx, line in enumerate(self._code_lines)
|
| 522 |
},
|
| 523 |
localized_context = localized,
|
| 524 |
+
last_execution_output = (
|
| 525 |
+
self._last_execution_output
|
| 526 |
+
if self._last_action == "RUN_TESTS"
|
| 527 |
+
else self._last_output
|
| 528 |
+
),
|
| 529 |
syntax_error = not syntax_valid,
|
| 530 |
test_results = list(self._last_test_results),
|
| 531 |
step_count = self._step_count,
|
inference.py
CHANGED
|
@@ -47,35 +47,26 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
|
|
| 47 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
|
| 48 |
|
| 49 |
SYSTEM_PROMPT = """\
|
| 50 |
-
You are
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
-
|
| 66 |
-
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
- RESET_TO_ORIGINAL: Use this if wanting to reset the code file to try again or with a different strategy.
|
| 71 |
-
|
| 72 |
-
VALID JSON EXAMPLES (Follow this exact thought depth):
|
| 73 |
-
|
| 74 |
-
Example 1 (Planning an edit):
|
| 75 |
-
{"thought":"Observation: The last_execution_output shows an IndexError on line 12 because 'i+1' is out of bounds. Diagnosis: The loop condition 'for i in range(len(arr))' goes to the end of the array, so 'arr[i+1]' fails on the last iteration. Plan: I will use REPLACE_LINES on line 10 to change the loop to 'range(len(arr)-1)'.","action_type":"REPLACE_LINES","start_line":10,"end_line":10,"new_code_block":" for i in range(len(arr) - 1):"}
|
| 76 |
-
|
| 77 |
-
Example 2 (Testing):
|
| 78 |
-
{"thought":"Observation: I just replaced line 10 with the corrected loop condition. Diagnosis: I need to verify if this change fixed the IndexError and didn't break other boundary tests. Plan: I will use RUN_TESTS to get fresh evidence.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
|
| 79 |
"""
|
| 80 |
|
| 81 |
|
|
|
|
| 47 |
SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
|
| 48 |
|
| 49 |
SYSTEM_PROMPT = """\
|
| 50 |
+
You are a debugging policy agent. Output exactly one CodeAction JSON object per turn.
|
| 51 |
+
|
| 52 |
+
Use Action Trajectory on every turn. If an action repeats without progress, change strategy.
|
| 53 |
+
PARSE_ERROR means your previous output was invalid; fix formatting immediately.
|
| 54 |
+
|
| 55 |
+
Mandatory thought format (exactly 3 sentences):
|
| 56 |
+
Observation: what you see in localized_context or last_execution_output.
|
| 57 |
+
Diagnosis: root cause and exact line(s) to change.
|
| 58 |
+
Plan: the next action_type and why.
|
| 59 |
+
|
| 60 |
+
Action policy:
|
| 61 |
+
- VIEW_CODE to inspect full line mapping.
|
| 62 |
+
- RUN_TESTS to get fresh traceback evidence.
|
| 63 |
+
- REPLACE_LINES for focused fixes using exact code_dict keys.
|
| 64 |
+
- UNDO_EDIT if the latest edit made things worse.
|
| 65 |
+
- RESET_TO_ORIGINAL as last-resort recovery.
|
| 66 |
+
- SUBMIT ONLY when last_execution_output explicitly contains the success signal that all tests passed.
|
| 67 |
+
|
| 68 |
+
Return only JSON keys: thought, action_type, start_line, end_line, new_code_block.
|
| 69 |
+
No markdown. No extra keys.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
"""
|
| 71 |
|
| 72 |
|