databoysu committed on
Commit
f814100
·
1 Parent(s): 34c61c4

improving cot

Browse files
__pycache__/environment.cpython-312.pyc CHANGED
Binary files a/__pycache__/environment.cpython-312.pyc and b/__pycache__/environment.cpython-312.pyc differ
 
environment.py CHANGED
@@ -139,6 +139,7 @@ class TraceFixRLGym:
139
  self._prev_pass_count: int = 0
140
  self._last_test_results: List[TestResult] = []
141
  self._last_output: str = ""
 
142
  self._last_edited_line: Optional[int] = None
143
  self._episode_id: str = ""
144
  self._done: bool = False
@@ -209,6 +210,7 @@ class TraceFixRLGym:
209
  self._prev_pass_count = 0
210
  self._last_test_results = []
211
  self._last_output = ""
 
212
  self._last_edited_line = None # no edits yet — localized_context will be empty
213
  self._episode_id = str(uuid.uuid4())[:8]
214
  self._done = False
@@ -347,7 +349,7 @@ class TraceFixRLGym:
347
  self._last_run_all_passed = (current_pass == total_tests)
348
 
349
  if current_pass == total_tests and not syntax_err:
350
- self._last_output = (
351
  f"Tests Passed: {total_tests}/{total_tests}.\n\n"
352
  "SUCCESS: ALL TESTS PASSED! You are finished. You MUST now use the SUBMIT action."
353
  )
@@ -359,10 +361,11 @@ class TraceFixRLGym:
359
  error_traceback = (output or "").strip() or "\n\n".join(failing_messages).strip()
360
  if not error_traceback:
361
  error_traceback = "No traceback available."
362
- self._last_output = (
363
  f"Tests Passed: {current_pass}/{total_tests}.\n\n"
364
  f"Traceback:\n{error_traceback}"
365
  )
 
366
 
367
  return reward
368
 
@@ -507,7 +510,7 @@ class TraceFixRLGym:
507
 
508
  context_anchor = self._last_edited_line
509
  if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
510
- extracted_line = extract_error_line(self._last_output)
511
  if extracted_line is not None:
512
  context_anchor = extracted_line
513
 
@@ -518,7 +521,11 @@ class TraceFixRLGym:
518
  idx + 1: line for idx, line in enumerate(self._code_lines)
519
  },
520
  localized_context = localized,
521
- last_execution_output = self._last_output,
 
 
 
 
522
  syntax_error = not syntax_valid,
523
  test_results = list(self._last_test_results),
524
  step_count = self._step_count,
 
139
  self._prev_pass_count: int = 0
140
  self._last_test_results: List[TestResult] = []
141
  self._last_output: str = ""
142
+ self._last_execution_output: str = ""
143
  self._last_edited_line: Optional[int] = None
144
  self._episode_id: str = ""
145
  self._done: bool = False
 
210
  self._prev_pass_count = 0
211
  self._last_test_results = []
212
  self._last_output = ""
213
+ self._last_execution_output = ""
214
  self._last_edited_line = None # no edits yet — localized_context will be empty
215
  self._episode_id = str(uuid.uuid4())[:8]
216
  self._done = False
 
349
  self._last_run_all_passed = (current_pass == total_tests)
350
 
351
  if current_pass == total_tests and not syntax_err:
352
+ self._last_execution_output = (
353
  f"Tests Passed: {total_tests}/{total_tests}.\n\n"
354
  "SUCCESS: ALL TESTS PASSED! You are finished. You MUST now use the SUBMIT action."
355
  )
 
361
  error_traceback = (output or "").strip() or "\n\n".join(failing_messages).strip()
362
  if not error_traceback:
363
  error_traceback = "No traceback available."
364
+ self._last_execution_output = (
365
  f"Tests Passed: {current_pass}/{total_tests}.\n\n"
366
  f"Traceback:\n{error_traceback}"
367
  )
368
+ self._last_output = self._last_execution_output
369
 
370
  return reward
371
 
 
510
 
511
  context_anchor = self._last_edited_line
512
  if self._last_action == "RUN_TESTS" and not self._last_run_all_passed:
513
+ extracted_line = extract_error_line(self._last_execution_output)
514
  if extracted_line is not None:
515
  context_anchor = extracted_line
516
 
 
521
  idx + 1: line for idx, line in enumerate(self._code_lines)
522
  },
523
  localized_context = localized,
524
+ last_execution_output = (
525
+ self._last_execution_output
526
+ if self._last_action == "RUN_TESTS"
527
+ else self._last_output
528
+ ),
529
  syntax_error = not syntax_valid,
530
  test_results = list(self._last_test_results),
531
  step_count = self._step_count,
inference.py CHANGED
@@ -47,35 +47,26 @@ MAX_STEPS = int(os.getenv("MAX_STEPS", "50"))
47
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
48
 
49
  SYSTEM_PROMPT = """\
50
- You are an autonomous Software Engineering RL Agent.
51
- You are strictly evaluated on your ability to reason deeply before taking action.
52
-
53
- OPERATING CONTRACT:
54
- 1. Output exactly one CodeAction object per turn.
55
- 2. You MUST read your conversation history. If you just tried an edit and the tests still fail, DO NOT repeat the same edit.
56
- 3. PARSE_ERROR means your last output was invalid. Fix your formatting immediately.
57
-
58
- HOW TO THINK (The 'thought' field is mandatory):
59
- Before choosing an action_type, your 'thought' MUST contain these exact 3 sentences:
60
- 1. "Observation: [State what you see in the test output or traceback]"
61
- 2. "Diagnosis: [Explain exactly which line is causing the bug and why]"
62
- 3. "Plan: [State exactly what tool you will use next to fix it]"
63
-
64
- ACTION POLICY:
65
- - VIEW_CODE: Read the code mapping.
66
- - RUN_TESTS: Execute tests to get the traceback.
67
- - REPLACE_LINES: Apply a focused fix. Use EXACT line numbers from code_dict.
68
- - UNDO_EDIT: Revert if the last edit caused a SyntaxError.
69
- - SUBMIT: Use this ONLY when the last_execution_output explicitly confirms all tests pass.
70
- - RESET_TO_ORIGINAL: Use this if wanting to reset the code file to try again or with a different strategy.
71
-
72
- VALID JSON EXAMPLES (Follow this exact thought depth):
73
-
74
- Example 1 (Planning an edit):
75
- {"thought":"Observation: The last_execution_output shows an IndexError on line 12 because 'i+1' is out of bounds. Diagnosis: The loop condition 'for i in range(len(arr))' goes to the end of the array, so 'arr[i+1]' fails on the last iteration. Plan: I will use REPLACE_LINES on line 10 to change the loop to 'range(len(arr)-1)'.","action_type":"REPLACE_LINES","start_line":10,"end_line":10,"new_code_block":" for i in range(len(arr) - 1):"}
76
-
77
- Example 2 (Testing):
78
- {"thought":"Observation: I just replaced line 10 with the corrected loop condition. Diagnosis: I need to verify if this change fixed the IndexError and didn't break other boundary tests. Plan: I will use RUN_TESTS to get fresh evidence.","action_type":"RUN_TESTS","start_line":null,"end_line":null,"new_code_block":null}
79
  """
80
 
81
 
 
47
  SUCCESS_SCORE_THRESHOLD = float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.99"))
48
 
49
  SYSTEM_PROMPT = """\
50
+ You are a debugging policy agent. Output exactly one CodeAction JSON object per turn.
51
+
52
+ Use Action Trajectory on every turn. If an action repeats without progress, change strategy.
53
+ PARSE_ERROR means your previous output was invalid; fix formatting immediately.
54
+
55
+ Mandatory thought format (exactly 3 sentences):
56
+ Observation: what you see in localized_context or last_execution_output.
57
+ Diagnosis: root cause and exact line(s) to change.
58
+ Plan: the next action_type and why.
59
+
60
+ Action policy:
61
+ - VIEW_CODE to inspect full line mapping.
62
+ - RUN_TESTS to get fresh traceback evidence.
63
+ - REPLACE_LINES for focused fixes using exact code_dict keys.
64
+ - UNDO_EDIT if the latest edit made things worse.
65
+ - RESET_TO_ORIGINAL as last-resort recovery.
66
+ - SUBMIT ONLY when last_execution_output explicitly contains the success signal that all tests passed.
67
+
68
+ Return only JSON keys: thought, action_type, start_line, end_line, new_code_block.
69
+ No markdown. No extra keys.
 
 
 
 
 
 
 
 
 
70
  """
71
 
72