| import json |
| import subprocess |
| import tempfile |
| import os |
| import sys |
|
|
| def test_passes(code, func, inp, expected): |
| if isinstance(inp, (list, tuple)): |
| args = ', '.join(repr(x) for x in inp) |
| else: |
| args = repr(inp) |
| |
| script = f"""{code} |
| |
| try: |
| r = {func}({args}) |
| expected = {repr(expected)} |
| print("PASS" if r == expected else f"FAIL: got {{r}}") |
| except Exception as e: |
| print(f"ERROR: {{e}}") |
| """ |
| try: |
| with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f: |
| f.write(script) |
| fname = f.name |
| r = subprocess.run( |
| [sys.executable, fname], |
| capture_output=True, text=True, timeout=5 |
| ) |
| os.unlink(fname) |
| return 'PASS' in r.stdout |
| except: |
| return False |
|
|
| for tier in [1, 2, 3]: |
| bugs = [json.loads(l) for l in open(f'data/bugs_tier{tier}.jsonl') if l.strip()] |
| |
| broken_original = [] |
| buggy_not_failing = [] |
| |
| for b in bugs: |
| orig_passes = all( |
| test_passes(b['original_code'], b['function_name'], |
| t['input'], t['expected_output']) |
| for t in b['test_cases'] |
| ) |
| buggy_fails_some = any( |
| not test_passes(b['buggy_code'], b['function_name'], |
| t['input'], t['expected_output']) |
| for t in b['test_cases'] |
| ) |
| |
| if not orig_passes: |
| broken_original.append(b['id']) |
| if not buggy_fails_some: |
| buggy_not_failing.append(b['id']) |
| |
| print(f'\nTier {tier}:') |
| if broken_original: |
| print(f' BROKEN original_code: {broken_original}') |
| if buggy_not_failing: |
| print(f' BUGGY code not failing: {buggy_not_failing}') |
| if not broken_original and not buggy_not_failing: |
| print(f' All good!') |