# NOTE(review): removed web-UI paste residue ("Spaces:" / "Sleeping" status
# lines) that is not valid Python and was never part of the script.
| #!/usr/bin/env python3 | |
| """ | |
| Standalone baseline inference script. | |
| Uses OpenAI gpt-4o-mini to review Python code across 3 difficulty levels. | |
| Saves results to baseline_scores.json. | |
| """ | |
| import os | |
| import json | |
| import requests | |
| from openai import OpenAI | |
# --- OpenAI client setup --------------------------------------------------
# Fail fast at import time if the key is missing, rather than erroring on
# the first API call deep inside the run.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")
client = OpenAI(api_key=api_key)

# Environment server endpoint and the three difficulty levels to evaluate.
BASE_URL = "http://localhost:8000"
TASKS = ["easy", "medium", "hard"]
def reset_task(task_id: str) -> dict:
    """Reset environment for a given task_id.

    POSTs to the server's /reset endpoint and returns the decoded JSON
    observation. Raises requests.HTTPError on a non-2xx response.
    """
    payload = {"task_id": task_id}
    resp = requests.post(f"{BASE_URL}/reset", json=payload)
    resp.raise_for_status()
    return resp.json()
def step_task(action: dict) -> dict:
    """Submit action to environment and get observation.

    POSTs the action to the server's /step endpoint and returns the decoded
    JSON observation. Raises requests.HTTPError on a non-2xx response.
    """
    payload = {"action": action}
    resp = requests.post(f"{BASE_URL}/step", json=payload)
    resp.raise_for_status()
    return resp.json()
def _parse_review_json(content: str) -> dict:
    """Parse the model reply into a dict.

    Tries a direct json.loads first; if the reply wraps the JSON in prose or
    a Markdown fence, falls back to the outermost {...} span.

    Raises:
        ValueError: if no JSON object can be recovered from *content*.
        (The original code leaked a raw JSONDecodeError from the fallback
        parse, breaking the documented error contract — fixed here.)
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        pass
    start = content.find('{')
    end = content.rfind('}') + 1
    if start != -1 and end > start:
        try:
            return json.loads(content[start:end])
        except json.JSONDecodeError as err:
            raise ValueError(f"Could not parse JSON from response: {content}") from err
    raise ValueError(f"Could not parse JSON from response: {content}")


def review_code(code_snippet: str) -> dict:
    """Use GPT-4o-mini to review code snippet.

    Returns the model's review as a dict with keys: review (str),
    bug_type (str), line_number (int), confidence (float).

    Raises:
        ValueError: if the model reply contains no parseable JSON object.
    """
    prompt = f"""Review this Python code. Reply as JSON with keys: review (str), bug_type (syntax/logic/security/none), line_number (int), confidence (float)
Code:
{code_snippet}"""
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": prompt}
        ],
        # Ask the API to guarantee a JSON-object reply; json_object mode
        # requires the prompt to mention JSON, which it already does.
        response_format={"type": "json_object"},
        temperature=0.7,
    )
    return _parse_review_json(response.choices[0].message.content)
def run_baseline():
    """Run baseline inference on all tasks and persist the scores.

    For each task id in TASKS: reset the environment, ask GPT-4o-mini for a
    review of the returned code snippet, submit the review as an action, and
    record the cumulative reward. Writes baseline_scores.json and returns
    the results dict.
    """
    results = {"scores": {}, "details": {}}
    banner = '=' * 60

    for task_id in TASKS:
        print(f"\n{banner}")
        print(f"Running task: {task_id}")
        print(banner)

        # Fresh episode for this task.
        obs = reset_task(task_id)
        snippet = obs.get("code_snippet", "")
        print(f"Code snippet:\n{snippet}\n")

        print("Calling GPT-4o-mini for review...")
        review = review_code(snippet)
        print(f"Review result: {review}")

        # Normalize the model output into the action schema the server expects.
        action = {
            "review": review.get("review", ""),
            "bug_type": review.get("bug_type", "none"),
            "line_number": int(review.get("line_number", -1)),
            "confidence": float(review.get("confidence", 0.0)),
        }
        print(f"Submitting action: {action}")
        step_obs = step_task(action)

        # The step response carries the reward and grader feedback.
        score = step_obs.get("cumulative_reward", 0.0)
        feedback = step_obs.get("previous_feedback", "")
        print(f"Score: {score}")
        print(f"Feedback: {feedback}")

        results["scores"][task_id] = score
        results["details"][task_id] = {
            "action": action,
            "feedback": feedback,
            "score": score,
        }

    # Mean over however many tasks actually ran (guards against empty TASKS).
    scores = list(results["scores"].values())
    results["average"] = round(sum(scores) / len(scores), 4) if scores else 0.0

    # Summary table.
    print(f"\n{banner}")
    print("BASELINE RESULTS")
    print(banner)
    for task_id in TASKS:
        print(f"{task_id:10s}: {results['scores'][task_id]:.4f}")
    print(f"{'Average':10s}: {results['average']:.4f}")
    print(banner + "\n")

    # Persist for later comparison runs.
    with open("baseline_scores.json", "w") as f:
        json.dump(results, f, indent=2)
    print("Results saved to baseline_scores.json")

    return results


if __name__ == "__main__":
    run_baseline()