SciCode
/

dataset-builder

Model card Files Files and versions

xet

Community

DouDou committed 21 days ago

Commit

8d5d80e

verified ·

1 Parent(s): 0003466

Upload data3/generate_problems_openai.py with huggingface_hub

Browse files

Files changed (1) hide show

data3/generate_problems_openai.py +506 -0

data3/generate_problems_openai.py ADDED Viewed

	@@ -0,0 +1,506 @@

+#!/usr/bin/env python3
+"""
+Generate programming problems from function_dataset_v2.csv using OpenAI API.
+Filters by relevance score and controls API cost.
+"""
+import csv
+import json
+import os
+import sys
+from openai import OpenAI
+from datetime import datetime
+from typing import Dict, Optional, Tuple
+import time
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Default chat model. "gpt-4o-mini" is the cost-effective choice; switch to
# "gpt-4o" for better quality.
MODEL_NAME = "gpt-4o-mini"
# Rows whose relevance score is below this value are skipped.
MIN_RELEVANCE_SCORE = 60
# Default spending cap for a single run, in USD.
MAX_BUDGET_USD = 10.0

# ---------------------------------------------------------------------------
# Pricing
# ---------------------------------------------------------------------------
# Official pricing: https://openai.com/api/pricing/
# NOTE(review): this table was originally labelled "as of Dec 2024"; re-check
# the figures against the pricing page before trusting cost estimates.
_TOKENS_PER_MILLION = 1_000_000

# model name -> (input USD, output USD) per one million tokens
_USD_PER_MILLION_TOKENS = {
    # GPT-5 series
    "gpt-5.2": (1.75, 14.00),
    "gpt-5.1": (1.25, 10.00),
    "gpt-5": (1.25, 10.00),
    "gpt-5-mini": (0.25, 2.00),
    "gpt-5-nano": (0.05, 0.40),
    # GPT-5 Pro series
    "gpt-5.2-pro": (21.00, 168.00),
    "gpt-5-pro": (15.00, 120.00),
    # GPT-4.1 series
    "gpt-4.1": (2.00, 8.00),
    "gpt-4.1-mini": (0.40, 1.60),
    "gpt-4.1-nano": (0.10, 0.40),
    # GPT-4o series
    "gpt-4o": (2.50, 10.00),
    "gpt-4o-2024-05-13": (5.00, 15.00),
    "gpt-4o-mini": (0.15, 0.60),
    # Realtime and Audio models
    "gpt-realtime": (4.00, 16.00),
    "gpt-realtime-mini": (0.60, 2.40),
    "gpt-audio": (2.50, 10.00),
    "gpt-audio-mini": (0.60, 2.40),
}

# model name -> {"input": USD per input token, "output": USD per output token}
PRICING = {
    model: {
        "input": usd_in / _TOKENS_PER_MILLION,
        "output": usd_out / _TOKENS_PER_MILLION,
    }
    for model, (usd_in, usd_out) in _USD_PER_MILLION_TOKENS.items()
}

# ---------------------------------------------------------------------------
# Prompt
# ---------------------------------------------------------------------------
# Prompt sent for every selected function. "{code}" is the only placeholder
# and is filled with str.format(); any literal brace added to this text later
# must be doubled ("{{" / "}}") or formatting will fail.
PROMPT_TEMPLATE = """You are an expert in scientific computing and computational chemistry/biology/physics. Please create a high-quality programming problem inspired by the following code snippet from a real scientific computing project.
The problem should focus on scientific computing concepts such as:
- Numerical algorithms and simulations
- Data analysis and visualization
- Mathematical modeling
- Scientific data processing
- Computational methods in chemistry, biology, or physics
Code snippet for inspiration:
```python
{code}
```
Present your output in two distinct sections:
[Problem Description]
Create a **completely self-contained** problem description that:
- Does NOT directly reference the code snippet above
- Provides all necessary context and background
- Clearly states what needs to be implemented
- Specifies input/output format and constraints
- Is inspired by the scientific computing concepts in the code but creates a NEW, interesting problem
- Assumes common programming knowledge but explains any domain-specific concepts
[Solution]
Provide a comprehensive, **correct** Python solution that:
- Accurately solves the problem described
- Includes clear comments explaining the approach
- Uses appropriate scientific computing libraries (numpy, scipy, etc.) when relevant
- Is complete and runnable
- Follows best practices for scientific computing
Remember: The problem should be INSPIRED by the code, not a direct copy. Create something educational and interesting for scientific computing practitioners."""
class OpenAIClient:
    """Client for the OpenAI chat-completions API with cost tracking.

    Every successful request adds its token counts and estimated cost (from
    the module-level ``PRICING`` table) to running totals kept on the instance.
    """

    def __init__(self, model_name: str = MODEL_NAME, api_key: Optional[str] = None):
        """Initialize OpenAI API client.

        Args:
            model_name: Name of the OpenAI model to use
            api_key: OpenAI API key (if None, will use OPENAI_API_KEY env variable)
        """
        self.model_name = model_name
        self.client = OpenAI(api_key=api_key)

        # Per-token prices; unknown models fall back to gpt-4o-mini prices, so
        # cost figures for such models are only a rough estimate.
        prices = PRICING.get(model_name)
        if prices is None:
            print(f"Warning: No pricing info for {model_name}, using gpt-4o-mini prices")
            prices = PRICING["gpt-4o-mini"]
        self.input_price = prices["input"]
        self.output_price = prices["output"]

        # Running statistics across all successful requests
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_requests = 0
        self.total_cost = 0.0

    @staticmethod
    def _is_rate_limit_error(error: Exception) -> bool:
        """Return True when *error* looks like an HTTP 429 / rate-limit failure."""
        # Prefer the structured status code when the exception carries one;
        # fall back to inspecting the message text otherwise.
        if getattr(error, "status_code", None) == 429:
            return True
        message = str(error)
        return "rate_limit" in message.lower() or "429" in message

    def _record_usage(self, usage) -> Dict:
        """Add one request's token usage to the totals and return its breakdown.

        A missing ``usage`` object (or missing counts) is treated as zero
        tokens so that an already-completed response is never lost because of
        an accounting error.
        """
        input_tokens = getattr(usage, "prompt_tokens", None) or 0
        output_tokens = getattr(usage, "completion_tokens", None) or 0

        input_cost = input_tokens * self.input_price
        output_cost = output_tokens * self.output_price
        request_cost = input_cost + output_cost

        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        self.total_requests += 1
        self.total_cost += request_cost

        return {
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'total_tokens': input_tokens + output_tokens,
            'input_cost': input_cost,
            'output_cost': output_cost,
            'request_cost': request_cost
        }

    def generate_content(self, prompt: str, max_retries: int = 3) -> Tuple[str, Dict]:
        """Generate content using OpenAI API and track usage.

        Args:
            prompt: The prompt to send to the API
            max_retries: Maximum number of attempts when rate limit errors
                occur; values below 1 are treated as a single attempt

        Returns:
            Tuple of (response_text, usage_info)
            usage_info contains: input_tokens, output_tokens, total_tokens,
            input_cost, output_cost, request_cost.
            response_text is an empty string when the API returns no text.

        Raises:
            Exception: Whatever the API call raised, once it is not a
                rate-limit error or the attempts are exhausted.
        """
        attempts = max(1, max_retries)
        for attempt in range(attempts):
            # Only the network call is guarded, so bookkeeping errors are not
            # mistaken for API failures.
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "You are an expert in scientific computing and programming education."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                )
            except Exception as e:
                if self._is_rate_limit_error(e) and attempt < attempts - 1:
                    wait_time = (attempt + 1) * 5  # linear back-off: 5, 10, 15 ... seconds
                    print(f"\n⚠️  Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{attempts}...")
                    time.sleep(wait_time)
                    continue
                # Any other error, or rate limit on the last attempt: give up.
                print(f"\nError generating content: {e}")
                raise

            usage_info = self._record_usage(response.usage)
            text = response.choices[0].message.content
            # The API may return no text content; keep the str contract.
            return (text if text is not None else ""), usage_info

        # Defensive: the loop above always returns or raises.
        raise RuntimeError(f"Failed after {attempts} retries")

    def get_total_usage(self) -> Dict:
        """Get total usage statistics.

        Returns:
            Dictionary with total usage information
        """
        return {
            'total_requests': self.total_requests,
            'total_input_tokens': self.total_input_tokens,
            'total_output_tokens': self.total_output_tokens,
            'total_tokens': self.total_input_tokens + self.total_output_tokens,
            'total_cost': self.total_cost
        }

    def print_usage_summary(self, max_budget: Optional[float] = None):
        """Print a summary of API usage and costs.

        Args:
            max_budget: Budget in USD used for the "Budget Remaining" line.
                Defaults to the module-level MAX_BUDGET_USD when omitted.
        """
        budget = MAX_BUDGET_USD if max_budget is None else max_budget
        usage = self.get_total_usage()
        print("\n" + "="*70)
        print("API USAGE SUMMARY")
        print("="*70)
        print(f"Model:                 {self.model_name}")
        print(f"Total Requests:        {usage['total_requests']}")
        print(f"Total Input Tokens:    {usage['total_input_tokens']:,}")
        print(f"Total Output Tokens:   {usage['total_output_tokens']:,}")
        print(f"Total Tokens:          {usage['total_tokens']:,}")
        print(f"\nTotal Cost:            ${usage['total_cost']:.6f}")
        print(f"Budget Remaining:      ${budget - usage['total_cost']:.6f}")
        print("="*70)
def _parse_relevance_score(raw) -> int:
    """Convert a CSV relevance-score cell to an int, returning 0 if unusable.

    Accepts integer text ("75") as well as float-formatted text ("75.0"),
    which is truncated toward zero. Missing, empty, non-numeric, NaN and
    infinite values all yield 0.
    """
    try:
        return int(raw)
    except (ValueError, TypeError):
        pass
    try:
        return int(float(raw))
    except (ValueError, TypeError, OverflowError):
        return 0


def process_function_dataset(
    input_file: str,
    output_file: str,
    min_score: int = MIN_RELEVANCE_SCORE,
    max_budget: float = MAX_BUDGET_USD,
    max_samples: Optional[int] = None,
    start_from: int = 0,
    model_name: str = MODEL_NAME
):
    """Process function dataset and generate programming problems.

    Reads the CSV row by row, skips rows that are below ``min_score`` or have
    little/no code, sends the rest to the model, and appends one JSON object
    per generated problem to ``output_file``. Each result is flushed as soon
    as it is written, so an interrupted run keeps everything produced so far.

    Args:
        input_file: Path to function_dataset_v2.csv
        output_file: Path to output JSONL file (always opened in append mode)
        min_score: Minimum relevance score to process
        max_budget: Maximum budget in USD. Checked before each request, so the
            final total may exceed it by at most one request's cost.
        max_samples: Maximum number of samples to process (None or 0 for all)
        start_from: Skip first N rows (for resuming)
        model_name: OpenAI model to use

    Returns:
        Number of problems successfully generated and written.
    """
    print("Starting programming problem generation with OpenAI...")
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print(f"Model: {model_name}")
    print(f"Min Relevance Score: {min_score}")
    print(f"Max Budget: ${max_budget:.2f}")
    if max_samples:
        print(f"Max Samples: {max_samples}")
    print(f"Starting from row: {start_from}")
    print()

    # Initialize OpenAI client
    client = OpenAIClient(model_name=model_name)

    # Statistics
    total_rows = 0
    processed = 0
    skipped_low_score = 0
    skipped_no_code = 0
    errors = 0

    try:
        # newline='' lets the csv module handle line endings itself, which
        # matters because quoted fields contain multi-line source code.
        # The output is appended to so that resumed or repeated runs never
        # overwrite earlier results.
        with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
             open(output_file, 'a', encoding='utf-8') as outfile:
            reader = csv.DictReader(infile)
            for row in reader:
                total_rows += 1

                # Skip if resuming
                if total_rows <= start_from:
                    continue

                # Check if we've reached max samples
                if max_samples and processed >= max_samples:
                    print(f"\nReached max samples ({max_samples}). Stopping.")
                    break

                # Check budget
                if client.total_cost >= max_budget:
                    print(f"\n⚠️  Budget limit reached (${client.total_cost:.6f} >= ${max_budget:.2f})")
                    print(f"Stopping at row {total_rows}")
                    break

                # Filter by relevance score
                relevance_score = _parse_relevance_score(row.get('relevance_score', 0))
                if relevance_score < min_score:
                    skipped_low_score += 1
                    continue

                # Get function content. DictReader fills fields missing from a
                # short row with None, so guard before calling .strip().
                function_content = (row.get('function_content') or '').strip()
                if len(function_content) < 50:
                    skipped_no_code += 1
                    continue

                # Prepare metadata
                metadata = {
                    'original_index': row.get('original_index'),
                    'function_name': row.get('function_name'),
                    'repo_name': row.get('repo_name'),
                    'path': row.get('path'),
                    'language': row.get('language'),
                    'relevance_score': relevance_score,
                    'function_start_line': row.get('function_start_line'),
                    'function_end_line': row.get('function_end_line'),
                }

                # Generate prompt
                prompt = PROMPT_TEMPLATE.format(code=function_content)

                # Call API
                try:
                    print(f"Processing row {total_rows} (score={relevance_score}, func={metadata['function_name']})...", end=' ')
                    response_text, usage_info = client.generate_content(prompt)
                    print(f"✓ (${usage_info['request_cost']:.6f}, {usage_info['total_tokens']} tokens)")

                    # Save result
                    result = {
                        'metadata': metadata,
                        'prompt': prompt,
                        'response': response_text,
                        'usage': usage_info,
                        'model': model_name,
                        'timestamp': datetime.now().isoformat(),
                        'row_number': total_rows
                    }
                    outfile.write(json.dumps(result, ensure_ascii=False) + '\n')
                    outfile.flush()  # Ensure data is written immediately
                    processed += 1

                    # Print periodic summary
                    if processed % 10 == 0:
                        print(f"\n--- Progress: {processed} problems generated, ${client.total_cost:.6f} spent ---\n")
                except Exception as e:
                    print(f"✗ Error: {e}")
                    errors += 1
                    # Five failures before the first success almost certainly
                    # means a configuration problem, so stop early.
                    if errors >= 5 and processed == 0:
                        print("\n⚠️  Too many errors at the beginning. Please check your API key and configuration.")
                        break
                    continue
    except KeyboardInterrupt:
        print("\n\n⚠️  Interrupted by user.")

    # Final summary
    print("\n" + "="*70)
    print("PROCESSING COMPLETE")
    print("="*70)
    print(f"Total rows read:           {total_rows}")
    print(f"Successfully processed:    {processed}")
    print(f"Skipped (low score):       {skipped_low_score}")
    print(f"Skipped (no/short code):   {skipped_no_code}")
    print(f"Errors:                    {errors}")
    client.print_usage_summary()
    if max_budget != MAX_BUDGET_USD:
        # print_usage_summary() called without arguments measures against the
        # module-level MAX_BUDGET_USD; also show the figure for this run's limit.
        print(f"Budget Remaining (of ${max_budget:.2f} limit): ${max_budget - client.total_cost:.6f}")
    print(f"\nResults saved to: {output_file}")
    return processed
def _build_arg_parser():
    """Create the command-line parser for this script."""
    import argparse

    cli = argparse.ArgumentParser(
        description='Generate programming problems from function dataset using OpenAI API'
    )
    cli.add_argument(
        '--input',
        default='function_dataset_v2.csv',
        help='Input CSV file (default: function_dataset_v2.csv)',
    )
    cli.add_argument(
        '--output',
        default='programming_problems_openai.jsonl',
        help='Output JSONL file (default: programming_problems_openai.jsonl)',
    )
    cli.add_argument(
        '--model',
        default=MODEL_NAME,
        choices=[
            # Most commonly used models (recommended)
            'gpt-4o-mini', 'gpt-4o',
            # GPT-4.1 series
            'gpt-4.1', 'gpt-4.1-mini', 'gpt-4.1-nano',
            # GPT-5 series
            'gpt-5', 'gpt-5.1', 'gpt-5.2', 'gpt-5-mini', 'gpt-5-nano',
            # Specialized models
            'gpt-4o-2024-05-13', 'gpt-realtime', 'gpt-audio',
        ],
        help=f'OpenAI model to use (default: {MODEL_NAME}). Recommended: gpt-4o-mini for cost-effectiveness, gpt-4o for quality',
    )
    cli.add_argument(
        '--min-score',
        type=int,
        default=MIN_RELEVANCE_SCORE,
        help=f'Minimum relevance score (default: {MIN_RELEVANCE_SCORE})',
    )
    cli.add_argument(
        '--max-budget',
        type=float,
        default=MAX_BUDGET_USD,
        help=f'Maximum budget in USD (default: {MAX_BUDGET_USD})',
    )
    cli.add_argument(
        '--max-samples',
        type=int,
        default=None,
        help='Maximum number of samples to process (default: no limit)',
    )
    cli.add_argument(
        '--start-from',
        type=int,
        default=0,
        help='Start from row N (for resuming, default: 0)',
    )
    return cli


def main() -> None:
    """Script entry point: parse arguments, check prerequisites, run the job.

    Exits with status 1 when the input file or the OPENAI_API_KEY environment
    variable is missing, or when processing raises; exits with status 0 on a
    keyboard interrupt.
    """
    options = _build_arg_parser().parse_args()

    # The input CSV must exist before any API work starts.
    if not os.path.exists(options.input):
        print(f"Error: Input file not found: {options.input}")
        sys.exit(1)

    # The OpenAI client reads its key from the environment.
    if not os.getenv('OPENAI_API_KEY'):
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Please set it with: export OPENAI_API_KEY='your-api-key'")
        sys.exit(1)

    try:
        process_function_dataset(
            input_file=options.input,
            output_file=options.output,
            min_score=options.min_score,
            max_budget=options.max_budget,
            max_samples=options.max_samples,
            start_from=options.start_from,
            model_name=options.model,
        )
        print("\n✅ Success!")
    except KeyboardInterrupt:
        print("\n\n⚠️  Interrupted by user. Progress has been saved to output file.")
        print("   You can resume by using --start-from <row_number>")
        sys.exit(0)
    except Exception as exc:
        print(f"\n❌ Error: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()