DouDou committed on
Commit
8d5d80e
·
verified ·
1 Parent(s): 0003466

Upload data3/generate_problems_openai.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/generate_problems_openai.py +506 -0
data3/generate_problems_openai.py ADDED
@@ -0,0 +1,506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate programming problems from function_dataset_v2.csv using OpenAI API.
4
+ Filters by relevance score and controls API cost.
5
+ """
6
+
7
+ import csv
8
+ import json
9
+ import os
10
+ import sys
11
+ from openai import OpenAI
12
+ from datetime import datetime
13
+ from typing import Dict, Optional, Tuple
14
+ import time
15
+
16
+
17
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
# Default model: the cost-effective choice; switch to "gpt-4o" for better quality.
MODEL_NAME = "gpt-4o-mini"
# Rows whose relevance score is below this value are not sent to the API.
MIN_RELEVANCE_SCORE = 60
# Spending ceiling for one run, in USD.
MAX_BUDGET_USD = 10.0

# ---------------------------------------------------------------------------
# OpenAI pricing
# ---------------------------------------------------------------------------
# Official pricing: https://openai.com/api/pricing/
# The figures below are the ones recorded by the original author (noted as
# "Dec 2024"); re-check them against the official page before relying on the
# cost estimates.
_TOKENS_PER_MILLION = 1_000_000

# model name -> (input USD per 1M tokens, output USD per 1M tokens)
_USD_PER_MILLION_TOKENS = {
    # GPT-5 series
    "gpt-5.2": (1.75, 14.00),
    "gpt-5.1": (1.25, 10.00),
    "gpt-5": (1.25, 10.00),
    "gpt-5-mini": (0.25, 2.00),
    "gpt-5-nano": (0.05, 0.40),
    # GPT-5 Pro series
    "gpt-5.2-pro": (21.00, 168.00),
    "gpt-5-pro": (15.00, 120.00),
    # GPT-4.1 series
    "gpt-4.1": (2.00, 8.00),
    "gpt-4.1-mini": (0.40, 1.60),
    "gpt-4.1-nano": (0.10, 0.40),
    # GPT-4o series (currently available)
    "gpt-4o": (2.50, 10.00),
    "gpt-4o-2024-05-13": (5.00, 15.00),
    "gpt-4o-mini": (0.15, 0.60),
    # Realtime and Audio models
    "gpt-realtime": (4.00, 16.00),
    "gpt-realtime-mini": (0.60, 2.40),
    "gpt-audio": (2.50, 10.00),
    "gpt-audio-mini": (0.60, 2.40),
}

# model name -> {"input": USD per single token, "output": USD per single token}
PRICING = {
    model: {
        "input": usd_in / _TOKENS_PER_MILLION,
        "output": usd_out / _TOKENS_PER_MILLION,
    }
    for model, (usd_in, usd_out) in _USD_PER_MILLION_TOKENS.items()
}

# Prompt sent for every selected function; `{code}` is the only placeholder and
# is filled with the function source via str.format().
PROMPT_TEMPLATE = """You are an expert in scientific computing and computational chemistry/biology/physics. Please create a high-quality programming problem inspired by the following code snippet from a real scientific computing project.

The problem should focus on scientific computing concepts such as:
- Numerical algorithms and simulations
- Data analysis and visualization
- Mathematical modeling
- Scientific data processing
- Computational methods in chemistry, biology, or physics

Code snippet for inspiration:
```python
{code}
```

Present your output in two distinct sections:

[Problem Description]
Create a **completely self-contained** problem description that:
- Does NOT directly reference the code snippet above
- Provides all necessary context and background
- Clearly states what needs to be implemented
- Specifies input/output format and constraints
- Is inspired by the scientific computing concepts in the code but creates a NEW, interesting problem
- Assumes common programming knowledge but explains any domain-specific concepts

[Solution]
Provide a comprehensive, **correct** Python solution that:
- Accurately solves the problem described
- Includes clear comments explaining the approach
- Uses appropriate scientific computing libraries (numpy, scipy, etc.) when relevant
- Is complete and runnable
- Follows best practices for scientific computing

Remember: The problem should be INSPIRED by the code, not a direct copy. Create something educational and interesting for scientific computing practitioners."""
134
+
135
+
136
class OpenAIClient:
    """Client for OpenAI API with cost tracking.

    Wraps the chat-completions endpoint, retries on rate-limit errors and
    accumulates token counts and estimated cost across all successful calls.
    """

    def __init__(
        self,
        model_name: str = MODEL_NAME,
        api_key: Optional[str] = None,
        max_budget: float = MAX_BUDGET_USD,
    ):
        """Initialize OpenAI API client.

        Args:
            model_name: Name of the OpenAI model to use
            api_key: OpenAI API key (if None, will use OPENAI_API_KEY env variable)
            max_budget: Budget in USD that the usage summary reports against.
                Stored as ``self.max_budget``; callers may overwrite the
                attribute later.
        """
        self.model_name = model_name
        self.max_budget = max_budget
        self.client = OpenAI(api_key=api_key)

        # Get pricing for the model; unknown models fall back to gpt-4o-mini
        # prices so cost tracking keeps working (with a warning).
        pricing = PRICING.get(model_name)
        if pricing is None:
            print(f"Warning: No pricing info for {model_name}, using gpt-4o-mini prices")
            pricing = PRICING["gpt-4o-mini"]
        self.input_price = pricing["input"]
        self.output_price = pricing["output"]

        # Statistics
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self.total_requests = 0
        self.total_cost = 0.0

    @staticmethod
    def _is_rate_limit_error(error: Exception) -> bool:
        """Return True when *error* looks like an HTTP 429 / rate-limit failure."""
        # Prefer the structured status code when the exception exposes one;
        # keep the original message matching as a fallback.
        if getattr(error, "status_code", None) == 429:
            return True
        message = str(error)
        return "rate_limit" in message.lower() or "429" in message

    def _record_usage(self, usage) -> Dict:
        """Add one request's token usage to the running totals and return its breakdown."""
        # `usage` (and its fields) may be missing on some responses; count
        # those as zero tokens instead of raising AttributeError/TypeError.
        input_tokens = getattr(usage, "prompt_tokens", 0) or 0
        output_tokens = getattr(usage, "completion_tokens", 0) or 0

        # Calculate cost
        input_cost = input_tokens * self.input_price
        output_cost = output_tokens * self.output_price
        request_cost = input_cost + output_cost

        # Update totals
        self.total_input_tokens += input_tokens
        self.total_output_tokens += output_tokens
        self.total_requests += 1
        self.total_cost += request_cost

        return {
            'input_tokens': input_tokens,
            'output_tokens': output_tokens,
            'total_tokens': input_tokens + output_tokens,
            'input_cost': input_cost,
            'output_cost': output_cost,
            'request_cost': request_cost
        }

    def generate_content(self, prompt: str, max_retries: int = 3) -> Tuple[str, Dict]:
        """Generate content using OpenAI API and track usage.

        Args:
            prompt: The prompt to send to the API
            max_retries: Maximum number of attempts; only rate-limit errors
                are retried

        Returns:
            Tuple of (response_text, usage_info)
            usage_info contains: input_tokens, output_tokens, total_tokens,
            input_cost, output_cost, request_cost

        Raises:
            Exception: The API error is re-raised for non-rate-limit failures
                or once the retries are exhausted.
            RuntimeError: If no attempt was made at all (max_retries <= 0).
        """
        for attempt in range(max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[
                        {"role": "system", "content": "You are an expert in scientific computing and programming education."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                )
            except Exception as e:
                # Back off and retry only for rate limits, and only while
                # attempts remain.
                if self._is_rate_limit_error(e) and attempt < max_retries - 1:
                    wait_time = (attempt + 1) * 5  # 5, 10, 15 seconds
                    print(f"\n⚠️ Rate limit hit, waiting {wait_time}s before retry {attempt + 2}/{max_retries}...")
                    time.sleep(wait_time)
                    continue

                # For other errors or if max retries reached, raise the exception
                print(f"\nError generating content: {e}")
                raise

            # Record usage before touching the choices so a billed request is
            # counted even if extracting the text fails.
            usage_info = self._record_usage(response.usage)

            # The message content may be None; return "" to honour the
            # declared str return type.
            return response.choices[0].message.content or "", usage_info

        raise RuntimeError(f"Failed after {max_retries} retries")

    def get_total_usage(self) -> Dict:
        """Get total usage statistics.

        Returns:
            Dictionary with total_requests, total_input_tokens,
            total_output_tokens, total_tokens and total_cost
        """
        return {
            'total_requests': self.total_requests,
            'total_input_tokens': self.total_input_tokens,
            'total_output_tokens': self.total_output_tokens,
            'total_tokens': self.total_input_tokens + self.total_output_tokens,
            'total_cost': self.total_cost
        }

    def print_usage_summary(self, max_budget: Optional[float] = None):
        """Print a summary of API usage and costs.

        Args:
            max_budget: Budget in USD used for the "Budget Remaining" line.
                Defaults to ``self.max_budget``.
        """
        budget = self.max_budget if max_budget is None else max_budget
        usage = self.get_total_usage()
        print("\n" + "="*70)
        print("API USAGE SUMMARY")
        print("="*70)
        print(f"Model: {self.model_name}")
        print(f"Total Requests: {usage['total_requests']}")
        print(f"Total Input Tokens: {usage['total_input_tokens']:,}")
        print(f"Total Output Tokens: {usage['total_output_tokens']:,}")
        print(f"Total Tokens: {usage['total_tokens']:,}")
        print(f"\nTotal Cost: ${usage['total_cost']:.6f}")
        print(f"Budget Remaining: ${budget - usage['total_cost']:.6f}")
        print("="*70)
258
+
259
+
260
def _parse_relevance_score(raw) -> int:
    """Convert a raw CSV relevance score to an int, returning 0 when unusable."""
    try:
        return int(raw)
    except (ValueError, TypeError):
        pass
    # Accept scores written as floats (e.g. "75.0"); the fraction is truncated.
    try:
        return int(float(raw))
    except (ValueError, TypeError, OverflowError):
        return 0


def process_function_dataset(
    input_file: str,
    output_file: str,
    min_score: int = MIN_RELEVANCE_SCORE,
    max_budget: float = MAX_BUDGET_USD,
    max_samples: Optional[int] = None,
    start_from: int = 0,
    model_name: str = MODEL_NAME
):
    """Process function dataset and generate programming problems.

    Reads the CSV row by row, filters rows by relevance score and code length,
    sends each remaining function to the model and appends one JSON line per
    successful response to the output file.

    Args:
        input_file: Path to function_dataset_v2.csv
        output_file: Path to output JSONL file (always opened in append mode)
        min_score: Minimum relevance score to process
        max_budget: Maximum budget in USD
        max_samples: Maximum number of samples to process (None for all)
        start_from: Skip first N rows (for resuming)
        model_name: OpenAI model to use

    Returns:
        Number of problems successfully generated and written.
    """
    print("Starting programming problem generation with OpenAI...")
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print(f"Model: {model_name}")
    print(f"Min Relevance Score: {min_score}")
    print(f"Max Budget: ${max_budget:.2f}")
    if max_samples:
        print(f"Max Samples: {max_samples}")
    print(f"Starting from row: {start_from}")
    print()

    # Initialize OpenAI client
    client = OpenAIClient(model_name=model_name)
    # Expose the budget in effect for this run on the client object (plain
    # attribute assignment; it has no effect if the client does not read it).
    client.max_budget = max_budget

    # Statistics
    total_rows = 0
    processed = 0
    skipped_low_score = 0
    skipped_no_code = 0
    errors = 0

    try:
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields containing embedded newlines.
        # The output is always appended to, so earlier results are kept.
        with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
             open(output_file, 'a', encoding='utf-8') as outfile:

            reader = csv.DictReader(infile)

            for row in reader:
                total_rows += 1

                # Skip if resuming
                if total_rows <= start_from:
                    continue

                # Check if we've reached max samples
                if max_samples and processed >= max_samples:
                    print(f"\nReached max samples ({max_samples}). Stopping.")
                    break

                # Check budget
                if client.total_cost >= max_budget:
                    print(f"\n⚠️ Budget limit reached (${client.total_cost:.6f} >= ${max_budget:.2f})")
                    print(f"Stopping at row {total_rows}")
                    break

                # Filter by relevance score
                relevance_score = _parse_relevance_score(row.get('relevance_score', 0))
                if relevance_score < min_score:
                    skipped_low_score += 1
                    continue

                # Get function content. DictReader fills missing trailing
                # fields with None, so guard before calling .strip().
                function_content = (row.get('function_content') or '').strip()
                if len(function_content) < 50:
                    skipped_no_code += 1
                    continue

                # Prepare metadata
                metadata = {
                    'original_index': row.get('original_index'),
                    'function_name': row.get('function_name'),
                    'repo_name': row.get('repo_name'),
                    'path': row.get('path'),
                    'language': row.get('language'),
                    'relevance_score': relevance_score,
                    'function_start_line': row.get('function_start_line'),
                    'function_end_line': row.get('function_end_line'),
                }

                # Generate prompt
                prompt = PROMPT_TEMPLATE.format(code=function_content)

                # Call API
                try:
                    print(f"Processing row {total_rows} (score={relevance_score}, func={metadata['function_name']})...", end=' ')

                    response_text, usage_info = client.generate_content(prompt)

                    print(f"✓ (${usage_info['request_cost']:.6f}, {usage_info['total_tokens']} tokens)")

                    # Save result
                    result = {
                        'metadata': metadata,
                        'prompt': prompt,
                        'response': response_text,
                        'usage': usage_info,
                        'model': model_name,
                        'timestamp': datetime.now().isoformat(),
                        'row_number': total_rows
                    }

                    outfile.write(json.dumps(result, ensure_ascii=False) + '\n')
                    outfile.flush()  # Ensure data is written immediately

                    processed += 1

                    # Print periodic summary
                    if processed % 10 == 0:
                        print(f"\n--- Progress: {processed} problems generated, ${client.total_cost:.6f} spent ---\n")

                except Exception as e:
                    print(f"✗ Error: {e}")
                    errors += 1

                    # Stop early when nothing has succeeded yet and failures
                    # keep piling up (likely a configuration problem).
                    if errors >= 5 and processed == 0:
                        print("\n⚠️ Too many errors at the beginning. Please check your API key and configuration.")
                        break

                    continue
    except KeyboardInterrupt:
        # Fall through to the summary; everything written so far was flushed.
        print("\n\n⚠️ Interrupted by user.")

    # Final summary
    print("\n" + "="*70)
    print("PROCESSING COMPLETE")
    print("="*70)
    print(f"Total rows read: {total_rows}")
    print(f"Successfully processed: {processed}")
    print(f"Skipped (low score): {skipped_low_score}")
    print(f"Skipped (no/short code): {skipped_no_code}")
    print(f"Errors: {errors}")

    client.print_usage_summary()

    print(f"\nResults saved to: {output_file}")

    return processed
416
+
417
+
418
if __name__ == "__main__":
    # Command-line entry point: parse options, validate the environment, then
    # run the generation pipeline and map the outcome to an exit code.
    import argparse

    cli = argparse.ArgumentParser(
        description='Generate programming problems from function dataset using OpenAI API'
    )
    cli.add_argument('--input', default='function_dataset_v2.csv',
                     help='Input CSV file (default: function_dataset_v2.csv)')
    cli.add_argument('--output', default='programming_problems_openai.jsonl',
                     help='Output JSONL file (default: programming_problems_openai.jsonl)')
    cli.add_argument(
        '--model',
        default=MODEL_NAME,
        choices=[
            # Most commonly used models (recommended)
            'gpt-4o-mini', 'gpt-4o',
            # GPT-4.1 series
            'gpt-4.1', 'gpt-4.1-mini', 'gpt-4.1-nano',
            # GPT-5 series
            'gpt-5', 'gpt-5.1', 'gpt-5.2', 'gpt-5-mini', 'gpt-5-nano',
            # Specialized models
            'gpt-4o-2024-05-13', 'gpt-realtime', 'gpt-audio'
        ],
        help=f'OpenAI model to use (default: {MODEL_NAME}). Recommended: gpt-4o-mini for cost-effectiveness, gpt-4o for quality'
    )
    cli.add_argument('--min-score', type=int, default=MIN_RELEVANCE_SCORE,
                     help=f'Minimum relevance score (default: {MIN_RELEVANCE_SCORE})')
    cli.add_argument('--max-budget', type=float, default=MAX_BUDGET_USD,
                     help=f'Maximum budget in USD (default: {MAX_BUDGET_USD})')
    cli.add_argument('--max-samples', type=int, default=None,
                     help='Maximum number of samples to process (default: no limit)')
    cli.add_argument('--start-from', type=int, default=0,
                     help='Start from row N (for resuming, default: 0)')

    options = cli.parse_args()

    # Fail fast when the input file is missing.
    input_missing = not os.path.exists(options.input)
    if input_missing:
        print(f"Error: Input file not found: {options.input}")
        sys.exit(1)

    # Fail fast when no API key is configured in the environment.
    api_key_present = bool(os.getenv('OPENAI_API_KEY'))
    if not api_key_present:
        print("Error: OPENAI_API_KEY environment variable not set.")
        print("Please set it with: export OPENAI_API_KEY='your-api-key'")
        sys.exit(1)

    try:
        process_function_dataset(
            input_file=options.input,
            output_file=options.output,
            min_score=options.min_score,
            max_budget=options.max_budget,
            max_samples=options.max_samples,
            start_from=options.start_from,
            model_name=options.model,
        )
    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user. Progress has been saved to output file.")
        print(f" You can resume by using --start-from <row_number>")
        sys.exit(0)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        # Runs only when no exception escaped the pipeline.
        print("\n✅ Success!")