DouDou committed on
Commit
65df077
·
verified ·
1 Parent(s): 87eecb7

Upload data3/run_batch_gpt5nano.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/run_batch_gpt5nano.sh +195 -0
data3/run_batch_gpt5nano.sh ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# GPT-5-Nano very-large-scale batch generation script - the cheapest option.
#
# Drives generate_problems_batch.py through five interactive steps:
#   1. estimate  - print a cost estimate for the planned number of requests
#   2. prepare   - build the batch request file from the input CSV
#   3. submit    - submit the request file to the OpenAI Batch API
#   4. status    - poll the batch until it reaches a terminal state
#   5. download + process - fetch the raw results and write the final output
#
# Requirements:
#   - OPENAI_API_KEY must be set in the environment.
#   - $INPUT_FILE and generate_problems_batch.py must be in the current
#     working directory; python3 must be on PATH.
#
# Exit status: 0 on success or when the user declines a prompt, 1 on error.

set -euo pipefail

# --- Configuration ----------------------------------------------------------
readonly BUDGET=40
readonly MIN_SCORE=60
readonly MODEL="gpt-5-nano"
readonly INPUT_FILE="function_dataset_v2.csv"
readonly BATCH_REQUESTS_FILE="batch_requests_gpt5nano.jsonl"
readonly BATCH_RESULTS_RAW="batch_results_gpt5nano_raw.jsonl"
readonly FINAL_OUTPUT="programming_problems_gpt5nano.jsonl"
readonly BATCH_ID_FILE="batch_id_gpt5nano.txt"

# Figures passed to the `estimate` subcommand (previously repeated inline).
readonly ESTIMATE_REQUESTS=160000
readonly AVG_INPUT_TOKENS=1917
readonly AVG_OUTPUT_TOKENS=2552

# Monitoring: seconds between status polls, and how many consecutive failed
# polls are tolerated before giving up (so one transient error does not
# abort a monitor that has been running for hours).
readonly POLL_INTERVAL_SECONDS=300
readonly MAX_STATUS_FAILURES=3

#######################################
# Print each argument on its own line to stderr and exit with status 1.
# Arguments: $@ - message lines
#######################################
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

#######################################
# Ask a single-keystroke yes/no question.
# Arguments: $1 - prompt text
# Returns:   0 if the reply is y/Y, non-zero otherwise (including EOF).
#######################################
confirm() {
  local reply=''
  # read returns non-zero at EOF; treat that as "no" instead of letting
  # `set -e` terminate the script silently.
  read -r -n 1 -p "$1" reply || true
  echo ""
  [[ "$reply" =~ ^[Yy]$ ]]
}

#######################################
# Extract a batch ID from the `submit` subcommand's output.
# Prefers the token following "Batch created: "; otherwise falls back to the
# first "batch_..." token anywhere in the text.
# Arguments: $1 - captured submit output
# Outputs:   the batch ID on stdout, or nothing when none is found
#######################################
extract_batch_id() {
  local output=$1
  local created_re='Batch created: ([^[:space:]]+)'
  local id_re='batch_[a-zA-Z0-9_]+'

  if [[ "$output" =~ $created_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  elif [[ "$output" =~ $id_re ]]; then
    printf '%s\n' "${BASH_REMATCH[0]}"
  fi
}

#######################################
# Run the `estimate` subcommand for a given number of requests.
# Arguments: $1 - number of requests
#######################################
run_estimate() {
  python3 generate_problems_batch.py estimate \
    --num-requests "$1" \
    --avg-input-tokens "$AVG_INPUT_TOKENS" \
    --avg-output-tokens "$AVG_OUTPUT_TOKENS" \
    --model "$MODEL"
}

#######################################
# Poll the batch status until it is terminal.
# Returns when the status output contains "Status: completed"; exits the
# script with status 1 on failed / expired / cancelled, or after
# MAX_STATUS_FAILURES consecutive failed status calls.
# Arguments: $1 - batch ID
#######################################
monitor_batch() {
  local batch_id=$1
  local status_output=''
  local timestamp=''
  local failures=0

  while true; do
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    echo ""
    echo "[$timestamp] 检查批处理状态..."

    if status_output=$(python3 generate_problems_batch.py status "$batch_id"); then
      failures=0
    else
      failures=$((failures + 1))
      # Show whatever the failed call managed to print before deciding.
      printf '%s\n' "$status_output"
      if (( failures >= MAX_STATUS_FAILURES )); then
        die "❌ 状态检查连续失败 $failures 次,停止监控" \
          " 稍后可手动运行: python3 generate_problems_batch.py status $batch_id"
      fi
      echo "⚠️ 状态检查失败 ($failures/$MAX_STATUS_FAILURES),5分钟后重试..." >&2
      sleep "$POLL_INTERVAL_SECONDS"
      continue
    fi
    printf '%s\n' "$status_output"

    # Substring matching in the shell itself avoids `echo | grep -q`
    # pipelines, which can report failure under pipefail.
    case "$status_output" in
      *"Status: completed"*)
        echo ""
        echo "✅ 批处理已完成!"
        return 0
        ;;
      *"Status: failed"*)
        echo ""
        die "❌ 批处理失败!请检查错误信息"
        ;;
      *"Status: expired"*)
        echo ""
        die "❌ 批处理已过期(超过24小时)"
        ;;
      *"Status: cancelled"*)
        # Cancelled is terminal too; without this arm the loop never ends.
        echo ""
        die "❌ 批处理已被取消"
        ;;
    esac

    echo "⏳ 批处理仍在进行中,5分钟后再次检查..."
    sleep "$POLL_INTERVAL_SECONDS" # wait 5 minutes
  done
}

#######################################
# Entry point: runs the five steps in order, prompting before each
# irreversible or long-running stage.
#######################################
main() {
  local request_count=''
  local submit_output=''
  local batch_id=''

  echo "🚀 OpenAI Batch API 批量生成 - GPT-5-Nano"
  echo "========================================"
  echo "模型: ${MODEL}"
  echo "Batch API 定价:"
  echo " - 输入: \$0.025/M tokens (50% off \$0.05)"
  echo " - 输出: \$0.20/M tokens (50% off \$0.40)"
  echo ""
  echo "预算: \$${BUDGET}"
  echo "预计可生成: ~160,000+ 个样本"
  echo "相比 gpt-4o-mini: 成本降低 75%"
  echo "========================================"
  echo ""

  # Environment checks
  if [[ ! -f "$INPUT_FILE" ]]; then
    die "❌ 错误: 找不到输入文件 $INPUT_FILE"
  fi

  # ${VAR:-} keeps the check working under `set -u` when the key is unset.
  if [[ -z "${OPENAI_API_KEY:-}" ]]; then
    die "❌ 错误: OPENAI_API_KEY 环境变量未设置" \
      " 请运行: export OPENAI_API_KEY='your-api-key'"
  fi

  # Step 1: estimate cost
  echo "📊 步骤 1/5: 估算预算..."
  echo "----------------------------------------"
  run_estimate "$ESTIMATE_REQUESTS"

  echo ""
  if ! confirm "👉 继续执行? (y/n) "; then
    echo "❌ 已取消"
    exit 0
  fi

  # Step 2: prepare batch requests
  echo ""
  echo "📋 步骤 2/5: 准备批量请求..."
  echo "----------------------------------------"
  python3 generate_problems_batch.py prepare \
    --input "$INPUT_FILE" \
    --output "$BATCH_REQUESTS_FILE" \
    --min-score "$MIN_SCORE" \
    --model "$MODEL"

  # Count the generated requests (one per line). Some wc implementations pad
  # the number with spaces, so strip whitespace before using it quoted.
  request_count=$(wc -l < "$BATCH_REQUESTS_FILE")
  request_count=${request_count//[[:space:]]/}
  echo "✅ 已准备 $request_count 个请求"

  if (( request_count == 0 )); then
    die "❌ 错误: 没有生成任何请求 (请检查 $INPUT_FILE 和 --min-score $MIN_SCORE)"
  fi

  # Re-estimate using the actual number of requests
  echo ""
  echo "💰 根据实际请求数量重新估算..."
  run_estimate "$request_count"

  echo ""
  if ! confirm "👉 继续提交到 OpenAI? (y/n) "; then
    echo "❌ 已取消 (批量请求文件已保存: $BATCH_REQUESTS_FILE)"
    exit 0
  fi

  # Step 3: submit the batch job
  echo ""
  echo "🚀 步骤 3/5: 提交批处理任务到 OpenAI..."
  echo "----------------------------------------"
  submit_output=$(python3 generate_problems_batch.py submit \
    --input "$BATCH_REQUESTS_FILE" \
    --model "$MODEL" \
    --description "Scientific computing problems (GPT-5-Nano) - $request_count samples") || {
    # Surface the captured output; otherwise it is lost when submit fails.
    printf '%s\n' "$submit_output"
    die "❌ 错误: 提交批处理任务失败"
  }

  printf '%s\n' "$submit_output"

  # Extract and persist the batch ID
  batch_id=$(extract_batch_id "$submit_output")

  if [[ -z "$batch_id" ]]; then
    die "❌ 错误: 无法获取 Batch ID" \
      "请手动检查输出并记录 Batch ID"
  fi

  printf '%s\n' "$batch_id" > "$BATCH_ID_FILE"
  echo ""
  echo "✅ Batch ID 已保存到: $BATCH_ID_FILE"
  echo "📝 Batch ID: $batch_id"
  echo ""

  # Step 4: monitor batch status
  echo "⏳ 步骤 4/5: 监控批处理状态..."
  echo "----------------------------------------"
  echo "批处理任务通常在几小时内完成(最多24小时)"
  echo "您可以:"
  echo " 1. 等待脚本自动监控(每5分钟检查一次)"
  echo " 2. 按 Ctrl+C 退出,稍后运行监控命令:"
  echo " python3 generate_problems_batch.py status $batch_id"
  echo ""

  if ! confirm "👉 是否自动监控? (y/n) "; then
    echo "ℹ️ 跳过自动监控"
    echo "稍后请手动检查状态:"
    echo " python3 generate_problems_batch.py status $batch_id"
    echo ""
    echo "完成后运行下载和处理命令:"
    echo " python3 generate_problems_batch.py download $batch_id --output $BATCH_RESULTS_RAW"
    # Includes --model so the hint matches the automated step 5 invocation.
    echo " python3 generate_problems_batch.py process --input $BATCH_RESULTS_RAW --output $FINAL_OUTPUT --model $MODEL --requests $BATCH_REQUESTS_FILE"
    exit 0
  fi

  echo "🔍 开始自动监控..."
  monitor_batch "$batch_id"

  # Step 5: download and process results
  echo ""
  echo "⬇️ 步骤 5/5: 下载和处理结果..."
  echo "----------------------------------------"

  # Download raw results
  python3 generate_problems_batch.py download "$batch_id" \
    --output "$BATCH_RESULTS_RAW"

  # Process raw results into the final output file
  python3 generate_problems_batch.py process \
    --input "$BATCH_RESULTS_RAW" \
    --output "$FINAL_OUTPUT" \
    --model "$MODEL" \
    --requests "$BATCH_REQUESTS_FILE"

  echo ""
  echo "========================================"
  echo "✅ 全部完成!"
  echo "========================================"
  echo "最终结果文件: $FINAL_OUTPUT"
  echo ""
  echo "查看结果:"
  echo " head -1 $FINAL_OUTPUT | python3 -m json.tool"
  echo " wc -l $FINAL_OUTPUT"
  echo ""
  echo "Batch ID: $batch_id (已保存在 $BATCH_ID_FILE)"
  echo "========================================"
}

main "$@"