DouDou committed on
Commit
89c39b8
·
verified ·
1 Parent(s): 28e5d57

Upload data3/vllm_high.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/vllm_high.py +151 -0
data3/vllm_high.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from openai import AsyncOpenAI
3
+ from tqdm import tqdm # 使用标准 tqdm
4
+ from load_dataset import load_dataset, length_max
5
+ from itertools import islice
6
+ import csv
7
+
8
# Shared async client for an OpenAI-compatible endpoint on localhost:8000
# (presumably a local vLLM server -- confirm). The API key is a dummy string.
client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="none")
12
+
13
# JSON Schema (written as a Python dict) intended for vLLM structured output.
# Per-function fields are declared once; every declared field is required, so
# the "required" list is derived from the same table (dicts keep insertion
# order, which reproduces the original ordering).
_function_properties = {
    "function_name": {
        "type": "string",
        "description": "The function name.",
    },
    "function_start_line": {
        "type": "integer",
        "description": "The starting line number of the function definition (inclusive).",
    },
    "function_end_line": {
        "type": "integer",
        "description": "The ending line number of the function definition (inclusive).",
    },
    "relevance_score": {
        "type": "integer",
        "minimum": 0,
        "maximum": 100,
        "description": "Relevance score (0–100) for scientific/chemistry-related computing. Only include functions with score > 0.",
    },
    "relevance_reason": {
        "type": "string",
        "description": "Explanation of why this function is related to scientific/chemical computing and why it received that score.",
    },
    "doc_start_line": {
        "type": ["integer", "null"],
        "description": "Starting line number of the associated documentation comment, or null if none.",
    },
    "doc_end_line": {
        "type": ["integer", "null"],
        "description": "Ending line number of the associated documentation comment, or null if none.",
    },
}

scientific_func_schema = {
    "type": "array",
    "description": "List of functions related to scientific and especially chemistry-related computing.",
    "items": {
        "type": "object",
        "additionalProperties": False,
        "properties": _function_properties,
        "required": list(_function_properties),
    },
}
63
+
64
+
65
async def process_one(code_file):
    """Send a single prompt to the chat-completions endpoint.

    Args:
        code_file: A ``(prompt, row)`` pair. ``prompt`` is sent as the only
            user message; ``row`` is passed through unchanged so the caller
            can match the reply to its input.

    Returns:
        A ``(row, content)`` tuple, where ``content`` is the message content
        of the first choice in the response.
    """
    prompt, row = code_file
    resp = await client.chat.completions.create(
        model="Qwen3",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=8192,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        frequency_penalty=1.5,
        # Options that are not regular keyword arguments of the client call
        # are forwarded to the server through extra_body.
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {
                "enable_thinking": False,
            },
        },
        # NOTE: schema-constrained output (response_format with
        # scientific_func_schema) is currently disabled; the raw text reply
        # is returned as-is.
    )
    return row, resp.choices[0].message.content
113
+
114
# Append-mode CSV sink shared by the batch workers.
# newline='' is what the csv module requires of files it writes to: the writer
# emits its own line terminators, and without it newlines embedded in quoted
# fields (common in model replies) can be mangled by newline translation.
res_file = open('res2.csv', 'a+', encoding='utf-8', newline='')
writer = csv.writer(res_file)
116
+
117
async def process_batch(batch):
    """Process all items of one batch concurrently, showing a progress bar.

    One task per item is started via ``process_one``. As each task finishes,
    its ``(row, reply)`` pair is written as a CSV row through the module-level
    ``writer``, so rows are written in completion order.

    Args:
        batch: Items accepted by ``process_one``.

    Returns:
        The list of ``process_one`` results, in completion order.
    """
    pending = [asyncio.create_task(process_one(item)) for item in batch]
    finished = []

    progress = tqdm(
        asyncio.as_completed(pending),
        total=len(pending),
        desc="Processing batch",
        unit="req",
        leave=False,
    )
    for next_done in progress:
        outcome = await next_done
        row, reply = outcome[0], outcome[1]
        writer.writerow([row, reply])
        finished.append(outcome)

    return finished
131
+
132
async def process_dataset(dataset_iter, batch_size=200):
    """Process the dataset batch by batch, showing an overall progress bar.

    At most ``ceil(length_max / batch_size)`` batches are drawn from
    ``dataset_iter`` and handed to ``process_batch``. The number of processed
    items is printed and written to ``res.log``.

    Args:
        dataset_iter: Iterable of items accepted by ``process_batch``.
        batch_size: Maximum number of items per batch.

    Returns:
        An empty list: per-item results are not accumulated in this function
        (they are written out while each batch is processed).
    """
    # islice only makes progress on a real iterator. iter() returns an
    # iterator unchanged and keeps a re-iterable input (e.g. a list) from
    # yielding its first batch over and over.
    dataset_iter = iter(dataset_iter)
    results = []
    num_batches = (length_max + batch_size - 1) // batch_size
    amount = 0
    for _ in tqdm(range(num_batches), desc="Overall progress", unit="batch"):
        batch = list(islice(dataset_iter, batch_size))
        if not batch:
            # The source ran out before length_max items were seen; the
            # remaining rounds would only process empty batches.
            break
        batch_results = await process_batch(batch)
        amount += len(batch_results)
    print("处理完成,共获得结果条数:", amount)
    with open("res.log", "w", encoding="utf-8") as f:
        f.write(str(amount))
    return results
146
+
147
if __name__ == "__main__":
    try:
        dataset_iter = load_dataset()
        final_results = asyncio.run(process_dataset(dataset_iter, batch_size=64))
        # NOTE(review): process_dataset appears to return an empty list, so
        # this count is expected to be 0; the real total is printed inside it.
        print("处理完成,共获得结果条数:", len(final_results))
    finally:
        # Close (and thereby flush) the CSV output even if loading or
        # processing fails or is interrupted.
        res_file.close()