SciCode
/

dataset-builder

Model card Files Files and versions

xet

Community

DouDou committed on Feb 19

Commit

89c39b8

verified ·

1 Parent(s): 28e5d57

Upload data3/vllm_high.py with huggingface_hub

Browse files

Files changed (1) hide show

data3/vllm_high.py +151 -0

data3/vllm_high.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import asyncio
+from openai import AsyncOpenAI
+from tqdm import tqdm  # 使用标准 tqdm
+from load_dataset import load_dataset, length_max
+from itertools import islice
+import csv
# Shared async client for the OpenAI-compatible endpoint served on localhost:8000.
# NOTE(review): the "none" API key is a placeholder; presumably the local server
# does not validate it -- confirm against the server configuration.
client = AsyncOpenAI(api_key="none", base_url="http://localhost:8000/v1")
# JSON Schema (expressed as a Python dict) intended for use with vLLM.
def _build_scientific_func_schema():
    """Assemble the JSON Schema for a list of scientific-function records."""

    def field(json_type, description, **constraints):
        # Key order matches a hand-written entry: type, constraints, description.
        return {"type": json_type, **constraints, "description": description}

    properties = {
        "function_name": field("string", "The function name."),
        "function_start_line": field(
            "integer",
            "The starting line number of the function definition (inclusive).",
        ),
        "function_end_line": field(
            "integer",
            "The ending line number of the function definition (inclusive).",
        ),
        "relevance_score": field(
            "integer",
            "Relevance score (0–100) for scientific/chemistry-related computing. Only include functions with score > 0.",
            minimum=0,
            maximum=100,
        ),
        "relevance_reason": field(
            "string",
            "Explanation of why this function is related to scientific/chemical computing and why it received that score.",
        ),
        "doc_start_line": field(
            ["integer", "null"],
            "Starting line number of the associated documentation comment, or null if none.",
        ),
        "doc_end_line": field(
            ["integer", "null"],
            "Ending line number of the associated documentation comment, or null if none.",
        ),
    }

    return {
        "type": "array",
        "description": "List of functions related to scientific and especially chemistry-related computing.",
        "items": {
            "type": "object",
            "additionalProperties": False,
            "properties": properties,
            # Every property is mandatory, in declaration order.
            "required": list(properties),
        },
    }


scientific_func_schema = _build_scientific_func_schema()
async def process_one(code_file):
    """Send a single prompt to the chat-completions endpoint and return the reply.

    Args:
        code_file: A ``(prompt, row)`` pair. ``prompt`` is sent as the only
            user message; ``row`` is passed through unchanged so the caller
            can associate the reply with its source record.

    Returns:
        tuple: ``(row, content)`` where ``content`` is the message content of
        the first choice in the response.
    """
    prompt, row = code_file
    resp = await client.chat.completions.create(
        model="Qwen3",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=8192,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        frequency_penalty=1.5,
        # Extra request-body fields that are not named parameters of create().
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {
                "enable_thinking": False,
            },
        },
        # NOTE: structured output is currently disabled. To constrain replies
        # to the schema defined in this module, pass
        #   response_format={"type": "json_schema", "json_schema": {
        #       "name": "scientific_functions_analysis",
        #       "schema": scientific_func_schema, "strict": True}}
    )
    # The raw text is returned as-is; any parsing is left to downstream code.
    return row, resp.choices[0].message.content
# Append-mode CSV sink shared by all batches; one row per completed request.
# newline="" is required by the csv module: the writer emits its own "\r\n"
# line terminators, and newline translation by the file object would
# otherwise turn them into "\r\r\n" on platforms that translate "\n".
res_file = open('res2.csv', 'a+', encoding='utf-8', newline='')
writer = csv.writer(res_file)
async def process_batch(batch):
    """Process one batch concurrently while showing a per-batch progress bar.

    Each item of ``batch`` is handed to ``process_one`` as its own task. As
    each task finishes, its ``(row, reply)`` pair is written to the
    module-level CSV ``writer`` and collected.

    Args:
        batch: Iterable of items accepted by ``process_one``.

    Returns:
        list: The ``process_one`` results, in completion order (not input
        order), because they are gathered through ``asyncio.as_completed``.
    """
    pending = []
    for item in batch:
        pending.append(asyncio.create_task(process_one(item)))

    progress = tqdm(
        asyncio.as_completed(pending),
        total=len(pending),
        desc="Processing batch",
        unit="req",
        leave=False,
    )

    finished = []
    for next_done in progress:
        outcome = await next_done
        row, reply = outcome
        writer.writerow([row, reply])
        finished.append(outcome)
    return finished
async def process_dataset(dataset_iter, batch_size=200):
    """Process the whole dataset in batches, showing an overall progress bar.

    At most ``ceil(length_max / batch_size)`` batches are taken from
    ``dataset_iter``; each is handed to ``process_batch``. The total number
    of processed items is printed and written to ``res.log``.

    Args:
        dataset_iter: Iterable (or iterator) of items accepted by
            ``process_batch``.
        batch_size: Maximum number of items processed concurrently per batch.

    Returns:
        list: Always an empty list. Per-item results are intentionally not
        accumulated here; they are written out by ``process_batch`` and only
        their count is tracked.
    """
    # islice() only advances a real iterator. On a re-iterable such as a list
    # it would restart from the first element on every pass, so normalise the
    # input first (a no-op when it already is an iterator).
    dataset_iter = iter(dataset_iter)
    results = []
    num_batches = (length_max + batch_size - 1) // batch_size
    amount = 0
    for _ in tqdm(range(num_batches), desc="Overall progress", unit="batch"):
        batch = list(islice(dataset_iter, batch_size))
        if not batch:
            # Data ran out before length_max was reached; nothing left to do.
            break
        batch_results = await process_batch(batch)
        amount += len(batch_results)
    print("处理完成，共获得结果条数：", amount)
    with open("res.log", "w", encoding="utf-8") as f:
        f.write(str(amount))
    return results
if __name__ == "__main__":
    dataset_iter = load_dataset()
    try:
        # process_dataset prints the processed-item count itself and writes it
        # to res.log. Its return value is an empty list, so printing its
        # length here would always misreport 0 results.
        asyncio.run(process_dataset(dataset_iter, batch_size=64))
    finally:
        # Close (and thereby flush) the CSV output even if the run fails
        # part-way, so rows already written are not lost.
        res_file.close()