File size: 5,073 Bytes

89c39b8

import asyncio
from openai import AsyncOpenAI
from tqdm import tqdm  # 使用标准 tqdm
from load_dataset import load_dataset, length_max  
from itertools import islice
import csv

client = AsyncOpenAI(
    base_url="http://localhost:8000/v1",
    api_key="none"
)

# 定义给 vLLM 使用的 JSON Schema（Python 字典写法）
scientific_func_schema = {
    "type": "array",
    "description": "List of functions related to scientific and especially chemistry-related computing.",
    "items": {
        "type": "object",
        "additionalProperties": False,
        "properties": {
            "function_name": {
                "type": "string",
                "description": "The function name."
            },
            "function_start_line": {
                "type": "integer",
                "description": "The starting line number of the function definition (inclusive)."
            },
            "function_end_line": {
                "type": "integer",
                "description": "The ending line number of the function definition (inclusive)."
            },
            "relevance_score": {
                "type": "integer",
                "minimum": 0,
                "maximum": 100,
                "description": "Relevance score (0–100) for scientific/chemistry-related computing. Only include functions with score > 0."
            },
            "relevance_reason": {
                "type": "string",
                "description": "Explanation of why this function is related to scientific/chemical computing and why it received that score."
            },
            "doc_start_line": {
                "type": ["integer", "null"],
                "description": "Starting line number of the associated documentation comment, or null if none."
            },
            "doc_end_line": {
                "type": ["integer", "null"],
                "description": "Ending line number of the associated documentation comment, or null if none."
            }
        },
        "required": [
            "function_name",
            "function_start_line",
            "function_end_line",
            "relevance_score",
            "relevance_reason",
            "doc_start_line",
            "doc_end_line"
        ]
    }
}


async def process_one(code_file):
    prompt, row = code_file
    """处理单条 prompt"""
    resp = await client.chat.completions.create(
        model="Qwen3",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=8192,
        temperature=0.7,
        top_p=0.8,
        presence_penalty=1.5,
        frequency_penalty=1.5,
        
        extra_body={
            "top_k": 20,
            "chat_template_kwargs": {
                "enable_thinking": False,
            },

        },
        # response_format={
        #     "type": "json_schema",
        #     "json_schema": {
        #         "name": "scientific_functions_analysis",
        #         "schema": scientific_func_schema,
        #         "strict": True
        #     },
        # },
        # response_format={
        #     "type": "json_schema",
        #     "json_schema": {
        #         "name": "scientific_functions_analysis",
        #         "schema": {
        #             'type': 'array',
        #             },
        #         "strict": True
        #     },
        # },
    )

    content = resp.choices[0].message.content
    # if 'true' in content[:6].lower() or 'true' in content[-6:].lower():
        # res = True
    # else:
        # res = False
    
    res = content

    return row, res

res_file = open('res2.csv', 'a+', encoding='utf-8')
writer = csv.writer(res_file)

async def process_batch(batch):
    """并发处理一个 batch，同时显示进度条"""
    # print(batch[0])
    tasks = [asyncio.create_task(process_one(p)) for p in batch]
    results = []

    for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing batch", unit="req", leave=False):
        result = await f
        # if result[1] == True:
            # writer.writerow([result[0]])
        writer.writerow([result[0], result[1]])
        results.append(result)

    return results

async def process_dataset(dataset_iter, batch_size=200):
    """按 batch_size 分批处理整个数据集，显示整体进度条"""
    results = []
    num_batches = (length_max + batch_size - 1) // batch_size
    amount = 0
    for i in tqdm(range(num_batches), desc="Overall progress", unit="batch"):
        batch = list(islice(dataset_iter, batch_size))
        batch_results = await process_batch(batch)
        amount += len(batch_results)
        # results.extend(batch_results)
    print("处理完成，共获得结果条数：", amount)
    with open("res.log", "w", encoding="utf-8") as f:
        f.write(str(amount))
    return results

if __name__ == "__main__":
    dataset_iter = load_dataset()
    final_results = asyncio.run(process_dataset(dataset_iter, batch_size=64))
    print("处理完成，共获得结果条数：", len(final_results))
    res_file.close()