"""
Merge the res2.csv and dataset_all.csv datasets.

res2.csv layout: ID, JSON list (function information)
dataset_all.csv layout: ID (row[0]), file content (row[1]), other content...

Goal: collect every ID present in res2.csv, extract the matching records
from dataset_all.csv, and combine them into a new dataset.
"""
| |
|
| | import pandas as pd |
| | import json |
| | from tqdm import tqdm |
| | import os |
| |
|
def main(
    res2_path: str = '/home/weifengsun/tangou1/step2/res2.csv',
    dataset_all_path: str = '/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv',
    output_path: str = '/home/weifengsun/tangou1/step2/merged_dataset.csv',
    enhanced_output_path: str = '/home/weifengsun/tangou1/step2/enhanced_dataset.csv',
    chunk_size: int = 100000,
    progress=None,
) -> None:
    """Filter dataset_all.csv by the IDs in res2.csv and attach function info.

    The pipeline runs in three steps:

    1. Collect the set of IDs found in column 0 of ``res2_path`` (read
       without a header row).
    2. Stream ``dataset_all_path`` in chunks, keep the rows whose first
       column is one of those IDs, and write them to ``output_path``
       (the pandas row index is written as the leading, unnamed column).
    3. Re-read ``output_path``, add a ``function_info`` column holding the
       value of column 1 of ``res2_path`` for each row's ID, and write the
       result to ``enhanced_output_path``.

    If no row matches, nothing is written and step 3 is skipped.

    Args:
        res2_path: Header-less CSV; column 0 is the ID, column 1 the
            function information.
        dataset_all_path: CSV with a header row whose first column is the ID.
        output_path: Destination of the filtered rows.
        enhanced_output_path: Destination of the filtered rows plus the
            ``function_info`` column.
        chunk_size: Number of rows pandas reads per chunk.
        progress: Callable invoked as ``progress(iterable, desc=...)`` to
            wrap each chunk iterator. Defaults to ``tqdm``.
    """
    if progress is None:
        progress = tqdm

    print("开始处理数据集合并...")

    # Step 1: collect the IDs present in res2.csv (only column 0 is read).
    print("\n步骤 1: 读取 res2.csv 获取编号列表...")
    res2_ids = set()
    id_reader = pd.read_csv(res2_path, chunksize=chunk_size, header=None, usecols=[0])
    for chunk in progress(id_reader, desc="读取res2.csv"):
        res2_ids.update(chunk[0].tolist())
    print(f"从 res2.csv 中找到 {len(res2_ids)} 个唯一编号")

    # Step 2: stream dataset_all.csv and keep only the rows with a known ID.
    print("\n步骤 2: 从 dataset_all.csv 中筛选匹配的记录...")
    total_rows = 0
    matched_rows = 0
    header_written = False
    dataset_reader = pd.read_csv(dataset_all_path, chunksize=chunk_size, low_memory=False)
    for chunk in progress(dataset_reader, desc="处理dataset_all.csv"):
        total_rows += len(chunk)
        matched_chunk = chunk[chunk.iloc[:, 0].isin(res2_ids)]
        if matched_chunk.empty:
            continue
        matched_rows += len(matched_chunk)
        # The first non-empty chunk creates the file with a header; later
        # chunks are appended without one.
        if header_written:
            matched_chunk.to_csv(output_path, index=True, mode='a', header=False)
        else:
            matched_chunk.to_csv(output_path, index=True, mode='w')
            header_written = True

    # Guard against an input without data rows (would divide by zero).
    match_rate = matched_rows / total_rows * 100 if total_rows else 0.0
    print("\n处理完成!")
    print(f"总共处理行数: {total_rows}")
    print(f"匹配的行数: {matched_rows}")
    print(f"匹配率: {match_rate:.2f}%")
    print(f"输出文件: {output_path}")

    if not header_written:
        # Nothing was written to output_path in this run, so reading it back
        # would either fail or pick up a stale file from a previous run.
        print("\n没有匹配的记录,跳过增强数据集的生成。")
        return

    # Step 3: attach the function information from res2.csv.
    print("\n步骤 3: 创建包含函数信息的增强数据集...")
    print("加载res2.csv函数信息...")
    res2_dict = {}
    info_reader = pd.read_csv(res2_path, chunksize=chunk_size, header=None)
    for chunk in progress(info_reader, desc="加载res2"):
        # A later duplicate ID overrides an earlier one.
        res2_dict.update(zip(chunk[0], chunk[1]))

    print("合并函数信息...")
    # output_path was written with index=True, so its leading column is the
    # saved row index. Restoring it as the index makes iloc[:, 0] the ID
    # column, which is the key function_info must be looked up by.
    merged_df = pd.read_csv(output_path, index_col=0, low_memory=False)
    merged_df['function_info'] = merged_df.iloc[:, 0].map(res2_dict)
    merged_df.to_csv(enhanced_output_path, index=True)

    print(f"\n增强数据集已保存到: {enhanced_output_path}")
    print(f"增强数据集行数: {len(merged_df)}")
    print(f"增强数据集列数: {len(merged_df.columns)}")

    print("\n数据集前5行预览:")
    print(merged_df.head())
    print("\n增强数据集列名:")
    print(merged_df.columns.tolist())
| |
|
# Run the merge pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
| |
|