"""Score functions by README quality and export the highest-scoring ones.

The dataset directory is expected to contain one subdirectory per entry, each
holding:

* ``readme_summary.json`` -- a JSON object with ``readme_summary`` and
  ``score`` keys;
* ``functions.jsonl`` -- one JSON object per line; records that carry a
  ``score`` key are combined with the README score.

A function's final score is ``function score * README score``.
"""

import functools
import json
import os

import jsonlines

README_SUMMARY_FILE = 'readme_summary.json'
FUNCTIONS_FILE = 'functions.jsonl'

DATASET_DIR = '/home/weifengsun/tangou1/step2/step22/dataset'
OUTPUT_PATH = '/home/weifengsun/tangou1/step2/step22/output/function_filtered_scores.jsonl'

# Cut-off used for the export. It is the value of ``scores[500000]`` in the
# descending score list recorded further down, so the strict ``>`` comparison
# in ``output_scores`` selected 500000 records in the recorded run.
SCORE_THRESHOLD = 0.1282891692796717


def _iter_subdirs(dir):
    """Yield ``(readme_data, functions_path)`` for each subdirectory of *dir*.

    Subdirectories are visited in sorted name order. ``readme_data`` is the
    parsed content of the subdirectory's ``readme_summary.json``.

    Raises:
        FileNotFoundError: if a subdirectory has no ``readme_summary.json``.
    """
    for name in sorted(os.listdir(dir)):
        subdir_path = os.path.join(dir, name)
        if not os.path.isdir(subdir_path):
            continue
        md_path = os.path.join(subdir_path, README_SUMMARY_FILE)
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as f:
            readme_data = json.load(f)
        yield readme_data, os.path.join(subdir_path, FUNCTIONS_FILE)


def _iter_scored(functions_path, md_score):
    """Yield ``(obj, final_score)`` for every record that has a ``score`` key.

    ``final_score`` is ``obj['score'] * md_score``. Records without a
    ``score`` key are skipped.
    """
    with jsonlines.open(functions_path) as reader:
        for obj in reader:
            if 'score' in obj:
                yield obj, obj['score'] * md_score


@functools.lru_cache(maxsize=64)
def _read_source_lines(path):
    """Return the lines of *path* as a tuple, undecodable bytes dropped.

    Several selected functions may live in the same source file; the bounded
    cache avoids re-reading that file once per function. A tuple is returned
    so the cached value cannot be mutated by callers.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return tuple(f.readlines())


def get_function_scores(dir):
    """Collect the final score of every scored function under *dir*.

    Args:
        dir: Dataset root; each immediate subdirectory must contain
            ``readme_summary.json`` and ``functions.jsonl``.

    Returns:
        A flat list of ``function score * README score`` values, in sorted
        subdirectory order and, within a subdirectory, in file order.

    Raises:
        FileNotFoundError: if an expected file is missing.
        KeyError: if ``readme_summary.json`` has no ``score`` key.
    """
    scores = []
    for readme_data, functions_path in _iter_subdirs(dir):
        md_score = readme_data['score']
        scores.extend(
            final_score for _, final_score in _iter_scored(functions_path, md_score)
        )
    return scores


# Exploration that produced SCORE_THRESHOLD (kept for reference):
#
# scores = sorted(get_function_scores('/home/weifengsun/tangou1/step2/step22/dataset'), reverse=True)
# print(len(scores))
# print(scores[:10])
# print(scores[-10:])
# print(scores[100000])
# print(scores[200000])
# print(scores[300000])
# print(scores[400000])
# print(scores[500000])
#
# Recorded output:
# 18099531
# [0.28443953109169584, 0.2844296876675756, 0.2825556445220201, 0.2806598113798131, 0.2768595256346984, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837]
# [-0.024879203269990824, -0.02555222846015559, -0.02579063325241293, -0.02583700829326574, -0.02583700829326574, -0.02587076373841679, -0.025900709478987816, -0.025951737689723586, -0.029216928089614402, -0.04466233910208084]
# 0.15702062139215656
# 0.14540986706855819
# 0.13808880121203515
# 0.13262306995012807
# 0.1282891692796717


def output_scores(dir, output_path, score):
    """Append every function whose final score exceeds *score* to a JSONL file.

    Each exported record is the original ``functions.jsonl`` object extended
    with ``md_summary``, ``md_score``, ``final_score`` and ``code_content``
    (the source text sliced out of ``obj['file']``).

    Args:
        dir: Dataset root, laid out as described for ``get_function_scores``.
        output_path: JSONL file to write. It is opened in append mode once
            per subdirectory, so existing content is kept and re-running the
            export adds the records again.
        score: Exclusive lower bound on ``function score * README score``.

    Raises:
        FileNotFoundError: if an expected dataset file or a referenced source
            file is missing.
        KeyError: if a required key is absent from the README summary or
            from a selected record.
    """
    for readme_data, functions_path in _iter_subdirs(dir):
        md_summary = readme_data['readme_summary']
        md_score = readme_data['score']

        contents = []
        for obj, final_score in _iter_scored(functions_path, md_score):
            if final_score > score:
                obj['md_summary'] = md_summary
                obj['md_score'] = md_score
                obj['final_score'] = final_score

                source_lines = _read_source_lines(obj['file'])
                # The slice implies 1-based, inclusive line numbers -- TODO
                # confirm against whatever produces functions.jsonl. Clamp so
                # that start_line <= 0 cannot become a negative index, which
                # would silently count from the end of the file.
                first = max(obj['start_line'] - 1, 0)
                obj['code_content'] = ''.join(source_lines[first:obj['end_line']])
                contents.append(obj)

        # Written per subdirectory, even when nothing was selected, so the
        # output file exists after the first subdirectory is processed.
        with jsonlines.open(output_path, 'a', flush=True) as writer:
            writer.write_all(contents)


if __name__ == '__main__':
    output_scores(DATASET_DIR, OUTPUT_PATH, SCORE_THRESHOLD)


# Sanity checks run on the exported file (kept for reference):
#
# with jsonlines.open('/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl', 'r') as reader:
#     print(len(list(reader)))
# # 500000
#
# path = "/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl"
# size_bytes = os.path.getsize(path)
# size_mb = size_bytes / (1024 * 1024)
# size_gb = size_bytes / (1024 * 1024 * 1024)
# print(f"File size: {size_mb:.2f} MB")
# print(f"File size: {size_gb:.2f} GB")