File size: 3,385 Bytes
a51dbcc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 | import os
import jsonlines
import json
def get_function_scores(dir):
    """Collect the README-weighted score of every scored function under ``dir``.

    Every immediate subdirectory of ``dir`` is read for two files:
    ``readme_summary.json`` (its ``score`` entry is the weight) and
    ``functions.jsonl`` (one record per line).  For each record that has a
    ``score`` key, ``record['score'] * weight`` is added to the result.

    Subdirectories are visited in sorted name order and records in file
    order.  Returns a flat list of the weighted scores.
    """
    weighted = []
    for name in sorted(os.listdir(dir)):
        folder = os.path.join(dir, name)
        # Plain files sitting next to the per-project folders are ignored.
        if not os.path.isdir(folder):
            continue

        summary_path = os.path.join(folder, 'readme_summary.json')
        with open(summary_path, 'r', encoding='utf-8', errors='ignore') as summary_file:
            readme_score = json.load(summary_file)['score']

        functions_path = os.path.join(folder, 'functions.jsonl')
        with jsonlines.open(functions_path) as records:
            weighted += [
                record['score'] * readme_score
                for record in records
                if 'score' in record
            ]
    return weighted
# scores = sorted(get_function_scores('/home/weifengsun/tangou1/step2/step22/dataset'), reverse=True)
# print(len(scores))
# print(scores[:10])
# print(scores[-10:])
# print(scores[100000])
# print(scores[200000])
# print(scores[300000])
# print(scores[400000])
# print(scores[500000])
# Recorded output of the commented-out prints above, in order:
# 18099531
# [0.28443953109169584, 0.2844296876675756, 0.2825556445220201, 0.2806598113798131, 0.2768595256346984, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837]
# [-0.024879203269990824, -0.02555222846015559, -0.02579063325241293, -0.02583700829326574, -0.02583700829326574, -0.02587076373841679, -0.025900709478987816, -0.025951737689723586, -0.029216928089614402, -0.04466233910208084]
# 0.15702062139215656
# 0.14540986706855819
# 0.13808880121203515
# 0.13262306995012807
# 0.1282891692796717
def output_scores(dir, output_path, score):
    """Append functions whose README-weighted score exceeds ``score`` to a JSONL file.

    For each immediate subdirectory of ``dir`` (visited in sorted order) this
    reads ``readme_summary.json`` for its ``readme_summary`` and ``score``
    entries, then scans ``functions.jsonl``.  A record is kept when it has a
    ``score`` key and ``record['score'] * readme score`` is strictly greater
    than ``score``.  Each kept record is extended with:

    * ``md_summary`` / ``md_score`` -- the values from ``readme_summary.json``
    * ``final_score`` -- the weighted score used for filtering
    * ``code_content`` -- lines ``start_line``..``end_line`` (1-based,
      inclusive) of the file named by the record's ``file`` entry

    Args:
        dir: Directory whose subdirectories are scanned.
        output_path: JSONL file opened in append mode; existing content is kept.
        score: Exclusive lower bound on the weighted score.

    Raises:
        OSError: If a summary or a referenced source file cannot be opened.
        KeyError: If a required key is missing from the summary or a kept record.
    """
    subdirs = sorted(d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)))
    for subdir in subdirs:
        md_path = os.path.join(dir, subdir, 'readme_summary.json')
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as summary_file:
            data = json.load(summary_file)
        md_summary = data['readme_summary']
        md_score = data['score']

        # Consecutive records that point at the same source file reuse the
        # lines already read instead of re-reading the whole file per record.
        # Only the most recent file is remembered, so memory stays bounded.
        cached_path = None
        cached_lines = []

        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        contents = []
        with jsonlines.open(json_path) as reader:
            for obj in reader:
                if 'score' not in obj:
                    continue
                final_score = obj['score'] * md_score
                if final_score > score:
                    obj['md_summary'] = md_summary
                    obj['md_score'] = md_score
                    obj['final_score'] = final_score
                    if obj['file'] != cached_path:
                        with open(obj['file'], 'r', encoding='utf-8', errors='ignore') as source_file:
                            cached_lines = source_file.readlines()
                        cached_path = obj['file']
                    # start_line is 1-based, end_line is inclusive.
                    obj['code_content'] = ''.join(cached_lines[obj['start_line']-1:obj['end_line']])
                    contents.append(obj)

        # Written once per subdirectory, so results of finished subdirectories
        # are already on disk if a later one fails.
        with jsonlines.open(output_path, 'a', flush=True) as writer:
            writer.write_all(contents)
# Export every function whose weighted score is above the cutoff.
# NOTE(review): the cutoff equals the last value in the recorded output of the
# commented-out experiment above (presumably scores[500000]) -- confirm.
output_scores(
    '/home/weifengsun/tangou1/step2/step22/dataset',
    '/home/weifengsun/tangou1/step2/step22/output/function_filtered_scores.jsonl',
    0.1282891692796717,
)
# with jsonlines.open('/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl', 'r') as reader:
# print(len(list(reader)))
# # 500000
# path = "/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl"
# size_bytes = os.path.getsize(path)
# size_mb = size_bytes / (1024 * 1024)
# size_gb = size_bytes / (1024 * 1024 * 1024)
# print(f"文件大小: {size_mb:.2f} MB")
# print(f"文件大小: {size_gb:.2f} GB")
|