File size: 3,385 Bytes
a51dbcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os
import jsonlines
import json

def get_function_scores(dir):
    """Collect the weighted score of every scored function record under ``dir``.

    Every immediate subdirectory of ``dir`` is read in sorted name order. In
    each one, ``readme_summary.json`` supplies a ``score`` value and
    ``functions.jsonl`` supplies the function records. A record contributes
    ``record['score'] * readme_score`` when it has a ``score`` key; records
    without that key are skipped.

    Args:
        dir: Path of the directory whose subdirectories are scanned.

    Returns:
        A flat list with one product per scored record, in subdirectory order
        and then in file order.
    """
    weighted = []
    for entry in sorted(os.listdir(dir)):
        entry_dir = os.path.join(dir, entry)
        # Plain files directly under ``dir`` are not scanned.
        if not os.path.isdir(entry_dir):
            continue

        summary_path = os.path.join(entry_dir, 'readme_summary.json')
        with open(summary_path, 'r', encoding='utf-8', errors='ignore') as handle:
            readme_score = json.load(handle)['score']

        records_path = os.path.join(entry_dir, 'functions.jsonl')
        with jsonlines.open(records_path) as records:
            weighted.extend(
                record['score'] * readme_score
                for record in records
                if 'score' in record
            )
    return weighted

# scores = sorted(get_function_scores('/home/weifengsun/tangou1/step2/step22/dataset'), reverse=True)
# print(len(scores))
# print(scores[:10])
# print(scores[-10:])
# print(scores[100000])
# print(scores[200000])
# print(scores[300000])
# print(scores[400000])
# print(scores[500000])
# 18099531
# [0.28443953109169584, 0.2844296876675756, 0.2825556445220201, 0.2806598113798131, 0.2768595256346984, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837, 0.2727182712631837]
# [-0.024879203269990824, -0.02555222846015559, -0.02579063325241293, -0.02583700829326574, -0.02583700829326574, -0.02587076373841679, -0.025900709478987816, -0.025951737689723586, -0.029216928089614402, -0.04466233910208084]
# 0.15702062139215656
# 0.14540986706855819
# 0.13808880121203515
# 0.13262306995012807
# 0.1282891692796717


def output_scores(dir, output_path, score):
    """Append every function record whose weighted score exceeds ``score``.

    Every immediate subdirectory of ``dir`` is processed in sorted name order.
    ``readme_summary.json`` supplies ``readme_summary`` and ``score``;
    ``functions.jsonl`` supplies the function records. A record is selected
    when it has a ``score`` key and ``record['score'] * readme_score`` is
    strictly greater than ``score``. Each selected record is extended with:

    * ``md_summary``   - the subdirectory's ``readme_summary`` value
    * ``md_score``     - the subdirectory's ``score`` value
    * ``final_score``  - ``record['score'] * md_score``
    * ``code_content`` - lines ``start_line`` through ``end_line`` (1-based,
      inclusive) of the file named by ``record['file']``

    The selected records of each subdirectory are appended to ``output_path``
    as JSON lines. The file is opened in append mode, so running the function
    again adds to whatever is already there.

    Args:
        dir: Path of the directory whose subdirectories are scanned.
        output_path: JSONL file that receives the selected records.
        score: Exclusive lower bound for the weighted score.

    Returns:
        None.
    """
    subdirs = sorted(d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d)))

    # Lines of the most recently read source file. Consecutive records that
    # point at the same file reuse them instead of re-reading the whole file
    # for every single function.
    cached_file = None
    cached_lines = []

    for subdir in subdirs:
        md_path = os.path.join(dir, subdir, 'readme_summary.json')
        with open(md_path, 'r', encoding='utf-8', errors='ignore') as f:
            data = json.load(f)
        md_summary = data['readme_summary']
        md_score = data['score']

        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        contents = []
        with jsonlines.open(json_path) as reader:
            for obj in reader:
                if 'score' not in obj:
                    continue
                # Compute the weighted score once; it is both the filter
                # criterion and the stored ``final_score``.
                final_score = obj['score'] * md_score
                if not (final_score > score):
                    continue

                obj['md_summary'] = md_summary
                obj['md_score'] = md_score
                obj['final_score'] = final_score

                source_file = obj['file']
                if source_file != cached_file:
                    with open(source_file, 'r', encoding='utf-8', errors='ignore') as src:
                        cached_lines = src.readlines()
                    cached_file = source_file
                # start_line is 1-based, hence the -1; end_line is inclusive.
                obj['code_content'] = ''.join(cached_lines[obj['start_line'] - 1:obj['end_line']])
                contents.append(obj)

        # Written once per subdirectory so finished subdirectories are already
        # on disk if a later one fails.
        with jsonlines.open(output_path, 'a', flush=True) as writer:
            writer.write_all(contents)

# Export the filtered functions for the whole dataset. The threshold is the
# value recorded in the notes above for scores[500000] of the descending-sorted
# weighted scores.
output_scores(
    dir='/home/weifengsun/tangou1/step2/step22/dataset',
    output_path='/home/weifengsun/tangou1/step2/step22/output/function_filtered_scores.jsonl',
    score=0.1282891692796717,
)
# with jsonlines.open('/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl', 'r') as reader:
#     print(len(list(reader)))
#     # 500000


# path = "/home/weifengsun/tangou1/step2/step22/function_filtered_scores.jsonl"
# size_bytes = os.path.getsize(path)

# size_mb = size_bytes / (1024 * 1024)
# size_gb = size_bytes / (1024 * 1024 * 1024)

# print(f"文件大小: {size_mb:.2f} MB")
# print(f"文件大小: {size_gb:.2f} GB")