DouDou commited on
Commit
96eab9a
·
verified ·
1 Parent(s): fe45bd1

Upload data3/load_dataset.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data3/load_dataset.py +75 -0
data3/load_dataset.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import csv

# Raise the csv module's per-field size cap so very large text fields
# (entire source files stored in one CSV cell) can be parsed without
# raising _csv.Error.
# NOTE(review): 10 * 1024 * 1024 * 1024 is 10 GB, not 10 MB as the original
# comment claimed; values above the platform C long can also raise
# OverflowError on some systems — confirm the intended limit.
csv.field_size_limit(10 * 1024 * 1024 * 1024)  # 10GB

# Earlier experimental caps, kept for reference:
# length_max = 300
# length_max = 28906320
# length_max = 8290

# with open('res.csv', 'r', encoding='utf-8') as f:
#     reader = csv.reader(f)
#     id_set = set()
#     for i, row in enumerate(reader):
#         id_set.add(row[0])
#     print(len(id_set))
# Collect the ids (first CSV column) present in res1.csv, then remove every
# id that already appears in res2.csv; what remains are the ids still
# pending processing.
with open('res1.csv', 'r', encoding='utf-8') as src:
    id_set = {fields[0] for fields in csv.reader(src)}
    print(len(id_set))

with open('res2.csv', 'r', encoding='utf-8') as src:
    for fields in csv.reader(src):
        id_set.discard(fields[0])
    print(len(id_set))

# Upper bound on how many rows load_dataset() will yield.
length_max = len(id_set)
31
+
def load_dataset():
    """Yield ``(formatted_prompt, row_id)`` pairs for rows still pending.

    Reads the merged dataset CSV, skips the header row, filters out rows
    whose text column (``row[1]``) is shorter than 2,000 or longer than
    100,000 characters, and — for every row whose id (``row[0]``) is in the
    module-level ``id_set`` — yields the scoring prompt with the code file
    rendered with line numbers, plus the row id.  Stops after roughly
    ``length_max`` rows have been produced.
    """
    # Use a context manager so the prompt file handle is closed promptly;
    # the original `open(...).read()` leaked the file object.
    with open('score_prompt.txt', 'r', encoding='utf-8') as pf:
        prompt = pf.read()
    # prompt = open('is_sci_prompt.txt', 'r', encoding='utf-8').read()
    with open('/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv', 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        amount = 0
        # Columns: ['', 'text', 'repo_name', 'path', 'language', 'license',
        #           'size', 'keyword', 'text_hash', 'config', 'split',
        #           'repo_path', 'ds_source']
        for i, row in enumerate(reader):
            if i == 0:
                continue  # skip CSV header row
            # NOTE(review): '>' allows one extra yield (up to length_max + 1
            # items).  Harmless if ids in the dataset are unique, since
            # membership in id_set already caps the count — confirm otherwise.
            if amount > length_max:
                return
            # Keep only texts between 2k and 100k characters.
            if len(row[1]) > 100000 or len(row[1]) < 2000:
                continue
            if row[0] in id_set:
                amount += 1
                yield prompt.format(CODE_FILE=add_line_numbers(row[1])), row[0]
            # yield prompt.format(CODE_FILE=row[1][:20000]), row[0]
+
51
+
def add_line_numbers(text: str) -> str:
    """Prefix each line of *text* with its 1-based line number and a space.

    Line endings are preserved (``splitlines(keepends=True)``), so apart
    from the added ``"<n> "`` prefixes the text round-trips exactly.  An
    empty string yields an empty string.

    The original version kept an unused local ``start = 0`` and a manual
    counter; ``enumerate`` expresses the same numbering directly.
    """
    return "".join(
        f"{num} {line}"
        for num, line in enumerate(text.splitlines(keepends=True), start=1)
    )
60
+
61
+
if __name__ == "__main__":
    # Ad-hoc smoke tests kept for reference; this module is meant to be
    # imported for load_dataset(), so running it directly is a no-op.
    # with open('/home/weifengsun/tangou1/domain_code/src/datasets/data_merged/dataset_all.csv', 'r', encoding='utf-8') as f:
    #     reader = csv.reader(f)
    #     # ['', 'text', 'repo_name', 'path', 'language', 'license', 'size', 'keyword', 'text_hash', 'config', 'split', 'repo_path', 'ds_source']
    #     for i, row in enumerate(reader):
    #         if i > 100:
    #             break
    #         print(row[0])
    # amount = 0
    # for i in load_dataset():
    #     amount += 1
    # print(amount)
    # print(len(id_set))
    pass