DouDou committed on
Commit
15d17be
·
verified ·
1 Parent(s): 6f3497d

Upload data2/step22/func_stat.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/func_stat.py +243 -0
data2/step22/func_stat.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+ import traceback
5
+ import os
6
+ import json
7
+ import sys
8
+ from concurrent.futures import ProcessPoolExecutor, as_completed
9
+ from tree_sitter import Language, Parser
10
+ import tree_sitter_c
11
+ import tree_sitter_cpp
12
+ import tree_sitter_java
13
+ import tree_sitter_go
14
+ import tree_sitter_rust
15
+ import tree_sitter_julia
16
+ import tree_sitter_python
17
+
18
+ ########################################
19
+ # 配置区
20
+ ########################################
21
+
22
# Path to a compiled grammar bundle; not referenced by the code in this section.
LANG_SO = "build/my-languages.so"

# Root directory under which each project's functions.jsonl is written.
OUTPUT_DIR = "/home/weifengsun/tangou1/step2/step22/dataset"

# Directory names that are pruned while walking a project tree.
EXCLUDE_DIRS = {
    ".git",
    "node_modules",
    "vendor",
    "third_party",
    "build",
    "dist",
    "target",
    "__pycache__",
}

# Per-language settings:
#   ext            - file extensions mapped to the language
#   function_nodes - syntax-node types recorded as functions
#   class_nodes    - (optional) node types that open a naming scope
#   name_field     - field of a function node that holds its name
#   receiver_field - (Go only) field that holds the method receiver
LANGUAGE_CONFIG = {
    "python": {
        "ext": [".py"],
        "function_nodes": ["function_definition"],
        "class_nodes": ["class_definition"],
        "name_field": "name",
    },
    "c": {
        "ext": [".c", ".h"],
        "function_nodes": ["function_definition"],
        "name_field": "declarator",
    },
    "cpp": {
        "ext": [".cpp", ".cc", ".cxx", ".hpp", ".hh"],
        "function_nodes": ["function_definition"],
        "name_field": "declarator",
    },
    "java": {
        "ext": [".java"],
        "function_nodes": ["method_declaration", "constructor_declaration"],
        "class_nodes": ["class_declaration"],
        "name_field": "name",
    },
    "go": {
        "ext": [".go"],
        "function_nodes": ["function_declaration", "method_declaration"],
        "name_field": "name",
        "receiver_field": "receiver",
    },
    "rust": {
        "ext": [".rs"],
        "function_nodes": ["function_item"],
        "class_nodes": ["impl_item", "trait_item"],
        "name_field": "name",
    },
    "julia": {
        "ext": [".jl"],
        "function_nodes": ["function_definition"],
        "name_field": "name",
    },
}

# Reverse lookup: file extension -> language key of LANGUAGE_CONFIG.
EXT_TO_LANG = {
    extension: lang_name
    for lang_name, lang_cfg in LANGUAGE_CONFIG.items()
    for extension in lang_cfg["ext"]
}
76
+
77
+ ########################################
78
+ # worker 初始化
79
+ ########################################
80
+
81
# Language key -> tree_sitter Language, each built from the matching
# grammar package's language() entry point.
LANGUAGES = {
    lang_key: Language(grammar_module.language())
    for lang_key, grammar_module in (
        ("python", tree_sitter_python),
        ("go", tree_sitter_go),
        ("rust", tree_sitter_rust),
        ("julia", tree_sitter_julia),
        ("c", tree_sitter_c),
        ("cpp", tree_sitter_cpp),
        ("java", tree_sitter_java),
    )
}
90
+
91
def init_worker():
    """Build the per-process parser table.

    Rebinds the module-level ``PARSERS`` to a fresh dict and fills it with one
    ``Parser`` per key of ``LANGUAGE_CONFIG``. A language whose parser cannot
    be created is reported on stdout (message plus traceback) and left out of
    the table; the remaining languages are still loaded.
    """
    global PARSERS
    PARSERS = {}

    for lang_name in LANGUAGE_CONFIG:
        try:
            PARSERS[lang_name] = Parser(LANGUAGES[lang_name])
        except Exception:
            print(f"Failed to load parser for {lang_name}")
            print(traceback.format_exc())
104
+
105
+
106
+ ########################################
107
+ # 函数提取逻辑
108
+ ########################################
109
+
110
def extract_functions(tree, file_path, language):
    """Collect one record per function definition found in a syntax tree.

    The tree is traversed in pre-order with an explicit stack (no recursion),
    so very deep trees cannot raise ``RecursionError``.

    Args:
        tree: Parsed tree exposing ``root_node``. Nodes are read through
            ``type``, ``children``, ``child``, ``child_by_field_name``,
            ``text``, ``start_point`` and ``end_point``.
        file_path: Stored verbatim in each record's ``"file"`` entry.
        language: Key of ``LANGUAGE_CONFIG`` selecting the node types to match.

    Returns:
        A list of dicts with keys ``language``, ``name``, ``qualified_name``,
        ``file``, ``start_line`` and ``end_line`` (line numbers are 1-based),
        in the order the definitions are encountered.

    Raises:
        KeyError: If ``language`` is not a key of ``LANGUAGE_CONFIG``.
    """
    # NOTE(review): for "c"/"cpp" the configured name field is "declarator",
    # so ``name`` is the full text of that field node - presumably more than
    # the bare identifier; confirm against the grammars before relying on it.
    cfg = LANGUAGE_CONFIG[language]
    function_nodes = cfg["function_nodes"]
    class_nodes = cfg.get("class_nodes", [])
    receiver_field = cfg.get("receiver_field", "receiver")

    def text_of(node):
        # Source files are read as raw bytes and may not be valid UTF-8; a
        # strict decode would discard every record of the file.
        return node.text.decode("utf-8", errors="replace")

    def record(node, name, qualified_name):
        return {
            "language": language,
            "name": name,
            "qualified_name": qualified_name,
            "file": file_path,
            "start_line": node.start_point[0] + 1,
            "end_line": node.end_point[0] + 1,
        }

    results = []
    scope = []
    # Stack items are either a node still to visit, or an int telling how many
    # scope entries to drop once the subtree pushed above it has been visited.
    stack = [tree.root_node]

    while stack:
        item = stack.pop()
        if isinstance(item, int):
            del scope[-item:]
            continue

        node = item
        is_function = node.type in function_nodes
        pushed = 0

        # Class / impl scope.
        if node.type in class_nodes:
            name_node = (node.child_by_field_name("name")
                         or node.child_by_field_name("type"))
            if name_node:
                scope.append(text_of(name_node))
                pushed += 1

        # Go receiver: qualifies this method only, so it is counted in
        # ``pushed`` and removed again after the subtree.
        if language == "go" and is_function:
            recv = node.child_by_field_name(receiver_field)
            if recv:
                scope.append(text_of(recv))
                pushed += 1

        # Function definition.
        if is_function:
            name_node = node.child_by_field_name(cfg["name_field"])
            if name_node:
                name = text_of(name_node)
                results.append(record(node, name, ".".join(scope + [name])))

        # Julia short-form function: foo(x) = ...
        if language == "julia" and node.type == "assignment":
            left = node.child(0)
            if left and left.type == "call_expression":
                fn = left.child_by_field_name("function")
                if fn:
                    name = text_of(fn)
                    results.append(record(node, name, name))

        # Only pop what this node actually pushed; the marker sits below the
        # children so it is reached after the whole subtree.
        if pushed:
            stack.append(pushed)
        # Reversed so that children are popped in their original order.
        stack.extend(reversed(node.children))

    return results
167
+
168
+
169
+ ########################################
170
+ # 项目处理
171
+ ########################################
172
+
173
def process_project(project_path):
    """Extract every function of one project into a JSON-lines file.

    Walks ``project_path`` (skipping directory names in ``EXCLUDE_DIRS``),
    parses each file whose extension is in ``EXT_TO_LANG`` and writes one JSON
    object per extracted function to
    ``<OUTPUT_DIR>/<project name>/functions.jsonl``.

    Must run in a process where ``init_worker`` has populated ``PARSERS``.

    Args:
        project_path: Directory of the project; its last path component
            (trailing ``/`` ignored) names the output sub-directory.

    Raises:
        OSError: If the output directory or file cannot be created or written.
    """
    project_name = os.path.basename(project_path.rstrip("/"))
    output_path = os.path.join(OUTPUT_DIR, project_name, "functions.jsonl")
    # The per-project directory is not created anywhere else; without this
    # the open() below fails for every project.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as out:
        for root, dirs, files in os.walk(project_path):
            # In-place assignment so os.walk does not descend into them.
            dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]

            for filename in files:
                lang = EXT_TO_LANG.get(os.path.splitext(filename)[1])
                if not lang:
                    continue

                # init_worker leaves out languages whose parser failed to load.
                parser = PARSERS.get(lang)
                if parser is None:
                    continue

                path = os.path.join(root, filename)
                try:
                    with open(path, "rb") as src:
                        code = src.read()
                    funcs = extract_functions(parser.parse(code), path, lang)
                except Exception as exc:
                    # Best effort per file: report it and keep going.
                    print(f"[WARN] skipped {path}: {exc!r}", file=sys.stderr)
                    continue

                # Written outside the try so that output errors fail the
                # project instead of silently truncating its results.
                out.writelines(
                    json.dumps(fn, ensure_ascii=False) + "\n" for fn in funcs
                )
198
+
199
+
200
+ ########################################
201
+ # 主入口
202
+ ########################################
203
+
204
def load_projects(root):
    """Return the paths of the immediate sub-directories of ``root``.

    Args:
        root: Directory whose entries are inspected.

    Returns:
        A list of ``os.path.join(root, name)`` for every entry that is a
        directory, in ``os.listdir`` order. Non-directory entries are skipped.
    """
    project_dirs = []
    for entry in os.listdir(root):
        candidate = os.path.join(root, entry)
        if os.path.isdir(candidate):
            project_dirs.append(candidate)
    return project_dirs
210
+
211
+
212
def main():
    """Run function extraction over every project under the projects root.

    The projects root is ``sys.argv[1]`` when given, otherwise the built-in
    default path. Each project directory is handed to ``process_project`` in
    a process pool whose workers are set up by ``init_worker``; one
    ``[OK]`` / ``[FAIL]`` line per project is printed as results arrive.
    """
    default_root = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"
    projects_root = sys.argv[1] if len(sys.argv) > 1 else default_root
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    projects = load_projects(projects_root)

    # os.cpu_count() may return None; fall back to a single worker.
    max_workers = min(os.cpu_count() or 1, 32)

    with ProcessPoolExecutor(
        max_workers=max_workers,
        initializer=init_worker,
    ) as pool:
        futures = {
            pool.submit(process_project, project): project
            for project in projects
        }

        for future in as_completed(futures):
            proj = futures[future]
            try:
                future.result()
            except Exception as e:
                print(f"[FAIL] {proj}: {e}")
            else:
                print(f"[OK] {proj}")


if __name__ == "__main__":
    main()