DouDou committed on
Commit
a7c0211
·
verified ·
1 Parent(s): 5c9df83

Upload data2/step22/emb_qwen_func.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/emb_qwen_func.py +308 -0
data2/step22/emb_qwen_func.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Requires vllm>=0.8.5
3
+ import torch
4
+ import vllm
5
+ from vllm import LLM
6
+ from transformers import AutoTokenizer
7
+ from pathlib import Path
8
+ import os
9
+ import jsonlines
10
+
11
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
12
+
13
+ # 必须在设置 CUDA_VISIBLE_DEVICES 之后
14
+ import multiprocessing as mp
15
+ mp.set_start_method("spawn", force=True)
16
+
17
+
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Wrap *query* in the Qwen embedding prompt format with its task instruction."""
    return 'Instruct: ' + task_description + '\nQuery:' + query


# Domain keywords (biomedicine / computational science); they are joined into
# one retrieval query whose embedding is compared against each function's text.
keywords = ["Quantum mechanics",
            "Gene editing",
            "Folding",
            "System biology",
            "Antibody",
            "Heterogeneity",
            "Ligand",
            "Drug repurpose",
            "Kinetics",
            "Next-generation sequencing",
            "Pharmacogenetics",
            "Phase-field technique",
            "Human",
            "Potential",
            "Hartree-Fock",
            "Flow matching",
            "Lipid",
            "Biomedical",
            "Antigen",
            "Stochastic modeling",
            "Coupled cluster",
            "Quantum biology",
            "Spatial biology",
            "Antagonist",
            "Free energy perturbation",
            "Cycle",
            "Pharmacology",
            "Redox",
            "Physiology",
            "Protein-Protein Interactions",
            "Single-cell",
            "Screening",
            "Hydrophobic",
            "First-principles based DFT",
            "Molecular biology",
            "Mechanism",
            "Reproduction number",
            "Spatial Transcriptomics",
            "Ion",
            "Computational Materials",
            "Absorption",
            "Pharmacometrics",
            "GAN",
            "Compartmental model",
            "Diagnostics",
            "Lead discovery",
            "QAPR",
            "Rosettafold",
            "Autoregressive",
            "Pharmacokinetics",
            "Biotechnology",
            "Hydrophilic",
            "3D",
            "Protein",
            "QM/MM",
            "Activation",
            "AMR",
            "Networks",
            "Genotype",
            "Gene regulatory networks",
            "Biologics",
            "Phenotype",
            "Nowcasting",
            "DFT",
            "AlphaFold",
            "Pandemic",
            "Immunology",
            "Pathology",
            "Chemical space",
            "Transformer",
            "Homeostasis",
            "Score",
            "High-throughput",
            "Cheminformatics",
            "Hit-to-lead",
            "Sequencing",
            "Enzyme",
            "Antimicrobial resistance modeling",
            "Allosteric",
            "Inhibition",
            "Computational Biochemistry",
            "Bioinformatics",
            "Transcriptomics",
            "Diffusion",
            "Anomaly detection",
            "Multi-omics",
            "Biology",
            "Pathway",
            "Metabolomics",
            "Synthetic biology",
            "Microbial",
            "Proteomics",
            "Pharmaceutics",
            "Organoid",
            "Network pharmacology",
            "Imaging",
            "Generative adversarial networks",
            "Microbiology",
            "Organ-on-a-chip",
            "De novo",
            "Substrate",
            "Personalized",
            "Drug",
            "Transcription",
            "RNA",
            "Explainable AI",
            "Generate",
            "Docking",
            "Pathogens",
            "Bio foundation model",
            "Reinforcement learning",
            "Mechanism of action",
            "Generative",
            "Metabolic",
            "Metabolic Flux Analysis",
            "Computational Chemistry",
            "Vaccine",
            "Biophysics",
            "Integration",
            "Biochemistry",
            "Physiologically based pharmacokinetics model",
            "Medicine",
            "Crystal",
            "Conjugate",
            "Variational autoencoders",
            "In Silico",
            "Protein-protein",
            "CRISPR",
            "Spatial transcriptomics",
            "Gene",
            "Translation",
            "Glycomics",
            "Lead optimization",
            "Pharmacodynamics",
            "Ab initio",
            "System immunology",
            "Pseudotime analysis",
            "Generative AI",
            "RNN",
            "Regulatory networks",
            "PBPK model",
            "Beta-blocker",
            "Lipidomics",
            "Reaction",
            "Bio",
            "Genesis",
            "Evolution",
            "Computational Biology",
            "VAE",
            "Pharmacogenomics",
            "Assay",
            "Sensors",
            "Conformation",
            "Finite element method",
            "Human atlas",
            "Translational medicine",
            "Neurology",
            "Genomics",
            "Cell biology",
            "Porous",
            "Biomarker",
            "Bioengineering",
            "Allele",
            "Recurrent neural networks",
            "Carbohydrate",
            "Metamaterial",
            "Virtual human",
            "DNA",
            "Omics",
            "Agonist",
            "Receptor",
            "Cofactor",
            "Metabolic flux analysis",
            "Cell atlas",
            "Signaling",
            "Electronic structure",
            "Monte Carlo",
            "Genomic surveillance",
            "Agent-based model",
            "Biosensors",
            "2D",
            "QSAR",
            "Codon",
            "Coenzyme",
            "Nucleic acids",
            "Dynamics",
            "Ensemble",
            "Spectrometry",
            "Multi-scale modeling",
            "ADMET",
            "Marker",
            "Toxicology",
            "Profiling",
            "Design",
            "Viral",
            "Chemistry",
            "Epigenetics",
            "Homo-Lumo",
            "Modeling",
            "Prediction",
            "Quantum Chemistry",
            "Half-life",
            "Material",
            "Disease",
            "Phylodynamic model",
            "Metagenomics",
            "Digital twin",
            "Cancer biology",
            "Discovery",
            "Bioavailability",
            "Digital PCR"
            ]

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

# A single retrieval query: the full keyword list joined into one string and
# wrapped in the Qwen instruct prompt format.
queries = [
    get_detailed_instruct(task, ' '.join(keywords))
]
241
+
242
+ model = LLM(model="Qwen/Qwen3-Embedding-0.6B",
243
+ task="embed",
244
+ tensor_parallel_size=1,
245
+ data_parallel_size=1)


def get_functions_contents(dataset_dir):
    """Score every function entry in each subdirectory's ``functions.jsonl``.

    For each subdirectory of *dataset_dir* (except the skipped libxc repo),
    read ``functions.jsonl``. When no entry carries a ``score`` yet, extract
    each function's source (by ``file``/``start_line``/``end_line``, capped at
    32000 chars), score all of them with ``get_scores`` and rewrite the jsonl
    file with a ``score`` field added to every object. Directories already
    scored, or without a ``functions.jsonl``, are left untouched.
    """
    subdirs = sorted(d for d in os.listdir(dataset_dir)
                     if os.path.isdir(os.path.join(dataset_dir, d)))
    for subdir in subdirs:
        # Deliberately skipped repository — presumably handled elsewhere
        # or too large; confirm with the dataset owner.
        if subdir == 'ElectronicStructureLibrary___libxc':
            continue
        print(subdir)
        json_path = os.path.join(dataset_dir, subdir, 'functions.jsonl')
        contents = []
        if os.path.exists(json_path):
            objs = []
            has_scored = False
            with jsonlines.open(json_path) as reader:
                for obj in reader:
                    # Any 'score' key means a previous run already finished
                    # this directory — stop reading and do not rewrite.
                    if 'score' in obj:
                        has_scored = True
                        break
                    file_path = obj['file']
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        # Slice the function body by 1-based line numbers,
                        # capped at 32000 characters.
                        func_content = ''.join(
                            f.readlines()[obj['start_line'] - 1:obj['end_line']]
                        )[:32000]
                    contents.append(func_content)
                    objs.append(obj)
            if not has_scored:
                scores = get_scores(contents)
                for obj, score in zip(objs, scores):
                    obj['score'] = score
                # Rewrite the jsonl in place with scores attached.
                with jsonlines.open(json_path, 'w', flush=True) as writer:
                    writer.write_all(objs)
        print("finish ", subdir)


# Tokenizer matching the embedding model; used only to truncate inputs
# to a safe token length before embedding.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-Embedding-0.6B",
    trust_remote_code=True
)

MAX_TOKENS = 30000  # leave some buffer below the model's context window

def truncate_to_max_tokens(text, max_tokens=MAX_TOKENS):
    """Clip *text* to at most *max_tokens* tokens and return the decoded string.

    Round-trips through the module-level tokenizer: encode with truncation,
    then decode without special tokens.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_tokens,
        return_tensors=None,
    )
    return tokenizer.decode(encoded["input_ids"], skip_special_tokens=True)
297
+
298
+ def get_scores(documents):
299
+ safe_queries = [truncate_to_max_tokens(q) for q in queries]
300
+ safe_docs = [truncate_to_max_tokens(d) for d in documents]
301
+
302
+ input_texts = safe_queries + safe_docs
303
+ outputs = model.embed(input_texts)
304
+ embeddings = torch.tensor([o.outputs.embedding for o in outputs])
305
+ scores = (embeddings[0] @ embeddings[1:].T)
306
+ return scores.tolist()

# Entry point: score every functions.jsonl under the dataset root.
get_functions_contents('/home/weifengsun/tangou1/step2/step22/dataset')