DouDou committed on
Commit
a7c0211
·
verified ·
1 Parent(s): 5c9df83

Upload data2/step22/emb_qwen_func.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/emb_qwen_func.py +308 -0
data2/step22/emb_qwen_func.py ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Requires vllm>=0.8.5
3
+ import torch
4
+ import vllm
5
+ from vllm import LLM
6
+ from transformers import AutoTokenizer
7
+ from pathlib import Path
8
+ import os
9
+ import jsonlines
10
+
11
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
12
+
13
+ # 必须在设置 CUDA_VISIBLE_DEVICES 之后
14
+ import multiprocessing as mp
15
+ mp.set_start_method("spawn", force=True)
16
+
17
+
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Wrap *query* in the Qwen embedding prompt format with its task instruction."""
    return 'Instruct: ' + task_description + '\nQuery:' + query


# Domain keywords (biomedicine / computational science); they are joined into
# one retrieval query whose embedding is compared against each function's text.
keywords = ["Quantum mechanics",
            "Gene editing",
            "Folding",
            "System biology",
            "Antibody",
            "Heterogeneity",
            "Ligand",
            "Drug repurpose",
            "Kinetics",
            "Next-generation sequencing",
            "Pharmacogenetics",
            "Phase-field technique",
            "Human",
            "Potential",
            "Hartree-Fock",
            "Flow matching",
            "Lipid",
            "Biomedical",
            "Antigen",
            "Stochastic modeling",
            "Coupled cluster",
            "Quantum biology",
            "Spatial biology",
            "Antagonist",
            "Free energy perturbation",
            "Cycle",
            "Pharmacology",
            "Redox",
            "Physiology",
            "Protein-Protein Interactions",
            "Single-cell",
            "Screening",
            "Hydrophobic",
            "First-principles based DFT",
            "Molecular biology",
            "Mechanism",
            "Reproduction number",
            "Spatial Transcriptomics",
            "Ion",
            "Computational Materials",
            "Absorption",
            "Pharmacometrics",
            "GAN",
            "Compartmental model",
            "Diagnostics",
            "Lead discovery",
            "QAPR",
            "Rosettafold",
            "Autoregressive",
            "Pharmacokinetics",
            "Biotechnology",
            "Hydrophilic",
            "3D",
            "Protein",
            "QM/MM",
            "Activation",
            "AMR",
            "Networks",
            "Genotype",
            "Gene regulatory networks",
            "Biologics",
            "Phenotype",
            "Nowcasting",
            "DFT",
            "AlphaFold",
            "Pandemic",
            "Immunology",
            "Pathology",
            "Chemical space",
            "Transformer",
            "Homeostasis",
            "Score",
            "High-throughput",
            "Cheminformatics",
            "Hit-to-lead",
            "Sequencing",
            "Enzyme",
            "Antimicrobial resistance modeling",
            "Allosteric",
            "Inhibition",
            "Computational Biochemistry",
            "Bioinformatics",
            "Transcriptomics",
            "Diffusion",
            "Anomaly detection",
            "Multi-omics",
            "Biology",
            "Pathway",
            "Metabolomics",
            "Synthetic biology",
            "Microbial",
            "Proteomics",
            "Pharmaceutics",
            "Organoid",
            "Network pharmacology",
            "Imaging",
            "Generative adversarial networks",
            "Microbiology",
            "Organ-on-a-chip",
            "De novo",
            "Substrate",
            "Personalized",
            "Drug",
            "Transcription",
            "RNA",
            "Explainable AI",
            "Generate",
            "Docking",
            "Pathogens",
            "Bio foundation model",
            "Reinforcement learning",
            "Mechanism of action",
            "Generative",
            "Metabolic",
            "Metabolic Flux Analysis",
            "Computational Chemistry",
            "Vaccine",
            "Biophysics",
            "Integration",
            "Biochemistry",
            "Physiologically based pharmacokinetics model",
            "Medicine",
            "Crystal",
            "Conjugate",
            "Variational autoencoders",
            "In Silico",
            "Protein-protein",
            "CRISPR",
            "Spatial transcriptomics",
            "Gene",
            "Translation",
            "Glycomics",
            "Lead optimization",
            "Pharmacodynamics",
            "Ab initio",
            "System immunology",
            "Pseudotime analysis",
            "Generative AI",
            "RNN",
            "Regulatory networks",
            "PBPK model",
            "Beta-blocker",
            "Lipidomics",
            "Reaction",
            "Bio",
            "Genesis",
            "Evolution",
            "Computational Biology",
            "VAE",
            "Pharmacogenomics",
            "Assay",
            "Sensors",
            "Conformation",
            "Finite element method",
            "Human atlas",
            "Translational medicine",
            "Neurology",
            "Genomics",
            "Cell biology",
            "Porous",
            "Biomarker",
            "Bioengineering",
            "Allele",
            "Recurrent neural networks",
            "Carbohydrate",
            "Metamaterial",
            "Virtual human",
            "DNA",
            "Omics",
            "Agonist",
            "Receptor",
            "Cofactor",
            "Metabolic flux analysis",
            "Cell atlas",
            "Signaling",
            "Electronic structure",
            "Monte Carlo",
            "Genomic surveillance",
            "Agent-based model",
            "Biosensors",
            "2D",
            "QSAR",
            "Codon",
            "Coenzyme",
            "Nucleic acids",
            "Dynamics",
            "Ensemble",
            "Spectrometry",
            "Multi-scale modeling",
            "ADMET",
            "Marker",
            "Toxicology",
            "Profiling",
            "Design",
            "Viral",
            "Chemistry",
            "Epigenetics",
            "Homo-Lumo",
            "Modeling",
            "Prediction",
            "Quantum Chemistry",
            "Half-life",
            "Material",
            "Disease",
            "Phylodynamic model",
            "Metagenomics",
            "Digital twin",
            "Cancer biology",
            "Discovery",
            "Bioavailability",
            "Digital PCR"
            ]

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

# A single retrieval query: the full keyword list joined into one string and
# wrapped in the Qwen instruct prompt format.
queries = [
    get_detailed_instruct(task, ' '.join(keywords))
]
241
+
242
+ model = LLM(model="Qwen/Qwen3-Embedding-0.6B",
243
+ task="embed",
244
+ tensor_parallel_size=1,
245
+ data_parallel_size=1)


def get_functions_contents(dataset_dir):
    """Score every function entry in each subdirectory's ``functions.jsonl``.

    For each subdirectory of *dataset_dir* (except the skipped libxc repo),
    read ``functions.jsonl``. When no entry carries a ``score`` yet, extract
    each function's source (by ``file``/``start_line``/``end_line``, capped at
    32000 chars), score all of them with ``get_scores`` and rewrite the jsonl
    file with a ``score`` field added to every object. Directories already
    scored, or without a ``functions.jsonl``, are left untouched.
    """
    subdirs = sorted(d for d in os.listdir(dataset_dir)
                     if os.path.isdir(os.path.join(dataset_dir, d)))
    for subdir in subdirs:
        # Deliberately skipped repository — presumably handled elsewhere
        # or too large; confirm with the dataset owner.
        if subdir == 'ElectronicStructureLibrary___libxc':
            continue
        print(subdir)
        json_path = os.path.join(dataset_dir, subdir, 'functions.jsonl')
        contents = []
        if os.path.exists(json_path):
            objs = []
            has_scored = False
            with jsonlines.open(json_path) as reader:
                for obj in reader:
                    # Any 'score' key means a previous run already finished
                    # this directory — stop reading and do not rewrite.
                    if 'score' in obj:
                        has_scored = True
                        break
                    file_path = obj['file']
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        # Slice the function body by 1-based line numbers,
                        # capped at 32000 characters.
                        func_content = ''.join(
                            f.readlines()[obj['start_line'] - 1:obj['end_line']]
                        )[:32000]
                    contents.append(func_content)
                    objs.append(obj)
            if not has_scored:
                scores = get_scores(contents)
                for obj, score in zip(objs, scores):
                    obj['score'] = score
                # Rewrite the jsonl in place with scores attached.
                with jsonlines.open(json_path, 'w', flush=True) as writer:
                    writer.write_all(objs)
        print("finish ", subdir)


# Tokenizer matching the embedding model; used only to truncate inputs
# to a safe token length before embedding.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-Embedding-0.6B",
    trust_remote_code=True
)

MAX_TOKENS = 30000  # leave some buffer below the model's context window

def truncate_to_max_tokens(text, max_tokens=MAX_TOKENS):
    """Clip *text* to at most *max_tokens* tokens and return the decoded string.

    Round-trips through the module-level tokenizer: encode with truncation,
    then decode without special tokens.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_tokens,
        return_tensors=None,
    )
    return tokenizer.decode(encoded["input_ids"], skip_special_tokens=True)
297
+
298
+ def get_scores(documents):
299
+ safe_queries = [truncate_to_max_tokens(q) for q in queries]
300
+ safe_docs = [truncate_to_max_tokens(d) for d in documents]
301
+
302
+ input_texts = safe_queries + safe_docs
303
+ outputs = model.embed(input_texts)
304
+ embeddings = torch.tensor([o.outputs.embedding for o in outputs])
305
+ scores = (embeddings[0] @ embeddings[1:].T)
306
+ return scores.tolist()

# Entry point: score every functions.jsonl under the dataset root.
get_functions_contents('/home/weifengsun/tangou1/step2/step22/dataset')