| |
|
| | |
| | import torch |
| | import vllm |
| | from vllm import LLM |
| | from transformers import AutoTokenizer |
| | from pathlib import Path |
| | import os |
| | import jsonlines |
| |
|
# Expose only the GPU with index 1 to this process.
# NOTE(review): this runs after `torch` and `vllm` were imported; it only
# takes effect if CUDA has not been initialised by then -- confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

import multiprocessing as mp

# Select the "spawn" start method, overriding any method chosen earlier.
mp.set_start_method(method="spawn", force=True)
| |
|
| |
|
def get_detailed_instruct(task_description: str, query: str) -> str:
    """Combine a task description and a query into one instructed prompt.

    The result is ``Instruct: <task_description>``, a newline, and then
    ``Query:<query>`` (there is no space after ``Query:``).
    """
    prompt_template = 'Instruct: {}\nQuery:{}'
    return prompt_template.format(task_description, query)
| |
|
| |
|
# Domain keywords; they are joined with spaces into the single retrieval
# query built further down in this file.
# NOTE(review): "Spatial Transcriptomics"/"Spatial transcriptomics" and
# "Metabolic Flux Analysis"/"Metabolic flux analysis" each appear twice with
# different capitalisation. They are kept as-is because removing either
# variant would change the query text that gets embedded.
keywords = [
    "Quantum mechanics",
    "Gene editing",
    "Folding",
    "System biology",
    "Antibody",
    "Heterogeneity",
    "Ligand",
    "Drug repurpose",
    "Kinetics",
    "Next-generation sequencing",
    "Pharmacogenetics",
    "Phase-field technique",
    "Human",
    "Potential",
    "Hartree-Fock",
    "Flow matching",
    "Lipid",
    "Biomedical",
    "Antigen",
    "Stochastic modeling",
    "Coupled cluster",
    "Quantum biology",
    "Spatial biology",
    "Antagonist",
    "Free energy perturbation",
    "Cycle",
    "Pharmacology",
    "Redox",
    "Physiology",
    "Protein-Protein Interactions",
    "Single-cell",
    "Screening",
    "Hydrophobic",
    "First-principles based DFT",
    "Molecular biology",
    "Mechanism",
    "Reproduction number",
    "Spatial Transcriptomics",
    "Ion",
    "Computational Materials",
    "Absorption",
    "Pharmacometrics",
    "GAN",
    "Compartmental model",
    "Diagnostics",
    "Lead discovery",
    "QAPR",
    "Rosettafold",
    "Autoregressive",
    "Pharmacokinetics",
    "Biotechnology",
    "Hydrophilic",
    "3D",
    "Protein",
    "QM/MM",
    "Activation",
    "AMR",
    "Networks",
    "Genotype",
    "Gene regulatory networks",
    "Biologics",
    "Phenotype",
    "Nowcasting",
    "DFT",
    "AlphaFold",
    "Pandemic",
    "Immunology",
    "Pathology",
    "Chemical space",
    "Transformer",
    "Homeostasis",
    "Score",
    "High-throughput",
    "Cheminformatics",
    "Hit-to-lead",
    "Sequencing",
    "Enzyme",
    "Antimicrobial resistance modeling",
    "Allosteric",
    "Inhibition",
    "Computational Biochemistry",
    "Bioinformatics",
    "Transcriptomics",
    "Diffusion",
    "Anomaly detection",
    "Multi-omics",
    "Biology",
    "Pathway",
    "Metabolomics",
    "Synthetic biology",
    "Microbial",
    "Proteomics",
    "Pharmaceutics",
    "Organoid",
    "Network pharmacology",
    "Imaging",
    "Generative adversarial networks",
    "Microbiology",
    "Organ-on-a-chip",
    "De novo",
    "Substrate",
    "Personalized",
    "Drug",
    "Transcription",
    "RNA",
    "Explainable AI",
    "Generate",
    "Docking",
    "Pathogens",
    "Bio foundation model",
    "Reinforcement learning",
    "Mechanism of action",
    "Generative",
    "Metabolic",
    "Metabolic Flux Analysis",
    "Computational Chemistry",
    "Vaccine",
    "Biophysics",
    "Integration",
    "Biochemistry",
    "Physiologically based pharmacokinetics model",
    "Medicine",
    "Crystal",
    "Conjugate",
    "Variational autoencoders",
    "In Silico",
    "Protein-protein",
    "CRISPR",
    "Spatial transcriptomics",
    "Gene",
    "Translation",
    "Glycomics",
    "Lead optimization",
    "Pharmacodynamics",
    "Ab initio",
    "System immunology",
    "Pseudotime analysis",
    "Generative AI",
    "RNN",
    "Regulatory networks",
    "PBPK model",
    "Beta-blocker",
    "Lipidomics",
    "Reaction",
    "Bio",
    "Genesis",
    "Evolution",
    "Computational Biology",
    "VAE",
    "Pharmacogenomics",
    "Assay",
    "Sensors",
    "Conformation",
    "Finite element method",
    "Human atlas",
    "Translational medicine",
    "Neurology",
    "Genomics",
    "Cell biology",
    "Porous",
    "Biomarker",
    "Bioengineering",
    "Allele",
    "Recurrent neural networks",
    "Carbohydrate",
    "Metamaterial",
    "Virtual human",
    "DNA",
    "Omics",
    "Agonist",
    "Receptor",
    "Cofactor",
    "Metabolic flux analysis",
    "Cell atlas",
    "Signaling",
    "Electronic structure",
    "Monte Carlo",
    "Genomic surveillance",
    "Agent-based model",
    "Biosensors",
    "2D",
    "QSAR",
    "Codon",
    "Coenzyme",
    "Nucleic acids",
    "Dynamics",
    "Ensemble",
    "Spectrometry",
    "Multi-scale modeling",
    "ADMET",
    "Marker",
    "Toxicology",
    "Profiling",
    "Design",
    "Viral",
    "Chemistry",
    "Epigenetics",
    "Homo-Lumo",
    "Modeling",
    "Prediction",
    "Quantum Chemistry",
    "Half-life",
    "Material",
    "Disease",
    "Phylodynamic model",
    "Metagenomics",
    "Digital twin",
    "Cancer biology",
    "Discovery",
    "Bioavailability",
    "Digital PCR",
]
| |
|
| | |
# Instruction text that is placed in front of the query.
task = 'Given a web search query, retrieve relevant passages that answer the query'

# Exactly one query: every keyword joined by a space, wrapped with the
# instruction by get_detailed_instruct.
queries = [get_detailed_instruct(task, ' '.join(keywords))]

# Embedding model, constructed once at module load.
# NOTE(review): this runs at import time with the "spawn" start method
# forced; if the engine starts worker processes, the main module gets
# re-imported in them -- confirm this is safe for the vLLM version in use.
model = LLM(
    model="Qwen/Qwen3-Embedding-0.6B",
    task="embed",
    tensor_parallel_size=1,
    data_parallel_size=1,
)
| |
|
| |
|
def _score_functions_file(json_path):
    """Add a ``score`` key to every record of one ``functions.jsonl`` file.

    The file is left untouched when any record already carries a ``score``
    key, or when it contains no records at all.
    """
    objs = []
    contents = []
    with jsonlines.open(json_path) as reader:
        for obj in reader:
            if 'score' in obj:
                # Scored on an earlier run: nothing to do for this file.
                return
            with open(obj['file'], 'r', encoding='utf-8', errors='ignore') as f:
                source_lines = f.readlines()
            # start_line is used as a 1-based first line and end_line as the
            # inclusive last line; the snippet is capped at 32000 characters.
            snippet = ''.join(source_lines[obj['start_line'] - 1:obj['end_line']])
            contents.append(snippet[:32000])
            objs.append(obj)

    if not objs:
        # No records: skip the model call and the rewrite.
        return

    scores = get_scores(contents)
    for obj, score in zip(objs, scores):
        obj['score'] = score

    # Write to a sibling temp file and swap it in, so an interruption during
    # the write cannot leave a truncated functions.jsonl behind.
    tmp_path = json_path + '.tmp'
    with jsonlines.open(tmp_path, 'w', flush=True) as writer:
        writer.write_all(objs)
    os.replace(tmp_path, json_path)


def get_functions_contents(dir):
    """Score the functions listed in each sub-directory of ``dir``.

    For every immediate sub-directory (in sorted order, skipping the
    hard-coded ``ElectronicStructureLibrary___libxc``) that contains a
    ``functions.jsonl`` file, each record's ``file``, ``start_line`` and
    ``end_line`` are used to read the function source, the sources are
    scored with ``get_scores``, and the records are written back with an
    added ``score`` key. Files in which a record already has a ``score``
    are left unchanged; sub-directories without the file are skipped.

    Args:
        dir: Path of the dataset root directory. The name shadows the
            builtin but is kept so existing keyword callers still work.

    Returns:
        None. Results go to the ``functions.jsonl`` files; progress is
        printed to stdout.

    Raises:
        OSError: If ``dir`` or a referenced source file cannot be read.
        KeyError: If a record lacks ``file``, ``start_line`` or ``end_line``.
    """
    subdirs = sorted(
        name for name in os.listdir(dir)
        if os.path.isdir(os.path.join(dir, name))
    )
    for subdir in subdirs:
        if subdir == 'ElectronicStructureLibrary___libxc':
            continue
        print(subdir)
        json_path = os.path.join(dir, subdir, 'functions.jsonl')
        if os.path.exists(json_path):
            _score_functions_file(json_path)
        print("finish ", subdir)
| | |
| |
|
# Tokenizer loaded from the "Qwen/Qwen3-Embedding-0.6B" checkpoint; used to
# cap text length in tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen3-Embedding-0.6B",
    trust_remote_code=True,
)

# Default token budget used by truncate_to_max_tokens.
MAX_TOKENS = 30000
| | |
def truncate_to_max_tokens(text, max_tokens=MAX_TOKENS):
    """Return ``text`` after an encode/truncate/decode round trip.

    The text is encoded with the module-level ``tokenizer`` with truncation
    enabled and ``max_length=max_tokens``; the resulting ids are decoded
    back into a string with special tokens skipped.
    """
    encoding = tokenizer(
        text,
        max_length=max_tokens,
        truncation=True,
        return_tensors=None,
    )
    token_ids = encoding["input_ids"]
    return tokenizer.decode(token_ids, skip_special_tokens=True)
| |
|
def get_scores(documents):
    """Score each document against the first entry of ``queries``.

    Queries and documents are shortened with ``truncate_to_max_tokens``,
    embedded together in one ``model.embed`` call, and every document
    embedding is dot-multiplied with the first query embedding.
    NOTE(review): the dot product equals cosine similarity only if the
    model returns normalised embeddings -- confirm.

    Args:
        documents: Sequence of document strings.

    Returns:
        A list of floats, one per document, in input order. An empty
        ``documents`` yields ``[]`` without calling the model.
    """
    if len(documents) == 0:
        return []

    safe_queries = [truncate_to_max_tokens(q) for q in queries]
    safe_docs = [truncate_to_max_tokens(d) for d in documents]

    outputs = model.embed(safe_queries + safe_docs)
    embeddings = torch.tensor([o.outputs.embedding for o in outputs])

    # Documents start after *all* query embeddings. Slicing from a fixed
    # index 1 would score extra queries as if they were documents and shift
    # every result if ``queries`` ever holds more than one entry.
    num_queries = len(safe_queries)
    query_embedding = embeddings[0]
    doc_embeddings = embeddings[num_queries:]
    return (query_embedding @ doc_embeddings.T).tolist()
| |
|
# Run the scoring pass only when this file is executed as a script. Without
# the guard the whole dataset scan would also start on a plain import, and
# in any process started with the "spawn" method that re-imports the main
# module.
if __name__ == "__main__":
    get_functions_contents('/home/weifengsun/tangou1/step2/step22/dataset')