Irwiny123 commited on
Commit
94391f2
·
1 Parent(s): a7b498a

提交LigUnity初始代码

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -35
  2. .gitignore +165 -0
  3. HGNN/Attention.py +36 -0
  4. HGNN/PL_Aggregator.py +75 -0
  5. HGNN/PL_Encoder.py +51 -0
  6. HGNN/PP_Aggregator.py +43 -0
  7. HGNN/PP_Encoder.py +51 -0
  8. HGNN/align.py +198 -0
  9. HGNN/data/CoreSet.dat +286 -0
  10. HGNN/data/PDBbind_v2020/index/INDEX_general_PL_data.2020 +0 -0
  11. HGNN/data/PDBbind_v2020/index/INDEX_general_PL_name.2020 +0 -0
  12. HGNN/data/PDBbind_v2020/index/INDEX_refined_data.2020 +0 -0
  13. HGNN/data/PDBbind_v2020/index/INDEX_refined_name.2020 +0 -0
  14. HGNN/main.py +318 -0
  15. HGNN/read_fasta.py +112 -0
  16. HGNN/screen_dataset.py +420 -0
  17. HGNN/screening.py +165 -0
  18. HGNN/test_pocket.fasta +2 -0
  19. HGNN/util.py +96 -0
  20. License +159 -0
  21. README.md +206 -3
  22. active_learning_scripts/run_al.sh +22 -0
  23. active_learning_scripts/run_cycle_ensemble.py +334 -0
  24. active_learning_scripts/run_cycle_one_model.py +246 -0
  25. active_learning_scripts/run_model.sh +53 -0
  26. ensemble_result.py +173 -0
  27. py_scripts/__init__.py +0 -0
  28. py_scripts/write_case_study.py +227 -0
  29. test.sh +18 -0
  30. test_fewshot.sh +38 -0
  31. test_fewshot_demo.sh +43 -0
  32. test_zeroshot_demo.sh +20 -0
  33. train.sh +145 -0
  34. unimol/__init__.py +6 -0
  35. unimol/data/__init__.py +50 -0
  36. unimol/data/add_2d_conformer_dataset.py +46 -0
  37. unimol/data/affinity_dataset.py +527 -0
  38. unimol/data/atom_type_dataset.py +34 -0
  39. unimol/data/conformer_sample_dataset.py +315 -0
  40. unimol/data/coord_pad_dataset.py +82 -0
  41. unimol/data/cropping_dataset.py +269 -0
  42. unimol/data/data_utils.py +23 -0
  43. unimol/data/dictionary.py +157 -0
  44. unimol/data/distance_dataset.py +64 -0
  45. unimol/data/from_str_dataset.py +19 -0
  46. unimol/data/key_dataset.py +29 -0
  47. unimol/data/lmdb_dataset.py +49 -0
  48. unimol/data/mask_points_dataset.py +267 -0
  49. unimol/data/normalize_dataset.py +68 -0
  50. unimol/data/pair_dataset.py +144 -0
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ tmp/
163
+ **/*.ipynb
164
+ *.ipynb
165
+ results/
HGNN/Attention.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import torch.nn as nn
from torch.nn import init
import numpy as np
import random
import torch.nn.functional as F


class Attention(nn.Module):
    """MLP attention scoring a set of neighbor embeddings against one center embedding.

    Given ``num_neighs`` neighbor vectors and a single center vector, produces a
    ``(num_neighs, 1)`` column of weights that is softmax-normalized over the
    neighbor axis (the weights sum to 1).
    """

    def __init__(self, embedding_dims):
        super(Attention, self).__init__()
        self.embed_dim = embedding_dims
        # NOTE(review): ``bilinear`` and ``softmax`` are created but never used in
        # forward(); they are kept so parameter names in checkpoints stay identical.
        self.bilinear = nn.Bilinear(self.embed_dim, self.embed_dim, 1)
        self.att1 = nn.Linear(self.embed_dim * 2, self.embed_dim)
        self.att2 = nn.Linear(self.embed_dim, self.embed_dim)
        self.att3 = nn.Linear(self.embed_dim, 1)
        self.softmax = nn.Softmax(0)

    def forward(self, node1, u_rep, num_neighs):
        """Return a (num_neighs, 1) attention weight column for ``node1`` rows."""
        # Tile the center vector so it can be paired with every neighbor row.
        tiled_center = u_rep.repeat(num_neighs, 1)
        scores = torch.cat((node1, tiled_center), 1)
        scores = F.dropout(F.relu(self.att1(scores)), training=self.training)
        scores = F.dropout(F.relu(self.att2(scores)), training=self.training)
        scores = self.att3(scores)
        # Normalize across the neighbor dimension.
        return F.softmax(scores, dim=0)
HGNN/PL_Aggregator.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.autograd import Variable
4
+ import torch.nn.functional as F
5
+ import numpy as np
6
+ import random
7
+ from Attention import Attention
8
+
9
class PLAggregator(nn.Module):
    """
    Pocket-ligand aggregator: attention-pools a pocket's interaction history
    (ligand, activity-label) pairs into a single embedding and averages it with
    the pocket's own embedding.
    """

    def __init__(self, v2e=None, r2e=None, u2e=None, embed_dim=128, cuda="cpu", uv=True):
        super(PLAggregator, self).__init__()
        self.uv = uv
        self.v2e = v2e  # ligand embedding table (presumably nn.Embedding — weight is indexed below)
        self.r2e = r2e  # relation/label embedding table
        self.u2e = u2e  # pocket embedding table
        self.device = cuda
        self.embed_dim = embed_dim
        self.w_r1 = nn.Linear(self.embed_dim * 2, self.embed_dim)
        self.w_r2 = nn.Linear(self.embed_dim, self.embed_dim)
        self.att = Attention(self.embed_dim)
        # BUG FIX: the original code did ``self.v2e.requires_grad = False`` which
        # only attaches a plain bool attribute to the nn.Module and freezes
        # nothing.  Module.requires_grad_(False) actually disables gradients for
        # the (pretrained) embedding parameters, as the code clearly intended.
        if self.v2e is not None:
            self.v2e.requires_grad_(False)
        if self.u2e is not None:
            self.u2e.requires_grad_(False)

    def forward(self, nodes_u, input_hist):
        """Aggregate each pocket's history into one embedding.

        Args:
            nodes_u: indices into the pocket embedding table, one per pocket.
            input_hist: per pocket, a list of (ligand_index, label_index) pairs.

        Returns:
            Tensor of shape (len(input_hist), embed_dim); pockets with an empty
            history fall back to their own embedding.
        """
        embed_matrix = torch.zeros(len(input_hist), self.embed_dim, dtype=torch.float).to(self.device)

        for i in range(len(input_hist)):
            history = [pair[0] for pair in input_hist[i]]
            label = [pair[1] for pair in input_hist[i]]
            num_history = len(history)  # typo fix: was ``num_histroy_item``

            if num_history > 0:
                e_uv = self.v2e.weight[history]
                uv_rep = self.u2e.weight[nodes_u[i]]

                # Fuse each ligand embedding with its label embedding.
                e_r = self.r2e.weight[label]
                x = torch.cat((e_uv, e_r), 1)
                x = F.relu(self.w_r1(x))
                o_history = F.relu(self.w_r2(x))

                # Attention-weighted pooling over the history items.
                att_w = self.att(o_history, uv_rep, num_history)
                att_history = torch.mm(o_history.t(), att_w).t()

                embed_matrix[i] = (att_history + uv_rep) / 2
            else:
                embed_matrix[i] = self.u2e.weight[nodes_u[i]]

        return embed_matrix

    def forward_inference(self, pocket_embed, neighbor_list):
        """Same pooling at inference time, with neighbor embeddings given directly.

        ``neighbor_list`` items are tuples whose element [1] is a ligand
        embedding tensor and [2] a relation index tensor (matches the tuples
        produced by align.get_neighbor_pocket — TODO confirm).
        """
        neighbor_embed = torch.stack([x[1] for x in neighbor_list])
        rel_embed = self.r2e.weight[torch.stack([x[2] for x in neighbor_list])]
        x = torch.cat((neighbor_embed, rel_embed), 1)
        x = F.relu(self.w_r1(x))
        o_neighbor = F.relu(self.w_r2(x))

        att_w = self.att(o_neighbor, pocket_embed, len(neighbor_list))
        att_res = torch.mm(o_neighbor.t(), att_w).t()
        return (att_res + pocket_embed) / 2
HGNN/PL_Encoder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
import random

class PLEncoder(nn.Module):
    """Encodes a pocket via the ligands of its aligned training neighbors.

    For each query pocket it looks up neighbor assays in ``pocket_graph``,
    turns each neighbor's top ligand into a (molecule_index, bucketed_score)
    pair, and hands the resulting neighbor lists to ``aggregator``.
    """

    def __init__(self, embed_dim, pocket_graph=None, aggregator=None, idx2assayid={}, assayid_lst_train=[], mol_smi={}, train_label_lst=[], cuda="cpu", uv=True):
        super(PLEncoder, self).__init__()

        self.uv = uv
        self.pocket_graph = pocket_graph
        self.aggregator = aggregator
        self.embed_dim = embed_dim
        self.device = cuda
        self.idx2assayid = idx2assayid
        self.assayid_lst_train = assayid_lst_train
        self.mol_smi = mol_smi
        self.train_label_lst = train_label_lst
        # Reverse lookup: SMILES string -> molecule index.
        self.smi2idx = {smi: idx for idx, smi in enumerate(mol_smi)}
        self.assayid_set_train = set(assayid_lst_train)
        # assay_id -> full label record (ligands presumably pre-sorted by activity
        # upstream, so [0] is the most active one — TODO confirm).
        self.label_dicts = {record["assay_id"]: record for record in train_label_lst}
        self.linear1 = nn.Linear(2 * self.embed_dim, self.embed_dim)

    def forward(self, nodes_pocket, nodes_lig=None, max_sample=10):
        """Build neighbor lists for ``nodes_pocket`` and aggregate them."""
        if nodes_lig is None:
            # Placeholder SMILES that cannot collide with a real ligand.
            lig_smi_lst = ["----"] * len(nodes_pocket)
        else:
            lig_smi_lst = [self.mol_smi[lig_id] for lig_id in nodes_lig]

        to_neighs = []
        for node, smi in zip(nodes_pocket, lig_smi_lst):
            assayid = self.idx2assayid[node]
            neighbors = []
            for n_assayid, score in self.pocket_graph.get(assayid, []):
                nbr_smi = self.label_dicts[n_assayid]["ligands"][0]["smi"]
                # Skip self-matches, the query ligand itself, and assays that
                # are not part of the training split.
                if (assayid == n_assayid
                        or smi == nbr_smi
                        or n_assayid not in self.assayid_set_train):
                    continue
                # Bucket the alignment score (>= 0.5) into a small integer.
                neighbors.append((self.smi2idx[nbr_smi], int((score - 0.5) * 10)))
            to_neighs.append(neighbors)

        return self.aggregator.forward(nodes_pocket, to_neighs)

    def refine_pocket(self, pocket_embed, neighbor_list=None):
        """Inference-time variant: delegate straight to the aggregator."""
        return self.aggregator.forward_inference(pocket_embed, neighbor_list)
HGNN/PP_Aggregator.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.autograd import Variable
4
+ import numpy as np
5
+ import random
6
+ from Attention import Attention
7
+
8
+
9
+ class PPAggregator(nn.Module):
10
+ """
11
+ Social Aggregator: for aggregating embeddings of social neighbors.
12
+ """
13
+
14
+ def __init__(self, u2e=None, embed_dim=128, cuda="cpu"):
15
+ super(PPAggregator, self).__init__()
16
+ self.device = cuda
17
+ self.u2e = u2e
18
+ self.embed_dim = embed_dim
19
+ self.att = Attention(self.embed_dim)
20
+
21
+ def forward(self, nodes, to_neighs):
22
+ embed_matrix = torch.zeros(len(nodes), self.embed_dim, dtype=torch.float).to(self.device)
23
+ self_feats = self.u2e.weight[nodes]
24
+ for i in range(len(nodes)):
25
+ tmp_adj = to_neighs[i]
26
+
27
+ num_neighs = len(tmp_adj)
28
+
29
+ if num_neighs > 0:
30
+ e_u = self.u2e.weight[[x[0] for x in tmp_adj]] # fast: user embedding
31
+ u_rep = self.u2e.weight[nodes[i]]
32
+ att_w = self.att(e_u, u_rep, num_neighs)
33
+ att_history = torch.mm(e_u.t(), att_w).t()
34
+ embed_matrix[i] = (att_history + self_feats[i]) / 2
35
+ else:
36
+ embed_matrix[i] = self_feats[i]
37
+ return embed_matrix
38
+
39
+ def forward_inference(self, pocket_embed, neighbor_list):
40
+ neighbor_embed = torch.stack([x[0] for x in neighbor_list])
41
+ att_w = self.att(neighbor_embed, pocket_embed, len(neighbor_list))
42
+ att_res = torch.mm(neighbor_embed.t(), att_w).t()
43
+ return (att_res + pocket_embed) / 2
HGNN/PP_Encoder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
import random
import copy

class PPEncoder(nn.Module):
    """Pocket-pocket encoder: averages a pocket's own encoding with an
    attention-pooled encoding of its aligned training neighbors."""

    def __init__(self, pocket_encoder, embed_dim, pocket_graph=None, aggregator=None, assayid_lst_all=[], assayid_lst_train=[], base_model=None, cuda="cpu"):
        super(PPEncoder, self).__init__()

        self.pocket_encoder = pocket_encoder
        self.pocket_graph = pocket_graph
        self.aggregator = aggregator
        # Attribute is only present when a base model was supplied (as in the original).
        if base_model is not None:
            self.base_model = base_model
        self.embed_dim = embed_dim
        self.device = cuda
        self.linear1 = nn.Linear(2 * self.embed_dim, self.embed_dim)
        self.assayid_lst_all = assayid_lst_all
        self.assayid_set_train = set(assayid_lst_train)
        # assay id -> every position where it occurs in assayid_lst_all.
        self.assayid2idxes = {}
        for idx, aid in enumerate(assayid_lst_all):
            self.assayid2idxes.setdefault(aid, []).append(idx)

    def forward(self, nodes_pocket, nodes_lig=None, max_sample=10):
        """Return the mean of the self encoding and the neighbor aggregation."""
        to_neighs = []
        for node in nodes_pocket:
            assayid = self.assayid_lst_all[node]
            neighbors = []
            for n_assayid, score in self.pocket_graph.get(assayid, []):
                # Skip self-matches and assays outside the training split.
                if n_assayid == assayid or n_assayid not in self.assayid_set_train:
                    continue
                # A neighbor assay may appear at several positions; sample one.
                neighbors.append((random.choices(self.assayid2idxes[n_assayid])[0], score))
            to_neighs.append(neighbors)

        neigh_feats = self.aggregator.forward(nodes_pocket, to_neighs)
        self_feats = self.pocket_encoder(nodes_pocket, nodes_lig, max_sample)
        return (self_feats + neigh_feats) / 2

    def refine_pocket(self, pocket_embed, neighbor_list=None):
        """Inference-time variant operating on precomputed embeddings."""
        neigh_feats = self.aggregator.forward_inference(pocket_embed, neighbor_list)
        self_feats = self.pocket_encoder.refine_pocket(pocket_embed, neighbor_list)
        return (self_feats + neigh_feats) / 2
HGNN/align.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import glob
2
+ import skbio
3
+ import json, pickle, os
4
+ from skbio import alignment
5
+ from skbio import Protein
6
+ from tqdm import tqdm
7
+ from multiprocessing import Pool
8
+ import numpy as np
9
+ import torch
10
+
11
+
12
+ cutoff = 5.0
13
+ blosum50 = \
14
+ {
15
+ '*': {'*': 1, 'A': -5, 'C': -5, 'B': -5, 'E': -5, 'D': -5, 'G': -5,
16
+ 'F': -5, 'I': -5, 'H': -5, 'K': -5, 'M': -5, 'L': -5,
17
+ 'N': -5, 'Q': -5, 'P': -5, 'S': -5, 'R': -5, 'T': -5,
18
+ 'W': -5, 'V': -5, 'Y': -5, 'X': -5, 'Z': -5},
19
+ 'A': {'*': -5, 'A': 5, 'C': -1, 'B': -2, 'E': -1, 'D': -2, 'G': 0,
20
+ 'F': -3, 'I': -1, 'H': -2, 'K': -1, 'M': -1, 'L': -2,
21
+ 'N': -1, 'Q': -1, 'P': -1, 'S': 1, 'R': -2, 'T': 0, 'W': -3,
22
+ 'V': 0, 'Y': -2, 'X': -1, 'Z': -1},
23
+ 'C': {'*': -5, 'A': -1, 'C': 13, 'B': -3, 'E': -3, 'D': -4,
24
+ 'G': -3, 'F': -2, 'I': -2, 'H': -3, 'K': -3, 'M': -2,
25
+ 'L': -2, 'N': -2, 'Q': -3, 'P': -4, 'S': -1, 'R': -4,
26
+ 'T': -1, 'W': -5, 'V': -1, 'Y': -3, 'X': -1, 'Z': -3},
27
+ 'B': {'*': -5, 'A': -2, 'C': -3, 'B': 6, 'E': 1, 'D': 6, 'G': -1,
28
+ 'F': -4, 'I': -4, 'H': 0, 'K': 0, 'M': -3, 'L': -4, 'N': 5,
29
+ 'Q': 0, 'P': -2, 'S': 0, 'R': -1, 'T': 0, 'W': -5, 'V': -3,
30
+ 'Y': -3, 'X': -1, 'Z': 1},
31
+ 'E': {'*': -5, 'A': -1, 'C': -3, 'B': 1, 'E': 6, 'D': 2, 'G': -3,
32
+ 'F': -3, 'I': -4, 'H': 0, 'K': 1, 'M': -2, 'L': -3, 'N': 0,
33
+ 'Q': 2, 'P': -1, 'S': -1, 'R': 0, 'T': -1, 'W': -3, 'V': -3,
34
+ 'Y': -2, 'X': -1, 'Z': 5},
35
+ 'D': {'*': -5, 'A': -2, 'C': -4, 'B': 6, 'E': 2, 'D': 8, 'G': -1,
36
+ 'F': -5, 'I': -4, 'H': -1, 'K': -1, 'M': -4, 'L': -4, 'N': 2,
37
+ 'Q': 0, 'P': -1, 'S': 0, 'R': -2, 'T': -1, 'W': -5, 'V': -4,
38
+ 'Y': -3, 'X': -1, 'Z': 1},
39
+ 'G': {'*': -5, 'A': 0, 'C': -3, 'B': -1, 'E': -3, 'D': -1, 'G': 8,
40
+ 'F': -4, 'I': -4, 'H': -2, 'K': -2, 'M': -3, 'L': -4, 'N': 0,
41
+ 'Q': -2, 'P': -2, 'S': 0, 'R': -3, 'T': -2, 'W': -3, 'V': -4,
42
+ 'Y': -3, 'X': -1, 'Z': -2},
43
+ 'F': {'*': -5, 'A': -3, 'C': -2, 'B': -4, 'E': -3, 'D': -5,
44
+ 'G': -4, 'F': 8, 'I': 0, 'H': -1, 'K': -4, 'M': 0, 'L': 1,
45
+ 'N': -4, 'Q': -4, 'P': -4, 'S': -3, 'R': -3, 'T': -2, 'W': 1,
46
+ 'V': -1, 'Y': 4, 'X': -1, 'Z': -4},
47
+ 'I': {'*': -5, 'A': -1, 'C': -2, 'B': -4, 'E': -4, 'D': -4,
48
+ 'G': -4, 'F': 0, 'I': 5, 'H': -4, 'K': -3, 'M': 2, 'L': 2,
49
+ 'N': -3, 'Q': -3, 'P': -3, 'S': -3, 'R': -4, 'T': -1,
50
+ 'W': -3, 'V': 4, 'Y': -1, 'X': -1, 'Z': -3},
51
+ 'H': {'*': -5, 'A': -2, 'C': -3, 'B': 0, 'E': 0, 'D': -1, 'G': -2,
52
+ 'F': -1, 'I': -4, 'H': 10, 'K': 0, 'M': -1, 'L': -3, 'N': 1,
53
+ 'Q': 1, 'P': -2, 'S': -1, 'R': 0, 'T': -2, 'W': -3, 'V': -4,
54
+ 'Y': 2, 'X': -1, 'Z': 0},
55
+ 'K': {'*': -5, 'A': -1, 'C': -3, 'B': 0, 'E': 1, 'D': -1, 'G': -2,
56
+ 'F': -4, 'I': -3, 'H': 0, 'K': 6, 'M': -2, 'L': -3, 'N': 0,
57
+ 'Q': 2, 'P': -1, 'S': 0, 'R': 3, 'T': -1, 'W': -3, 'V': -3,
58
+ 'Y': -2, 'X': -1, 'Z': 1},
59
+ 'M': {'*': -5, 'A': -1, 'C': -2, 'B': -3, 'E': -2, 'D': -4,
60
+ 'G': -3, 'F': 0, 'I': 2, 'H': -1, 'K': -2, 'M': 7, 'L': 3,
61
+ 'N': -2, 'Q': 0, 'P': -3, 'S': -2, 'R': -2, 'T': -1, 'W': -1,
62
+ 'V': 1, 'Y': 0, 'X': -1, 'Z': -1},
63
+ 'L': {'*': -5, 'A': -2, 'C': -2, 'B': -4, 'E': -3, 'D': -4,
64
+ 'G': -4, 'F': 1, 'I': 2, 'H': -3, 'K': -3, 'M': 3, 'L': 5,
65
+ 'N': -4, 'Q': -2, 'P': -4, 'S': -3, 'R': -3, 'T': -1,
66
+ 'W': -2, 'V': 1, 'Y': -1, 'X': -1, 'Z': -3},
67
+ 'N': {'*': -5, 'A': -1, 'C': -2, 'B': 5, 'E': 0, 'D': 2, 'G': 0,
68
+ 'F': -4, 'I': -3, 'H': 1, 'K': 0, 'M': -2, 'L': -4, 'N': 7,
69
+ 'Q': 0, 'P': -2, 'S': 1, 'R': -1, 'T': 0, 'W': -4, 'V': -3,
70
+ 'Y': -2, 'X': -1, 'Z': 0},
71
+ 'Q': {'*': -5, 'A': -1, 'C': -3, 'B': 0, 'E': 2, 'D': 0, 'G': -2,
72
+ 'F': -4, 'I': -3, 'H': 1, 'K': 2, 'M': 0, 'L': -2, 'N': 0,
73
+ 'Q': 7, 'P': -1, 'S': 0, 'R': 1, 'T': -1, 'W': -1, 'V': -3,
74
+ 'Y': -1, 'X': -1, 'Z': 4},
75
+ 'P': {'*': -5, 'A': -1, 'C': -4, 'B': -2, 'E': -1, 'D': -1,
76
+ 'G': -2, 'F': -4, 'I': -3, 'H': -2, 'K': -1, 'M': -3,
77
+ 'L': -4, 'N': -2, 'Q': -1, 'P': 10, 'S': -1, 'R': -3,
78
+ 'T': -1, 'W': -4, 'V': -3, 'Y': -3, 'X': -1, 'Z': -1},
79
+ 'S': {'*': -5, 'A': 1, 'C': -1, 'B': 0, 'E': -1, 'D': 0, 'G': 0,
80
+ 'F': -3, 'I': -3, 'H': -1, 'K': 0, 'M': -2, 'L': -3, 'N': 1,
81
+ 'Q': 0, 'P': -1, 'S': 5, 'R': -1, 'T': 2, 'W': -4, 'V': -2,
82
+ 'Y': -2, 'X': -1, 'Z': 0},
83
+ 'R': {'*': -5, 'A': -2, 'C': -4, 'B': -1, 'E': 0, 'D': -2, 'G': -3,
84
+ 'F': -3, 'I': -4, 'H': 0, 'K': 3, 'M': -2, 'L': -3, 'N': -1,
85
+ 'Q': 1, 'P': -3, 'S': -1, 'R': 7, 'T': -1, 'W': -3, 'V': -3,
86
+ 'Y': -1, 'X': -1, 'Z': 0},
87
+ 'T': {'*': -5, 'A': 0, 'C': -1, 'B': 0, 'E': -1, 'D': -1, 'G': -2,
88
+ 'F': -2, 'I': -1, 'H': -2, 'K': -1, 'M': -1, 'L': -1, 'N': 0,
89
+ 'Q': -1, 'P': -1, 'S': 2, 'R': -1, 'T': 5, 'W': -3, 'V': 0,
90
+ 'Y': -2, 'X': -1, 'Z': -1},
91
+ 'W': {'*': -5, 'A': -3, 'C': -5, 'B': -5, 'E': -3, 'D': -5,
92
+ 'G': -3, 'F': 1, 'I': -3, 'H': -3, 'K': -3, 'M': -1, 'L': -2,
93
+ 'N': -4, 'Q': -1, 'P': -4, 'S': -4, 'R': -3, 'T': -3,
94
+ 'W': 15, 'V': -3, 'Y': 2, 'X': -1, 'Z': -2},
95
+ 'V': {'*': -5, 'A': 0, 'C': -1, 'B': -3, 'E': -3, 'D': -4, 'G': -4,
96
+ 'F': -1, 'I': 4, 'H': -4, 'K': -3, 'M': 1, 'L': 1, 'N': -3,
97
+ 'Q': -3, 'P': -3, 'S': -2, 'R': -3, 'T': 0, 'W': -3, 'V': 5,
98
+ 'Y': -1, 'X': -1, 'Z': -3},
99
+ 'Y': {'*': -5, 'A': -2, 'C': -3, 'B': -3, 'E': -2, 'D': -3,
100
+ 'G': -3, 'F': 4, 'I': -1, 'H': 2, 'K': -2, 'M': 0, 'L': -1,
101
+ 'N': -2, 'Q': -1, 'P': -3, 'S': -2, 'R': -1, 'T': -2, 'W': 2,
102
+ 'V': -1, 'Y': 8, 'X': -1, 'Z': -2},
103
+ 'X': {'*': -5, 'A': -1, 'C': -1, 'B': -1, 'E': -1, 'D': -1,
104
+ 'G': -1, 'F': -1, 'I': -1, 'H': -1, 'K': -1, 'M': -1,
105
+ 'L': -1, 'N': -1, 'Q': -1, 'P': -1, 'S': -1, 'R': -1,
106
+ 'T': -1, 'W': -1, 'V': -1, 'Y': -1, 'X': -1, 'Z': -1},
107
+ 'Z': {'*': -5, 'A': -1, 'C': -3, 'B': 1, 'E': 5, 'D': 1, 'G': -2,
108
+ 'F': -4, 'I': -3, 'H': 0, 'K': 1, 'M': -1, 'L': -3, 'N': 0,
109
+ 'Q': 4, 'P': -1, 'S': 0, 'R': 0, 'T': -1, 'W': -2, 'V': -3,
110
+ 'Y': -2, 'X': -1, 'Z': 5}}
111
+
112
+
113
+ import math
114
+ def get_align_score(fasta_1, fasta_2):
115
+ kwargs = {}
116
+ kwargs['suppress_sequences'] = False
117
+ kwargs['zero_index'] = True
118
+ kwargs['protein'] = True
119
+ kwargs['substitution_matrix'] = blosum50
120
+ query = alignment.StripedSmithWaterman(fasta_1, **kwargs)
121
+ align = query(fasta_2)
122
+ score = align.optimal_alignment_score
123
+ return float(score)
124
+
125
+
126
+ def read_data(data_root, result_root):
127
+ training_data_fastas = json.load(open(f"{data_root}/align_fastas_dict_10.23.json"))
128
+ bdb_fastas_dict = training_data_fastas['bdb_fastas']
129
+ pdbbind_fastas_dict = training_data_fastas['pdb_fastas']
130
+
131
+ save_dir_bdb = f"{result_root}/BDB"
132
+ save_dir_pdbbind = f"{result_root}/PDBBind"
133
+ mol_feat_train_bdb = np.load(f'{save_dir_bdb}/bdb_mol_reps.npy')
134
+ pocket_feat_train_bdb = np.load(f'{save_dir_bdb}/bdb_pocket_reps.npy')
135
+ pocket_names_bdb = json.load(open(f"{save_dir_bdb}/bdb_pocket_names.json"))
136
+ mol_smis_bdb = json.load(open(f"{save_dir_bdb}/bdb_mol_smis.json"))
137
+ bdb_pocket_feat_dict = {pocket_names_bdb[i]: pocket_feat_train_bdb[i] for i in range(len(pocket_names_bdb))}
138
+ bdb_mol_feat_dict = {mol_smis_bdb[i]: mol_feat_train_bdb[i] for i in range(len(mol_smis_bdb))}
139
+
140
+ mol_feat_train_pdbbind = np.load(f'{save_dir_pdbbind}/train_mol_reps.npy')
141
+ pocket_feat_train_pdbbind = np.load(f'{save_dir_pdbbind}/train_pocket_reps.npy')
142
+ pocket_names_pdbbind = json.load(open(f"{save_dir_pdbbind}/train_pdbbind_ids.json"))
143
+ mol_smis_pdbbind = json.load(open(f"{save_dir_pdbbind}/train_mol_smis.json"))
144
+ pdbbind_pocket_feat_dict = {pocket_names_pdbbind[i]: pocket_feat_train_pdbbind[i] for i in range(len(pocket_names_pdbbind))}
145
+ pdbbind_mol_feat_dict = {mol_smis_pdbbind[i]: mol_feat_train_pdbbind[i] for i in range(len(mol_smis_pdbbind))}
146
+
147
+ return bdb_fastas_dict, bdb_pocket_feat_dict, bdb_mol_feat_dict, pdbbind_fastas_dict, pdbbind_pocket_feat_dict, pdbbind_mol_feat_dict
148
+
149
+ def get_neighbor_pocket(test_fasta, data_root, result_root, device):
150
+ # 1. Read data file
151
+ print("reading datas")
152
+ bdb_fastas_dict, bdb_pocket_feat_dict, bdb_mol_feat_dict, pdbbind_fastas_dict, \
153
+ pdbbind_pocket_feat_dict, pdbbind_mol_feat_dict = read_data(data_root, result_root)
154
+
155
+ training_assay = json.load(open(f"{data_root}/train_label_blend_seq_full.json"))
156
+ training_assay += json.load(open(f"{data_root}/train_label_pdbbind_seq.json"))
157
+ assay_dict = {}
158
+ for assay in training_assay:
159
+ assay["ligands"] = sorted(assay["ligands"], key=lambda x: x["act"], reverse=True)
160
+ if "assay_id" in assay:
161
+ assay_dict[assay["assay_id"]] = assay
162
+ else:
163
+ assay_dict[assay["pockets"][0][:4]] = assay
164
+
165
+ skip = 0
166
+ # 2. run alignment
167
+ print("running alignment pdbbind")
168
+ align_res_list = []
169
+ for a_name, fasta in tqdm(pdbbind_fastas_dict.items()):
170
+ if a_name not in pdbbind_pocket_feat_dict:
171
+ skip += 1
172
+ continue
173
+ p_name = a_name
174
+ l_smi = assay_dict[a_name]["ligands"][0]["smi"]
175
+ align_score = get_align_score(test_fasta, fasta) / get_align_score(test_fasta, test_fasta)
176
+ if align_score >= 0.5:
177
+ align_res_list.append((pdbbind_pocket_feat_dict[p_name], pdbbind_mol_feat_dict[l_smi], align_score, a_name))
178
+
179
+ print("running alignment bindingdb")
180
+ for a_name, fasta in tqdm(bdb_fastas_dict.items()):
181
+ if a_name not in assay_dict:
182
+ skip += 1
183
+ continue
184
+ p_name = assay_dict[a_name]["pockets"][0]
185
+ l_smi = assay_dict[a_name]["ligands"][0]["smi"]
186
+ if l_smi not in bdb_mol_feat_dict:
187
+ continue
188
+ align_score = get_align_score(test_fasta, fasta) / get_align_score(test_fasta, test_fasta)
189
+ if align_score >= 0.5:
190
+ align_res_list.append((bdb_pocket_feat_dict[p_name], bdb_mol_feat_dict[l_smi], align_score, a_name))
191
+
192
+ for i, res in enumerate(align_res_list):
193
+ align_res_list[i] = (torch.tensor(res[0]).float().to(device),
194
+ torch.tensor(res[1]).float().to(device),
195
+ torch.tensor(int((res[2] - 0.5) * 10)).to(device))
196
+
197
+ return align_res_list
198
+
HGNN/data/CoreSet.dat ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #code resl year logKa Ka target
2
+ 4llx 1.75 2014 2.89 Ki=1300uM 1
3
+ 5c28 1.56 2015 5.66 Ki=2.2uM 1
4
+ 3uuo 2.11 2012 7.96 Ki=11nM 1
5
+ 3ui7 2.28 2011 9.00 Ki=1nM 1
6
+ 5c2h 2.09 2015 11.09 Ki=8.2pM 1
7
+ 2v00 1.55 2007 3.66 Kd=0.22mM 2
8
+ 3wz8 1.45 2015 5.82 Ki=1.5uM 2
9
+ 3pww 1.22 2011 7.32 Ki=48nM 2
10
+ 3prs 1.38 2011 7.82 Ki=15nM 2
11
+ 3uri 2.10 2012 9.00 Ki=1nM 2
12
+ 4m0z 2.00 2014 5.19 Kd=6.4uM 3
13
+ 4m0y 1.70 2014 6.46 Kd=0.35uM 3
14
+ 3qgy 2.10 2011 7.80 Ki=16nM 3
15
+ 4qd6 2.45 2015 8.64 Ki=2.3nM 3
16
+ 4rfm 2.10 2015 10.05 Ki=90pM 3
17
+ 4cr9 1.70 2015 4.10 Ki=80uM 4
18
+ 4cra 1.80 2015 7.22 Ki=0.06uM 4
19
+ 4x6p 1.93 2015 8.30 Ki=5nM 4
20
+ 4crc 1.60 2015 8.72 Ki=0.0019uM 4
21
+ 4ty7 2.09 2014 9.52 Ki=0.3nM 4
22
+ 5aba 1.62 2015 2.98 Kd=1040uM 5
23
+ 5a7b 1.40 2015 3.57 Kd=271uM 5
24
+ 4agn 1.60 2012 3.97 Kd=107uM 5
25
+ 4agp 1.50 2012 4.69 Kd=20.6uM 5
26
+ 4agq 1.42 2012 5.01 Kd=9.7uM 5
27
+ 3bgz 2.40 2007 6.26 Ki=0.55uM 6
28
+ 3jya 2.10 2009 6.89 Ki=0.1301uM 6
29
+ 2c3i 1.90 2005 7.60 Kd=25nM 6
30
+ 4k18 2.05 2013 8.96 Ki=1.1nM 6
31
+ 5dwr 2.00 2015 11.22 Ki=6pM 6
32
+ 3mss 1.95 2010 4.66 Kd=22uM 7
33
+ 3k5v 1.74 2010 6.30 Kd=0.5uM 7
34
+ 3pyy 1.85 2011 6.86 Kd=137nM 7
35
+ 2v7a 2.50 2007 8.30 Kd=0.005uM 7
36
+ 4twp 2.40 2015 10.00 Ki=100pM 7
37
+ 3wtj 2.24 2015 6.53 Kd=0.297uM 8
38
+ 3zdg 2.48 2013 7.10 Ki=79nM 8
39
+ 3u8k 2.47 2011 8.66 Ki=2.2nM 8
40
+ 4qac 2.10 2014 9.40 Kd=0.4nM 8
41
+ 3u8n 2.35 2011 10.17 Ki=0.067nM 8
42
+ 1a30 2.00 1998 4.30 Ki=50uM 9
43
+ 2qnq 2.30 2008 6.11 Ki=0.77uM 9
44
+ 1g2k 1.95 2001 7.96 Ki=11nM 9
45
+ 1eby 2.29 2002 9.70 Ki=0.20nM 9
46
+ 3o9i 1.45 2011 11.82 Ki=1.5pM 9
47
+ 4lzs 2.20 2014 4.80 Kd=16uM 10
48
+ 3u5j 1.60 2011 5.61 Kd=2.46uM 10
49
+ 4wiv 1.56 2014 6.26 Kd=550nM 10
50
+ 4ogj 1.65 2014 6.79 Kd=164nM 10
51
+ 3p5o 1.60 2010 7.30 Kd=50.5nM 10
52
+ 1ps3 1.80 2003 2.28 Ki=5.2mM 11
53
+ 3dx1 1.21 2009 3.58 Ki=265uM 11
54
+ 3d4z 1.39 2008 4.89 Ki=13uM 11
55
+ 3dx2 1.40 2009 6.82 Ki=150nM 11
56
+ 3ejr 1.27 2009 8.57 Ki=2.7nM 11
57
+ 3l7b 2.00 2010 2.40 Ki=4.01mM 12
58
+ 4eky 2.45 2012 3.52 Ki=303.0uM 12
59
+ 3g2n 2.10 2010 4.09 Ki=81uM 12
60
+ 3syr 2.40 2012 5.10 Ki=7.9uM 12
61
+ 3ebp 2.00 2009 5.91 Ki=1.24uM 12
62
+ 2w66 2.27 2009 4.05 Ki=89uM 13
63
+ 2w4x 2.42 2009 4.85 Kd=14uM 13
64
+ 2wca 2.30 2009 5.60 Ki=2.5uM 13
65
+ 2xj7 2.00 2010 6.66 Ki=220nM 13
66
+ 2vvn 1.85 2008 7.30 Kd=50nM 13
67
+ 3aru 1.90 2011 3.22 Kd=600uM 14
68
+ 3arv 1.50 2011 5.64 Kd=2.3uM 14
69
+ 3ary 1.35 2011 6.00 Kd=1.0uM 14
70
+ 3arq 1.50 2011 6.40 Kd=0.4uM 14
71
+ 3arp 1.55 2011 7.15 Kd=0.07uM 14
72
+ 4ih5 1.90 2013 4.11 Kd=78uM 15
73
+ 4ih7 2.30 2013 5.24 Kd=5.8uM 15
74
+ 3cj4 2.07 2008 6.51 Kd=0.31uM 15
75
+ 4eo8 1.80 2012 8.15 Kd=7nM 15
76
+ 3gnw 2.39 2009 9.10 Kd=0.79nM 15
77
+ 1gpk 2.10 2002 5.37 Ki=4.3uM 16
78
+ 1gpn 2.35 2002 6.48 Ki=0.334uM 16
79
+ 1h23 2.15 2002 8.35 Ki=4.5nM 16
80
+ 1h22 2.15 2002 9.10 Ki=0.8nM 16
81
+ 1e66 2.10 2001 9.89 Ki=0.13nM 16
82
+ 3f3a 2.00 2008 4.19 Ki=64.8uM 17
83
+ 3f3c 2.10 2008 6.02 Ki=950nM 17
84
+ 4mme 2.50 2013 6.50 Kd=318nM 17
85
+ 3f3d 2.30 2008 7.16 Kd=69nM 17
86
+ 3f3e 1.80 2008 7.70 Kd=20nM 17
87
+ 2wbg 1.85 2009 4.45 Ki=35.2uM 18
88
+ 2cbv 1.95 2006 5.48 Kd=3.3uM 18
89
+ 2j78 1.65 2006 6.42 Kd=384nM 18
90
+ 2j7h 1.95 2006 7.19 Kd=65nM 18
91
+ 2cet 1.97 2006 8.02 Kd=9.6nM 18
92
+ 3udh 1.70 2012 2.85 Kd=1.4mM 19
93
+ 3rsx 2.48 2011 4.41 Kd=38.8uM 19
94
+ 4djv 1.73 2012 6.72 Ki=0.19uM 19
95
+ 2vkm 2.05 2008 8.74 Ki=1.8nM 19
96
+ 4gid 2.00 2012 10.77 Ki=0.017nM 19
97
+ 4jfs 2.00 2013 5.27 Ki=5.4uM 20
98
+ 4j28 1.73 2013 5.70 Ki=2.0uM 20
99
+ 2wvt 1.80 2010 6.12 Kd=755nM 20
100
+ 2xii 1.80 2010 7.20 Kd=63.3nM 20
101
+ 4pcs 1.77 2014 7.85 Ki=14nM 20
102
+ 3rr4 1.68 2012 4.55 Ki=28.05uM 21
103
+ 1s38 1.81 2004 5.15 Ki=7.0uM 21
104
+ 1r5y 1.20 2004 6.46 Ki=0.35uM 21
105
+ 3gc5 1.40 2009 7.26 Ki=55nM 21
106
+ 3ge7 1.50 2009 8.70 Ki=2nM 21
107
+ 4dli 1.91 2013 5.62 Kd=2.40uM 22
108
+ 2zb1 2.50 2008 6.32 Kd=0.48uM 22
109
+ 4f9w 2.00 2013 6.94 Ki=114nM 22
110
+ 3e92 2.00 2008 8.00 Ki=10nM 22
111
+ 3e93 2.00 2008 8.85 Ki=1.4nM 22
112
+ 4owm 1.99 2014 2.96 Ki=1090uM 23
113
+ 3twp 1.83 2012 3.92 Ki=119uM 23
114
+ 3r88 1.73 2012 4.82 Ki=15uM 23
115
+ 4gkm 1.67 2013 5.17 Ki=6.8uM 23
116
+ 3qqs 1.97 2012 5.82 Ki=1.5uM 23
117
+ 3gv9 1.80 2009 2.12 Ki=7.5mM 24
118
+ 3gr2 1.80 2009 2.52 Ki=3mM 24
119
+ 4kz6 1.68 2014 3.10 Ki=0.8mM 24
120
+ 4jxs 1.90 2014 4.74 Ki=18uM 24
121
+ 2r9w 1.80 2008 5.10 Ki=8uM 24
122
+ 2hb1 2.00 2006 3.80 Ki=160uM 25
123
+ 1bzc 2.35 1999 4.92 Ki=12uM 25
124
+ 2qbr 2.30 2008 6.33 Ki=0.47uM 25
125
+ 2qbq 2.10 2008 7.44 Ki=0.036uM 25
126
+ 2qbp 2.50 2008 8.40 Ki=0.004uM 25
127
+ 1q8t 2.00 2003 4.76 Kd=17.5uM 26
128
+ 1ydr 2.20 1997 5.52 Ki=3.0uM 26
129
+ 1q8u 1.90 2003 5.96 Kd=1.1uM 26
130
+ 1ydt 2.30 1997 7.32 Ki=48nM 26
131
+ 3ag9 2.00 2010 8.05 Ki=9nM 26
132
+ 3fcq 1.75 2009 2.77 Ki=1.7mM 27
133
+ 1z9g 1.70 2005 5.64 Ki=2.3uM 27
134
+ 1qf1 2.00 1999 7.32 Ki=48nM 27
135
+ 5tmn 1.60 1989 8.04 Ki=9.1nM 27
136
+ 4tmn 1.70 1989 10.17 Ki=0.068nM 27
137
+ 4ddk 1.75 2013 2.29 Kd=5.13mM 28
138
+ 4ddh 2.07 2013 3.32 Kd=0.48mM 28
139
+ 3ivg 1.95 2009 4.30 Kd=50uM 28
140
+ 3coz 2.00 2008 5.57 Kd=2.7uM 28
141
+ 3coy 2.03 2008 6.02 Kd=0.96uM 28
142
+ 3pxf 1.80 2011 4.43 Kd=37uM 29
143
+ 4eor 2.20 2013 6.30 Ki=500nM 29
144
+ 2xnb 1.85 2010 6.83 Ki=149nM 29
145
+ 1pxn 2.50 2004 7.15 Ki=0.07uM 29
146
+ 2fvd 1.85 2006 8.52 Ki=3nM 29
147
+ 4k77 2.40 2013 6.63 Ki=235nM 30
148
+ 4e5w 1.86 2012 7.66 Ki=22nM 30
149
+ 4ivb 1.90 2013 8.72 Ki=1.9nM 30
150
+ 4ivd 1.93 2013 9.52 Ki=0.3nM 30
151
+ 4ivc 2.35 2013 10.00 Ki=0.1nM 30
152
+ 4f09 2.40 2012 6.70 Ki=200nM 31
153
+ 4gfm 2.30 2013 7.22 Ki=0.06uM 31
154
+ 4hge 2.30 2012 7.92 Ki=11.9nM 31
155
+ 4e6q 1.95 2012 8.36 Ki=4.4nM 31
156
+ 4jia 1.85 2013 9.22 Ki=0.6nM 31
157
+ 2brb 2.10 2005 4.86 Ki=13.7uM 32
158
+ 2br1 2.00 2005 5.14 Ki=7.2uM 32
159
+ 3jvr 1.76 2009 5.72 Ki=1.89uM 32
160
+ 3jvs 1.90 2009 6.54 Kd=0.29uM 32
161
+ 1nvq 2.00 2003 8.25 Ki=5.6nM 32
162
+ 3acw 1.63 2010 4.76 Ki=17.5uM 33
163
+ 4ea2 2.05 2012 6.44 Ki=0.36uM 33
164
+ 2zcr 1.92 2008 6.87 Ki=135nM 33
165
+ 2zy1 1.78 2009 7.40 Ki=0.04uM 33
166
+ 2zcq 2.38 2008 8.82 Ki=1.5nM 33
167
+ 1bcu 2.00 1998 3.28 Kd=0.53mM 34
168
+ 3bv9 1.80 2008 5.36 Ki=4.4uM 34
169
+ 1oyt 1.67 2003 7.24 Ki=0.057uM 34
170
+ 2zda 1.73 2008 8.40 Ki=4nM 34
171
+ 3utu 1.55 2012 10.92 Ki=0.012nM 34
172
+ 3u9q 1.52 2011 4.38 Ki=41.7uM 35
173
+ 2yfe 2.00 2012 6.63 Ki=0.236uM 35
174
+ 3fur 2.30 2009 8.00 Ki=10nM 35
175
+ 3b1m 1.60 2011 8.48 Ki=3.3nM 35
176
+ 2p4y 2.25 2008 9.00 Ki=1nM 35
177
+ 3uo4 2.45 2012 6.52 Kd=299nM 36
178
+ 3up2 2.30 2012 7.40 Kd=40nM 36
179
+ 3e5a 2.30 2008 8.23 Ki=5.9nM 36
180
+ 2wtv 2.40 2010 8.74 Ki=1.8nM 36
181
+ 3myg 2.40 2010 10.70 Kd=0.02nM 36
182
+ 3kgp 2.35 2009 2.57 Ki=2.68mM 37
183
+ 1c5z 1.85 2000 4.01 Ki=97uM 37
184
+ 1o5b 1.85 2004 5.77 Ki=1.7uM 37
185
+ 1owh 1.61 2003 7.40 Ki=40nM 37
186
+ 1sqa 2.00 2004 9.21 Ki=0.62nM 37
187
+ 4jsz 1.90 2013 2.30 Ki=5000uM 38
188
+ 3kwa 2.00 2010 4.08 Ki=84uM 38
189
+ 2weg 1.10 2009 6.50 Kd=314nM 38
190
+ 3ryj 1.39 2011 7.80 Kd=16nM 38
191
+ 3dd0 1.48 2009 9.00 Ki=1nM 38
192
+ 2xdl 1.98 2010 3.10 Kd=790uM 39
193
+ 3b27 1.50 2011 5.16 Kd=6.9uM 39
194
+ 1yc1 1.70 2005 6.17 Kd=680nM 39
195
+ 3rlr 1.70 2011 7.52 Ki=30nM 39
196
+ 2yki 1.67 2011 9.46 Kd=0.35nM 39
197
+ 1z95 1.80 2005 7.12 Ki=76nM 40
198
+ 3b68 1.90 2008 8.40 Ki=4nM 40
199
+ 3b5r 1.80 2008 8.77 Ki=1.7nM 40
200
+ 3b65 1.80 2008 9.27 Ki=0.54nM 40
201
+ 3g0w 1.95 2009 9.52 Ki=0.3nM 40
202
+ 4u4s 1.90 2014 2.92 Kd=1200uM 41
203
+ 1p1q 2.00 2003 4.89 Kd=12.8uM 41
204
+ 1syi 2.10 2005 5.44 Ki=3590nM 41
205
+ 1p1n 1.60 2003 6.80 Kd=0.16uM 41
206
+ 2al5 1.65 2005 8.40 Ki=4nM 41
207
+ 3g2z 1.50 2009 2.36 Ki=4.4mM 42
208
+ 3g31 1.70 2009 2.89 Ki=1.3mM 42
209
+ 4de2 1.40 2012 4.12 Ki=76.0uM 42
210
+ 4de3 1.44 2012 5.52 Ki=3.0uM 42
211
+ 4de1 1.26 2012 5.96 Ki=1.1uM 42
212
+ 1vso 1.85 2007 4.72 Ki=18.98uM 43
213
+ 4dld 2.00 2012 5.82 Ki=1.5uM 43
214
+ 3gbb 2.10 2009 6.90 Ki=126nM 43
215
+ 3fv2 1.50 2010 8.11 Ki=7.7nM 43
216
+ 3fv1 1.50 2010 9.30 Ki=0.5nM 43
217
+ 4mgd 1.90 2014 4.69 Kd=20.19uM 44
218
+ 2qe4 2.40 2007 7.96 Ki=11.0nM 44
219
+ 1qkt 2.20 2000 9.04 Kd=0.92nM 44
220
+ 2pog 1.84 2007 9.54 Ki=0.29nM 44
221
+ 2p15 1.94 2007 10.30 Kd=50pM 44
222
+ 2y5h 1.33 2011 5.79 Ki=1620nM 45
223
+ 1lpg 2.00 2003 7.09 Ki=82nM 45
224
+ 2xbv 1.66 2010 8.43 Kd=3.7nM 45
225
+ 1z6e 1.80 2006 9.72 Ki=0.19nM 45
226
+ 1mq6 2.10 2003 11.15 Ki=7pM 45
227
+ 1nc3 2.20 2003 5.00 Ki=10uM 46
228
+ 1nc1 2.00 2003 6.12 Ki=0.75uM 46
229
+ 1y6r 2.20 2005 10.11 Ki=77pM 46
230
+ 4f2w 2.00 2013 11.30 Ki=5.0pM 46
231
+ 4f3c 1.93 2013 11.82 Ki=1.5pM 46
232
+ 1uto 1.15 2004 2.27 Kd=5.32mM 47
233
+ 4abg 1.52 2012 3.57 Kd=271uM 47
234
+ 3gy4 1.55 2010 5.10 Kd=8uM 47
235
+ 1k1i 2.20 2001 6.58 Kd=264nM 47
236
+ 1o3f 1.55 2003 7.96 Ki=0.011uM 47
237
+ 2yge 1.96 2011 5.06 Kd=8.62uM 48
238
+ 2fxs 2.00 2007 6.06 Kd=0.87uM 48
239
+ 2iwx 1.50 2006 6.68 Kd=0.21uM 48
240
+ 2wer 1.60 2009 7.05 Kd=90nM 48
241
+ 2vw5 1.90 2008 8.52 Kd=3nM 48
242
+ 4kzq 2.25 2013 6.10 Kd=788nM 49
243
+ 4kzu 2.10 2013 6.50 Ki=313nM 49
244
+ 4j21 1.93 2013 7.41 Kd=39nM 49
245
+ 4j3l 2.09 2013 7.80 Kd=16nM 49
246
+ 3kr8 2.10 2009 8.10 Kd=8nM 49
247
+ 2ymd 1.96 2012 3.16 Kd=693uM 50
248
+ 2wnc 2.20 2009 6.32 Kd=479nM 50
249
+ 2xys 1.91 2011 7.42 Ki=38nM 50
250
+ 2wn9 1.75 2009 8.52 Kd=3.0nM 50
251
+ 2x00 2.40 2010 11.33 Kd=4.7pM 50
252
+ 3ozt 1.48 2011 4.13 Ki=74.9uM 51
253
+ 3ozs 1.44 2011 5.33 Ki=4645nM 51
254
+ 3oe5 1.52 2011 6.88 Ki=132nM 51
255
+ 3oe4 1.49 2011 7.47 Ki=34nM 51
256
+ 3nw9 1.65 2011 9.00 Ki=1nM 51
257
+ 3ao4 1.95 2011 2.07 Kd=8.5mM 52
258
+ 3zt2 1.70 2012 2.84 Kd=1435uM 52
259
+ 3zsx 1.95 2012 3.28 Kd=519uM 52
260
+ 4cig 1.70 2014 3.67 Kd=214uM 52
261
+ 3zso 1.75 2012 5.12 Kd=7.6uM 52
262
+ 3n7a 2.00 2011 3.70 Ki=200uM 53
263
+ 4ciw 2.20 2014 4.82 Ki=15.0uM 53
264
+ 3n86 1.90 2011 5.64 Ki=2.3uM 53
265
+ 3n76 1.90 2011 6.85 Ki=0.14uM 53
266
+ 2xb8 2.40 2010 7.59 Ki=26nM 53
267
+ 4bkt 2.35 2013 3.62 Kd=240uM 54
268
+ 4w9c 2.20 2014 4.65 Kd=22.2uM 54
269
+ 4w9l 2.20 2014 5.02 Kd=9.52uM 54
270
+ 4w9i 2.40 2014 5.96 Kd=1.10uM 54
271
+ 4w9h 2.10 2014 6.73 Kd=0.185uM 54
272
+ 3nq9 1.90 2010 4.03 Kd=92.6uM 55
273
+ 3ueu 2.10 2011 5.24 Kd=5.81uM 55
274
+ 3uev 1.90 2011 5.89 Kd=1.29uM 55
275
+ 3uew 2.00 2011 6.31 Kd=0.49uM 55
276
+ 3uex 2.10 2011 6.92 Kd=0.12uM 55
277
+ 3lka 1.80 2010 2.82 Kd=1.5mM 56
278
+ 3ehy 1.90 2009 5.85 Ki=1.4uM 56
279
+ 3tsk 2.00 2012 7.17 Kd=67nM 56
280
+ 3nx7 1.80 2010 8.10 Kd=7.88nM 56
281
+ 4gr0 1.50 2013 9.55 Ki=0.28nM 56
282
+ 3dxg 1.39 2009 2.40 Ki=4.0mM 57
283
+ 3d6q 1.60 2009 3.76 Ki=172uM 57
284
+ 1w4o 1.60 2005 5.22 Ki=6uM 57
285
+ 1o0h 1.20 2003 5.92 Ki=1.2uM 57
286
+ 1u1b 2.00 2005 7.80 Kd=16nM 57
HGNN/data/PDBbind_v2020/index/INDEX_general_PL_data.2020 ADDED
The diff for this file is too large to render. See raw diff
 
HGNN/data/PDBbind_v2020/index/INDEX_general_PL_name.2020 ADDED
The diff for this file is too large to render. See raw diff
 
HGNN/data/PDBbind_v2020/index/INDEX_refined_data.2020 ADDED
The diff for this file is too large to render. See raw diff
 
HGNN/data/PDBbind_v2020/index/INDEX_refined_name.2020 ADDED
The diff for this file is too large to render. See raw diff
 
HGNN/main.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from PL_Encoder import PLEncoder
8
+ from PL_Aggregator import PLAggregator
9
+ from PP_Encoder import PPEncoder
10
+ from PP_Aggregator import PPAggregator
11
+ from screen_dataset import *
12
+ import torch.nn.functional as F
13
+ import torch.utils.data
14
+ import argparse
15
+ import os
16
+ from util import cal_metrics
17
+
18
+
19
class HGNN(nn.Module):
    """Heterogeneous graph network scoring pocket-ligand compatibility.

    ``enc_u`` embeds pocket nodes (optionally conditioned on ligand nodes) and
    ``enc_v`` embeds ligand nodes; affinity is the dot product of the two
    embeddings, scaled by a learnable CLIP-style temperature for the loss.
    """

    def __init__(self, enc_u, enc_v, r2e):
        super(HGNN, self).__init__()
        self.enc_u = enc_u
        self.enc_v = enc_v
        self.embed_dim = enc_u.embed_dim

        self.w_ur1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_ur2 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_vr1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_vr2 = nn.Linear(self.embed_dim, self.embed_dim)

        self.r2e = r2e
        self.bn1 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)
        self.bn2 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)

        # Learnable logit temperature, initialised to log(14) (CLIP-style).
        # FIX: was created with device="cuda", which crashed on CPU-only
        # machines; as a Parameter it now follows the module via .to(device).
        self.logit_scale = nn.Parameter(torch.ones([1]) * np.log(14))

    def trainable_parameters(self):
        """Yield only parameters with requires_grad=True (frozen embeddings excluded)."""
        for name, param in self.named_parameters(recurse=True):
            if param.requires_grad:
                yield param

    def forward(self, nodes_u, nodes_v):
        """Return (pocket_embeddings, ligand_embeddings) for the given node ids."""
        embeds_u = self.enc_u(nodes_u, nodes_v)
        embeds_v = self.enc_v(nodes_v)
        return embeds_u, embeds_v

    def criterion(self, x_u, x_v, labels):
        """Symmetric contrastive (InfoNCE-style) loss plus screening metrics.

        labels[i][j] == 1 marks ligand j as a true binder of pocket i; all
        positives except the diagonal are masked out of the softmax so each
        pocket/ligand is contrasted only against its own pairing.

        Returns (loss, mean_metrics_dict, raw_score_matrix).
        """
        netout = torch.matmul(x_u, torch.transpose(x_v, 0, 1))
        score = netout * self.logit_scale.exp().detach()
        # Mask off-diagonal positives with a large negative logit so they do
        # not act as negatives in the softmax.
        score = (labels - torch.eye(len(labels)).to(labels.device)) * -1e6 + score

        lprobs_pocket = F.log_softmax(score.float(), dim=-1)
        lprobs_pocket = lprobs_pocket.view(-1, lprobs_pocket.size(-1))
        sample_size = lprobs_pocket.size(0)
        # FIX: targets follow the logits' device instead of hard-coded .cuda().
        targets = torch.arange(sample_size, dtype=torch.long,
                               device=lprobs_pocket.device).view(-1)

        # pocket retrieve mol
        loss_pocket = F.nll_loss(
            lprobs_pocket,
            targets,
            reduction="mean"
        )

        lprobs_mol = F.log_softmax(torch.transpose(score.float(), 0, 1), dim=-1)
        lprobs_mol = lprobs_mol.view(-1, lprobs_mol.size(-1))
        lprobs_mol = lprobs_mol[:sample_size]

        # mol retrieve pocket
        loss_mol = F.nll_loss(
            lprobs_mol,
            targets,
            reduction="mean"
        )

        loss = 0.5 * loss_pocket + 0.5 * loss_mol

        # Per-pocket screening metrics computed on the unscaled scores.
        # (Unused top-1 bookkeeping from the original was removed.)
        ef_all = []
        for i in range(len(netout)):
            act_pocket = labels[i]
            affi_pocket = netout[i]
            ef_all.append(cal_metrics(affi_pocket.detach().cpu().numpy(),
                                      act_pocket.detach().cpu().numpy()))
        ef_mean = {k: np.mean([x[k] for x in ef_all]) for k in ef_all[0].keys()}

        return loss, ef_mean, netout

    def loss(self, nodes_u, nodes_v, labels):
        """Encode both sides and return (contrastive_loss, mean_metrics)."""
        x_u, x_v = self.forward(nodes_u, nodes_v)
        loss, ef_mean, netout = self.criterion(x_u, x_v, labels)
        return loss, ef_mean
94
+
95
+
96
def train(model, device, train_loader, optimizer, epoch, valid_idxes, valid_molidxes, valid_labels):
    """Run one training epoch over *train_loader*, then evaluate on the validation split.

    Each loader item is a singleton batch (pocket ids, ligand ids, label matrix);
    the running loss is printed every 200 steps. Always returns 0.
    """
    model.train()
    accum = 0.0
    for step, batch in enumerate(train_loader, 0):
        nodes_u, nodes_v, labels = batch
        optimizer.zero_grad()
        loss, _ = model.loss(nodes_u[0].to(device),
                             nodes_v[0].to(device),
                             labels[0].to(device))
        loss.backward(retain_graph=True)
        optimizer.step()
        accum += loss.item()
        if step % 200 == 0:
            print('[%d, %5d] loss: %.3f '%(epoch, step, accum / 200))
            accum = 0.0
    avg_loss, avg_acc = valid(
        model,
        device,
        torch.tensor(valid_idxes).to(device),
        torch.tensor(valid_molidxes).to(device),
        torch.tensor(valid_labels).to(device),
    )
    print('Valid set results:', avg_loss.item(), avg_acc)
    return 0
116
+
117
+
118
def valid(model, device, valid_idxes, valid_molidxes, valid_labels):
    """Compute loss and metrics on the validation split without gradients.

    FIX: ``model.train()`` now runs in a ``finally`` block so an exception in
    ``model.loss`` no longer leaves the model stuck in eval mode.
    """
    model.eval()
    try:
        with torch.no_grad():
            loss, ef = model.loss(valid_idxes.to(device),
                                  valid_molidxes.to(device),
                                  valid_labels.to(device))
    finally:
        model.train()
    return loss, ef
124
+
125
+
126
def test_dekois(model, device, epoch, result_root, dekois_pocket_name, dekois_idxes):
    """Evaluate virtual-screening performance on the DEKOIS benchmark.

    For every target, scores the precomputed ligand embeddings against both the
    HGNN pocket embedding and the raw (pre-GNN) pocket embedding, saves both
    score arrays next to the benchmark data, and prints the averaged metrics.
    """
    model.eval()
    ef_all, ef_raw_all = [], []  # unused loss accumulators from the original removed
    dekois_dir = f"{result_root}/DEKOIS"
    with torch.no_grad():
        for dekois_id, pocket_node_id in zip(dekois_pocket_name, dekois_idxes):
            embeds_pocket = model.enc_u([pocket_node_id], None, max_sample=-1)
            embeds_lig = torch.tensor(np.load(f"{dekois_dir}/{dekois_id}/saved_mols_embed.npy")).to(device).float()
            labels = np.load(f"{dekois_dir}/{dekois_id}/saved_labels.npy")
            # Raw pocket embedding straight from the embedding table (no GNN aggregation).
            embeds_pocket_raw = model.enc_u.aggregator.u2e(torch.tensor([pocket_node_id]).to(device))

            score = torch.matmul(embeds_pocket, torch.transpose(embeds_lig, 0, 1)).squeeze().detach().cpu().numpy()
            score_raw = torch.matmul(embeds_pocket_raw, torch.transpose(embeds_lig, 0, 1)).squeeze().detach().cpu().numpy()
            np.save(f"{dekois_dir}/{dekois_id}/GNN_res_epoch{epoch}.npy", score)
            np.save(f"{dekois_dir}/{dekois_id}/noGNN_res.npy", score_raw)
            ef_all.append(cal_metrics(score, labels))
            ef_raw_all.append(cal_metrics(score_raw, labels))

    model.train()
    # Average each metric (EF1, BEDROC, AUC, ...) over all targets.
    ef_all = {k: np.mean([x[k] for x in ef_all]) for k in ef_all[0].keys()}
    ef_raw_all = {k: np.mean([x[k] for x in ef_raw_all]) for k in ef_raw_all[0].keys()}
    print('Test on dekois:', ef_all)
    print('No HGNN on dekois:', ef_raw_all)
153
+
154
def test_dude(model, device, epoch, result_root, dude_pocket_name, dude_idxes):
    """Evaluate virtual-screening performance on the DUD-E benchmark.

    Mirrors ``test_dekois``: scores precomputed ligand embeddings against the
    HGNN and raw pocket embeddings, saves both score arrays, and prints the
    averaged metrics.
    """
    model.eval()
    ef_all, ef_raw_all = [], []  # unused loss accumulators from the original removed
    dude_dir = f"{result_root}/DUDE"
    with torch.no_grad():
        for dude_id, pocket_node_id in zip(dude_pocket_name, dude_idxes):
            embeds_pocket = model.enc_u([pocket_node_id], None, max_sample=-1)
            embeds_lig = torch.tensor(np.load(f"{dude_dir}/{dude_id}/saved_mols_embed.npy")).to(device).float()
            labels = np.load(f"{dude_dir}/{dude_id}/saved_labels.npy")
            # Raw pocket embedding straight from the embedding table (no GNN aggregation).
            embeds_pocket_raw = model.enc_u.aggregator.u2e(torch.tensor([pocket_node_id]).to(device))

            score = torch.matmul(embeds_pocket, torch.transpose(embeds_lig, 0, 1)).squeeze().detach().cpu().numpy()
            score_raw = torch.matmul(embeds_pocket_raw, torch.transpose(embeds_lig, 0, 1)).squeeze().detach().cpu().numpy()
            np.save(f"{dude_dir}/{dude_id}/GNN_res_epoch{epoch}.npy", score)
            np.save(f"{dude_dir}/{dude_id}/noGNN_res.npy", score_raw)
            ef_all.append(cal_metrics(score, labels))
            ef_raw_all.append(cal_metrics(score_raw, labels))

    model.train()
    # Average each metric (EF1, BEDROC, AUC, ...) over all targets.
    ef_all = {k: np.mean([x[k] for x in ef_all]) for k in ef_all[0].keys()}
    ef_raw_all = {k: np.mean([x[k] for x in ef_raw_all]) for k in ef_raw_all[0].keys()}
    print('Test on dude:', ef_all)
    print('No HGNN on dude:', ef_raw_all)
181
+
182
def test_pcba(model, device, epoch, result_root, pcba_idxes):
    """Evaluate virtual-screening performance on the PCBA benchmark.

    Each PCBA target may have several pockets; per-pocket score vectors are
    aggregated across pockets before metrics are computed. Saves the
    aggregated score arrays and returns the mean EF1 over all targets.
    """
    model.eval()
    ef_all, ef_raw_all = [], []  # unused loss accumulators from the original removed
    pcba_dir = f"{result_root}/PCBA"
    with torch.no_grad():
        pocket_idx = 0  # flat cursor into pcba_idxes across all targets
        for pcba_id in sorted(list(os.listdir(pcba_dir))):
            pocket_names = []
            for names in json.load(open(f"{pcba_dir}/{pcba_id}/saved_pocket_names.json")):
                pocket_names += names
            embeds_lig = torch.tensor(np.load(f"{pcba_dir}/{pcba_id}/saved_mols_embed.npy")).to(device).float()
            labels = np.load(f"{pcba_dir}/{pcba_id}/saved_labels.npy")
            score_all_pocket = []
            score_raw_pocket = []

            for i, pocket_name in enumerate(pocket_names):
                pcba_test_idx = pcba_idxes[pocket_idx]
                embeds_pocket = model.enc_u([pcba_test_idx], None, max_sample=-1)
                netout = torch.matmul(embeds_pocket, torch.transpose(embeds_lig, 0, 1))
                embeds_pocket_raw = model.enc_u.aggregator.u2e(torch.tensor([pcba_test_idx]).to(device))
                netout_raw = torch.matmul(embeds_pocket_raw, torch.transpose(embeds_lig, 0, 1))
                score_all_pocket.append(netout.squeeze().detach().cpu().numpy())
                score_raw_pocket.append(netout_raw.squeeze().detach().cpu().numpy())
                pocket_idx += 1

            # NOTE(review): GNN scores are averaged over pockets while raw scores
            # take the per-pocket max (despite the "score_max" name) — confirm
            # this asymmetry is intentional before changing it.
            score_max = np.stack(score_all_pocket, axis=0).mean(axis=0)
            score_raw_max = np.stack(score_raw_pocket, axis=0).max(axis=0)
            metric = cal_metrics(score_max, labels)
            metric_raw = cal_metrics(score_raw_max, labels)
            print(pcba_id, metric["EF1"], metric["BEDROC"], metric["AUC"])
            np.save(f"{pcba_dir}/{pcba_id}/GNN_res_epoch{epoch}.npy", score_max)
            np.save(f"{pcba_dir}/{pcba_id}/noGNN_res.npy", score_raw_max)
            # FIX: reuse the metrics computed above instead of calling
            # cal_metrics twice per target.
            ef_all.append(metric)
            ef_raw_all.append(metric_raw)

    model.train()
    print(f"saving to {pcba_dir}")
    # Average each metric over all targets.
    ef_all = {k: np.mean([x[k] for x in ef_all]) for k in ef_all[0].keys()}
    ef_raw_all = {k: np.mean([x[k] for x in ef_raw_all]) for k in ef_raw_all[0].keys()}
    print('Test on pcba:', ef_all)
    print('No HGNN on pcba:', ef_raw_all)
    return ef_all["EF1"]
224
+
225
+
226
def main():
    """Entry point: build the HGNN over the pocket graph, then either evaluate
    a checkpoint (``--test_ckpt``) or train for ``--epochs`` epochs, testing on
    DUD-E / DEKOIS / PCBA after every epoch and saving a checkpoint."""
    # Training settings
    parser = argparse.ArgumentParser(description='HGNN model training')
    parser.add_argument('--batch_size', type=int, default=128, metavar='N', help='input batch size for training')
    parser.add_argument('--embed_dim', type=int, default=128, metavar='N', help='embedding size')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR', help='learning rate')
    parser.add_argument('--test_batch_size', type=int, default=1000, metavar='N', help='input batch size for testing')
    parser.add_argument('--epochs', type=int, default=20, metavar='N', help='number of epochs to train')
    parser.add_argument("--test_ckpt", type=str, default=None)
    parser.add_argument("--data_root", type=str, default="../data")
    parser.add_argument("--result_root", type=str, default="../result/pocket_ranking")
    args = parser.parse_args()
    data_root = args.data_root
    # BUG FIX: result_root was used throughout this function but never
    # assigned, raising NameError at runtime.
    result_root = args.result_root

    # Fix all RNG seeds for reproducibility.
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    use_cuda = False
    if torch.cuda.is_available():
        use_cuda = True
    device = torch.device("cuda" if use_cuda else "cpu")

    print("begin load dataset")
    assayinfo_lst, pocket_feat, mol_feat, assayid_lst_all, mol_smi_lst, \
        assayid_lst_train, assayid_lst_test, dude_pocket_name, pcba_pocket_name, dekois_pocket_name, valid_molidxes = load_datas(data_root, result_root)
    print("begin load pocket-pocket graph")
    pocket_graph = load_pocket_pocket_graph(data_root, assayid_lst_all, assayid_lst_train)

    screen_dataset = ScreenDataset(args.batch_size, pocket_graph, assayinfo_lst, assayid_lst_all, mol_smi_lst, assayid_lst_train)
    num_pockets = len(assayid_lst_all)
    num_ligs = mol_feat.shape[0]

    # Frozen embedding tables initialised from the precomputed features:
    # pocket embeddings are copied in now, ligand embeddings are refreshed
    # each epoch inside the training loop below.
    embed_dim = args.embed_dim
    pocket2e = nn.Embedding(num_pockets, embed_dim).to(device)
    pocket2e.weight.data.copy_(torch.tensor(pocket_feat).to(device))
    for param in pocket2e.parameters():
        param.requires_grad = False

    lig2e = nn.Embedding(num_ligs, embed_dim).to(device)
    for param in lig2e.parameters():
        param.requires_grad = False
    type2e = nn.Embedding(10, embed_dim).to(device)

    # Pocket encoder: pocket-ligand aggregation wrapped by pocket-pocket
    # (similarity-graph) aggregation.
    agg_pocket = PLAggregator(lig2e, type2e, pocket2e, embed_dim, cuda=device, uv=True)
    enc_pocket = PLEncoder(embed_dim, pocket_graph, agg_pocket, assayid_lst_all, assayid_lst_train, mol_smi_lst, assayinfo_lst, cuda=device, uv=True)
    # neighbors
    agg_pocket_sim = PPAggregator(pocket2e, embed_dim, cuda=device)
    enc_pocket = PPEncoder(enc_pocket, embed_dim, pocket_graph, agg_pocket_sim, assayid_lst_all, assayid_lst_train,
                           base_model=enc_pocket, cuda=device)
    enc_lig = lig2e
    # model
    graphrec = HGNN(enc_pocket, enc_lig, type2e).to(device)
    print("trainable parameters")
    for name, param in graphrec.named_parameters(recurse=True):
        if param.requires_grad:
            print(name, param.shape)
    optimizer = torch.optim.RMSprop(graphrec.trainable_parameters(), lr=args.lr, alpha=0.9)

    # Benchmark pockets are appended after train+test assays in the node
    # index space: [train | test | DUD-E | PCBA | DEKOIS].
    begin = len(assayid_lst_train+assayid_lst_test)
    end = begin + len(dude_pocket_name)
    dude_idxes = range(begin, end)
    begin = end
    end += len(pcba_pocket_name)
    pcba_idxes = range(begin, end)
    begin = end
    end += len(dekois_pocket_name)
    dekois_idxes = range(begin, end)

    if args.test_ckpt is not None:
        # Evaluation-only mode: load the checkpoint and run all benchmarks once.
        graphrec.load_state_dict(torch.load(args.test_ckpt, weights_only=True))
        test_dude(graphrec, device, 0, result_root, dude_pocket_name, dude_idxes)
        test_dekois(graphrec, device, 0, result_root, dekois_pocket_name, dekois_idxes)
        test_pcba(graphrec, device, 0, result_root, pcba_idxes)
    else:
        for epoch in range(args.epochs):
            screen_dataset.set_epoch(epoch)
            train_loader = torch.utils.data.DataLoader(screen_dataset, batch_size=1, shuffle=True, num_workers=8)
            lig2e.weight.data.copy_(torch.tensor(mol_feat).to(device))
            valid_labels = load_valid_label(assayid_lst_test)
            valid_idxes = range(len(assayid_lst_train), len(assayid_lst_train+assayid_lst_test))
            train(graphrec, device, train_loader, optimizer, epoch, valid_idxes, valid_molidxes, valid_labels)
            test_dude(graphrec, device, epoch+1, result_root, dude_pocket_name, dude_idxes)
            test_dekois(graphrec, device, epoch+1, result_root, dekois_pocket_name, dekois_idxes)
            test_pcba(graphrec, device, epoch+1, result_root, pcba_idxes)

            os.makedirs(f"{result_root}/HGNN_save", exist_ok=True)
            torch.save(graphrec.state_dict(), f"{result_root}/HGNN_save/model_{epoch}.pt")

if __name__ == "__main__":
    main()
HGNN/read_fasta.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, re
2
+ import prody as pr
3
+ from tqdm import tqdm
4
+ from multiprocessing import Pool
5
+
6
+
7
+ # import subprocess
8
+ # os.environ["BABEL_LIBDIR"] = "/home/shenchao/.conda/envs/my2/lib/openbabel/3.1.0"
9
+
10
def write_file(output_file, outline):
    """Write the string *outline* to *output_file*, replacing any existing content.

    Uses a context manager so the handle is closed even if the write fails.
    """
    with open(output_file, 'w') as handle:
        handle.write(outline)
14
+
15
+
16
def lig_rename(infile, outfile):
    """Rewrite *infile* so every ATOM/HETATM record carries residue name "LIG".

    Some peptide ligands keep their amino-acid residue names, which impedes
    pocket generation; forcing one residue name (PDB columns 18-20) avoids
    that. Non-coordinate records are copied unchanged.
    """
    is_coord = re.compile(r'^HETATM|^ATOM')
    with open(infile, 'r') as handle:
        lines = handle.readlines()
    newlines = [
        line[:17] + "LIG" + line[20:] if is_coord.search(line) else line
        for line in lines
    ]
    write_file(outfile, ''.join(newlines))
26
+
27
+
28
def check_mol(infile, outfile):
    """Copy *infile* to *outfile*, dropping every line that contains "LIG".

    Some metals can share the ligand's residue ID, which would pull the ligand
    itself into the extracted pocket; stripping LIG records prevents that.
    Re-implemented in pure Python (was ``cat | sed '/LIG/d'``) so it no longer
    depends on a POSIX shell and is safe for paths with spaces/special chars.
    """
    with open(infile, 'r') as src:
        kept = [line for line in src if "LIG" not in line]
    with open(outfile, 'w') as dst:
        dst.writelines(kept)
31
+
32
+
33
def extract_pocket(protpath,
                   ligpath,
                   cutoff=5.0,
                   protname=None,
                   ligname=None,
                   pdb_pocket_file=None,
                   workdir='.'):
    """Carve the binding pocket out of a protein around a ligand and write it to *pdb_pocket_file*.

    protpath: the path of protein file (.pdb).
    ligpath: the path of ligand file (.sdf|.mol2|.pdb); non-PDB formats are
        converted with the external `obabel` binary.
    cutoff: the distance range within the ligand to determine the pocket.
    protname: the name of the protein (defaults to the file's basename).
    ligname: the name of the ligand (defaults to the file's basename).
    pdb_pocket_file: output path for the final pocket PDB.
    workdir: working directory for intermediate files.

    NOTE(review): relies on external tools (`obabel`, `cp`) via os.system and
    on ProDy for parsing/selection; intermediate files are created and removed
    in *workdir*, so the statement order below matters.
    """
    if protname is None:
        protname = os.path.basename(protpath).split('.')[0]
    if ligname is None:
        ligname = os.path.basename(ligpath).split('.')[0]


    # Ensure the ligand is available as PDB: convert with obabel, or copy as-is.
    if not re.search(r'.pdb$', ligpath):
        os.system(f"obabel {ligpath} -O {workdir}/{ligname}.pdb")
    else:
        os.system(f"cp {ligpath} {workdir}/{ligname}.pdb")

    xprot = pr.parsePDB(protpath)
    # xlig = pr.parsePDB("%s/%s.pdb"%(workdir, ligname))

    # if (xlig.getResnames() == xlig.getResnames()[0]).all():
    #     lresname = xlig.getResnames()[0]
    # else:
    # Force a single residue name ("LIG") on the ligand, then swap the renamed
    # file into place so the parse below sees a uniform residue name.
    lig_rename("%s/%s.pdb" % (workdir, ligname), "%s/%s2.pdb" % (workdir, ligname))
    os.remove("%s/%s.pdb" % (workdir, ligname))
    os.rename("%s/%s2.pdb" % (workdir, ligname), "%s/%s.pdb" % (workdir, ligname))
    xlig = pr.parsePDB("%s/%s.pdb" % (workdir, ligname))
    lresname = xlig.getResnames()[0]
    xcom = xlig + xprot

    # select ONLY atoms that belong to the protein
    # (residues with any atom strictly within `cutoff` of the ligand residue).
    ret = xcom.select(f'same residue as exwithin %s of resname %s' % (cutoff, lresname))

    pr.writePDB("%s/%s_pocket_%s_temp.pdb" % (workdir, protname, cutoff), ret)
    # ret = pr.parsePDB("%s/%s_pocket_%s.pdb"%(workdir, protname, cutoff))

    # Drop any LIG lines that slipped into the selection, then clean up.
    check_mol("%s/%s_pocket_%s_temp.pdb" % (workdir, protname, cutoff), pdb_pocket_file)
    os.remove("%s/%s_pocket_%s_temp.pdb" % (workdir, protname, cutoff))
80
+
81
+
82
def get_fasta_seq(fasta_file):
    """Return the concatenated amino-acid sequence from *fasta_file*.

    FIX: all header lines (starting with '>') are skipped, not just the first
    line, so multi-record FASTA files no longer leak header text into the
    returned sequence. Blank lines are ignored; for a single-record file the
    result is identical to the original implementation.
    """
    with open(fasta_file) as f:
        chunks = [
            line.strip()
            for line in f
            if line.strip() and not line.startswith(">")
        ]
    return "".join(chunks)
89
+
90
+
91
def read_fasta_from_protein(pdb_file, lig_file, target_id="test", cutoff=5.0, pdb_pocket_file="test_pocket.pdb", fasta_pocket_file="test_pocket.fasta"):
    """Extract the binding pocket around *lig_file* from *pdb_file* and return its FASTA sequence.

    Returns None when the protein file is missing or pocket extraction fails.
    Relies on the external `./pdb2fasta` binary being present in the current
    working directory — TODO confirm it is shipped alongside this script.
    """
    if not os.path.exists(pdb_file):
        return

    try:
        extract_pocket(pdb_file,
                       lig_file,
                       cutoff=cutoff,
                       protname=target_id,
                       pdb_pocket_file=pdb_pocket_file,
                       ligname=f"{target_id}_ligand")
    except Exception as e:
        # Best-effort: extraction failures (bad ligand file, ProDy parse
        # errors) are printed but not fatal; the caller receives None.
        print(e)
        return

    # Convert the extracted pocket PDB to FASTA and read the sequence back.
    os.system(f"./pdb2fasta {pdb_pocket_file} > {fasta_pocket_file}")
    return get_fasta_seq(fasta_pocket_file)
108
+
109
+
110
def read_fasta_from_pocket(pocket_pdb_file, fasta_pocket_file="test_pocket.fasta"):
    """Convert an already-extracted pocket PDB to FASTA via ./pdb2fasta and return the sequence."""
    command = f"./pdb2fasta {pocket_pdb_file} > {fasta_pocket_file}"
    os.system(command)
    return get_fasta_seq(fasta_pocket_file)
HGNN/screen_dataset.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import math
4
+ import numpy as np
5
+ import contextlib
6
+ import copy
7
+ import torch
8
+ from torch.utils.data import Dataset, sampler, DataLoader
9
+
10
+
11
def load_ligname():
    """Map pdbid -> ligand code from the PDBbind v2020 data index files.

    The refined index is read second, so its entries win on collisions.
    Entries whose ligand field (stripped of surrounding parentheses) is
    empty are skipped.
    """
    index_files = (
        "./data/PDBbind_v2020/index/INDEX_general_PL_data.2020",
        "./data/PDBbind_v2020/index/INDEX_refined_data.2020",
    )
    pdbbind_lig_dict = {}
    for path in index_files:
        with open(path) as handle:
            for raw in handle:
                if raw.startswith('#'):
                    continue
                fields = raw.strip().split()
                # Last column looks like "(LIG)"; strip the parentheses.
                lig_code = fields[-1][1:-1]
                if lig_code:
                    pdbbind_lig_dict[fields[0]] = lig_code
    return pdbbind_lig_dict
37
+
38
def load_uniprotid():
    """Map pdbid -> UniProt id from the PDBbind v2020 name index files.

    The general index is read second, so its entries override the refined
    ones on collisions.  "------" marks a missing UniProt annotation and is
    skipped.
    """
    name_indexes = (
        "./data/PDBbind_v2020/index/INDEX_refined_name.2020",
        "./data/PDBbind_v2020/index/INDEX_general_PL_name.2020",
    )
    uniprot_id_dict = {}
    for path in name_indexes:
        with open(path) as handle:
            for raw in handle:
                if raw.startswith('#'):
                    continue
                fields = raw.strip().split()
                uniprot_id = fields[2]
                if uniprot_id and uniprot_id != "------":
                    uniprot_id_dict[fields[0]] = uniprot_id
    return uniprot_id_dict
61
+
62
def load_pocket_dude(result_root):
    """Load precomputed DUD-E pocket embeddings.

    Returns (feats, names): a row-concatenated embedding array and the
    matching list of target directory names, in sorted order.
    """
    data_root = f"{result_root}/DUDE"
    feat_chunks = []
    target_names = []
    for target in sorted(os.listdir(data_root)):
        feat_chunks.append(np.load(f"{data_root}/{target}/saved_target_embed.npy", allow_pickle=True))
        target_names.append(target)
    return np.concatenate(feat_chunks, axis=0), target_names
73
+
74
def load_pocket_dekois(result_root):
    """Load precomputed DEKOIS pocket embeddings.

    Returns (feats, names): a row-concatenated embedding array and the
    matching list of target directory names, in sorted order.
    """
    data_root = f"{result_root}/DEKOIS"
    feat_chunks = []
    target_names = []
    for target in sorted(os.listdir(data_root)):
        feat_chunks.append(np.load(f"{data_root}/{target}/saved_target_embed.npy", allow_pickle=True))
        target_names.append(target)
    return np.concatenate(feat_chunks, axis=0), target_names
85
+
86
def load_pocket_pcba(result_root):
    """Load precomputed LIT-PCBA pocket embeddings.

    Returns (feats, names) where names are "<target>_<pocket>" strings.  A
    target that ships a single shared embedding gets it duplicated once per
    pocket name so rows and names stay aligned.
    """
    data_root = f"{result_root}/PCBA"
    feat_chunks = []
    pocket_names = []
    for target in sorted(os.listdir(data_root)):
        embeds = np.load(f"{data_root}/{target}/saved_target_embed.npy", allow_pickle=True)
        target_names = []
        for name_group in json.load(open(f"{data_root}/{target}/saved_pocket_names.json")):
            target_names.extend(f"{target}_{name}" for name in name_group)

        if embeds.shape[0] == 1:
            # One shared embedding for every pocket of this target: tile it.
            embeds = np.concatenate([embeds] * len(target_names), axis=0)
        feat_chunks.append(embeds)
        pocket_names.extend(target_names)

    return np.concatenate(feat_chunks, axis=0), pocket_names
103
+
104
+
105
def read_cluster_file(cluster_file):
    """Parse a cd-hit ``.clstr`` file into {member_id: [ids in its cluster]}.

    Member lines look like ``0  123aa, >sp|P12345|NAME... *``; the UniProt id
    is the second ``|``-separated field.  Every member (including itself) is
    listed under every other member of the same cluster.

    Fix: the original only flushed a cluster upon seeing the NEXT ``>`` header,
    so the file's final cluster was silently dropped; we now flush it too.
    """
    protein_clstr_dict = {}

    def _flush(members):
        # Record the full pairwise membership for one cluster.
        for a in members:
            for b in members:
                protein_clstr_dict.setdefault(a, []).append(b)

    with open(cluster_file) as f:
        line_in_clstr = []
        for line in f.readlines():
            if line.startswith(">"):
                _flush(line_in_clstr)
                line_in_clstr = []
            else:
                line_in_clstr.append(line.split('|')[1])
        _flush(line_in_clstr)  # fix: don't lose the trailing cluster
    return protein_clstr_dict
121
+
122
def load_assayinfo(data_root, result_root):
    """Assemble the training assay list: PDBbind/CASF labels plus filtered
    BindingDB assays.

    Each returned assay dict carries "assay_id", "pockets" and "ligands"
    (sorted by activity, descending); PDBbind entries also get "domain".
    """
    # PDBbind training labels plus the CASF test labels.
    labels = json.load(open(f"{data_root}/train_label_pdbbind_seq.json")) + \
        json.load(open("../test_datasets/casf_label_seq.json"))
    save_dir_bdb = f"{result_root}/BDB"
    # Only BindingDB ligands whose SMILES have precomputed embeddings are usable.
    bdb_mol_smi = json.load(open(f"{save_dir_bdb}/bdb_mol_smis.json"))
    bdb_mol_smi = set(bdb_mol_smi)
    for label in labels:
        # PDBbind assay id is the pdbid prefix of the first pocket name.
        label["assay_id"] = label["pockets"][0].split("_")[0]
        label["domain"] = "pdbbind"

    # breakpoint()
    labels_bdb = json.load(open(f"{data_root}/train_label_blend_seq_full.json"))
    # UniProt ids of benchmark targets (DUD-E / LIT-PCBA / DEKOIS) are excluded
    # from BindingDB training data to avoid target leakage.
    non_repeat_uniprot = []
    testset_uniport_root = "../test_datasets"
    non_repeat_uniprot += [x[0] for x in json.load(open(f"{testset_uniport_root}/dude.json"))]
    non_repeat_uniprot += [x[0] for x in json.load(open(f"{testset_uniport_root}/PCBA.json"))]
    non_repeat_uniprot += [x[0] for x in json.load(open(f"{testset_uniport_root}/dekois.json"))]
    # Stricter exclusion set (80%-identity cluster mates of benchmark targets);
    # computed but currently NOT applied — see the commented filter below.
    non_repeat_uniprot_strict = []
    protein_clstr_dict_40 = read_cluster_file(f"{data_root}/uniport40.clstr")  # NOTE(review): loaded but unused
    protein_clstr_dict_80 = read_cluster_file(f"{data_root}/uniport80.clstr")
    for uniprot in non_repeat_uniprot:
        non_repeat_uniprot_strict += protein_clstr_dict_80.get(uniprot, [])
        non_repeat_uniprot_strict.append(uniprot)
    old_len = len(labels_bdb)  # NOTE(review): unused, presumably a leftover debug counter
    # Drop assays overlapping the FEP benchmark and the benchmark uniprots.
    non_repeat_assayids = json.load(open(os.path.join(data_root, "fep_assays.json")))
    labels_bdb = [x for x in labels_bdb if (x["assay_id"] not in non_repeat_assayids)]
    labels_bdb = [x for x in labels_bdb if (x["uniprot"] not in non_repeat_uniprot)]

    # Keep only ligands with embeddings and activity >= 5 (pActivity-style
    # threshold — TODO confirm units); drop assays left with no ligands.
    labels_bdb_new = []
    for label in labels_bdb:
        ligands = label["ligands"]
        ligands_new = []
        for lig in ligands:
            if lig["smi"] in bdb_mol_smi and lig["act"] >= 5:
                ligands_new.append(lig)
        label["ligands"] = ligands_new
        if len(ligands_new) > 0:
            labels_bdb_new.append(label)

    labels += labels_bdb_new
    # Sort each assay's ligands by activity, most active first.
    for label in labels:
        label["ligands"] = sorted(label["ligands"], key=lambda x: x["act"], reverse=True)

    # labels = [x for x in labels if (x["uniprot"] not in non_repeat_uniprot_strict)]
    return labels
167
+
168
def load_id_dict(result_root, assayinfo_lst):
    """Collect train/test pocket id lists and their embedding matrices.

    Each BindingDB assay contributes sqrt(#ligands) randomly chosen pocket
    embeddings; PDBbind train pockets are filtered to assays present in
    assayinfo_lst.  Returns
    (train_pocket_ids, test_pocket_ids, pocket_feat_train, pocket_feat_test),
    with ids and feature rows kept in matching order.
    """
    import random
    random.seed(42)  # deterministic pocket sampling
    bdb_dir = f"{result_root}/BDB"
    pdbbind_dir = f"{result_root}/PDBBind"

    pocket_names = json.load(open(f"{bdb_dir}/bdb_pocket_names.json"))
    pocket_embed = np.load(f"{bdb_dir}/bdb_pocket_reps.npy")
    name2idx = {name:i for i, name in enumerate(pocket_names)}

    assay_feat_lst = []
    bdb_assayid_lst = []
    for assay in assayinfo_lst:
        assay_id = assay["assay_id"]
        if assay.get("domain", None) == "pdbbind":
            continue  # PDBbind assays are handled separately below
        pockets = assay["pockets"]
        # Repeat an assay sqrt(#ligands) times so larger assays get more
        # (randomly sampled) pocket representatives.
        repeat_num = len(assay["ligands"])
        repeat_num = int(np.sqrt(repeat_num))
        for i in range(repeat_num):
            pocket = random.choice(pockets)
            assay_feat_lst.append(pocket_embed[name2idx[pocket]])
            bdb_assayid_lst.append(assay_id)

    bdb_assay_feat = np.stack(assay_feat_lst)

    train_pdbbind_ids = json.load(open(f'{pdbbind_dir}/train_pdbbind_ids.json'))
    train_pdbbind_pocket_embed = np.load(f"{pdbbind_dir}/train_pocket_reps.npy")
    train_pdbbind_ids_new = []
    train_pdbbind_pocket_embed_new = []
    pdbbind_aidlist = [assay["assay_id"] for assay in assayinfo_lst if assay.get("domain", None) == "pdbbind"]
    pdbbind_aidset = set(pdbbind_aidlist)
    # Keep only PDBbind pockets whose assay survived the upstream filtering.
    # (NOTE: "id" shadows the builtin; left unchanged here.)
    for id, embed in zip(train_pdbbind_ids, train_pdbbind_pocket_embed):
        if id in pdbbind_aidset:
            train_pdbbind_ids_new.append(id)
            train_pdbbind_pocket_embed_new.append(embed)

    train_pdbbind_ids = train_pdbbind_ids_new
    train_pdbbind_pocket_embed = np.stack(train_pdbbind_pocket_embed_new)

    # BindingDB rows first, then PDBbind rows — ids and features aligned.
    train_pocket = bdb_assayid_lst + train_pdbbind_ids
    pocket_feat_train = np.concatenate([bdb_assay_feat, train_pdbbind_pocket_embed])
    test_pocket = json.load(open(f'{pdbbind_dir}/test_pdbbind_ids.json'))
    pocket_feat_test = np.load(f'{pdbbind_dir}/test_pocket_reps.npy')

    return train_pocket, test_pocket, pocket_feat_train, pocket_feat_test
214
+
215
+
216
def load_datas(data_root, result_root):
    """Load every precomputed embedding table and id list used for screening.

    Alignment contract: pocket feature rows follow the order
    train -> test -> DUD-E -> PCBA -> DEKOIS, matching assayid_lst_all;
    molecule feature rows follow BDB -> PDBbind train -> PDBbind test,
    matching mol_smi_lst.  test_molidxes indexes the tail (test) molecules.
    """
    assayinfo_lst = load_assayinfo(data_root, result_root)
    assayid_lst_train, assayid_lst_test, pocket_feat_train, pocket_feat_test = load_id_dict(result_root, assayinfo_lst)

    dude_pocket_feat, dude_pocket_name = load_pocket_dude(result_root)

    pcba_pocket_feat, pcba_pocket_name = load_pocket_pcba(result_root)

    dekois_pocket_feat, dekois_pocket_name = load_pocket_dekois(result_root)

    # Keep feature rows and id list in the same concatenation order.
    pocket_feat = np.concatenate((pocket_feat_train, pocket_feat_test, dude_pocket_feat, pcba_pocket_feat, dekois_pocket_feat), axis=0)
    assayid_lst_all = assayid_lst_train + assayid_lst_test + dude_pocket_name + pcba_pocket_name + dekois_pocket_name

    save_dir_bdb = f"{result_root}/BDB"
    save_dir_pdbbind = f"{result_root}/PDBBind"
    mol_feat_train_bdb = np.load(f'{save_dir_bdb}/bdb_mol_reps.npy')
    mol_feat_train_pdbbind = np.load(f'{save_dir_pdbbind}/train_mol_reps.npy')
    mol_feat_test = np.load(f'{save_dir_pdbbind}/test_mol_reps.npy')
    mol_feat = np.concatenate((mol_feat_train_bdb, mol_feat_train_pdbbind, mol_feat_test), axis=0)
    mol_smi_lst = json.load(open(f"{save_dir_bdb}/bdb_mol_smis.json")) + json.load(open(f"{save_dir_pdbbind}/train_mol_smis.json")) + json.load(open(f"{save_dir_pdbbind}/test_mol_smis.json"))
    # Test molecules occupy the tail of mol_smi_lst / mol_feat.
    test_len = len(json.load(open(f"{save_dir_pdbbind}/test_mol_smis.json")))
    test_molidxes = range(len(mol_smi_lst)-test_len, len(mol_smi_lst))
    return assayinfo_lst, pocket_feat, mol_feat, assayid_lst_all, mol_smi_lst, \
        assayid_lst_train, assayid_lst_test, dude_pocket_name, pcba_pocket_name, dekois_pocket_name, test_molidxes
240
+
241
def load_valid_label(assayid_lst_test):
    """Build pairwise CASF cluster-identity labels.

    Reads ./data/CoreSet.dat (pdbid in the first column, cluster id in the
    last) and returns an NxN matrix with labels[i, j] = 1 iff the i-th and
    j-th test pdbids belong to the same CASF cluster.
    """
    pdbid2cluster = {}
    with open("./data/CoreSet.dat") as handle:
        for row in handle.readlines()[1:]:  # skip the header line
            fields = row.strip().split()
            pdbid2cluster[fields[0]] = fields[-1]

    n_test = len(assayid_lst_test)
    labels = np.zeros((n_test, n_test))
    for i, pdbid_a in enumerate(assayid_lst_test):
        for j, pdbid_b in enumerate(assayid_lst_test):
            labels[i, j] = 1 if pdbid2cluster[pdbid_a] == pdbid2cluster[pdbid_b] else 0
    return labels
258
+
259
def load_pocket_pocket_graph(data_root, assayid_lst_all, assayid_lst_train):
    """Build the pocket-pocket similarity graph used for neighbor aggregation.

    Returns {assay_id: [(train_assay_id, score), ...]} keeping only neighbors
    that are training assays with alignment score >= 0.5, sorted by score
    descending.  Test-side entries override train-side ones on key collision.

    (Cleanup: removed an unused local ``import pickle`` and a dead
    ``# breakpoint()`` left over from debugging.)
    """
    neighbor_dict_train = json.load(
        open(f"{data_root}/align_pair_res_train_10.23.json"))
    train_keys = json.load(
        open(f"{data_root}/align_train_keys_10.23.json"))
    # The train alignment file is keyed by integer index; remap to assay ids.
    neighbor_dict_train_new = {}
    for idx, neighbors in neighbor_dict_train.items():
        neighbor_dict_train_new[train_keys[int(idx)]] = neighbors
    neighbor_dict_train = neighbor_dict_train_new
    assayid_set = set(assayid_lst_all)
    assayid_set_train = set(assayid_lst_train)
    PPGraph = {}

    for assayid_1 in neighbor_dict_train.keys():
        if assayid_1 not in assayid_set:
            continue
        # Highest-similarity neighbors first.
        neighbor_dict_train[assayid_1] = sorted(neighbor_dict_train[assayid_1], key=lambda x: x[1], reverse=True)

        score_new = []
        for assayid_2, score in neighbor_dict_train[assayid_1]:
            if assayid_2 not in assayid_set_train:
                continue
            if score < 0.5:  # similarity threshold for keeping an edge
                continue
            score_new.append((assayid_2, score))
        PPGraph[assayid_1] = score_new

    align_res_test = json.load(open(f"{data_root}/align_pair_res_test_10.23.json"))
    align_score_test = {}

    for test_id in align_res_test.keys():
        if test_id not in assayid_set:
            continue
        pocket_sim_infos = align_res_test[test_id]
        pocket_sim_infos = sorted(pocket_sim_infos, key=lambda x: x[1], reverse=True)
        score_new = []
        for test_target, score in pocket_sim_infos:
            # Test-side neighbor names carry a file extension; strip it.
            test_target = test_target.split('.')[0]
            if test_target not in assayid_set_train:
                continue
            if score < 0.5:
                continue
            score_new.append((test_target, score))
        align_score_test[test_id] = score_new

    PPGraph = {**PPGraph, **align_score_test}
    return PPGraph
308
+
309
@contextlib.contextmanager
def numpy_seed(seed, *addl_seeds):
    """Temporarily seed NumPy's global PRNG; restore the prior state on exit.

    A ``None`` seed is a no-op.  Extra seeds are folded in by hashing the
    whole tuple and reducing it modulo 1e6.
    """
    if seed is None:
        yield
        return
    if addl_seeds:
        seed = int(hash((seed,) + addl_seeds) % 1e6)
    saved_state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        # Always restore, even if the body raised.
        np.random.set_state(saved_state)
324
+
325
class ScreenDataset(Dataset):
    """Batch sampler for contrastive pocket-ligand screening training.

    Each ``__getitem__`` returns one full batch: pocket indices, one sampled
    positive-ligand index per pocket, and a pocket-by-ligand 0/1 label matrix.
    """

    def __init__(self, batch_size, assay_graph, assayinfo_lst, assayid_lst_all, mol_smi_lst, assayid_lst_train):
        self.batch_size = batch_size
        # Train pockets occupy the first len(assayid_lst_train) slots of
        # assayid_lst_all (see load_datas), so plain range indices suffice.
        self.train_idxes = list(range(len(assayid_lst_train)))
        self.assayid_set_train = set(assayid_lst_train)
        self.train_idxes_epoch = copy.deepcopy(self.train_idxes)
        self.assay_graph = assay_graph
        self.assayinfo_dicts = {x["assay_id"]: x for x in assayinfo_lst}
        self.smi2idx = {smi:idx for idx, smi in enumerate(mol_smi_lst)}
        self.uniprotid_dict = load_uniprotid()
        self.pocket_lig_graph = self.load_graph()
        self.seed = 66  # base seed for all deterministic shuffles/draws
        self.assayid2idxes = {}
        for idx, assayid in enumerate(assayid_lst_all):
            if assayid not in self.assayid2idxes:
                self.assayid2idxes[assayid] = []
            self.assayid2idxes[assayid].append(idx)
        self.idx2assayid = assayid_lst_all
        self.epoch = 0

    def set_epoch(self, epoch):
        """Reshuffle the training order deterministically for this epoch."""
        self.epoch = epoch
        with numpy_seed(self.seed, epoch):
            self.train_idxes_epoch = copy.deepcopy(self.train_idxes)
            np.random.shuffle(self.train_idxes_epoch)

    def load_graph(self):
        """Build (or load the cached) pocket -> positive-ligand-SMILES map.

        A ligand is positive for a pocket if it is active (act >= 5) in the
        pocket's own assay, or comes from a neighboring assay that either
        shares a ligand or targets the same UniProt id.  The result is cached
        at ./data/pocket_lig_graph.json.
        """
        pocket_lig_graph = {}
        if os.path.exists("./data/pocket_lig_graph.json"):
            pocket_lig_graph = json.load(open("./data/pocket_lig_graph.json"))
        else:
            from tqdm import tqdm
            for assayid in tqdm(self.assayid2idxes.keys()):
                if assayid not in self.assayid_set_train:
                    continue
                ligands = self.assayinfo_dicts[assayid]["ligands"]
                lig_candidate = []
                # Single-ligand assays keep their ligand regardless of activity.
                if len(ligands) > 1:
                    lig_assay = [x["smi"] for x in ligands if x["act"] >= 5]
                else:
                    lig_assay = [x["smi"] for x in ligands]
                lig_candidate += lig_assay
                lig_assay = set(lig_assay)
                uniprot = self.assayinfo_dicts[assayid]["uniprot"]

                for assayid_nbr, score in self.assay_graph.get(assayid, []):
                    if assayid_nbr not in self.assayinfo_dicts:
                        continue
                    assay_nbr = self.assayinfo_dicts[assayid_nbr]
                    uniprot_nbr = assay_nbr["uniprot"]
                    ligands_nbr = assay_nbr["ligands"]
                    # NOTE(review): this tests len(ligands) (the anchor assay),
                    # not len(ligands_nbr) — possibly meant to mirror the
                    # single-ligand rule on the neighbor; confirm.
                    if len(ligands) > 1:
                        lig_candidate_nbr = [x["smi"] for x in ligands_nbr if x["act"] >= 5]
                    else:
                        lig_candidate_nbr = [x["smi"] for x in ligands_nbr]
                    if assayid_nbr not in self.assayid_set_train:
                        continue
                    # Merge the neighbor's ligands when the assays share a
                    # ligand or target the same protein.
                    if len(lig_assay & set(lig_candidate_nbr)) > 0:
                        lig_candidate += lig_candidate_nbr
                    elif uniprot == uniprot_nbr:
                        lig_candidate += lig_candidate_nbr

                # Keep only ligands that have a molecule embedding.
                pocket_lig_graph[assayid] = [x for x in set(lig_candidate) if x in self.smi2idx]

            json.dump(pocket_lig_graph, open("./data/pocket_lig_graph.json", "w"))
        return pocket_lig_graph

    def __getitem__(self, item):
        """Return batch ``item``: (pocket idx tensor, ligand idx tensor, labels)."""
        pocket_idx_batch = self.train_idxes_epoch[item*self.batch_size:(item+1)*self.batch_size]
        pocket_batch = [self.idx2assayid[idx] for idx in pocket_idx_batch]
        lig_batch = []
        lig_idx_batch = []
        epoch = self.epoch
        for pocket in pocket_batch:
            lig_candidate = self.pocket_lig_graph[pocket]
            # NOTE(review): the RNG is re-seeded with the same (seed, epoch,
            # item) for every pocket in the batch, so the per-pocket draws are
            # correlated — confirm whether this is intended.
            with numpy_seed(self.seed, epoch, item):
                lig = np.random.choice(lig_candidate)
            lig_batch.append(lig)

            lig_idx_batch.append(self.smi2idx[lig])

        # labels[i, j] = 1 iff ligand j is a known positive of pocket i.
        labels = np.zeros((self.batch_size, self.batch_size))
        for i, pocket in enumerate(pocket_batch):
            for j, lig in enumerate(lig_batch):
                if lig in self.pocket_lig_graph[pocket]:
                    labels[i, j] = 1
                else:
                    labels[i, j] = 0

        return torch.tensor(pocket_idx_batch), torch.tensor(lig_idx_batch), torch.tensor(labels)

    def __len__(self):
        # Drop the final partial batch so the label matrix stays square.
        return len(self.train_idxes_epoch) // self.batch_size
418
+
419
+
420
+
HGNN/screening.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ from PL_Encoder import PLEncoder
8
+ from PL_Aggregator import PLAggregator
9
+ from PP_Encoder import PPEncoder
10
+ from PP_Aggregator import PPAggregator
11
+ import torch.nn.functional as F
12
+ import torch.utils.data
13
+ import argparse
14
+ import os
15
+ from util import cal_metrics
16
+ from read_fasta import read_fasta_from_pocket, read_fasta_from_protein
17
+ from align import get_neighbor_pocket
18
+
19
+
20
class HGNN(nn.Module):
    """Pocket/ligand model: ``enc_u`` encodes pockets (optionally ``enc_v``
    ligands); a symmetric contrastive (CLIP-style) loss aligns the spaces."""

    def __init__(self, enc_u, enc_v=None, r2e=None):
        super(HGNN, self).__init__()
        self.enc_u = enc_u  # pocket-side encoder
        self.enc_v = enc_v  # ligand-side encoder (may be None at inference)
        self.embed_dim = enc_u.embed_dim

        # NOTE(review): these projections and BatchNorms are not referenced in
        # forward()/criterion(); presumably kept for checkpoint compatibility.
        self.w_ur1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_ur2 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_vr1 = nn.Linear(self.embed_dim, self.embed_dim)
        self.w_vr2 = nn.Linear(self.embed_dim, self.embed_dim)

        self.r2e = r2e
        self.bn1 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)
        self.bn2 = nn.BatchNorm1d(self.embed_dim, momentum=0.5)

        # Learnable softmax temperature, initialised to log(14).
        # NOTE(review): allocated on "cuda" unconditionally — breaks CPU-only
        # runs; confirm before running without a GPU.
        self.logit_scale = nn.Parameter(torch.ones([1], device="cuda") * np.log(14))

    def trainable_parameters(self):
        """Yield only parameters with ``requires_grad=True``."""
        for name, param in self.named_parameters(recurse=True):
            if param.requires_grad:
                yield param

    def forward(self, nodes_u, nodes_v):
        """Encode pockets (conditioned on ligands) and ligands; return both."""
        embeds_u = self.enc_u(nodes_u, nodes_v)
        embeds_v = self.enc_v(nodes_v)
        return embeds_u, embeds_v

    def criterion(self, x_u, x_v, labels):
        """Symmetric contrastive loss over the pocket-ligand similarity matrix.

        ``labels`` is a 0/1 matrix of known positives; off-diagonal positives
        are masked to -1e6 so each row/column has exactly one target (its
        diagonal pair).  Returns (loss, mean screening metrics, raw
        similarity matrix).
        """
        netout = torch.matmul(x_u, torch.transpose(x_v, 0, 1))
        # Temperature applied with a detached scale (not trained here).
        score = netout * self.logit_scale.exp().detach()
        # Mask every known positive except the diagonal pair.
        score = (labels - torch.eye(len(labels)).to(labels.device)) * -1e6 + score

        lprobs_pocket = F.log_softmax(score.float(), dim=-1)
        lprobs_pocket = lprobs_pocket.view(-1, lprobs_pocket.size(-1))
        sample_size = lprobs_pocket.size(0)
        # Diagonal targets: row i's positive is column i.
        targets = torch.arange(sample_size, dtype=torch.long).view(-1).cuda()

        # pocket retrieve mol
        loss_pocket = F.nll_loss(
            lprobs_pocket,
            targets,
            reduction="mean"
        )

        lprobs_mol = F.log_softmax(torch.transpose(score.float(), 0, 1), dim=-1)
        lprobs_mol = lprobs_mol.view(-1, lprobs_mol.size(-1))
        lprobs_mol = lprobs_mol[:sample_size]

        # mol retrieve pocket
        loss_mol = F.nll_loss(
            lprobs_mol,
            targets,
            reduction="mean"
        )

        loss = 0.5 * loss_pocket + 0.5 * loss_mol

        # Per-pocket virtual-screening metrics on the unscaled similarities.
        ef_all = []
        for i in range(len(netout)):
            act_pocket = labels[i]
            affi_pocket = netout[i]
            top1_index = torch.argmax(affi_pocket)
            top1_act = act_pocket[top1_index]  # NOTE(review): computed but unused
            ef_all.append(cal_metrics(affi_pocket.detach().cpu().numpy(), act_pocket.detach().cpu().numpy()))
        ef_mean = {k: np.mean([x[k] for x in ef_all]) for k in ef_all[0].keys()}

        return loss, ef_mean, netout

    def loss(self, nodes_u, nodes_v, labels):
        """Convenience wrapper: forward + criterion; returns (loss, metrics)."""
        x_u, x_v = self.forward(nodes_u, nodes_v)
        loss, ef_mean, netout = self.criterion(x_u, x_v, labels)
        return loss, ef_mean

    def refine_pocket(self, pocket_embed, neighbor_pocket_list):
        """Refine one pocket embedding by aggregating its neighbor pockets."""
        embeds_u = self.enc_u.refine_pocket(pocket_embed, neighbor_pocket_list)
        return embeds_u
99
+
100
+
101
+
102
def main():
    """CLI entry: load a trained HGNN and refine one pocket embedding.

    Reads the raw pocket embedding (--pocket_embed), derives the pocket FASTA
    either from a pocket PDB (--pocket_pdb) or from a protein+ligand PDB pair,
    retrieves similar training pockets, and saves the refined embedding to
    --save_file (falling back to the raw embedding when no neighbor is found).
    """
    # Training settings
    parser = argparse.ArgumentParser(description='HGNN model inference')
    parser.add_argument('--embed_dim', type=int, default=128, metavar='N', help='embedding size')
    parser.add_argument("--test_ckpt", type=str, default=None)
    parser.add_argument("--data_root", type=str, default="../data")
    parser.add_argument("--result_root", type=str, default="../result/pocket_ranking")
    parser.add_argument("--pocket_embed", type=str, default="../example/pocket_embed.npy")
    parser.add_argument("--save_file", type=str, default="../example/refined_pocket.npy")
    parser.add_argument("--pocket_pdb", type=str, default=None)
    parser.add_argument("--protein_pdb", type=str, default="../example/protein.pdb")
    parser.add_argument("--ligand_pdb", type=str, default="../example/ligand.pdb")

    args = parser.parse_args()

    # Seed every RNG for reproducible inference.
    seed = 42
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    use_cuda = False
    if torch.cuda.is_available():
        use_cuda = True
    device = torch.device("cuda" if use_cuda else "cpu")

    embed_dim = args.embed_dim
    type2e = nn.Embedding(10, embed_dim).to(device)

    # load model: PL (pocket-ligand) encoder wrapped by the PP (pocket-pocket)
    # encoder, matching the training-time architecture.
    agg_pocket = PLAggregator(r2e=type2e, embed_dim=embed_dim, cuda=device, uv=True)
    enc_pocket = PLEncoder(embed_dim=embed_dim, aggregator=agg_pocket, cuda=device, uv=True)
    agg_pocket_sim = PPAggregator(embed_dim=embed_dim, cuda=device)
    enc_pocket = PPEncoder(enc_pocket, embed_dim=embed_dim, aggregator=agg_pocket_sim, cuda=device)

    model = HGNN(enc_pocket).to(device)
    # strict=False: the checkpoint may lack layers that are unused at inference.
    model.load_state_dict(torch.load(args.test_ckpt, weights_only=True), strict=False)
    model.eval()

    # load pocket embedding and fasta
    pocket_embed = torch.tensor(np.load(args.pocket_embed)).to(device)

    if args.pocket_pdb is not None:
        pocket_fasta = read_fasta_from_pocket(args.pocket_pdb)
    else:
        pocket_fasta = read_fasta_from_protein(args.protein_pdb, args.ligand_pdb)

    # get neighbor pocket
    neighbor_pocket_list = get_neighbor_pocket(pocket_fasta, args.data_root, args.result_root, device)  # [(pocket_embed, ligand_embed, similarity)]

    # get refined pocket; fall back to the raw embedding when no neighbor exists.
    if len(neighbor_pocket_list) > 0:
        with torch.no_grad():
            refined_pocket = model.refine_pocket(pocket_embed, neighbor_pocket_list)
        refined_pocket = refined_pocket.cpu().numpy()
    else:
        refined_pocket = pocket_embed.cpu().numpy()

    print("finished, saving refined pocket embedding into:", args.save_file)
    np.save(args.save_file, refined_pocket)


if __name__ == "__main__":
    main()
HGNN/test_pocket.fasta ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ >aa2ar_pocket_5:_ 19
2
+ VLTLFEMMNWLXNHALMYI
HGNN/util.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from rdkit.ML.Scoring.Scoring import CalcBEDROC, CalcAUC, CalcEnrichment
3
+ from sklearn.metrics import roc_curve
4
+
5
def re_new(y_true, y_score, ratio):
    """Robust enrichment at a false-positive ratio.

    Walks predictions from highest to lowest score, counting true/false
    positives, and stops once false positives reach ratio * negatives.
    Returns (tp * negatives) / (positives * fp), i.e. TPR/FPR at that cutoff.
    """
    positives = sum(y_true)
    negatives = len(y_true) - positives
    fp_cap = ratio * negatives
    tp = fp = 0
    for idx in np.argsort(y_score)[::-1]:
        if y_true[idx] == 1:
            tp += 1
            continue
        fp += 1
        if fp >= fp_cap:
            break
    return (tp * negatives) / (positives * fp)
21
+
22
+
23
def calc_re(y_true, y_score, ratio_list):
    """Compute enrichment (via re_new) at each false-positive ratio.

    Parameters:
    - y_true: true binary labels (0 or 1)
    - y_score: predicted scores
    - ratio_list: false-positive ratios to evaluate at

    Returns:
    - dict mapping str(ratio) -> enrichment value

    Cleanup: the previous version also ran sklearn's roc_curve and kept a
    commented-out interpolation variant, none of which contributed to the
    returned value; only the re_new path was live.
    """
    return {str(ratio): re_new(y_true, y_score, ratio) for ratio in ratio_list}
47
+
48
+
49
def cal_metrics(y_score, y_true, alpha=80.5):
    """Compute virtual-screening metrics for one target.

    Parameters:
    - y_score: predicted scores or probabilities (1-D array-like)
    - y_true: true binary labels (0 or 1)
    - alpha: BEDROC parameter controlling early-retrieval emphasis

    Returns:
    - dict with BEDROC, AUC and enrichment factors at 0.5%, 1% and 5%
    """
    # Pair each score with its label and sort rows by score, descending —
    # the rdkit scoring helpers expect ranked rows with the label in column 1.
    scores = np.expand_dims(y_score, axis=1)
    y_true = np.expand_dims(y_true, axis=1)
    scores = np.concatenate((scores, y_true), axis=1)
    scores = scores[scores[:, 0].argsort()[::-1]]
    # Fix: honor the alpha argument (it was hard-coded to 80.5 before);
    # also dropped an unused top-0.5% counting loop.
    bedroc = CalcBEDROC(scores, 1, alpha)
    auc = CalcAUC(scores, 1)
    ef_list = CalcEnrichment(scores, 1, [0.005, 0.01, 0.05])
    return {
        "BEDROC": bedroc,
        "AUC": auc,
        "EF0.5": ef_list[0],
        "EF1": ef_list[1],
        "EF5": ef_list[2]
    }
84
+
85
+
86
+ # import torch
87
+ # torch.multiprocessing.set_start_method('spawn', force=True)
88
+ # def mycollator(input_batch):
89
+ # for data in input_batch:
90
+ # node, neighbors = data
91
+ # node["pocket_data"] = torch.tensor(node["pocket_data"]).cuda()
92
+ # node["lig_data"] = torch.tensor(node["lig_data"]).cuda()
93
+ # for neighbor in neighbors:
94
+ # neighbor["pocket_data"] = torch.tensor(node["pocket_data"]).cuda()
95
+ # neighbor["lig_data"] = torch.tensor(node["lig_data"]).cuda()
96
+ # return input_batch
License ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Attribution-NonCommercial 4.0 International
2
+
3
+ > *Creative Commons Corporation (“Creative Commons”) is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an “as-is” basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.*
4
+ >
5
+ > ### Using Creative Commons Public Licenses
6
+ >
7
+ > Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.
8
+ >
9
+ > * __Considerations for licensors:__ Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. [More considerations for licensors](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensors).
10
+ >
11
+ > * __Considerations for the public:__ By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor’s permission is not necessary for any reason–for example, because of any applicable exception or limitation to copyright–then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. [More considerations for the public](http://wiki.creativecommons.org/Considerations_for_licensors_and_licensees#Considerations_for_licensees).
12
+
13
+ ## Creative Commons Attribution-NonCommercial 4.0 International Public License
14
+
15
+ By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.
16
+
17
+ ### Section 1 – Definitions.
18
+
19
+ a. __Adapted Material__ means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.
20
+
21
+ b. __Adapter's License__ means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.
22
+
23
+ c. __Copyright and Similar Rights__ means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.
24
+
25
+ d. __Effective Technological Measures__ means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.
26
+
27
+ e. __Exceptions and Limitations__ means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.
28
+
29
+ f. __Licensed Material__ means the artistic or literary work, database, or other material to which the Licensor applied this Public License.
30
+
31
+ g. __Licensed Rights__ means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.
32
+
33
+ h. __Licensor__ means the individual(s) or entity(ies) granting rights under this Public License.
34
+
35
+ i. __NonCommercial__ means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.
36
+
37
+ j. __Share__ means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.
38
+
39
+ k. __Sui Generis Database Rights__ means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.
40
+
41
+ l. __You__ means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.
42
+
43
+ ### Section 2 – Scope.
44
+
45
+ a. ___License grant.___
46
+
47
+ 1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:
48
+
49
+ A. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and
50
+
51
+ B. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.
52
+
53
+ 2. __Exceptions and Limitations.__ For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.
54
+
55
+ 3. __Term.__ The term of this Public License is specified in Section 6(a).
56
+
57
+ 4. __Media and formats; technical modifications allowed.__ The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.
58
+
59
+ 5. __Downstream recipients.__
60
+
61
+ A. __Offer from the Licensor – Licensed Material.__ Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.
62
+
63
+ B. __No downstream restrictions.__ You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.
64
+
65
+ 6. __No endorsement.__ Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).
66
+
67
+ b. ___Other rights.___
68
+
69
+ 1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.
70
+
71
+ 2. Patent and trademark rights are not licensed under this Public License.
72
+
73
+ 3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.
74
+
75
+ ### Section 3 – License Conditions.
76
+
77
+ Your exercise of the Licensed Rights is expressly made subject to the following conditions.
78
+
79
+ a. ___Attribution.___
80
+
81
+ 1. If You Share the Licensed Material (including in modified form), You must:
82
+
83
+ A. retain the following if it is supplied by the Licensor with the Licensed Material:
84
+
85
+ i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);
86
+
87
+ ii. a copyright notice;
88
+
89
+ iii. a notice that refers to this Public License;
90
+
91
+ iv. a notice that refers to the disclaimer of warranties;
92
+
93
+ v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;
94
+
95
+ B. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and
96
+
97
+ C. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.
98
+
99
+ 2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.
100
+
101
+ 3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.
102
+
103
+ 4. If You Share Adapted Material You produce, the Adapter's License You apply must not prevent recipients of the Adapted Material from complying with this Public License.
104
+
105
+ ### Section 4 – Sui Generis Database Rights.
106
+
107
+ Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:
108
+
109
+ a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;
110
+
111
+ b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material; and
112
+
113
+ c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.
114
+
115
+ For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.
116
+
117
+ ### Section 5 – Disclaimer of Warranties and Limitation of Liability.
118
+
119
+ a. __Unless otherwise separately undertaken by the Licensor, to the extent possible, the Licensor offers the Licensed Material as-is and as-available, and makes no representations or warranties of any kind concerning the Licensed Material, whether express, implied, statutory, or other. This includes, without limitation, warranties of title, merchantability, fitness for a particular purpose, non-infringement, absence of latent or other defects, accuracy, or the presence or absence of errors, whether or not known or discoverable. Where disclaimers of warranties are not allowed in full or in part, this disclaimer may not apply to You.__
120
+
121
+ b. __To the extent possible, in no event will the Licensor be liable to You on any legal theory (including, without limitation, negligence) or otherwise for any direct, special, indirect, incidental, consequential, punitive, exemplary, or other losses, costs, expenses, or damages arising out of this Public License or use of the Licensed Material, even if the Licensor has been advised of the possibility of such losses, costs, expenses, or damages. Where a limitation of liability is not allowed in full or in part, this limitation may not apply to You.__
122
+
123
+ c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.
124
+
125
+ ### Section 6 – Term and Termination.
126
+
127
+ a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.
128
+
129
+ b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:
130
+
131
+ 1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or
132
+
133
+ 2. upon express reinstatement by the Licensor.
134
+
135
+ For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.
136
+
137
+ c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.
138
+
139
+ d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.
140
+
141
+ ### Section 7 – Other Terms and Conditions.
142
+
143
+ a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.
144
+
145
+ b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.
146
+
147
+ ### Section 8 – Interpretation.
148
+
149
+ a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.
150
+
151
+ b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.
152
+
153
+ c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.
154
+
155
+ d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.
156
+
157
+ > Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the “Licensor.” Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at [creativecommons.org/policies](http://creativecommons.org/policies), Creative Commons does not authorize the use of the trademark “Creative Commons” or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.
158
+ >
159
+ > Creative Commons may be contacted at creativecommons.org
README.md CHANGED
@@ -1,3 +1,206 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## General
2
+ This repository contains the code for **LigUnity**: **Hierarchical affinity landscape navigation through learning a shared pocket-ligand space.**
3
+
4
+ **We are excited to announce that our paper has been accepted by Patterns and is featured as the cover article for the October 2025 issue!**
5
+
6
+
7
+ [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green?style=flat-square)](https://github.com/tatsu-lab/stanford_alpaca/blob/main/LICENSE)
8
+ [![Data License](https://img.shields.io/badge/Data%20License-CC%20By%20NC%204.0-red?style=flat-square)](https://github.com/tatsu-lab/stanford_alpaca/blob/main/DATA_LICENSE)
9
+ [![DOI:10.1016/j.patter.2025.101371](http://img.shields.io/badge/DOI-10.1101/2025.02.17.638554-B31B1B.svg)](https://doi.org/10.1016/j.patter.2025.101371)
10
+ [![GitHub Link](https://img.shields.io/badge/GitHub-blue?style=flat-square&logo=github)](https://github.com/IDEA-XL/LigUnity)
11
+
12
+ <table>
13
+ <tr>
14
+ <td width="250px" valign="top">
15
+ <a href="https://www.cell.com/patterns/fulltext/S2666-3899(25)00219-3">
16
+ <img src="https://github.com/user-attachments/assets/5ab7f659-0b56-4cf1-8db0-7129d71ea9d5" alt="LigUnity Patterns Cover Image" width="230px" />
17
+ </a>
18
+ </td>
19
+ <td valign="top">
20
+ <p>
21
+ <strong>On the cover:</strong> This ocean symbolizes the human proteome—the complete set of proteins that carry out essential functions in our bodies. For medicine to work, it often needs to interact with a specific protein. For an estimated 90% of these proteins, however, they lack known small-molecule ligands with high activity. In the image, these proteins are represented as sailboats drifting in the dark.
22
+ </p>
23
+ <p>
24
+ At the center, stands a lighthouse symbolizing the AI method <strong>LigUnity</strong>. Its beam illuminates several sailboats, guiding them toward glowing buoys, which symbolize ligands with high activity found by LigUnity. The work by Feng et al. highlights the power of AI-driven computational methods to efficiently find active ligands and optimize their activity, opening up new therapeutic avenues for various diseases.
25
+ </p>
26
+ </td>
27
+ </tr>
28
+ </table>
29
+
30
+ ## Instruction on running our model
31
+
32
+ ### Virtual Screening
33
+ Colab demo for virtual screening with given protein pocket and candidate ligands.
34
+
35
+ https://colab.research.google.com/drive/1F0QSPjkKKLAfBexmIQotcs-jm87ohHeG?usp=sharing
36
+
37
+ ### Hit-to-lead optimization
38
+ **Direct inference**
39
+ Colab demo for code inference with given protein and unmeasured ligands.
40
+
41
+ https://colab.research.google.com/drive/11Fx6mO51rRkPvq71qupuUmscfBw8Dw5R?usp=sharing
42
+
43
+ **Few-shot fine-tuning**
44
+ Colab demo for few-shot fine-tuning with given protein, few measure ligands for fine-tuning and unmeasured ligands for testing.
45
+
46
+ https://colab.research.google.com/drive/1gf0HhgyqI4qBjUAUICCvDa-FnTaARmR_?usp=sharing
47
+
48
+ Please feel free to contact me by email if there is any problem with the code or paper: fengbin@idea.edu.cn.
49
+
50
+ ### Resource availability
51
+
52
+ The datasets for LigUnity were collected from ChEMBL version 34 and BindingDB version 2024m5. Our training dataset is available on figshare (https://doi.org/10.6084/m9.figshare.27966819). Our PocketAffDB with protein and pocket PDB structures is available on figshare (https://doi.org/10.6084/m9.figshare.29379161).
53
+
54
+ ## Abstract
55
+
56
+ Protein-ligand binding affinity plays an important role in drug discovery, especially during virtual screening and hit-to-lead optimization. Computational chemistry and machine learning methods have been developed to investigate these tasks. Despite the encouraging performance, virtual screening and hit-to-lead optimization are often studied separately by existing methods, partially because they are performed sequentially in the existing drug discovery pipeline, thereby overlooking their interdependency and complementarity. To address this problem, we propose LigUnity, a foundation model for protein-ligand binding prediction by jointly optimizing virtual screening and hit-to-lead optimization.
57
+ In particular, LigUnity learns coarse-grained active/inactive distinction for virtual screening, and fine-grained pocket-specific ligand preference for hit-to-lead optimization.
58
+ We demonstrate the effectiveness and versatility of LigUnity on eight benchmarks across virtual screening and hit-to-lead optimization. In virtual screening, LigUnity outperforms 24 competing methods with more than 50% improvement on the DUD-E and Dekois 2.0 benchmarks, and shows robust generalization to novel proteins. In hit-to-lead optimization, LigUnity achieves the best performance on split-by-time, split-by-scaffold, and split-by-unit settings, further demonstrating its potential as a cost-effective alternative to free energy perturbation (FEP) calculations. We further showcase how LigUnity can be employed in an active learning framework to efficiently identify active ligands for TYK2, a therapeutic target for autoimmune diseases, yielding over 40% improved prediction performance. Collectively, these comprehensive results establish LigUnity as a versatile foundation model for both virtual screening and hit-to-lead optimization, offering broad applicability across the drug discovery pipeline through accurate protein-ligand affinity predictions.
59
+
60
+
61
+
62
+ ## Reproduce results in our paper
63
+
64
+ ### Reproduce results on virtual screening benchmarks
65
+
66
+ Please first download checkpoints and processed dataset before running
67
+ - Download our processed Dekois 2.0 dataset from https://doi.org/10.6084/m9.figshare.27967422
68
+ - Download LIT-PCBA and DUD-E datasets from https://drive.google.com/drive/folders/1zW1MGpgunynFxTKXC2Q4RgWxZmg6CInV?usp=sharing
69
+ - Clone model checkpoint from https://huggingface.co/fengb/LigUnity_VS (test proteins in DUD-E, Dekois, and LIT-PCBA are removed from the training set)
70
+ - Clone dataset from https://figshare.com/articles/dataset/LigUnity_project_data/27966819 and unzip them all (you can ignore .lmdb file if you only want to reproduce test result).
71
+
72
+ ```
73
+ # run pocket/protein and ligand encoder model
74
+ path2weight="absolute path to the checkpoint of pocket_ranking"
75
+ CUDA_VISIBLE_DEVICES=0 bash test.sh ALL pocket_ranking ${path2weight} "./result/pocket_ranking"
76
+ CUDA_VISIBLE_DEVICES=0 bash test.sh BDB pocket_ranking ${path2weight} "./result/pocket_ranking"
77
+ CUDA_VISIBLE_DEVICES=0 bash test.sh PDB pocket_ranking ${path2weight} "./result/pocket_ranking"
78
+
79
+ path2weight="absolute path to the checkpoint of protein_ranking"
80
+ CUDA_VISIBLE_DEVICES=0 bash test.sh ALL protein_ranking ${path2weight} "./result/protein_ranking"
81
+ CUDA_VISIBLE_DEVICES=0 bash test.sh BDB protein_ranking ${path2weight} "./result/protein_ranking"
82
+ CUDA_VISIBLE_DEVICES=0 bash test.sh PDB protein_ranking ${path2weight} "./result/protein_ranking"
83
+
84
+ # train H-GNN model
85
+ cd ./HGNN
86
+ path2weight_HGNN="absolute path to the checkpoint of HGNN pocket"
87
+ python main.py --data_root ${path2data} --result_root "../result/pocket_ranking" --test_ckpt ${path2weight_HGNN}
88
+ path2weight_HGNN="absolute path to the checkpoint of HGNN protein"
89
+ python main.py --data_root ${path2data} --result_root "../result/protein_ranking" --test_ckpt ${path2weight_HGNN}
90
+
91
+ # get final prediction of our model
92
+ python ensemble_result.py DUDE PCBA DEKOIS
93
+ ```
94
+
95
+
96
+ ### Reproduce results on FEP benchmarks (zero-shot)
97
+
98
+ Please first download checkpoints before running
99
+ - Clone model checkpoint from https://huggingface.co/fengb/LigUnity_pocket_ranking and https://huggingface.co/fengb/LigUnity_protein_ranking (test ligands and assays in FEP benchmarks are removed from the training set)
100
+
101
+ ```
102
+ # run pocket/protein and ligand encoder model
103
+ for r in {1..6}; do
104
+ path2weight="path to checkpoint of pocket_ranking"
105
+ path2result="./result/pocket_ranking/FEP/repeat_${r}"
106
+ CUDA_VISIBLE_DEVICES=0 bash test.sh FEP pocket_ranking ${path2weight} ${path2result}
107
+
108
+ path2weight="path to checkpoint of protein_ranking"
109
+ path2result="./result/protein_ranking/FEP/repeat_${r}"
110
+ CUDA_VISIBLE_DEVICES=0 bash test.sh FEP protein_ranking ${path2weight} ${path2result}
111
+ done
112
+
113
+ # get final prediction of our model
114
+ python ensemble_result.py FEP
115
+ ```
116
+
117
+ ### Reproduce results on FEP benchmarks (few-shot)
118
+ ```
119
+ # use the same checkpoints as in zero-shot
120
+ # run few-shot fine-tuning
121
+ for r in {1..6}; do
122
+ path2weight="path to checkpoint of pocket_ranking"
123
+ path2result="./result/pocket_ranking/FEP_fewshot/repeat_${r}"
124
+ support_num=0.6
125
+ CUDA_VISIBLE_DEVICES=0 bash test_fewshot.sh FEP pocket_ranking ${support_num} ${path2weight} ${path2result}
126
+
127
+ path2weight="path to checkpoint of protein_ranking"
128
+ path2result="./result/protein_ranking/FEP_fewshot/repeat_${r}"
129
+ CUDA_VISIBLE_DEVICES=0 bash test_fewshot.sh FEP protein_ranking ${support_num} ${path2weight} ${path2result}
130
+ done
131
+
132
+ # get final prediction of our model
133
+ python ensemble_result_fewshot.py FEP_fewshot ${support_num}
134
+ ```
135
+
136
+ ### Reproduce results on active learning
137
+ to speed up the active learning process, you should modify the unicore code
138
+ 1. find the installed dir of unicore (root-to-unicore)
139
+ ```
140
+ python -c "import unicore; print('/'.join(unicore.__file__.split('/')[:-2]))"
141
+ ```
142
+
143
+ 2. goto root-to-unicore/unicore/options.py line 250, add following line
144
+ ```
145
+ group.add_argument('--validate-begin-epoch', type=int, default=0, metavar='N',
146
+ help='validate begin epoch')
147
+ ```
148
+
149
+ 3. goto root-to-unicore/unicore_cli/train.py line 303, add one line
150
+ ```
151
+ do_validate = (
152
+ (not end_of_epoch and do_save)
153
+ or (
154
+ end_of_epoch
155
+ and epoch_itr.epoch >= args.validate_begin_epoch # !!!! add this line
156
+ and epoch_itr.epoch % args.validate_interval == 0
157
+ and not args.no_epoch_checkpoints
158
+ )
159
+ or should_stop
160
+ or (
161
+ args.validate_interval_updates > 0
162
+ and num_updates > 0
163
+ and num_updates % args.validate_interval_updates == 0
164
+ )
165
+ ) and not args.disable_validation
166
+ ```
167
+
168
+ 4. run the active learning procedure
169
+ ```
170
+ # use the same checkpoints as in FEP experiments
171
+ path1="path to checkpoint of pocket_ranking"
172
+ path2="path to checkpoint of protein_ranking"
173
+ result1="./result/pocket_ranking/TYK2"
174
+ result2="./result/protein_ranking/TYK2"
175
+
176
+ # run active learning cycle for 5 iters with pure greedy strategy
177
+ bash ./active_learning_scripts/run_al.sh 5 0 path1 path2 result1 result2
178
+ ```
179
+ ## Citation
180
+
181
+ ```
182
+ @article{feng2025hierarchical,
183
+ title={Hierarchical affinity landscape navigation through learning a shared pocket-ligand space},
184
+ author={Feng, Bin and Liu, Zijing and Li, Hao and Yang, Mingjun and Zou, Junjie and Cao, He and Li, Yu and Zhang, Lei and Wang, Sheng},
185
+ journal={Patterns},
186
+ year={2025},
187
+ publisher={Elsevier}
188
+ }
189
+
190
+ @article{feng2024bioactivity,
191
+ title={A bioactivity foundation model using pairwise meta-learning},
192
+ author={Feng, Bin and Liu, Zequn and Huang, Nanlan and Xiao, Zhiping and Zhang, Haomiao and Mirzoyan, Srbuhi and Xu, Hanwen and Hao, Jiaran and Xu, Yinghui and Zhang, Ming and others},
193
+ journal={Nature Machine Intelligence},
194
+ volume={6},
195
+ number={8},
196
+ pages={962--974},
197
+ year={2024},
198
+ publisher={Nature Publishing Group UK London}
199
+ }
200
+ ```
201
+
202
+ ## Acknowledgments
203
+
204
+ This project was built based on Uni-Mol (https://github.com/deepmodeling/Uni-Mol)
205
+
206
+ Parts of our code reference the implementation from DrugCLIP (https://github.com/bowen-gao/DrugCLIP) by bowen-gao
active_learning_scripts/run_al.sh ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch the two-model ensemble active-learning loop
# (pocket_ranking + protein_ranking) on the TYK2 case study.
#
# Usage: run_al.sh NUM_CYCLES BEGIN_GREEDY WEIGHT_PATH1 WEIGHT_PATH2 RESULT_PATH1 RESULT_PATH2
set -euo pipefail

# Fail fast with a usage message instead of passing empty paths downstream.
if [ "$#" -ne 6 ]; then
    echo "Usage: $0 NUM_CYCLES BEGIN_GREEDY WEIGHT_PATH1 WEIGHT_PATH2 RESULT_PATH1 RESULT_PATH2" >&2
    exit 1
fi

num_cycles=${1}
begin_greedy=${2}
weight_path1=${3}
weight_path2=${4}
result_path1=${5}
result_path2=${6}

# NOTE(review): this commit ships run_cycle_ensemble.py, whose argparse
# (--results_dir_1/_2, --arch_1/_2, --weight_path_1/_2, ...) matches the
# flags below; the original called run_cycle_ours.py, which is absent.
python ./active_learning_scripts/run_cycle_ensemble.py \
    --input_file ../PARank_data_curation/case_study/tyk2_fep_label.csv \
    --results_dir_1 "${result_path1}" \
    --results_dir_2 "${result_path2}" \
    --al_batch_size 100 \
    --num_cycles "${num_cycles}" \
    --arch_1 pocket_ranking \
    --arch_2 protein_ranking \
    --weight_path_1 "${weight_path1}" \
    --weight_path_2 "${weight_path2}" \
    --lr 0.0001 \
    --device 0 \
    --master_port 10071 \
    --base_seed 42 \
    --begin_greedy "${begin_greedy}"
active_learning_scripts/run_cycle_ensemble.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import subprocess
4
+ import os
5
+ from pathlib import Path
6
+ import random
7
+ import argparse
8
+ import json
9
+ import subprocess
10
+ from concurrent.futures import ThreadPoolExecutor, wait
11
+
12
def parse_arguments(argv=None):
    """Parse command-line options for the ensemble active-learning driver.

    Args:
        argv: Optional list of argument strings; ``None`` (the default)
            means ``sys.argv[1:]``. Accepting an explicit list keeps the
            CLI behavior unchanged while making the parser unit-testable.

    Returns:
        argparse.Namespace with all options documented below.
    """
    parser = argparse.ArgumentParser(description='Active Learning Cycle for Ligand Prediction')

    # Input/Output arguments
    parser.add_argument('--input_file', type=str, required=True,
                        help='Input CSV file containing ligand data (e.g., tyk2_fep.csv)')
    parser.add_argument('--results_dir_1', type=str, required=True,
                        help='Results directory for first model')
    parser.add_argument('--results_dir_2', type=str, required=True,
                        help='Results directory for second model')
    parser.add_argument('--al_batch_size', type=int, required=True,
                        help='Number of samples for each active learning batch')

    # Experiment configuration
    parser.add_argument('--num_repeats', type=int, default=5,
                        help='Number of repeated experiments (default: 5)')
    parser.add_argument('--num_cycles', type=int, required=True,
                        help='Number of active learning cycles')

    # Model configuration
    parser.add_argument('--arch_1', type=str, required=True,
                        help='First model architecture')
    parser.add_argument('--arch_2', type=str, required=True,
                        help='Second model architecture')
    parser.add_argument('--weight_path_1', type=str, required=True,
                        help='Path to first model pretrained weights')
    parser.add_argument('--weight_path_2', type=str, required=True,
                        help='Path to second model pretrained weights')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate (default: 0.001)')
    parser.add_argument('--master_port', type=int, default=29500,
                        help='Master port for distributed training (default: 29500)')
    parser.add_argument('--device', type=int, default=0,
                        help='Base device to run the models on (default: 0)')
    parser.add_argument('--begin_greedy', type=int, default=0,
                        help='iter of begin to be pure greedy, using half greedy before')

    # Random seed
    parser.add_argument('--base_seed', type=int, default=42,
                        help='Base random seed (default: 42)')

    return parser.parse_args(argv)
54
+
55
+
56
+ def _run(cmd):
57
+ import os
58
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
59
+ subprocess.run(cmd, check=True, cwd=project_root)
60
+
61
+
62
def run_model(arch_1, arch_2, weight_path_1, weight_path_2, results_path_1, results_path_2, result_file, lr,
              master_port, train_ligf, test_ligf, device):
    """Launch both ranking models in parallel via run_model.sh.

    The second model is started on the next CUDA device and the next master
    port so the two distributed training runs do not collide. Blocks until
    both subprocesses have finished.
    """
    def build_cmd(arch, weight_path, results_path, port, dev):
        # Positional argument order expected by active_learning_scripts/run_model.sh.
        return [
            "bash", "./active_learning_scripts/run_model.sh",
            arch,
            weight_path,
            results_path,
            result_file,
            str(lr),
            str(port),
            train_ligf,
            test_ligf,
            str(dev),
        ]

    cmd1 = build_cmd(arch_1, weight_path_1, results_path_1, master_port, device)
    cmd2 = build_cmd(arch_2, weight_path_2, results_path_2, master_port + 1, device + 1)

    with ThreadPoolExecutor(max_workers=2) as executor:
        pending = [executor.submit(_run, cmd=c) for c in (cmd1, cmd2)]
        wait(pending)
94
+
95
+
96
def read_predictions(results_path, result_file):
    """Read one model's JSONL results and average predictions per molecule.

    The first line of the file carries the SMILES list under
    ``["tyk2"]["smiles"]``; every following line carries one epoch's
    prediction vector under ``["tyk2"]["pred"]``.

    Returns:
        dict mapping SMILES -> mean prediction (float).
    """
    jsonl_path = os.path.join(results_path, result_file)
    with open(jsonl_path, 'r') as fh:
        header = json.loads(fh.readline().strip())
        smiles_list = header["tyk2"]["smiles"]
        per_epoch = [json.loads(row.strip())["tyk2"]["pred"] for row in fh]

    # Average across epochs (rows), one value per molecule (columns).
    mean_pred = np.mean(np.array(per_epoch), axis=0)
    return {smi: float(val) for smi, val in zip(smiles_list, mean_pred)}
120
+
121
def prepare_initial_split(input_file, results_dir_1, results_dir_2, al_batch_size, repeat_idx, cycle_idx, base_seed):
    """Create the cycle-0 train/test split and write it to both result dirs.

    A random sample of ``al_batch_size`` ligands becomes the initial training
    set; everything else is the test pool. The global RNG is seeded with
    ``base_seed + repeat_idx`` so each repeat draws a different but
    reproducible split. Both directories receive identical CSVs.

    Returns:
        (train_file_1, test_file_1, train_file_2, test_file_2)
    """
    df = pd.read_csv(input_file)

    # Reproducible per-repeat selection.
    random.seed(base_seed + repeat_idx)

    pool = list(range(len(df)))
    chosen = random.sample(pool, al_batch_size)
    held_out = [idx for idx in pool if idx not in chosen]

    train_df = df.iloc[chosen]
    test_df = df.iloc[held_out]

    written = []
    for results_dir in (results_dir_1, results_dir_2):
        train_path = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
        test_path = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
        os.makedirs(os.path.dirname(train_path), exist_ok=True)
        train_df.to_csv(train_path, index=False)
        test_df.to_csv(test_path, index=False)
        written.extend([train_path, test_path])

    return written[0], written[1], written[2], written[3]
155
+
156
+
157
def read_and_combine_predictions(results_path_1, results_path_2, result_file):
    """Average the per-epoch predictions of both models into one score per SMILES.

    Each JSONL file has the SMILES list on line 1 and one epoch's
    prediction vector on every following line. The SMILES ordering is
    taken from the first model's file.

    Returns:
        dict mapping SMILES -> averaged prediction (float).
    """
    def load(results_path, want_smiles):
        # Returns (smiles or None, per-epoch prediction matrix).
        path = os.path.join(results_path, result_file)
        with open(path, 'r') as fh:
            header = fh.readline()
            smiles = json.loads(header.strip())["tyk2"]["smiles"] if want_smiles else None
            preds = [json.loads(row.strip())["tyk2"]["pred"] for row in fh]
        return smiles, np.array(preds)

    smiles_list, preds_1 = load(results_path_1, True)
    _, preds_2 = load(results_path_2, False)

    # Mean over epochs for each model, then the two models' means averaged.
    combined = (np.mean(preds_1, axis=0) + np.mean(preds_2, axis=0)) / 2
    return {smi: float(val) for smi, val in zip(smiles_list, combined)}
194
+
195
+
196
def update_splits(results_dir_1, results_dir_2, predictions_1, predictions_2,
                  prev_train_file_1, prev_test_file_1,
                  prev_train_file_2, prev_test_file_2,
                  repeat_idx, cycle_idx, al_batch_size, begin_greedy):
    """Select the next active-learning batch from the ensemble's predictions.

    Averages the two models' per-SMILES scores, moves ``al_batch_size`` test
    compounds into the training set (pure greedy once
    ``cycle_idx >= begin_greedy``, otherwise half greedy / half random), and
    writes the new cycle's train/test CSVs into both results directories.

    Returns:
        (new_train_file_1, new_test_file_1, new_train_file_2, new_test_file_2)
    """
    # Read previous test files
    test_df_1 = pd.read_csv(prev_test_file_1)
    # NOTE(review): test_df_2 is loaded but never used — both result
    # directories receive splits derived from test_df_1 only. Confirm the
    # second read is intentional.
    test_df_2 = pd.read_csv(prev_test_file_2)

    # Attach each model's score, then ensemble by simple averaging.
    test_df_1['prediction_1'] = test_df_1['Smiles'].map(predictions_1)
    test_df_1['prediction_2'] = test_df_1['Smiles'].map(predictions_2)
    test_df_1['prediction'] = (test_df_1['prediction_1'] + test_df_1['prediction_2']) / 2

    # Sort by average predictions (high to low)
    test_df_sorted = test_df_1.sort_values('prediction', ascending=False)

    # Read previous train files
    train_df_1 = pd.read_csv(prev_train_file_1)
    # NOTE(review): train_df_2 is likewise read but unused below.
    train_df_2 = pd.read_csv(prev_train_file_2)

    # Create new file names for both directories
    new_train_file_1 = os.path.join(results_dir_1, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    new_test_file_1 = os.path.join(results_dir_1, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
    new_train_file_2 = os.path.join(results_dir_2, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    new_test_file_2 = os.path.join(results_dir_2, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")

    # Create directories if they don't exist
    os.makedirs(os.path.dirname(new_train_file_1), exist_ok=True)
    os.makedirs(os.path.dirname(new_train_file_2), exist_ok=True)

    if cycle_idx >= begin_greedy:
        # Pure exploitation: take the top-scored compounds for training.
        new_train_compounds = test_df_sorted.head(al_batch_size)
        remaining_test_compounds = test_df_sorted.iloc[al_batch_size:]
    else:
        # Half-greedy: top half of the batch by score, the other half
        # sampled uniformly from the remainder (exploration).
        # NOTE(review): uses the module-level `random` state seeded in
        # prepare_initial_split — selection is reproducible only if the
        # call order is unchanged.
        new_train_compounds_tmp_1 = test_df_sorted.head(al_batch_size//2)
        remaining_test_compounds_tmp = test_df_sorted.iloc[al_batch_size//2:]
        all_indices = list(range(len(remaining_test_compounds_tmp)))

        train_indices = random.sample(all_indices, al_batch_size - al_batch_size//2)
        test_indices = [i for i in all_indices if i not in train_indices]
        remaining_test_compounds = remaining_test_compounds_tmp.iloc[test_indices]
        new_train_compounds_tmp_2 = remaining_test_compounds_tmp.iloc[train_indices]
        new_train_compounds = pd.concat([new_train_compounds_tmp_1, new_train_compounds_tmp_2])

    # Combine with previous training data
    combined_train_df = pd.concat([train_df_1, new_train_compounds])

    # Progress report: how many of the dataset's top-1/2/5% actives the
    # growing training set has recovered so far.
    # NOTE(review): the 100/200/500 denominators look hard-coded for the
    # TYK2 case-study size — confirm before reusing on other inputs.
    for _ in range(3):
        print("########################################")
    print("Cycling: ", cycle_idx)
    print("top_1p: {}/100".format(combined_train_df['top_1p'].sum()))
    print("top_2p: {}/200".format(combined_train_df['top_2p'].sum()))
    print("top_5p: {}/500".format(combined_train_df['top_5p'].sum()))

    # Save files for both models (same content, different directories)
    combined_train_df.to_csv(new_train_file_1, index=False)
    remaining_test_compounds.to_csv(new_test_file_1, index=False)
    combined_train_df.to_csv(new_train_file_2, index=False)
    remaining_test_compounds.to_csv(new_test_file_2, index=False)

    return (new_train_file_1, new_test_file_1,
            new_train_file_2, new_test_file_2)
260
+
261
+
262
def run_active_learning(args):
    """Drive the full ensemble active-learning experiment.

    For each repeat: build a fresh random initial split, then alternate
    parallel model training and greedy batch selection for
    ``args.num_cycles`` cycles. All intermediate CSVs and result files live
    under the two results directories, which are wiped at the start.

    Args:
        args: argparse.Namespace produced by parse_arguments().
    """
    import shutil

    # Recreate the results directories from scratch. shutil.rmtree replaces
    # the previous shell `rm -rf` (os.system), which was non-portable and
    # unsafe with special characters in paths.
    for results_dir in (args.results_dir_1, args.results_dir_2):
        shutil.rmtree(results_dir, ignore_errors=True)
        os.makedirs(results_dir, exist_ok=True)

    for repeat_idx in range(args.num_repeats):
        print(f"Starting repeat {repeat_idx}")

        # Initial random split for this repeat (cycle 0).
        train_file_1, test_file_1, train_file_2, test_file_2 = prepare_initial_split(
            args.input_file,
            args.results_dir_1,
            args.results_dir_2,
            args.al_batch_size,
            repeat_idx,
            0,  # first cycle
            args.base_seed
        )

        for cycle_idx in range(args.num_cycles):
            print(f"Running cycle {cycle_idx} for repeat {repeat_idx}")

            # Remove any stale result file left over from an aborted run.
            result_file = f"repeat_{repeat_idx}_cycle_{cycle_idx}_results.jsonl"
            for results_dir in (args.results_dir_1, args.results_dir_2):
                stale = os.path.join(results_dir, result_file)
                if os.path.exists(stale):
                    os.remove(stale)

            # Train/evaluate both models in parallel.
            run_model(
                arch_1=args.arch_1,
                arch_2=args.arch_2,
                weight_path_1=args.weight_path_1,
                weight_path_2=args.weight_path_2,
                results_path_1=args.results_dir_1,
                results_path_2=args.results_dir_2,
                result_file=result_file,
                lr=args.lr,
                master_port=args.master_port,
                train_ligf=train_file_1,
                test_ligf=test_file_1,
                device=args.device
            )

            # Every cycle except the last feeds its predictions into the
            # next cycle's train/test split.
            if cycle_idx < args.num_cycles - 1:
                predictions_1 = read_predictions(args.results_dir_1, result_file)
                predictions_2 = read_predictions(args.results_dir_2, result_file)

                train_file_1, test_file_1, train_file_2, test_file_2 = update_splits(
                    args.results_dir_1,
                    args.results_dir_2,
                    predictions_1,
                    predictions_2,
                    train_file_1,
                    test_file_1,
                    train_file_2,
                    test_file_2,
                    repeat_idx,
                    cycle_idx + 1,
                    args.al_batch_size,
                    args.begin_greedy
                )
330
+
331
+
332
if __name__ == "__main__":
    # CLI entry point: parse options and launch the experiment.
    run_active_learning(parse_arguments())
active_learning_scripts/run_cycle_one_model.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import subprocess
4
+ import os
5
+ from pathlib import Path
6
+ import random
7
+ import argparse
8
+ import json
9
+
10
+
11
def parse_arguments(argv=None):
    """Parse command-line options for the single-model active-learning driver.

    Args:
        argv: Optional list of argument strings; ``None`` (the default)
            means ``sys.argv[1:]``. Accepting an explicit list keeps the
            CLI behavior unchanged while making the parser unit-testable.

    Returns:
        argparse.Namespace with all options documented below.
    """
    parser = argparse.ArgumentParser(description='Active Learning Cycle for Ligand Prediction')

    # Input/Output arguments
    parser.add_argument('--input_file', type=str, required=True,
                        help='Input CSV file containing ligand data (e.g., tyk2_fep.csv)')
    parser.add_argument('--results_dir', type=str, required=True,
                        help='Base directory for storing all results')
    parser.add_argument('--al_batch_size', type=int, required=True,
                        help='Number of samples for each active learning batch')

    # Experiment configuration
    parser.add_argument('--num_repeats', type=int, default=5,
                        help='Number of repeated experiments (default: 5)')
    parser.add_argument('--num_cycles', type=int, required=True,
                        help='Number of active learning cycles')

    # Model configuration
    parser.add_argument('--arch', type=str, required=True,
                        help='Model architecture')
    parser.add_argument('--weight_path', type=str, required=True,
                        help='Path to pretrained model weights')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='Learning rate (default: 0.001)')
    parser.add_argument('--master_port', type=int, default=29500,
                        help='Master port for distributed training (default: 29500)')
    parser.add_argument('--device', type=int, default=0,
                        help='Device to run the model on (default: cuda:0)')
    parser.add_argument('--begin_greedy', type=int, default=0,
                        help='iter of begin to be pure greedy, using half greedy before')

    # Random seed
    parser.add_argument('--base_seed', type=int, default=42,
                        help='Base random seed (default: 42)')

    return parser.parse_args(argv)
47
+
48
+
49
def run_model(arch, weight_path, results_path, result_file, lr, master_port, train_ligf, test_ligf, device):
    """Train/evaluate one model by invoking run_model.sh from the project root.

    Args:
        arch: model architecture name passed through to the shell script.
        weight_path: path to the pretrained checkpoint.
        results_path: directory receiving checkpoints/results.
        result_file: name of the per-epoch prediction JSONL file.
        lr: learning rate.
        master_port: torch.distributed master port.
        train_ligf / test_ligf: train/test ligand CSV paths.
        device: CUDA device index.

    Raises:
        subprocess.CalledProcessError: if the launched script fails.
    """
    # Fixed: removed the redundant function-local `import os` — os is
    # already imported at module level.
    # Project root = parent of this script's directory, so the bash
    # script's relative paths (./active_learning_scripts, ./unimol) resolve.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    cmd = [
        "bash", "./active_learning_scripts/run_model.sh",
        arch,
        weight_path,
        results_path,
        result_file,
        str(lr),
        str(master_port),
        train_ligf,
        test_ligf,
        str(device)
    ]
    subprocess.run(cmd, check=True, cwd=project_root)
65
+
66
+
67
def prepare_initial_split(input_file, results_dir, al_batch_size, repeat_idx, cycle_idx, base_seed):
    """Create the initial random train/test split for one repeat.

    Seeds the global RNG with ``base_seed + repeat_idx`` so each repeat
    draws a different but reproducible starting batch of ``al_batch_size``
    ligands; the remainder becomes the test pool.

    Returns:
        (train_csv_path, test_csv_path)
    """
    df = pd.read_csv(input_file)

    # Different seed for each repeat, reproducible across runs.
    random.seed(base_seed + repeat_idx)

    pool = list(range(len(df)))
    chosen = random.sample(pool, al_batch_size)
    held_out = [idx for idx in pool if idx not in chosen]

    train_path = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    test_path = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")
    os.makedirs(os.path.dirname(train_path), exist_ok=True)

    df.iloc[chosen].to_csv(train_path, index=False)
    df.iloc[held_out].to_csv(test_path, index=False)

    return train_path, test_path
95
+
96
+
97
def read_jsonl_predictions(results_path, result_file):
    """Average per-epoch predictions from a results JSONL file.

    Line 1 holds the SMILES list under ``["tyk2"]["smiles"]``; every
    subsequent line holds one epoch's prediction vector under
    ``["tyk2"]["pred"]``.

    Returns:
        dict mapping SMILES -> mean prediction (float).
    """
    jsonl_path = os.path.join(results_path, result_file)
    with open(jsonl_path, 'r') as fh:
        smiles_list = json.loads(fh.readline().strip())["tyk2"]["smiles"]
        epoch_preds = [json.loads(row.strip())["tyk2"]["pred"] for row in fh]

    # Mean over epochs (rows) -> one score per molecule (columns).
    averaged = np.mean(np.array(epoch_preds), axis=0)
    return {smi: float(val) for smi, val in zip(smiles_list, averaged)}
127
+
128
+
129
def update_splits(results_dir, results_path, result_file, prev_train_file, prev_test_file, repeat_idx, cycle_idx,
                  al_batch_size, begin_greedy):
    """Select the next active-learning batch from the model's predictions.

    Moves ``al_batch_size`` test compounds into the training set (pure
    greedy once ``cycle_idx >= begin_greedy``, otherwise half greedy /
    half random) and writes the new cycle's train/test CSVs.

    Returns:
        (new_train_file, new_test_file)
    """
    # Read predictions from jsonl file
    predictions = read_jsonl_predictions(results_path, result_file)

    # Read previous test file
    test_df = pd.read_csv(prev_test_file)

    # Attach the averaged per-SMILES prediction score.
    test_df['prediction'] = test_df['Smiles'].map(predictions)

    # Sort by predictions (high to low)
    test_df_sorted = test_df.sort_values('prediction', ascending=False)

    # Read previous train file
    train_df = pd.read_csv(prev_train_file)

    # Create new file names
    new_train_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_train.csv")
    new_test_file = os.path.join(results_dir, f"repeat_{repeat_idx}_cycle_{cycle_idx}_test.csv")

    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(new_train_file), exist_ok=True)

    if cycle_idx >= begin_greedy:
        # Pure exploitation: take the top-scored compounds for training.
        new_train_compounds = test_df_sorted.head(al_batch_size)
        remaining_test_compounds = test_df_sorted.iloc[al_batch_size:]
    else:
        # Half-greedy: top half of the batch by score, the other half
        # sampled uniformly from the remainder (exploration).
        # NOTE(review): uses the module-level `random` state seeded in
        # prepare_initial_split — selection is reproducible only if the
        # call order is unchanged.
        new_train_compounds_tmp_1 = test_df_sorted.head(al_batch_size//2)
        remaining_test_compounds_tmp = test_df_sorted.iloc[al_batch_size//2:]
        all_indices = list(range(len(remaining_test_compounds_tmp)))

        train_indices = random.sample(all_indices, al_batch_size - al_batch_size//2)
        test_indices = [i for i in all_indices if i not in train_indices]
        remaining_test_compounds = remaining_test_compounds_tmp.iloc[test_indices]
        new_train_compounds_tmp_2 = remaining_test_compounds_tmp.iloc[train_indices]
        new_train_compounds = pd.concat([new_train_compounds_tmp_1, new_train_compounds_tmp_2])


    # Combine with previous training data
    combined_train_df = pd.concat([train_df, new_train_compounds])

    # Progress report: how many of the dataset's top-1/2/5% actives the
    # growing training set has recovered so far.
    # NOTE(review): the 100/200/500 denominators look hard-coded for the
    # TYK2 case-study size — confirm before reusing on other inputs.
    for _ in range(3):
        print("########################################")
    print("Cycling: ", cycle_idx)
    print("top_1p: {}/100".format(combined_train_df['top_1p'].sum()))
    print("top_2p: {}/200".format(combined_train_df['top_2p'].sum()))
    print("top_5p: {}/500".format(combined_train_df['top_5p'].sum()))

    # Save files
    combined_train_df.to_csv(new_train_file, index=False)
    remaining_test_compounds.to_csv(new_test_file, index=False)

    return new_train_file, new_test_file
185
+
186
+
187
def run_active_learning(args):
    """Drive the single-model active-learning experiment.

    For each repeat: build a fresh random initial split, then alternate
    model training and greedy batch selection for ``args.num_cycles``
    cycles. The results directory is wiped and recreated at the start.

    Args:
        args: argparse.Namespace produced by parse_arguments().
    """
    import shutil

    # Start from a clean results directory. shutil.rmtree replaces the
    # previous `os.system("rm -rf ...")`, which was non-portable and
    # unsafe with special characters in the path.
    shutil.rmtree(args.results_dir, ignore_errors=True)
    os.makedirs(args.results_dir, exist_ok=True)

    for repeat_idx in range(args.num_repeats):
        print(f"Starting repeat {repeat_idx}")

        # Initial random split for this repeat (cycle 0).
        train_file, test_file = prepare_initial_split(
            args.input_file,
            args.results_dir,
            args.al_batch_size,
            repeat_idx,
            0,  # first cycle
            args.base_seed
        )

        for cycle_idx in range(args.num_cycles):
            print(f"Running cycle {cycle_idx} for repeat {repeat_idx}")

            # Remove any stale result file left over from an aborted run.
            result_file = f"repeat_{repeat_idx}_cycle_{cycle_idx}_results.jsonl"
            stale = os.path.join(args.results_dir, result_file)
            if os.path.exists(stale):
                os.remove(stale)

            # Train/evaluate the model for this cycle.
            run_model(
                arch=args.arch,
                weight_path=args.weight_path,
                results_path=args.results_dir,
                result_file=result_file,
                lr=args.lr,
                master_port=args.master_port,
                train_ligf=train_file,
                test_ligf=test_file,
                device=args.device
            )

            # Every cycle except the last selects the next training batch.
            if cycle_idx < args.num_cycles - 1:
                train_file, test_file = update_splits(
                    args.results_dir,
                    args.results_dir,
                    result_file,
                    train_file,
                    test_file,
                    repeat_idx,
                    cycle_idx + 1,
                    args.al_batch_size,
                    args.begin_greedy
                )
242
+
243
+
244
if __name__ == "__main__":
    # CLI entry point: parse options and launch the experiment.
    run_active_learning(parse_arguments())
active_learning_scripts/run_model.sh ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Train/evaluate one ranking model for a single active-learning cycle.
# Invoked by run_cycle_*.py with nine positional arguments (documented below).

data_path="./test_datasets"

n_gpu=1

batch_size=1
batch_size_valid=1
epoch=20
update_freq=1

export NCCL_ASYNC_ERROR_HANDLING=1
export OMP_NUM_THREADS=1

arch=${1}          # model architecture
weight_path=${2}   # path for pretrained model
results_path=${3}  # output directory for checkpoints/results
result_file=${4}   # jsonl file (inside results_path) for per-epoch predictions
lr=${5}            # learning rate
MASTER_PORT=${6}   # torch.distributed master port
train_ligf=${7}    # !! input path for training ligands file (.csv format)
test_ligf=${8}     # !! input path for test ligands file (.csv format)
device=${9}        # cuda device index

# Regression architectures train with MSE; ranking architectures use the
# softmax ranking loss.
if [[ "$arch" == "pocketregression" ]] || [[ "$arch" == "DTA" ]]; then
    loss="mseloss"
else
    loss="rank_softmax"
fi


CUDA_VISIBLE_DEVICES=${device} python -m torch.distributed.launch --nproc_per_node=$n_gpu --master_port=$MASTER_PORT $(which unicore-train) $data_path --user-dir ./unimol --train-subset train --valid-subset valid \
    --results-path $results_path \
    --num-workers 8 --ddp-backend=c10d \
    --task train_task --loss ${loss} --arch $arch \
    --max-pocket-atoms 256 \
    --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-8 --clip-norm 1.0 \
    --lr-scheduler polynomial_decay --lr $lr --max-epoch $epoch --batch-size $batch_size --batch-size-valid $batch_size_valid \
    --update-freq $update_freq --seed 1 \
    --log-interval 1 --log-format simple \
    --validate-interval 1 --validate-begin-epoch 15 \
    --best-checkpoint-metric valid_mean_r2 --patience 100 --all-gather-list-size 2048000 \
    --no-save --save-dir $results_path --tmp-save-dir $results_path \
    --find-unused-parameters \
    --maximize-best-checkpoint-metric \
    --valid-set TYK2 \
    --max-lignum 512 --test-max-lignum 10000 \
    --restore-model $weight_path --few-shot true \
    --fp16 --fp16-init-scale 4 --fp16-scale-window 256 \
    --active-learning-resfile ${result_file} \
    --case-train-ligfile ${train_ligf} --case-test-ligfile ${test_ligf}
53
+
ensemble_result.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import json
4
+ import copy
5
+ import numpy as np
6
+ import scipy.stats as stats
7
+ import math
8
+ from rdkit.ML.Scoring.Scoring import CalcBEDROC, CalcAUC, CalcEnrichment
9
+
10
def cal_metrics(y_score, y_true):
    """Compute virtual-screening metrics for one target.

    Args:
        y_score: 1-D array-like of predicted scores (higher = more active).
        y_true: 1-D array-like of binary activity labels (1 = active).

    Returns:
        dict with BEDROC (alpha=80.5), AUROC and enrichment factors at
        0.5%, 1% and 5% of the ranked list.
    """
    # RDKit's scoring functions expect rows of [score, label], sorted by
    # score descending, with the label column index passed as 1.
    scores = np.expand_dims(y_score, axis=1)
    y_true = np.expand_dims(y_true, axis=1)
    scores = np.concatenate((scores, y_true), axis=1)
    # inverse sort scores based on first column
    scores = scores[scores[:, 0].argsort()[::-1]]
    bedroc = CalcBEDROC(scores, 1, 80.5)
    # Fixed: removed a dead loop that counted actives in the top 0.5% into
    # a `count` variable that was never used.
    auc = CalcAUC(scores, 1)
    ef_list = CalcEnrichment(scores, 1, [0.005, 0.01, 0.02, 0.05])

    return {
        "BEDROC": bedroc,
        "AUROC": auc,
        "EF0.5": ef_list[0],
        "EF1": ef_list[1],
        # ef_list[2] (EF at 2%) is computed but not reported, matching the
        # original output schema.
        "EF5": ef_list[3]
    }
34
+
35
def print_avg_metric(metric_dict, name):
    """Print (and return) the key-wise average of a {target: metrics} dict.

    Args:
        metric_dict: mapping target -> {metric_name: value}; all entries
            must share the same metric keys.
        name: label printed before the averaged metrics.

    Returns:
        The averaged metrics dict. Non-numeric fields (e.g. the "target"
        string emitted by get_metric/avg_metric) are carried over from the
        first entry instead of being summed — the original code raised a
        TypeError when it tried to divide the concatenated strings.
    """
    metric_lst = list(metric_dict.values())
    avg = copy.deepcopy(metric_lst[0])
    for m in metric_lst[1:]:
        for k in m:
            if isinstance(avg[k], (int, float)):
                avg[k] += m[k]

    for k in avg:
        if isinstance(avg[k], (int, float)):
            avg[k] = avg[k] / len(metric_lst)
    print(name, avg)
    return avg
45
+
46
def read_zeroshot_res(res_dir):
    """Load zero-shot results for every target directory under *res_dir*.

    Each target directory must contain saved_labels.npy; predictions come
    from saved_preds.npy when present, otherwise they are recomputed as the
    per-ligand maximum of the pocket-ligand embedding similarity matrix.

    Returns:
        dict mapping target name -> {"pred": array, "exp": array}.
    """
    res_dict = {}
    for target in sorted(os.listdir(res_dir)):
        labels = np.load(f"{res_dir}/{target}/saved_labels.npy")
        pred_file = f"{res_dir}/{target}/saved_preds.npy"
        if os.path.exists(pred_file):
            preds = np.load(pred_file)
        else:
            mol_reps = np.load(f"{res_dir}/{target}/saved_mols_embed.npy")
            pocket_reps = np.load(f"{res_dir}/{target}/saved_target_embed.npy")
            # Best score over all pockets for each ligand.
            preds = (pocket_reps @ mol_reps.T).max(axis=0)
        res_dict[target] = {"pred": preds, "exp": labels}
    return res_dict
63
+
64
def get_ensemble_res(res_list, begin=0, end=-1):
    """Average the "pred" arrays of ``res_list[begin:end]`` key by key.

    ``end == -1`` (the default) means "through the last element". The
    "exp" labels are taken unchanged from ``res_list[begin]``; inputs are
    not mutated.
    """
    if end == -1:
        end = len(res_list)
    merged = copy.deepcopy(res_list[begin])
    for other in res_list[begin + 1:end]:
        for key in merged:
            merged[key]["pred"] = np.array(merged[key]["pred"]) + np.array(other[key]["pred"])

    for key in merged:
        merged[key]["pred"] = np.array(merged[key]["pred"]) / (end - begin)

    return merged
76
+
77
def avg_metric(metric_lst_all):
    """Average pearsonr/spearmanr/r2 across repeats for each target.

    Args:
        metric_lst_all: one list per target, each containing the metric
            dicts of the individual repeats; every dict carries a
            "target" key naming the target.

    Returns:
        dict mapping target name -> averaged metric dict.
    """
    averaged = {}
    numeric_keys = ("pearsonr", "spearmanr", "r2")
    for per_target in metric_lst_all:
        acc = copy.deepcopy(per_target[0])
        for repeat in per_target[1:]:
            for k in numeric_keys:
                acc[k] += repeat[k]
        for k in numeric_keys:
            acc[k] = acc[k] / len(per_target)
        averaged[acc["target"]] = acc
    return averaged
88
+
89
def get_metric(res):
    """Per-target correlation metrics between predicted and experimental values.

    NaN correlations (e.g. from constant inputs) are mapped to 0, and r2 is
    the square of the non-negative Pearson correlation.

    Returns:
        dict mapping target -> {"pearsonr", "spearmanr", "r2", "target"}.
    """
    metric_dict = {}
    for target in sorted(res):
        pred = res[target]["pred"]
        exp = res[target]["exp"]
        rho = stats.spearmanr(exp, pred).statistic
        r = stats.pearsonr(exp, pred).statistic
        if math.isnan(r):
            r = 0
        if math.isnan(rho):
            rho = 0
        metric_dict[target] = {
            "pearsonr": r,
            "spearmanr": rho,
            "r2": max(r, 0) ** 2,
            "target": target,
        }
    return metric_dict
107
+
108
+
109
if __name__ == '__main__':
    # CLI: ensemble_result.py zeroshot <test_set...> | fewshot <test_set> <support_num>
    mode = sys.argv[1]
    if mode == "zeroshot":
        test_sets = sys.argv[2:]
        for test_set in test_sets:
            if test_set in ["DUDE", "PCBA", "DEKOIS"]:
                # Screening benchmarks: ensemble the two models' scores per
                # target and compute BEDROC/AUROC/EF metrics.
                metrics = {}
                target_id_list = sorted(list(os.listdir(f"./result/pocket_ranking/{test_set}")))
                for target_id in target_id_list:
                    lig_act = np.load(f"./result/pocket_ranking/{test_set}/{target_id}/saved_labels.npy")
                    score_1 = np.load(f"./result/pocket_ranking/{test_set}/{target_id}/GNN_res_epoch9.npy")
                    score_2 = np.load(f"./result/protein_ranking/{test_set}/{target_id}/GNN_res_epoch9.npy")

                    # Simple average of the two models' scores.
                    score = (score_1 + score_2)/2
                    metrics[target_id] = cal_metrics(score, lig_act)

                json.dump(metrics, open(f"./result/pocket_ranking/{test_set}_metrics.json", "w"))
                print_avg_metric(metrics, "Ours")
            elif test_set in ["FEP"]:
                # FEP benchmark: ensemble all repeats of both models, then
                # report correlation metrics.
                # NOTE(review): target_id_list is assigned but never used in
                # this branch.
                target_id_list = sorted(list(os.listdir(f"./result/pocket_ranking/{test_set}")))
                res_all_pocket, res_all_protein = [], []
                for repeat in range(1, 6):
                    res_pocket = read_zeroshot_res(f"./result/pocket_ranking/{test_set}/repeat_{repeat}")
                    res_protein = read_zeroshot_res(f"./result/protein_ranking/{test_set}/repeat_{repeat}")
                    res_all_pocket.append(res_pocket)
                    res_all_protein.append(res_protein)
                res_all_fusion = get_ensemble_res(res_all_pocket + res_all_protein)
                metrics = get_metric(res_all_fusion)
                json.dump(metrics, open(f"./result/pocket_ranking/{test_set}_metrics.json", "w"))
                print_avg_metric(metrics, "Ours")
    elif mode == "fewshot":
        test_set = sys.argv[2]
        support_num = sys.argv[3]
        # Only epochs [begin, end) of each run are ensembled.
        begin = 15
        end = 20
        metric_fusion_all = []
        for seed in range(1, 11):
            res_repeat_pocket = []
            res_repeat_seq = []

            if test_set in ["TIME", "OOD"]:
                res_file_pocket = f"./result/pocket_ranking/{test_set}/random_{seed}_sup{support_num}.jsonl"
                # NOTE(review): res_file_seq points at pocket_ranking — the
                # same file as res_file_pocket — in both fewshot branches.
                # By analogy with the zeroshot branch it likely should read
                # protein_ranking; as written the "fusion" just averages a
                # model with itself. Confirm before relying on these numbers.
                res_file_seq = f"./result/pocket_ranking/{test_set}/random_{seed}_sup{support_num}.jsonl"
                if not os.path.exists(res_file_pocket):
                    continue
                # First JSONL line is a header; per-epoch results follow.
                res_repeat_pocket = [json.loads(line) for line in open(res_file_pocket)][1:]
                res_repeat_seq = [json.loads(line) for line in open(res_file_seq)][1:]
            elif test_set in ["FEP_fewshot"]:
                for repeat in range(1, 6):
                    res_file_pocket = f"./result/pocket_ranking/{test_set}/repeat_{repeat}/random_{seed}_sup{support_num}.jsonl"
                    # NOTE(review): same pocket_ranking path issue as above.
                    res_file_seq = f"./result/pocket_ranking/{test_set}/repeat_{repeat}/random_{seed}_sup{support_num}.jsonl"
                    if not os.path.exists(res_file_pocket):
                        continue
                    res_pocket = [json.loads(line) for line in open(res_file_pocket)][1:]
                    res_seq = [json.loads(line) for line in open(res_file_seq)][1:]
                    # Ensemble over the selected epoch window per repeat.
                    res_pocket = get_ensemble_res(res_pocket, begin, end)
                    res_seq = get_ensemble_res(res_seq, begin, end)
                    res_repeat_pocket.append(res_pocket)
                    res_repeat_seq.append(res_seq)

            # Fuse all repeats of both result sets, then score this seed.
            res_repeat_fusion = get_ensemble_res(res_repeat_pocket + res_repeat_seq)
            metric_fusion_all.append(get_metric(res_repeat_fusion))
        # Transpose seed-major -> target-major before averaging over seeds.
        metric_fusion_all = avg_metric(list(map(list, zip(*metric_fusion_all))))
        json.dump(metric_fusion_all, open(f"./result/pocket_ranking/{test_set}_metrics.json", "w"))
        print_avg_metric(metric_fusion_all, "Ours")
py_scripts/__init__.py ADDED
File without changes
py_scripts/write_case_study.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import gzip
3
+ import json
4
+ import multiprocessing as mp
5
+ import os
6
+ import pickle
7
+ import random
8
+
9
+ import lmdb
10
+ import numpy as np
11
+ import pandas as pd
12
+ import rdkit
13
+ import rdkit.Chem.AllChem as AllChem
14
+ import torch
15
+ import tqdm
16
+ from biopandas.mol2 import PandasMol2
17
+ from biopandas.pdb import PandasPdb
18
+ from rdkit import Chem, RDLogger
19
+ from rdkit.Chem.MolStandardize import rdMolStandardize
20
+
21
+ RDLogger.DisableLog('rdApp.*')
22
+
23
def gen_conformation(mol, num_conf=20, num_worker=8):
    """Embed up to `num_conf` 3D conformers for `mol` and MMFF-optimize them.

    Returns the molecule (hydrogens removed) with conformers attached, or
    None when embedding fails or produces no conformer.
    """
    try:
        mol = Chem.AddHs(mol)
        AllChem.EmbedMultipleConfs(mol, numConfs=num_conf, numThreads=num_worker, pruneRmsThresh=1, maxAttempts=10000, useRandomCoords=False)
        try:
            AllChem.MMFFOptimizeMoleculeConfs(mol, numThreads=num_worker)
        except Exception:
            # MMFF parameters are missing for some molecules; keep the
            # unoptimized embedded conformers in that case.
            pass
        mol = Chem.RemoveHs(mol)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        print("cannot gen conf", Chem.MolToSmiles(mol))
        return None
    if mol.GetNumConformers() == 0:
        print("cannot gen conf", Chem.MolToSmiles(mol))
        return None
    return mol
39
+
40
def convert_2Dmol_to_data(smi, num_conf=1, num_worker=5):
    """Parse a SMILES string, embed 3D conformers, and return a record dict.

    Returns None when the SMILES cannot be parsed or no conformer can be
    generated; otherwise a dict with 'coords', 'atom_types', 'smi', 'mol'.
    """
    molecule = Chem.MolFromSmiles(smi)
    if molecule is None:
        return None
    molecule = gen_conformation(molecule, num_conf, num_worker)
    if molecule is None:
        return None
    conformer_coords = [
        np.array(molecule.GetConformer(conf_id).GetPositions())
        for conf_id in range(molecule.GetNumConformers())
    ]
    symbols = [atom.GetSymbol() for atom in molecule.GetAtoms()]
    return {'coords': conformer_coords, 'atom_types': symbols, 'smi': smi, 'mol': molecule}
51
+
52
def convert_3Dmol_to_data(mol):
    """Turn an RDKit molecule that already has conformers into a record dict.

    Returns None for a None input; otherwise a dict with 'coords',
    'atom_types', 'smi' (canonical SMILES) and 'mol'.
    """
    if mol is None:
        return None
    conformer_coords = [
        np.array(mol.GetConformer(conf_id).GetPositions())
        for conf_id in range(mol.GetNumConformers())
    ]
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    return {'coords': conformer_coords, 'atom_types': symbols, 'smi': Chem.MolToSmiles(mol), 'mol': mol}
59
+
60
def read_pdb(path):
    """Load a PDB file into a dict of ATOM-record coordinates and names.

    'residue_name' is chain id + residue number (e.g. 'A42'), which uniquely
    identifies a residue; 'residue_type' is the 3-letter amino-acid code.
    """
    atom_df = PandasPdb().read_pdb(path).df['ATOM']
    residue_ids = atom_df['chain_id'] + atom_df['residue_number'].astype(str)
    return {
        'coord': np.array(atom_df[['x_coord', 'y_coord', 'z_coord']]),
        'atom_type': list(atom_df['atom_name']),
        'residue_name': list(residue_ids),
        'residue_type': list(atom_df['residue_name']),
    }
72
+
73
+
74
def read_sdf_gz_3d(path):
    """Read a gzipped SDF, fix common valence problems, strip Hs and uncharge.

    Returns the list of molecules that survived sanitization.
    """
    # Fix: the gzip handle was previously opened and never closed (leak);
    # close it deterministically once the supplier is exhausted.
    with gzip.open(path) as inf:
        with Chem.ForwardSDMolSupplier(inf, removeHs=False, sanitize=False) as gzsuppl:
            ms = [add_charges(x) for x in gzsuppl if x is not None]
    ms = [rdMolStandardize.Uncharger().uncharge(Chem.RemoveHs(m)) for m in ms if m is not None]
    return ms
80
+
81
def add_charges(m):
    """Best-effort sanitization: patch common valence errors, then sanitize.

    Fixes applied before sanitization: quaternary N -> +1, pentavalent C ->
    demote one double bond, trivalent O -> +1, tetravalent B -> -1.
    Returns the sanitized molecule, or None if sanitization still fails.
    """
    m.UpdatePropertyCache(strict=False)
    ps = Chem.DetectChemistryProblems(m)
    if not ps:
        Chem.SanitizeMol(m)
        return m
    for p in ps:
        if p.GetType() == 'AtomValenceException':
            at = m.GetAtomWithIdx(p.GetAtomIdx())
            if at.GetAtomicNum() == 7 and at.GetFormalCharge() == 0 and at.GetExplicitValence() == 4:
                at.SetFormalCharge(1)
            if at.GetAtomicNum() == 6 and at.GetExplicitValence() == 5:
                # Demote one double bond to single to bring carbon back to
                # a legal valence.
                for b in at.GetBonds():
                    if b.GetBondType() == Chem.rdchem.BondType.DOUBLE:
                        b.SetBondType(Chem.rdchem.BondType.SINGLE)
                        break
            if at.GetAtomicNum() == 8 and at.GetFormalCharge() == 0 and at.GetExplicitValence() == 3:
                at.SetFormalCharge(1)
            if at.GetAtomicNum() == 5 and at.GetFormalCharge() == 0 and at.GetExplicitValence() == 4:
                at.SetFormalCharge(-1)
    try:
        Chem.SanitizeMol(m)
    except Exception:
        # Narrowed from a bare `except:`; this is a deliberate best-effort
        # path — molecules that still fail sanitization are dropped.
        return None
    return m
107
+
108
def get_different_raid(protein, ligand, raid=6):
    """Return the set of protein residue names with any atom within `raid`
    Angstroms of any ligand atom.

    `protein` must provide 'coord' (N, 3) and 'residue_name' (length N);
    `ligand` must provide 'coord' (M, 3).  ("raid" keeps the original
    parameter spelling; it is the contact radius, strict `<` comparison.)
    """
    protein_coord = np.asarray(protein['coord'], dtype=float)
    ligand_coord = np.asarray(ligand['coord'], dtype=float)
    protein_residue_name = protein['residue_name']
    # Guard empty inputs — broadcasting over a 0-length axis would be fine,
    # but reshaping a truly empty array to (0, 3) is not guaranteed.
    if len(protein_coord) == 0 or len(ligand_coord) == 0:
        return set()
    # Vectorized pairwise distances replace the original O(N*M) Python
    # double loop with per-pair np.linalg.norm calls.
    dist = np.linalg.norm(protein_coord[:, None, :] - ligand_coord[None, :, :], axis=-1)
    near = (dist < raid).any(axis=1)
    return {protein_residue_name[i] for i in np.nonzero(near)[0]}
118
+
119
def read_mol2_ligand(path):
    """Load a mol2 ligand: coordinates, atom names, and the RDKit molecule."""
    mol2 = PandasMol2().read_mol2(path)
    return {
        'coord': np.array(mol2.df[['x', 'y', 'z']]),
        'atom_type': list(mol2.df['atom_name']),
        'mol': Chem.MolFromMol2File(path),
    }
125
+
126
def read_smi_mol(path):
    """Read a .smi-style file (SMILES is the first whitespace-separated token
    of each line) and parse each SMILES with RDKit.

    Blank lines are skipped (previously they produced an empty-string SMILES).
    Unparsable SMILES still yield None entries, as before, so downstream
    `is not None` filters keep working.
    """
    with open(path, 'r') as f:
        # str.split() (no argument) also strips the trailing newline that
        # `split(' ')` left attached when a line had no name column.
        smis = [line.split()[0] for line in f if line.strip()]
    return [Chem.MolFromSmiles(smi) for smi in smis]
132
+
133
def parser(protein_path, mol_path, ligand_path, activity, pocket_index, raid=6):
    """Pair every ligand in `mol_path` with the pocket of `protein_path`.

    The pocket is the set of protein atoms whose residue lies within `raid`
    Angstroms of the crystal ligand in `ligand_path` (.mol2).  Returns one
    record dict per ligand for which 3D conformers could be generated.
    """
    protein = read_pdb(protein_path)
    data_mols = read_smi_mol(mol_path)

    ligand = read_mol2_ligand(ligand_path)
    pocket_residue = get_different_raid(protein, ligand, raid=raid)
    pocket_atom_idx = [i for i, r in enumerate(protein['residue_name']) if r in pocket_residue]
    pocket_atom_type = [protein['atom_type'][i] for i in pocket_atom_idx]
    pocket_coord = [protein['coord'][i] for i in pocket_atom_idx]
    # Pocket name is the parent directory of the protein file.
    pocket_name = protein_path.split('/')[-2]
    # Fix: convert_2Dmol_to_data expects a SMILES string (it calls
    # Chem.MolFromSmiles), but read_smi_mol returns Mol objects — passing
    # them directly made every worker raise.  Convert back to SMILES first.
    smis = [Chem.MolToSmiles(m) for m in data_mols if m is not None]
    # Close worker processes deterministically instead of leaking the pool.
    with mp.Pool(32) as pool:
        mols = list(pool.imap_unordered(convert_2Dmol_to_data, smis))
    mols = [m for m in mols if m is not None]

    return [{'atoms': m['atom_types'],
             'coordinates': m['coords'],
             'smi': m['smi'],
             'mol': ligand,
             'pocket_name': pocket_name,
             'pocket_index': pocket_index,
             'activity': activity,
             "pocket_atom_type": pocket_atom_type,
             "pocket_coord": pocket_coord} for m in mols]
159
+
160
def mol_parser(ligand_smis):
    """Convert SMILES strings to conformer records labeled as active (1).

    SMILES that cannot be parsed or embedded are silently dropped.
    """
    # Close worker processes deterministically instead of leaking the pool.
    with mp.Pool(16) as pool:
        mols = list(pool.imap_unordered(convert_2Dmol_to_data, tqdm.tqdm(ligand_smis)))
    mols = [m for m in mols if m is not None]
    return [{'atoms': m['atom_types'],
             'coordinates': m['coords'],
             'smi': m['smi'],
             'mol': m['mol'],
             'label': 1,
             } for m in mols]
170
+
171
def pocket_parser(protein_path, ligand_path, pocket_index, pocket_name, raid=6):
    """Extract the binding pocket of a protein around its crystal ligand.

    Selects every protein atom whose residue has at least one atom within
    `raid` Angstroms of the ligand, and returns the pocket record dict.
    """
    protein = read_pdb(protein_path)
    crystal_ligand = read_mol2_ligand(ligand_path)
    contact_residues = get_different_raid(protein, crystal_ligand, raid=raid)
    selected = [i for i, name in enumerate(protein['residue_name']) if name in contact_residues]
    return {
        'pocket': pocket_name,
        'pocket_index': pocket_index,
        "pocket_atoms": [protein['atom_type'][i] for i in selected],
        "pocket_coordinates": [protein['coord'][i] for i in selected],
        "pocket_residue_type": [protein['residue_type'][i] for i in selected],
        "pocket_residue_name": [protein['residue_name'][i] for i in selected],
    }
186
+
187
def write_lmdb(data, lmdb_path):
    """Write an iterable of picklable records to a fresh LMDB file.

    Keys are sequential integers as ASCII bytes; any existing file at
    `lmdb_path` is replaced.
    """
    # Fix: replace `os.system(f"rm {path}")` — non-portable and unsafe for
    # paths containing spaces/shell metacharacters — with os.remove.
    if os.path.exists(lmdb_path):
        os.remove(lmdb_path)
    env = lmdb.open(lmdb_path, subdir=False, readonly=False, lock=False, readahead=False, meminit=False, map_size=1099511627776)
    try:
        with env.begin(write=True) as txn:
            for num, d in enumerate(data):
                txn.put(str(num).encode('ascii'), pickle.dumps(d))
    finally:
        # Close the environment so the file handle is not leaked.
        env.close()
197
+
198
import sys

if __name__ == '__main__':
    # Command-line entry point with two sub-modes:
    #   mol    <lig_json> <lig_lmdb>                 build a ligand LMDB from a
    #                                                JSON list of SMILES
    #   pocket <prot_pdb> <crystal_mol2> <out_lmdb>  build a one-pocket LMDB
    mode = sys.argv[1]

    if mode == 'mol':
        smiles_path = sys.argv[2]
        output_path = sys.argv[3]

        # Read the ligand SMILES and de-duplicate them.
        unique_smis = list(set(json.load(open(smiles_path))))
        print("number of ligands", len(unique_smis))
        records = []
        records.extend(mol_parser(unique_smis))

        # Write the ligand LMDB.
        write_lmdb(records, output_path)
    elif mode == 'pocket':
        protein_path = sys.argv[2]
        crystal_ligand_path = sys.argv[3]  # must be a .mol2 file
        output_path = sys.argv[4]

        # Write the single-pocket LMDB.
        pocket_record = pocket_parser(protein_path, crystal_ligand_path, 1, "demo")
        write_lmdb([pocket_record], output_path)
225
+
226
+
227
+
test.sh ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size=256
2
+
3
+ TASK=${1}
4
+ arch=${2}
5
+ weight_path=${3}
6
+ results_path=${4}
7
+ echo "writing to ${results_path}"
8
+
9
+ mkdir -p $results_path
10
+ python ./unimol/test.py "./test_datasets" --user-dir ./unimol --valid-subset test \
11
+ --results-path $results_path \
12
+ --num-workers 8 --ddp-backend=c10d --batch-size $batch_size \
13
+ --task test_task --loss rank_softmax --arch $arch \
14
+ --fp16 --fp16-init-scale 4 --fp16-scale-window 256 --seed 1 \
15
+ --path $weight_path \
16
+ --log-interval 100 --log-format simple \
17
+ --max-pocket-atoms 511 \
18
+ --test-task $TASK
test_fewshot.sh ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_path="./test_datasets"
2
+
3
+ TASK=${1}
4
+ arch=${2}
5
+ sup_num=${3}
6
+ weight_path=${4}
7
+ results_path=${5}
8
+
9
+ n_gpu=1
10
+ batch_size=8
11
+ batch_size_valid=16
12
+ epoch=10
13
+ update_freq=1
14
+ lr=1e-4
15
+ MASTER_PORT=10092
16
+ export NCCL_ASYNC_ERROR_HANDLING=1
17
+ export OMP_NUM_THREADS=1
18
+ seed=1
19
+
20
+ torchrun --nproc_per_node=$n_gpu --master_port=$MASTER_PORT $(which unicore-train) $data_path --user-dir ./unimol --train-subset train --valid-subset valid \
21
+ --results-path $results_path \
22
+ --num-workers 8 --ddp-backend=c10d \
23
+ --task train_task --loss rank_softmax --arch $arch \
24
+ --max-pocket-atoms 256 \
25
+ --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-8 --clip-norm 1.0 \
26
+ --lr-scheduler polynomial_decay --lr $lr --max-epoch $epoch --batch-size $batch_size --batch-size-valid $batch_size_valid \
27
+ --update-freq $update_freq --seed $seed \
28
+ --log-interval 1 --log-format simple \
29
+ --validate-interval 1 \
30
+ --best-checkpoint-metric valid_mean_r2 --patience 100 --all-gather-list-size 2048000 \
31
+ --no-save --save-dir $results_path --tmp-save-dir $results_path \
32
+ --find-unused-parameters \
33
+ --maximize-best-checkpoint-metric \
34
+ --split-method random --valid-set $TASK \
35
+ --max-lignum 512 \
36
+ --sup-num $sup_num \
37
+ --restore-model $weight_path --few-shot true \
38
+ --fp16 --fp16-init-scale 4 --fp16-scale-window 256
test_fewshot_demo.sh ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data_path="./vocab"
2
+
3
+ n_gpu=1
4
+ batch_size=1
5
+ batch_size_valid=1
6
+ epoch=20
7
+ update_freq=1
8
+ lr=1e-4
9
+ MASTER_PORT=10092
10
+
11
+ export NCCL_ASYNC_ERROR_HANDLING=1
12
+ export OMP_NUM_THREADS=1
13
+
14
+ arch=${1}
15
+ weight_path=${2}
16
+ results_path=${3}
17
+ lig_file=${4}
18
+ prot_file=${5}
19
+ split_file=${6}
20
+
21
+ sup_num=16
22
+ seed=1
23
+
24
+ torchrun --nproc_per_node=$n_gpu --master_port=$MASTER_PORT $(which unicore-train) $data_path --user-dir ./unimol --train-subset train --valid-subset valid \
25
+ --results-path $results_path \
26
+ --num-workers 8 --ddp-backend=c10d \
27
+ --task train_task --loss rank_softmax --arch $arch \
28
+ --max-pocket-atoms 256 \
29
+ --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-8 --clip-norm 1.0 \
30
+ --lr-scheduler polynomial_decay --lr $lr --max-epoch $epoch --batch-size $batch_size --batch-size-valid $batch_size_valid \
31
+ --update-freq $update_freq --seed $seed \
32
+ --log-interval 1 --log-format simple \
33
+ --validate-interval 1 \
34
+ --best-checkpoint-metric valid_mean_r2 --patience 100 --all-gather-list-size 2048000 \
35
+ --no-save --save-dir ./tmp --tmp-save-dir ./tmp \
36
+ --find-unused-parameters \
37
+ --maximize-best-checkpoint-metric \
38
+ --split-method random --valid-set DEMO \
39
+ --max-lignum 512 \
40
+ --sup-num $sup_num \
41
+ --restore-model $weight_path --few-shot true \
42
+ --demo-lig-file $lig_file --demo-prot-file $prot_file --demo-split-file $split_file \
43
+ --fp16 --fp16-init-scale 4 --fp16-scale-window 256
test_zeroshot_demo.sh ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ batch_size=128
2
+
3
+ lig_file=${1}
4
+ prot_file=${2}
5
+ uniprot=${3}
6
+ arch=${4}
7
+ weight_path=${5}
8
+ results_path=${6}
9
+ echo "writing to ${results_path}"
10
+
11
+ mkdir -p $results_path
12
+ python ./unimol/test.py "./vocab" --user-dir ./unimol --valid-subset test \
13
+ --results-path $results_path \
14
+ --num-workers 8 --ddp-backend=c10d --batch-size $batch_size \
15
+ --task test_task --loss rank_softmax --arch $arch \
16
+ --fp16 --fp16-init-scale 4 --fp16-scale-window 256 --seed 1 \
17
+ --path $weight_path \
18
+ --log-interval 100 --log-format simple \
19
+ --max-pocket-atoms 511 --demo-lig-file $lig_file --demo-prot-file $prot_file --demo-uniprot $uniprot \
20
+ --test-task DEMO
train.sh ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Train the four pocket-screening model variants sequentially.  The original
# script repeated the same ~30-line launch block four times; the shared part
# now lives in run_training, and each variant only states what differs.
data_path="./data"
save_root="./save"

n_gpu=2
MASTER_PORT=10062
finetune_mol_model="./pretrain/mol_pre_no_h_220816.pt"     # unimol pretrained mol model
finetune_pocket_model="./pretrain/pocket_pre_220816.pt"    # unimol pretrained pocket model

batch_size=24
batch_size_valid=32
epoch=50
dropout=0.0
warmup=0.06
update_freq=1
dist_threshold=8.0
recycling=3
lr=1e-4

export NCCL_ASYNC_ERROR_HANDLING=1
export OMP_NUM_THREADS=1

# Fix: training logs are redirected into ${save_root}/train_log, but that
# directory was never created, so every redirection failed.
mkdir -p "${save_root}/train_log"

# run_training <save_name> [extra unicore-train flags...]
# Creates the per-variant save directories and launches distributed training,
# logging to ${save_root}/train_log/train_log_<save_name>.txt.
run_training() {
    local save_name=$1
    shift
    local save_dir="${save_root}/${save_name}/savedir_screen"
    local tmp_save_dir="${save_root}/${save_name}/tmp_save_dir_screen"
    local tsb_dir="${save_root}/${save_name}/tsb_dir_screen"
    mkdir -p "${save_dir}"
    CUDA_VISIBLE_DEVICES="0,1" python -m torch.distributed.launch --nproc_per_node=$n_gpu --master_port=$MASTER_PORT $(which unicore-train) $data_path --user-dir ./unimol --train-subset train --valid-subset valid \
           --num-workers 8 --ddp-backend=c10d \
           --task train_task --loss rank_softmax --arch pocketscreen \
           --max-pocket-atoms 256 \
           --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-8 --clip-norm 1.0 \
           --lr-scheduler polynomial_decay --lr $lr --warmup-ratio $warmup --max-epoch $epoch --batch-size $batch_size --batch-size-valid $batch_size_valid \
           --fp16 --fp16-init-scale 4 --fp16-scale-window 256 --update-freq $update_freq --seed 1 \
           --tensorboard-logdir $tsb_dir \
           --log-interval 100 --log-format simple \
           --validate-interval 1 \
           --best-checkpoint-metric valid_bedroc --patience 2000 --all-gather-list-size 2048000 \
           --save-dir $save_dir --tmp-save-dir $tmp_save_dir --keep-best-checkpoints 8 --keep-last-epochs 10 \
           --find-unused-parameters \
           --maximize-best-checkpoint-metric \
           --finetune-pocket-model $finetune_pocket_model \
           --finetune-mol-model $finetune_mol_model \
           --valid-set CASF \
           --max-lignum 16 \
           "$@" > "${save_root}/train_log/train_log_${save_name}.txt"
}

# Full model.
run_training "screen_pocket" --protein-similarity-thres 1.0
# Ablation: ranking loss disabled.
run_training "screen_pocket_norank" --protein-similarity-thres 1.0 --rank-weight 0.0
# Ablations: exclude training proteins similar to the validation set.
run_training "screen_pocket_no_similar_protein0.8" --protein-similarity-thres 0.8
run_training "screen_pocket_no_similar_protein" --protein-similarity-thres 0.4
unimol/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ import importlib
2
+ import unimol.tasks
3
+ import unimol.data
4
+ import unimol.models
5
+ import unimol.losses
6
+ import unimol.utils
unimol/data/__init__.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .key_dataset import KeyDataset, LengthDataset
2
+ from .normalize_dataset import (
3
+ NormalizeDataset,
4
+ NormalizeDockingPoseDataset,
5
+ )
6
+ from .remove_hydrogen_dataset import (
7
+ RemoveHydrogenDataset,
8
+ RemoveHydrogenResiduePocketDataset,
9
+ RemoveHydrogenPocketDataset,
10
+ )
11
+ from .tta_dataset import (
12
+ TTADataset,
13
+ TTADecoderDataset,
14
+ TTADockingPoseDataset,
15
+ )
16
+ from .cropping_dataset import (
17
+ CroppingDataset,
18
+ CroppingPocketDataset,
19
+ CroppingResiduePocketDataset,
20
+ CroppingPocketDockingPoseDataset,
21
+ CroppingPocketDockingPoseTestDataset,
22
+ )
23
+ from .atom_type_dataset import AtomTypeDataset
24
+ from .add_2d_conformer_dataset import Add2DConformerDataset
25
+ from .distance_dataset import (
26
+ DistanceDataset,
27
+ EdgeTypeDataset,
28
+ CrossDistanceDataset,
29
+ CrossEdgeTypeDataset
30
+ )
31
+ from .conformer_sample_dataset import (
32
+ ConformerSampleDataset,
33
+ ConformerSampleDecoderDataset,
34
+ ConformerSamplePocketDataset,
35
+ ConformerSamplePocketFinetuneDataset,
36
+ ConformerSampleConfGDataset,
37
+ ConformerSampleConfGV2Dataset,
38
+ ConformerSampleDockingPoseDataset,
39
+ )
40
+ from .mask_points_dataset import MaskPointsDataset, MaskPointsPocketDataset
41
+ from .coord_pad_dataset import RightPadDatasetCoord, RightPadDatasetCross2D
42
+ from .from_str_dataset import FromStrLabelDataset
43
+ from .lmdb_dataset import LMDBDataset
44
+ from .prepend_and_append_2d_dataset import PrependAndAppend2DDataset
45
+ from .affinity_dataset import AffinityDataset, AffinityTestDataset, AffinityValidDataset, AffinityMolDataset, AffinityPocketDataset, AffinityHNSDataset, AffinityAugDataset
46
+ from .pocket2mol_dataset import FragmentConformationDataset
47
+ from .vae_binding_dataset import VAEBindingDataset, VAEBindingTestDataset, VAEGenerationTestDataset
48
+ from .resampling_dataset import ResamplingDataset
49
+ from .pair_dataset import PairDataset
50
+ __all__ = []
unimol/data/add_2d_conformer_dataset.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ import numpy as np
6
+ from functools import lru_cache
7
+ from unicore.data import BaseWrapperDataset
8
+ from rdkit import Chem
9
+ from rdkit.Chem import AllChem
10
+
11
+
12
class Add2DConformerDataset(BaseWrapperDataset):
    """Wrapper dataset that appends an RDKit 2D-layout conformer to each
    item's conformer list, so downstream samplers can draw the 2D pose in
    addition to the stored 3D ones.

    `smi`, `atoms` and `coordinates` are the keys under which the wrapped
    dataset stores the SMILES string, atom symbols and list of conformers.
    """

    def __init__(self, dataset, smi, atoms, coordinates):
        self.dataset = dataset
        self.smi = smi                  # key of the SMILES string
        self.atoms = atoms              # key of the atom-symbol list
        self.coordinates = coordinates  # key of the conformer list
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        # NOTE(review): lru_cache on a method keeps `self` alive for the
        # cache lifetime; this mirrors the pattern used by the other
        # dataset wrappers in this package.
        atoms = np.array(self.dataset[index][self.atoms])
        assert len(atoms) > 0
        smi = self.dataset[index][self.smi]
        coordinates_2d = smi2_2Dcoords(smi)
        coordinates = self.dataset[index][self.coordinates]
        # Appends to the underlying item's conformer list in place — the
        # wrapped dataset's entry is mutated, not copied.
        coordinates.append(coordinates_2d)
        return {"smi": smi, "atoms": atoms, "coordinates": coordinates}

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
36
+
37
+
38
def smi2_2Dcoords(smi):
    """Compute 2D layout coordinates for a SMILES string (Hs added first).

    Returns a float32 (n_atoms, 3) array from RDKit's 2D coordinate
    generator.  Raises AssertionError if the atom/coordinate counts
    disagree.
    """
    mol = Chem.MolFromSmiles(smi)
    mol = AllChem.AddHs(mol)
    AllChem.Compute2DCoords(mol)
    coordinates = mol.GetConformer().GetPositions().astype(np.float32)
    # Bug fix: the original comparison was a bare expression with no effect —
    # the `assert` keyword was missing, so the check never ran.
    assert len(mol.GetAtoms()) == len(
        coordinates
    ), "2D coordinates shape is not align with {}".format(smi)
    return coordinates
unimol/data/affinity_dataset.py ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+
6
+ from functools import lru_cache
7
+
8
+ import numpy as np
9
+ from unicore.data import BaseWrapperDataset
10
+ import pickle
11
+ from . import data_utils
12
+
13
+
14
class AffinityDataset(BaseWrapperDataset):
    """Wraps a pocket/ligand dataset and yields one item per complex with a
    deterministically sampled ligand conformer, pocket atoms/coordinates and
    the affinity label.

    The `atoms`/`coordinates`/`pocket_atoms`/`pocket_coordinates`/`affinity`
    arguments are the keys under which the wrapped dataset stores each field.
    When training, the conformer choice varies per epoch; otherwise it is
    fixed so validation is reproducible.
    """

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        pocket_atoms,
        pocket_coordinates,
        affinity,
        is_train=False,
        pocket="pocket"
    ):
        self.dataset = dataset
        self.seed = seed  # base seed for deterministic conformer sampling
        self.atoms = atoms
        self.coordinates = coordinates
        self.pocket_atoms = pocket_atoms
        self.pocket_coordinates = pocket_coordinates
        self.affinity = affinity
        self.is_train = is_train
        self.pocket=pocket
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    def pocket_atom(self, atom):
        # PDB atom names like '1HB' start with a digit; in that case the
        # element letter is the second character, otherwise the first.
        if atom[0] in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
            return atom[1]
        else:
            return atom[0]

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        atoms = np.array(self.dataset[index][self.atoms])
        ori_mol_length = len(atoms)
        # Number of stored conformers for this ligand.
        size = len(self.dataset[index][self.coordinates])
        if self.is_train:
            # Per-epoch sampling: a different conformer may be drawn each epoch.
            with data_utils.numpy_seed(self.seed, epoch, index):
                sample_idx = np.random.randint(size)
        else:
            # Fixed epoch value (1) -> the same conformer on every pass.
            with data_utils.numpy_seed(self.seed, 1, index):
                sample_idx = np.random.randint(size)
        coordinates = self.dataset[index][self.coordinates][sample_idx]
        pocket_atoms = np.array(
            [self.pocket_atom(item) for item in self.dataset[index][self.pocket_atoms]]
        )
        ori_pocket_length = len(pocket_atoms)
        pocket_coordinates = np.stack(self.dataset[index][self.pocket_coordinates])

        smi = self.dataset[index]["smi"]
        pocket = self.dataset[index][self.pocket]
        if self.affinity in self.dataset[index]:
            affinity = float(self.dataset[index][self.affinity])
        else:
            # Items without an affinity entry get 1 — presumably a
            # placeholder label for unlabeled/screening data; confirm with
            # the loss implementation.
            affinity = 1
        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "holo_coordinates": coordinates.astype(np.float32),#placeholder
            "pocket_atoms": pocket_atoms,
            "pocket_coordinates": pocket_coordinates.astype(np.float32),
            "holo_pocket_coordinates": pocket_coordinates.astype(np.float32),#placeholder
            "smi": smi,
            "pocket": pocket,
            "affinity": affinity,
            "ori_mol_length": ori_mol_length,
            "ori_pocket_length": ori_pocket_length
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
91
+
92
+
93
+ class AffinityAugDataset(BaseWrapperDataset):
94
+ def __init__(
95
+ self,
96
+ dataset,
97
+ seed,
98
+ atoms,
99
+ coordinates,
100
+ pocket_atoms,
101
+ pocket_coordinates,
102
+ affinity,
103
+ is_train=False,
104
+ pocket="pocket_id"
105
+ ):
106
+ self.dataset = dataset
107
+ self.seed = seed
108
+ self.atoms = atoms
109
+ self.coordinates = coordinates
110
+ self.pocket_atoms = pocket_atoms
111
+ self.pocket_coordinates = pocket_coordinates
112
+ self.affinity = affinity
113
+ self.is_train = is_train
114
+ self.pocket=pocket
115
+ self.set_epoch(None)
116
+
117
+ def set_epoch(self, epoch, **unused):
118
+ super().set_epoch(epoch)
119
+ self.epoch = epoch
120
+
121
+ def pocket_atom(self, atom):
122
+ if atom[0] in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
123
+ return atom[1]
124
+ else:
125
+ return atom[0]
126
+
127
@lru_cache(maxsize=16)
def __cached_item__(self, index: int, epoch: int):
    """Sample one (ligand copy, conformer, pocket copy) combination for `index`.

    Each entry may hold several ligand copies and several pocket copies.
    Every random draw re-seeds from (seed, epoch, index) so results are
    reproducible; the conformer draw uses the real epoch while training and
    a fixed epoch of 1 otherwise.
    """
    entry = self.dataset[index]
    # Choose which ligand copy to use.
    with data_utils.numpy_seed(self.seed, epoch, index):
        mol_idx = np.random.randint(len(entry[self.atoms]))
    atoms = np.array(entry[self.atoms][mol_idx])
    ori_mol_length = len(atoms)
    # Choose which conformer of that ligand copy to use.
    n_conf = len(entry[self.coordinates][mol_idx])
    conf_epoch = epoch if self.is_train else 1
    with data_utils.numpy_seed(self.seed, conf_epoch, index):
        sample_idx = np.random.randint(n_conf)
    coordinates = entry[self.coordinates][mol_idx][sample_idx]

    # Choose which pocket copy to use (fresh re-seeded context).
    with data_utils.numpy_seed(self.seed, epoch, index):
        pocket_idx = np.random.randint(len(entry[self.pocket_atoms]))
    pocket_atoms = np.array(
        [self.pocket_atom(a) for a in entry[self.pocket_atoms][pocket_idx]]
    )
    ori_pocket_length = len(pocket_atoms)
    pocket_coordinates = np.stack(entry[self.pocket_coordinates][pocket_idx])

    smi = entry["smiles"][mol_idx]
    pocket = entry[self.pocket][0]
    # Entries without a label (e.g. inference) fall back to a dummy affinity.
    affinity = float(entry[self.affinity]) if self.affinity in entry else 1
    return {
        "atoms": atoms,
        "coordinates": coordinates.astype(np.float32),
        "holo_coordinates": coordinates.astype(np.float32),  # placeholder
        "pocket_atoms": pocket_atoms,
        "pocket_coordinates": pocket_coordinates.astype(np.float32),
        "holo_pocket_coordinates": pocket_coordinates.astype(np.float32),  # placeholder
        "smi": smi,
        "pocket": pocket,
        "affinity": affinity,
        "ori_mol_length": ori_mol_length,
        "ori_pocket_length": ori_pocket_length,
    }
175
+
176
def __getitem__(self, index: int):
    """Delegate to the (index, epoch)-cached sampler."""
    return self.__cached_item__(index, self.epoch)
178
+
179
+
180
class AffinityHNSDataset(BaseWrapperDataset):
    """Affinity dataset that also carries hard-negative-sample (HNS) ligands.

    Each item bundles a sampled ligand conformer, the HNS ligand, and the
    protein pocket. All coordinate fields are cast to float32 and the
    "holo_*" entries are placeholders mirroring the sampled coordinates.
    """

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        atoms_hns,
        coordinates_hns,
        pocket_atoms,
        pocket_coordinates,
        affinity,
        is_train=False,
        pocket="pocket",
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.atoms_hns = atoms_hns
        self.coordinates_hns = coordinates_hns
        self.pocket_atoms = pocket_atoms
        self.pocket_coordinates = pocket_coordinates
        self.affinity = affinity
        self.is_train = is_train
        self.pocket = pocket
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    def pocket_atom(self, atom):
        """Drop a leading digit from a pocket atom name (e.g. '1HB' -> 'H')."""
        return atom[1] if atom[0] in "0123456789" else atom[0]

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Assemble one sample (ligand + HNS ligand + pocket) for `index`."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        ori_mol_length = len(atoms)
        n_conf = len(entry[self.coordinates])
        # Conformer choice varies per epoch while training, fixed otherwise.
        conf_epoch = epoch if self.is_train else 1
        with data_utils.numpy_seed(self.seed, conf_epoch, index):
            sample_idx = np.random.randint(n_conf)
        coordinates = entry[self.coordinates][sample_idx]
        atoms_hns = np.array(entry[self.atoms_hns])
        coordinates_hns = entry[self.coordinates_hns][0]

        pocket_atoms = np.array(
            [self.pocket_atom(a) for a in entry[self.pocket_atoms]]
        )
        ori_pocket_length = len(pocket_atoms)
        pocket_coordinates = np.stack(entry[self.pocket_coordinates])

        smi = entry["smi"]
        pocket = entry[self.pocket]
        # Unlabelled entries fall back to a dummy affinity of 1.
        affinity = float(entry[self.affinity]) if self.affinity in entry else 1
        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "atoms_hns": atoms_hns,
            "coordinates_hns": coordinates_hns.astype(np.float32),
            "holo_coordinates": coordinates.astype(np.float32),  # placeholder
            "pocket_atoms": pocket_atoms,
            "pocket_coordinates": pocket_coordinates.astype(np.float32),
            "holo_pocket_coordinates": pocket_coordinates.astype(np.float32),  # placeholder
            "smi": smi,
            "pocket": pocket,
            "affinity": affinity,
            "ori_mol_length": ori_mol_length,
            "ori_pocket_length": ori_pocket_length,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
267
+
268
class AffinityTestDataset(BaseWrapperDataset):
    """Test-time affinity dataset.

    Returns one seeded conformer per item. Unlike the training variants, the
    raw affinity value is returned via ``.astype(np.float32)`` (the stored
    value is expected to support that call — presumably a numpy value).
    """

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        pocket_atoms,
        pocket_coordinates,
        affinity=None,
        is_train=False,
        pocket="pocket",
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.pocket_atoms = pocket_atoms
        self.pocket_coordinates = pocket_coordinates
        self.affinity = affinity
        self.is_train = is_train
        self.pocket = pocket
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch for reproducible conformer choice."""
        super().set_epoch(epoch)
        self.epoch = epoch

    def pocket_atom(self, atom):
        """Drop a leading digit from a pocket atom name (e.g. '1HB' -> 'H')."""
        return atom[1] if atom[0] in "0123456789" else atom[0]

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Build one evaluation sample for `index` at `epoch`."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        ori_length = len(atoms)
        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.randint(len(entry[self.coordinates]))
        coordinates = entry[self.coordinates][pick]
        pocket_atoms = np.array(
            [self.pocket_atom(a) for a in entry[self.pocket_atoms]]
        )
        pocket_coordinates = np.stack(entry[self.pocket_coordinates])

        smi = entry["smi"]
        pocket = entry[self.pocket]
        affinity = entry[self.affinity]
        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "holo_coordinates": coordinates.astype(np.float32),  # placeholder
            "pocket_atoms": pocket_atoms,
            "pocket_coordinates": pocket_coordinates.astype(np.float32),
            "holo_pocket_coordinates": pocket_coordinates.astype(np.float32),  # placeholder
            "smi": smi,
            "pocket": pocket,
            "affinity": affinity.astype(np.float32),
            "ori_length": ori_length,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
335
+
336
+
337
class AffinityMolDataset(BaseWrapperDataset):
    """Molecule-only view of an affinity dataset (no pocket fields)."""

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        is_train=False,
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.is_train = is_train
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch for seeded evaluation sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    def pocket_atom(self, atom):
        """Drop a leading digit from an atom name (kept for interface parity)."""
        return atom[1] if atom[0] in "0123456789" else atom[0]

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Sample one conformer and package the molecule fields."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        ori_length = len(atoms)
        n_conf = len(entry[self.coordinates])

        # TODO: FB: introduce enough random when training using pairwise data
        # Training draws from the global RNG on purpose (extra randomness);
        # evaluation is seeded so the same conformer is always picked.
        if self.is_train:
            pick = np.random.randint(n_conf)
        else:
            with data_utils.numpy_seed(self.seed, index):
                pick = np.random.randint(n_conf)
        # Entries may store a list of 2-D conformers or a single 2-D array.
        candidate = entry[self.coordinates][pick]
        if len(candidate.shape) == 2:
            coordinates = candidate
        else:
            coordinates = entry[self.coordinates]

        smi = entry["smi"]
        name = entry.get("name", None)
        # Pickle the raw mol object so it survives batching/collation.
        mol = pickle.dumps(entry.get("mol", None))
        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "holo_coordinates": coordinates.astype(np.float32),  # placeholder
            "smi": smi,
            "ori_length": ori_length,
            "name": name,
            "mol": mol,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
402
+
403
+
404
class AffinityPocketDataset(BaseWrapperDataset):
    """Pocket-only view of an affinity dataset.

    When the entry carries a "pocket_residue_name" field, the residue labels
    are filtered down to non-hydrogen atoms; otherwise a single empty string
    is returned as a placeholder.
    """

    def __init__(
        self,
        dataset,
        seed,
        pocket_atoms,
        pocket_coordinates,
        is_train=False,
        pocket="pocket",
    ):
        self.dataset = dataset
        self.seed = seed
        self.pocket_atoms = pocket_atoms
        self.pocket_coordinates = pocket_coordinates
        self.is_train = is_train
        self.pocket = pocket
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch (kept for interface parity)."""
        super().set_epoch(epoch)
        self.epoch = epoch

    def pocket_atom(self, atom):
        """Drop a leading digit from a pocket atom name (e.g. '1HB' -> 'H')."""
        return atom[1] if atom[0] in "0123456789" else atom[0]

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Package the pocket fields of entry `index`."""
        entry = self.dataset[index]
        pocket_atoms = np.array(
            [self.pocket_atom(a) for a in entry[self.pocket_atoms]]
        )
        ori_length = len(pocket_atoms)
        pocket_coordinates = np.stack(entry[self.pocket_coordinates])
        pocket = entry[self.pocket] if self.pocket in entry else ""
        if "pocket_residue_name" in entry:
            # Keep residue labels only for non-hydrogen atoms.
            pocket_residue_name_noH = [
                res
                for res, atom in zip(entry["pocket_residue_name"], pocket_atoms)
                if atom != "H"
            ]
        else:
            pocket_residue_name_noH = [""]
        return {
            "pocket_atoms": pocket_atoms,
            "pocket_coordinates": pocket_coordinates.astype(np.float32),
            "holo_pocket_coordinates": pocket_coordinates.astype(np.float32),  # placeholder
            "pocket": pocket,
            "pocket_residue_name": pocket_residue_name_noH,
            "ori_length": ori_length,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
464
+
465
class AffinityValidDataset(BaseWrapperDataset):
    """Validation affinity dataset: seeded conformer choice, no affinity label."""

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        pocket_atoms,
        pocket_coordinates,
        pocket="pocket",
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.pocket_atoms = pocket_atoms
        self.pocket_coordinates = pocket_coordinates
        self.pocket = pocket
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch for reproducible conformer choice."""
        super().set_epoch(epoch)
        self.epoch = epoch

    def pocket_atom(self, atom):
        """Drop a leading digit from a pocket atom name (e.g. '1HB' -> 'H')."""
        return atom[1] if atom[0] in "0123456789" else atom[0]

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Build one validation sample for `index` at `epoch`."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        ori_mol_length = len(atoms)

        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.randint(len(entry[self.coordinates]))
        coordinates = entry[self.coordinates][pick]
        pocket_atoms = np.array(
            [self.pocket_atom(a) for a in entry[self.pocket_atoms]]
        )
        ori_pocket_length = len(pocket_atoms)
        pocket_coordinates = np.stack(entry[self.pocket_coordinates])

        smi = entry["smi"]
        pocket = entry[self.pocket]
        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "holo_coordinates": coordinates.astype(np.float32),  # placeholder
            "pocket_atoms": pocket_atoms,
            "pocket_coordinates": pocket_coordinates.astype(np.float32),
            "holo_pocket_coordinates": pocket_coordinates.astype(np.float32),  # placeholder
            "smi": smi,
            "pocket": pocket,
            "ori_mol_length": ori_mol_length,
            "ori_pocket_length": ori_pocket_length,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
unimol/data/atom_type_dataset.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ from functools import lru_cache
6
+ from unicore.data import BaseWrapperDataset
7
+
8
+
9
class AtomTypeDataset(BaseWrapperDataset):
    """Align `atoms` and `coordinates` lists to a common length.

    Older RDKit versions could emit atom and coordinate lists of different
    lengths; this wrapper truncates both to the shorter prefix so downstream
    code can zip them safely.
    """

    def __init__(
        self,
        raw_dataset,
        dataset,
        smi="smi",
        atoms="atoms",
    ):
        self.raw_dataset = raw_dataset
        self.dataset = dataset
        self.smi = smi
        self.atoms = atoms

    @lru_cache(maxsize=16)
    def __getitem__(self, index: int):
        # for low rdkit version: repair mismatched list lengths in place
        entry = self.dataset[index]
        keep = min(len(entry["atoms"]), len(entry["coordinates"]))
        if len(entry["atoms"]) != len(entry["coordinates"]):
            entry["atoms"] = entry["atoms"][:keep]
            entry["coordinates"] = entry["coordinates"][:keep]
        return entry
unimol/data/conformer_sample_dataset.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ import numpy as np
6
+ from functools import lru_cache
7
+ from unicore.data import BaseWrapperDataset
8
+ from . import data_utils
9
+
10
+
11
class ConformerSampleDataset(BaseWrapperDataset):
    """Pick one conformer per molecule, reproducibly per (seed, epoch, index)."""

    def __init__(self, dataset, seed, atoms, coordinates):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return the atom list plus one seeded conformer as float32."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        assert len(atoms) > 0
        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.randint(len(entry[self.coordinates]))
        coordinates = entry[self.coordinates][pick]
        return {"atoms": atoms, "coordinates": coordinates.astype(np.float32)}

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
37
+
38
class ConformerSampleDecoderDataset(BaseWrapperDataset):
    """Conformer sampler that also passes through the SELFIES tokens."""

    def __init__(self, dataset, seed, atoms, coordinates, selfies):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.selfies = selfies
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return atoms, SELFIES and one seeded conformer (float32)."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        assert len(atoms) > 0
        selfies = np.array(entry[self.selfies])
        assert len(selfies) > 0
        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.randint(len(entry[self.coordinates]))
        coordinates = entry[self.coordinates][pick]
        return {
            "atoms": atoms,
            "selfies": selfies,
            "coordinates": coordinates.astype(np.float32),
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
69
+
70
class ConformerSamplePocketDataset(BaseWrapperDataset):
    """Sample one pocket conformer plus residue labels and an fpocket score.

    `dict_name` selects atom-naming granularity:
    - "dict_coarse.txt": first character only (element symbol).
    - "dict_fine.txt": up to two characters, except hydrogens keep one.
    Any other value raises ValueError (the old code fell through to a
    confusing NameError).
    """

    def __init__(self, dataset, seed, atoms, coordinates, dict_name):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.dict_name = dict_name
        self.coordinates = coordinates
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return atoms (per dict granularity), a seeded conformer, residues
        and the fpocket score for entry `index`."""
        entry = self.dataset[index]
        if self.dict_name == "dict_coarse.txt":
            atoms = np.array([a[0] for a in entry[self.atoms]])
        elif self.dict_name == "dict_fine.txt":
            atoms = np.array(
                [
                    a[0] if len(a) == 1 or a[0] == "H" else a[:2]
                    for a in entry[self.atoms]
                ]
            )
        else:
            # Fail fast with a clear message instead of an unbound `atoms`.
            raise ValueError(f"unknown dict_name: {self.dict_name!r}")
        assert len(atoms) > 0
        size = len(entry[self.coordinates])
        with data_utils.numpy_seed(self.seed, epoch, index):
            sample_idx = np.random.randint(size)
        coordinates = entry[self.coordinates][sample_idx]
        residue = np.array(entry["residue"])
        # np.float was removed in NumPy 1.24; the builtin float is equivalent.
        score = float(entry["meta_info"]["fpocket"]["Score"])
        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "residue": residue,
            "score": score,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
110
+
111
+
112
class ConformerSamplePocketFinetuneDataset(BaseWrapperDataset):
    """Pocket sampler for finetuning.

    Returned dict keys are the constructor-supplied field names, not fixed
    strings, so downstream pipelines can be configured by key.
    """

    def __init__(self, dataset, seed, atoms, residues, coordinates):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.residues = residues
        self.coordinates = coordinates
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return element symbols, one conformation and optional residues."""
        entry = self.dataset[index]
        # Only the element symbol ('C H O N S') is kept per atom.
        atoms = np.array([a[0] for a in entry[self.atoms]])
        assert len(atoms) > 0
        # Pockets currently store a single conformation; the list branch is
        # reserved for possible future expansion, where sampling would apply.
        if isinstance(entry[self.coordinates], list):
            with data_utils.numpy_seed(self.seed, epoch, index):
                pick = np.random.randint(len(entry[self.coordinates]))
            coordinates = entry[self.coordinates][pick]
        else:
            coordinates = entry[self.coordinates]

        residues = (
            np.array(entry[self.residues]) if self.residues in entry else None
        )
        assert len(atoms) == len(coordinates)
        return {
            self.atoms: atoms,
            self.coordinates: coordinates.astype(np.float32),
            self.residues: residues,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
154
+
155
+
156
class ConformerSampleConfGDataset(BaseWrapperDataset):
    """Sample an input conformer alongside its fixed target coordinates."""

    def __init__(self, dataset, seed, atoms, coordinates, tgt_coordinates):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.tgt_coordinates = tgt_coordinates
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return atoms, one seeded input conformer, and target coordinates."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        assert len(atoms) > 0
        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.randint(len(entry[self.coordinates]))
        coordinates = entry[self.coordinates][pick]
        tgt_coordinates = entry[self.tgt_coordinates]
        return {
            self.atoms: atoms,
            self.coordinates: coordinates.astype(np.float32),
            self.tgt_coordinates: tgt_coordinates.astype(np.float32),
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
186
+
187
+
188
class ConformerSampleConfGV2Dataset(BaseWrapperDataset):
    """Conformer sampler with importance sampling over RMSD scores.

    For each item: pick one target conformer group ("gid") uniformly, keep
    the `topN` best-scoring candidates, then draw one candidate with
    probability proportional to the smoothed inverse of its score.
    """

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        tgt_coordinates,
        beta=1.0,
        smooth=0.1,
        topN=10,
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.tgt_coordinates = tgt_coordinates
        self.beta = beta
        self.smooth = smooth
        self.topN = topN
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Importance-sample one (input, target) conformer pair."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        assert len(atoms) > 0
        meta_df = entry["meta"]
        tgt_conf_ids = meta_df["gid"].unique()
        # Randomly choose one conformer group.
        with data_utils.numpy_seed(self.seed, epoch, index):
            conf_id = np.random.choice(tgt_conf_ids)
        conf_df = meta_df[meta_df["gid"] == conf_id]
        # Keep only the topN best (lowest) scoring candidates.
        conf_df = conf_df.sort_values("score").reset_index(drop=False)[
            : self.topN
        ]

        def _inverse_weight(x, beta=1.0, smooth=0.1):
            # Smoothed inverse-score weights, normalised to sum to 1.
            w = 1.0 / (x**beta + smooth)
            return w / w.sum()

        weight = _inverse_weight(
            conf_df["score"].values, beta=self.beta, smooth=self.smooth
        )
        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.choice(len(conf_df), 1, replace=False, p=weight)
        # pick = [np.argmax(weight)]
        coordinates = conf_df.iloc[pick]["rdkit_coords"].values[0]
        tgt_coordinates = conf_df.iloc[pick]["tgt_coords"].values[0]
        return {
            self.atoms: atoms,
            self.coordinates: coordinates.astype(np.float32),
            self.tgt_coordinates: tgt_coordinates.astype(np.float32),
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
250
+
251
+
252
class ConformerSampleDockingPoseDataset(BaseWrapperDataset):
    """Sampler for docking-pose training/eval.

    At train time the holo (ground-truth) ligand and pocket coordinates come
    from the entry; at eval time they are unavailable, so the sampled input
    coordinates are reused as stand-ins.
    """

    def __init__(
        self,
        dataset,
        seed,
        atoms,
        coordinates,
        pocket_atoms,
        pocket_coordinates,
        holo_coordinates,
        holo_pocket_coordinates,
        is_train=True,
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        self.pocket_atoms = pocket_atoms
        self.pocket_coordinates = pocket_coordinates
        self.holo_coordinates = holo_coordinates
        self.holo_pocket_coordinates = holo_pocket_coordinates
        self.is_train = is_train
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds per-epoch conformer sampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return one sampled ligand conformer with its pocket and holo data."""
        entry = self.dataset[index]
        atoms = np.array(entry[self.atoms])
        with data_utils.numpy_seed(self.seed, epoch, index):
            pick = np.random.randint(len(entry[self.coordinates]))
        coordinates = entry[self.coordinates][pick]
        # Pocket atom names are reduced to their element symbol.
        pocket_atoms = np.array([item[0] for item in entry[self.pocket_atoms]])
        pocket_coordinates = entry[self.pocket_coordinates][0]
        if self.is_train:
            holo_coordinates = entry[self.holo_coordinates][0]
            holo_pocket_coordinates = entry[self.holo_pocket_coordinates][0]
        else:
            # No ground-truth pose at inference: reuse the sampled inputs.
            holo_coordinates = coordinates
            holo_pocket_coordinates = pocket_coordinates

        smi = entry["smi"]
        pocket = entry["pocket"]

        return {
            "atoms": atoms,
            "coordinates": coordinates.astype(np.float32),
            "pocket_atoms": pocket_atoms,
            "pocket_coordinates": pocket_coordinates.astype(np.float32),
            "holo_coordinates": holo_coordinates.astype(np.float32),
            "holo_pocket_coordinates": holo_pocket_coordinates.astype(np.float32),
            "smi": smi,
            "pocket": pocket,
        }

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
unimol/data/coord_pad_dataset.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ from unicore.data import BaseWrapperDataset
6
+
7
+
8
def collate_tokens_coords(
    values,
    pad_idx,
    left_pad=False,
    pad_to_length=None,
    pad_to_multiple=1,
):
    """Pad a list of (len_i, 3) coordinate tensors into one (B, T, 3) tensor.

    Padding positions are filled with `pad_idx`; data is placed at the right
    end when `left_pad` is true, at the left end otherwise. `pad_to_multiple`
    is accepted for interface parity but deliberately not applied (see the
    disabled rounding below).
    """
    target = max(v.size(0) for v in values)
    if pad_to_length is not None:
        target = max(target, pad_to_length)
    # NOTE: rounding up to a multiple of `pad_to_multiple` is disabled:
    # if pad_to_multiple != 1 and target % pad_to_multiple != 0:
    #     target = int(((target - 0.1) // pad_to_multiple + 1) * pad_to_multiple)
    out = values[0].new(len(values), target, 3).fill_(pad_idx)
    for row, v in enumerate(values):
        dst = out[row][target - len(v):, :] if left_pad else out[row][: len(v), :]
        assert dst.numel() == v.numel()
        dst.copy_(v)
    return out
+
30
+
31
class RightPadDatasetCoord(BaseWrapperDataset):
    """Dataset wrapper whose collater pads coordinate samples with `pad_idx`."""

    def __init__(self, dataset, pad_idx, left_pad=False):
        super().__init__(dataset)
        self.pad_idx = pad_idx
        self.left_pad = left_pad

    def collater(self, samples):
        """Batch `samples` via collate_tokens_coords (pad_to_multiple=8)."""
        return collate_tokens_coords(
            samples, self.pad_idx, left_pad=self.left_pad, pad_to_multiple=8
        )
+
42
+
43
def collate_cross_2d(
    values,
    pad_idx,
    left_pad=False,
    pad_to_length=None,
    pad_to_multiple=1,
):
    """Pad a list of 2-d tensors into one (B, H, W) tensor filled with pad_idx.

    Both height and width are rounded up to a multiple of `pad_to_multiple`.
    (`pad_to_length` is accepted for interface parity but unused.)
    """
    height = max(v.size(0) for v in values)
    width = max(v.size(1) for v in values)

    def _round_up(n):
        # Round up to the next multiple; the -0.1 keeps exact multiples as-is.
        if pad_to_multiple != 1 and n % pad_to_multiple != 0:
            n = int(((n - 0.1) // pad_to_multiple + 1) * pad_to_multiple)
        return n

    height = _round_up(height)
    width = _round_up(width)
    out = values[0].new(len(values), height, width).fill_(pad_idx)
    for row, v in enumerate(values):
        dst = (
            out[row][height - v.size(0):, width - v.size(1):]
            if left_pad
            else out[row][: v.size(0), : v.size(1)]
        )
        assert dst.numel() == v.numel()
        dst.copy_(v)
    return out
+
72
+
73
class RightPadDatasetCross2D(BaseWrapperDataset):
    """Dataset wrapper whose collater pads 2-d samples with `pad_idx`."""

    def __init__(self, dataset, pad_idx, left_pad=False):
        super().__init__(dataset)
        self.pad_idx = pad_idx
        self.left_pad = left_pad

    def collater(self, samples):
        """Batch `samples` via collate_cross_2d (pad_to_multiple=8)."""
        return collate_cross_2d(
            samples, self.pad_idx, left_pad=self.left_pad, pad_to_multiple=8
        )
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ import numpy as np
6
+ from functools import lru_cache
7
+ import logging
8
+ from unicore.data import BaseWrapperDataset
9
+ from . import data_utils
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class CroppingDataset(BaseWrapperDataset):
    """Uniformly subsample molecules that exceed `max_atoms` atoms."""

    def __init__(self, dataset, seed, atoms, coordinates, max_atoms=256):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        # Max number of atoms kept per molecule; falsy means no limit.
        self.max_atoms = max_atoms
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds the per-epoch subsampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return a (possibly cropped) shallow copy of entry `index`."""
        dd = self.dataset[index].copy()
        atoms = dd[self.atoms]
        coordinates = dd[self.coordinates]
        if self.max_atoms and len(atoms) > self.max_atoms:
            with data_utils.numpy_seed(self.seed, epoch, index):
                keep = np.random.choice(len(atoms), self.max_atoms, replace=False)
                atoms = np.array(atoms)[keep]
                coordinates = coordinates[keep]
        dd[self.atoms] = atoms
        dd[self.coordinates] = coordinates.astype(np.float32)
        return dd

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
+
44
+
45
class CroppingPocketDataset(BaseWrapperDataset):
    """Crop pockets above `max_atoms` atoms, biased toward the centroid.

    Atoms are kept with probability proportional to a softmax over the
    reciprocal of their (smoothed) distance to the pocket centroid, so
    atoms near the centre are more likely to survive the crop.
    """

    def __init__(self, dataset, seed, atoms, coordinates, max_atoms=256):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms
        self.coordinates = coordinates
        # Max number of atoms kept per pocket; falsy means no limit.
        self.max_atoms = max_atoms
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        """Record the active epoch; it seeds the per-epoch subsampling."""
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        """Return a (possibly cropped) shallow copy of entry `index`."""
        dd = self.dataset[index].copy()
        atoms = dd[self.atoms]
        coordinates = dd[self.coordinates]

        # Crop atoms according to their distance to the pocket centre.
        if self.max_atoms and len(atoms) > self.max_atoms:
            with data_utils.numpy_seed(self.seed, epoch, index):
                distance = np.linalg.norm(
                    coordinates - coordinates.mean(axis=0), axis=1
                )

                def _softmax(x):
                    shifted = x - np.max(x)
                    e = np.exp(shifted)
                    return e / e.sum()

                distance += 1  # prevent inf when inverting
                weight = _softmax(np.reciprocal(distance))
                keep = np.random.choice(
                    len(atoms), self.max_atoms, replace=False, p=weight
                )
                atoms = atoms[keep]
                coordinates = coordinates[keep]

        dd[self.atoms] = atoms
        dd[self.coordinates] = coordinates.astype(np.float32)
        return dd

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
+
96
+
97
class CroppingResiduePocketDataset(BaseWrapperDataset):
    """Crops a pocket at residue granularity: whole residues are kept or
    dropped so that roughly ``max_atoms`` atoms remain, with residues closer
    to the pocket center favored."""

    def __init__(self, dataset, seed, atoms, residues, coordinates, max_atoms=256):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms  # key of the atom-name array in each sample dict
        self.residues = residues  # key of the per-atom residue-id array
        self.coordinates = coordinates  # key of the (N, 3) coordinate array
        self.max_atoms = (
            max_atoms  # max number of atoms in a molecule, None indicates no limit.
        )

        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        dd = self.dataset[index].copy()
        atoms = dd[self.atoms]
        residues = dd[self.residues]
        coordinates = dd[self.coordinates]

        # NOTE(review): unused local -- candidate for removal.
        residues_distance_map = {}

        # crop atoms according to their distance to the center of pockets
        if self.max_atoms and len(atoms) > self.max_atoms:
            # Residue sampling is deterministic given (seed, epoch, index).
            with data_utils.numpy_seed(self.seed, epoch, index):
                distance = np.linalg.norm(
                    coordinates - coordinates.mean(axis=0), axis=1
                )
                # Mean center-distance per residue, in first-occurrence order.
                residues_ids, residues_distance = [], []
                for res in residues:
                    if res not in residues_ids:
                        residues_ids.append(res)
                        residues_distance.append(distance[residues == res].mean())
                residues_ids = np.array(residues_ids)
                residues_distance = np.array(residues_distance)

                def softmax(x):
                    x -= np.max(x)
                    x = np.exp(x) / np.sum(np.exp(x))
                    return x

                residues_distance += 1  # prevent inf and smoothing out the distance
                # Nearer residues get higher sampling probability.
                weight = softmax(np.reciprocal(residues_distance))
                # Residue budget = max_atoms / (approximate atoms per residue).
                # NOTE(review): if len(atoms) < len(residues_ids) + 1 the inner
                # floor division is 0 and this raises ZeroDivisionError -- confirm
                # inputs always have at least one atom per residue on average.
                max_residues = self.max_atoms // (len(atoms) // (len(residues_ids) + 1))
                if max_residues < 1:
                    max_residues += 1
                max_residues = min(max_residues, len(residues_ids))
                residue_index = np.random.choice(
                    len(residues_ids), max_residues, replace=False, p=weight
                )
                # Keep every atom belonging to a sampled residue.
                index = [
                    i
                    for i in range(len(atoms))
                    if residues[i] in residues_ids[residue_index]
                ]
                atoms = atoms[index]
                coordinates = coordinates[index]
                residues = residues[index]

        dd[self.atoms] = atoms
        dd[self.coordinates] = coordinates.astype(np.float32)
        dd[self.residues] = residues
        return dd

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
167
+
168
+
169
class CroppingPocketDockingPoseDataset(BaseWrapperDataset):
    """Crops a pocket to ``max_atoms`` atoms, keeping the predicted and holo
    (ground-truth) coordinate arrays index-aligned.

    Bug fix: ``holo_coordinates`` was accepted by ``__init__`` but never
    stored on ``self``, so ``__cached_item__`` crashed with AttributeError
    at ``self.holo_coordinates``.
    """

    def __init__(
        self, dataset, seed, atoms, coordinates, holo_coordinates, max_atoms=256
    ):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms  # key of the atom-name array in each sample dict
        self.coordinates = coordinates  # key of the input coordinate array
        self.holo_coordinates = holo_coordinates  # key of the holo (ground-truth) coordinates
        self.max_atoms = max_atoms  # max atoms kept; None/0 disables cropping

        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        dd = self.dataset[index].copy()
        atoms = dd[self.atoms]
        coordinates = dd[self.coordinates]
        holo_coordinates = dd[self.holo_coordinates]

        # crop atoms according to their distance to the center of pockets
        if self.max_atoms and len(atoms) > self.max_atoms:
            # NOTE: seeded with (seed, 1) only -- the crop does not vary with
            # epoch or item index, presumably for reproducible docking poses.
            with data_utils.numpy_seed(self.seed, 1):
                distance = np.linalg.norm(
                    coordinates - coordinates.mean(axis=0), axis=1
                )

                def softmax(x):
                    x -= np.max(x)
                    x = np.exp(x) / np.sum(np.exp(x))
                    return x

                distance += 1  # prevent inf
                # Atoms closer to the pocket center are more likely to be kept.
                weight = softmax(np.reciprocal(distance))
                index = np.random.choice(
                    len(atoms), self.max_atoms, replace=False, p=weight
                )
                atoms = atoms[index]
                coordinates = coordinates[index]
                holo_coordinates = holo_coordinates[index]

        dd[self.atoms] = atoms
        dd[self.coordinates] = coordinates.astype(np.float32)
        dd[self.holo_coordinates] = holo_coordinates.astype(np.float32)
        return dd

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
221
+
222
class CroppingPocketDockingPoseTestDataset(BaseWrapperDataset):
    """Test-time pocket cropping to ``max_atoms`` atoms using a fixed random
    stream, so evaluation results are reproducible."""

    def __init__(self, dataset, seed, atoms, coordinates, max_atoms=256):
        self.dataset = dataset
        self.seed = seed
        self.atoms = atoms  # key of the atom-name array in each sample dict
        self.coordinates = coordinates  # key of the (N, 3) coordinate array
        self.max_atoms = max_atoms  # max atoms kept; None/0 disables cropping

        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        sample = self.dataset[index].copy()
        atom_arr = sample[self.atoms]
        coord_arr = sample[self.coordinates]

        n_atoms = len(atom_arr)
        if self.max_atoms and n_atoms > self.max_atoms:
            # Hard-coded seed (1, 1): identical crop across epochs, items
            # and runs -- self.seed is deliberately not used here.
            with data_utils.numpy_seed(1, 1):
                center_dist = np.linalg.norm(
                    coord_arr - coord_arr.mean(axis=0), axis=1
                )
                center_dist += 1  # prevent inf in the reciprocal below
                inv_dist = np.reciprocal(center_dist)
                # Softmax over inverse distances favors atoms near the center.
                weight = np.exp(inv_dist - np.max(inv_dist))
                weight = weight / np.sum(weight)
                keep = np.random.choice(
                    n_atoms, self.max_atoms, replace=False, p=weight
                )
                atom_arr = atom_arr[keep]
                coord_arr = coord_arr[keep]

        sample[self.atoms] = atom_arr
        sample[self.coordinates] = coord_arr.astype(np.float32)
        return sample

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
unimol/data/data_utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ import numpy as np
6
+ import contextlib
7
+
8
+
9
@contextlib.contextmanager
def numpy_seed(seed, *addl_seeds):
    """Temporarily seed NumPy's global PRNG, restoring the prior state on exit.

    A ``seed`` of ``None`` makes this a no-op. Any ``addl_seeds`` are hashed
    together with ``seed`` (mod 1e6) to derive the effective seed.
    """
    if seed is None:
        yield
        return
    if addl_seeds:
        seed = int(hash((seed, *addl_seeds)) % 1e6)
    saved_state = np.random.get_state()
    np.random.seed(seed)
    try:
        yield
    finally:
        # Always restore, even if the body raised.
        np.random.set_state(saved_state)
unimol/data/dictionary.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # Copyright (c) Facebook, Inc. and its affiliates.
3
+ #
4
+ # This source code is licensed under the MIT license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ import logging
7
+
8
+ import numpy as np
9
+
10
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
11
+
12
+ class DecoderDictionary:
13
+ """A mapping from symbols to consecutive integers"""
14
+
15
+ def __init__(
16
+ self,
17
+ *, # begin keyword-only arguments
18
+ bos="[CLS]",
19
+ pad="[PAD]",
20
+ eos="[SEP]",
21
+ unk="[UNK]",
22
+ extra_special_symbols=None,
23
+ ):
24
+ self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos
25
+ self.symbols = []
26
+ self.count = []
27
+ self.indices = {}
28
+ self.idx2sym = {}
29
+ self.specials = set()
30
+ self.specials.add(bos)
31
+ self.specials.add(unk)
32
+ self.specials.add(pad)
33
+ self.specials.add(eos)
34
+
35
+ def __eq__(self, other):
36
+ return self.indices == other.indices
37
+
38
+ def __getitem__(self, idx):
39
+ if idx < len(self.symbols):
40
+ return self.symbols[idx]
41
+ return self.unk_word
42
+
43
+ def __len__(self):
44
+ """Returns the number of symbols in the dictionary"""
45
+ return len(self.symbols)
46
+
47
+ def __contains__(self, sym):
48
+ return sym in self.indices
49
+
50
+ def vec_index(self, a):
51
+ return np.vectorize(self.index)(a)
52
+
53
+ def index(self, sym):
54
+ """Returns the index of the specified symbol"""
55
+ assert isinstance(sym, str)
56
+ if sym in self.indices:
57
+ return self.indices[sym]
58
+ return self.indices[self.unk_word]
59
+
60
+ def index2symbol(self, idx):
61
+ """Returns the corresponding symbol of the specified index"""
62
+ assert isinstance(idx, int)
63
+ if idx in self.idx2sym:
64
+ return self.idx2sym[idx]
65
+ return self.unk_word
66
+
67
+ def special_index(self):
68
+ return [self.index(x) for x in self.specials]
69
+
70
+ def add_symbol(self, word, n=1, overwrite=False, is_special=False):
71
+ """Adds a word to the dictionary"""
72
+ if is_special:
73
+ self.specials.add(word)
74
+ if word in self.indices and not overwrite:
75
+ idx = self.indices[word]
76
+ self.count[idx] = self.count[idx] + n
77
+ return idx
78
+ else:
79
+ idx = len(self.symbols)
80
+ self.indices[word] = idx
81
+ self.idx2sym[idx] = word
82
+ self.symbols.append(word)
83
+ self.count.append(n)
84
+ return idx
85
+
86
+ def bos(self):
87
+ """Helper to get index of beginning-of-sentence symbol"""
88
+ return self.index(self.bos_word)
89
+
90
+ def pad(self):
91
+ """Helper to get index of pad symbol"""
92
+ return self.index(self.pad_word)
93
+
94
+ def eos(self):
95
+ """Helper to get index of end-of-sentence symbol"""
96
+ return self.index(self.eos_word)
97
+
98
+ def unk(self):
99
+ """Helper to get index of unk symbol"""
100
+ return self.index(self.unk_word)
101
+
102
+ @classmethod
103
+ def load(cls, f):
104
+ """Loads the dictionary from a text file with the format:
105
+
106
+ ```
107
+ <symbol0> <count0>
108
+ <symbol1> <count1>
109
+ ...
110
+ ```
111
+ """
112
+ d = cls()
113
+ d.add_from_file(f)
114
+ return d
115
+
116
+ def add_from_file(self, f):
117
+ """
118
+ Loads a pre-existing dictionary from a text file and adds its symbols
119
+ to this instance.
120
+ """
121
+ if isinstance(f, str):
122
+ try:
123
+ with open(f, "r", encoding="utf-8") as fd:
124
+ self.add_from_file(fd)
125
+ except FileNotFoundError as fnfe:
126
+ raise fnfe
127
+ except UnicodeError:
128
+ raise Exception(
129
+ "Incorrect encoding detected in {}, please "
130
+ "rebuild the dataset".format(f)
131
+ )
132
+ return
133
+
134
+ lines = f.readlines()
135
+
136
+ for line_idx, line in enumerate(lines):
137
+ try:
138
+ splits = line.rstrip().rsplit(" ", 1)
139
+ line = splits[0]
140
+ field = splits[1] if len(splits) > 1 else str(len(lines) - line_idx)
141
+ if field == "#overwrite":
142
+ overwrite = True
143
+ line, field = line.rsplit(" ", 1)
144
+ else:
145
+ overwrite = False
146
+ count = int(field)
147
+ word = line
148
+ if word in self and not overwrite:
149
+ logger.info(
150
+ "Duplicate word found when loading Dictionary: '{}', index is {}.".format(word, self.indices[word])
151
+ )
152
+ else:
153
+ self.add_symbol(word, n=count, overwrite=overwrite)
154
+ except ValueError:
155
+ raise ValueError(
156
+ "Incorrect dictionary format, expected '<token> <cnt> [flags]'"
157
+ )
unimol/data/distance_dataset.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ import numpy as np
6
+ import torch
7
+ from scipy.spatial import distance_matrix
8
+ from functools import lru_cache
9
+ from unicore.data import BaseWrapperDataset
10
+
11
+
12
class DistanceDataset(BaseWrapperDataset):
    """Yields the pairwise Euclidean distance matrix of each item's
    (N, 3) coordinates as a float32 tensor."""

    def __init__(self, dataset):
        super().__init__(dataset)
        self.dataset = dataset

    @lru_cache(maxsize=16)
    def __getitem__(self, idx):
        coords = self.dataset[idx].view(-1, 3).numpy()
        pairwise = distance_matrix(coords, coords).astype(np.float32)
        return torch.from_numpy(pairwise)
22
+
23
+
24
class EdgeTypeDataset(BaseWrapperDataset):
    """Encodes each (token_i, token_j) pair as a single integer edge type:
    ``row_token * num_types + col_token``."""

    def __init__(self, dataset: torch.utils.data.Dataset, num_types: int):
        self.dataset = dataset
        self.num_types = num_types

    @lru_cache(maxsize=16)
    def __getitem__(self, index: int):
        tokens = self.dataset[index].clone()
        # Broadcasting produces the full (N, N) pair-id matrix.
        return tokens.view(-1, 1) * self.num_types + tokens.view(1, -1)
34
+
35
+
36
class CrossDistanceDataset(BaseWrapperDataset):
    """Distance matrix between molecule atoms (rows) and pocket atoms
    (columns), as a float32 tensor."""

    def __init__(self, mol_dataset, pocket_dataset):
        super().__init__(mol_dataset)
        self.dataset = mol_dataset
        self.mol_dataset = mol_dataset
        self.pocket_dataset = pocket_dataset

    @lru_cache(maxsize=16)
    def __getitem__(self, idx):
        lig_pos = self.mol_dataset[idx].view(-1, 3).numpy()
        poc_pos = self.pocket_dataset[idx].view(-1, 3).numpy()
        cross = distance_matrix(lig_pos, poc_pos).astype(np.float32)
        # Sanity-check the (mol, pocket) orientation of the result.
        assert cross.shape[0] == self.mol_dataset[idx].shape[0]
        assert cross.shape[1] == self.pocket_dataset[idx].shape[0]
        return torch.from_numpy(cross)
51
+
52
class CrossEdgeTypeDataset(BaseWrapperDataset):
    """Pair-type ids between molecule tokens (rows) and pocket tokens
    (columns): ``mol_token * num_types + pocket_token``."""

    def __init__(self, mol_dataset, pocket_dataset, num_types: int):
        self.dataset = mol_dataset
        self.mol_dataset = mol_dataset
        self.pocket_dataset = pocket_dataset
        self.num_types = num_types

    @lru_cache(maxsize=16)
    def __getitem__(self, index: int):
        mol_tokens = self.mol_dataset[index].clone()
        poc_tokens = self.pocket_dataset[index].clone()
        # Broadcasting yields the (N_mol, N_pocket) pair-id matrix.
        return mol_tokens.view(-1, 1) * self.num_types + poc_tokens.view(1, -1)
unimol/data/from_str_dataset.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from functools import lru_cache
3
+ from unicore.data import UnicoreDataset
4
+
5
+
6
class FromStrLabelDataset(UnicoreDataset):
    """Holds scalar labels (possibly as strings) and collates a batch of
    them into a float tensor."""

    def __init__(self, labels):
        super().__init__()
        self.labels = labels

    @lru_cache(maxsize=16)
    def __getitem__(self, index):
        return self.labels[index]

    def __len__(self):
        return len(self.labels)

    def collater(self, samples):
        # Coerce each label (e.g. "1.5") to float before stacking.
        return torch.tensor([float(s) for s in samples])
unimol/data/key_dataset.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ from functools import lru_cache
6
+ from unicore.data import BaseWrapperDataset
7
+
8
+
9
class KeyDataset(BaseWrapperDataset):
    """Projects each sample dict of the wrapped dataset onto a single key."""

    def __init__(self, dataset, key):
        self.dataset = dataset
        self.key = key  # dict key extracted from every item

    def __len__(self):
        return len(self.dataset)

    @lru_cache(maxsize=16)
    def __getitem__(self, idx):
        return self.dataset[idx][self.key]
20
+
21
class LengthDataset(BaseWrapperDataset):
    """Yields ``len(item)`` for each item of the wrapped dataset."""

    def __init__(self, dataset):
        super().__init__(dataset)

    @lru_cache(maxsize=16)
    def __getitem__(self, idx):
        return len(self.dataset[idx])
unimol/data/lmdb_dataset.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+
6
+ import lmdb
7
+ import os
8
+ import pickle
9
+ from functools import lru_cache
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class LMDBDataset:
    """Read-only random-access view over a single-file LMDB database whose
    values are pickled samples keyed by ASCII-encoded integer indices."""

    def __init__(self, db_path):
        self.db_path = db_path
        assert os.path.isfile(self.db_path), "{} not found".format(self.db_path)
        # Open once up front only to enumerate the keys.
        env = self.connect_db(self.db_path)
        with env.begin() as txn:
            self._keys = list(txn.cursor().iternext(values=False))

    def connect_db(self, lmdb_path, save_to_self=False):
        """Open the database read-only; either return the environment or
        stash it on ``self.env``."""
        env = lmdb.open(
            lmdb_path,
            subdir=False,
            readonly=True,
            lock=False,
            readahead=False,
            meminit=False,
            max_readers=256,
        )
        if save_to_self:
            self.env = env
        else:
            return env

    def __len__(self):
        return len(self._keys)

    @lru_cache(maxsize=16)
    def __getitem__(self, idx):
        # Open lazily on first access so forked workers each get a handle.
        if not hasattr(self, "env"):
            self.connect_db(self.db_path, save_to_self=True)
        raw = self.env.begin().get(f"{idx}".encode("ascii"))
        return pickle.loads(raw)
unimol/data/mask_points_dataset.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ from functools import lru_cache
6
+
7
+ import numpy as np
8
+ import torch
9
+ from unicore.data import Dictionary
10
+ from unicore.data import BaseWrapperDataset
11
+ from . import data_utils
12
+
13
+
14
class MaskPointsDataset(BaseWrapperDataset):
    """BERT-style masking for 3D point clouds.

    Masks a random fraction of atom tokens and perturbs the coordinates of
    the masked atoms with ``noise_type`` noise. Each item is a dict with the
    corrupted ``atoms``/``coordinates`` plus ``targets`` holding the original
    token at masked positions and ``pad_idx`` elsewhere.
    """

    def __init__(
        self,
        dataset: torch.utils.data.Dataset,
        coord_dataset: torch.utils.data.Dataset,
        vocab: Dictionary,
        pad_idx: int,
        mask_idx: int,
        noise_type: str,
        noise: float = 1.0,
        seed: int = 1,
        mask_prob: float = 0.15,
        leave_unmasked_prob: float = 0.1,
        random_token_prob: float = 0.1,
    ):
        assert 0.0 < mask_prob < 1.0
        assert 0.0 <= random_token_prob <= 1.0
        assert 0.0 <= leave_unmasked_prob <= 1.0
        assert random_token_prob + leave_unmasked_prob <= 1.0

        self.dataset = dataset
        self.coord_dataset = coord_dataset
        self.vocab = vocab
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.noise_type = noise_type
        self.noise = noise
        self.seed = seed
        self.mask_prob = mask_prob
        self.leave_unmasked_prob = leave_unmasked_prob
        self.random_token_prob = random_token_prob

        if random_token_prob > 0.0:
            # Uniform replacement distribution over non-special tokens.
            weights = np.ones(len(self.vocab))
            weights[vocab.special_index()] = 0
            self.weights = weights / weights.sum()

        self.epoch = None
        if self.noise_type == "trunc_normal":
            # Gaussian noise clipped to +/- 2*noise.
            self.noise_f = lambda num_mask: np.clip(
                np.random.randn(num_mask, 3) * self.noise,
                a_min=-self.noise * 2.0,
                a_max=self.noise * 2.0,
            )
        elif self.noise_type == "normal":
            self.noise_f = lambda num_mask: np.random.randn(num_mask, 3) * self.noise
        elif self.noise_type == "uniform":
            self.noise_f = lambda num_mask: np.random.uniform(
                low=-self.noise, high=self.noise, size=(num_mask, 3)
            )
        else:
            # Unknown noise type: coordinates are left unperturbed.
            self.noise_f = lambda num_mask: 0.0

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.coord_dataset.set_epoch(epoch)
        self.dataset.set_epoch(epoch)
        self.epoch = epoch

    def __getitem__(self, index: int):
        return self.__getitem_cached__(self.epoch, index)

    @lru_cache(maxsize=16)
    def __getitem_cached__(self, epoch: int, index: int):
        ret = {}
        # Corruption is deterministic given (seed, epoch, index).
        with data_utils.numpy_seed(self.seed, epoch, index):
            item = self.dataset[index]
            coord = self.coord_dataset[index]
            sz = len(item)
            # don't allow empty sequence
            assert sz > 0
            # decide elements to mask
            num_mask = int(
                # add a random number for probabilistic rounding
                self.mask_prob * sz
                + np.random.rand()
            )
            mask_idc = np.random.choice(sz, num_mask, replace=False)
            mask = np.full(sz, False)
            mask[mask_idc] = True
            # Targets: original token at masked positions, pad_idx elsewhere.
            ret["targets"] = np.full(len(mask), self.pad_idx)
            ret["targets"][mask] = item[mask]
            ret["targets"] = torch.from_numpy(ret["targets"]).long()
            # decide unmasking and random replacement
            rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob
            if rand_or_unmask_prob > 0.0:
                rand_or_unmask = mask & (np.random.rand(sz) < rand_or_unmask_prob)
                if self.random_token_prob == 0.0:
                    unmask = rand_or_unmask
                    rand_mask = None
                elif self.leave_unmasked_prob == 0.0:
                    unmask = None
                    rand_mask = rand_or_unmask
                else:
                    # Split the combined set between "leave as-is" and
                    # "replace with random token".
                    unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob
                    decision = np.random.rand(sz) < unmask_prob
                    unmask = rand_or_unmask & decision
                    rand_mask = rand_or_unmask & (~decision)
            else:
                unmask = rand_mask = None

            if unmask is not None:
                # Remove the "leave unmasked" positions from the mask.
                mask = mask ^ unmask

            new_item = np.copy(item)
            new_item[mask] = self.mask_idx

            num_mask = mask.astype(np.int32).sum()
            new_coord = np.copy(coord)
            # Only (still-)masked atoms get their coordinates perturbed.
            new_coord[mask, :] += self.noise_f(num_mask)

            if rand_mask is not None:
                num_rand = rand_mask.sum()
                if num_rand > 0:
                    # Replace with random non-special vocabulary tokens.
                    new_item[rand_mask] = np.random.choice(
                        len(self.vocab),
                        num_rand,
                        p=self.weights,
                    )
            ret["atoms"] = torch.from_numpy(new_item).long()
            ret["coordinates"] = torch.from_numpy(new_coord).float()
            return ret
136
+
137
+
138
class MaskPointsPocketDataset(BaseWrapperDataset):
    """Residue-level masking for protein pockets.

    Like :class:`MaskPointsDataset` but whole residues are masked together:
    a residue is sampled, then every atom belonging to it is masked and
    noised jointly.
    """

    def __init__(
        self,
        dataset: torch.utils.data.Dataset,
        coord_dataset: torch.utils.data.Dataset,
        residue_dataset: torch.utils.data.Dataset,
        vocab: Dictionary,
        pad_idx: int,
        mask_idx: int,
        noise_type: str,
        noise: float = 1.0,
        seed: int = 1,
        mask_prob: float = 0.15,
        leave_unmasked_prob: float = 0.1,
        random_token_prob: float = 0.1,
    ):
        assert 0.0 < mask_prob < 1.0
        assert 0.0 <= random_token_prob <= 1.0
        assert 0.0 <= leave_unmasked_prob <= 1.0
        assert random_token_prob + leave_unmasked_prob <= 1.0

        self.dataset = dataset
        self.coord_dataset = coord_dataset
        self.residue_dataset = residue_dataset
        self.vocab = vocab
        self.pad_idx = pad_idx
        self.mask_idx = mask_idx
        self.noise_type = noise_type
        self.noise = noise
        self.seed = seed
        self.mask_prob = mask_prob
        self.leave_unmasked_prob = leave_unmasked_prob
        self.random_token_prob = random_token_prob

        if random_token_prob > 0.0:
            # Uniform replacement distribution over non-special tokens.
            weights = np.ones(len(self.vocab))
            weights[vocab.special_index()] = 0
            self.weights = weights / weights.sum()

        self.epoch = None
        if self.noise_type == "trunc_normal":
            # Gaussian noise clipped to +/- 2*noise.
            self.noise_f = lambda num_mask: np.clip(
                np.random.randn(num_mask, 3) * self.noise,
                a_min=-self.noise * 2.0,
                a_max=self.noise * 2.0,
            )
        elif self.noise_type == "normal":
            self.noise_f = lambda num_mask: np.random.randn(num_mask, 3) * self.noise
        elif self.noise_type == "uniform":
            self.noise_f = lambda num_mask: np.random.uniform(
                low=-self.noise, high=self.noise, size=(num_mask, 3)
            )
        else:
            # Unknown noise type: coordinates are left unperturbed.
            self.noise_f = lambda num_mask: 0.0

    def set_epoch(self, epoch, **unused):
        # NOTE(review): residue_dataset.set_epoch is not propagated here,
        # unlike dataset/coord_dataset -- confirm this is intentional.
        super().set_epoch(epoch)
        self.coord_dataset.set_epoch(epoch)
        self.dataset.set_epoch(epoch)
        self.epoch = epoch

    def __getitem__(self, index: int):
        return self.__getitem_cached__(self.epoch, index)

    @lru_cache(maxsize=16)
    def __getitem_cached__(self, epoch: int, index: int):
        ret = {}
        # Corruption is deterministic given (seed, epoch, index).
        with data_utils.numpy_seed(self.seed, epoch, index):
            item = self.dataset[index]
            coord = self.coord_dataset[index]
            sz = len(item)
            # don't allow empty sequence
            assert sz > 0

            # mask on the level of residues
            residue = self.residue_dataset[index]
            # NOTE(review): set iteration order depends on hashing; for string
            # residue ids this varies with PYTHONHASHSEED, so the residues fed
            # to np.random.choice below may differ across runs even under
            # numpy_seed -- confirm hash randomization is pinned.
            res_list = list(set(residue))
            res_sz = len(res_list)

            # decide elements to mask
            num_mask = int(
                # add a random number for probabilistic rounding
                self.mask_prob * res_sz
                + np.random.rand()
            )
            mask_res = np.random.choice(res_list, num_mask, replace=False).tolist()
            # Atom-level mask: True for every atom of a sampled residue.
            mask = np.isin(residue, mask_res)

            # Targets: original token at masked positions, pad_idx elsewhere.
            ret["targets"] = np.full(len(mask), self.pad_idx)
            ret["targets"][mask] = item[mask]
            ret["targets"] = torch.from_numpy(ret["targets"]).long()
            # decide unmasking and random replacement
            rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob
            if rand_or_unmask_prob > 0.0:
                rand_or_unmask = mask & (np.random.rand(sz) < rand_or_unmask_prob)
                if self.random_token_prob == 0.0:
                    unmask = rand_or_unmask
                    rand_mask = None
                elif self.leave_unmasked_prob == 0.0:
                    unmask = None
                    rand_mask = rand_or_unmask
                else:
                    # Split the combined set between "leave as-is" and
                    # "replace with random token".
                    unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob
                    decision = np.random.rand(sz) < unmask_prob
                    unmask = rand_or_unmask & decision
                    rand_mask = rand_or_unmask & (~decision)
            else:
                unmask = rand_mask = None

            if unmask is not None:
                # Remove the "leave unmasked" positions from the mask.
                mask = mask ^ unmask

            new_item = np.copy(item)
            new_item[mask] = self.mask_idx

            num_mask = mask.astype(np.int32).sum()
            new_coord = np.copy(coord)
            # Only (still-)masked atoms get their coordinates perturbed.
            new_coord[mask, :] += self.noise_f(num_mask)

            if rand_mask is not None:
                num_rand = rand_mask.sum()
                if num_rand > 0:
                    # Replace with random non-special vocabulary tokens.
                    new_item[rand_mask] = np.random.choice(
                        len(self.vocab),
                        num_rand,
                        p=self.weights,
                    )
            ret["atoms"] = torch.from_numpy(new_item).long()
            ret["coordinates"] = torch.from_numpy(new_coord).float()
            return ret
unimol/data/normalize_dataset.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) DP Technology.
2
+ # This source code is licensed under the MIT license found in the
3
+ # LICENSE file in the root directory of this source tree.
4
+
5
+ import numpy as np
6
+ from functools import lru_cache
7
+ from unicore.data import BaseWrapperDataset
8
+
9
+
10
class NormalizeDataset(BaseWrapperDataset):
    """Translates each item's coordinates to be zero-mean (centered)."""

    def __init__(self, dataset, coordinates, normalize_coord=True):
        self.dataset = dataset
        self.coordinates = coordinates  # key of the (N, 3) coordinate array
        self.normalize_coord = normalize_coord  # skip centering when False
        self.set_epoch(None)

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    @lru_cache(maxsize=16)
    def __cached_item__(self, index: int, epoch: int):
        sample = self.dataset[index].copy()
        coords = sample[self.coordinates]
        if self.normalize_coord:
            # Subtract the centroid so the point cloud is centered at origin.
            coords = coords - coords.mean(axis=0)
            sample[self.coordinates] = coords.astype(np.float32)
        return sample

    def __getitem__(self, index: int):
        return self.__cached_item__(index, self.epoch)
+ return self.__cached_item__(index, self.epoch)
33
+
34
+
35
+ class NormalizeDockingPoseDataset(BaseWrapperDataset):
36
+ def __init__(
37
+ self,
38
+ dataset,
39
+ coordinates,
40
+ pocket_coordinates,
41
+ center_coordinates="center_coordinates",
42
+ ):
43
+ self.dataset = dataset
44
+ self.coordinates = coordinates
45
+ self.pocket_coordinates = pocket_coordinates
46
+ self.center_coordinates = center_coordinates
47
+ self.set_epoch(None)
48
+
49
+ def set_epoch(self, epoch, **unused):
50
+ super().set_epoch(epoch)
51
+ self.epoch = epoch
52
+
53
+ @lru_cache(maxsize=16)
54
+ def __cached_item__(self, index: int, epoch: int):
55
+ dd = self.dataset[index].copy()
56
+ coordinates = dd[self.coordinates]
57
+ pocket_coordinates = dd[self.pocket_coordinates]
58
+ # normalize coordinates and pocket coordinates ,align with pocket center coordinates
59
+ center_coordinates = pocket_coordinates.mean(axis=0)
60
+ coordinates = coordinates - center_coordinates
61
+ pocket_coordinates = pocket_coordinates - center_coordinates
62
+ dd[self.coordinates] = coordinates.astype(np.float32)
63
+ dd[self.pocket_coordinates] = pocket_coordinates.astype(np.float32)
64
+ dd[self.center_coordinates] = center_coordinates.astype(np.float32)
65
+ return dd
66
+
67
+ def __getitem__(self, index: int):
68
+ return self.__cached_item__(index, self.epoch)
unimol/data/pair_dataset.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os.path
3
+
4
+ import math
5
+ from functools import lru_cache
6
+
7
+ import torch
8
+ from unicore.data import UnicoreDataset
9
+ import numpy as np
10
+ from . import data_utils
11
+ import rdkit
12
+ from rdkit import Chem
13
+ from rdkit import DataStructs
14
+ from rdkit.Chem import rdFingerprintGenerator
15
+ from multiprocessing import Pool
16
+ from tqdm import tqdm
17
+
18
def get_fp(smiles):
    """Compute a Morgan count fingerprint for a SMILES string.

    Returns a numpy int8 array filled in place by RDKit, or ``None`` when the
    SMILES cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    count_fp = rdFingerprintGenerator.GetCountFPs(
        [mol], fpType=rdFingerprintGenerator.MorganFP
    )[0]
    # Zero-length target buffer; RDKit's converter populates it in place
    # (presumably resizing to the fingerprint length — upstream RDKit idiom).
    out = np.zeros((0,), np.int8)
    DataStructs.ConvertToNumpyArray(count_fp, out)
    return out
28
+
29
class PairDataset(UnicoreDataset):
    """Pairs a pocket dataset with a ligand dataset via per-assay labels.

    Each entry of ``labels`` describes one assay and is expected to carry:
    ``"pockets"`` (candidate pocket names), ``"ligands"`` (list of dicts with
    ``"smi"`` and ``"act"``), ``"uniprot"``, ``"sequence"`` and optionally
    ``"assay_id"``.  ``__getitem__`` deterministically samples one pocket and
    up to ``max_lignum`` ligands for the assay (seeded by index and epoch).
    """

    def __init__(self, args, pocket_dataset, mol_dataset, labels, split,
                 use_cache=True, cache_dir=None):
        self.args = args
        self.pocket_dataset = pocket_dataset
        self.mol_dataset = mol_dataset
        self.labels = labels

        # Building these lookup tables requires a full pass over the wrapped
        # datasets, which is very slow for large corpora — cache them on disk
        # when allowed.
        pocket_cache = (
            f"{cache_dir}/cache/pocket_name2idx_train_blend.json" if use_cache else None
        )
        self.pocket_name2idx = self._load_or_build_index(
            pocket_cache,
            lambda: {x["pocket_name"]: i for i, x in enumerate(self.pocket_dataset)},
        )
        mol_cache = (
            f"{cache_dir}/cache/mol_smi2idx_train_blend.json" if use_cache else None
        )
        self.mol_smi2idx = self._load_or_build_index(
            mol_cache,
            lambda: {x["smi_name"]: i for i, x in enumerate(self.mol_dataset)},
        )

        # Integer id per distinct UniProt accession (order is set-dependent,
        # i.e. not stable across runs — only used as an in-run grouping key).
        uniprot_ids = [x["uniprot"] for x in labels]
        self.uniprot_id_dict = {x: i for i, x in enumerate(set(uniprot_ids))}
        self.split = split
        if self.split == "train":
            self.max_lignum = args.max_lignum  # default 16
        else:
            self.max_lignum = args.test_max_lignum  # default 512

        if self.split == "train":
            # Assays with more ligands are drawn proportionally more often:
            # one epoch index per chunk of up to max(max_lignum, 32) ligands.
            trainidxmap = []
            for idx, assay_item in enumerate(self.labels):
                lig_info = assay_item["ligands"]
                trainidxmap += [idx] * math.ceil(len(lig_info) / max(self.max_lignum, 32))
            self.trainidxmap = trainidxmap

        self.epoch = 0

    @staticmethod
    def _load_or_build_index(cache_file, build):
        """Load a JSON lookup table from ``cache_file`` if present; otherwise
        build it with ``build()`` and persist it (when caching is enabled).

        ``cache_file`` is ``None`` when caching is disabled.
        """
        if cache_file is not None and os.path.exists(cache_file):
            with open(cache_file) as f:
                return json.load(f)
        index = build()
        if cache_file is not None:
            # Ensure the cache directory exists before writing.
            os.makedirs(os.path.dirname(cache_file), exist_ok=True)
            with open(cache_file, "w") as f:
                json.dump(index, f)
        return index

    def __len__(self):
        if self.split == "train":
            # Truncate to a multiple of the global batch size so every rank
            # sees the same number of full batches.  Fall back to a single
            # process when WORLD_SIZE is unset (non-distributed runs) instead
            # of raising KeyError.
            world_size = int(os.environ.get("WORLD_SIZE", 1))
            div = self.args.batch_size * world_size
            return (len(self.trainidxmap) // div) * div
        else:
            return len(self.labels)

    def set_epoch(self, epoch):
        """Propagate the epoch to the wrapped datasets (it seeds sampling)."""
        self.epoch = epoch
        self.pocket_dataset.set_epoch(epoch)
        self.mol_dataset.set_epoch(epoch)
        super().set_epoch(epoch)

    def collater(self, samples):
        """Collate a list of ``__getitem__`` tuples into one model batch.

        ``batch_list`` holds [start, end) ligand slices per assay so the loss
        can group the flat ligand batch back by assay.
        """
        ret_pocket = []
        ret_lig = []
        batch_list = []
        act_list = []
        uniprot_list = []
        ret_protein = []
        assay_id_list = []

        if len(samples) == 0:
            return {}
        for pocket, ligs, acts, uniprot, assay_id, prot_seq in samples:
            ret_pocket.append(pocket)
            lignum_old = len(ret_lig)
            ret_lig += ligs
            batch_list.append([lignum_old, len(ret_lig)])
            uniprot_list.append(self.uniprot_id_dict[uniprot])
            assay_id_list.append(assay_id)
            act_list.append(acts)
            ret_protein.append(prot_seq)

        ret_pocket = self.pocket_dataset.collater(ret_pocket)
        ret_lig = self.mol_dataset.collater(ret_lig)
        return {"pocket": ret_pocket, "lig": ret_lig, "protein": ret_protein,
                "batch_list": batch_list, "act_list": act_list,
                "uniprot_list": uniprot_list, "assay_id_list": assay_id_list}

    def __getitem__(self, idx):
        """Return (pocket_data, lig_data, lig_act, uniprot, assay_id, prot_seq)
        for one assay, with deterministic per-(idx, epoch) sampling."""
        if self.split == "train":
            t_idx = self.trainidxmap[idx]
        else:
            t_idx = idx

        # Seeded choice => the same (idx, epoch) always picks the same pocket.
        with data_utils.numpy_seed(1111, idx, self.epoch):
            pocket_name = np.random.choice(self.labels[t_idx]["pockets"], 1, replace=False)[0]

        lig_info = self.labels[t_idx]["ligands"]
        # Drop ligands whose SMILES are absent from the molecule dataset.
        lig_info = [x for x in lig_info if x["smi"] in self.mol_smi2idx]
        uniprot = self.labels[t_idx]["uniprot"]
        assay_id = self.labels[t_idx].get("assay_id", "none")
        prot_seq = self.labels[t_idx]["sequence"]
        if len(lig_info) > self.max_lignum:
            # Subsample to max_lignum ligands, deterministically per (idx, epoch).
            with data_utils.numpy_seed(1111, idx, self.epoch):
                lig_idxes = np.random.choice(list(range(len(lig_info))), self.max_lignum, replace=False)
            lig_idxes = sorted(lig_idxes)
            lig_info = [lig_info[i] for i in lig_idxes]

        lig_idxes = [self.mol_smi2idx[info["smi"]] for info in lig_info]
        pocket_idx = self.pocket_name2idx[pocket_name]
        lig_act = [info["act"] for info in lig_info]
        pocket_data = self.pocket_dataset[pocket_idx]
        lig_data = [self.mol_dataset[x] for x in lig_idxes]

        return pocket_data, lig_data, lig_act, uniprot, assay_id, prot_seq