| | import contextlib |
| | from dataclasses import dataclass, replace |
| | from typing import Optional |
| |
|
| | import gemmi |
| | import numpy as np |
| | from rdkit import rdBase |
| | from rdkit.Chem import AllChem |
| | from rdkit.Chem.rdchem import Conformer, Mol |
| | from sklearn.neighbors import KDTree |
| |
|
| | from boltz.data import const |
| | from boltz.data.types import ( |
| | Atom, |
| | Bond, |
| | Chain, |
| | Connection, |
| | Interface, |
| | Residue, |
| | Structure, |
| | StructureInfo, |
| | ) |
| |
|
| | |
| | |
| | |
| |
|
| |
|
| | @dataclass(frozen=True, slots=True) |
| | class ParsedAtom: |
| | """A parsed atom object.""" |
| |
|
| | name: str |
| | element: int |
| | charge: int |
| | coords: tuple[float, float, float] |
| | conformer: tuple[float, float, float] |
| | is_present: bool |
| | chirality: int |
| |
|
| |
|
| | @dataclass(frozen=True, slots=True) |
| | class ParsedBond: |
| | """A parsed bond object.""" |
| |
|
| | atom_1: int |
| | atom_2: int |
| | type: int |
| |
|
| |
|
| | @dataclass(frozen=True, slots=True) |
| | class ParsedResidue: |
| | """A parsed residue object.""" |
| |
|
| | name: str |
| | type: int |
| | idx: int |
| | atoms: list[ParsedAtom] |
| | bonds: list[ParsedBond] |
| | orig_idx: Optional[int] |
| | atom_center: int |
| | atom_disto: int |
| | is_standard: bool |
| | is_present: bool |
| |
|
| |
|
| | @dataclass(frozen=True, slots=True) |
| | class ParsedChain: |
| | """A parsed chain object.""" |
| |
|
| | name: str |
| | entity: str |
| | type: str |
| | residues: list[ParsedResidue] |
| | sequence: list[str] |
| |
|
| |
|
| | @dataclass(frozen=True, slots=True) |
| | class ParsedConnection: |
| | """A parsed connection object.""" |
| |
|
| | chain_1: str |
| | chain_2: str |
| | residue_index_1: int |
| | residue_index_2: int |
| | atom_index_1: str |
| | atom_index_2: str |
| |
|
| |
|
| | @dataclass(frozen=True, slots=True) |
| | class ParsedStructure: |
| | """A parsed structure object.""" |
| |
|
| | data: Structure |
| | info: StructureInfo |
| | covalents: list[int] |
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
def get_dates(block: gemmi.cif.Block) -> tuple[str, str, str]:
    """Get the deposited, released, and last revision dates.

    Every lookup is best-effort: a date that cannot be read is
    returned as an empty string. The deposition date and the revision
    history are read independently, so a missing deposition date does
    not discard the release and revision dates.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The deposited date.
    str
        The released date (first entry of the revision history).
    str
        The last revision date (last entry of the revision history).

    """
    deposited = "_pdbx_database_status.recvd_initial_deposition_date"
    revision = "_pdbx_audit_revision_history.revision_date"
    deposit_date = revision_date = release_date = ""

    # Deposition date: failure here must not affect the other two dates
    with contextlib.suppress(Exception):
        deposit_date = block.find([deposited])[0][0]

    # Revision history: query once, first row is the release, last row
    # is the most recent revision
    with contextlib.suppress(Exception):
        history = block.find([revision])
        first_revision = history[0][0]
        last_revision = history[-1][0]
        release_date, revision_date = first_revision, last_revision

    return deposit_date, release_date, revision_date
| |
|
| |
|
def get_resolution(block: gemmi.cif.Block) -> float:
    """Read the resolution of an entry from its CIF block.

    The candidate tags are tried in order and the first one whose
    value can be read and converted to a float wins.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    float
        The resolution, or 0.0 when no candidate tag yields a number.

    """
    candidate_tags = (
        "_refine.ls_d_res_high",
        "_em_3d_reconstruction.resolution",
        "_reflns.d_resolution_high",
    )
    for tag in candidate_tags:
        try:
            # Missing tags and non-numeric values both fall through to
            # the next candidate
            return float(block.find([tag])[0].str(0))
        except Exception:  # noqa: BLE001, PERF203, S112
            continue
    return 0.0
| |
|
| |
|
def get_method(block: gemmi.cif.Block) -> str:
    """Read the experimental method(s) of an entry from its CIF block.

    Parameters
    ----------
    block : gemmi.cif.Block
        The block to process.

    Returns
    -------
    str
        The lower-cased methods joined with commas, or an empty
        string when they cannot be read.

    """
    try:
        rows = block.find(["_exptl.method"])
        return ",".join(row.str(0).lower() for row in rows)
    except Exception:  # noqa: BLE001
        # Best-effort metadata: any failure yields an empty method
        return ""
| |
|
| |
|
def convert_atom_name(name: str) -> tuple[int, int, int, int]:
    """Encode an atom name as a tuple of integer character codes.

    Surrounding whitespace is removed, each character is mapped to
    ``ord(c) - 32``, and the result is right-padded with zeros up to
    four entries. Names longer than four characters are not truncated.

    Parameters
    ----------
    name : str
        The atom name.

    Returns
    -------
    tuple[int, int, int, int]
        The converted atom name.

    """
    codes = tuple(ord(char) - 32 for char in name.strip())
    padding = (0,) * (4 - len(codes))
    return codes + padding
| |
|
| |
|
def get_unk_token(dtype: gemmi.PolymerType) -> str:
    """Return the unknown-residue token for a polymer type.

    Parameters
    ----------
    dtype : gemmi.PolymerType
        The polymer type.

    Returns
    -------
    str
        The unknown token.

    Raises
    ------
    ValueError
        If the polymer type is not PeptideL, Dna or Rna.

    """
    # Supported polymer types paired with their key in const.unk_token
    token_keys = (
        (gemmi.PolymerType.PeptideL, "PROTEIN"),
        (gemmi.PolymerType.Dna, "DNA"),
        (gemmi.PolymerType.Rna, "RNA"),
    )
    for polymer_type, key in token_keys:
        if dtype == polymer_type:
            return const.unk_token[key]

    msg = f"Unknown polymer type: {dtype}"
    raise ValueError(msg)
| |
|
| |
|
def get_conformer(mol: Mol) -> Conformer:
    """Pick the preferred named conformer of an rdkit molecule.

    A conformer named "Computed" is preferred; a conformer named
    "Ideal" is used as a fallback. Conformers without a "name"
    property are ignored.

    Inspired by `pdbeccdutils.core.component.Component`.

    Parameters
    ----------
    mol: Mol
        The molecule to process.

    Returns
    -------
    Conformer
        The desired conformer, if any.

    Raises
    ------
    ValueError
        If there are no conformers of the given type.

    """
    # Preference order: the whole conformer list is scanned for the
    # first name before the second name is considered
    for wanted in ("Computed", "Ideal"):
        for conformer in mol.GetConformers():
            try:
                label = conformer.GetProp("name")
            except KeyError:
                # Unnamed conformer
                continue
            if label == wanted:
                return conformer

    msg = "Conformer does not exist."
    raise ValueError(msg)
| |
|
| |
|
def compute_covalent_ligands(
    connections: list[gemmi.Connection],
    subchain_map: dict[tuple[str, int], str],
    entities: dict[str, gemmi.Entity],
) -> set[str]:
    """Collect the ligand subchains that take part in covalent connections.

    Only connections whose type name is "Covale" are considered. For
    each of them, every partner that belongs to a "NonPolymer" or
    "Branched" entity contributes its subchain name to the result.

    Parameters
    ----------
    connections: List[gemmi.Connection]
        The connections to process.
    subchain_map: dict[tuple[str, int], str]
        The mapping from (chain name, residue identifier) to subchain
        name. The residue identifier used for the lookup is the
        sequence number followed by the stripped insertion code.
    entities: dict[str, gemmi.Entity]
        The entities in the structure, keyed by subchain name.

    Returns
    -------
    set
        The covalent ligand subchains.

    """
    ligand_entity_types = {"NonPolymer", "Branched"}
    found: set = set()

    for link in connections:
        # Only covalent connections are of interest
        if link.type.name != "Covale":
            continue

        # Resolve both partners to subchain names first ...
        partner_subchains = []
        for partner in (link.partner1, link.partner2):
            seqid = partner.res_id.seqid
            residue_key = str(seqid.num) + str(seqid.icode).strip()
            partner_subchains.append(subchain_map[(partner.chain_name, residue_key)])

        # ... then look up the entity type of each subchain
        partner_types = [
            entities[subchain].entity_type.name for subchain in partner_subchains
        ]

        for subchain, entity_type in zip(partner_subchains, partner_types):
            if entity_type in ligand_entity_types:
                found.add(subchain)

    return found
| |
|
| |
|
def compute_interfaces(atom_data: np.ndarray, chain_data: np.ndarray) -> np.ndarray:
    """Find the pairs of chains that have atoms close to each other.

    Two chains form an interface when at least one present atom of
    one chain lies within ``const.atom_interface_cutoff`` of a present
    atom of the other chain.

    Parameters
    ----------
    atom_data : np.ndarray
        The atom table; the "coords" and "is_present" fields are read.
    chain_data : np.ndarray
        The chain table; the "atom_num" field of each row is read.

    Returns
    -------
    np.ndarray
        The interfaces, one (smaller chain index, larger chain index)
        entry per pair of chains in contact, with dtype ``Interface``.

    """
    # Label every atom with the index of the chain it belongs to
    atom_chain = np.array(
        [
            chain_idx
            for chain_idx, chain in enumerate(chain_data)
            for _ in range(chain["atom_num"])
        ]
    )

    # Restrict both the coordinates and the labels to observed atoms
    present = atom_data["is_present"]
    xyz = atom_data["coords"][present]
    atom_chain = atom_chain[present]

    # Neighbours of every atom within the interface cutoff
    tree = KDTree(xyz, metric="euclidean")
    neighbours = tree.query_radius(xyz, const.atom_interface_cutoff)

    # Record (own chain, other chain) for every atom with a neighbour
    # in a different chain
    contacts = set()
    for own_chain, hits in zip(atom_chain, neighbours):
        nearby = np.unique(atom_chain[hits])
        others = nearby[nearby != own_chain]
        contacts.update((own_chain, other) for other in others)

    # Order each pair and collapse the two directions into one entry
    unique_pairs = {(int(min(a, b)), int(max(a, b))) for a, b in contacts}
    return np.array(list(unique_pairs), dtype=Interface)
| |
|
| |
|
| | |
| | |
| | |
| |
|
| |
|
def parse_ccd_residue(
    name: str,
    components: dict[str, Mol],
    res_idx: int,
    gemmi_mol: Optional[gemmi.Residue] = None,
    is_covalent: bool = False,
) -> Optional[ParsedResidue]:
    """Build a ParsedResidue from a chemical component definition.

    The reference molecule is looked up in `components` by name and
    its hydrogens are removed. Observed coordinates are taken from
    `gemmi_mol`, matched to the reference atoms by atom name; atoms
    without a match are marked as not present with zero coordinates.

    Parameters
    ----------
    name: str
        The name of the molecule to parse.
    components : dict
        The preprocessed PDB components dictionary.
    res_idx : int
        The residue index.
    gemmi_mol : Optional[gemmi.Residue]
        The PDB molecule, as a gemmi Residue object, if any.
    is_covalent : bool
        When True, reference atoms flagged as leaving atoms that are
        not observed in `gemmi_mol` are dropped, along with their bonds.

    Returns
    -------
    ParsedResidue, optional
        The output ParsedResidue, if successful.

    """
    unknown_chirality = const.chirality_type_ids[const.unk_chirality_type]

    # The residue is observed when a gemmi residue was supplied
    has_coords = gemmi_mol is not None

    # Sequence number plus insertion code of the observed residue
    source_idx = None
    if has_coords:
        seqid = gemmi_mol.seqid
        source_idx = str(seqid.num) + str(seqid.icode).strip()

    # Reference molecule without hydrogens
    template = AllChem.RemoveHs(components[name], sanitize=False)

    # Single-atom components (e.g. ions) need no conformer and no bonds
    if template.GetNumAtoms() == 1:
        if has_coords:
            first = gemmi_mol[0]
            position = (first.pos.x, first.pos.y, first.pos.z)
        else:
            position = (0, 0, 0)

        lone = template.GetAtoms()[0]
        lone_chirality = const.chirality_type_ids.get(
            str(lone.GetChiralTag()), unknown_chirality
        )
        lone_atom = ParsedAtom(
            name=lone.GetProp("name"),
            element=lone.GetAtomicNum(),
            charge=lone.GetFormalCharge(),
            coords=position,
            conformer=(0, 0, 0),
            is_present=has_coords,
            chirality=lone_chirality,
        )
        return ParsedResidue(
            name=name,
            type=const.unk_token_ids["PROTEIN"],
            atoms=[lone_atom],
            bonds=[],
            idx=res_idx,
            orig_idx=source_idx,
            atom_center=0,
            atom_disto=0,
            is_standard=False,
            is_present=has_coords,
        )

    # Observed coordinates keyed by atom name
    observed = {}
    if has_coords:
        observed = {
            pdb_atom.name: (pdb_atom.pos.x, pdb_atom.pos.y, pdb_atom.pos.z)
            for pdb_atom in gemmi_mol
        }

    # Reference conformer used for the ideal coordinates
    ref_conformer = get_conformer(template)

    # Walk the reference atoms, remembering where each kept atom lands
    parsed_atoms = []
    kept_index = {}
    for ref_idx, ref_atom in enumerate(template.GetAtoms()):
        atom_name = ref_atom.GetProp("name")
        charge = ref_atom.GetFormalCharge()
        element = ref_atom.GetAtomicNum()
        ideal = ref_conformer.GetAtomPosition(ref_atom.GetIdx())
        chirality = const.chirality_type_ids.get(
            str(ref_atom.GetChiralTag()), unknown_chirality
        )

        # Unobserved leaving atoms of covalently bound components are
        # dropped entirely
        is_leaving = int(ref_atom.GetProp("leaving_atom")) == 1
        if is_leaving and is_covalent and atom_name not in observed:
            continue

        # Use the observed position when there is one
        atom_found = atom_name in observed
        xyz = observed[atom_name] if atom_found else (0, 0, 0)

        kept_index[ref_idx] = len(parsed_atoms)
        parsed_atoms.append(
            ParsedAtom(
                name=atom_name,
                element=element,
                charge=charge,
                coords=xyz,
                conformer=(ideal.x, ideal.y, ideal.z),
                is_present=atom_found,
                chirality=chirality,
            )
        )

    # Keep the bonds whose two ends both survived, re-indexed
    unknown_bond = const.bond_type_ids[const.unk_bond_type]
    parsed_bonds = []
    for ref_bond in template.GetBonds():
        begin = kept_index.get(ref_bond.GetBeginAtomIdx())
        end = kept_index.get(ref_bond.GetEndAtomIdx())
        if begin is None or end is None:
            continue

        low, high = sorted((begin, end))
        bond_type = const.bond_type_ids.get(ref_bond.GetBondType().name, unknown_bond)
        parsed_bonds.append(ParsedBond(low, high, bond_type))

    return ParsedResidue(
        name=name,
        type=const.unk_token_ids["PROTEIN"],
        atoms=parsed_atoms,
        bonds=parsed_bonds,
        idx=res_idx,
        atom_center=0,
        atom_disto=0,
        orig_idx=source_idx,
        is_standard=False,
        is_present=has_coords,
    )
| |
|
| |
|
def parse_polymer(
    polymer: gemmi.ResidueSpan,
    polymer_type: gemmi.PolymerType,
    sequence: list[str],
    chain_id: str,
    entity: str,
    components: dict[str, Mol],
) -> Optional[ParsedChain]:
    """Process a gemmi Polymer into a chain object.

    Performs alignment of the full sequence to the polymer
    residues. Loads coordinates and masks for the atoms in
    the polymer, following the ordering in const.ref_atoms.

    Parameters
    ----------
    polymer : gemmi.ResidueSpan
        The polymer to process.
    polymer_type : gemmi.PolymerType
        The polymer type. Must be PeptideL, Dna or Rna.
    sequence : list[str]
        The full sequence of the polymer, one item per position.
    chain_id : str
        The chain identifier.
    entity : str
        The entity name.
    components : dict[str, Mol]
        The preprocessed PDB components dictionary.

    Returns
    -------
    ParsedChain, optional
        The output chain, if successful.

    Raises
    ------
    ValueError
        If the polymer type is not supported, or if the alignment fails.

    """
    # Resolve the chain type up front so that an unsupported polymer
    # type fails with a clear error instead of an unbound local later
    if polymer_type == gemmi.PolymerType.PeptideL:
        chain_type = const.chain_type_ids["PROTEIN"]
    elif polymer_type == gemmi.PolymerType.Dna:
        chain_type = const.chain_type_ids["DNA"]
    elif polymer_type == gemmi.PolymerType.Rna:
        chain_type = const.chain_type_ids["RNA"]
    else:
        msg = f"Unsupported polymer type: {polymer_type}"
        raise ValueError(msg)

    # Fallback id for chirality tags missing from the lookup table
    unk_chirality = const.chirality_type_ids[const.unk_chirality_type]

    # Keep only the first monomer of every sequence item
    sequence = [gemmi.Entity.first_mon(item) for item in sequence]

    # Align the full sequence to the residues observed in the structure
    result = gemmi.align_sequence_to_polymer(
        sequence,
        polymer,
        polymer_type,
        gemmi.AlignmentScoring(),
    )

    # i walks the observed residues, j walks the sequence positions
    i = 0
    ref_res = set(const.tokens)
    parsed = []
    for j, match in enumerate(result.match_string):
        # Residue name expected at this position
        res_name = sequence[j]

        # Filled in only when the position is observed in the structure
        res = None
        name_to_atom = {}

        if match == "|":
            # Observed residue and its atoms, keyed by upper-cased name
            res = polymer[i]
            name_to_atom = {a.name.upper(): a for a in res}

            # Double check the alignment
            if res.name != res_name:
                msg = "Alignment mismatch!"
                raise ValueError(msg)

            # Move on to the next observed residue
            i += 1

        # Treat MSE as MET, using the selenium position for the sulphur
        if res_name == "MSE":
            res_name = "MET"
            if "SE" in name_to_atom:
                name_to_atom["SD"] = name_to_atom["SE"]

        # Non-standard residues are parsed from their component definition
        elif res_name not in ref_res:
            residue = parse_ccd_residue(
                name=res_name,
                components=components,
                res_idx=j,
                gemmi_mol=res,
                is_covalent=True,
            )
            parsed.append(residue)
            continue

        # Reference molecule (without hydrogens) and its conformer
        ref_mol = components[res_name]
        ref_mol = AllChem.RemoveHs(ref_mol, sanitize=False)
        ref_conformer = get_conformer(ref_mol)

        # Reference atoms in the canonical order for this residue
        ref_name_to_atom = {a.GetProp("name"): a for a in ref_mol.GetAtoms()}
        ref_atoms = [ref_name_to_atom[a] for a in const.ref_atoms[res_name]]

        atoms: list[ParsedAtom] = []

        for ref_atom in ref_atoms:
            atom_name = ref_atom.GetProp("name")
            idx = ref_atom.GetIdx()

            # Ideal coordinates from the reference conformer
            ref_coords = ref_conformer.GetAtomPosition(idx)
            ref_coords = (ref_coords.x, ref_coords.y, ref_coords.z)

            # Observed coordinates, when the atom is in the structure
            if atom_name in name_to_atom:
                atom = name_to_atom[atom_name]
                atom_is_present = True
                coords = (atom.pos.x, atom.pos.y, atom.pos.z)
            else:
                atom_is_present = False
                coords = (0, 0, 0)

            atoms.append(
                ParsedAtom(
                    name=atom_name,
                    element=ref_atom.GetAtomicNum(),
                    charge=ref_atom.GetFormalCharge(),
                    coords=coords,
                    conformer=ref_coords,
                    is_present=atom_is_present,
                    chirality=const.chirality_type_ids.get(
                        str(ref_atom.GetChiralTag()), unk_chirality
                    ),
                )
            )

        # For observed arginines, make NH1 the nitrogen closer to CD by
        # swapping the NH1/NH2 coordinates when NH1 is the farther one
        if (res is not None) and (res_name == "ARG"):
            arg_atom_names: list[str] = const.ref_atoms["ARG"]
            nh1_pos = arg_atom_names.index("NH1")
            nh2_pos = arg_atom_names.index("NH2")
            cd = atoms[arg_atom_names.index("CD")]
            nh1 = atoms[nh1_pos]
            nh2 = atoms[nh2_pos]

            cd_coords = np.array(cd.coords)
            nh1_coords = np.array(nh1.coords)
            nh2_coords = np.array(nh2.coords)

            if all(atom.is_present for atom in (cd, nh1, nh2)) and (
                np.linalg.norm(nh1_coords - cd_coords)
                > np.linalg.norm(nh2_coords - cd_coords)
            ):
                atoms[nh1_pos] = replace(nh1, coords=nh2.coords)
                atoms[nh2_pos] = replace(nh2, coords=nh1.coords)

        # Sequence number plus insertion code of the observed residue
        if res is not None:
            orig_idx = res.seqid
            orig_idx = str(orig_idx.num) + str(orig_idx.icode).strip()
        else:
            orig_idx = None

        atom_center = const.res_to_center_atom_id[res_name]
        atom_disto = const.res_to_disto_atom_id[res_name]
        parsed.append(
            ParsedResidue(
                name=res_name,
                type=const.token_ids[res_name],
                atoms=atoms,
                bonds=[],
                idx=j,
                atom_center=atom_center,
                atom_disto=atom_disto,
                is_standard=True,
                is_present=res is not None,
                orig_idx=orig_idx,
            )
        )

    return ParsedChain(
        name=chain_id,
        entity=entity,
        residues=parsed,
        type=chain_type,
        sequence=gemmi.one_letter_code(sequence),
    )
| |
|
| |
|
def parse_connection(
    connection: gemmi.Connection,
    chains: list[ParsedChain],
    subchain_map: dict[tuple[str, int], str],
) -> ParsedConnection:
    """Translate one gemmi Connection into parsed-chain indices.

    Both partners of the connection are resolved to a subchain name,
    then to the position of their residue within the parsed chain and
    the position of their atom within the parsed residue.

    Parameters
    ----------
    connection : gemmi.Connection
        The connection to parse.
    chains : List[ParsedChain]
        The parsed chains.
    subchain_map : dict[tuple[str, int], str]
        The mapping from (chain name, residue identifier) to subchain
        name. The residue identifier used for the lookup is the
        sequence number followed by the stripped insertion code.

    Returns
    -------
    ParsedConnection
        The parsed connection.

    """
    partners = (connection.partner1, connection.partner2)

    # Residue identifiers: sequence number plus insertion code
    residue_ids = []
    for partner in partners:
        seqid = partner.res_id.seqid
        residue_ids.append(str(seqid.num) + str(seqid.icode).strip())

    # Subchain of each partner
    subchains = []
    for partner, residue_id in zip(partners, residue_ids):
        subchains.append(subchain_map[(partner.chain_name, residue_id)])

    # Parsed chain carrying each subchain name
    partner_chains = []
    for subchain in subchains:
        partner_chains.append(
            next(candidate for candidate in chains if candidate.name == subchain)
        )

    # Position and record of each partner residue within its chain
    residue_hits = []
    for parsed_chain, residue_id in zip(partner_chains, residue_ids):
        residue_hits.append(
            next(
                (position, residue)
                for position, residue in enumerate(parsed_chain.residues)
                if residue.orig_idx == residue_id
            )
        )

    # Position of each partner atom within its residue
    atom_positions = []
    for partner, (_, residue) in zip(partners, residue_hits):
        atom_positions.append(
            next(
                position
                for position, atom in enumerate(residue.atoms)
                if atom.name == partner.atom_name
            )
        )

    return ParsedConnection(
        chain_1=subchains[0],
        chain_2=subchains[1],
        residue_index_1=residue_hits[0][0],
        residue_index_2=residue_hits[1][0],
        atom_index_1=atom_positions[0],
        atom_index_2=atom_positions[1],
    )
| |
|
| |
|
def parse_mmcif(
    path: str,
    components: dict[str, Mol],
    use_assembly: bool = True,
) -> ParsedStructure:
    """Parse a structure in MMCIF format.

    Parameters
    ----------
    path : str
        Path to the MMCIF file.
    components: dict[str, Mol]
        The preprocessed PDB components dictionary.
    use_assembly: bool
        Whether to use the first assembly.

    Returns
    -------
    ParsedStructure
        The parsed structure.

    Raises
    ------
    ValueError
        If no chain could be parsed.

    """
    # Held in a local for the whole call; presumably silences RDKit
    # logging while the object is alive -- TODO confirm
    blocker = rdBase.BlockLogs()  # noqa: F841

    # Read the first block of the file
    block = gemmi.cif.read(str(path))[0]

    # Entry metadata
    deposit_date, release_date, revision_date = get_dates(block)
    resolution = get_resolution(block)
    method = get_method(block)

    # Build the gemmi structure
    structure = gemmi.make_structure_from_block(block)

    # Clean up the structure
    structure.merge_chain_parts()
    structure.remove_waters()
    structure.remove_hydrogens()
    structure.remove_alternative_conformations()
    structure.remove_empty_chains()

    # Expand to the first assembly, if requested and available
    if use_assembly and structure.assemblies:
        how = gemmi.HowToNameCopiedChain.AddNumber
        assembly_name = structure.assemblies[0].name
        structure.transform_to_assembly(assembly_name, how=how)

    # Map every subchain to its entity and to the entity's position in
    # the structure's entity list, skipping water entities
    entities: dict[str, gemmi.Entity] = {}
    entity_ids: dict[str, int] = {}
    for entity_id, entity in enumerate(structure.entities):
        entity: gemmi.Entity
        if entity.entity_type.name == "Water":
            continue
        for subchain_id in entity.subchains:
            entities[subchain_id] = entity
            entity_ids[subchain_id] = entity_id

    # Map (chain name, sequence number + insertion code) to subchain;
    # connections identify their partners this way
    subchain_map = {}
    for chain in structure[0]:
        for residue in chain:
            seq_id = residue.seqid
            seq_id = str(seq_id.num) + str(seq_id.icode).strip()
            subchain_map[(chain.name, seq_id)] = residue.subchain

    # Ligand subchains that take part in covalent connections
    covalent_chain_ids = compute_covalent_ligands(
        connections=structure.connections,
        subchain_map=subchain_map,
        entities=entities,
    )

    # Parse the chains
    chains: list[ParsedChain] = []
    for raw_chain in structure[0].subchains():
        subchain_id = raw_chain.subchain_id()
        entity: gemmi.Entity = entities[subchain_id]
        entity_type = entity.entity_type.name

        if entity_type == "Polymer":
            # Only L-peptides, DNA and RNA are supported
            if entity.polymer_type.name not in {
                "PeptideL",
                "Dna",
                "Rna",
            }:
                continue

            parsed_polymer = parse_polymer(
                polymer=raw_chain,
                polymer_type=entity.polymer_type,
                sequence=entity.full_sequence,
                chain_id=subchain_id,
                entity=entity.name,
                components=components,
            )
            if parsed_polymer is not None:
                chains.append(parsed_polymer)

        elif entity_type in {"NonPolymer", "Branched"}:
            # Skip the whole chain if any ligand has no component entry
            if any(components.get(lig.name) is None for lig in raw_chain):
                continue

            # Branched entities are always treated as covalent; other
            # ligands only when they appear in a covalent connection
            if entity_type == "Branched":
                is_covalent = True
            else:
                is_covalent = subchain_id in covalent_chain_ids

            residues = []
            for lig_idx, ligand in enumerate(raw_chain):
                ligand: gemmi.Residue
                residue = parse_ccd_residue(
                    name=ligand.name,
                    components=components,
                    res_idx=lig_idx,
                    gemmi_mol=ligand,
                    is_covalent=is_covalent,
                )
                residues.append(residue)

            if residues:
                chains.append(
                    ParsedChain(
                        name=subchain_id,
                        entity=entity.name,
                        residues=residues,
                        type=const.chain_type_ids["NONPOLYMER"],
                        sequence=None,
                    )
                )

    # Nothing usable in the file
    if not chains:
        msg = "No chains parsed!"
        raise ValueError(msg)

    # Parse the covalent connections
    # NOTE(review): parse_connection looks up both partner chains among
    # the parsed chains; a covalent connection to a chain skipped above
    # appears to raise there -- confirm whether that is intended.
    connections: list[ParsedConnection] = []
    for connection in structure.connections:
        connection: gemmi.Connection
        if connection.type.name != "Covale":
            continue

        parsed_connection = parse_connection(
            connection=connection,
            chains=chains,
            subchain_map=subchain_map,
        )
        connections.append(parsed_connection)

    # Flat tables
    atom_data = []
    bond_data = []
    res_data = []
    chain_data = []
    connection_data = []

    # Running global offsets and lookup tables
    atom_idx = 0
    res_idx = 0
    sym_count = {}
    chain_to_idx = {}
    res_to_idx = {}

    for asym_id, chain in enumerate(chains):
        # Sizes of this chain
        res_num = len(chain.residues)
        atom_num = sum(len(res.atoms) for res in chain.residues)

        # sym_id counts the earlier chains of the same entity
        entity_id = entity_ids[chain.name]
        sym_id = sym_count.get(entity_id, 0)
        chain_data.append(
            (
                chain.name,
                chain.type,
                entity_id,
                sym_id,
                asym_id,
                atom_idx,
                atom_num,
                res_idx,
                res_num,
            )
        )
        chain_to_idx[chain.name] = asym_id
        sym_count[entity_id] = sym_id + 1

        for i, res in enumerate(chain.residues):
            # Center / distogram atoms as global atom indices
            atom_center = atom_idx + res.atom_center
            atom_disto = atom_idx + res.atom_disto
            res_data.append(
                (
                    res.name,
                    res.type,
                    res.idx,
                    atom_idx,
                    len(res.atoms),
                    atom_center,
                    atom_disto,
                    res.is_standard,
                    res.is_present,
                )
            )
            # Global residue index and first-atom offset, for connections
            res_to_idx[(chain.name, i)] = (res_idx, atom_idx)

            # Bonds are stored with global atom indices
            for bond in res.bonds:
                atom_1 = atom_idx + bond.atom_1
                atom_2 = atom_idx + bond.atom_2
                bond_data.append((atom_1, atom_2, bond.type))

            for atom in res.atoms:
                atom_data.append(
                    (
                        convert_atom_name(atom.name),
                        atom.element,
                        atom.charge,
                        atom.coords,
                        atom.conformer,
                        atom.is_present,
                        atom.chirality,
                    )
                )
                atom_idx += 1

            res_idx += 1

    # Convert the connections to global indices
    for conn in connections:
        chain_1_idx = chain_to_idx[conn.chain_1]
        chain_2_idx = chain_to_idx[conn.chain_2]
        res_1_idx, atom_1_offset = res_to_idx[(conn.chain_1, conn.residue_index_1)]
        res_2_idx, atom_2_offset = res_to_idx[(conn.chain_2, conn.residue_index_2)]
        atom_1_idx = atom_1_offset + conn.atom_index_1
        atom_2_idx = atom_2_offset + conn.atom_index_2
        connection_data.append(
            (
                chain_1_idx,
                chain_2_idx,
                res_1_idx,
                res_2_idx,
                atom_1_idx,
                atom_2_idx,
            )
        )

    # Convert the tables to numpy arrays
    atoms = np.array(atom_data, dtype=Atom)
    bonds = np.array(bond_data, dtype=Bond)
    residues = np.array(res_data, dtype=Residue)
    chains = np.array(chain_data, dtype=Chain)
    connections = np.array(connection_data, dtype=Connection)
    mask = np.ones(len(chain_data), dtype=bool)

    # Chain-chain interfaces
    interfaces = compute_interfaces(atoms, chains)

    info = StructureInfo(
        deposited=deposit_date,
        revised=revision_date,
        released=release_date,
        resolution=resolution,
        method=method,
        num_chains=len(chains),
        num_interfaces=len(interfaces),
    )

    data = Structure(
        atoms=atoms,
        bonds=bonds,
        residues=residues,
        chains=chains,
        connections=connections,
        interfaces=interfaces,
        mask=mask,
    )

    return ParsedStructure(data=data, info=info, covalents=[])
| |
|