add boundmolecule reader

roshan · roshan · commit c8f767224bf7 · 2026-02-28T14:12:33.000Z
diff --git a/pdbeccdutils/core/bm_reader.py b/pdbeccdutils/core/bm_reader.py
@@ -0,0 +1,398 @@
+import os
+from gemmi import cif
+from collections import namedtuple
+import rdkit
+from pdbeccdutils.core import ccd_reader, models
+from rdkit.Chem.rdMolDescriptors import CalcMolFormula
+from pdbeccdutils.core.boundmolecule import infer_bound_molecules
+from pdbeccdutils.core.component import Component
+from networkx import MultiDiGraph
+from pdbeccdutils.utils import config
+from pdbeccdutils.helpers import cif_tools, conversions, mol_tools, helper
+from pdbeccdutils.core.models import (
+    CCDProperties,
+    ConformerType,
+    ReleaseStatus,
+    Descriptor,
+)
+
+# CCDReaderResult's fields followed by the source "bound_molecule".
+BMReaderResult = namedtuple(
+    "BMReaderResult", [*ccd_reader.CCDReaderResult._fields, "bound_molecule"])
+
+def get_boundmolecules(
+    path_to_cif: str,
+    to_discard: set[str] = config.DISCARDED_RESIDUES,
+    sanitize: bool = True,
+    assembly: bool = False,
+) -> list[BMReaderResult]:
+    """
+    Read in single wwPDB Model CIF and build a Component for each of its
+    bound-molecules. They get the IDs "bm1", "bm2", ... in inferred order.
+
+    Args:
+        path_to_cif (str): Path to the cif file
+        to_discard (set[str]): Passed on to infer_bound_molecules
+            [Defaults: config.DISCARDED_RESIDUES]
+        sanitize (bool): Passed on to infer_chem_comp [Defaults: True]
+        assembly (bool): Passed on to infer_bound_molecules [Defaults: False]
+
+    Raises:
+        ValueError: if file does not exist
+
+    Returns:
+        A list of BMReaderResult representations of each bound-molecule.
+    """
+    if not os.path.isfile(path_to_cif):
+        raise ValueError(f"File '{path_to_cif}' does not exists")
+
+    molecules = infer_bound_molecules(path_to_cif, to_discard, assembly)
+    results = (
+        infer_chem_comp(path_to_cif, molecule, f"bm{number}", sanitize)
+        for number, molecule in enumerate(molecules, start=1)
+    )
+    return [result for result in results if result]
+
+
+def infer_chem_comp(path_to_cif, bm, bm_id, sanitize=True):
+    """Build a Component for one bound-molecule of a model CIF file.
+
+    The file is read again on every call. The atoms selected through
+    bm.graph become an RDKit molecule, for which SMILES, InChI and InChIKey
+    descriptors are generated.
+
+    Args:
+        path_to_cif: Path to input structure
+        bm: bound-molecules identified from input structure
+        bm_id: ID of bound-molecule
+        sanitize: True if bound-molecule need to be sanitized
+
+    Returns:
+        BMReaderResult: Namedtuple containing Component representation of
+            bound-molecule, the warnings and errors collected on the way,
+            the sanitization status and bm itself.
+    """
+    cif_block = cif.read(path_to_cif).sole_block()
+    mol, warnings, errors = _parse_pdb_mmcif(cif_block, bm.graph)
+
+    sanitized = False
+    if sanitize:
+        sanitized_result = mol_tools.sanitize(mol)
+        mol, sanitized = sanitized_result.mol, sanitized_result.status
+
+    inchi_result = mol_tools.inchi_from_mol(mol)
+    if inchi_result.warnings:
+        warnings.append(inchi_result.warnings)
+    if inchi_result.errors:
+        errors.append(inchi_result.errors)
+
+    inchikey = mol_tools.inchikey_from_inchi(inchi_result.inchi)
+    # All three descriptors share the same program and program version.
+    descriptor_values = (
+        ("SMILES", rdkit.Chem.MolToSmiles(mol)),
+        ("InChI", inchi_result.inchi),
+        ("InChIKey", inchikey),
+    )
+    descriptors = [
+        Descriptor(
+            type=kind,
+            program="rdkit",
+            program_version=rdkit.__version__,
+            value=value,
+        )
+        for kind, value in descriptor_values
+    ]
+
+    # Anything reported while parsing or generating the InChI puts the
+    # component on HOLD; only a run without warnings and errors gets REL.
+    if warnings or errors:
+        pdbx_release_status = ReleaseStatus.HOLD
+    else:
+        pdbx_release_status = ReleaseStatus.REL
+
+    properties = CCDProperties(
+        id=bm_id,
+        name=mol_tools.rdkit_object_property(mol, "name"),
+        formula=CalcMolFormula(mol),
+        modified_date=None,
+        pdbx_release_status=pdbx_release_status,
+        weight="",
+    )
+
+    comp = Component(mol, None, properties, descriptors)
+
+    return BMReaderResult(
+        warnings=warnings,
+        errors=errors,
+        component=comp,
+        bound_molecule=bm,
+        sanitized=sanitized,
+    )
+
+def _parse_pdb_mmcif(
+    cif_block: cif.Block, bm: MultiDiGraph
+) -> tuple[rdkit.Chem.rdchem.Mol, list[str], list[str]]:
+    """
+    Create internal representation of the molecule from mmcif format.
+
+    Args:
+        cif_block (cif.Block): mmcif block object from gemmi
+        bm (MultiDiGraph): graph of the bound-molecule; nodes are its
+            residues and edges the connections between them.
+
+    Returns:
+        tuple: the RDKit molecule, the list of warnings and the list
+            of errors collected while parsing.
+    """
+    warnings = []
+    errors = []
+    mol = rdkit.Chem.RWMol()
+
+    w = cif_tools.validate_mm_cif_categories(cif_block)
+    if w:
+        warnings.append(w)
+
+    bm_atoms = _get_boundmolecule_atoms(cif_block, bm)
+
+    _parse_pdb_atoms(mol, bm_atoms)
+    _parse_pdb_conformers(mol, bm_atoms)
+    _parse_pdb_bonds(mol, bm, cif_block, errors)
+    _add_connections(mol, bm, errors)
+    _parse_pdb_entity(mol, bm, cif_block)
+    mol = _handle_hydrogens(mol)
+    return (mol, warnings, errors)
+
+
+def _get_boundmolecule_atoms(cif_block, bm):
+    """Collect _atom_site rows of HETATM atoms that belong to residues of bm.
+
+    Returns a dict mapping every _atom_site item to the selected values, or
+    None when the block has no _atom_site category.
+    """
+    if "_atom_site." not in cif_block.get_mmcif_category_names():
+        return
+
+    atoms = cif_block.get_mmcif_category("_atom_site.")
+    # Hash the residue identifiers once instead of rescanning them per atom.
+    wanted = {(r.name, r.chain, r.res_id) for r in bm.nodes()}
+    keys = zip(atoms["label_comp_id"], atoms["auth_asym_id"], atoms["auth_seq_id"])
+    rows = [
+        i for i, (group, key) in enumerate(zip(atoms["group_PDB"], keys))
+        if group == "HETATM" and key in wanted
+    ]
+    return {item: [values[i] for i in rows] for item, values in atoms.items()}
+
+
+def _parse_pdb_atoms(mol: rdkit.Chem.rdchem.Mol, atoms: dict[str, list[str]]):
+    """Setup atoms of bound-molecules in the Component
+
+    Every atom gets the properties "name" (element symbol followed by its
+    row index), "component_atom_id" (label_atom_id) and "residue_id"
+    (auth_asym_id + auth_seq_id + insertion code).
+
+    Args:
+        mol: Rdkit Mol object of bound-molecule
+        atoms: atoms of bound-molecules; nothing is added when empty or None
+    """
+    if not atoms:
+        return
+
+    for i, atom_id in enumerate(atoms["label_atom_id"]):
+        chain = atoms["auth_asym_id"][i]
+        res_id = atoms["auth_seq_id"][i]
+        ins_code = atoms["pdbx_PDB_ins_code"][i] or ""
+
+        # Normalise the symbol case, e.g. "CL" -> "Cl".
+        element = atoms["type_symbol"][i].capitalize()
+        isotope = None
+        if element == "D":
+            # Deuterium is represented as hydrogen with isotope 2.
+            element, isotope = "H", 2
+        elif element == "X":
+            element = "*"
+
+        atom = rdkit.Chem.Atom(element)
+        atom.SetProp("name", f"{element}{i}")
+        atom.SetProp("component_atom_id", atom_id)
+        atom.SetProp("residue_id", f"{chain}{res_id}{ins_code}")
+        if isotope is not None:
+            atom.SetIsotope(isotope)
+
+        # auth_seq_id is not guaranteed to be numeric (mmCIF dictionary);
+        # here it is only stored as text inside "residue_id".
+        res_info = rdkit.Chem.AtomPDBResidueInfo()
+        res_info.SetResidueName(atoms["label_comp_id"][i])
+        res_info.SetIsHeteroAtom(True)
+        atom.SetMonomerInfo(res_info)
+        mol.AddAtom(atom)
+
+
+def _parse_pdb_conformers(mol: rdkit.Chem.rdchem.Mol, atoms: dict[str, list[str]]):
+    """Attach the model coordinates of the bound-molecule as a conformer.
+
+    Nothing happens when atoms is empty or None.
+
+    Args:
+        mol: RDKit Mol object of bound-molecule
+        atoms: atoms of bound-molecule
+    """
+    if atoms:
+        conformer = _setup_pdb_conformer(atoms)
+        mol.AddConformer(conformer, assignId=True)
+
+
+def _setup_pdb_conformer(atoms):
+    """Build the ConformerType.Model conformer from Cartn_x/y/z; None if no atoms."""
+    if not atoms:
+        return
+
+    axes = ("Cartn_x", "Cartn_y", "Cartn_z")
+    n_atoms = len(atoms["id"])
+    model = rdkit.Chem.Conformer(n_atoms)
+    for idx in range(n_atoms):
+        xyz = [conversions.str_to_float(atoms[axis][idx]) for axis in axes]
+        model.SetAtomPosition(idx, rdkit.Chem.rdGeometry.Point3D(*xyz))
+
+    model.SetProp("name", ConformerType.Model.name)
+    return model
+
+
+def _parse_pdb_bonds(
+    mol: rdkit.Chem.rdchem.Mol,
+    bm: MultiDiGraph,
+    cif_block: cif.Block,
+    errors: list[str],
+):
+    """Setup bonds in the rdkit Mol object
+
+    Args:
+        mol: RDKit Mol object of bound-molecule
+        bm: bound-molecule
+        cif_block: mmcif block providing _atom_site and _chem_comp_bond
+        errors: list of errors encountered while parsing.
+    """
+    categories = cif_block.get_mmcif_category_names()
+    if "_atom_site." not in categories or "_chem_comp_bond." not in categories:
+        return
+    # One _chem_comp_bond lookup per CCD ID, shared by residues of that type.
+    ccd_ids = {residue.name for residue in bm.nodes()}
+    bond_tables = {name: get_chem_comp_bonds(cif_block, name) for name in ccd_ids}
+    for residue in bm.nodes():
+        bonds = bond_tables[residue.name]
+        rows = zip(bonds.atom_id_1, bonds.atom_id_2, bonds.value_order)
+        for atom_1, atom_2, order in rows:  # names are bound before the try
+            try:
+                idx_1 = helper.find_atom_index(mol, residue.id, atom_1)
+                idx_2 = helper.find_atom_index(mol, residue.id, atom_2)
+                bond_order = helper.bond_pdb_order(order)
+                if idx_1 is not None and idx_2 is not None:
+                    mol.AddBond(idx_1, idx_2, bond_order)
+            except ValueError:
+                errors.append(
+                    f"Error perceiving {atom_1} - {atom_2} bond from _chem_comp_bond"
+                )
+            except RuntimeError:
+                errors.append(f"Duplicit bond {atom_1} - {atom_2}")
+
+
+def _add_connections(
+    mol: rdkit.Chem.rdchem.Mol, bm: MultiDiGraph, errors: list[str]
+) -> None:
+    """Add bonds between CCDs in the bound-molecule
+
+    Args:
+        mol: RDKit Mol object of bound-molecule
+        bm: bound-molecule graph; each edge carries atom_id_1 and atom_id_2
+        errors: list of errors encountered while parsing; ValueError and
+            RuntimeError failures are appended here instead of being raised
+    """
+    for residue_1, residue_2, atoms in bm.edges(data=True):
+        # Read both atom names up front so the error messages can use them.
+        atom_1, atom_2 = atoms["atom_id_1"], atoms["atom_id_2"]
+        try:
+            idx_1 = helper.find_atom_index(mol, residue_1.id, atom_1)
+            idx_2 = helper.find_atom_index(mol, residue_2.id, atom_2)
+            bond_order = helper.bond_pdb_order("SING")
+            if idx_1 is not None and idx_2 is not None:
+                mol.AddBond(idx_1, idx_2, bond_order)
+        except ValueError:
+            errors.append(
+                f"Error perceiving {atom_1} - {atom_2} bond from Boundmolecule connections"
+            )
+        except RuntimeError:
+            errors.append(f"Duplicit bond {atom_1} - {atom_2}")
+
+
+def _parse_pdb_entity(mol, bm, cif_block):
+    """Name mol after _entity.pdbx_description when bm spans a single entity."""
+    entity_ids = {residue.ent_id for residue in bm.nodes()}
+    if len(entity_ids) != 1 or "_entity." not in cif_block.get_mmcif_category_names():
+        return
+
+    (entity_id,) = entity_ids
+    for row in cif_block.find("_entity.", ["id", "pdbx_description"]):
+        if cif.as_string(row[0]) == entity_id:  # columns: 0=id, 1=pdbx_description
+            mol.SetProp("name", cif.as_string(row[1]))
+
+
+def get_chem_comp_bonds(cif_block: cif.Block, residue: str):
+    """Collect the _chem_comp_bond rows that describe one chemical component.
+
+    Only the first contiguous run of rows whose comp_id equals residue is
+    read; scanning stops at the first other comp_id that follows it.
+
+    Args:
+        cif_block: gemmi.cif.Block object of protein mmCIF file
+        residue: CCD ID
+
+    Returns:
+        ResidueBonds namedtuple (residue, atom_id_1, atom_id_2, value_order)
+        with parallel lists, or None if _chem_comp_bond is absent.
+    """
+    if "_chem_comp_bond." not in cif_block.get_mmcif_category_names():
+        return
+
+    ResidueBonds = namedtuple("ResidueBonds", "residue atom_id_1 atom_id_2 value_order")
+    table = cif_block.get_mmcif_category("_chem_comp_bond.")
+    picked = ResidueBonds(residue, [], [], [])
+    for row, comp_id in enumerate(table["comp_id"]):
+        if comp_id == residue:
+            picked.atom_id_1.append(table["atom_id_1"][row])
+            picked.atom_id_2.append(table["atom_id_2"][row])
+            picked.value_order.append(table["value_order"][row])
+        elif picked.atom_id_1:
+            # The run of rows for this residue has ended.
+            break
+
+    return picked
+
+
+def _handle_hydrogens(mol):
+    """
+    Returns a rdkit.Chem.rdchem.Mol with its hydrogens rebuilt by RDKit
+
+    Existing hydrogens are removed and new ones are added with AddHs; each
+    new hydrogen gets a "name" and the "residue_id" of its bonded atom.
+
+    Args:
+        mol: Rdkit Mol object
+    """
+    hydrogens = [a.GetIdx() for a in mol.GetAtoms() if a.GetAtomicNum() == 1]
+    # Delete from the highest index down so pending indices stay valid.
+    for index in sorted(hydrogens, reverse=True):
+        mol.RemoveAtom(index)
+
+    mol.UpdatePropertyCache(strict=False)
+    mol = rdkit.Chem.AddHs(mol, addCoords=True, addResidueInfo=True)
+    # GetConformer() raises on a molecule without conformers.
+    conformer = mol.GetConformer() if mol.GetNumConformers() else None
+    for atom in mol.GetAtoms():
+        if atom.GetAtomicNum() != 1:
+            continue
+        atom.SetProp("name", f"{atom.GetSymbol()}{atom.GetIdx()}")
+        if conformer is not None:
+            mol_tools.correct_atom_coords(conformer, atom.GetIdx())
+        for bond in atom.GetBonds():
+            neighbour = bond.GetOtherAtom(atom)
+            atom.SetProp("residue_id", neighbour.GetProp("residue_id"))
+    return mol