Skip to content

Commit c8f7672

Browse files
author
roshan
committed
add boundmolecule reader
1 parent d37db07 commit c8f7672

1 file changed

Lines changed: 398 additions & 0 deletions

File tree

pdbeccdutils/core/bm_reader.py

Lines changed: 398 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,398 @@
1+
import os
2+
from gemmi import cif
3+
from collections import namedtuple
4+
import rdkit
5+
from pdbeccdutils.core import ccd_reader, models
6+
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
7+
from pdbeccdutils.core.boundmolecule import infer_bound_molecules
8+
from pdbeccdutils.core.component import Component
9+
from networkx import MultiDiGraph
10+
from pdbeccdutils.utils import config
11+
from pdbeccdutils.helpers import cif_tools, conversions, mol_tools, helper
12+
from pdbeccdutils.core.models import (
13+
CCDProperties,
14+
ConformerType,
15+
ReleaseStatus,
16+
Descriptor,
17+
)
18+
19+
BMReaderResult = namedtuple(
20+
"BMReaderResult", ccd_reader.CCDReaderResult._fields + ("bound_molecule",)
21+
)
22+
23+
def get_boundmolecules(
24+
path_to_cif: str,
25+
to_discard: set[str] = config.DISCARDED_RESIDUES,
26+
sanitize: bool = True,
27+
assembly: bool = False,
28+
) -> list[BMReaderResult]:
29+
"""
30+
Read in single wwPDB Model CIF and create internal
31+
representation of its bound-molecules.
32+
33+
Args:
34+
path_to_cif (str): Path to the cif file
35+
sanitize (bool): [Defaults: True]
36+
37+
Raises:
38+
ValueError: if file does not exist
39+
40+
Returns:
41+
A list of BMReaderResult representations of each bound-molecule.
42+
"""
43+
if not os.path.isfile(path_to_cif):
44+
raise ValueError(f"File '{path_to_cif}' does not exists")
45+
46+
biomolecule_result = []
47+
bms = infer_bound_molecules(path_to_cif, to_discard, assembly)
48+
for i, bm in enumerate(bms, start=1):
49+
bm_id = f"bm{i}"
50+
reader_result = infer_chem_comp(path_to_cif, bm, bm_id, sanitize)
51+
if reader_result:
52+
biomolecule_result.append(reader_result)
53+
54+
return biomolecule_result
55+
56+
57+
def infer_chem_comp(path_to_cif, bm, bm_id, sanitize=True):
58+
"""Args:
59+
path_to_cif: Path to input structure
60+
bm: bound-molecules identified from input structure
61+
bm_id: ID of bound-molecule
62+
sanitize: True if bound-molecule need to be sanitized
63+
64+
Returns:
65+
BMReaderResult: Namedtuple containing Component representation of bound-molecule
66+
67+
"""
68+
69+
cif_block = cif.read(path_to_cif).sole_block()
70+
(mol, warnings, errors) = _parse_pdb_mmcif(cif_block, bm.graph)
71+
sanitized = False
72+
if sanitize:
73+
sanitized_result = mol_tools.sanitize(mol)
74+
mol, sanitized = sanitized_result.mol, sanitized_result.status
75+
76+
inchi_result = mol_tools.inchi_from_mol(mol)
77+
if inchi_result.warnings:
78+
warnings.append(inchi_result.warnings)
79+
if inchi_result.errors:
80+
errors.append(inchi_result.errors)
81+
82+
inchikey = mol_tools.inchikey_from_inchi(inchi_result.inchi)
83+
descriptors = [
84+
Descriptor(
85+
type="SMILES",
86+
program="rdkit",
87+
program_version=rdkit.__version__,
88+
value=rdkit.Chem.MolToSmiles(mol),
89+
),
90+
Descriptor(
91+
type="InChI",
92+
program="rdkit",
93+
program_version=rdkit.__version__,
94+
value=inchi_result.inchi,
95+
),
96+
Descriptor(
97+
type="InChIKey",
98+
program="rdkit",
99+
program_version=rdkit.__version__,
100+
value=inchikey,
101+
),
102+
]
103+
104+
# define release category based on warnings or errors
105+
pdbx_release_status = models.ReleaseStatus.NOT_SET
106+
if warnings or errors:
107+
pdbx_release_status = models.ReleaseStatus.HOLD
108+
else:
109+
pdbx_release_status = models.ReleaseStatus.REL
110+
111+
properties = CCDProperties(
112+
id=bm_id,
113+
name=mol_tools.rdkit_object_property(mol, "name"),
114+
formula=CalcMolFormula(mol),
115+
modified_date=None,
116+
pdbx_release_status=pdbx_release_status,
117+
weight="",
118+
)
119+
120+
comp = Component(mol, None, properties, descriptors)
121+
122+
reader_result = BMReaderResult(
123+
warnings=warnings,
124+
errors=errors,
125+
component=comp,
126+
bound_molecule=bm,
127+
sanitized=sanitized,
128+
)
129+
130+
return reader_result
131+
132+
def _parse_pdb_mmcif(
133+
cif_block: cif.Block, bm: models.BoundMolecule
134+
) -> tuple[rdkit.Chem.rdchem.Mol, list[str], list[str]]:
135+
"""
136+
Create internal representation of the molecule from mmcif format.
137+
138+
Args:
139+
cif_block (cif.Block): mmcif block object from gemmi
140+
sanitize (bool): Whether or not the rdkit component should
141+
be sanitized. Defaults to True.
142+
143+
Returns:
144+
CCDReaderResult: internal representation with the results
145+
of parsing and Mol object.
146+
"""
147+
warnings = []
148+
errors = []
149+
mol = rdkit.Chem.RWMol()
150+
151+
w = cif_tools.validate_mm_cif_categories(cif_block)
152+
if w:
153+
warnings.append(w)
154+
155+
bm_atoms = _get_boundmolecule_atoms(cif_block, bm)
156+
157+
_parse_pdb_atoms(mol, bm_atoms)
158+
_parse_pdb_conformers(mol, bm_atoms)
159+
_parse_pdb_bonds(mol, bm, cif_block, errors)
160+
_add_connections(mol, bm, errors)
161+
_parse_pdb_entity(mol, bm, cif_block)
162+
mol = _handle_hydrogens(mol)
163+
return (mol, warnings, errors)
164+
165+
166+
def _get_boundmolecule_atoms(cif_block, bm):
167+
if "_atom_site." not in cif_block.get_mmcif_category_names():
168+
return
169+
170+
atoms = cif_block.get_mmcif_category("_atom_site.")
171+
bm_atoms = {key: [] for key in atoms}
172+
for i in range(len(atoms["id"])):
173+
if atoms["group_PDB"][i] == "HETATM":
174+
for residue in bm.nodes():
175+
if (
176+
atoms["label_comp_id"][i] == residue.name
177+
and atoms["auth_asym_id"][i] == residue.chain
178+
and atoms["auth_seq_id"][i] == residue.res_id
179+
):
180+
for key in bm_atoms:
181+
bm_atoms[key].append(atoms[key][i])
182+
183+
return bm_atoms
184+
185+
186+
def _parse_pdb_atoms(mol: rdkit.Chem.rdchem.Mol, atoms: dict[str, list[str]]):
187+
"""Setup atoms of bound-molecules in the Component
188+
189+
Args:
190+
mol: Rdkit Mol object of bound-molecule
191+
atoms: atoms of bound-molecules
192+
"""
193+
194+
for i in range(len(atoms["id"])):
195+
atom_id = atoms["label_atom_id"][i]
196+
chain = atoms["auth_asym_id"][i]
197+
res_name = atoms["label_comp_id"][i]
198+
res_id = atoms["auth_seq_id"][i]
199+
ins_code = (
200+
"" if not atoms["pdbx_PDB_ins_code"][i] else atoms["pdbx_PDB_ins_code"][i]
201+
)
202+
residue_id = f"{chain}{res_id}{ins_code}"
203+
element = atoms["type_symbol"][i]
204+
element = element if len(element) == 1 else element[0] + element[1].lower()
205+
isotope = None
206+
if element == "D":
207+
element = "H"
208+
isotope = 2
209+
elif element == "X":
210+
element = "*"
211+
212+
atom_name = f"{element}{i}"
213+
atom = rdkit.Chem.Atom(element)
214+
atom.SetProp("name", atom_name)
215+
atom.SetProp("component_atom_id", atom_id)
216+
atom.SetProp("residue_id", residue_id)
217+
# _atom_site.auth_seq_id is not necessary to be a number (https://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Items/_atom_site.auth_seq_id.html)
218+
219+
res_info = rdkit.Chem.AtomPDBResidueInfo()
220+
res_info.SetResidueName(res_name)
221+
res_info.SetIsHeteroAtom(True)
222+
atom.SetMonomerInfo(res_info)
223+
224+
if isotope is not None:
225+
atom.SetIsotope(isotope)
226+
227+
mol.AddAtom(atom)
228+
229+
230+
def _parse_pdb_conformers(mol: rdkit.Chem.rdchem.Mol, atoms: dict[str, list[str]]):
231+
"""Setup model cooordinates in the rdkit Mol object.
232+
233+
Args:
234+
mol: RDKit Mol object of bound-molecule
235+
atoms: atoms of bound-molecule
236+
"""
237+
if not atoms:
238+
return
239+
240+
model = _setup_pdb_conformer(atoms)
241+
mol.AddConformer(model, assignId=True)
242+
243+
244+
def _setup_pdb_conformer(atoms):
245+
if not atoms:
246+
return
247+
248+
conformer = rdkit.Chem.Conformer(len(atoms["id"]))
249+
for i in range(len(atoms["id"])):
250+
x = conversions.str_to_float(atoms["Cartn_x"][i])
251+
y = conversions.str_to_float(atoms["Cartn_y"][i])
252+
z = conversions.str_to_float(atoms["Cartn_z"][i])
253+
atom_position = rdkit.Chem.rdGeometry.Point3D(x, y, z)
254+
conformer.SetAtomPosition(i, atom_position)
255+
256+
conformer.SetProp("name", ConformerType.Model.name)
257+
return conformer
258+
259+
260+
def _parse_pdb_bonds(
261+
mol: rdkit.Chem.rdchem.Mol,
262+
bm: MultiDiGraph,
263+
cif_block: cif.Block,
264+
errors: list[str],
265+
):
266+
"""Setup bonds in the rdkit Mol object
267+
268+
Args:
269+
mol: RDKit Mol object of bound-molecule
270+
bm: bound-molecule
271+
errors: list of errors encountered while parsing.
272+
"""
273+
if (
274+
"_atom_site." not in cif_block.get_mmcif_category_names()
275+
or "_chem_comp_bond." not in cif_block.get_mmcif_category_names()
276+
):
277+
return
278+
279+
for residue in bm.nodes():
280+
resiude_bonds = get_chem_comp_bonds(cif_block, residue.name)
281+
for i in range(len(resiude_bonds.atom_id_1)):
282+
try:
283+
atom_1 = resiude_bonds.atom_id_1[i]
284+
mol_atom_1_idx = helper.find_atom_index(mol, residue.id, atom_1)
285+
atom_2 = resiude_bonds.atom_id_2[i]
286+
mol_atom_2_idx = helper.find_atom_index(mol, residue.id, atom_2)
287+
bond_order = helper.bond_pdb_order(resiude_bonds.value_order[i])
288+
if (mol_atom_1_idx is not None) and (mol_atom_2_idx is not None):
289+
mol.AddBond(mol_atom_1_idx, mol_atom_2_idx, bond_order)
290+
except ValueError:
291+
errors.append(
292+
f"Error perceiving {atom_1} - {atom_2} bond from _chem_comp_bond"
293+
)
294+
except RuntimeError:
295+
errors.append(f"Duplicit bond {atom_1} - {atom_2}")
296+
297+
298+
def _add_connections(
299+
mol: rdkit.Chem.rdchem.Mol, bm: MultiDiGraph, errors: list[str]
300+
) -> None:
301+
"""Add bonds between CCDs in the bound-molecule
302+
303+
Args:
304+
mol: RDKit Mol object of bound-molecule
305+
bm: bound-molecule
306+
errors: list of errors encountered while parsing
307+
308+
"""
309+
for residue_1, residue_2, atoms in bm.edges(data=True):
310+
try:
311+
atom_1 = atoms["atom_id_1"]
312+
mol_atom_1_idx = helper.find_atom_index(mol, residue_1.id, atom_1)
313+
atom_2 = atoms["atom_id_2"]
314+
mol_atom_2_idx = helper.find_atom_index(mol, residue_2.id, atom_2)
315+
bond_order = helper.bond_pdb_order("SING")
316+
if (mol_atom_1_idx is not None) and (mol_atom_2_idx is not None):
317+
mol.AddBond(mol_atom_1_idx, mol_atom_2_idx, bond_order)
318+
except ValueError:
319+
errors.append(
320+
f"Error perceiving {atom_1} - {atom_2} bond from Boundmolecule connections"
321+
)
322+
except RuntimeError:
323+
errors.append(f"Duplicit bond {atom_1} - {atom_2}")
324+
325+
326+
def _parse_pdb_entity(mol, bm, cif_block):
327+
if "_entity." not in cif_block.get_mmcif_category_names():
328+
return
329+
330+
entities = cif_block.find("_entity.", ["id", "pdbx_description"])
331+
bm_entities = list({residue.ent_id for residue in bm.nodes()})
332+
if len(bm_entities) == 1:
333+
for row in entities:
334+
if cif.as_string(row["_entity.id"]) == bm_entities[0]:
335+
mol.SetProp("name", cif.as_string(row["pdbx_description"]))
336+
337+
338+
def get_chem_comp_bonds(cif_block: cif.Block, residue: str):
339+
"""Returns _chem_comp_bond associated with a residue
340+
341+
Args:
342+
cif_block: gemmi.cif.Block object of protein mmCIF file
343+
residue: CCD ID
344+
"""
345+
346+
if "_chem_comp_bond." not in cif_block.get_mmcif_category_names():
347+
return
348+
chem_comp_bonds = cif_block.get_mmcif_category("_chem_comp_bond.")
349+
ResidueBonds = namedtuple("ResidueBonds", "residue atom_id_1 atom_id_2 value_order")
350+
atom_id_1 = []
351+
atom_id_2 = []
352+
value_order = []
353+
last_comp_id = None
354+
residue_found = False
355+
for i in range(len(chem_comp_bonds["comp_id"])):
356+
chem_comp_id = chem_comp_bonds["comp_id"][i]
357+
if chem_comp_id == residue:
358+
residue_found = True
359+
atom_id_1.append(chem_comp_bonds["atom_id_1"][i])
360+
atom_id_2.append(chem_comp_bonds["atom_id_2"][i])
361+
value_order.append(chem_comp_bonds["value_order"][i])
362+
last_comp_id = chem_comp_id
363+
if last_comp_id != residue and residue_found:
364+
break
365+
366+
residue_bonds = ResidueBonds(residue, atom_id_1, atom_id_2, value_order)
367+
return residue_bonds
368+
369+
370+
def _handle_hydrogens(mol):
371+
"""
372+
Returns a rdkit.Chem.rdchem.Mol after adding hydrogens
373+
374+
Args:
375+
mol: Rdkit Mol object
376+
"""
377+
378+
hydrogen_indices = [
379+
atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomicNum() == 1
380+
]
381+
382+
hydrogen_indices.sort(reverse=True)
383+
for index in hydrogen_indices:
384+
mol.RemoveAtom(index)
385+
386+
mol.UpdatePropertyCache(strict=False)
387+
mol = rdkit.Chem.AddHs(mol, addCoords=True, addResidueInfo=True)
388+
conformer = mol.GetConformer()
389+
for atom in mol.GetAtoms():
390+
if atom.GetAtomicNum() == 1:
391+
atom_id = atom.GetSymbol() + str(atom.GetIdx())
392+
atom.SetProp("name", atom_id)
393+
mol_tools.correct_atom_coords(conformer, atom.GetIdx())
394+
for bond in atom.GetBonds():
395+
other = bond.GetOtherAtom(atom)
396+
residue_id = other.GetProp("residue_id")
397+
atom.SetProp("residue_id", residue_id)
398+
return mol

0 commit comments

Comments
 (0)