217 lines
6.4 KiB
Python
217 lines
6.4 KiB
Python
# pdb spec
|
|
# https://www.biostat.jhsph.edu/~iruczins/teaching/260.655/links/pdbformat.pdf
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import sys
|
|
import logging
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class PDBEntry:
|
|
id: str
|
|
|
|
|
|
@dataclass
|
|
class Atom(PDBEntry):
|
|
type = "ATOM"
|
|
|
|
residue_name: str
|
|
|
|
position: Point3D
|
|
|
|
_row: str
|
|
|
|
@classmethod
|
|
def parse(cls, row: str) -> Atom:
|
|
return cls(
|
|
id=row[6:11].strip(),
|
|
residue_name=row[17:20].strip(),
|
|
position=Point3D(
|
|
x=float(row[30:38]), y=float(row[39:46]), z=float(row[46:54])
|
|
),
|
|
_row=row,
|
|
)
|
|
|
|
def __str__(self) -> str:
|
|
return self._row
|
|
|
|
|
|
class Hetatm(Atom):
|
|
type = "HETATM"
|
|
|
|
|
|
@dataclass
|
|
class Point3D:
|
|
x: float
|
|
y: float
|
|
z: float
|
|
|
|
|
|
@dataclass
|
|
class WaterAtomsCategorizedByDistance:
|
|
protein_atoms: list[Atom]
|
|
close_water_atoms: list[Atom]
|
|
far_water_atoms: list[Atom]
|
|
|
|
|
|
class AtomCategorizer:
|
|
def __init__(
|
|
self, protein_pdb_file_path: str, maximum_distance_from_protein: float
|
|
):
|
|
self.protein_pdb_file_path = protein_pdb_file_path
|
|
self.maximum_distance_from_protein = maximum_distance_from_protein
|
|
|
|
def categorize_water_atoms(self) -> WaterAtomsCategorizedByDistance:
|
|
pdb_entries = self._read_protein_pdb()
|
|
protein_atoms: list[Atom] = self._get_protein_atoms(pdb_entries=pdb_entries)
|
|
water_atoms: list[Atom] = self._get_water_atoms(pdb_entries=pdb_entries)
|
|
|
|
logger.info(f"Total amount of pdb entries: '{len(pdb_entries)}'")
|
|
logger.info(f"Amount of protein's atoms: '{len(protein_atoms)}'")
|
|
logger.info(f"Amount of water atom: '{len(water_atoms)}'")
|
|
|
|
close_water_atoms: list[Atom] = self._get_close_water_atoms(
|
|
protein_atoms=protein_atoms, water_atoms=water_atoms
|
|
)
|
|
far_water_atoms: list[Atom] = self._get_far_water_atoms(
|
|
water_atoms=water_atoms, close_water_atoms=close_water_atoms
|
|
)
|
|
|
|
logger.info(
|
|
f"There are '{len(water_atoms)}' total water atoms, from which '{len(close_water_atoms)}' "
|
|
f"are not further than '{self.maximum_distance_from_protein}' angstrom from protein atoms "
|
|
f"and '{len(far_water_atoms)}' that are further"
|
|
)
|
|
return WaterAtomsCategorizedByDistance(
|
|
protein_atoms=protein_atoms,
|
|
close_water_atoms=close_water_atoms,
|
|
far_water_atoms=far_water_atoms,
|
|
)
|
|
|
|
def _get_close_water_atoms(
|
|
self, protein_atoms: list[Atom], water_atoms: list[Atom]
|
|
) -> list[Atom]:
|
|
close_water_atoms: list[Atom] = []
|
|
for water_atoms in water_atoms:
|
|
for protein_atom in protein_atoms:
|
|
if (
|
|
self._calc_distance_square(
|
|
water_atoms.position, protein_atom.position
|
|
)
|
|
<= self.maximum_distance_from_protein**2
|
|
):
|
|
close_water_atoms.append(water_atoms)
|
|
break
|
|
return close_water_atoms
|
|
|
|
@staticmethod
|
|
def _get_far_water_atoms(
|
|
water_atoms: list[Atom], close_water_atoms: list[Atom]
|
|
) -> list[Atom]:
|
|
close_water_atom_ids = {
|
|
close_water_atom.id for close_water_atom in close_water_atoms
|
|
}
|
|
return [
|
|
water_atom
|
|
for water_atom in water_atoms
|
|
if water_atom.id not in close_water_atom_ids
|
|
]
|
|
|
|
@staticmethod
|
|
def _get_water_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]:
|
|
water_atoms: list[Atom] = []
|
|
for entry in pdb_entries:
|
|
if not (isinstance(entry, Atom) or isinstance(entry, Hetatm)):
|
|
continue
|
|
if entry.residue_name == "HOH":
|
|
water_atoms.append(entry)
|
|
return water_atoms
|
|
|
|
@staticmethod
|
|
def _get_protein_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]:
|
|
protein_atoms: list[Atom] = []
|
|
for entry in pdb_entries:
|
|
if not (isinstance(entry, Atom) or isinstance(entry, Hetatm)):
|
|
break
|
|
|
|
if entry.residue_name == "BNZ":
|
|
break
|
|
|
|
protein_atoms.append(entry)
|
|
return protein_atoms
|
|
|
|
def _read_protein_pdb(self) -> list[PDBEntry]:
|
|
with open(self.protein_pdb_file_path) as f:
|
|
data = f.read()
|
|
|
|
pdb_entries: list[PDBEntry] = []
|
|
for row in data.split("\n"):
|
|
if row.startswith(Atom.type):
|
|
pdb_entries.append(Atom.parse(row))
|
|
elif row.startswith(Hetatm.type):
|
|
pdb_entries.append(Hetatm.parse(row))
|
|
return pdb_entries
|
|
|
|
@staticmethod
|
|
def _calc_distance_square(p1: Point3D, p2: Point3D) -> float:
|
|
return (p2.x - p1.x) ** 2 + (p2.y - p1.y) ** 2 + (p2.z - p1.z) ** 2
|
|
|
|
|
|
def save_atom_ids_to_file(atoms: list[Atom], filename: str) -> None:
|
|
logger.info(f"Saving '{len(atoms)}' atoms ids to '{filename}'.")
|
|
atom_ids = {atom.id for atom in atoms}
|
|
with open(filename, "w") as f:
|
|
f.write(json.dumps(list(atom_ids)))
|
|
|
|
|
|
def save_atom_rows_to_file(atoms: list[Atom], filename: str) -> None:
|
|
data = ""
|
|
for water_atom in atoms:
|
|
data += str(water_atom) + "\n"
|
|
with open(filename, "w") as f:
|
|
f.write(data)
|
|
|
|
|
|
def main(pdb_filename: str) -> None:
|
|
logging.basicConfig(
|
|
level=logging.INFO, format="%(levelname)s | %(asctime)s | %(message)s"
|
|
)
|
|
|
|
result = AtomCategorizer(
|
|
protein_pdb_file_path=pdb_filename, maximum_distance_from_protein=5.0
|
|
).categorize_water_atoms()
|
|
|
|
save_atom_ids_to_file(
|
|
atoms=result.close_water_atoms, filename="close_water_atom_ids.json"
|
|
)
|
|
save_atom_rows_to_file(
|
|
atoms=result.close_water_atoms, filename="close_water_atoms.pdb"
|
|
)
|
|
save_atom_rows_to_file(
|
|
atoms=result.close_water_atoms + result.protein_atoms,
|
|
filename="close_water_atoms_with_protein_atoms.pdb",
|
|
)
|
|
|
|
save_atom_ids_to_file(
|
|
atoms=result.far_water_atoms, filename="far_water_atom_ids.json"
|
|
)
|
|
save_atom_rows_to_file(atoms=result.far_water_atoms, filename="far_water_atoms.pdb")
|
|
save_atom_rows_to_file(
|
|
atoms=result.far_water_atoms + result.protein_atoms,
|
|
filename="far_water_atoms_with_protein_atoms.pdb",
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if len(sys.argv) <= 1:
|
|
pdb_filename = "prot_ligand.pdb"
|
|
else:
|
|
pdb_filename = sys.argv[1]
|
|
|
|
main(pdb_filename=pdb_filename)
|