atom categorizer update

This commit is contained in:
Artem Korolev 2024-05-22 15:24:13 +04:00 committed by Artem Korolev
parent ce28da76aa
commit e935617894
10 changed files with 63114 additions and 47 deletions

View File

@ -24,6 +24,8 @@ class Atom(PDBEntry):
position: Point3D position: Point3D
_row: str
@classmethod @classmethod
def parse(cls, row: str) -> Atom: def parse(cls, row: str) -> Atom:
return cls( return cls(
@ -32,8 +34,12 @@ class Atom(PDBEntry):
position=Point3D( position=Point3D(
x=float(row[30:38]), y=float(row[39:46]), z=float(row[46:54]) x=float(row[30:38]), y=float(row[39:46]), z=float(row[46:54])
), ),
_row=row,
) )
def __str__(self) -> str:
    """Return the row text stored on this atom when it was parsed."""
    original_row: str = self._row
    return original_row
class Hetatm(Atom): class Hetatm(Atom):
type = "HETATM" type = "HETATM"
@ -47,82 +53,84 @@ class Point3D:
@dataclass
class WaterAtomsCategorizedByDistance:
    """Outcome of splitting water atoms by their distance from the protein.

    Bundles the protein atoms used as the reference set together with the
    water atoms classified as close to, or far from, those protein atoms.
    """

    # Atoms that make up the protein.
    protein_atoms: list[Atom]
    # Water atoms that were classified as close to the protein.
    close_water_atoms: list[Atom]
    # Water atoms that were not classified as close.
    far_water_atoms: list[Atom]
class MoleculeCategorizer: class AtomCategorizer:
def __init__( def __init__(
self, protein_pdb_file_path: str, maximum_distance_from_protein: float self, protein_pdb_file_path: str, maximum_distance_from_protein: float
): ):
self.protein_pdb_file_path = protein_pdb_file_path self.protein_pdb_file_path = protein_pdb_file_path
self.maximum_distance_from_protein = maximum_distance_from_protein self.maximum_distance_from_protein = maximum_distance_from_protein
def categorize_water_atoms(self) -> WaterAtomsCategorizedByDistance:
    """Split the water atoms of the configured PDB file by distance.

    Reads the PDB entries, separates protein atoms from water atoms,
    classifies every water atom as close or far relative to the protein
    atoms, and logs the sizes of each group along the way.

    Returns:
        The protein atoms together with the close and far water atoms.
    """
    entries = self._read_protein_pdb()
    protein: list[Atom] = self._get_protein_atoms(pdb_entries=entries)
    waters: list[Atom] = self._get_water_atoms(pdb_entries=entries)

    logger.info(f"Total amount of pdb entries: '{len(entries)}'")
    logger.info(f"Amount of protein's atoms: '{len(protein)}'")
    logger.info(f"Amount of water atom: '{len(waters)}'")

    near: list[Atom] = self._get_close_water_atoms(
        protein_atoms=protein, water_atoms=waters
    )
    distant: list[Atom] = self._get_far_water_atoms(
        water_atoms=waters, close_water_atoms=near
    )

    summary = (
        f"There are '{len(waters)}' total water atoms, from which '{len(near)}' "
        f"are not further than '{self.maximum_distance_from_protein}' angstrom from protein atoms "
        f"and '{len(distant)}' that are further"
    )
    logger.info(summary)

    return WaterAtomsCategorizedByDistance(
        protein_atoms=protein,
        close_water_atoms=near,
        far_water_atoms=distant,
    )
def _get_close_water_atoms(
    self, protein_atoms: list[Atom], water_atoms: list[Atom]
) -> list[Atom]:
    """Return the water atoms that lie close to at least one protein atom.

    A water atom is kept when the value returned by
    ``_calc_distance_square`` for it and some protein atom does not exceed
    ``maximum_distance_from_protein`` squared.

    Args:
        protein_atoms: Atoms every water atom is compared against.
        water_atoms: Candidate water atoms.

    Returns:
        The matching water atoms, in their original order. Empty when
        either input list is empty.
    """
    # The threshold does not change between iterations, so square it once.
    max_distance_square = self.maximum_distance_from_protein**2
    return [
        water_atom
        for water_atom in water_atoms
        # any() stops at the first protein atom that is close enough.
        if any(
            self._calc_distance_square(water_atom.position, protein_atom.position)
            <= max_distance_square
            for protein_atom in protein_atoms
        )
    ]
@staticmethod @staticmethod
def _get_far_water_molecules( def _get_far_water_atoms(
water_molecules: list[Atom], close_water_molecules: list[Atom] water_atoms: list[Atom], close_water_atoms: list[Atom]
) -> list[Atom]: ) -> list[Atom]:
close_water_molecule_ids = { close_water_atom_ids = {
close_water_molecule.id for close_water_molecule in close_water_molecules close_water_atom.id for close_water_atom in close_water_atoms
} }
return [ return [
water_molecule water_atom
for water_molecule in water_molecules for water_atom in water_atoms
if water_molecule.id not in close_water_molecule_ids if water_atom.id not in close_water_atom_ids
] ]
@staticmethod
def _get_water_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]:
    """Return the atom entries whose residue name is ``"HOH"``.

    Args:
        pdb_entries: Parsed PDB entries of any kind.

    Returns:
        The ``Atom`` entries (including ``Hetatm``) with residue name
        ``"HOH"``, in their original order. Other entries are skipped.
    """
    # Hetatm subclasses Atom, so one isinstance check covers both types.
    return [
        entry
        for entry in pdb_entries
        if isinstance(entry, Atom) and entry.residue_name == "HOH"
    ]
@staticmethod @staticmethod
def _get_protein_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]: def _get_protein_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]:
@ -155,28 +163,47 @@ class MoleculeCategorizer:
def save_atom_ids_to_file(atoms: list[Atom], filename: str) -> None:
    """Write the unique ids of ``atoms`` to ``filename`` as a JSON array.

    Each id is written once. Ids appear in the order they are first seen,
    so the same input always produces the same file (a plain ``set`` would
    give an arbitrary order).

    Args:
        atoms: Atoms whose ``id`` values are saved.
        filename: Path of the JSON file to create or overwrite.
    """
    logger.info(f"Saving '{len(atoms)}' atoms ids to '{filename}'.")
    # dict.fromkeys de-duplicates like a set but keeps insertion order.
    atom_ids = list(dict.fromkeys(atom.id for atom in atoms))
    with open(filename, "w") as f:
        json.dump(atom_ids, f)
def save_atom_rows_to_file(atoms: list[Atom], filename: str) -> None:
    """Write the string form of every atom to ``filename``, one per line.

    Args:
        atoms: Atoms to write; ``str(atom)`` is used for each line.
        filename: Path of the file to create or overwrite. An empty
            ``atoms`` list produces an empty file.
    """
    # Render all lines before opening the file so a failing str() call
    # does not leave a truncated file behind.
    lines = [f"{atom!s}\n" for atom in atoms]
    with open(filename, "w") as f:
        f.writelines(lines)
def main(pdb_filename: str, maximum_distance_from_protein: float = 5.0) -> None:
    """Categorize the water atoms of a PDB file and save the results.

    For both the close and the far group this writes three files into the
    current working directory: the atom ids as JSON, the atom rows, and the
    atom rows followed by the protein atom rows.

    Args:
        pdb_filename: Path of the PDB file to categorize.
        maximum_distance_from_protein: Threshold passed to
            ``AtomCategorizer``; defaults to the previously hard-coded 5.0.
    """
    logging.basicConfig(
        level=logging.INFO, format="%(levelname)s | %(asctime)s | %(message)s"
    )

    result = AtomCategorizer(
        protein_pdb_file_path=pdb_filename,
        maximum_distance_from_protein=maximum_distance_from_protein,
    ).categorize_water_atoms()

    # Both groups are saved the same way; only the filename prefix differs.
    water_groups = (
        ("close", result.close_water_atoms),
        ("far", result.far_water_atoms),
    )
    for label, water_atoms in water_groups:
        save_atom_ids_to_file(
            atoms=water_atoms, filename=f"{label}_water_atom_ids.json"
        )
        save_atom_rows_to_file(
            atoms=water_atoms, filename=f"{label}_water_atoms.pdb"
        )
        save_atom_rows_to_file(
            atoms=water_atoms + result.protein_atoms,
            filename=f"{label}_water_atoms_with_protein_atoms.pdb",
        )

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,13 +1,13 @@
from categorize_water_molecules_by_distance import MoleculeCategorizer from categorize_water_molecules_by_distance import AtomCategorizer
if __name__ == "__main__": if __name__ == "__main__":
molecule_categorizer = MoleculeCategorizer( molecule_categorizer = AtomCategorizer(
protein_pdb_file_path="prot_ligand.pdb", maximum_distance_from_protein=5.0 protein_pdb_file_path="prot_ligand.pdb", maximum_distance_from_protein=5.0
) )
result = molecule_categorizer.categorize_water_molecules() result = molecule_categorizer.categorize_water_atoms()
assert {m.id for m in result.close_water_molecules} == { assert {m.id for m in result.close_water_atoms} == {
"2715", "2715",
"2716", "2716",
"2717", "2717",
@ -2148,7 +2148,7 @@ if __name__ == "__main__":
"25342", "25342",
"25343", "25343",
} }
assert {m.id for m in result.far_water_molecules} == { assert {m.id for m in result.far_water_atoms} == {
"8993", "8993",
"18608", "18608",
"13848", "13848",