atom categorizer update

This commit is contained in:
Artem Korolev 2024-05-22 15:24:13 +04:00
parent ce28da76aa
commit cae5a1bee8
10 changed files with 63114 additions and 47 deletions

View File

@ -24,6 +24,8 @@ class Atom(PDBEntry):
position: Point3D
_row: str
@classmethod
def parse(cls, row: str) -> Atom:
return cls(
@ -32,8 +34,12 @@ class Atom(PDBEntry):
position=Point3D(
x=float(row[30:38]), y=float(row[39:46]), z=float(row[46:54])
),
_row=row,
)
def __str__(self) -> str:
    """Return the raw row text stored in ``_row`` (set from ``row`` in ``parse``)."""
    return self._row
class Hetatm(Atom):
type = "HETATM"
@ -47,82 +53,84 @@ class Point3D:
@dataclass
class WaterMoleculesCategorizedByDistance:
close_water_molecules: list[Atom]
far_water_molecules: list[Atom]
class WaterAtomsCategorizedByDistance:
protein_atoms: list[Atom]
close_water_atoms: list[Atom]
far_water_atoms: list[Atom]
class MoleculeCategorizer:
class AtomCategorizer:
def __init__(
self, protein_pdb_file_path: str, maximum_distance_from_protein: float
):
self.protein_pdb_file_path = protein_pdb_file_path
self.maximum_distance_from_protein = maximum_distance_from_protein
def categorize_water_molecules(self) -> WaterMoleculesCategorizedByDistance:
def categorize_water_atoms(self) -> WaterAtomsCategorizedByDistance:
pdb_entries = self._read_protein_pdb()
protein_atoms: list[Atom] = self._get_protein_atoms(pdb_entries=pdb_entries)
water_molecules: list[Atom] = self._get_water_molecules(pdb_entries=pdb_entries)
water_atoms: list[Atom] = self._get_water_atoms(pdb_entries=pdb_entries)
logger.info(f"Total amount of pdb entries: '{len(pdb_entries)}'")
logger.info(f"Amount of protein's atoms: '{len(protein_atoms)}'")
logger.info(f"Amount of water molecules: '{len(water_molecules)}'")
logger.info(f"Amount of water atom: '{len(water_atoms)}'")
close_water_molecules: list[Atom] = self._get_close_water_molecules(
protein_atoms=protein_atoms, water_molecules=water_molecules
close_water_atoms: list[Atom] = self._get_close_water_atoms(
protein_atoms=protein_atoms, water_atoms=water_atoms
)
far_water_molecules: list[Atom] = self._get_far_water_molecules(
water_molecules=water_molecules, close_water_molecules=close_water_molecules
far_water_atoms: list[Atom] = self._get_far_water_atoms(
water_atoms=water_atoms, close_water_atoms=close_water_atoms
)
logger.info(
f"There are '{len(water_molecules)}' total water molecules, from which '{len(close_water_molecules)}' "
f"There are '{len(water_atoms)}' total water atoms, from which '{len(close_water_atoms)}' "
f"are not further than '{self.maximum_distance_from_protein}' angstrom from protein atoms "
f"and '{len(far_water_molecules)}' that are further"
f"and '{len(far_water_atoms)}' that are further"
)
return WaterMoleculesCategorizedByDistance(
close_water_molecules=close_water_molecules,
far_water_molecules=far_water_molecules,
return WaterAtomsCategorizedByDistance(
protein_atoms=protein_atoms,
close_water_atoms=close_water_atoms,
far_water_atoms=far_water_atoms,
)
def _get_close_water_molecules(
self, protein_atoms: list[Atom], water_molecules: list[Atom]
def _get_close_water_atoms(
self, protein_atoms: list[Atom], water_atoms: list[Atom]
) -> list[Atom]:
close_water_molecules: list[Atom] = []
for water_molecule in water_molecules:
close_water_atoms: list[Atom] = []
for water_atoms in water_atoms:
for protein_atom in protein_atoms:
if (
self._calc_distance_square(
water_molecule.position, protein_atom.position
water_atoms.position, protein_atom.position
)
<= self.maximum_distance_from_protein**2
):
close_water_molecules.append(water_molecule)
close_water_atoms.append(water_atoms)
break
return close_water_molecules
return close_water_atoms
@staticmethod
def _get_far_water_molecules(
water_molecules: list[Atom], close_water_molecules: list[Atom]
def _get_far_water_atoms(
water_atoms: list[Atom], close_water_atoms: list[Atom]
) -> list[Atom]:
close_water_molecule_ids = {
close_water_molecule.id for close_water_molecule in close_water_molecules
close_water_atom_ids = {
close_water_atom.id for close_water_atom in close_water_atoms
}
return [
water_molecule
for water_molecule in water_molecules
if water_molecule.id not in close_water_molecule_ids
water_atom
for water_atom in water_atoms
if water_atom.id not in close_water_atom_ids
]
@staticmethod
def _get_water_molecules(pdb_entries: list[PDBEntry]) -> list[Atom]:
water_molecules: list[Atom] = []
def _get_water_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]:
water_atoms: list[Atom] = []
for entry in pdb_entries:
if not (isinstance(entry, Atom) or isinstance(entry, Hetatm)):
continue
if entry.residue_name == "HOH":
water_molecules.append(entry)
return water_molecules
water_atoms.append(entry)
return water_atoms
@staticmethod
def _get_protein_atoms(pdb_entries: list[PDBEntry]) -> list[Atom]:
@ -155,28 +163,47 @@ class MoleculeCategorizer:
def save_atom_ids_to_file(atoms: list[Atom], filename: str) -> None:
    """Write the distinct ids of ``atoms`` to ``filename`` as a JSON array.

    Duplicate ids are collapsed. The ids are written in sorted order so that
    repeated runs over the same input produce byte-identical files; dumping
    a bare ``set`` would make the order depend on per-process string hashing.

    Args:
        atoms: Atoms whose ``id`` attributes are saved.
        filename: Path of the JSON file to create or overwrite.
    """
    logger.info(f"Saving '{len(atoms)}' atoms ids to '{filename}'.")
    unique_ids = {atom.id for atom in atoms}
    with open(filename, "w") as f:
        f.write(json.dumps(sorted(unique_ids)))
def save_atom_rows_to_file(atoms: list[Atom], filename: str) -> None:
    """Write ``str(atom)`` for every atom to ``filename``, one per line.

    Each row is followed by a newline; an empty ``atoms`` list yields an
    empty file.

    Args:
        atoms: Atoms to serialize, written in the given order.
        filename: Path of the text file to create or overwrite.
    """
    # Build the whole payload before opening the file so that a failure while
    # rendering an atom does not leave a truncated file behind. ``join`` also
    # avoids the quadratic cost of repeated string concatenation.
    data = "".join(f"{atom}\n" for atom in atoms)
    with open(filename, "w") as f:
        f.write(data)
def main(pdb_filename: str) -> None:
logging.basicConfig(
level=logging.INFO, format="%(levelname)s | %(asctime)s | %(message)s"
)
result = MoleculeCategorizer(
result = AtomCategorizer(
protein_pdb_file_path=pdb_filename, maximum_distance_from_protein=5.0
).categorize_water_molecules()
).categorize_water_atoms()
logger.info("Saving IDs of a close water molecules to 'close_water_molecules.json'")
save_atom_ids_to_file(
atoms=result.close_water_molecules, filename="close_water_molecules.json"
atoms=result.close_water_atoms, filename="close_water_atom_ids.json"
)
save_atom_rows_to_file(
atoms=result.close_water_atoms, filename="close_water_atoms.pdb"
)
save_atom_rows_to_file(
atoms=result.close_water_atoms + result.protein_atoms,
filename="close_water_atoms_with_protein_atoms.pdb",
)
logger.info("Saving IDs of a far water molecules to 'far_water_molecules.json'")
save_atom_ids_to_file(
atoms=result.far_water_molecules, filename="far_water_molecules.json"
atoms=result.far_water_atoms, filename="far_water_atom_ids.json"
)
save_atom_rows_to_file(atoms=result.far_water_atoms, filename="far_water_atoms.pdb")
save_atom_rows_to_file(
atoms=result.far_water_atoms + result.protein_atoms,
filename="far_water_atoms_with_protein_atoms.pdb",
)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -1,13 +1,13 @@
from categorize_water_molecules_by_distance import MoleculeCategorizer
from categorize_water_molecules_by_distance import AtomCategorizer
if __name__ == "__main__":
molecule_categorizer = MoleculeCategorizer(
molecule_categorizer = AtomCategorizer(
protein_pdb_file_path="prot_ligand.pdb", maximum_distance_from_protein=5.0
)
result = molecule_categorizer.categorize_water_molecules()
result = molecule_categorizer.categorize_water_atoms()
assert {m.id for m in result.close_water_molecules} == {
assert {m.id for m in result.close_water_atoms} == {
"2715",
"2716",
"2717",
@ -2148,7 +2148,7 @@ if __name__ == "__main__":
"25342",
"25343",
}
assert {m.id for m in result.far_water_molecules} == {
assert {m.id for m in result.far_water_atoms} == {
"8993",
"18608",
"13848",