Source code for ccdc.protein

#
# This code is Copyright (C) 2015 The Cambridge Crystallographic Data Centre
# (CCDC) of 12 Union Road, Cambridge CB2 1EZ, UK and a proprietary work of CCDC.
# This code may not be used, reproduced, translated, modified, disassembled or
# copied, except in accordance with a valid licence agreement with CCDC and may
# not be disclosed or redistributed in any form, either in whole or in part, to
# any third party. All copies of this code made in accordance with a valid
# licence agreement as referred to above must contain this copyright notice.
#
# No representations, warranties, or liabilities are expressed or implied in the
# supply of this code by CCDC, its servants or agents, except where such
# exclusion or limitation is prohibited, void or unenforceable under governing
# law.
#
'''
The main class of the :mod:`ccdc.protein` module is :class:`ccdc.protein.Protein`.

A :class:`ccdc.protein.Protein` contains attributes and functions that relate
to protein structures.
'''

###########################################################################

import os
from functools import total_ordering

from ccdc.io import EntryReader, _CSDDatabaseLocator
from ccdc.molecule import Molecule, Atom
from ccdc.utilities import nested_class, bidirectional_dict

from ccdc.utilities import _private_importer
with _private_importer() as pi:
    pi.import_ccdc_module('FileFormatsLib')
    pi.import_ccdc_module('ChemistryLib')
    pi.import_ccdc_module('ChemicalAnalysisLib')
    pi.import_ccdc_module('ProteinLib')
    pi.import_ccdc_module('AnnotationsLib')

###########################################################################

[docs]class Protein(Molecule):
[docs] @nested_class('Protein') class Chain(object): '''A chain of a protein.''' def __init__(self, index, _protein_structure=None): self._index = index self._protein_structure = _protein_structure def __len__(self): '''Length of the chain.''' return len(self.residues) def __str__(self): return 'Chain(%s)' % self.identifier def __getitem__(self, identifier): '''Get the residue with the given identifier.''' identifier = '%s:%s' % (self.identifier, identifier) possibles = [r for r in self.residues if r.identifier == identifier] if len(possibles) == 1: return possibles[0] elif len(possibles) > 1: raise IndexError('More than one residue with the identifier %s' % identifier) else: raise IndexError('No residue with the identifier %s' % identifier) @property def residues(self): '''The residues of a chain.''' chain = self._protein_structure.chain(self.identifier) return tuple( Protein.Residue(i, _residue=chain.residue(i)) for i in range(chain.n_sub_parts()) ) @property def sequence(self): '''The sequence of amino acid one letter codes in this chain.''' return ''.join(r.one_letter_code for r in self.residues) @property def index(self): '''The index of the chain in the protein.''' return self._index @property def identifier(self): '''The identifier of the chain.''' return self._protein_structure.chain(self._index).chain_id() @property def author_identifier(self): '''The author provided identifier of the chain.''' return self._protein_structure.chain(self._index).author_chain_id()
[docs] @nested_class('Protein') @total_ordering class Residue(object): '''A single amino acid residue of a protein.''' def __init__(self, i, _residue): self._residue = _residue self._index = i def __str__(self): return 'Residue(%s)' % self.identifier __repr__ = __str__ def __eq__(self, other): return self.chain_identifier == other.chain_identifier and \ self._residue.sequence_number() == other._residue.sequence_number() def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other): # The < operator should order first by chain ID, and second by sequence number if self.chain_identifier == other.chain_identifier: return self._residue.sequence_number() < other._residue.sequence_number() return self.chain_identifier < other.chain_identifier def __hash__(self): return hash(str(self)) @property def identifier(self): '''The identifier of this residue.''' return '%s:%s' % (self._residue.chain_id(), self._residue.full_name()) @property def chain_identifier(self): '''The identifier of the chain of which this residue is a part.''' return self._residue.chain_id() @property def author_identifier(self): '''The author provided identifier of this residue.''' return '%s:%s' % (self._residue.author_chain_id(), self._residue.author_full_name()) @property def chain_author_identifier(self): '''The author provided identifier of the chain of which this residue is a part.''' return self._residue.author_chain_id() @property def three_letter_code(self): '''The three letter code of the amino acid.''' return self._residue.amino_acid().three_letter_code(self._residue.amino_acid().type()) @property def one_letter_code(self): '''The one letter code of the amino acid.''' return self._residue.amino_acid().one_letter_code() @property def backbone_atoms(self): '''The backbone atoms of the amino acid.''' return [Atom(_atom=a) for a in self._residue.backbone_atoms()] @property def sidechain_atoms(self): '''The sidechain atoms of this amino acid.''' return [Atom(_atom=a) for a in self._residue.sidechain_atoms()] @property def atoms(self): '''The atoms of the residue.''' return self.backbone_atoms + self.sidechain_atoms @property def c_alpha(self): '''The C alpha atom of the residue.''' return Atom(_atom=self._residue.c_alpha()) @property def c_beta(self): '''The C beta atom, or ``None`` if there is no C beta atom.''' a = self._residue.c_beta() if a: return Atom(_atom=a) @property def carbonyl_oxygen(self): '''The carbonyl oxygen atom.''' return Atom(_atom=self._residue.carbonyl_oxygen()) @property def n_terminus(self): '''The N terminus atom.''' return Atom(_atom=self._residue.n_terminus()) @property def c_terminus(self): '''The C terminus atom.''' return Atom(_atom=self._residue.c_terminus()) @property def cysteine_sulphur(self): '''The sulphur of a cysteine residue, or ``None`` if not a cysteine.''' if self.three_letter_code == 'CYS': return [a for a in self.sidechain_atoms if a.atomic_symbol == 'S'][0] @property def is_hydrophobic(self): '''Whether the residue is hydrophobic.''' return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.HYDROPHOBIC) @property def is_hydrophilic(self): '''Whether the residue is hydrophilic.''' return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.HYDROPHILIC) @property def is_acidic(self): '''Whether the residue is acidic.''' return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.ACIDIC) @property def is_basic(self): '''Whether the residue is basic.''' return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.BASIC) @property def index(self): '''The index of this residue in its chain.''' return self._index
[docs] @nested_class('Protein') class NucleicAcid(object): ''' A nucleic acid of a protein.''' def __init__(self, index, _protein_structure=None): self._index = index self._protein_structure = _protein_structure def __len__(self): '''Length of the nucleic acid.''' return len(self.nucleotides) def __str__(self): return 'NucleicAcid(%s)' % self.identifier def __getitem__(self, identifier): '''Get the nucleotide with the given identifier''' identifier = '%s:%s' % (self.identifier, identifier) possibles = [r for r in self.nucleotides if r.identifier == identifier] if len(possibles) == 1: return possibles[0] elif len(possibles) > 1: raise IndexError('More than one nucleotide with the identifier %s' % identifier) else: raise IndexError('No nucleotide with the identifier %s' % identifier) @property def nucleotides(self): '''The nucleotides of the nucleic acid''' nucleic_acid = self._protein_structure.nucleic_acid(self.identifier) return tuple( Protein.Nucleotide(i, _nucleotide=nucleic_acid.nucleotide(i)) for i in range(nucleic_acid.n_sub_parts()) ) @property def sequence(self): '''The sequence of nucleotide one letter codes in this nucleic acid.''' return ''.join(r.one_letter_code for r in self.nucleotides) @property def index(self): '''The index of the nucleic acid in the protein.''' return self._index @property def identifier(self): '''The identifier of the nucleic acid.''' return self._protein_structure.nucleic_acid(self._index).chain_id() @property def author_identifier(self): '''The author provided identifier of the nucleic acid.''' return self._protein_structure.nucleic_acid(self._index).author_chain_id()
[docs] @nested_class('Protein') @total_ordering class Nucleotide(object): '''A single nucleotide of a nucleic acid.''' def __init__(self, index, _nucleotide): self._index = index self._nucleotide = _nucleotide def __str__(self): return 'Nucleotide(%s)' % self.identifier __repr__ = __str__ def __eq__(self, other): return self.nucleic_acid_identifier == other.nucleic_acid_identifier and \ self._nucleotide.sequence_number() == other._nucleotide.sequence_number() def __ne__(self, other): return not self.__eq__(other) def __lt__(self, other): # The < operator should order first by chain ID, and second by sequence number if self.nucleic_acid_identifier == other.nucleic_acid_identifier: return self._nucleotide.sequence_number() < other._nucleotide.sequence_number() return self.nucleic_acid_identifier < other.nucleic_acid_identifier def __hash__(self): return hash(str(self)) @property def index(self): '''The index of this nucleotide in its nucleic acid.''' return self._index @property def identifier(self): '''The identifier of this nucleotide.''' return '%s:%s' % (self._nucleotide.chain_id(), self._nucleotide.full_name()) @property def nucleic_acid_identifier(self): '''The identifier of the nucleic acid of which this nucleotide is a part.''' return self._nucleotide.chain_id() @property def author_identifier(self): '''The author provided identifier of this nucleotide.''' return '%s:%s' % (self._nucleotide.author_chain_id(), self._nucleotide.author_full_name()) @property def nucleic_acid_author_identifier(self): '''The author provided identifier of the nucleic acid of which this nucleotide is a part.''' return self._nucleotide.author_chain_id() @property def code(self): '''The PDB nucleotide code.''' return self._nucleotide.code() @property def one_letter_code(self): '''The nucleotide one letter code.''' return self._nucleotide.one_letter_code() @property def atoms(self): '''The atoms of the nucleotide.''' return [Atom(_atom=self._nucleotide.atom(i)) for i in range(self._nucleotide.natoms())]
def __init__(self, identifier, _molecule=None, _protein_structure=None, _cell=None): if _molecule is not None: Molecule.__init__(self, identifier=identifier, _molecule=_molecule) if _protein_structure is None: self._cell = _cell if self._cell is None: self._cell = ChemistryLib.Cell() self._crystal = self._protein_structure = FileFormatsLib.create_protein_structure( self._cell, _molecule ) else: self._crystal = self._protein_structure = _protein_structure self._cell = _protein_structure.cell() for i in range(self._protein_structure.nwaters()): w = self._protein_structure.water(i) for j in range(w.natoms()): a = w.atom(j) if a.element().atomic_symbol() == 'O': a.set_label(a.annotations().obtain_ProteinSubstructureData().full_name()) @staticmethod def _make_molecule(protein_part, add_hs=False): editable_mol = FileFormatsLib.create_editable_molecule(protein_part) mol = Molecule('%s:%s' % (protein_part.chain_id(), protein_part.full_name()), editable_mol) if all(a.atomic_number > 0 for a in mol.atoms): mol.assign_bond_types('Unknown') if add_hs: mol.add_hydrogens() if isinstance(protein_part, FileFormatsLib.Ligand): mol.is_covalent = protein_part.covalent_link() mol._protein_part = protein_part mol.author_identifier = f'{protein_part.author_chain_id()}:{protein_part.author_full_name()}' return mol
[docs] @staticmethod def from_entry(entry): '''Constructs a protein from a given :class:`ccdc.entry.Entry`. :param entry: Entry from which to construct the protein. ''' p = Protein(entry.identifier, _molecule=entry.molecule._molecule, _cell=entry.crystal._crystal.cell()) p.attributes = entry.attributes if hasattr(entry, 'atom_sets'): p.atom_sets = entry.atom_sets return p
[docs] @staticmethod def from_file(file_name): ''' Reads a protein from a file, and constructs the protein. ''' fname, ext = os.path.splitext(file_name.lower()) is_pdb_file = ext in ['.pdb', '.ent'] is_mol2_file = ext == '.mol2' if not is_pdb_file: e = EntryReader(file_name)[0] prot = Protein.from_entry(e) else: # If reading a PDB file, load it using the ProteinPDBFile class f = FileFormatsLib.ProteinPdbFile(file_name) _protein = f.protein() editable = _protein.editable_molecule() identifier = f.identifier().str() if not identifier: identifier, _ = os.path.splitext(os.path.basename(fname)) prot = Protein(identifier, _molecule=editable, _protein_structure=_protein) if is_mol2_file: # Reconstitute Residue ids for aminoacid residues, waters, ligands, cofactors and metals for residue in prot.residues: for a in residue.atoms: a.residue_label = residue.three_letter_code for water in prot.waters: for a in water.atoms: a.residue_label = 'HOH' for ligand in prot.ligands: for a in ligand.atoms: a.residue_label = ligand.identifier[:3] for cofactor in prot.cofactors: for a in cofactor.atoms: a.residue_label = cofactor.identifier[:3] for nucleic_acid in prot.nucleic_acids: for nucleotide in nucleic_acid.nucleotides: for a in nucleotide.atoms: a.residue_label = nucleotide.code for metal in prot.metals: metal.residue_label = metal.label return prot
[docs] @staticmethod def known_cofactor_codes(): '''Provide access to a list of known cofactors codes in the underlying library.''' return FileFormatsLib.Cofactor.known_cofactor_codes()
[docs] def copy(self): '''Copies the protein.''' return Protein(self.identifier, _molecule=Molecule.copy(self)._molecule, _cell=self._cell)
@property def chains(self): '''A tuple of :class:`ccdc.protein.Protein.Chain`.''' return tuple( Protein.Chain(i, self._protein_structure) for i in range(self._protein_structure.nchains()) )
[docs] def remove_chain(self, chain_id): '''Remove the chain with the given identifier.''' if self._protein_structure.has_chain(chain_id): self._protein_structure.remove_chain(chain_id) else: raise IndexError('Protein does not have a chain %s' % chain_id)
@property def residues(self): '''The amino acid residues of the protein.''' return tuple( r for c in self.chains for r in c.residues )
[docs] def remove_residue(self, residue_id): '''Remove the specified residue.''' self._protein_structure.remove_residue(self[residue_id]._residue, False)
@property def nucleic_acids(self): '''A tuple of :class:`ccdc.protein.Protein.NucleicAcid`.''' return tuple( Protein.NucleicAcid(i, self._protein_structure) for i in range(self._protein_structure.n_nucleic_acids()) )
[docs] def remove_nucleic_acid(self, nucleic_acid_chain_id): '''Remove the chain with the given identifier.''' if self._protein_structure.has_nucleic_acid(nucleic_acid_chain_id): self._protein_structure.remove_nucleic_acid(self._protein_structure.nucleic_acid(nucleic_acid_chain_id)) else: raise IndexError('Protein does not have a nucleic acid %s' % nucleic_acid_chain_id)
@property def nucleotides(self): '''The nucleotides of the protein.''' return tuple( n for nc in self.nucleic_acids for n in nc.nucleotides )
[docs] def remove_nucleotide(self, nucleotide_id): '''Remove the specified nucleotide.''' self._protein_structure.remove_nucleotide(self[nucleotide_id]._nucleotide, False)
@property def ligands(self): '''The tuple of ligands in the protein. The identifier of the molecule is of the form chain_id:residue_name. Note that hydrogen atoms are added automatically to the returned molecules however these are not added to the parent protein. ''' return tuple( Protein._make_molecule(self._protein_structure.ligand(i), add_hs=True) for i in range(self._protein_structure.nligands()) ) @property def cofactors(self): '''The tuple of cofactors in the protein. The identifier of the molecule is of the form chain_id:residue_name. Note that hydrogen atoms are added automatically to the returned molecules however these are not added to the parent protein. ''' return tuple( Protein._make_molecule(self._protein_structure.cofactor(i), add_hs=True) for i in range(self._protein_structure.ncofactors()) ) def _prepare_molecule_for_add(self, molecule): '''Add annotations to a molecule in preparation for being added to the protein''' # Cope with Chain IDs if ':' in molecule.identifier: bits = molecule.identifier.split(':') chain_id = bits[0] identifier = bits[1] else: chain_id = '' identifier = molecule.identifier # Cope with GOLD identifiers if '|' in identifier: bits = identifier.split('|') chain_id = '' identifier = bits[1] max_res = max(a._atom.annotations().obtain_ProteinSubstructureData().residue_sequence_number() for a in self.atoms) for a in molecule.atoms: at = a._atom psd = at.annotations().obtain_ProteinSubstructureData() psd.set_full_name(identifier) psd.set_residue_name(identifier) psd.set_chain_id(chain_id) psd.set_residue_sequence_number(max_res+1)
[docs] def add_ligand(self, molecule): '''Add a molecule to the protein as a ligand.''' self._prepare_molecule_for_add(molecule) l = self._protein_structure.add_ligand(molecule._molecule, False)
[docs] def remove_ligand(self, ligand_id): '''Remove the specified ligand. :param ligand_id: str, of the form chain_id:ligand_id. ''' ligs = [self._protein_structure.ligand(i) for i in range(self._protein_structure.nligands())] chain_id, name = ligand_id.split(':') for l in ligs: if l.chain_id() == chain_id and l.full_name() == name: self._protein_structure.remove_ligand(l, False) break else: raise TypeError('Protein has no ligand %s' % ligand_id)
[docs] def add_cofactor(self, molecule): '''Add a molecule to the protein as a cofactor.''' self._prepare_molecule_for_add(molecule) l = self._protein_structure.add_cofactor(molecule._molecule, False)
[docs] def remove_cofactor(self, cofactor_id): '''Remove the specified cofactor. :param cofactor_id: str, of the form chain_id:cofactor_id. ''' ligs = [self._protein_structure.cofactor(i) for i in range(self._protein_structure.ncofactors())] chain_id, name = cofactor_id.split(':') for l in ligs: if l.chain_id() == chain_id and l.full_name() == name: self._protein_structure.remove_cofactor(l, False) break else: raise TypeError('Protein has no cofactor %s' % cofactor_id)
def update_protein(self): self._protein_structure.notify_molecule_changed(True) @property def sequence(self): '''The one-letter code sequence.''' return ' '.join(c.sequence for c in self.chains) @property def waters(self): '''The waters of the protein. :return: a tuple of :class:`ccdc.molecule.Molecule`, representing the oxygens of the water. ''' return tuple( Protein._make_molecule(self._protein_structure.water(i)) for i in range(self._protein_structure.nwaters()) )
[docs] def remove_water(self, water_mols): '''Remove the water (or waters). If water_mols is a list (or tuple) of water objects remove all waters in said list or tuple ''' if hasattr(water_mols,"identifier"): water_mols = [water_mols] water_mols_ids = set(w.identifier for w in water_mols) removed_water_ids = [] indexes = [] for i, w in enumerate(self.waters): if w.identifier in water_mols_ids: removed_water_ids.append(w.identifier) indexes.append(i) if len(removed_water_ids) == len(water_mols_ids): break else: raise TypeError('Protein has no water(s) %s' % ",".join(list(water_mols_ids -set(removed_water_ids)))) # These are removed in reverse to avoid the underlying protein structure # invalidating a later water that needs removing by removing an earlier water. for i in sorted(indexes, reverse=True): self._protein_structure.remove_water(self._protein_structure.water(i), False)
[docs] def remove_all_waters(self): '''Removes all waters from the protein.''' for i in range(self._protein_structure.nwaters()-1, -1, -1): self._protein_structure.remove_water(self._protein_structure.water(i), False)
@property def metals(self): '''The metal atoms of the protein.''' metals = [] for i in range(self._protein_structure.nmetals()): m = self._protein_structure.metal(i) for inx in m.atom_indices(): atom = Atom(_atom=self._molecule.atom(inx)) atom.identifier = f'{m.chain_id()}:{m.full_name()}' atom.author_identifier = f'{m.author_chain_id()}:{m.author_full_name()}' metals.append(atom) return tuple(metals)
[docs] def remove_metal(self, atom): '''Remove the given metal atom.''' for i, m in enumerate(self.metals): if m.index == atom.index: self._protein_structure.remove_metal(self._protein_structure.metal(i), False) break else: raise TypeError('Protein has no metal %s (index %s)' % (atom, str(atom.index)))
[docs] def remove_all_metals(self): '''Removes all metals from the protein.''' metals = self.metals self.remove_atoms(metals)
[docs] def remove_metal_bonds(self, bonds=None): '''Removes metal bonds. :param bonds: iterable of :class:`ccdc.molecule.Bond` instances. If ``None`` all metal bonds will be removed. ''' if bonds is None: bonds = [b for b in self.bonds if any(a.is_metal for a in b.atoms)] self.remove_bonds(bonds)
def __getitem__(self, identifier): '''Return the residue, nucleotide, chain or nucleic acid with the given identifier. :param identifier: may be of the form chain_id:residue_id, chain_id:nucleotide_id, chain_id or nucleic_acid_id :return: :class:`ccdc.protein.Protein` or :class:`ccdc.protein.Protein.Residue` or :class:`ccdc.protein.Protein.NucleicAcid` or :class:`ccdc.protein.Protein.Nucleotide ''' if ':' in identifier: cid, rid = identifier.split(':') else: cid, rid = identifier, None if self._protein_structure.has_chain(cid): c = [ch for ch in self.chains if ch.identifier == cid][0] if rid is not None: return c[rid] else: return c elif self._protein_structure.has_nucleic_acid(cid): nc = [nc for nc in self.nucleic_acids if nc.identifier == cid][0] if rid is not None: return nc[rid] else: return nc else: raise IndexError('Protein has no chain or nucleic acid %s (%s)' % (cid, identifier)) @property def cavity_atoms(self): '''The atoms making up the binding site, if this was read from a gold protein.''' if not hasattr(self, '_cavity_atoms'): self._cavity_atoms = None if hasattr(self, 'atom_sets'): if 'CAVITY_ATOMS' in self.atom_sets: self._cavity_atoms = tuple( self.atoms[i] for i in self.atom_sets['CAVITY_ATOMS'] ) return self._cavity_atoms @property def cavity_residues(self): '''The residues making up the cavity.''' if not hasattr(self, '_cavity_residues'): self._cavity_residues = None if not hasattr(self, '_atom_residue_map'): self._atom_residue_map = { a : r for r in self.residues for a in r.atoms } ats = self.cavity_atoms if ats is not None: self._cavity_residues = set( self._atom_residue_map[a] for a in ats if a in self._atom_residue_map ) self._cavity_residues = list(self._cavity_residues) self._cavity_residues.sort() return self._cavity_residues
[docs] def remove_hydrogens(self): '''Remove all hydrogens from the protein''' structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) structure_editor.remove_all_hydrogens()
[docs] def add_hydrogens(self, mode='All',rules_file = None): ''' Add hydrogens to the protein structure This method protonates the protein structure by performing the following operations: * Remove metal bonds * Assign ligand and cofactor bond types and standardise aromatic and delocalised bonds to CSD conventions * Set atom charges to zero * Set bond types for ARG, GLU, ASP appropriately * Apply protonation rules to ligands and cofactors * Add hydrogens to protein, ligands, cofactors, nucleic acids and waters where necessary * Set any remaining unknown bond type to single :param mode: 'all' to generate all hydrogens (remove any existing hydrogens first) or 'missing' to generate hydrogens deemed to be missing. :param rules_file: File of rules that express special cases - if None, a default version will be used :raises FileNotFoundError: if the rules_file passed in doesnt exist :raises ValueError: if mode is not either 'all' or 'missing' ''' if mode.lower() not in ['all', 'missing']: raise ValueError( 'add_hydrogens: %s should be all or missing' % mode ) if rules_file is None: rules_dir = _CSDDatabaseLocator.get_optimisation_parameter_file_location() rules_file = os.path.join(rules_dir, 'protonation_rules.txt') elif not os.path.exists(rules_file): raise FileNotFoundError("The rules file {} does not exist".format(rules_file)) structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) if mode.lower() == 'all': self.remove_hydrogens() rules = ChemicalAnalysisLib.ProtonationRules(rules_file) structure_editor.add_hydrogens(rules)
[docs] def normalise_labels(self, mode='pdb'): ''' Normalise labels of atoms in the protein structure :param mode: 'pdb' (the default) will try to normalise the labels to PDB compliance if possible (i.e. no longer than 4 characters.) If labels are already compliant they will not be changed 'force' will call the normalisation regardless of whether they are already compliant 'molecule' will normalise using :attr:`ccdc.molecule.Molecule.normalise_labels` ''' if mode.lower() == 'molecule': Molecule.normalise_labels(self) elif mode.lower() == 'force': structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) structure_editor.normalise_atom_labels() else: structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) if not structure_editor.are_labels_pdb_compliant(): structure_editor.normalise_atom_labels()
[docs] def are_labels_pdb_compliant(self): ''' Are labels writeable in PDB format? ''' structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) return structure_editor.are_labels_pdb_compliant()
[docs] def sort_atoms_by_residue(self): ''' Sorts atoms by residue After editing, sometimes the underlying atom list in a protein is not sorted by residue so atoms in a single residue are not in a single block of atoms. In particular, adding hydrogens will add new hydrogen atoms to the end of the atom list. Calling this method will re-order the atoms in the protein so that each residue is in a single atom block in the atom list. This is useful in particular if you are writing PDB files where having residues as single blocks of ATOM lines is desirable. Note that calling this method will mean that any pre-existing indexes into the atom list will probably be invalidated. ''' structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) structure_editor.sort_atoms_by_residue()
[docs] def assign_unique_chain_identifiers(self, mode='all'): ''' Assigns unique chain identifiers to disconnected chain fragments and/or non-chain components Occasionally, for example when merging two proteins, you can have a protein that has multiple chains with the same chain identifier. This can lead to issues down the line: for example file writers may end up mixing the order of residues as the code will 'see' the residues as belonging to the same chain and sort them by sequence number Calling this method will reassign chain identifiers to all components. Each disconnected chain will get a unique chain ID. Note that there is no guarentee that old chain IDs will be the same. If, say previously the first chain was B and the second A the chain identifiers may reverse. Ligands, Co-factors, Metals and Waters will also get new chain ids: these will be derived based on which chain they are overall closest to (i.e. a ligand is given the chain id of the chain that has the most atoms within 5.0 Angstroms.) Note that calling this method will mean that any pre-existing indexes into the atom list will probably be invalidated, as the consequence of changing ids can mean a re-ordering of atoms and residues. :param mode: 'all' (the default) will assign chains new IDs to chain and then associate non-chain components (e.g.co-factors, waters, ligands and metals) to have the chain id of the chain they are nearest to. 'nonchain' will just re-assign chain ids to co-factors, waters, ligands and metals based on chain proximity. ''' structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure) if mode == 'all': structure_editor.assign_chain_ids_to_components() else: structure_editor.assign_chain_ids_to_non_chain_components()
[docs] def detect_ligand_bonds(self, covalent_links = 'include'): ''' Removes all bonds between ligand or cofactor atoms, and redetects them based on distance between atoms. This can be useful if the bonds specified by the CONECT records in the PDB are unspecified or undesirable. :param mode: covalent_links 'include' to include covalent links between the protein and the ligand (the default) and 'exclude' to remove them ''' covlinks = True if covalent_links == 'exclude': covlinks = False self._protein_structure.assign_ligand_connectivity(covlinks) self._protein_structure.build_protein_structure()
[docs] class BindingSite(object): '''A binding site in the protein.''' def __init__(self, protein, whole_residues=True): self.protein = protein self.whole_residues = whole_residues @property def atoms(self): '''The atoms of the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) if self.whole_residues: protein_parts = bsd.expand_to_whole_parts(ats) return tuple( self.protein.atoms[i] for x in protein_parts for i in x.atom_indices() ) else: return tuple(Atom(_atom=a) for a in ats) @property def residues(self): '''The residues of the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) protein_parts = bsd.expand_to_whole_parts(ats) return tuple( self.protein['%s:%s' % (x.chain_id(), x.full_name())] for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.AMINOACID ) @property def nucleotides(self): '''The nucleotides of the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) self._pp = protein_parts = bsd.expand_to_whole_parts(ats) return tuple( self.protein['%s:%s' % (x.chain_id(), x.full_name())] for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.NUCLEOTIDE ) @property def waters(self): '''The waters of the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) protein_parts = bsd.expand_to_whole_parts(ats) return tuple( Protein._make_molecule(x) for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.WATER ) @property def ligands(self): '''The ligands of the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) protein_parts = bsd.expand_to_whole_parts(ats) return tuple( Protein._make_molecule(x) for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.LIGAND ) @property def cofactors(self): '''The cofactors of the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) protein_parts = bsd.expand_to_whole_parts(ats) return tuple( Protein._make_molecule(x) for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.COFACTOR ) @property def metals(self): '''The metals in the cavity.''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) protein_parts = bsd.expand_to_whole_parts(ats) return tuple( self.protein.atoms[i] for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.METAL for i in x.atom_indices() ) @property def _molecule(self): '''Molecule comprising only the cavity atoms of the protein''' bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure) ats = self._detect_binding_site(bsd) if self.whole_residues: ats = bsd.expand_to_whole_parts_atoms(ats) return bsd.binding_site_subset(ats) @property def identifier(self): return self.protein.identifier @property def formula(self): '''Return the chemical formula of the molecules in the binding site.''' return str(ChemistryLib.MoleculeChemicalFormula(self._molecule))
[docs] class BindingSiteFromPoint(BindingSite): '''A cavity defined from a point.''' def __init__(self, protein, origin=(0,0,0), distance=12.): Protein.BindingSite.__init__(self, protein) self.origin = origin self.distance = distance def _detect_binding_site(self, bsd): return bsd.detect_binding_site(self.origin, self.distance)
[docs] class BindingSiteFromAtom(BindingSiteFromPoint): '''A binding site defined from a protein atom.''' def __init__(self, protein, atom, distance): Protein.BindingSiteFromPoint.__init__(self, protein, atom.coordinates, distance) self.atom = atom
[docs] class BindingSiteFromMolecule(BindingSite): '''A binding site defined from an arbitrary molecule.''' def __init__(self, protein, molecule, distance, whole_residues=True): Protein.BindingSite.__init__(self, protein, whole_residues=whole_residues) self.molecule = molecule self.distance = distance def _detect_binding_site(self, bsd): return bsd.detect_binding_site(self.molecule._molecule, self.distance)
[docs] class BindingSiteFromResidue(BindingSiteFromMolecule): '''A binding site defined from protein residue.''' def __init__(self, protein, residue, distance): mol = Protein._make_molecule(residue._residue) Protein.BindingSiteFromMolecule.__init__(self, protein, mol, distance) self.residue = residue
[docs] class BindingSiteFromListOfAtoms(BindingSite): '''A binding site defined from a list of protein atoms.''' def __init__(self, protein, atoms): Protein.BindingSite.__init__(self, protein) self.list_of_atoms = atoms def _detect_binding_site(self, bsd): bsd.set_binding_site_atoms(tuple(a._atom for a in self.list_of_atoms)) return bsd.binding_site_atoms()
[docs] class BindingSiteFromListOfResidues(BindingSiteFromListOfAtoms): 'A binding site from a list of residues.''' def __init__(self, protein, list_of_residues): self.list_of_atoms = tuple(a for r in list_of_residues for a in r.atoms) self.list_of_residues = list_of_residues Protein.BindingSiteFromListOfAtoms.__init__(self, protein, self.list_of_atoms)
[docs] @nested_class('Protein') class ChainSuperposition(object): '''Class for superposition of protein chains using sequence alignment''' _superposition_mode = bidirectional_dict( CALPHA=ProteinLib.ChainTransformationCalculator.CALPHA, # Overlay on C-alpha atoms only BACKBONE=ProteinLib.ChainTransformationCalculator.BACKBONE, # Overlay on backbone atoms RIGID=ProteinLib.ChainTransformationCalculator.RIGID # Overlay on backbone and C-beta atoms )
[docs] @nested_class('Protein.ChainSuperposition') class Settings(object): '''Configuration options for the superposition of protein chains.''' def __init__(self): self.overlay_weighting_factor = 10. '''weighting factor to use in overlay''' self.overlay_minimum_cycles = 1 '''minimum number of cycles in overlay''' self.overlay_convergence_tolerance = 0.01 '''tolerance for convergence in overlay''' self.sequence_alignment_tool = None '''external sequence alignment program''' self.sequence_search_tool = None '''external sequence search program''' self.superposition_atoms = 'RIGID' '''protein chain atoms to use in overlay (RIGID, BACKBONE or CALPHA)'''
def __init__(self, settings=None): '''Set the chain superposition settings.''' ProteinLib.licence_check() if settings is None: settings = Protein.ChainSuperposition.Settings() self.settings = settings
[docs] def superpose(self, chain1, chain2, binding_site1=None): '''Superpose two protein chains or binding sites An implementation of the Smith-Waterman algorithm is used unless an external sequence alignment tool is specified in the settings. If a binding site is supplied for the first chain, only the atoms in the binding site will be overlaid. :param chain1: a :class:`ccdc.protein.Chain` instance :param chain2: a :class:`ccdc.protein.Chain` instance :param binding_site1: a :class:`ccdc.protein.BindingSite` instance for the first chain :returns: the root-mean square deviation of the overlay and the transformation matrix ''' if self.settings.sequence_alignment_tool is None: alignment_calculator = ProteinLib.SmithWatermanChainAlignmentCalculator() else: alignment_calculator = ProteinLib.SequenceBasedChainAlignmentCalculator( self.settings.sequence_alignment_tool, self.settings.sequence_search_tool) residues = alignment_calculator.matched_residues_for_chain_ids( chain1._protein_structure, chain1.identifier, chain2._protein_structure, chain2.identifier) if binding_site1 is not None: protein_atoms1 = list(a._atom for a in binding_site1.atoms) chain_transformation = ProteinLib.ChainTransformationCalculator(residues, protein_atoms1) else: chain_transformation = ProteinLib.ChainTransformationCalculator(residues) superposition_result = chain_transformation.calculate( self.settings.overlay_weighting_factor, self.settings.overlay_minimum_cycles, self.settings.overlay_convergence_tolerance, self._superposition_mode.prefix_lookup(self.settings.superposition_atoms)) ChemistryLib.MoleculeTransformation(chain2._protein_structure.editable_molecule(), superposition_result.transformation(), chain2._protein_structure.cell()).apply() return superposition_result.rms(), Molecule.Transformation( _transformation=superposition_result.transformation())
###########################################################################