#
# This code is Copyright (C) 2015 The Cambridge Crystallographic Data Centre
# (CCDC) of 12 Union Road, Cambridge CB2 1EZ, UK and a proprietary work of CCDC.
# This code may not be used, reproduced, translated, modified, disassembled or
# copied, except in accordance with a valid licence agreement with CCDC and may
# not be disclosed or redistributed in any form, either in whole or in part, to
# any third party. All copies of this code made in accordance with a valid
# licence agreement as referred to above must contain this copyright notice.
#
# No representations, warranties, or liabilities are expressed or implied in the
# supply of this code by CCDC, its servants or agents, except where such
# exclusion or limitation is prohibited, void or unenforceable under governing
# law.
#
'''
The main class of the :mod:`ccdc.protein` module is :class:`ccdc.protein.Protein`.
A :class:`ccdc.protein.Protein` contains attributes and functions that relate
to protein structures.
'''
###########################################################################
import os
from functools import total_ordering
from ccdc.io import EntryReader, _CSDDatabaseLocator
from ccdc.molecule import Molecule, Atom
from ccdc.utilities import nested_class, bidirectional_dict
from ccdc.utilities import _private_importer
with _private_importer() as pi:
pi.import_ccdc_module('FileFormatsLib')
pi.import_ccdc_module('ChemistryLib')
pi.import_ccdc_module('ChemicalAnalysisLib')
pi.import_ccdc_module('ProteinLib')
pi.import_ccdc_module('AnnotationsLib')
###########################################################################
[docs]class Protein(Molecule):
[docs] @nested_class('Protein')
class Chain(object):
'''A chain of a protein.'''
def __init__(self, index, _protein_structure=None):
self._index = index
self._protein_structure = _protein_structure
def __len__(self):
'''Length of the chain.'''
return len(self.residues)
def __str__(self):
return 'Chain(%s)' % self.identifier
def __getitem__(self, identifier):
'''Get the residue with the given identifier.'''
identifier = '%s:%s' % (self.identifier, identifier)
possibles = [r for r in self.residues if r.identifier == identifier]
if len(possibles) == 1:
return possibles[0]
elif len(possibles) > 1:
raise IndexError('More than one residue with the identifier %s' % identifier)
else:
raise IndexError('No residue with the identifier %s' % identifier)
@property
def residues(self):
'''The residues of a chain.'''
chain = self._protein_structure.chain(self.identifier)
return tuple(
Protein.Residue(i, _residue=chain.residue(i)) for i in range(chain.n_sub_parts())
)
@property
def sequence(self):
'''The sequence of amino acid one letter codes in this chain.'''
return ''.join(r.one_letter_code for r in self.residues)
@property
def index(self):
'''The index of the chain in the protein.'''
return self._index
@property
def identifier(self):
'''The identifier of the chain.'''
return self._protein_structure.chain(self._index).chain_id()
@property
def author_identifier(self):
'''The author provided identifier of the chain.'''
return self._protein_structure.chain(self._index).author_chain_id()
[docs] @nested_class('Protein')
@total_ordering
class Residue(object):
'''A single amino acid residue of a protein.'''
def __init__(self, i, _residue):
self._residue = _residue
self._index = i
def __str__(self):
return 'Residue(%s)' % self.identifier
__repr__ = __str__
def __eq__(self, other):
return self.chain_identifier == other.chain_identifier and \
self._residue.sequence_number() == other._residue.sequence_number()
def __ne__(self, other):
return not self.__eq__(other)
def __lt__(self, other):
# The < operator should order first by chain ID, and second by sequence number
if self.chain_identifier == other.chain_identifier:
return self._residue.sequence_number() < other._residue.sequence_number()
return self.chain_identifier < other.chain_identifier
def __hash__(self):
return hash(str(self))
@property
def identifier(self):
'''The identifier of this residue.'''
return '%s:%s' % (self._residue.chain_id(), self._residue.full_name())
@property
def chain_identifier(self):
'''The identifier of the chain of which this residue is a part.'''
return self._residue.chain_id()
@property
def author_identifier(self):
'''The author provided identifier of this residue.'''
return '%s:%s' % (self._residue.author_chain_id(), self._residue.author_full_name())
@property
def chain_author_identifier(self):
'''The author provided identifier of the chain of which this residue is a part.'''
return self._residue.author_chain_id()
@property
def three_letter_code(self):
'''The three letter code of the amino acid.'''
return self._residue.amino_acid().three_letter_code(self._residue.amino_acid().type())
@property
def one_letter_code(self):
'''The one letter code of the amino acid.'''
return self._residue.amino_acid().one_letter_code()
@property
def backbone_atoms(self):
'''The backbone atoms of the amino acid.'''
return [Atom(_atom=a) for a in self._residue.backbone_atoms()]
@property
def sidechain_atoms(self):
'''The sidechain atoms of this amino acid.'''
return [Atom(_atom=a) for a in self._residue.sidechain_atoms()]
@property
def atoms(self):
'''The atoms of the residue.'''
return self.backbone_atoms + self.sidechain_atoms
@property
def c_alpha(self):
'''The C alpha atom of the residue.'''
return Atom(_atom=self._residue.c_alpha())
@property
def c_beta(self):
'''The C beta atom, or ``None`` if there is no C beta atom.'''
a = self._residue.c_beta()
if a:
return Atom(_atom=a)
@property
def carbonyl_oxygen(self):
'''The carbonyl oxygen atom.'''
return Atom(_atom=self._residue.carbonyl_oxygen())
@property
def n_terminus(self):
'''The N terminus atom.'''
return Atom(_atom=self._residue.n_terminus())
@property
def c_terminus(self):
'''The C terminus atom.'''
return Atom(_atom=self._residue.c_terminus())
@property
def cysteine_sulphur(self):
'''The sulphur of a cysteine residue, or ``None`` if not a cysteine.'''
if self.three_letter_code == 'CYS':
return [a for a in self.sidechain_atoms if a.atomic_symbol == 'S'][0]
@property
def is_hydrophobic(self):
'''Whether the residue is hydrophobic.'''
return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.HYDROPHOBIC)
@property
def is_hydrophilic(self):
'''Whether the residue is hydrophilic.'''
return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.HYDROPHILIC)
@property
def is_acidic(self):
'''Whether the residue is acidic.'''
return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.ACIDIC)
@property
def is_basic(self):
'''Whether the residue is basic.'''
return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.BASIC)
@property
def index(self):
'''The index of this residue in its chain.'''
return self._index
[docs] @nested_class('Protein')
class NucleicAcid(object):
''' A nucleic acid of a protein.'''
def __init__(self, index, _protein_structure=None):
self._index = index
self._protein_structure = _protein_structure
def __len__(self):
'''Length of the nucleic acid.'''
return len(self.nucleotides)
def __str__(self):
return 'NucleicAcid(%s)' % self.identifier
def __getitem__(self, identifier):
'''Get the nucleotide with the given identifier'''
identifier = '%s:%s' % (self.identifier, identifier)
possibles = [r for r in self.nucleotides if r.identifier == identifier]
if len(possibles) == 1:
return possibles[0]
elif len(possibles) > 1:
raise IndexError('More than one nucleotide with the identifier %s' % identifier)
else:
raise IndexError('No nucleotide with the identifier %s' % identifier)
@property
def nucleotides(self):
'''The nucleotides of the nucleic acid'''
nucleic_acid = self._protein_structure.nucleic_acid(self.identifier)
return tuple(
Protein.Nucleotide(i, _nucleotide=nucleic_acid.nucleotide(i)) for i in range(nucleic_acid.n_sub_parts())
)
@property
def sequence(self):
'''The sequence of nucleotide one letter codes in this nucleic acid.'''
return ''.join(r.one_letter_code for r in self.nucleotides)
@property
def index(self):
'''The index of the nucleic acid in the protein.'''
return self._index
@property
def identifier(self):
'''The identifier of the nucleic acid.'''
return self._protein_structure.nucleic_acid(self._index).chain_id()
@property
def author_identifier(self):
'''The author provided identifier of the nucleic acid.'''
return self._protein_structure.nucleic_acid(self._index).author_chain_id()
[docs] @nested_class('Protein')
@total_ordering
class Nucleotide(object):
'''A single nucleotide of a nucleic acid.'''
def __init__(self, index, _nucleotide):
self._index = index
self._nucleotide = _nucleotide
def __str__(self):
return 'Nucleotide(%s)' % self.identifier
__repr__ = __str__
def __eq__(self, other):
return self.nucleic_acid_identifier == other.nucleic_acid_identifier and \
self._nucleotide.sequence_number() == other._nucleotide.sequence_number()
def __ne__(self, other):
return not self.__eq__(other)
def __lt__(self, other):
# The < operator should order first by chain ID, and second by sequence number
if self.nucleic_acid_identifier == other.nucleic_acid_identifier:
return self._nucleotide.sequence_number() < other._nucleotide.sequence_number()
return self.nucleic_acid_identifier < other.nucleic_acid_identifier
def __hash__(self):
return hash(str(self))
@property
def index(self):
'''The index of this nucleotide in its nucleic acid.'''
return self._index
@property
def identifier(self):
'''The identifier of this nucleotide.'''
return '%s:%s' % (self._nucleotide.chain_id(), self._nucleotide.full_name())
@property
def nucleic_acid_identifier(self):
'''The identifier of the nucleic acid of which this nucleotide is a part.'''
return self._nucleotide.chain_id()
@property
def author_identifier(self):
'''The author provided identifier of this nucleotide.'''
return '%s:%s' % (self._nucleotide.author_chain_id(), self._nucleotide.author_full_name())
@property
def nucleic_acid_author_identifier(self):
'''The author provided identifier of the nucleic acid of which this nucleotide is a part.'''
return self._nucleotide.author_chain_id()
@property
def code(self):
'''The PDB nucleotide code.'''
return self._nucleotide.code()
@property
def one_letter_code(self):
'''The nucleotide one letter code.'''
return self._nucleotide.one_letter_code()
@property
def atoms(self):
'''The atoms of the nucleotide.'''
return [Atom(_atom=self._nucleotide.atom(i)) for i in range(self._nucleotide.natoms())]
def __init__(self, identifier, _molecule=None, _protein_structure=None, _cell=None):
if _molecule is not None:
Molecule.__init__(self, identifier=identifier, _molecule=_molecule)
if _protein_structure is None:
self._cell = _cell
if self._cell is None:
self._cell = ChemistryLib.Cell()
self._crystal = self._protein_structure = FileFormatsLib.create_protein_structure(
self._cell, _molecule
)
else:
self._crystal = self._protein_structure = _protein_structure
self._cell = _protein_structure.cell()
for i in range(self._protein_structure.nwaters()):
w = self._protein_structure.water(i)
for j in range(w.natoms()):
a = w.atom(j)
if a.element().atomic_symbol() == 'O':
a.set_label(a.annotations().obtain_ProteinSubstructureData().full_name())
@staticmethod
def _make_molecule(protein_part, add_hs=False):
editable_mol = FileFormatsLib.create_editable_molecule(protein_part)
mol = Molecule('%s:%s' % (protein_part.chain_id(), protein_part.full_name()), editable_mol)
if all(a.atomic_number > 0 for a in mol.atoms):
mol.assign_bond_types('Unknown')
if add_hs:
mol.add_hydrogens()
if isinstance(protein_part, FileFormatsLib.Ligand):
mol.is_covalent = protein_part.covalent_link()
mol._protein_part = protein_part
mol.author_identifier = f'{protein_part.author_chain_id()}:{protein_part.author_full_name()}'
return mol
[docs] @staticmethod
def from_entry(entry):
'''Constructs a protein from a given :class:`ccdc.entry.Entry`.
:param entry: Entry from which to construct the protein.
'''
p = Protein(entry.identifier, _molecule=entry.molecule._molecule, _cell=entry.crystal._crystal.cell())
p.attributes = entry.attributes
if hasattr(entry, 'atom_sets'):
p.atom_sets = entry.atom_sets
return p
[docs] @staticmethod
def from_file(file_name):
''' Reads a protein from a file, and constructs the protein. '''
fname, ext = os.path.splitext(file_name.lower())
is_pdb_file = ext in ['.pdb', '.ent']
is_mol2_file = ext == '.mol2'
if not is_pdb_file:
e = EntryReader(file_name)[0]
prot = Protein.from_entry(e)
else:
# If reading a PDB file, load it using the ProteinPDBFile class
f = FileFormatsLib.ProteinPdbFile(file_name)
_protein = f.protein()
editable = _protein.editable_molecule()
identifier = f.identifier().str()
if not identifier:
identifier, _ = os.path.splitext(os.path.basename(fname))
prot = Protein(identifier, _molecule=editable, _protein_structure=_protein)
if is_mol2_file:
# Reconstitute Residue ids for aminoacid residues, waters, ligands, cofactors and metals
for residue in prot.residues:
for a in residue.atoms:
a.residue_label = residue.three_letter_code
for water in prot.waters:
for a in water.atoms:
a.residue_label = 'HOH'
for ligand in prot.ligands:
for a in ligand.atoms:
a.residue_label = ligand.identifier[:3]
for cofactor in prot.cofactors:
for a in cofactor.atoms:
a.residue_label = cofactor.identifier[:3]
for nucleic_acid in prot.nucleic_acids:
for nucleotide in nucleic_acid.nucleotides:
for a in nucleotide.atoms:
a.residue_label = nucleotide.code
for metal in prot.metals:
metal.residue_label = metal.label
return prot
[docs] @staticmethod
def known_cofactor_codes():
'''Provide access to a list of known cofactors codes in the underlying library.'''
return FileFormatsLib.Cofactor.known_cofactor_codes()
[docs] def copy(self):
'''Copies the protein.'''
return Protein(self.identifier, _molecule=Molecule.copy(self)._molecule, _cell=self._cell)
@property
def chains(self):
'''A tuple of :class:`ccdc.protein.Protein.Chain`.'''
return tuple(
Protein.Chain(i, self._protein_structure)
for i in range(self._protein_structure.nchains())
)
[docs] def remove_chain(self, chain_id):
'''Remove the chain with the given identifier.'''
if self._protein_structure.has_chain(chain_id):
self._protein_structure.remove_chain(chain_id)
else:
raise IndexError('Protein does not have a chain %s' % chain_id)
@property
def residues(self):
'''The amino acid residues of the protein.'''
return tuple(
r for c in self.chains for r in c.residues
)
[docs] def remove_residue(self, residue_id):
'''Remove the specified residue.'''
self._protein_structure.remove_residue(self[residue_id]._residue, False)
@property
def nucleic_acids(self):
'''A tuple of :class:`ccdc.protein.Protein.NucleicAcid`.'''
return tuple(
Protein.NucleicAcid(i, self._protein_structure)
for i in range(self._protein_structure.n_nucleic_acids())
)
[docs] def remove_nucleic_acid(self, nucleic_acid_chain_id):
'''Remove the chain with the given identifier.'''
if self._protein_structure.has_nucleic_acid(nucleic_acid_chain_id):
self._protein_structure.remove_nucleic_acid(self._protein_structure.nucleic_acid(nucleic_acid_chain_id))
else:
raise IndexError('Protein does not have a nucleic acid %s' % nucleic_acid_chain_id)
@property
def nucleotides(self):
'''The nucleotides of the protein.'''
return tuple(
n for nc in self.nucleic_acids for n in nc.nucleotides
)
[docs] def remove_nucleotide(self, nucleotide_id):
'''Remove the specified nucleotide.'''
self._protein_structure.remove_nucleotide(self[nucleotide_id]._nucleotide, False)
@property
def ligands(self):
'''The tuple of ligands in the protein.
The identifier of the molecule is of the form chain_id:residue_name.
Note that hydrogen atoms are added automatically to the returned molecules
however these are not added to the parent protein.
'''
return tuple(
Protein._make_molecule(self._protein_structure.ligand(i), add_hs=True)
for i in range(self._protein_structure.nligands())
)
@property
def cofactors(self):
'''The tuple of cofactors in the protein.
The identifier of the molecule is of the form chain_id:residue_name.
Note that hydrogen atoms are added automatically to the returned molecules
however these are not added to the parent protein.
'''
return tuple(
Protein._make_molecule(self._protein_structure.cofactor(i), add_hs=True)
for i in range(self._protein_structure.ncofactors())
)
def _prepare_molecule_for_add(self, molecule):
'''Add annotations to a molecule in preparation for being added to the protein'''
# Cope with Chain IDs
if ':' in molecule.identifier:
bits = molecule.identifier.split(':')
chain_id = bits[0]
identifier = bits[1]
else:
chain_id = ''
identifier = molecule.identifier
# Cope with GOLD identifiers
if '|' in identifier:
bits = identifier.split('|')
chain_id = ''
identifier = bits[1]
max_res = max(a._atom.annotations().obtain_ProteinSubstructureData().residue_sequence_number() for a in self.atoms)
for a in molecule.atoms:
at = a._atom
psd = at.annotations().obtain_ProteinSubstructureData()
psd.set_full_name(identifier)
psd.set_residue_name(identifier)
psd.set_chain_id(chain_id)
psd.set_residue_sequence_number(max_res+1)
[docs] def add_ligand(self, molecule):
'''Add a molecule to the protein as a ligand.'''
self._prepare_molecule_for_add(molecule)
l = self._protein_structure.add_ligand(molecule._molecule, False)
[docs] def remove_ligand(self, ligand_id):
'''Remove the specified ligand.
:param ligand_id: str, of the form chain_id:ligand_id.
'''
ligs = [self._protein_structure.ligand(i) for i in range(self._protein_structure.nligands())]
chain_id, name = ligand_id.split(':')
for l in ligs:
if l.chain_id() == chain_id and l.full_name() == name:
self._protein_structure.remove_ligand(l, False)
break
else:
raise TypeError('Protein has no ligand %s' % ligand_id)
[docs] def add_cofactor(self, molecule):
'''Add a molecule to the protein as a cofactor.'''
self._prepare_molecule_for_add(molecule)
l = self._protein_structure.add_cofactor(molecule._molecule, False)
[docs] def remove_cofactor(self, cofactor_id):
'''Remove the specified cofactor.
:param cofactor_id: str, of the form chain_id:cofactor_id.
'''
ligs = [self._protein_structure.cofactor(i) for i in range(self._protein_structure.ncofactors())]
chain_id, name = cofactor_id.split(':')
for l in ligs:
if l.chain_id() == chain_id and l.full_name() == name:
self._protein_structure.remove_cofactor(l, False)
break
else:
raise TypeError('Protein has no cofactor %s' % cofactor_id)
def update_protein(self):
self._protein_structure.notify_molecule_changed(True)
@property
def sequence(self):
'''The one-letter code sequence.'''
return ' '.join(c.sequence for c in self.chains)
@property
def waters(self):
'''The waters of the protein.
:return: a tuple of :class:`ccdc.molecule.Molecule`, representing the oxygens of the water.
'''
return tuple(
Protein._make_molecule(self._protein_structure.water(i))
for i in range(self._protein_structure.nwaters())
)
[docs] def remove_water(self, water_mols):
'''Remove the water (or waters). If water_mols is a list (or tuple) of
water objects remove all waters in said list or tuple
'''
if hasattr(water_mols,"identifier"):
water_mols = [water_mols]
water_mols_ids = set(w.identifier for w in water_mols)
removed_water_ids = []
indexes = []
for i, w in enumerate(self.waters):
if w.identifier in water_mols_ids:
removed_water_ids.append(w.identifier)
indexes.append(i)
if len(removed_water_ids) == len(water_mols_ids):
break
else:
raise TypeError('Protein has no water(s) %s' % ",".join(list(water_mols_ids -set(removed_water_ids))))
# These are removed in reverse to avoid the underlying protein structure
# invalidating a later water that needs removing by removing an earlier water.
for i in sorted(indexes, reverse=True):
self._protein_structure.remove_water(self._protein_structure.water(i), False)
[docs] def remove_all_waters(self):
'''Removes all waters from the protein.'''
for i in range(self._protein_structure.nwaters()-1, -1, -1):
self._protein_structure.remove_water(self._protein_structure.water(i), False)
@property
def metals(self):
'''The metal atoms of the protein.'''
metals = []
for i in range(self._protein_structure.nmetals()):
m = self._protein_structure.metal(i)
for inx in m.atom_indices():
atom = Atom(_atom=self._molecule.atom(inx))
atom.identifier = f'{m.chain_id()}:{m.full_name()}'
atom.author_identifier = f'{m.author_chain_id()}:{m.author_full_name()}'
metals.append(atom)
return tuple(metals)
def __getitem__(self, identifier):
'''Return the residue, nucleotide, chain or nucleic acid with the given identifier.
:param identifier: may be of the form chain_id:residue_id, chain_id:nucleotide_id, chain_id or nucleic_acid_id
:return: :class:`ccdc.protein.Protein` or :class:`ccdc.protein.Protein.Residue` or
:class:`ccdc.protein.Protein.NucleicAcid` or :class:`ccdc.protein.Protein.Nucleotide
'''
if ':' in identifier:
cid, rid = identifier.split(':')
else:
cid, rid = identifier, None
if self._protein_structure.has_chain(cid):
c = [ch for ch in self.chains if ch.identifier == cid][0]
if rid is not None:
return c[rid]
else:
return c
elif self._protein_structure.has_nucleic_acid(cid):
nc = [nc for nc in self.nucleic_acids if nc.identifier == cid][0]
if rid is not None:
return nc[rid]
else:
return nc
else:
raise IndexError('Protein has no chain or nucleic acid %s (%s)' % (cid, identifier))
@property
def cavity_atoms(self):
'''The atoms making up the binding site, if this was read from a gold protein.'''
if not hasattr(self, '_cavity_atoms'):
self._cavity_atoms = None
if hasattr(self, 'atom_sets'):
if 'CAVITY_ATOMS' in self.atom_sets:
self._cavity_atoms = tuple(
self.atoms[i] for i in self.atom_sets['CAVITY_ATOMS']
)
return self._cavity_atoms
@property
def cavity_residues(self):
'''The residues making up the cavity.'''
if not hasattr(self, '_cavity_residues'):
self._cavity_residues = None
if not hasattr(self, '_atom_residue_map'):
self._atom_residue_map = {
a : r
for r in self.residues
for a in r.atoms
}
ats = self.cavity_atoms
if ats is not None:
self._cavity_residues = set(
self._atom_residue_map[a]
for a in ats
if a in self._atom_residue_map
)
self._cavity_residues = list(self._cavity_residues)
self._cavity_residues.sort()
return self._cavity_residues
[docs] def remove_hydrogens(self):
'''Remove all hydrogens from the protein'''
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
structure_editor.remove_all_hydrogens()
[docs] def add_hydrogens(self, mode='All',rules_file = None):
'''
Add hydrogens to the protein structure
This method protonates the protein structure by performing the following operations:
* Remove metal bonds
* Assign ligand and cofactor bond types and standardise aromatic and delocalised bonds to CSD conventions
* Set atom charges to zero
* Set bond types for ARG, GLU, ASP appropriately
* Apply protonation rules to ligands and cofactors
* Add hydrogens to protein, ligands, cofactors, nucleic acids and waters where necessary
* Set any remaining unknown bond type to single
:param mode: 'all' to generate all hydrogens (remove any existing
hydrogens first) or 'missing' to generate hydrogens deemed to
be missing.
:param rules_file: File of rules that express special cases - if None, a default version will be used
:raises FileNotFoundError: if the rules_file passed in doesnt exist
:raises ValueError: if mode is not either 'all' or 'missing'
'''
if mode.lower() not in ['all', 'missing']:
raise ValueError(
'add_hydrogens: %s should be all or missing' % mode
)
if rules_file is None:
rules_dir = _CSDDatabaseLocator.get_optimisation_parameter_file_location()
rules_file = os.path.join(rules_dir, 'protonation_rules.txt')
elif not os.path.exists(rules_file):
raise FileNotFoundError("The rules file {} does not exist".format(rules_file))
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
if mode.lower() == 'all':
self.remove_hydrogens()
rules = ChemicalAnalysisLib.ProtonationRules(rules_file)
structure_editor.add_hydrogens(rules)
[docs] def normalise_labels(self, mode='pdb'):
'''
Normalise labels of atoms in the protein structure
:param mode: 'pdb' (the default) will try to normalise the labels to PDB compliance if possible
(i.e. no longer than 4 characters.) If labels are already compliant they will not be changed
'force' will call the normalisation regardless of whether they are already compliant
'molecule' will normalise using :attr:`ccdc.molecule.Molecule.normalise_labels`
'''
if mode.lower() == 'molecule':
Molecule.normalise_labels(self)
elif mode.lower() == 'force':
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
structure_editor.normalise_atom_labels()
else:
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
if not structure_editor.are_labels_pdb_compliant():
structure_editor.normalise_atom_labels()
[docs] def are_labels_pdb_compliant(self):
'''
Are labels writeable in PDB format?
'''
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
return structure_editor.are_labels_pdb_compliant()
[docs] def sort_atoms_by_residue(self):
'''
Sorts atoms by residue
After editing, sometimes the underlying atom list in a protein is not sorted by residue so atoms
in a single residue are not in a single block of atoms. In particular, adding hydrogens will add new
hydrogen atoms to the end of the atom list.
Calling this method will re-order the atoms in the protein so that each residue is in a single atom block in
the atom list. This is useful in particular if you are writing PDB files where having residues as single blocks
of ATOM lines is desirable.
Note that calling this method will mean that any pre-existing indexes into the atom list will probably
be invalidated.
'''
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
structure_editor.sort_atoms_by_residue()
[docs] def assign_unique_chain_identifiers(self, mode='all'):
'''
Assigns unique chain identifiers to disconnected chain fragments and/or non-chain components
Occasionally, for example when merging two proteins, you can have a protein that has multiple chains with
the same chain identifier. This can lead to issues down the line: for example file writers may end up mixing
the order of residues as the code will 'see' the residues as belonging to the same chain and sort them by sequence
number
Calling this method will reassign chain identifiers to all components. Each disconnected chain will get a unique
chain ID. Note that there is no guarentee that old chain IDs will be the same. If, say previously the first chain was B
and the second A the chain identifiers may reverse.
Ligands, Co-factors, Metals and Waters will also get new chain ids: these will be derived based on which chain they are
overall closest to (i.e. a ligand is given the chain id of the chain that has the most atoms within 5.0 Angstroms.)
Note that calling this method will mean that any pre-existing indexes into the atom list will probably
be invalidated, as the consequence of changing ids can mean a re-ordering of atoms and residues.
:param mode: 'all' (the default) will assign chains new IDs to chain and then associate non-chain components
(e.g.co-factors, waters, ligands and metals) to have the chain id of the chain they are nearest to.
'nonchain' will just re-assign chain ids to co-factors, waters, ligands and metals based on chain proximity.
'''
structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
if mode == 'all':
structure_editor.assign_chain_ids_to_components()
else:
structure_editor.assign_chain_ids_to_non_chain_components()
[docs] def detect_ligand_bonds(self, covalent_links = 'include'):
'''
Removes all bonds between ligand or cofactor atoms, and redetects them based on
distance between atoms.
This can be useful if the bonds specified by the CONECT records in the
PDB are unspecified or undesirable.
:param mode: covalent_links 'include' to include covalent links between the protein and the ligand (the default) and 'exclude' to remove them
'''
covlinks = True
if covalent_links == 'exclude':
covlinks = False
self._protein_structure.assign_ligand_connectivity(covlinks)
self._protein_structure.build_protein_structure()
[docs] class BindingSite(object):
'''A binding site in the protein.'''
def __init__(self, protein, whole_residues=True):
self.protein = protein
self.whole_residues = whole_residues
@property
def atoms(self):
'''The atoms of the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
if self.whole_residues:
protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
self.protein.atoms[i] for x in protein_parts for i in x.atom_indices()
)
else:
return tuple(Atom(_atom=a) for a in ats)
@property
def residues(self):
'''The residues of the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
self.protein['%s:%s' % (x.chain_id(), x.full_name())]
for x in protein_parts
if x.type() == AnnotationsLib.ProteinSubstructureData.AMINOACID
)
@property
def nucleotides(self):
'''The nucleotides of the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
self._pp = protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
self.protein['%s:%s' % (x.chain_id(), x.full_name())]
for x in protein_parts
if x.type() == AnnotationsLib.ProteinSubstructureData.NUCLEOTIDE
)
@property
def waters(self):
'''The waters of the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
Protein._make_molecule(x)
for x in protein_parts
if x.type() == AnnotationsLib.ProteinSubstructureData.WATER
)
@property
def ligands(self):
'''The ligands of the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
Protein._make_molecule(x)
for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.LIGAND
)
@property
def cofactors(self):
'''The cofactors of the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
Protein._make_molecule(x)
for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.COFACTOR
)
@property
def metals(self):
'''The metals in the cavity.'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
protein_parts = bsd.expand_to_whole_parts(ats)
return tuple(
self.protein.atoms[i]
for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.METAL
for i in x.atom_indices()
)
@property
def _molecule(self):
'''Molecule comprising only the cavity atoms of the protein'''
bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
ats = self._detect_binding_site(bsd)
if self.whole_residues:
ats = bsd.expand_to_whole_parts_atoms(ats)
return bsd.binding_site_subset(ats)
@property
def identifier(self):
return self.protein.identifier
@property
def formula(self):
'''Return the chemical formula of the molecules in the binding site.'''
return str(ChemistryLib.MoleculeChemicalFormula(self._molecule))
[docs] class BindingSiteFromPoint(BindingSite):
'''A cavity defined from a point.'''
def __init__(self, protein, origin=(0,0,0), distance=12.):
Protein.BindingSite.__init__(self, protein)
self.origin = origin
self.distance = distance
def _detect_binding_site(self, bsd):
return bsd.detect_binding_site(self.origin, self.distance)
[docs] class BindingSiteFromAtom(BindingSiteFromPoint):
'''A binding site defined from a protein atom.'''
def __init__(self, protein, atom, distance):
Protein.BindingSiteFromPoint.__init__(self, protein, atom.coordinates, distance)
self.atom = atom
[docs] class BindingSiteFromMolecule(BindingSite):
'''A binding site defined from an arbitrary molecule.'''
def __init__(self, protein, molecule, distance, whole_residues=True):
Protein.BindingSite.__init__(self, protein, whole_residues=whole_residues)
self.molecule = molecule
self.distance = distance
def _detect_binding_site(self, bsd):
return bsd.detect_binding_site(self.molecule._molecule, self.distance)
[docs] class BindingSiteFromResidue(BindingSiteFromMolecule):
'''A binding site defined from protein residue.'''
def __init__(self, protein, residue, distance):
mol = Protein._make_molecule(residue._residue)
Protein.BindingSiteFromMolecule.__init__(self, protein, mol, distance)
self.residue = residue
[docs] class BindingSiteFromListOfAtoms(BindingSite):
'''A binding site defined from a list of protein atoms.'''
def __init__(self, protein, atoms):
Protein.BindingSite.__init__(self, protein)
self.list_of_atoms = atoms
def _detect_binding_site(self, bsd):
bsd.set_binding_site_atoms(tuple(a._atom for a in self.list_of_atoms))
return bsd.binding_site_atoms()
[docs] class BindingSiteFromListOfResidues(BindingSiteFromListOfAtoms):
'A binding site from a list of residues.'''
def __init__(self, protein, list_of_residues):
self.list_of_atoms = tuple(a for r in list_of_residues for a in r.atoms)
self.list_of_residues = list_of_residues
Protein.BindingSiteFromListOfAtoms.__init__(self, protein, self.list_of_atoms)
[docs] @nested_class('Protein')
class ChainSuperposition(object):
'''Class for superposition of protein chains using sequence alignment'''
_superposition_mode = bidirectional_dict(
CALPHA=ProteinLib.ChainTransformationCalculator.CALPHA, # Overlay on C-alpha atoms only
BACKBONE=ProteinLib.ChainTransformationCalculator.BACKBONE, # Overlay on backbone atoms
RIGID=ProteinLib.ChainTransformationCalculator.RIGID # Overlay on backbone and C-beta atoms
)
[docs] @nested_class('Protein.ChainSuperposition')
class Settings(object):
'''Configuration options for the superposition of protein chains.'''
def __init__(self):
self.overlay_weighting_factor = 10.
'''weighting factor to use in overlay'''
self.overlay_minimum_cycles = 1
'''minimum number of cycles in overlay'''
self.overlay_convergence_tolerance = 0.01
'''tolerance for convergence in overlay'''
self.sequence_alignment_tool = None
'''external sequence alignment program'''
self.sequence_search_tool = None
'''external sequence search program'''
self.superposition_atoms = 'RIGID'
'''protein chain atoms to use in overlay (RIGID, BACKBONE or CALPHA)'''
def __init__(self, settings=None):
'''Set the chain superposition settings.'''
ProteinLib.licence_check()
if settings is None:
settings = Protein.ChainSuperposition.Settings()
self.settings = settings
[docs] def superpose(self, chain1, chain2, binding_site1=None):
'''Superpose two protein chains or binding sites
An implementation of the Smith-Waterman algorithm is used unless
an external sequence alignment tool is specified in the settings.
If a binding site is supplied for the first chain, only the atoms
in the binding site will be overlaid.
:param chain1: a :class:`ccdc.protein.Chain` instance
:param chain2: a :class:`ccdc.protein.Chain` instance
:param binding_site1: a :class:`ccdc.protein.BindingSite` instance for the first chain
:returns: the root-mean square deviation of the overlay and the transformation matrix
'''
if self.settings.sequence_alignment_tool is None:
alignment_calculator = ProteinLib.SmithWatermanChainAlignmentCalculator()
else:
alignment_calculator = ProteinLib.SequenceBasedChainAlignmentCalculator(
self.settings.sequence_alignment_tool,
self.settings.sequence_search_tool)
residues = alignment_calculator.matched_residues_for_chain_ids(
chain1._protein_structure, chain1.identifier,
chain2._protein_structure, chain2.identifier)
if binding_site1 is not None:
protein_atoms1 = list(a._atom for a in binding_site1.atoms)
chain_transformation = ProteinLib.ChainTransformationCalculator(residues, protein_atoms1)
else:
chain_transformation = ProteinLib.ChainTransformationCalculator(residues)
superposition_result = chain_transformation.calculate(
self.settings.overlay_weighting_factor,
self.settings.overlay_minimum_cycles,
self.settings.overlay_convergence_tolerance,
self._superposition_mode.prefix_lookup(self.settings.superposition_atoms))
ChemistryLib.MoleculeTransformation(chain2._protein_structure.editable_molecule(),
superposition_result.transformation(),
chain2._protein_structure.cell()).apply()
return superposition_result.rms(), Molecule.Transformation(
_transformation=superposition_result.transformation())
###########################################################################