Source code for ccdc.protein

#
# This code is Copyright (C) 2015 The Cambridge Crystallographic Data Centre
# (CCDC) of 12 Union Road, Cambridge CB2 1EZ, UK and a proprietary work of CCDC.
# This code may not be used, reproduced, translated, modified, disassembled or
# copied, except in accordance with a valid licence agreement with CCDC and may
# not be disclosed or redistributed in any form, either in whole or in part, to
# any third party. All copies of this code made in accordance with a valid
# licence agreement as referred to above must contain this copyright notice.
#
# No representations, warranties, or liabilities are expressed or implied in the
# supply of this code by CCDC, its servants or agents, except where such
# exclusion or limitation is prohibited, void or unenforceable under governing
# law.
#
'''
The main class of the :mod:`ccdc.protein` module is :class:`ccdc.protein.Protein`.

A :class:`ccdc.protein.Protein` contains attributes and functions that relate
to protein structures.
'''

###########################################################################

import os
from functools import total_ordering

from ccdc.io import EntryReader, _CSDDatabaseLocator
from ccdc.molecule import Molecule, Atom
from ccdc.utilities import nested_class, bidirectional_dict

from ccdc.utilities import _private_importer
with _private_importer() as pi:
    pi.import_ccdc_module('FileFormatsLib')
    pi.import_ccdc_module('ChemistryLib')
    pi.import_ccdc_module('ChemicalAnalysisLib')
    pi.import_ccdc_module('ProteinLib')
    pi.import_ccdc_module('AnnotationsLib')

###########################################################################


[docs]
class Protein(Molecule):

[docs]
    @nested_class('Protein')
    class Chain(object):
        '''A chain of a protein.'''
        def __init__(self, index, _protein_structure=None):
            self._index = index
            self._protein_structure = _protein_structure

        def __len__(self):
            '''Length of the chain.'''
            return len(self.residues)

        def __str__(self):
            return 'Chain(%s)' % self.identifier

        def __getitem__(self, identifier):
            '''Get the residue with the given identifier.'''
            identifier = '%s:%s' % (self.identifier, identifier)
            possibles = [r for r in self.residues if r.identifier == identifier]
            if len(possibles) == 1:
                return possibles[0]
            elif len(possibles) > 1:
                raise IndexError('More than one residue with the identifier %s' % identifier)
            else:
                raise IndexError('No residue with the identifier %s' % identifier)

        @property
        def residues(self):
            '''The residues of a chain.'''
            chain = self._protein_structure.chain(self.identifier)
            return tuple(
                Protein.Residue(i, _residue=chain.residue(i)) for i in range(chain.n_sub_parts())
            )

        @property
        def sequence(self):
            '''The sequence of amino acid one letter codes in this chain.'''
            return ''.join(r.one_letter_code for r in self.residues)

        @property
        def index(self):
            '''The index of the chain in the protein.'''
            return self._index

        @property
        def identifier(self):
            '''The identifier of the chain.'''
            return self._protein_structure.chain(self._index).chain_id()

        @property
        def author_identifier(self):
            '''The author provided identifier of the chain.'''
            return self._protein_structure.chain(self._index).author_chain_id()



[docs]
    @nested_class('Protein')
    @total_ordering
    class Residue(object):
        '''A single amino acid residue of a protein.'''
        def __init__(self, i, _residue):
            self._residue = _residue
            self._index = i

        def __str__(self):
            return 'Residue(%s)' % self.identifier
        __repr__ = __str__

        def __eq__(self, other):
            return self.chain_identifier == other.chain_identifier and \
                    self._residue.sequence_number() == other._residue.sequence_number()

        def __ne__(self, other):
            return not self.__eq__(other)

        def __lt__(self, other):
            # The < operator should order first by chain ID, and second by sequence number
            if self.chain_identifier == other.chain_identifier:
                return self._residue.sequence_number() < other._residue.sequence_number()

            return self.chain_identifier < other.chain_identifier

        def __hash__(self):
            return hash(str(self))

        @property
        def identifier(self):
            '''The identifier of this residue.'''
            return '%s:%s' % (self._residue.chain_id(), self._residue.full_name())

        @property
        def chain_identifier(self):
            '''The identifier of the chain of which this residue is a part.'''
            return self._residue.chain_id()

        @property
        def author_identifier(self):
            '''The author provided identifier of this residue.'''
            return '%s:%s' % (self._residue.author_chain_id(), self._residue.author_full_name())

        @property
        def chain_author_identifier(self):
            '''The author provided identifier of the chain of which this residue is a part.'''
            return self._residue.author_chain_id()

        @property
        def three_letter_code(self):
            '''The three letter code of the amino acid.'''
            return self._residue.amino_acid().three_letter_code(self._residue.amino_acid().type())

        @property
        def one_letter_code(self):
            '''The one letter code of the amino acid.'''
            return self._residue.amino_acid().one_letter_code()

        @property
        def backbone_atoms(self):
            '''The backbone atoms of the amino acid.'''
            return [Atom(_atom=a) for a in self._residue.backbone_atoms()]

        @property
        def sidechain_atoms(self):
            '''The sidechain atoms of this amino acid.'''
            return [Atom(_atom=a) for a in self._residue.sidechain_atoms()]

        @property
        def atoms(self):
            '''The atoms of the residue.'''
            return self.backbone_atoms + self.sidechain_atoms

        @property
        def c_alpha(self):
            '''The C alpha atom of the residue.'''
            return Atom(_atom=self._residue.c_alpha())

        @property
        def c_beta(self):
            '''The C beta atom, or ``None`` if there is no C beta atom.'''
            a = self._residue.c_beta()
            if a:
                return Atom(_atom=a)

        @property
        def carbonyl_oxygen(self):
            '''The carbonyl oxygen atom.'''
            return Atom(_atom=self._residue.carbonyl_oxygen())

        @property
        def n_terminus(self):
            '''The N terminus atom.'''
            return Atom(_atom=self._residue.n_terminus())

        @property
        def c_terminus(self):
            '''The C terminus atom.'''
            return Atom(_atom=self._residue.c_terminus())

        @property
        def cysteine_sulphur(self):
            '''The sulphur of a cysteine residue, or ``None`` if not a cysteine.'''
            if self.three_letter_code == 'CYS':
                return [a for a in self.sidechain_atoms if a.atomic_symbol == 'S'][0]

        @property
        def is_hydrophobic(self):
            '''Whether the residue is hydrophobic.'''
            return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.HYDROPHOBIC)

        @property
        def is_hydrophilic(self):
            '''Whether the residue is hydrophilic.'''
            return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.HYDROPHILIC)

        @property
        def is_acidic(self):
            '''Whether the residue is acidic.'''
            return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.ACIDIC)

        @property
        def is_basic(self):
            '''Whether the residue is basic.'''
            return self._residue.amino_acid().property(FileFormatsLib.AminoAcid.BASIC)

        @property
        def index(self):
            '''The index of this residue in its chain.'''
            return self._index



[docs]
    @nested_class('Protein')
    class NucleicAcid(object):
        ''' A nucleic acid of a protein.'''
        def __init__(self, index, _protein_structure=None):
            self._index = index
            self._protein_structure = _protein_structure

        def __len__(self):
            '''Length of the nucleic acid.'''
            return len(self.nucleotides)

        def __str__(self):
            return 'NucleicAcid(%s)' % self.identifier

        def __getitem__(self, identifier):
            '''Get the nucleotide with the given identifier'''
            identifier = '%s:%s' % (self.identifier, identifier)
            possibles = [r for r in self.nucleotides if r.identifier == identifier]
            if len(possibles) == 1:
                return possibles[0]
            elif len(possibles) > 1:
                raise IndexError('More than one nucleotide with the identifier %s' % identifier)
            else:
                raise IndexError('No nucleotide with the identifier %s' % identifier)

        @property
        def nucleotides(self):
            '''The nucleotides of the nucleic acid'''
            nucleic_acid = self._protein_structure.nucleic_acid(self.identifier)
            return tuple(
                Protein.Nucleotide(i, _nucleotide=nucleic_acid.nucleotide(i)) for i in range(nucleic_acid.n_sub_parts())
            )

        @property
        def sequence(self):
            '''The sequence of nucleotide one letter codes in this nucleic acid.'''
            return ''.join(r.one_letter_code for r in self.nucleotides)

        @property
        def index(self):
            '''The index of the nucleic acid in the protein.'''
            return self._index

        @property
        def identifier(self):
            '''The identifier of the nucleic acid.'''
            return self._protein_structure.nucleic_acid(self._index).chain_id()

        @property
        def author_identifier(self):
            '''The author provided identifier of the nucleic acid.'''
            return self._protein_structure.nucleic_acid(self._index).author_chain_id()



[docs]
    @nested_class('Protein')
    @total_ordering
    class Nucleotide(object):
        '''A single nucleotide of a nucleic acid.'''
        def __init__(self, index, _nucleotide):
            self._index = index
            self._nucleotide = _nucleotide

        def __str__(self):
            return 'Nucleotide(%s)' % self.identifier
        __repr__ = __str__

        def __eq__(self, other):
            return self.nucleic_acid_identifier == other.nucleic_acid_identifier and \
                   self._nucleotide.sequence_number() == other._nucleotide.sequence_number()

        def __ne__(self, other):
            return not self.__eq__(other)

        def __lt__(self, other):
            # The < operator should order first by chain ID, and second by sequence number
            if self.nucleic_acid_identifier == other.nucleic_acid_identifier:
                return self._nucleotide.sequence_number() < other._nucleotide.sequence_number()
            return self.nucleic_acid_identifier < other.nucleic_acid_identifier

        def __hash__(self):
            return hash(str(self))

        @property
        def index(self):
            '''The index of this nucleotide in its nucleic acid.'''
            return self._index

        @property
        def identifier(self):
            '''The identifier of this nucleotide.'''
            return '%s:%s' % (self._nucleotide.chain_id(), self._nucleotide.full_name())

        @property
        def nucleic_acid_identifier(self):
            '''The identifier of the nucleic acid of which this nucleotide is a part.'''
            return self._nucleotide.chain_id()

        @property
        def author_identifier(self):
            '''The author provided identifier of this nucleotide.'''
            return '%s:%s' % (self._nucleotide.author_chain_id(), self._nucleotide.author_full_name())

        @property
        def nucleic_acid_author_identifier(self):
            '''The author provided identifier of the nucleic acid of which this nucleotide is a part.'''
            return self._nucleotide.author_chain_id()

        @property
        def code(self):
            '''The PDB nucleotide code.'''
            return self._nucleotide.code()

        @property
        def one_letter_code(self):
            '''The nucleotide one letter code.'''
            return self._nucleotide.one_letter_code()

        @property
        def atoms(self):
            '''The atoms of the nucleotide.'''
            return [Atom(_atom=self._nucleotide.atom(i)) for i in range(self._nucleotide.natoms())]


    def __init__(self, identifier, _molecule=None, _protein_structure=None, _cell=None):
        if _molecule is not None:
            Molecule.__init__(self, identifier=identifier, _molecule=_molecule)
            if _protein_structure is None:
                self._cell = _cell
                if self._cell is None:
                    self._cell = ChemistryLib.Cell()
                self._crystal = self._protein_structure = FileFormatsLib.create_protein_structure(
                    self._cell, _molecule
                )
            else:
                self._crystal = self._protein_structure = _protein_structure
                self._cell = _protein_structure.cell()
            for i in range(self._protein_structure.nwaters()):
                w = self._protein_structure.water(i)
                for j in range(w.natoms()):
                    a = w.atom(j)
                    if a.element().atomic_symbol() == 'O':
                        a.set_label(a.annotations().obtain_ProteinSubstructureData().full_name())

    @staticmethod
    def _make_molecule(protein_part, add_hs=False):
        editable_mol = FileFormatsLib.create_editable_molecule(protein_part)
        mol = Molecule('%s:%s' % (protein_part.chain_id(), protein_part.full_name()), editable_mol)
        if all(a.atomic_number > 0 for a in mol.atoms):
            mol.assign_bond_types('Unknown')
            if add_hs:
                mol.add_hydrogens()
        if isinstance(protein_part, FileFormatsLib.Ligand):
            mol.is_covalent = protein_part.covalent_link()
        mol._protein_part = protein_part
        mol.author_identifier = f'{protein_part.author_chain_id()}:{protein_part.author_full_name()}'
        return mol


[docs]
    @staticmethod
    def from_entry(entry):
        '''Constructs a protein from a given :class:`ccdc.entry.Entry`.

        :param entry: Entry from which to construct the protein.
        '''
        p = Protein(entry.identifier, _molecule=entry.molecule._molecule, _cell=entry.crystal._crystal.cell())
        p.attributes = entry.attributes
        if hasattr(entry, 'atom_sets'):
            p.atom_sets = entry.atom_sets
        return p



[docs]
    @staticmethod
    def from_file(file_name):
        ''' Reads a protein from a file, and constructs the protein. '''
        fname, ext = os.path.splitext(file_name.lower())
        is_pdb_file = ext in ['.pdb', '.ent']
        is_mol2_file = ext == '.mol2'

        if not is_pdb_file:
            e = EntryReader(file_name)[0]
            prot = Protein.from_entry(e)
        else:
            # If reading a PDB file, load it using the ProteinPDBFile class
            f = FileFormatsLib.ProteinPdbFile(file_name)
            _protein = f.protein()
            editable = _protein.editable_molecule()
            identifier = f.identifier().str()
            if not identifier:
                identifier, _ = os.path.splitext(os.path.basename(fname))
            prot = Protein(identifier, _molecule=editable, _protein_structure=_protein)

        if is_mol2_file:
            # Reconstitute Residue ids for aminoacid residues, waters, ligands, cofactors and metals
            for residue in prot.residues:
                for a in residue.atoms:
                    a.residue_label = residue.three_letter_code

            for water in prot.waters:
                for a in water.atoms:
                    a.residue_label = 'HOH'

            for ligand in prot.ligands:
                for a in ligand.atoms:
                    a.residue_label = ligand.identifier[:3]

            for cofactor in prot.cofactors:
                for a in cofactor.atoms:
                    a.residue_label = cofactor.identifier[:3]

            for nucleic_acid in prot.nucleic_acids:
                for nucleotide in nucleic_acid.nucleotides:
                    for a in nucleotide.atoms:
                        a.residue_label = nucleotide.code

            for metal in prot.metals:
                metal.residue_label = metal.label

        return prot



[docs]
    @staticmethod
    def known_cofactor_codes():
        '''Provide access to a list of known cofactors codes in the underlying library.'''
        return FileFormatsLib.Cofactor.known_cofactor_codes()



[docs]
    def copy(self):
        '''Copies the protein.'''
        return Protein(self.identifier, _molecule=Molecule.copy(self)._molecule, _cell=self._cell)


    @property
    def chains(self):
        '''A tuple of :class:`ccdc.protein.Protein.Chain`.'''
        return tuple(
            Protein.Chain(i, self._protein_structure)
            for i in range(self._protein_structure.nchains())
        )


[docs]
    def remove_chain(self, chain_id):
        '''Remove the chain with the given identifier.'''
        if self._protein_structure.has_chain(chain_id):
            self._protein_structure.remove_chain(chain_id)
        else:
            raise IndexError('Protein does not have a chain %s' % chain_id)


    @property
    def residues(self):
        '''The amino acid residues of the protein.'''
        return tuple(
            r for c in self.chains for r in c.residues
        )


[docs]
    def remove_residue(self, residue_id):
        '''Remove the specified residue.'''
        self._protein_structure.remove_residue(self[residue_id]._residue, False)


    @property
    def nucleic_acids(self):
        '''A tuple of :class:`ccdc.protein.Protein.NucleicAcid`.'''
        return tuple(
            Protein.NucleicAcid(i, self._protein_structure)
            for i in range(self._protein_structure.n_nucleic_acids())
        )


[docs]
    def remove_nucleic_acid(self, nucleic_acid_chain_id):
        '''Remove the chain with the given identifier.'''
        if self._protein_structure.has_nucleic_acid(nucleic_acid_chain_id):
            self._protein_structure.remove_nucleic_acid(self._protein_structure.nucleic_acid(nucleic_acid_chain_id))
        else:
            raise IndexError('Protein does not have a nucleic acid %s' % nucleic_acid_chain_id)


    @property
    def nucleotides(self):
        '''The nucleotides of the protein.'''
        return tuple(
            n for nc in self.nucleic_acids for n in nc.nucleotides
        )


[docs]
    def remove_nucleotide(self, nucleotide_id):
        '''Remove the specified nucleotide.'''
        self._protein_structure.remove_nucleotide(self[nucleotide_id]._nucleotide, False)


    @property
    def ligands(self):
        '''The tuple of ligands in the protein.

        The identifier of the molecule is of the form chain_id:residue_name.

        Note that hydrogen atoms are added automatically to the returned molecules
        however these are not added to the parent protein.
        '''
        return tuple(
            Protein._make_molecule(self._protein_structure.ligand(i), add_hs=True)
            for i in range(self._protein_structure.nligands())
        )

    @property
    def cofactors(self):
        '''The tuple of cofactors in the protein.

        The identifier of the molecule is of the form chain_id:residue_name.

        Note that hydrogen atoms are added automatically to the returned molecules
        however these are not added to the parent protein.
        '''
        return tuple(
            Protein._make_molecule(self._protein_structure.cofactor(i), add_hs=True)
            for i in range(self._protein_structure.ncofactors())
        )

    def _prepare_molecule_for_add(self, molecule):
        '''Add annotations to a molecule in preparation for being added to the protein'''
        
        # Cope with Chain IDs
        if ':' in molecule.identifier:
            bits = molecule.identifier.split(':')
            chain_id = bits[0]
            identifier = bits[1]
        else:
            chain_id = ''
            identifier = molecule.identifier
        # Cope with GOLD identifiers
        if '|' in identifier:
            bits = identifier.split('|')
            chain_id = ''
            identifier = bits[1]
        max_res = max(a._atom.annotations().obtain_ProteinSubstructureData().residue_sequence_number() for a in self.atoms)
        for a in molecule.atoms:
            at = a._atom
            psd = at.annotations().obtain_ProteinSubstructureData()
            psd.set_full_name(identifier)
            psd.set_residue_name(identifier)
            psd.set_chain_id(chain_id)
            psd.set_residue_sequence_number(max_res+1)


[docs]
    def add_ligand(self, molecule):
        '''Add a molecule to the protein as a ligand.'''
        self._prepare_molecule_for_add(molecule)
        l = self._protein_structure.add_ligand(molecule._molecule, False)



[docs]
    def remove_ligand(self, ligand_id):
        '''Remove the specified ligand.

        :param ligand_id: str, of the form chain_id:ligand_id.
        '''
        ligs = [self._protein_structure.ligand(i) for i in range(self._protein_structure.nligands())]
        chain_id, name = ligand_id.split(':')
        for l in ligs:
            if l.chain_id() == chain_id and l.full_name() == name:
                self._protein_structure.remove_ligand(l, False)
                break
        else:
            raise TypeError('Protein has no ligand %s' % ligand_id)



[docs]
    def add_cofactor(self, molecule):
        '''Add a molecule to the protein as a cofactor.'''
        self._prepare_molecule_for_add(molecule)
        l = self._protein_structure.add_cofactor(molecule._molecule, False)



[docs]
    def remove_cofactor(self, cofactor_id):
        '''Remove the specified cofactor.

        :param cofactor_id: str, of the form chain_id:cofactor_id.
        '''
        ligs = [self._protein_structure.cofactor(i) for i in range(self._protein_structure.ncofactors())]
        chain_id, name = cofactor_id.split(':')
        for l in ligs:
            if l.chain_id() == chain_id and l.full_name() == name:
                self._protein_structure.remove_cofactor(l, False)
                break
        else:
            raise TypeError('Protein has no cofactor %s' % cofactor_id)


    def update_protein(self):
        self._protein_structure.notify_molecule_changed(True)

    @property
    def sequence(self):
        '''The one-letter code sequence.'''
        return ' '.join(c.sequence for c in self.chains)

    @property
    def waters(self):
        '''The waters of the protein.

        :return: a tuple of :class:`ccdc.molecule.Molecule`, representing the oxygens of the water.
        '''
        return tuple(
            Protein._make_molecule(self._protein_structure.water(i))
            for i in range(self._protein_structure.nwaters())
        )


[docs]
    def remove_water(self, water_mols):
        '''Remove the water (or waters). If water_mols is a list (or tuple) of 
           water objects remove all waters in said list or tuple
        '''
        
        if  hasattr(water_mols,"identifier"):
            water_mols = [water_mols]

        water_mols_ids = set(w.identifier for w in water_mols)
        
        removed_water_ids = []
        
        indexes = []
        for i, w in enumerate(self.waters):
            if w.identifier in water_mols_ids:
                removed_water_ids.append(w.identifier)
                indexes.append(i)
                if len(removed_water_ids) == len(water_mols_ids):
                    break
        else:
            raise TypeError('Protein has no water(s) %s' % ",".join(list(water_mols_ids -set(removed_water_ids))))
        
        # These are removed in reverse to avoid the underlying protein structure
        # invalidating a later water that needs removing by removing an earlier water.
        for i in sorted(indexes, reverse=True):
            self._protein_structure.remove_water(self._protein_structure.water(i), False)

        


[docs]
    def remove_all_waters(self):
        '''Removes all waters from the protein.'''
        for i in range(self._protein_structure.nwaters()-1, -1, -1):
            self._protein_structure.remove_water(self._protein_structure.water(i), False)


    @property
    def metals(self):
        '''The metal atoms of the protein.'''
        metals = []
        for i in range(self._protein_structure.nmetals()):
            m = self._protein_structure.metal(i)
            for inx in m.atom_indices():
                atom = Atom(_atom=self._molecule.atom(inx))
                atom.identifier = f'{m.chain_id()}:{m.full_name()}'
                atom.author_identifier = f'{m.author_chain_id()}:{m.author_full_name()}'
                metals.append(atom)
        return tuple(metals)


[docs]
    def remove_metal(self, atom):
        '''Remove the given metal atom.'''
        for i, m in enumerate(self.metals):
            if m.index == atom.index:
                self._protein_structure.remove_metal(self._protein_structure.metal(i), False)
                break
        else:
            raise TypeError('Protein has no metal %s (index %s)' % (atom, str(atom.index)))



[docs]
    def remove_all_metals(self):
        '''Removes all metals from the protein.'''
        metals = self.metals
        self.remove_atoms(metals)



[docs]
    def remove_metal_bonds(self, bonds=None):
        '''Removes metal bonds.

        :param bonds: iterable of :class:`ccdc.molecule.Bond` instances. If ``None`` all metal bonds will be removed.
        '''
        if bonds is None:
            bonds = [b for b in self.bonds if any(a.is_metal for a in b.atoms)]
        self.remove_bonds(bonds)


    def __getitem__(self, identifier):
        '''Return the residue, nucleotide, chain or nucleic acid with the given identifier.

        :param identifier: may be of the form chain_id:residue_id, chain_id:nucleotide_id, chain_id or nucleic_acid_id
        :return: :class:`ccdc.protein.Protein` or :class:`ccdc.protein.Protein.Residue` or
                 :class:`ccdc.protein.Protein.NucleicAcid` or :class:`ccdc.protein.Protein.Nucleotide
        '''
        if ':' in identifier:
            cid, rid = identifier.split(':')
        else:
            cid, rid = identifier, None
        if self._protein_structure.has_chain(cid):
            c = [ch for ch in self.chains if ch.identifier == cid][0]
            if rid is not None:
                return c[rid]
            else:
                return c
        elif self._protein_structure.has_nucleic_acid(cid):
            nc = [nc for nc in self.nucleic_acids if nc.identifier == cid][0]
            if rid is not None:
                return nc[rid]
            else:
                return nc
        else:
            raise IndexError('Protein has no chain or nucleic acid %s (%s)' % (cid, identifier))

    @property
    def cavity_atoms(self):
        '''The atoms making up the binding site, if this was read from a gold protein.'''
        if not hasattr(self, '_cavity_atoms'):
            self._cavity_atoms = None
            if hasattr(self, 'atom_sets'):
                if 'CAVITY_ATOMS' in self.atom_sets:
                    self._cavity_atoms = tuple(
                        self.atoms[i] for i in self.atom_sets['CAVITY_ATOMS']
                    )
        return self._cavity_atoms

    @property
    def cavity_residues(self):
        '''The residues making up the cavity.'''
        if not hasattr(self, '_cavity_residues'):
            self._cavity_residues = None
            if not hasattr(self, '_atom_residue_map'):
                self._atom_residue_map = {
                    a : r
                    for r in self.residues
                    for a in r.atoms
                }
            ats = self.cavity_atoms
            if ats is not None:
                self._cavity_residues = set(
                    self._atom_residue_map[a]
                    for a in ats
                    if a in self._atom_residue_map
                )
                self._cavity_residues = list(self._cavity_residues)
                self._cavity_residues.sort()
        return self._cavity_residues


[docs]
    def remove_hydrogens(self):
        '''Remove all hydrogens from the protein'''

        structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
        structure_editor.remove_all_hydrogens()



[docs]
    def add_hydrogens(self, mode='All',rules_file = None):
        '''
        Add hydrogens to the protein structure

        This method protonates the protein structure by performing the following operations:

        * Remove metal bonds
        * Assign ligand and cofactor bond types and standardise aromatic and delocalised bonds to CSD conventions
        * Set atom charges to zero
        * Set bond types for ARG, GLU, ASP appropriately
        * Apply protonation rules to ligands and cofactors
        * Add hydrogens to protein, ligands, cofactors, nucleic acids and waters where necessary
        * Set any remaining unknown bond type to single

        :param mode: 'all' to generate all hydrogens (remove any existing
                     hydrogens first) or 'missing' to generate hydrogens deemed to
                     be missing.
        :param rules_file: File of rules that express special cases - if None, a default version will be used
        :raises FileNotFoundError: if the rules_file passed in doesnt exist
        :raises ValueError: if mode is not either 'all' or 'missing'
        '''
        if mode.lower() not in ['all', 'missing']:
            raise ValueError(
                    'add_hydrogens: %s should be all or missing' % mode
            )

        if rules_file is None:
            rules_dir = _CSDDatabaseLocator.get_optimisation_parameter_file_location()
            rules_file = os.path.join(rules_dir, 'protonation_rules.txt')
        elif not os.path.exists(rules_file):
            raise FileNotFoundError("The rules file {} does not exist".format(rules_file))
        
        structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)

        if mode.lower() == 'all':
            self.remove_hydrogens()
            


        rules = ChemicalAnalysisLib.ProtonationRules(rules_file)

        structure_editor.add_hydrogens(rules)

    

[docs]
    def normalise_labels(self, mode='pdb'):
        '''
        Normalise labels of atoms in the protein structure
        
        :param mode: 'pdb' (the default) will try to normalise the labels to PDB compliance if possible 
                     (i.e. no longer than 4 characters.) If labels are already compliant they will not be changed
                     'force' will call the normalisation regardless of whether they are already compliant
                     'molecule' will normalise using :attr:`ccdc.molecule.Molecule.normalise_labels`                   
        '''
        if mode.lower() == 'molecule':
            Molecule.normalise_labels(self)
        elif mode.lower() == 'force':
            structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
            structure_editor.normalise_atom_labels()
        else:       
            structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
            if not structure_editor.are_labels_pdb_compliant():
                structure_editor.normalise_atom_labels()

    

[docs]
    def are_labels_pdb_compliant(self):
        '''
        Are labels writeable in PDB format?
        '''
        structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
        return structure_editor.are_labels_pdb_compliant()

    

[docs]
    def sort_atoms_by_residue(self):
        '''
        Sorts atoms by residue
        
        After editing, sometimes the underlying atom list in a protein is not sorted by residue so atoms
        in a single residue are not in a single block of atoms. In particular, adding hydrogens will add new
        hydrogen atoms to the end of the atom list.
        
        Calling this method will re-order the atoms in the protein so that each residue is in a single atom block in
        the atom list. This is useful in particular if you are writing PDB files where having residues as single blocks
        of ATOM lines is desirable.
        
        Note that calling this method will mean that any pre-existing indexes into the atom list will probably
        be invalidated.
        '''
        structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
        structure_editor.sort_atoms_by_residue()



[docs]
    def assign_unique_chain_identifiers(self, mode='all'):
        '''

        Assigns unique chain identifiers to disconnected chain fragments and/or non-chain components

        Occasionally, for example when merging two proteins, you can have a protein that has multiple chains with
        the same chain identifier. This can lead to issues down the line: for example file writers may end up mixing
        the order of residues as the code will 'see' the residues as belonging to the same chain and sort them by sequence 
        number

        Calling this method will reassign chain identifiers to all components. Each disconnected chain will get a unique
        chain ID. Note that there is no guarentee that old chain IDs will be the same. If, say previously the first chain was B
        and the second A the chain identifiers may reverse.

        Ligands, Co-factors, Metals and Waters will also get new chain ids: these will be derived based on which chain they are
        overall closest to (i.e. a ligand is given the chain id of the chain that has the most atoms within 5.0 Angstroms.)

        Note that calling this method will mean that any pre-existing indexes into the atom list will probably
        be invalidated, as the consequence of changing ids can mean a re-ordering of atoms and residues.

        :param mode: 'all' (the default) will assign chains new IDs to chain and then associate non-chain components
                     (e.g.co-factors, waters, ligands and metals) to have the chain id of the chain they are nearest to.
                     'nonchain' will just re-assign chain ids to co-factors, waters, ligands and metals based on chain proximity.
        '''

        structure_editor = FileFormatsLib.ProteinStructureEditor(self._protein_structure)
        if mode == 'all':
            structure_editor.assign_chain_ids_to_components()
        else:
            structure_editor.assign_chain_ids_to_non_chain_components()



[docs]
    def detect_ligand_bonds(self, covalent_links = 'include'):
        '''
        Removes all bonds between ligand or cofactor atoms, and redetects them based on
        distance between atoms.

        This can be useful if the bonds specified by the CONECT records in the
        PDB are unspecified or undesirable.

        :param mode: covalent_links 'include' to include covalent links between the protein and the ligand (the default) and 'exclude' to remove them
        '''
        covlinks = True
        if covalent_links == 'exclude':
             covlinks = False

        self._protein_structure.assign_ligand_connectivity(covlinks)
        self._protein_structure.build_protein_structure()



[docs]
    class BindingSite(object):
        '''A binding site in the protein.'''
        def __init__(self, protein, whole_residues=True):
            self.protein = protein
            self.whole_residues = whole_residues

        @property
        def atoms(self):
            '''The atoms of the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            if self.whole_residues:
                protein_parts = bsd.expand_to_whole_parts(ats)
                return tuple(
                    self.protein.atoms[i] for x in protein_parts for i in x.atom_indices()
                )
            else:
                return tuple(Atom(_atom=a) for a in ats)

        @property
        def residues(self):
            '''The residues of the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            protein_parts = bsd.expand_to_whole_parts(ats)
            return tuple(
                self.protein['%s:%s' % (x.chain_id(), x.full_name())]
                for x in protein_parts
                if x.type() == AnnotationsLib.ProteinSubstructureData.AMINOACID
            )

        @property
        def nucleotides(self):
            '''The nucleotides of the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            self._pp = protein_parts = bsd.expand_to_whole_parts(ats)
            return tuple(
                self.protein['%s:%s' % (x.chain_id(), x.full_name())]
                for x in protein_parts
                if x.type() == AnnotationsLib.ProteinSubstructureData.NUCLEOTIDE
            )

        @property
        def waters(self):
            '''The waters of the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            protein_parts = bsd.expand_to_whole_parts(ats)
            return tuple(
                Protein._make_molecule(x)
                for x in protein_parts
                if x.type() == AnnotationsLib.ProteinSubstructureData.WATER
            )

        @property
        def ligands(self):
            '''The ligands of the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            protein_parts = bsd.expand_to_whole_parts(ats)
            return tuple(
                Protein._make_molecule(x)
                for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.LIGAND
            )

        @property
        def cofactors(self):
            '''The cofactors of the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            protein_parts = bsd.expand_to_whole_parts(ats)
            return tuple(
                Protein._make_molecule(x)
                for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.COFACTOR
            )

        @property
        def metals(self):
            '''The metals in the cavity.'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)
            protein_parts = bsd.expand_to_whole_parts(ats)
            return tuple(
                self.protein.atoms[i]
                for x in protein_parts if x.type() == AnnotationsLib.ProteinSubstructureData.METAL
                for i in x.atom_indices()
            )

        @property
        def _molecule(self):
            '''Molecule comprising only the cavity atoms of the protein'''
            bsd = FileFormatsLib.BindingSiteDetector(self.protein._protein_structure)
            ats = self._detect_binding_site(bsd)

            if self.whole_residues:
               ats = bsd.expand_to_whole_parts_atoms(ats)

            return bsd.binding_site_subset(ats)

        @property
        def identifier(self):
            return self.protein.identifier

        @property
        def formula(self):
            '''Return the chemical formula of the molecules in the binding site.'''
            return str(ChemistryLib.MoleculeChemicalFormula(self._molecule))



[docs]
    class BindingSiteFromPoint(BindingSite):
        '''A cavity defined from a point.'''
        def __init__(self, protein, origin=(0,0,0), distance=12.):
            Protein.BindingSite.__init__(self, protein)
            self.origin = origin
            self.distance = distance

        def _detect_binding_site(self, bsd):
            return bsd.detect_binding_site(self.origin, self.distance)



[docs]
    class BindingSiteFromAtom(BindingSiteFromPoint):
        '''A binding site defined from a protein atom.'''
        def __init__(self, protein, atom, distance):
            Protein.BindingSiteFromPoint.__init__(self, protein, atom.coordinates, distance)
            self.atom = atom



[docs]
    class BindingSiteFromMolecule(BindingSite):
        '''A binding site defined from an arbitrary molecule.'''
        def __init__(self, protein, molecule, distance, whole_residues=True):
            Protein.BindingSite.__init__(self, protein, whole_residues=whole_residues)
            self.molecule = molecule
            self.distance = distance

        def _detect_binding_site(self, bsd):
            return bsd.detect_binding_site(self.molecule._molecule, self.distance)



[docs]
    class BindingSiteFromResidue(BindingSiteFromMolecule):
        '''A binding site defined from protein residue.'''
        def __init__(self, protein, residue, distance):
            mol = Protein._make_molecule(residue._residue)
            Protein.BindingSiteFromMolecule.__init__(self, protein, mol, distance)
            self.residue = residue



[docs]
    class BindingSiteFromListOfAtoms(BindingSite):
        '''A binding site defined from a list of protein atoms.'''
        def __init__(self, protein, atoms):
            Protein.BindingSite.__init__(self, protein)
            self.list_of_atoms = atoms

        def _detect_binding_site(self, bsd):
            bsd.set_binding_site_atoms(tuple(a._atom for a in self.list_of_atoms))
            return bsd.binding_site_atoms()



[docs]
    class BindingSiteFromListOfResidues(BindingSiteFromListOfAtoms):
        'A binding site from a list of residues.'''
        def __init__(self, protein, list_of_residues):
            self.list_of_atoms = tuple(a for r in list_of_residues for a in r.atoms)
            self.list_of_residues = list_of_residues
            Protein.BindingSiteFromListOfAtoms.__init__(self, protein, self.list_of_atoms)



[docs]
    @nested_class('Protein')
    class ChainSuperposition(object):
        '''Class for superposition of protein chains using sequence alignment'''
        _superposition_mode = bidirectional_dict(
            CALPHA=ProteinLib.ChainTransformationCalculator.CALPHA,  # Overlay on C-alpha atoms only
            BACKBONE=ProteinLib.ChainTransformationCalculator.BACKBONE,  # Overlay on backbone atoms
            RIGID=ProteinLib.ChainTransformationCalculator.RIGID  # Overlay on backbone and C-beta atoms
        )


[docs]
        @nested_class('Protein.ChainSuperposition')
        class Settings(object):
            '''Configuration options for the superposition of protein chains.'''

            def __init__(self):
                self.overlay_weighting_factor = 10.
                '''weighting factor to use in overlay'''
                self.overlay_minimum_cycles = 1
                '''minimum number of cycles in overlay'''
                self.overlay_convergence_tolerance = 0.01
                '''tolerance for convergence in overlay'''
                self.sequence_alignment_tool = None
                '''external sequence alignment program'''
                self.sequence_search_tool = None
                '''external sequence search program'''
                self.superposition_atoms = 'RIGID'
                '''protein chain atoms to use in overlay (RIGID, BACKBONE or CALPHA)'''


        def __init__(self, settings=None):
            '''Set the chain superposition settings.'''
            ProteinLib.licence_check()
            if settings is None:
                settings = Protein.ChainSuperposition.Settings()
            self.settings = settings


[docs]
        def superpose(self, chain1, chain2, binding_site1=None):
            '''Superpose two protein chains or binding sites

            An implementation of the Smith-Waterman algorithm is used unless
            an external sequence alignment tool is specified in the settings.

            If a binding site is supplied for the first chain, only the atoms
            in the binding site will be overlaid.

            :param chain1: a :class:`ccdc.protein.Chain` instance
            :param chain2: a :class:`ccdc.protein.Chain` instance
            :param binding_site1: a :class:`ccdc.protein.BindingSite` instance for the first chain
            :returns: the root-mean square deviation of the overlay and the transformation matrix
            '''
            if self.settings.sequence_alignment_tool is None:
                alignment_calculator = ProteinLib.SmithWatermanChainAlignmentCalculator()
            else:
                alignment_calculator = ProteinLib.SequenceBasedChainAlignmentCalculator(
                    self.settings.sequence_alignment_tool,
                    self.settings.sequence_search_tool)

            residues = alignment_calculator.matched_residues_for_chain_ids(
                chain1._protein_structure, chain1.identifier,
                chain2._protein_structure, chain2.identifier)

            if binding_site1 is not None:
                protein_atoms1 = list(a._atom for a in binding_site1.atoms)
                chain_transformation = ProteinLib.ChainTransformationCalculator(residues, protein_atoms1)
            else:
                chain_transformation = ProteinLib.ChainTransformationCalculator(residues)

            superposition_result = chain_transformation.calculate(
                self.settings.overlay_weighting_factor,
                self.settings.overlay_minimum_cycles,
                self.settings.overlay_convergence_tolerance,
                self._superposition_mode.prefix_lookup(self.settings.superposition_atoms))

            ChemistryLib.MoleculeTransformation(chain2._protein_structure.editable_molecule(),
                                                superposition_result.transformation(),
                                                chain2._protein_structure.cell()).apply()

            return superposition_result.rms(), Molecule.Transformation(
                _transformation=superposition_result.transformation())




###########################################################################