Source code for ccdc.cavity

# coding=utf8
#
# This code is Copyright (C) 2015 The Cambridge Crystallographic Data Centre
# (CCDC) of 12 Union Road, Cambridge CB2 1EZ, UK and a proprietary work of CCDC.
# This code may not be used, reproduced, translated, modified, disassembled or
# copied, except in accordance with a valid licence agreement with CCDC and may
# not be disclosed or redistributed in any form, either in whole or in part, to
# any third party. All copies of this code made in accordance with a valid
# licence agreement as referred to above must contain this copyright notice.
#
# No representations, warranties, or liabilities are expressed or implied in the
# supply of this code by CCDC, its servants or agents, except where such
# exclusion or limitation is prohibited, void or unenforceable under governing
# law.
#
#################################################################################################
'''
The :mod:`ccdc.cavity` module contains a class :class:`ccdc.cavity.Cavity`
representing a putative binding site on the protein surface that has been
automatically detected by the LIGSITE_ algorithm.

Reference article introducing the LIGSITE_ algorithm.

.. _LIGSITE: http://www.sciencedirect.com/science/article/pii/S1093326398000023

This approach uses a three-dimensional grid for the detection of surface
depressions with a grid spacing of 0.5 Ångströms. As a result, the cavity is
represented by a set of grid intersection points, also called surface points.

Once a cavity has been detected, the flanking fragments of amino acids are
assigned pseudocenters to represent their physicochemical properties.
Currently, seven types of pseudocenters are used:

  - DONOR: representing a hydrogen bond donor functionality
  - ACCEPTOR: representing a hydrogen bond acceptor functionality
  - DONOR_ACCEPTOR: representing both hydrogen bond donor and acceptor functionalities
  - AROMATIC: represents the center of an aromatic ring system
  - ALIPHATIC: represents an aliphatic moiety
  - PI: denotes the presence of a double bond
  - METAL: represents the position of a metal ion

Reference article for the introduction of the `cavity graph comparison method`_.

.. _`cavity graph comparison method`: http://www.sciencedirect.com/science/article/pii/S0022283602008112

Reference article for the introduction of the `fast cavity graph comparison method`_.

.. _`fast cavity graph comparison method`: http://ieeexplore.ieee.org/document/6853389/

Reference article for the introduction of the `cavity histograms comparison method`_.

.. _`cavity histograms comparison method`: https://pubs.acs.org/doi/abs/10.1021/ci5005898

The class :class:`ccdc.cavity.Cavity` has methods to automatically create
cavities for PDB files, or read a cavity from an XML representation, to extract
useful information such as bound ligands, and to compare two cavities using
different comparison methods. From this it is possible to write screens to
search for similar cavities across a range of proteins, a defined set of
cavities, or the entire cavity database.
'''
#################################################################################################

import os
import sqlite3
import zlib
import time
import itertools
import collections
import operator
import math
import csv
import functools
# Python 3.4: from enum import Enum

from ccdc.molecule import Coordinates
from ccdc.molecule import Molecule
from ccdc.descriptors import GeometricDescriptors
from ccdc.utilities import bidirectional_dict, Histogram, Timer
from ccdc.protein import Protein
from ccdc.io import _CSDDatabaseLocator

from ccdc.utilities import _temporary_copy
from ccdc.utilities import _private_importer

with _private_importer() as pi:
    pi.import_ccdc_module('CavityLib')
    pi.import_ccdc_module('MathsLib')

CavityLib.licence_check()

from packaging import version
too_old_sqlite3_warning = f'Your python interpreter includes sqlite3 version {sqlite3.sqlite_version} which ' + \
    'will cause extreme slowness of this module. Please use a python interpreter with an up to date sqlite3 module.'
if version.parse('3.18.0') > version.parse(sqlite3.sqlite_version) \
    and not 'CCDC_PYTHON_API_IGNORE_OLD_SQLITE3_WARNING' in os.environ:
        raise RuntimeError(too_old_sqlite3_warning)
            

#################################################################################################


[docs]
class Cavity(object):
    '''A cavity on the protein surface.'''

    feature_types = bidirectional_dict(
        donor=CavityLib.PSEUDOCENTER.DONOR,
        acceptor=CavityLib.PSEUDOCENTER.ACCEPTOR,
        donor_acceptor=CavityLib.PSEUDOCENTER.DONOR_ACCEPTOR,
        pi=CavityLib.PSEUDOCENTER.PI,
        aromatic=CavityLib.PSEUDOCENTER.AROMATIC,
        metal=CavityLib.PSEUDOCENTER.METAL,
        aliphatic=CavityLib.PSEUDOCENTER.ALIPHATIC
    )

    _relibase_amino_acid_codes = bidirectional_dict(
        ALA=0,
        ARG=1,
        ASN=2,
        ASP=3,
        CYS=4,
        GLN=5,
        GLU=6,
        GLY=7,
        HIS=8,
        ILE=9,
        LEU=10,
        LYS=11,
        MET=12,
        PHE=13,
        PRO=14,
        SER=15,
        THR=16,
        TRP=17,
        TYR=18,
        VAL=19,
        UNK=20,
        ASX=21,
        GLX=22,
        NAD=23,
        NAP=24,
        FMN=25,
        MTX=26,
        HEM=27,
        ATP=28,
        COA=29,
        C=30,
        A=31,
        G=32,
        U=33,
        T=34,
        ACE=35,
        PCA=36,
        UNL=39,
        HOH=100
    )

    # Python 3.4: class ComparisonMethod(Enum):
    class ComparisonMethod:
        FAST_CAVITY_GRAPH_COMPARISON = 1
        CAVITY_GRAPH_COMPARISON = 2
        CAVITY_HISTOGRAMS_COMPARISON = 3


[docs]
    class Feature(object):
        '''An interaction feature in a cavity.'''

        def __init__(self, _feature=None):
            self._feature = _feature

        def __repr__(self):
            '''String representation of a feature.'''
            return 'Feature(%s at %s)' % (self.type, self.coordinates)

        def __eq__(self, other):
            '''Equality of features.'''
            return self.type == other.type and self.coordinates == other.coordinates

        def __hash__(self):
            return hash(self.type) + hash(self.coordinates)

        @property
        def coordinates(self):
            '''The position of the feature.

            :returns: a named tuple of coordinates
            '''
            p = self._feature.coordinates()
            return Coordinates(p.x(), p.y(), p.z())

        @property
        def burial(self):
            '''The burial value assigned to this feature (0 to 7, where 7 means most buried).'''
            return self._feature.depth()


[docs]
        def distance(self, other):
            '''The distance between this feature and another.'''
            return GeometricDescriptors.point_distance(self.coordinates, other.coordinates)



[docs]
        def point_distance(self, point):
            '''The distance from this feature to an arbitrary point.'''
            return GeometricDescriptors.point_distance(self.coordinates, point)


        @property
        def protein_vector(self):
            '''Vector denoting the ideal interaction direction of the feature with another one outside the protein.'''
            vec = self._feature.std_vec()
            return GeometricDescriptors.Vector(vec.x(), vec.y(), vec.z())

        @property
        def surface_vector(self):
            '''Vector denoting the connection from the feature to the centre of its assigned surface points.'''
            vec = self._feature.surf_vec()
            return GeometricDescriptors.Vector(vec.x(), vec.y(), vec.z())

        @property
        def surface_points(self):
            '''The surface points associated with this feature.

            These approximate the surface shape close to the feature.

            :returns: a tuple of named tuples of coordinates
            '''
            return tuple(
                Coordinates(p.x(), p.y(), p.z()) for p in self._feature.srfdots()
            )

        @property
        def surface_depths(self):
            '''The depth values assigned to the surface points of this feature (0 to 7, where 7 means most buried).

            :returns: a tuple of depth values
            '''
            return tuple(
                s.depth() for s in self._feature.srfdots()
            )

        @property
        def type(self):
            '''The type of the interaction feature.'''
            return Cavity.feature_types.inverse_lookup(self._feature.type())

        @property
        def amino_acid_code(self):
            '''The associated amino acid code.

            This will be 'UNK' if no protein structure has been associated with the cavity containing the feature.
            '''
            return Cavity._relibase_amino_acid_codes.inverse_lookup(
                ord(self._feature.amino_acid_type())
            )

        @property
        def chain(self):
            '''The chain from which this feature was defined.

            This will be ``None`` if no protein information is associated with the cavity.
            '''
            if hasattr(self, '_protein'):
                obj_name = self._feature.mmobj_name()
                model = self._pdb.find_model(obj_name)
                unit = model.unit_i(self._feature.unitid())
                chain_id = unit.chain_id()
                chain = self._protein[chain_id]
                return chain

        @property
        def residue(self):
            '''The residue associated with this feature.

             This will be ``None`` if no protein information is associated with the cavity.
             '''
            if hasattr(self, '_protein'):
                unit_id = self._feature.unitid()
                return self.chain.residues[unit_id]

        @property
        def atom(self):
            '''The atom from which this feature is defined.

            This will be ``None`` if no protein information is associated with the cavity.
            Aromatic and aliphatic features have no associated atom, so this
            property has the value ``None`` in those cases.
            '''
            if self.type in ['aromatic', 'aliphatic']:
                return None
            if hasattr(self, '_protein'):
                model = self._pdb.find_model(self._feature.mmobj_name())
                unit = model.unit_i(self._feature.unitid())
                at = unit.atom(self._feature.atomid())
                at_name = at.atom_name().strip()
                possibles = [a for a in self.residue.atoms if a.label == at_name]
                if len(possibles) == 1:
                    return possibles[0]
                atom_id = self._feature.atomid()
                for a in self.residue.atoms:
                    if a.atomic_number == 1:
                        continue
                    if atom_id == 0:
                        return a
                    atom_id -= 1
            #  Matching atom not found
            return None



[docs]
    class CavityGraphComparison(object):
        '''The result of a cavity graph comparison.'''

        def __init__(self, _comparison):
            self._comparison = _comparison

        @property
        def score(self):
            '''The similarity score of the comparison.'''
            return self._comparison.score()

        @property
        def rmsd(self):
            '''The rms deviation for the match of all pseudocenters.'''
            return self._comparison.rms()

        @property
        def clique_rmsd(self):
            '''The rms deviation of the matched clique points.'''
            return self._comparison.rms_cliq()

        @property
        def n_cliques(self):
            '''The total number of cliques detected during the comparison.'''
            return self._comparison.number_of_cliques()

        @property
        def n_matches(self):
            '''The number of matching pseudocenters detected.'''
            return self._comparison.number_of_matches()

        @property
        def transformation_matrix(self):
            '''The transformation matrix to overlay the target onto the query cavity'''
            return Molecule.Transformation(_transformation=self._comparison.transformation_matrix())



[docs]
    class FastCavityGraphComparison(object):
        '''The result of a fast cavity graph comparison.'''

        def __init__(self, _comparison):
            self._comparison = _comparison

        @property
        def score(self):
            '''The similarity score of the comparison.'''
            return self._comparison.score()

        @property
        def largest_clique_size(self):
            '''The size of the largest clique detected.'''
            return self._comparison.largest_clique_size()

        @property
        def product_graph_size(self):
            '''The size of the product graph generated during the comparison.'''
            return self._comparison.product_graph_size()



[docs]
    class CavityDistanceHistograms(object):
        '''A cavity description based on histograms of distances between reference points and cavity pseudocenters.'''

        _reference_points = bidirectional_dict(
            CENTROID=CavityLib.CENTROID,
            CENTROID_CLOSEST=CavityLib.CENTROID_CLOSEST,
            CENTROID_FURTHEST=CavityLib.CENTROID_FURTHEST,
            CENTROID_FURTHEST_FURTHEST=CavityLib.CENTROID_FURTHEST_FURTHEST
        )

        def __init__(self, cavity, reference_points=None):
            '''Create a set of feature distance histograms for the cavity.

            :param cavity: a :class:`ccdc.cavity.Cavity` instance
            :param reference_points: an iterable of strings drawn from
            'centroid', 'centroid_closest', 'centroid_furthest',
            'centroid_furthest_furthest'. If empty or None, 'centroid' and
            'centroid_closest' will be used for the generation of distance
            histograms
            '''
            if reference_points is None or len(reference_points) == 0:
                self.reference_points = [CavityLib.CENTROID, CavityLib.CENTROID_CLOSEST]
            else:
                self.reference_points = [
                    Cavity.CavityDistanceHistograms._reference_points.prefix_lookup(k) for k in
                    reference_points]

            self._histogram_set = CavityLib.RapmadPocket(cavity._cavity, self.reference_points)

        @property
        def histograms(self):
            '''The tuple of histograms defined for this cavity.'''
            return tuple(
                Histogram._from_histogram(self._histogram_set.get_histogram(i, j))
                for i in range(len(Cavity.CavityDistanceHistograms._reference_points))
                for j in range(len(Cavity.feature_types))
                if self._histogram_set.has_histogram(i, j)
            )

        def _compare(self, other):
            '''Compare another cavity with this one using the cavity histograms comparison method.'''
            return CavityLib.StandaloneCavityComparison().rapmad(self._histogram_set,
                                                                 other._histogram_set)


    def __init__(self, _cavity, _pdb):
        '''Initialise a cavity.'''
        self._cavity = _cavity
        self._pdb = _pdb
        self._writable = False


[docs]
    def subcavity(self, features):
        '''Make a subcavity based on a set of features from this cavity.

        :param features: a set of features for construction of the subcavity

        :returns: a :class:`ccdc.cavity.Cavity` instance
        '''
        cav = CavityLib.CAVITY(self._cavity)
        cav.assign_pseudocenters([
            f._feature for f in features
        ])
        if self._pdb.object_name():
            cav.set_pdb(self._pdb)
        else:
            cav._pdb = None
        cavity = Cavity(_cavity=cav, _pdb=self._pdb)
        if hasattr(self, '_protein'):
            cavity._protein = self._protein
        return cavity


    @property
    def identifier(self):
        '''The identifier of this cavity.'''
        return self._cavity.object_name().replace('CAV::', '')

    @property
    def ligand_identifiers(self):
        '''Tuple of ligand identifiers found in the cavity.'''

        def strip(name):
            parts = name.split('_')
            name = parts[0][5:]
            if '-' in parts[1]:
                subparts = parts[1].split('-')
                name = subparts[1] + ':' + name
            return name

        return tuple(
            strip(name) for name in self._cavity.ligands()
        )

    @property
    def ligands(self):
        '''List of ligands of the cavity.

        If there is no protein associated with the cavity this will be ``None``.
        '''
        def ligand_identifiers_match(id1, id2):
            parts1 = id1.split(':')
            parts2 = id2.split(':')
            if len(parts1) == len(parts2):
                return id1.startswith(id2)
            else:
                return parts1[-1].startswith(parts2[-1])

        # Need something better - have a look in _pdb to see if I can get a better ligand id.
        ligands = None
        if hasattr(self, '_protein') and self._protein is not None:
            ligands = [ligand for ligand in self._protein.ligands
             if any(
                ligand_identifiers_match(ligand.identifier, lig_id)
                for lig_id in self.ligand_identifiers)
             ]
        return ligands


[docs]
    def features_by_type(self, type):
        '''The features of a given type.'''
        return set([f for f in self.features if f.type == type.lower()])



[docs]
    def features_by_distance(self, centre, radius):
        '''The set of features of the cavity within radius of the centre.'''
        return set([f for f in self.features if
                    GeometricDescriptors.point_distance(f.coordinates, centre) < radius])



[docs]
    def features_by_atom_distance(self, atoms, radius):
        '''The set of all features within a radius of any of the atoms.'''
        s = functools.reduce(operator.__or__,
                   [self.features_by_distance(a.coordinates, radius) for a in atoms], set())
        return s



[docs]
    def features_by_residues(self, residues):
        '''The set of features associated with any of the given residues.'''
        s = set(f for f in self.features if f.residue in residues)
        return s


    @property
    def bounding_box(self):
        '''The origin and far corner of the cavity.'''
        g = self._cavity.grid()
        return (
            Coordinates(g.x_off(), g.y_off(), g.z_off()),
            Coordinates(g.x_off() + g.step() * g.dx(),
                        g.y_off() + g.step() * g.dy(),
                        g.z_off() + g.step() * g.dz())
        )

    @property
    def volume(self):
        '''Volume of the cavity in cubic Angstroms.'''
        return self._cavity.volume()

    @property
    def features(self):
        '''The features of the cavity.

        :returns: a tuple of :class:`ccdc.cavity.Cavity.Feature` instances
        '''
        features = tuple(Cavity.Feature(_feature=p) for p in self._cavity.pseudocenters())
        if hasattr(self, '_protein'):
            for f in features:
                f._protein = self._protein
                f._pdb = self._cavity.pdb()
        return features


[docs]
    def write(self, file_name):
        '''Write the cavity to a rlbcoor file for visualisation in hermes.'''
        if self._pdb is None or self._pdb.n_atoms() == 0:
            raise NotImplementedError('A cavity without a protein cannot be written to rlbcoor format files')
        if not self._writable:
            raise RuntimeError('Cavities must be defined from a PDB file to be writable as rlbcoor')
        xvec = MathsLib.vector_3d(0.5, 0.0, 0.0)
        yvec = MathsLib.vector_3d(0.0, 0.5, 0.0)
        zvec = MathsLib.vector_3d(0.0, 0.0, 0.5)
        self._cavity.set_xgridvector(xvec)
        self._cavity.set_ygridvector(yvec)
        self._cavity.set_zgridvector(zvec)
        writer = CavityLib.CavityOutputWriter()
        writer.out_write_cav_in_cif(file_name, self._cavity, self._pdb)



[docs]
    def to_pymol_file(self, file_name=None, show_surface_points=False):
        '''Create a visualisation file of this cavity that can be run in PyMOL.

        The cavity will be represented by its physicochemical features.

        :param file_name: Python file containing the information for displaying
            the cavity. This should have a .py extension. If not defined, the file
            will be named using the cavity identifier
        :param show_surface_points: additionally display the points
            representing the cavity's surface shape
        '''

        def feature_color(type):
            colors = {'donor': 'blue', 'acceptor': 'red', 'donor_acceptor': 'green',
                      'aromatic': 'orange', 'aliphatic': 'yellow', 'pi': 'grey', 'metal': 'cyan'}
            return colors.get(type, 'black')

        def add_point_set(coordinates, name, color, scale):
            point_list = []
            for c in coordinates:
                point_list.append('7.0, %s, %f' % (', '.join(map(str, c)), scale))
            lines = [
                '%s = [%s]' % (name, ', '.join(point_list)),
                'cmd.load_cgo(%s, "%s", 1)' % (name, name),
                'cmd.color("%s", "%s")' % (color, name)
            ]
            return '\n'.join(lines)

        def add_point(point, name, color, scale):
            lines = [
                'cmd.do("pseudoatom %s, pos=[%s], color=%s")' % (
                name, ', '.join(map(str, point.coordinates)), color),
                'cmd.do("show sphere, %s")' % (name),
                'cmd.do("set sphere_scale, %s, %s")' % (scale, name)
            ]
            return '\n'.join(lines)

        py_file_name = self.identifier + '.py' if file_name is None else file_name

        with open(py_file_name, 'w') as f:
            f.write('from pymol.cgo import *\n')
            f.write('from pymol import cmd\n')
            i = 1
            for feature in self.features:
                feature_name = 'Feat_%i_%s' % (i, feature.type)
                f.write(add_point(feature, feature_name, feature_color(feature.type), 0.3) + '\n')
                if show_surface_points == True and len(feature.surface_points) > 0:
                    surface_name = 'Surf_%i' % (i)
                    f.write(
                        add_point_set(feature.surface_points, surface_name, 'wheat', 0.05) + '\n')
                i += 1


    def _check_cavity(self):
        if len(self.features) <= 1:
            raise RuntimeError('Unable to make a comparison '
                               '(cavity {0} has too few pseudocentres)'.format(self.identifier))

    def _compare(self, other):
        '''Compare another cavity with this one using the cavity graph comparison method.

        :param other: :class:`ccdc.cavity.Cavity` instance

        :raises: RuntimeError if the result of the cavity comparison is None

        :returns: a :class:`ccdc.cavity.Cavity.CavityGraphComparison` instance
        '''
        settings = CavityLib.CavBaseSearchSettings()

        self._check_cavity()
        other._check_cavity()

        comper = CavityLib.CavityComparison(self._cavity, other._cavity, settings)
        res = comper.compare(False, False)
        if res is None:
            raise RuntimeError('Unable to make a comparison '
                               '(possibly one cavity is too large?)')
        return Cavity.CavityGraphComparison(_comparison=res)

    def _fast_compare(self, other, max_product_graph_size):
        '''Compare another cavity with this one using the fast cavity graph comparison method.

        :param other: a :class:`ccdc.cavity.Cavity` instance
        :param max_product_graph_size: the maximum allowed size of the product graph

        :returns: a :class:`ccdc.cavity.Cavity.FastCavityGraphComparison` instance
        '''
        self._check_cavity()
        other._check_cavity()

        result = CavityLib.StandaloneCavityComparison().local_cliques(self._cavity,
                                                                      other._cavity,
                                                                      max_product_graph_size)
        return Cavity.FastCavityGraphComparison(_comparison=result)


[docs]
    def cavity_distance_histograms(self, reference_points=None):
        '''Create a set of feature distance histograms for this cavity based on
        the given reference point specification.

        :param reference_points: a set of reference point measures

        :returns: a :class:`ccdc.cavity.Cavity.CavityDistanceHistograms` instance
        '''
        return Cavity.CavityDistanceHistograms(self, reference_points)



[docs]
    def compare(self, other, comparison_method=ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON,
                histogram_reference_points=None, max_product_graph_size=36000):
        '''Compare this cavity to another cavity.

        :param other: a :class:`ccdc.cavity.Cavity` instance
        :param comparison_method: a member of :class:`ccdc.cavity.Cavity.ComparisonMethod`,
            either Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON,
            Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON or
            Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON
        :param histogram_reference_points: an iterable of strings drawn from
            'centroid', 'centroid_closest', 'centroid_furthest',
            'centroid_furthest_furthest'. If empty or None, 'centroid' and
            'centroid_closest' will be used for the generation of distance
            histograms with Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON
        :param max_product_graph_size: the maximum allowed size of the
            product graph for fast cavity graph comparisons
        :returns: a :class:`ccdc.cavity.Cavity.FastCavityGraphComparison` instance,
            a :class:`ccdc.cavity.Cavity.CavityGraphComparison` instance
            or a similarity score for cavity histogram comparisons
        '''
        if comparison_method == Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON:
            return self.cavity_distance_histograms(histogram_reference_points)._compare(
                other.cavity_distance_histograms(histogram_reference_points))
        elif comparison_method == Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON:
            return self._compare(other)
        elif comparison_method == Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON:
            return self._fast_compare(other, max_product_graph_size)
        else:
            raise RuntimeError('Not an allowed comparison method')



[docs]
    def to_xml(self):
        '''An XML representation of the cavity.'''
        ostr = CavityLib.ostringstream()
        self._cavity.print_xml(ostr)
        return ostr.str()



[docs]
    def to_xml_file(self, file_name):
        '''Writes the XML representing a cavity to a file.

        :param file_name: the file to which to write the XML
        '''
        with open(file_name, 'w') as writer:
            writer.write(self.to_xml())


    @staticmethod
    def _make_pdb(pdb_file=None):
        '''Private.'''
        if pdb_file is None:
            return CavityLib.PDB()
        if not os.path.exists(pdb_file):
            raise RuntimeError('PDB file %s does not exist' % pdb_file)
        bali_settings = CavityLib.BaliSettings.instantiate('.')
        bali_settings.set_auto_accept_ligand_templates(True)
        bali_settings.set_max_journal_title_length(1024)
        bali_settings.set_truncate_text_fields(True)
        pdb_settings = bali_settings.parser_settings()
        pdb_settings.set_title_required(False)
        pdb_settings.set_method_required(False)
        pdb_settings.set_spacegroup_required(False)
        bali_proc = CavityLib.BaliProcessing.instantiate(pdb_file, bali_settings)
        if bali_proc.has_errors():
            raise RuntimeError(
                'Errors in processing PDB file:\n%s' % ('\n'.join(bali_proc.errors())))
        return bali_proc.pdb()


[docs]
    @staticmethod
    def from_xml(xml, pdb_file=None):
        '''Reads a cavity from an XML string.

        :param xml: an XML representation of the cavity
        :param pdb_file: an optional PDB file for the associated protein, from
            which additional data for the cavity may be computed
        '''
        if not isinstance(xml, str):
            xml = xml.decode('utf-8')
        stream = CavityLib.stringstream(xml)
        _pdb = Cavity._make_pdb(pdb_file)
        reader = CavityLib.cav_xml()
        reader.init_xml()
        _cavity = reader.inp_read_cavxml(stream, _pdb)
        if pdb_file is not None:
            pseudo_searcher = CavityLib.PseudocenterSearch(_cavity, _pdb,
                                                           CavityLib.PseudocenterSearchSettings())
            _cavity.set_pseudocenters(pseudo_searcher.pseudocenters())
        # surface_distance = 1.1
        #    _cavity.calculate_surrounding_objects(surface_distance, CavityLib.CavityGenerationSettings.METAL_AS_PROTEIN)
        c = Cavity(_cavity, _pdb)
        if pdb_file is not None:
            c._protein = Protein.from_file(pdb_file)
            _cavity.set_pdb(_pdb)
        c._writable = False
        return c



[docs]
    @staticmethod
    def from_xml_file(xml_file, pdb_file=None):
        '''Reads a cavity from an XML file and associated PDB file.

        :param xml_file: XML file representing the cavity
        :param pdb_file: an optional PDB file for the associated protein, from
            which additional data for the cavity may be computed

        :raises: RuntimeError if the XML file does not exist

        :returns: a :class:`ccdc.cavity.Cavity` instance
        '''
        if not os.path.exists(xml_file):
            raise RuntimeError('XML file %s does not exist' % xml_file)
        with open(xml_file) as xml:
            return Cavity.from_xml(xml.read(), pdb_file)



[docs]
    @staticmethod
    def from_pdb_file(pdb_file,  maximum_incomplete_residues_per_chain = 0):
        '''Create cavities from a PDB file.

        :param pdb_file: PDB file for the generation of cavities
        :param maximum_incomplete_residues: the maximum number of incomplete residues to allow (0 by default)

        :raises: RuntimeError if the PDB file contains more than 1000 SEQRES lines

        :returns: a tuple of :class:`ccdc.cavity.Cavity` instances
        '''
        with open(pdb_file) as pdb:
            seqres_lines = [l for l in pdb if l.startswith('SEQRES')]
        if len(seqres_lines) > 1000:
            raise RuntimeError('%s contains too many residues for Cavity construction' % pdb_file)
        _pdb = Cavity._make_pdb(pdb_file)
        _settings = CavityLib.CavityGenerationSettings()
        _settings.set_metal_atom_mode(CavityLib.CavityGenerationSettings.METAL_AS_PROTEIN)
        _settings.set_resolution_limit(1000.)  # Effectively no resolution limit
        _settings.set_maximum_incomplete_residues_per_chain(maximum_incomplete_residues_per_chain)
        _settings.set_nmr_structures(CavityLib.CavityGenerationSettings.INCLUDE_NMR_STRUCTURES)

        cavities = CavityLib.generate_cavities(_pdb, _settings)
        cavities = sorted(cavities, key=lambda k: k.volume(), reverse=True)

        protein = Protein.from_file(pdb_file)
        for c in cavities:
            c.set_pdb(_pdb)
        cavs = tuple(Cavity(_cavity=c, _pdb=_pdb) for c in cavities)
        for c in cavs:
            c._protein = protein
            c._writable = True
        return cavs




#################################################################################################
#   Cavity database
#################################################################################################


[docs]
class CavityDatabase(object):
    '''An SQLite database for cavities. A path to a database must be passed in when
    creating an instance of this class.

    Please note that the schema for the database, and, indeed, the final choice
    of underlying database has not been finalised. Accordingly this should be
    treated as a prototypical implementation. The API for creation and for
    searching should remain valid, however.
    '''

    def __init__(self, file_name=None):
        '''Connect to the database.'''
        self._db = None
        if file_name is None:
            raise RuntimeError('A path to a cavity database must be provided.')
        self.file_name = file_name
        self._db = sqlite3.connect(file_name)
        self._cursor = self._db.cursor()

    def __del__(self):
        '''Ensure the database is closed.'''
        if self._db:
            self._db.close()


[docs]
    @staticmethod
    def drugbank_database_dir():
        '''Return the directory containing the DrugBank database.'''
        return _CSDDatabaseLocator.get_cavity_dir_location()



[docs]
    def cavities(self):
        '''Iterator over the cavities.'''
        self._cursor.execute('''
            SELECT xml FROM cav_xml
            ORDER BY name ASC
            '''
                             )
        for row in self._result_iterator():
            cav = self._make_cavity(row[0])
            yield cav


    __iter__ = cavities


[docs]
    def cavity_distance_histogram_sets(self):
        '''Iterator over the cavity histogram sets.'''
        for cav in self.cavities():
            yield cav.cavity_distance_histograms()



[docs]
    def cavity(self, identifier):
        '''The cavity of the given identifier.'''
        return self.get_cavity_by_name(identifier)



[docs]
    def cavity_distance_histograms(self, identifier):
        '''The distance histograms corresponding to the given cavity identifier.'''
        return self.cavity(identifier).cavity_distance_histograms()


    @staticmethod
    def _make_cavity(xml):
        '''Private.'''
        return Cavity.from_xml(zlib.decompress(xml))

    def _result_iterator(self, size=1000):
        '''Generate the rows of a query.'''
        while True:
            results = self._cursor.fetchmany(size)
            if not results:
                break
            for r in results:
                yield r

            ################################################################################
            # Creation
            ################################################################################


[docs]
    def drop_all_tables(self):
        '''Drops all tables in the database.'''
        self._cursor.execute('DROP TABLE IF EXISTS cav_xml')
        self._cursor.execute('DROP TABLE IF EXISTS cav_hists')
        self._cursor.execute('DROP TABLE IF EXISTS cav_info')
        self._cursor.execute('DROP TABLE IF EXISTS cav_ligands')



[docs]
    def populate_all(self, directory, id_file=None, verbose=False, maximum_allowed_incomplete_residues = 0):
        '''Create all tables from the directory of input files.

        :param directory: directory containing PDB or XML cavity files
        :param verbose: enable verbose output, default False
        '''
        self._populate_cav_xml(directory, id_file, verbose, maximum_allowed_incomplete_residues)
        self._populate_cav_hists()
        self._populate_cav_info()
        self._populate_cav_ligands()


    def _populate_cav_xml(self, directory, id_file, verbose, maximum_allowed_incomplete_residues):
        '''Load and compress the cavity XML files in a given directory.'''
        self._cursor.execute('DROP TABLE IF EXISTS cav_xml')
        self._cursor.execute(
            '''CREATE TABLE IF NOT EXISTS cav_xml (
                cavity_id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                xml BLOB
            )'''
        )

        file_ids = None
        if id_file is not None:
            with open(os.path.join(directory, id_file), 'r') as file:
                file_ids = file.readlines()
                if ',' in file_ids[0]:
                    file_ids = [file_id.strip().lower() for file_id in file_ids[0].split(',')]
                else:
                    file_ids = [file_id.strip().lower() for file_id in file_ids]

        self._populate_cav_xml_impl(directory, file_ids, verbose, maximum_allowed_incomplete_residues)

    def _populate_cav_xml_impl(self, directory, file_ids, verbose, maximum_allowed_incomplete_residues):
        '''Load and compress the cavity XML files in a single subdirectory.'''
        file_names = []
        for file_name in sorted(os.listdir(directory)):
            if os.path.isdir(os.path.join(directory, file_name)):
                self._populate_cav_xml_impl(os.path.join(directory, file_name), file_ids, verbose, maximum_allowed_incomplete_residues)
            else:
                file_names.append(file_name)

        def loader():
            '''Generate the cavity files.'''
            prefix = 'cavity_'
            start = time.time()

            for i, fname in enumerate(file_names):
                full_id = os.path.basename(fname).split('.')[0]
                if file_ids is not None:
                    pdb_id = full_id[-4:]
                    if full_id not in file_ids and pdb_id not in file_ids \
                            and full_id.lower() not in file_ids and pdb_id.lower() not in file_ids:
                        # print("Skipping %s %s\n" % (full_id, pdb_id))
                        continue

                t = time.time()
                if verbose:
                    print(fname)
                path = os.path.join(directory, fname)
                if fname.endswith(".gz"):
                    fname = fname[:-3]  # temporary_copy will deal with unzipping the full path

                with _temporary_copy(path) as local_path:
                    if fname.endswith('pdb') or fname.endswith('ent'):
                        try:
                            cavities = Cavity.from_pdb_file(local_path, maximum_allowed_incomplete_residues)
                        except RuntimeError as exc:
                            print('Skipping {} because {}'.format(full_id, exc))
                            continue

                        for c in cavities:
                            xml = c.to_xml().encode()
                            zipped = zlib.compress(xml)
                            yield c.identifier, sqlite3.Binary(zipped)
                    elif fname.endswith('xml'):
                        if fname.startswith(prefix):
                            name = fname[len(prefix):-4]
                        else:
                            name = fname[:-4]
                        with open(local_path, 'rb') as xml:
                            zipped = zlib.compress(xml.read())
                        yield name, sqlite3.Binary(zipped)
                if verbose:
                    print('%4d in %.2f seconds (%.2f seconds for %s)' % (
                    i, time.time() - start, time.time() - t, fname))

        t = time.time()
        self._cursor.execute('SELECT COUNT(cavity_id) FROM cav_xml')
        ct_start = self._cursor.fetchone()[0]

        self._cursor.executemany('''
            INSERT INTO cav_xml (name, xml) VALUES(?,?)
            ''', loader()
                                 )
        self._commit()
        self._cursor.execute('SELECT COUNT(cavity_id) FROM cav_xml')
        ct_end = self._cursor.fetchone()[0]
        if ct_end > ct_start:
            print('Loaded %d cavities in %.2f seconds' % (ct_end-ct_start, time.time() - t))

    def _commit(self):
        '''Commit the database if necessary.'''
        try:
            self._db.commit()
        except sqlite3.OperationalError:
            pass

    def _populate_cav_hists(self):
        '''Populate the histogram table for all cavity distance histograms.'''
        self._cursor.execute('DROP TABLE IF EXISTS cav_hists')
        self._cursor.execute('''
            CREATE TABLE IF NOT EXISTS cav_hists(
                hist_id INTEGER PRIMARY KEY AUTOINCREMENT,
                cavity_id REFERENCES cav_xml (cavity_id),
                hist_index INTEGER,
                ref_point INTEGER,
                histogram TEXT
            )'''
                             )

        def loader():
            self._cursor.execute('''
                SELECT cavity_id, xml FROM cav_xml
                ORDER BY cavity_id ASC
                '''
                                 )
            for cav_id, xml in self._result_iterator():
                cav = self._make_cavity(xml)
                rap = cav.cavity_distance_histograms(
                    ['centroid', 'centroid_closest', 'centroid_furthest',
                     'centroid_furthest_furthest'])
                hs = rap.histograms
                for i, h in enumerate(hs):
                    yield cav_id, i % 7, int(i / 7), str(CavityLib.write_histogram(h._histogram))

        t = time.time()
        inner_cursor = self._db.cursor()
        inner_cursor.executemany('''
                INSERT INTO cav_hists (cavity_id, hist_index, ref_point, histogram)
                VALUES (?,?,?,?)
            ''', loader()
                                 )
        self._cursor.execute('CREATE INDEX IF NOT EXISTS cav_hists_index ON cav_hists(cavity_id)')
        self._commit()
        self._cursor.execute('SELECT COUNT(*) FROM cav_hists')
        ct = int(list(self._cursor.fetchone())[0])
        print('Loaded %d histograms in %.2f seconds' % (ct, time.time() - t))

    def _populate_cav_info(self):
        '''Populate the info table.'''
        t = time.time()
        self._cursor.execute('DROP TABLE IF EXISTS cav_info')
        self._commit()
        self._cursor.execute('''
            CREATE TABLE cav_info (
                cavity_id REFERENCES cav_xml (cavity_id),
                volume REAL,
                ndonors INTEGER,
                nacceptors INTEGER,
                ndonor_acceptors INTEGER,
                npi INTEGER,
                naromatic INTEGER,
                nmetal INTEGER,
                naliphatic INTEGER,
                nligands INTEGER
            )
            '''
                             )
        self._commit()

        def loader():
            self._cursor.execute('''
                SELECT cavity_id, xml FROM cav_xml
                ORDER BY cavity_id ASC
                '''
                                 )
            for cav_id, xml in self._result_iterator():
                cav = self._make_cavity(xml)
                features = cav.features
                d = collections.defaultdict(int)
                for f in features:
                    d[f.type] += 1
                nligands = len(cav.ligand_identifiers)
                yield cav_id, cav.volume, d['donor'], d['acceptor'], \
                      d['donor_acceptor'], d['pi'], d['aromatic'], d['metal'], \
                      d['aliphatic'], nligands

        inner_cursor = self._db.cursor()
        inner_cursor.executemany('''
            INSERT INTO cav_info (
                cavity_id,
                volume,
                ndonors,
                nacceptors,
                ndonor_acceptors,
                npi,
                naromatic,
                nmetal,
                naliphatic,
                nligands
            ) VALUES (?,?,?,?,?,?,?,?,?,?)
            ''', loader()
                                 )
        self._cursor.execute('CREATE INDEX IF NOT EXISTS cav_info_index ON cav_info(cavity_id)')
        self._commit()
        print('Loaded info in %.2f seconds' % (time.time() - t))

    def _populate_cav_ligands(self):
        '''Populate the ligand table.'''
        t = time.time()
        self._cursor.execute('DROP TABLE IF EXISTS cav_ligands')
        self._cursor.execute('''
            CREATE TABLE IF NOT EXISTS cav_ligands (
                cavity_id REFERENCES cav_xml(cavity_id),
                ligand_id TEXT
            )
            '''
                             )

        def loader():
            self._cursor.execute('''
                SELECT cavity_id, xml FROM cav_xml
                ORDER BY cavity_id ASC
                '''
                                 )
            for cav_id, xml in self._result_iterator():
                cav = self._make_cavity(xml)
                ks = cav._cavity.ligands().keys()
                for k in ks:
                    lig_id = k[k.index(':') + 2: k.index('_')]
                    yield cav_id, lig_id

        inner_cursor = self._db.cursor()
        inner_cursor.executemany('''
            INSERT INTO cav_ligands (
                cavity_id, ligand_id
            ) VALUES (?,?)
            ''', loader()
                                 )
        self._cursor.execute(
            'CREATE INDEX IF NOT EXISTS cav_ligands_index ON cav_ligands(cavity_id)')
        self._commit()
        self._cursor.execute('''
            SELECT COUNT(ligand_id) FROM cav_ligands
            '''
                             )
        ct = int(self._cursor.fetchone()[0])
        print('Loaded %d ligands in %.2f seconds' % (ct, time.time() - t))

    ################################################################################
    # Query
    ################################################################################


[docs]
    def get_cavity_by_name(self, name):
        '''Get the cavity with the exact identifier.'''
        self._cursor.execute('''
            SELECT xml from cav_xml WHERE name = '%s'
            ''' % name
                             )
        l = list(self._cursor.fetchall())
        if l is not None and len(l):
            return self._make_cavity(l[0][0])



[docs]
    def get_cavities_by_pdb_code(self, pdb_code):
        '''Get all the cavities for a protein.'''
        self._cursor.execute('''
            SELECT xml FROM cav_xml WHERE name LIKE '%%{}.%%'
            '''.format(pdb_code)
                             )
        return [self._make_cavity(row[0]) for row in self._result_iterator()]



[docs]
    def get_ligands_by_pdb_code(self, pdb_code):
        '''All ligand identifiers in the protein structure.'''
        self._cursor.execute('''
            SELECT ligand_id
            FROM cav_xml JOIN cav_ligands ON cav_xml.cavity_id = cav_ligands.cavity_id
            WHERE cav_xml.name LIKE '%%{}.%%'
            '''.format(pdb_code)
                             )
        return [row[0] for row in self._cursor.fetchall()]



[docs]
    def get_ligands_by_cavity_identifier(self, cav_id):
        '''All identifiers of ligands in the cavity.'''
        self._cursor.execute('''
            SELECT cav_ligands.ligand_id
            FROM cav_xml JOIN cav_ligands ON cav_xml.cavity_id = cav_ligands.cavity_id
            WHERE cav_xml.name = '%s'
            ''' % cav_id
                             )
        return [r[0] for r in self._cursor.fetchall()]



[docs]
    def get_cavities_by_ligand(self, ligand_id):
        '''Get all the cavities containing a particular PDB ligand identifier.'''
        self._cursor.execute('''
            SELECT xml
            FROM cav_xml
            WHERE cavity_id IN (
                SELECT DISTINCT cavity_id
                FROM cav_ligands
                WHERE ligand_id = '%s'
            )
            ''' % ligand_id
                             )
        return [self._make_cavity(row[0]) for row in self._cursor.fetchall()]



[docs]
    def get_cavity_identifiers_by_ligand(self, ligand_id):
        '''Get the identifiers of all cavities containing a particular PDB ligand identifier.'''
        self._cursor.execute('''
            SELECT DISTINCT cav_xml.name
            FROM cav_ligands JOIN cav_xml ON cav_ligands.cavity_id = cav_xml.cavity_id
            WHERE cav_ligands.ligand_id = '%s'
            ORDER BY cav_xml.name ASC
            ''' % ligand_id
                             )
        return [r[0] for r in self._cursor.fetchall()]



[docs]
    def get_info_for_cavity(self, cav_name):
        '''Get the information for a cavity.

        :param cav_name: the name of the cavity from the cav_xml table of the database

        :returns: a dictionary of values from the info table
        '''
        self._cursor.execute('''
            SELECT volume, ndonors, nacceptors, ndonor_acceptors, npi, naromatic, nmetal, naliphatic, nligands
            FROM cav_info JOIN cav_xml ON cav_info.cavity_id = cav_xml.cavity_id
            WHERE name = '%s'
            ''' % cav_name
                             )
        vals = list(self._cursor.fetchone())
        d = {
            self._cursor.description[i][0]: vals[i]
            for i in range(len(vals))
        }
        return d



[docs]
    def get_number_of_cavities(self):
        '''Get the number of cavities stored in the database.'''
        self._cursor.execute('''
            SELECT COUNT(*)
            FROM cav_xml
            '''
                             )
        l = list(self._cursor.fetchall())
        return l[0][0]



[docs]
    def get_number_of_ligands(self):
        '''Get the number of cavity ligands stored in the database.'''
        self._cursor.execute('''
            SELECT COUNT(*)
            FROM cav_ligands
            '''
                             )
        l = list(self._cursor.fetchall())
        return l[0][0]


    def _general_get(self, fields, tables, where, order, limit, offset):
        joins = ['JOIN %s ON %s.cavity_id = %s.cavity_id'
                 % (s, tables[0], s) for s in tables[1:]]
        where_clause = 'WHERE ' + where if where else ''
        ord = 'ORDER BY %s' % ', '.join(order) if order else ''
        if limit:
            lim = 'LIMIT %d' % limit
        else:
            lim = ''
        if offset:
            off = 'OFFSET %d' % offset
            if not lim:
                lim = 'LIMIT -1'
        else:
            off = ''

        sql = [
            'SELECT %s' % (', '.join(fields)),
            'FROM %s %s' % (tables[0], ' '.join(joins)),
            where_clause,
            ord,
            lim,
            off
        ]
        sql = '\n'.join(sql)
        # print(sql)
        self._cursor.execute(sql)
        return self._result_iterator()

    ################################################################################
    # Search
    ################################################################################


[docs]
    class Settings(object):
        '''Settings appropriate to cavity searches.'''

        max_hit_structures = 0  #: maximum number of structures returned
        max_product_graph_size = 36000  #: maximum size of product graph allowed when using the fast cavity graph comparison method
        start = 0  #: offset starting position in database
        aromatic_range = None  #: minimum and maximum number of aromatic features
        aliphatic_range = None  #: minimum and maximum number of aliphatic features
        donor_range = None  #: minimum and maximum number of donor features
        acceptor_range = None  #: minimum and maximum number of acceptor features
        donor_acceptor_range = None  #: minimum and maximum number of donor-acceptor features
        metal_range = None  #: minimum and maximum number of metal features
        pi_range = None  #: minimum and maximum number of pi features
        volume_range = None  #: minimum and maximum cavity volume
        ligand_range = None  #: minimum and maximum number of ligands
        with_ligands = None  #: ligand identifiers
        verbose = False      #: verbose output, default False
        logfile = False      #: logfile of comparison scores, default False
        histogram_reference_points = None
        '''an iterable of strings drawn from
        'centroid', 'centroid_closest', 'centroid_furthest',
        'centroid_furthest_furthest'. If empty or None, 'centroid' and
        'centroid_closest' will be used for the generation of distance
        histograms with Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON
        '''


    @staticmethod
    def _make_settings_sql(settings):
        '''Private.'''

        def one_clause(r, field, fmt):
            if hasattr(r, '__iter__'):
                s = '%s BETWEEN %s AND %s' % (field, fmt, fmt) % (min(r), max(r))
            else:
                s = '%s = %s' % (field, fmt) % r
            return s

        clauses = []
        if settings.aromatic_range is not None:
            clauses.append(one_clause(settings.aromatic_range, 'naromatic', '%d'))
        if settings.aliphatic_range is not None:
            clauses.append(one_clause(settings.aliphatic_range, 'naliphatic', '%d'))
        if settings.donor_range is not None:
            clauses.append(one_clause(settings.donor_range, 'ndonors', '%d'))
        if settings.acceptor_range is not None:
            clauses.append(one_clause(settings.acceptor_range, 'nacceptors', '%d'))
        if settings.donor_acceptor_range is not None:
            clauses.append(one_clause(settings.donor_acceptor_range, 'ndonor_acceptors', '%d'))
        if settings.metal_range is not None:
            clauses.append(one_clause(settings.metal_range, 'nmetal', '%d'))
        if settings.pi_range is not None:
            clauses.append(one_clause(settings.pi_range, 'npi', '%d'))
        if settings.volume_range is not None:
            clauses.append(one_clause(settings.volume_range, 'volume', '%.3f'))
        if settings.ligand_range is not None:
            clauses.append(one_clause(settings.ligand_range, 'nligands', '%d'))
        if settings.with_ligands is not None:
            sql = '''
            SELECT DISTINCT name FROM cav_xml JOIN cav_ligands ON cav_xml.cavity_id = cav_ligands.cavity_id
            WHERE '''
            ligand_clause = []
            for name in settings.with_ligands:
                ligand_clause.append('ligand_id = "%s"' % name)
            sql += ' OR '.join(ligand_clause)
            clauses.append('name in ( ' + sql + ')')

        return '\n AND '.join(clauses)

    def _constraint_only_search(self, extra_tables, where_clause, limit, offset):
        # Find cavities matching the settings
        rows = self._general_get(
            ['xml'], ['cav_xml'] + extra_tables, where_clause,
            ['cav_xml.cavity_id ASC'], limit, offset
        )

        def one_by_one():
            for r in rows:
                yield self._make_cavity(r[0])

        return one_by_one()

    def _cavity_graph_comparison_search(self, cavity, comparison_method,
                                        extra_tables, where_clause,
                                        limit, offset, max_nodes, verbose, logfile):
        if verbose:
            print('Counting the number of targets for comparison')
            if not limit or limit <= 0:
                count = self._general_get(
                    ['COUNT(DISTINCT cav_xml.cavity_id)'], ['cav_xml'] + extra_tables, where_clause,
                    ['cav_xml.cavity_id ASC'], '', ''
                )
                n_rows = list(itertools.islice(count, 1))[0][0]
                if offset:
                    n_rows = n_rows - offset
            else:
                n_rows = limit
            print(str(n_rows) + ' targets for comparison')

        rows = self._general_get(
            ['xml'], ['cav_xml'] + extra_tables, where_clause,
            ['cav_xml.cavity_id ASC'], limit, offset
        )
        results = []
        n_complete = 0
        start_time = time.time()
        for r in rows:
            cav = self._make_cavity(r[0])
            try:
                if comparison_method == Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON:
                    # Cavity graph comparison
                    result = cavity.compare(cav, comparison_method=comparison_method)
                    result_tuple = (result.score, cav.identifier)
                    results.append(result_tuple)
                    if logfile:
                        with open('logfile.csv', 'ab') as logfile:
                            logwriter = csv.writer(logfile, delimiter=',')
                            logwriter.writerow(result_tuple)
                elif comparison_method == Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON:
                    # Fast cavity graph comparison
                    result = cavity.compare(cav, max_product_graph_size=max_nodes)
                    result_tuple = (result.score, cav.identifier)
                    results.append(result_tuple)
                    if logfile:
                        with open('logfile.csv', 'ab') as logfile:
                            logwriter = csv.writer(logfile, delimiter=',')
                            logwriter.writerow(result_tuple)
                else:
                    raise RuntimeError('Not an allowed comparison method')
            except RuntimeError as e:
                print(
                    'Unable to compare %s with %s: %s'
                    % (cavity.identifier, cav.identifier, e))
            n_complete += 1
            if verbose:
                Timer.progress(start_time, n_complete, n_rows, '', in_place=True)

        results.sort(reverse=True)
        return results

    def _cavity_histograms_comparison_search(self, cavity_distance_histograms, extra_tables,
                                             where_clause, limit, offset, verbose, logfile):
        rap_hists = cavity_distance_histograms.histograms
        if limit:
            limit *= len(rap_hists)
        if offset:
            offset *= len(rap_hists)
        if where_clause:
            where_clause = ' AND ' + where_clause

        if verbose:
            print('Counting the number of targets for comparison')
            if not limit or limit <= 0:
                count = self._general_get(
                    ['COUNT(DISTINCT cav_hists.cavity_id)'],
                    ['cav_xml', 'cav_hists'] + extra_tables,
                    'cav_hists.ref_point IN ( %s )' % ', '.join(
                        str(r) for r in cavity_distance_histograms.reference_points) + where_clause,
                    ['cav_hists.cavity_id ASC', 'cav_hists.ref_point ASC', 'cav_hists.hist_index ASC'], '', '')
                n_rows = list(itertools.islice(count, 1))[0][0]
                if offset:
                    n_rows = n_rows - int(offset / len(rap_hists))
            else:
                n_rows = int(limit / len(rap_hists))
            print(str(n_rows) + ' targets for comparison')

        rows = self._general_get(
            ['name', 'histogram'],
            ['cav_xml', 'cav_hists'] + extra_tables,
            'cav_hists.ref_point IN ( %s )' % ', '.join(
                str(r) for r in cavity_distance_histograms.reference_points) + where_clause,
            ['cav_hists.cavity_id ASC', 'cav_hists.ref_point ASC', 'cav_hists.hist_index ASC'],
            limit,
            offset
        )

        weights = [item[1] for item in
                   sorted(cavity_distance_histograms._histogram_set.pc_weights().items())
                   ] * int(len(rap_hists) / 7)

        def compare(raphs, hs):
            return CavityLib.jensen_shannon_divergence([rh._histogram for rh in raphs],
                                                       hs, weights) / max(1, len(raphs))

        results = []
        n_complete = 0
        start_time = time.time()
        for k, g in itertools.groupby(rows, lambda r: r[0]):
            h_lines = list(g)
            hists = [CavityLib.read_histogram(r[-1]) for r in h_lines]
            cmp = compare(rap_hists, hists)
            score = 100.0 if (cmp < 1.3) else (10.0 / math.log10(cmp))
            result_tuple = (score, h_lines[0][0])
            results.append(result_tuple)
            if logfile:
                with open('logfile.csv', 'ab') as logfile:
                    logwriter = csv.writer(logfile, delimiter=',')
                    logwriter.writerow(result_tuple)
            n_complete += 1
            if verbose:
                Timer.progress(start_time, n_complete, n_rows, '', in_place=True)

        results.sort(reverse=True)
        return results


[docs]
    def search(self, cavity=None, comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON, settings=None):
        '''Searches the database and optionally performs cavity comparisons against the results.

        The query can include a Cavity for comparison against the database.
        If a cavity is not specified, a search for cavities matching the constraints
        is performed.

        :param cavity: a :class:`ccdc.cavity.Cavity` instance
        :param comparison_method: a member of :class:`ccdc.cavity.Cavity.ComparisonMethod`,
            either Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON,
            Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON or
            Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON
        :param settings: a :class:`ccdc.cavity.Cavity.Settings` instance

        :returns: a list of tuples of comparison score and cavity identifier,
            sorted by comparison score, starting with the highest similarity, or
            alternatively returns a generator in the case where no comparisons are
            performed
        '''
        if settings is None:
            where_clause = ''
            extra_tables = []
            limit = ''
            offset = ''
            max_nodes = 36000
            verbose = False
            logfile = False
            histogram_reference_points = None
        else:
            where_clause = CavityDatabase._make_settings_sql(settings)
            if where_clause:
                extra_tables = ['cav_info']
            else:
                extra_tables = []

            if settings.max_hit_structures:
                limit = settings.max_hit_structures
            else:
                limit = ''
            if settings.start:
                offset = settings.start
            else:
                offset = ''
            if settings.max_product_graph_size:
                max_nodes = settings.max_product_graph_size
            verbose = settings.verbose
            logfile = settings.logfile
            histogram_reference_points = settings.histogram_reference_points

        if cavity is None:
            return self._constraint_only_search(extra_tables, where_clause, limit, offset)
        else:
            if logfile:
                with open('logfile.csv', 'wb') as logfile:
                    logwriter = csv.writer(logfile, delimiter=',')
                    logwriter.writerow(['Score', 'Cavity identifier'])
            if comparison_method == Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON:
                return self._cavity_histograms_comparison_search(
                    cavity.cavity_distance_histograms(histogram_reference_points),
                    extra_tables, where_clause, limit, offset, verbose, logfile)
            else:
                return self._cavity_graph_comparison_search(
                    cavity, comparison_method, extra_tables, where_clause,
                    limit, offset, max_nodes, verbose, logfile)