Source code for ccdc.cavity

# coding=utf8
#
# This code is Copyright (C) 2015 The Cambridge Crystallographic Data Centre
# (CCDC) of 12 Union Road, Cambridge CB2 1EZ, UK and a proprietary work of CCDC.
# This code may not be used, reproduced, translated, modified, disassembled or
# copied, except in accordance with a valid licence agreement with CCDC and may
# not be disclosed or redistributed in any form, either in whole or in part, to
# any third party. All copies of this code made in accordance with a valid
# licence agreement as referred to above must contain this copyright notice.
#
# No representations, warranties, or liabilities are expressed or implied in the
# supply of this code by CCDC, its servants or agents, except where such
# exclusion or limitation is prohibited, void or unenforceable under governing
# law.
#
#################################################################################################
'''
The :mod:`ccdc.cavity` module contains a class :class:`ccdc.cavity.Cavity`
representing a putative binding site on the protein surface that has been
automatically detected by the LIGSITE_ algorithm.

Reference article introducing the LIGSITE_ algorithm.

.. _LIGSITE: http://www.sciencedirect.com/science/article/pii/S1093326398000023

This approach uses a three-dimensional grid for the detection of surface
depressions with a grid spacing of 0.5 Ångströms. As a result, the cavity is
represented by a set of grid intersection points, also called surface points.

Once a cavity has been detected, the flanking fragments of amino acids are
assigned pseudocenters to represent their physicochemical properties.
Currently, seven types of pseudocenters are used:

  - DONOR: representing a hydrogen bond donor functionality
  - ACCEPTOR: representing a hydrogen bond acceptor functionality
  - DONOR_ACCEPTOR: representing both hydrogen bond donor and acceptor functionalities
  - AROMATIC: represents the center of an aromatic ring system
  - ALIPHATIC: represents an aliphatic moiety
  - PI: denotes the presence of a double bond
  - METAL: represents the position of a metal ion

Reference article for the introduction of the `cavity graph comparison method`_.

.. _`cavity graph comparison method`: http://www.sciencedirect.com/science/article/pii/S0022283602008112

Reference article for the introduction of the `fast cavity graph comparison method`_.

.. _`fast cavity graph comparison method`: http://ieeexplore.ieee.org/document/6853389/

Reference article for the introduction of the `cavity histograms comparison method`_.

.. _`cavity histograms comparison method`: https://pubs.acs.org/doi/abs/10.1021/ci5005898

The class :class:`ccdc.cavity.Cavity` has methods to automatically create
cavities for PDB files, or read a cavity from an XML representation, to extract
useful information such as bound ligands, and to compare two cavities using
different comparison methods. From this it is possible to write screens to
search for similar cavities across a range of proteins, a defined set of
cavities, or the entire cavity database.
'''
#################################################################################################

import os
import sqlite3
import zlib
import time
import itertools
import collections
import operator
import math
import csv
import functools
# Python 3.4: from enum import Enum

from ccdc.molecule import Coordinates
from ccdc.molecule import Molecule
from ccdc.descriptors import GeometricDescriptors
from ccdc.utilities import bidirectional_dict, Histogram, Timer
from ccdc.protein import Protein
from ccdc.io import _CSDDatabaseLocator

from ccdc.utilities import _temporary_copy
from ccdc.utilities import _private_importer

with _private_importer() as pi:
    pi.import_ccdc_module('CavityLib')
    pi.import_ccdc_module('MathsLib')

CavityLib.licence_check()

from packaging import version
too_old_sqlite3_warning = f'Your python interpreter includes sqlite3 version {sqlite3.sqlite_version} which ' + \
    'will cause extreme slowness of this module. Please use a python interpreter with an up to date sqlite3 module.'
if version.parse('3.18.0') > version.parse(sqlite3.sqlite_version) \
    and not 'CCDC_PYTHON_API_IGNORE_OLD_SQLITE3_WARNING' in os.environ:
        raise RuntimeError(too_old_sqlite3_warning)
            

#################################################################################################

[docs]class Cavity(object): '''A cavity on the protein surface.''' feature_types = bidirectional_dict( donor=CavityLib.PSEUDOCENTER.DONOR, acceptor=CavityLib.PSEUDOCENTER.ACCEPTOR, donor_acceptor=CavityLib.PSEUDOCENTER.DONOR_ACCEPTOR, pi=CavityLib.PSEUDOCENTER.PI, aromatic=CavityLib.PSEUDOCENTER.AROMATIC, metal=CavityLib.PSEUDOCENTER.METAL, aliphatic=CavityLib.PSEUDOCENTER.ALIPHATIC ) _relibase_amino_acid_codes = bidirectional_dict( ALA=0, ARG=1, ASN=2, ASP=3, CYS=4, GLN=5, GLU=6, GLY=7, HIS=8, ILE=9, LEU=10, LYS=11, MET=12, PHE=13, PRO=14, SER=15, THR=16, TRP=17, TYR=18, VAL=19, UNK=20, ASX=21, GLX=22, NAD=23, NAP=24, FMN=25, MTX=26, HEM=27, ATP=28, COA=29, C=30, A=31, G=32, U=33, T=34, ACE=35, PCA=36, UNL=39, HOH=100 ) # Python 3.4: class ComparisonMethod(Enum): class ComparisonMethod: FAST_CAVITY_GRAPH_COMPARISON = 1 CAVITY_GRAPH_COMPARISON = 2 CAVITY_HISTOGRAMS_COMPARISON = 3
[docs] class Feature(object): '''An interaction feature in a cavity.''' def __init__(self, _feature=None): self._feature = _feature def __repr__(self): '''String representation of a feature.''' return 'Feature(%s at %s)' % (self.type, self.coordinates) def __eq__(self, other): '''Equality of features.''' return self.type == other.type and self.coordinates == other.coordinates def __hash__(self): return hash(self.type) + hash(self.coordinates) @property def coordinates(self): '''The position of the feature. :returns: a named tuple of coordinates ''' p = self._feature.coordinates() return Coordinates(p.x(), p.y(), p.z()) @property def burial(self): '''The burial value assigned to this feature (0 to 7, where 7 means most buried).''' return self._feature.depth()
[docs] def distance(self, other): '''The distance between this feature and another.''' return GeometricDescriptors.point_distance(self.coordinates, other.coordinates)
[docs] def point_distance(self, point): '''The distance from this feature to an arbitrary point.''' return GeometricDescriptors.point_distance(self.coordinates, point)
@property def protein_vector(self): '''Vector denoting the ideal interaction direction of the feature with another one outside the protein.''' vec = self._feature.std_vec() return GeometricDescriptors.Vector(vec.x(), vec.y(), vec.z()) @property def surface_vector(self): '''Vector denoting the connection from the feature to the centre of its assigned surface points.''' vec = self._feature.surf_vec() return GeometricDescriptors.Vector(vec.x(), vec.y(), vec.z()) @property def surface_points(self): '''The surface points associated with this feature. These approximate the surface shape close to the feature. :returns: a tuple of named tuples of coordinates ''' return tuple( Coordinates(p.x(), p.y(), p.z()) for p in self._feature.srfdots() ) @property def surface_depths(self): '''The depth values assigned to the surface points of this feature (0 to 7, where 7 means most buried). :returns: a tuple of depth values ''' return tuple( s.depth() for s in self._feature.srfdots() ) @property def type(self): '''The type of the interaction feature.''' return Cavity.feature_types.inverse_lookup(self._feature.type()) @property def amino_acid_code(self): '''The associated amino acid code. This will be 'UNK' if no protein structure has been associated with the cavity containing the feature. ''' return Cavity._relibase_amino_acid_codes.inverse_lookup( ord(self._feature.amino_acid_type()) ) @property def chain(self): '''The chain from which this feature was defined. This will be ``None`` if no protein information is associated with the cavity. ''' if hasattr(self, '_protein'): obj_name = self._feature.mmobj_name() model = self._pdb.find_model(obj_name) unit = model.unit_i(self._feature.unitid()) chain_id = unit.chain_id() chain = self._protein[chain_id] return chain @property def residue(self): '''The residue associated with this feature. This will be ``None`` if no protein information is associated with the cavity. ''' if hasattr(self, '_protein'): unit_id = self._feature.unitid() return self.chain.residues[unit_id] @property def atom(self): '''The atom from which this feature is defined. This will be ``None`` if no protein information is associated with the cavity. Aromatic and aliphatic features have no associated atom, so this property has the value ``None`` in those cases. ''' if self.type in ['aromatic', 'aliphatic']: return None if hasattr(self, '_protein'): model = self._pdb.find_model(self._feature.mmobj_name()) unit = model.unit_i(self._feature.unitid()) at = unit.atom(self._feature.atomid()) at_name = at.atom_name().strip() possibles = [a for a in self.residue.atoms if a.label == at_name] if len(possibles) == 1: return possibles[0] atom_id = self._feature.atomid() for a in self.residue.atoms: if a.atomic_number == 1: continue if atom_id == 0: return a atom_id -= 1 # Matching atom not found return None
[docs] class CavityGraphComparison(object): '''The result of a cavity graph comparison.''' def __init__(self, _comparison): self._comparison = _comparison @property def score(self): '''The similarity score of the comparison.''' return self._comparison.score() @property def rmsd(self): '''The rms deviation for the match of all pseudocenters.''' return self._comparison.rms() @property def clique_rmsd(self): '''The rms deviation of the matched clique points.''' return self._comparison.rms_cliq() @property def n_cliques(self): '''The total number of cliques detected during the comparison.''' return self._comparison.number_of_cliques() @property def n_matches(self): '''The number of matching pseudocenters detected.''' return self._comparison.number_of_matches() @property def transformation_matrix(self): '''The transformation matrix to overlay the target onto the query cavity''' return Molecule.Transformation(_transformation=self._comparison.transformation_matrix())
[docs] class FastCavityGraphComparison(object): '''The result of a fast cavity graph comparison.''' def __init__(self, _comparison): self._comparison = _comparison @property def score(self): '''The similarity score of the comparison.''' return self._comparison.score() @property def largest_clique_size(self): '''The size of the largest clique detected.''' return self._comparison.largest_clique_size() @property def product_graph_size(self): '''The size of the product graph generated during the comparison.''' return self._comparison.product_graph_size()
[docs] class CavityDistanceHistograms(object): '''A cavity description based on histograms of distances between reference points and cavity pseudocenters.''' _reference_points = bidirectional_dict( CENTROID=CavityLib.CENTROID, CENTROID_CLOSEST=CavityLib.CENTROID_CLOSEST, CENTROID_FURTHEST=CavityLib.CENTROID_FURTHEST, CENTROID_FURTHEST_FURTHEST=CavityLib.CENTROID_FURTHEST_FURTHEST ) def __init__(self, cavity, reference_points=None): '''Create a set of feature distance histograms for the cavity. :param cavity: a :class:`ccdc.cavity.Cavity` instance :param reference_points: an iterable of strings drawn from 'centroid', 'centroid_closest', 'centroid_furthest', 'centroid_furthest_furthest'. If empty or None, 'centroid' and 'centroid_closest' will be used for the generation of distance histograms ''' if reference_points is None or len(reference_points) == 0: self.reference_points = [CavityLib.CENTROID, CavityLib.CENTROID_CLOSEST] else: self.reference_points = [ Cavity.CavityDistanceHistograms._reference_points.prefix_lookup(k) for k in reference_points] self._histogram_set = CavityLib.RapmadPocket(cavity._cavity, self.reference_points) @property def histograms(self): '''The tuple of histograms defined for this cavity.''' return tuple( Histogram._from_histogram(self._histogram_set.get_histogram(i, j)) for i in range(len(Cavity.CavityDistanceHistograms._reference_points)) for j in range(len(Cavity.feature_types)) if self._histogram_set.has_histogram(i, j) ) def _compare(self, other): '''Compare another cavity with this one using the cavity histograms comparison method.''' return CavityLib.StandaloneCavityComparison().rapmad(self._histogram_set, other._histogram_set)
def __init__(self, _cavity, _pdb): '''Initialise a cavity.''' self._cavity = _cavity self._pdb = _pdb self._writable = False
[docs] def subcavity(self, features): '''Make a subcavity based on a set of features from this cavity. :param features: a set of features for construction of the subcavity :returns: a :class:`ccdc.cavity.Cavity` instance ''' cav = CavityLib.CAVITY(self._cavity) cav.assign_pseudocenters([ f._feature for f in features ]) if self._pdb.object_name(): cav.set_pdb(self._pdb) else: cav._pdb = None cavity = Cavity(_cavity=cav, _pdb=self._pdb) if hasattr(self, '_protein'): cavity._protein = self._protein return cavity
@property def identifier(self): '''The identifier of this cavity.''' return self._cavity.object_name().replace('CAV::', '') @property def ligand_identifiers(self): '''Tuple of ligand identifiers found in the cavity.''' def strip(name): parts = name.split('_') name = parts[0][5:] if '-' in parts[1]: subparts = parts[1].split('-') name = subparts[1] + ':' + name return name return tuple( strip(name) for name in self._cavity.ligands() ) @property def ligands(self): '''List of ligands of the cavity. If there is no protein associated with the cavity this will be ``None``. ''' def ligand_identifiers_match(id1, id2): parts1 = id1.split(':') parts2 = id2.split(':') if len(parts1) == len(parts2): return id1.startswith(id2) else: return parts1[-1].startswith(parts2[-1]) # Need something better - have a look in _pdb to see if I can get a better ligand id. ligands = None if hasattr(self, '_protein') and self._protein is not None: ligands = [ligand for ligand in self._protein.ligands if any( ligand_identifiers_match(ligand.identifier, lig_id) for lig_id in self.ligand_identifiers) ] return ligands
[docs] def features_by_type(self, type): '''The features of a given type.''' return set([f for f in self.features if f.type == type.lower()])
[docs] def features_by_distance(self, centre, radius): '''The set of features of the cavity within radius of the centre.''' return set([f for f in self.features if GeometricDescriptors.point_distance(f.coordinates, centre) < radius])
[docs] def features_by_atom_distance(self, atoms, radius): '''The set of all features within a radius of any of the atoms.''' s = functools.reduce(operator.__or__, [self.features_by_distance(a.coordinates, radius) for a in atoms], set()) return s
[docs] def features_by_residues(self, residues): '''The set of features associated with any of the given residues.''' s = set(f for f in self.features if f.residue in residues) return s
@property def bounding_box(self): '''The origin and far corner of the cavity.''' g = self._cavity.grid() return ( Coordinates(g.x_off(), g.y_off(), g.z_off()), Coordinates(g.x_off() + g.step() * g.dx(), g.y_off() + g.step() * g.dy(), g.z_off() + g.step() * g.dz()) ) @property def volume(self): '''Volume of the cavity in cubic Angstroms.''' return self._cavity.volume() @property def features(self): '''The features of the cavity. :returns: a tuple of :class:`ccdc.cavity.Cavity.Feature` instances ''' features = tuple(Cavity.Feature(_feature=p) for p in self._cavity.pseudocenters()) if hasattr(self, '_protein'): for f in features: f._protein = self._protein f._pdb = self._cavity.pdb() return features
[docs] def write(self, file_name): '''Write the cavity to a rlbcoor file for visualisation in hermes.''' if self._pdb is None or self._pdb.n_atoms() == 0: raise NotImplementedError('A cavity without a protein cannot be written to rlbcoor format files') if not self._writable: raise RuntimeError('Cavities must be defined from a PDB file to be writable as rlbcoor') xvec = MathsLib.vector_3d(0.5, 0.0, 0.0) yvec = MathsLib.vector_3d(0.0, 0.5, 0.0) zvec = MathsLib.vector_3d(0.0, 0.0, 0.5) self._cavity.set_xgridvector(xvec) self._cavity.set_ygridvector(yvec) self._cavity.set_zgridvector(zvec) writer = CavityLib.CavityOutputWriter() writer.out_write_cav_in_cif(file_name, self._cavity, self._pdb)
[docs] def to_pymol_file(self, file_name=None, show_surface_points=False): '''Create a visualisation file of this cavity that can be run in PyMOL. The cavity will be represented by its physicochemical features. :param file_name: Python file containing the information for displaying the cavity. This should have a .py extension. If not defined, the file will be named using the cavity identifier :param show_surface_points: additionally display the points representing the cavity's surface shape ''' def feature_color(type): colors = {'donor': 'blue', 'acceptor': 'red', 'donor_acceptor': 'green', 'aromatic': 'orange', 'aliphatic': 'yellow', 'pi': 'grey', 'metal': 'cyan'} return colors.get(type, 'black') def add_point_set(coordinates, name, color, scale): point_list = [] for c in coordinates: point_list.append('7.0, %s, %f' % (', '.join(map(str, c)), scale)) lines = [ '%s = [%s]' % (name, ', '.join(point_list)), 'cmd.load_cgo(%s, "%s", 1)' % (name, name), 'cmd.color("%s", "%s")' % (color, name) ] return '\n'.join(lines) def add_point(point, name, color, scale): lines = [ 'cmd.do("pseudoatom %s, pos=[%s], color=%s")' % ( name, ', '.join(map(str, point.coordinates)), color), 'cmd.do("show sphere, %s")' % (name), 'cmd.do("set sphere_scale, %s, %s")' % (scale, name) ] return '\n'.join(lines) py_file_name = self.identifier + '.py' if file_name is None else file_name with open(py_file_name, 'w') as f: f.write('from pymol.cgo import *\n') f.write('from pymol import cmd\n') i = 1 for feature in self.features: feature_name = 'Feat_%i_%s' % (i, feature.type) f.write(add_point(feature, feature_name, feature_color(feature.type), 0.3) + '\n') if show_surface_points == True and len(feature.surface_points) > 0: surface_name = 'Surf_%i' % (i) f.write( add_point_set(feature.surface_points, surface_name, 'wheat', 0.05) + '\n') i += 1
def _check_cavity(self): if len(self.features) <= 1: raise RuntimeError('Unable to make a comparison ' '(cavity {0} has too few pseudocentres)'.format(self.identifier)) def _compare(self, other): '''Compare another cavity with this one using the cavity graph comparison method. :param other: :class:`ccdc.cavity.Cavity` instance :raises: RuntimeError if the result of the cavity comparison is None :returns: a :class:`ccdc.cavity.Cavity.CavityGraphComparison` instance ''' settings = CavityLib.CavBaseSearchSettings() self._check_cavity() other._check_cavity() comper = CavityLib.CavityComparison(self._cavity, other._cavity, settings) res = comper.compare(False, False) if res is None: raise RuntimeError('Unable to make a comparison ' '(possibly one cavity is too large?)') return Cavity.CavityGraphComparison(_comparison=res) def _fast_compare(self, other, max_product_graph_size): '''Compare another cavity with this one using the fast cavity graph comparison method. :param other: a :class:`ccdc.cavity.Cavity` instance :param max_product_graph_size: the maximum allowed size of the product graph :returns: a :class:`ccdc.cavity.Cavity.FastCavityGraphComparison` instance ''' self._check_cavity() other._check_cavity() result = CavityLib.StandaloneCavityComparison().local_cliques(self._cavity, other._cavity, max_product_graph_size) return Cavity.FastCavityGraphComparison(_comparison=result)
[docs] def cavity_distance_histograms(self, reference_points=None): '''Create a set of feature distance histograms for this cavity based on the given reference point specification. :param reference_points: a set of reference point measures :returns: a :class:`ccdc.cavity.Cavity.CavityDistanceHistograms` instance ''' return Cavity.CavityDistanceHistograms(self, reference_points)
[docs] def compare(self, other, comparison_method=ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON, histogram_reference_points=None, max_product_graph_size=36000): '''Compare this cavity to another cavity. :param other: a :class:`ccdc.cavity.Cavity` instance :param comparison_method: a member of :class:`ccdc.cavity.Cavity.ComparisonMethod`, either Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON, Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON or Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON :param histogram_reference_points: an iterable of strings drawn from 'centroid', 'centroid_closest', 'centroid_furthest', 'centroid_furthest_furthest'. If empty or None, 'centroid' and 'centroid_closest' will be used for the generation of distance histograms with Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON :param max_product_graph_size: the maximum allowed size of the product graph for fast cavity graph comparisons :returns: a :class:`ccdc.cavity.Cavity.FastCavityGraphComparison` instance, a :class:`ccdc.cavity.Cavity.CavityGraphComparison` instance or a similarity score for cavity histogram comparisons ''' if comparison_method == Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON: return self.cavity_distance_histograms(histogram_reference_points)._compare( other.cavity_distance_histograms(histogram_reference_points)) elif comparison_method == Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON: return self._compare(other) elif comparison_method == Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON: return self._fast_compare(other, max_product_graph_size) else: raise RuntimeError('Not an allowed comparison method')
[docs] def to_xml(self): '''An XML representation of the cavity.''' ostr = CavityLib.ostringstream() self._cavity.print_xml(ostr) return ostr.str()
[docs] def to_xml_file(self, file_name): '''Writes the XML representing a cavity to a file. :param file_name: the file to which to write the XML ''' with open(file_name, 'w') as writer: writer.write(self.to_xml())
@staticmethod def _make_pdb(pdb_file=None): '''Private.''' if pdb_file is None: return CavityLib.PDB() if not os.path.exists(pdb_file): raise RuntimeError('PDB file %s does not exist' % pdb_file) bali_settings = CavityLib.BaliSettings_instantiate('.') bali_settings.set_auto_accept_ligand_templates(True) bali_settings.set_max_journal_title_length(1024) bali_settings.set_truncate_text_fields(True) pdb_settings = bali_settings.parser_settings() pdb_settings.set_title_required(False) pdb_settings.set_method_required(False) pdb_settings.set_spacegroup_required(False) bali_proc = CavityLib.BaliProcessing_instantiate(pdb_file, bali_settings) if bali_proc.has_errors(): raise RuntimeError( 'Errors in processing PDB file:\n%s' % ('\n'.join(bali_proc.errors()))) return bali_proc.pdb()
[docs] @staticmethod def from_xml(xml, pdb_file=None): '''Reads a cavity from an XML string. :param xml: an XML representation of the cavity :param pdb_file: an optional PDB file for the associated protein, from which additional data for the cavity may be computed ''' if not isinstance(xml, str): xml = xml.decode('utf-8') stream = CavityLib.stringstream(xml) _pdb = Cavity._make_pdb(pdb_file) reader = CavityLib.cav_xml() reader.init_xml() _cavity = reader.inp_read_cavxml(stream, _pdb) if pdb_file is not None: pseudo_searcher = CavityLib.PseudocenterSearch(_cavity, _pdb, CavityLib.PseudocenterSearchSettings()) _cavity.set_pseudocenters(pseudo_searcher.pseudocenters()) # surface_distance = 1.1 # _cavity.calculate_surrounding_objects(surface_distance, CavityLib.CavityGenerationSettings.METAL_AS_PROTEIN) c = Cavity(_cavity, _pdb) if pdb_file is not None: c._protein = Protein.from_file(pdb_file) _cavity.set_pdb(_pdb) c._writable = False return c
[docs] @staticmethod def from_xml_file(xml_file, pdb_file=None): '''Reads a cavity from an XML file and associated PDB file. :param xml_file: XML file representing the cavity :param pdb_file: an optional PDB file for the associated protein, from which additional data for the cavity may be computed :raises: RuntimeError if the XML file does not exist :returns: a :class:`ccdc.cavity.Cavity` instance ''' if not os.path.exists(xml_file): raise RuntimeError('XML file %s does not exist' % xml_file) with open(xml_file) as xml: return Cavity.from_xml(xml.read(), pdb_file)
[docs] @staticmethod def from_pdb_file(pdb_file, maximum_incomplete_residues_per_chain = 0): '''Create cavities from a PDB file. :param pdb_file: PDB file for the generation of cavities :param maximum_incomplete_residues: the maximum number of incomplete residues to allow (0 by default) :raises: RuntimeError if the PDB file contains more than 1000 SEQRES lines :returns: a tuple of :class:`ccdc.cavity.Cavity` instances ''' with open(pdb_file) as pdb: seqres_lines = [l for l in pdb if l.startswith('SEQRES')] if len(seqres_lines) > 1000: raise RuntimeError('%s contains too many residues for Cavity construction' % pdb_file) _pdb = Cavity._make_pdb(pdb_file) _settings = CavityLib.CavityGenerationSettings() _settings.set_metal_atom_mode(CavityLib.CavityGenerationSettings.METAL_AS_PROTEIN) _settings.set_resolution_limit(1000.) # Effectively no resolution limit _settings.set_maximum_incomplete_residues_per_chain(maximum_incomplete_residues_per_chain) _settings.set_nmr_structures(CavityLib.CavityGenerationSettings.INCLUDE_NMR_STRUCTURES) cavities = CavityLib.generate_cavities(_pdb, _settings) cavities = sorted(cavities, key=lambda k: k.volume(), reverse=True) protein = Protein.from_file(pdb_file) for c in cavities: c.set_pdb(_pdb) cavs = tuple(Cavity(_cavity=c, _pdb=_pdb) for c in cavities) for c in cavs: c._protein = protein c._writable = True return cavs
################################################################################################# # Cavity database #################################################################################################
[docs]class CavityDatabase(object): '''An SQLite database for cavities. A path to a database must be passed in when creating an instance of this class. Please note that the schema for the database, and, indeed, the final choice of underlying database has not been finalised. Accordingly this should be treated as a prototypical implementation. The API for creation and for searching should remain valid, however. ''' def __init__(self, file_name=None): '''Connect to the database.''' self._db = None if file_name is None: raise RuntimeError('A path to a cavity database must be provided.') self.file_name = file_name self._db = sqlite3.connect(file_name) self._cursor = self._db.cursor() def __del__(self): '''Ensure the database is closed.''' if self._db: self._db.close()
[docs] @staticmethod def drugbank_database_dir(): '''Return the directory containing the DrugBank database.''' return _CSDDatabaseLocator.get_cavity_dir_location()
[docs] def cavities(self): '''Iterator over the cavities.''' self._cursor.execute(''' SELECT xml FROM cav_xml ORDER BY name ASC ''' ) for row in self._result_iterator(): cav = self._make_cavity(row[0]) yield cav
__iter__ = cavities
[docs] def cavity_distance_histogram_sets(self): '''Iterator over the cavity histogram sets.''' for cav in self.cavities(): yield cav.cavity_distance_histograms()
[docs] def cavity(self, identifier): '''The cavity of the given identifier.''' return self.get_cavity_by_name(identifier)
[docs] def cavity_distance_histograms(self, identifier): '''The distance histograms corresponding to the given cavity identifier.''' return self.cavity(identifier).cavity_distance_histograms()
@staticmethod def _make_cavity(xml): '''Private.''' return Cavity.from_xml(zlib.decompress(xml)) def _result_iterator(self, size=1000): '''Generate the rows of a query.''' while True: results = self._cursor.fetchmany(size) if not results: break for r in results: yield r ################################################################################ # Creation ################################################################################
[docs] def drop_all_tables(self): '''Drops all tables in the database.''' self._cursor.execute('DROP TABLE IF EXISTS cav_xml') self._cursor.execute('DROP TABLE IF EXISTS cav_hists') self._cursor.execute('DROP TABLE IF EXISTS cav_info') self._cursor.execute('DROP TABLE IF EXISTS cav_ligands')
[docs] def populate_all(self, directory, id_file=None, verbose=False, maximum_allowed_incomplete_residues = 0): '''Create all tables from the directory of input files. :param directory: directory containing PDB or XML cavity files :param verbose: enable verbose output, default False ''' self._populate_cav_xml(directory, id_file, verbose, maximum_allowed_incomplete_residues) self._populate_cav_hists() self._populate_cav_info() self._populate_cav_ligands()
def _populate_cav_xml(self, directory, id_file, verbose, maximum_allowed_incomplete_residues): '''Load and compress the cavity XML files in a given directory.''' self._cursor.execute('DROP TABLE IF EXISTS cav_xml') self._cursor.execute( '''CREATE TABLE IF NOT EXISTS cav_xml ( cavity_id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, xml BLOB )''' ) file_ids = None if id_file is not None: with open(os.path.join(directory, id_file), 'r') as file: file_ids = file.readlines() if ',' in file_ids[0]: file_ids = [file_id.strip().lower() for file_id in file_ids[0].split(',')] else: file_ids = [file_id.strip().lower() for file_id in file_ids] self._populate_cav_xml_impl(directory, file_ids, verbose, maximum_allowed_incomplete_residues) def _populate_cav_xml_impl(self, directory, file_ids, verbose, maximum_allowed_incomplete_residues): '''Load and compress the cavity XML files in a single subdirectory.''' file_names = [] for file_name in sorted(os.listdir(directory)): if os.path.isdir(os.path.join(directory, file_name)): self._populate_cav_xml_impl(os.path.join(directory, file_name), file_ids, verbose, maximum_allowed_incomplete_residues) else: file_names.append(file_name) def loader(): '''Generate the cavity files.''' prefix = 'cavity_' start = time.time() for i, fname in enumerate(file_names): full_id = os.path.basename(fname).split('.')[0] if file_ids is not None: pdb_id = full_id[-4:] if full_id not in file_ids and pdb_id not in file_ids \ and full_id.lower() not in file_ids and pdb_id.lower() not in file_ids: # print("Skipping %s %s\n" % (full_id, pdb_id)) continue t = time.time() if verbose: print(fname) path = os.path.join(directory, fname) if fname.endswith(".gz"): fname = fname[:-3] # temporary_copy will deal with unzipping the full path with _temporary_copy(path) as local_path: if fname.endswith('pdb') or fname.endswith('ent'): try: cavities = Cavity.from_pdb_file(local_path, maximum_allowed_incomplete_residues) except RuntimeError as exc: print('Skipping {} because {}'.format(full_id, exc)) continue for c in cavities: xml = c.to_xml().encode() zipped = zlib.compress(xml) yield c.identifier, sqlite3.Binary(zipped) elif fname.endswith('xml'): if fname.startswith(prefix): name = fname[len(prefix):-4] else: name = fname[:-4] with open(local_path, 'rb') as xml: zipped = zlib.compress(xml.read()) yield name, sqlite3.Binary(zipped) if verbose: print('%4d in %.2f seconds (%.2f seconds for %s)' % ( i, time.time() - start, time.time() - t, fname)) t = time.time() self._cursor.execute('SELECT COUNT(cavity_id) FROM cav_xml') ct_start = self._cursor.fetchone()[0] self._cursor.executemany(''' INSERT INTO cav_xml (name, xml) VALUES(?,?) ''', loader() ) self._db.commit() self._cursor.execute('SELECT COUNT(cavity_id) FROM cav_xml') ct_end = self._cursor.fetchone()[0] if ct_end > ct_start: print('Loaded %d cavities in %.2f seconds' % (ct_end-ct_start, time.time() - t)) def _populate_cav_hists(self): '''Populate the histogram table for all cavity distance histograms.''' self._cursor.execute('DROP TABLE IF EXISTS cav_hists') self._cursor.execute(''' CREATE TABLE IF NOT EXISTS cav_hists( hist_id INTEGER PRIMARY KEY AUTOINCREMENT, cavity_id REFERENCES cav_xml (cavity_id), hist_index INTEGER, ref_point INTEGER, histogram TEXT )''' ) def loader(): self._cursor.execute(''' SELECT cavity_id, xml FROM cav_xml ORDER BY cavity_id ASC ''' ) for cav_id, xml in self._result_iterator(): cav = self._make_cavity(xml) rap = cav.cavity_distance_histograms( ['centroid', 'centroid_closest', 'centroid_furthest', 'centroid_furthest_furthest']) hs = rap.histograms for i, h in enumerate(hs): yield cav_id, i % 7, int(i / 7), str(CavityLib.write_histogram(h._histogram)) t = time.time() inner_cursor = self._db.cursor() inner_cursor.executemany(''' INSERT INTO cav_hists (cavity_id, hist_index, ref_point, histogram) VALUES (?,?,?,?) ''', loader() ) self._cursor.execute('CREATE INDEX IF NOT EXISTS cav_hists_index ON cav_hists(cavity_id)') self._db.commit() self._cursor.execute('SELECT COUNT(*) FROM cav_hists') ct = int(list(self._cursor.fetchone())[0]) print('Loaded %d histograms in %.2f seconds' % (ct, time.time() - t)) def _populate_cav_info(self): '''Populate the info table.''' t = time.time() self._cursor.execute('DROP TABLE IF EXISTS cav_info') self._db.commit() self._cursor.execute(''' CREATE TABLE cav_info ( cavity_id REFERENCES cav_xml (cavity_id), volume REAL, ndonors INTEGER, nacceptors INTEGER, ndonor_acceptors INTEGER, npi INTEGER, naromatic INTEGER, nmetal INTEGER, naliphatic INTEGER, nligands INTEGER ) ''' ) self._db.commit() def loader(): self._cursor.execute(''' SELECT cavity_id, xml FROM cav_xml ORDER BY cavity_id ASC ''' ) for cav_id, xml in self._result_iterator(): cav = self._make_cavity(xml) features = cav.features d = collections.defaultdict(int) for f in features: d[f.type] += 1 nligands = len(cav.ligand_identifiers) yield cav_id, cav.volume, d['donor'], d['acceptor'], \ d['donor_acceptor'], d['pi'], d['aromatic'], d['metal'], \ d['aliphatic'], nligands inner_cursor = self._db.cursor() inner_cursor.executemany(''' INSERT INTO cav_info ( cavity_id, volume, ndonors, nacceptors, ndonor_acceptors, npi, naromatic, nmetal, naliphatic, nligands ) VALUES (?,?,?,?,?,?,?,?,?,?) ''', loader() ) self._cursor.execute('CREATE INDEX IF NOT EXISTS cav_info_index ON cav_info(cavity_id)') self._db.commit() print('Loaded info in %.2f seconds' % (time.time() - t)) def _populate_cav_ligands(self): '''Populate the ligand table.''' t = time.time() self._cursor.execute('DROP TABLE IF EXISTS cav_ligands') self._cursor.execute(''' CREATE TABLE IF NOT EXISTS cav_ligands ( cavity_id REFERENCES cav_xml(cavity_id), ligand_id TEXT ) ''' ) def loader(): self._cursor.execute(''' SELECT cavity_id, xml FROM cav_xml ORDER BY cavity_id ASC ''' ) for cav_id, xml in self._result_iterator(): cav = self._make_cavity(xml) ks = cav._cavity.ligands().keys() for k in ks: lig_id = k[k.index(':') + 2: k.index('_')] yield cav_id, lig_id inner_cursor = self._db.cursor() inner_cursor.executemany(''' INSERT INTO cav_ligands ( cavity_id, ligand_id ) VALUES (?,?) ''', loader() ) self._cursor.execute( 'CREATE INDEX IF NOT EXISTS cav_ligands_index ON cav_ligands(cavity_id)') self._db.commit() self._cursor.execute(''' SELECT COUNT(ligand_id) FROM cav_ligands ''' ) ct = int(self._cursor.fetchone()[0]) print('Loaded %d ligands in %.2f seconds' % (ct, time.time() - t)) ################################################################################ # Query ################################################################################
[docs] def get_cavity_by_name(self, name): '''Get the cavity with the exact identifier.''' self._cursor.execute(''' SELECT xml from cav_xml WHERE name = '%s' ''' % name ) l = list(self._cursor.fetchall()) if l is not None and len(l): return self._make_cavity(l[0][0])
[docs] def get_cavities_by_pdb_code(self, pdb_code): '''Get all the cavities for a protein.''' self._cursor.execute(''' SELECT xml FROM cav_xml WHERE name LIKE '%%{}.%%' '''.format(pdb_code) ) return [self._make_cavity(row[0]) for row in self._result_iterator()]
[docs] def get_ligands_by_pdb_code(self, pdb_code): '''All ligand identifiers in the protein structure.''' self._cursor.execute(''' SELECT ligand_id FROM cav_xml JOIN cav_ligands ON cav_xml.cavity_id = cav_ligands.cavity_id WHERE cav_xml.name LIKE '%%{}.%%' '''.format(pdb_code) ) return [row[0] for row in self._cursor.fetchall()]
[docs] def get_ligands_by_cavity_identifier(self, cav_id): '''All identifiers of ligands in the cavity.''' self._cursor.execute(''' SELECT cav_ligands.ligand_id FROM cav_xml JOIN cav_ligands ON cav_xml.cavity_id = cav_ligands.cavity_id WHERE cav_xml.name = '%s' ''' % cav_id ) return [r[0] for r in self._cursor.fetchall()]
[docs] def get_cavities_by_ligand(self, ligand_id): '''Get all the cavities containing a particular PDB ligand identifier.''' self._cursor.execute(''' SELECT xml FROM cav_xml WHERE cavity_id IN ( SELECT DISTINCT cavity_id FROM cav_ligands WHERE ligand_id = '%s' ) ''' % ligand_id ) return [self._make_cavity(row[0]) for row in self._cursor.fetchall()]
[docs] def get_cavity_identifiers_by_ligand(self, ligand_id): '''Get the identifiers of all cavities containing a particular PDB ligand identifier.''' self._cursor.execute(''' SELECT DISTINCT cav_xml.name FROM cav_ligands JOIN cav_xml ON cav_ligands.cavity_id = cav_xml.cavity_id WHERE cav_ligands.ligand_id = '%s' ORDER BY cav_xml.name ASC ''' % ligand_id ) return [r[0] for r in self._cursor.fetchall()]
[docs] def get_info_for_cavity(self, cav_name): '''Get the information for a cavity. :param cav_name: the name of the cavity from the cav_xml table of the database :returns: a dictionary of values from the info table ''' self._cursor.execute(''' SELECT volume, ndonors, nacceptors, ndonor_acceptors, npi, naromatic, nmetal, naliphatic, nligands FROM cav_info JOIN cav_xml ON cav_info.cavity_id = cav_xml.cavity_id WHERE name = '%s' ''' % cav_name ) vals = list(self._cursor.fetchone()) d = { self._cursor.description[i][0]: vals[i] for i in range(len(vals)) } return d
[docs] def get_number_of_cavities(self): '''Get the number of cavities stored in the database.''' self._cursor.execute(''' SELECT COUNT(*) FROM cav_xml ''' ) l = list(self._cursor.fetchall()) return l[0][0]
[docs] def get_number_of_ligands(self): '''Get the number of cavity ligands stored in the database.''' self._cursor.execute(''' SELECT COUNT(*) FROM cav_ligands ''' ) l = list(self._cursor.fetchall()) return l[0][0]
def _general_get(self, fields, tables, where, order, limit, offset): joins = ['JOIN %s ON %s.cavity_id = %s.cavity_id' % (s, tables[0], s) for s in tables[1:]] where_clause = 'WHERE ' + where if where else '' ord = 'ORDER BY %s' % ', '.join(order) if order else '' if limit: lim = 'LIMIT %d' % limit else: lim = '' if offset: off = 'OFFSET %d' % offset if not lim: lim = 'LIMIT -1' else: off = '' sql = [ 'SELECT %s' % (', '.join(fields)), 'FROM %s %s' % (tables[0], ' '.join(joins)), where_clause, ord, lim, off ] sql = '\n'.join(sql) # print(sql) self._cursor.execute(sql) return self._result_iterator() ################################################################################ # Search ################################################################################
[docs] class Settings(object): '''Settings appropriate to cavity searches.''' max_hit_structures = 0 #: maximum number of structures returned max_product_graph_size = 36000 #: maximum size of product graph allowed when using the fast cavity graph comparison method start = 0 #: offset starting position in database aromatic_range = None #: minimum and maximum number of aromatic features aliphatic_range = None #: minimum and maximum number of aliphatic features donor_range = None #: minimum and maximum number of donor features acceptor_range = None #: minimum and maximum number of acceptor features donor_acceptor_range = None #: minimum and maximum number of donor-acceptor features metal_range = None #: minimum and maximum number of metal features pi_range = None #: minimum and maximum number of pi features volume_range = None #: minimum and maximum cavity volume ligand_range = None #: minimum and maximum number of ligands with_ligands = None #: ligand identifiers verbose = False #: verbose output, default False logfile = False #: logfile of comparison scores, default False histogram_reference_points = None '''an iterable of strings drawn from 'centroid', 'centroid_closest', 'centroid_furthest', 'centroid_furthest_furthest'. If empty or None, 'centroid' and 'centroid_closest' will be used for the generation of distance histograms with Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON '''
@staticmethod def _make_settings_sql(settings): '''Private.''' def one_clause(r, field, fmt): if hasattr(r, '__iter__'): s = '%s BETWEEN %s AND %s' % (field, fmt, fmt) % (min(r), max(r)) else: s = '%s = %s' % (field, fmt) % r return s clauses = [] if settings.aromatic_range is not None: clauses.append(one_clause(settings.aromatic_range, 'naromatic', '%d')) if settings.aliphatic_range is not None: clauses.append(one_clause(settings.aliphatic_range, 'naliphatic', '%d')) if settings.donor_range is not None: clauses.append(one_clause(settings.donor_range, 'ndonors', '%d')) if settings.acceptor_range is not None: clauses.append(one_clause(settings.acceptor_range, 'nacceptors', '%d')) if settings.donor_acceptor_range is not None: clauses.append(one_clause(settings.donor_acceptor_range, 'ndonor_acceptors', '%d')) if settings.metal_range is not None: clauses.append(one_clause(settings.metal_range, 'nmetal', '%d')) if settings.pi_range is not None: clauses.append(one_clause(settings.pi_range, 'npi', '%d')) if settings.volume_range is not None: clauses.append(one_clause(settings.volume_range, 'volume', '%.3f')) if settings.ligand_range is not None: clauses.append(one_clause(settings.ligand_range, 'nligands', '%d')) if settings.with_ligands is not None: sql = ''' SELECT DISTINCT name FROM cav_xml JOIN cav_ligands ON cav_xml.cavity_id = cav_ligands.cavity_id WHERE ''' ligand_clause = [] for name in settings.with_ligands: ligand_clause.append('ligand_id = "%s"' % name) sql += ' OR '.join(ligand_clause) clauses.append('name in ( ' + sql + ')') return '\n AND '.join(clauses) def _constraint_only_search(self, extra_tables, where_clause, limit, offset): # Find cavities matching the settings rows = self._general_get( ['xml'], ['cav_xml'] + extra_tables, where_clause, ['cav_xml.cavity_id ASC'], limit, offset ) def one_by_one(): for r in rows: yield self._make_cavity(r[0]) return one_by_one() def _cavity_graph_comparison_search(self, cavity, comparison_method, extra_tables, where_clause, limit, offset, max_nodes, verbose, logfile): if verbose: print('Counting the number of targets for comparison') if not limit or limit <= 0: count = self._general_get( ['COUNT(DISTINCT cav_xml.cavity_id)'], ['cav_xml'] + extra_tables, where_clause, ['cav_xml.cavity_id ASC'], '', '' ) n_rows = list(itertools.islice(count, 1))[0][0] if offset: n_rows = n_rows - offset else: n_rows = limit print(str(n_rows) + ' targets for comparison') rows = self._general_get( ['xml'], ['cav_xml'] + extra_tables, where_clause, ['cav_xml.cavity_id ASC'], limit, offset ) results = [] n_complete = 0 start_time = time.time() for r in rows: cav = self._make_cavity(r[0]) try: if comparison_method == Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON: # Cavity graph comparison result = cavity.compare(cav, comparison_method=comparison_method) result_tuple = (result.score, cav.identifier) results.append(result_tuple) if logfile: with open('logfile.csv', 'ab') as logfile: logwriter = csv.writer(logfile, delimiter=',') logwriter.writerow(result_tuple) elif comparison_method == Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON: # Fast cavity graph comparison result = cavity.compare(cav, max_product_graph_size=max_nodes) result_tuple = (result.score, cav.identifier) results.append(result_tuple) if logfile: with open('logfile.csv', 'ab') as logfile: logwriter = csv.writer(logfile, delimiter=',') logwriter.writerow(result_tuple) else: raise RuntimeError('Not an allowed comparison method') except RuntimeError as e: print( 'Unable to compare %s with %s: %s' % (cavity.identifier, cav.identifier, e)) n_complete += 1 if verbose: Timer.progress(start_time, n_complete, n_rows, '', in_place=True) results.sort(reverse=True) return results def _cavity_histograms_comparison_search(self, cavity_distance_histograms, extra_tables, where_clause, limit, offset, verbose, logfile): rap_hists = cavity_distance_histograms.histograms if limit: limit *= len(rap_hists) if offset: offset *= len(rap_hists) if where_clause: where_clause = ' AND ' + where_clause if verbose: print('Counting the number of targets for comparison') if not limit or limit <= 0: count = self._general_get( ['COUNT(DISTINCT cav_hists.cavity_id)'], ['cav_xml', 'cav_hists'] + extra_tables, 'cav_hists.ref_point IN ( %s )' % ', '.join( str(r) for r in cavity_distance_histograms.reference_points) + where_clause, ['cav_hists.cavity_id ASC', 'cav_hists.ref_point ASC', 'cav_hists.hist_index ASC'], '', '') n_rows = list(itertools.islice(count, 1))[0][0] if offset: n_rows = n_rows - int(offset / len(rap_hists)) else: n_rows = int(limit / len(rap_hists)) print(str(n_rows) + ' targets for comparison') rows = self._general_get( ['name', 'histogram'], ['cav_xml', 'cav_hists'] + extra_tables, 'cav_hists.ref_point IN ( %s )' % ', '.join( str(r) for r in cavity_distance_histograms.reference_points) + where_clause, ['cav_hists.cavity_id ASC', 'cav_hists.ref_point ASC', 'cav_hists.hist_index ASC'], limit, offset ) weights = [item[1] for item in sorted(cavity_distance_histograms._histogram_set.pc_weights().items()) ] * int(len(rap_hists) / 7) def compare(raphs, hs): return CavityLib.jensen_shannon_divergence([rh._histogram for rh in raphs], hs, weights) / max(1, len(raphs)) results = [] n_complete = 0 start_time = time.time() for k, g in itertools.groupby(rows, lambda r: r[0]): h_lines = list(g) hists = [CavityLib.read_histogram(r[-1]) for r in h_lines] cmp = compare(rap_hists, hists) score = 100.0 if (cmp < 1.3) else (10.0 / math.log10(cmp)) result_tuple = (score, h_lines[0][0]) results.append(result_tuple) if logfile: with open('logfile.csv', 'ab') as logfile: logwriter = csv.writer(logfile, delimiter=',') logwriter.writerow(result_tuple) n_complete += 1 if verbose: Timer.progress(start_time, n_complete, n_rows, '', in_place=True) results.sort(reverse=True) return results
[docs] def search(self, cavity=None, comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON, settings=None): '''Searches the database and optionally performs cavity comparisons against the results. The query can include a Cavity for comparison against the database. If a cavity is not specified, a search for cavities matching the constraints is performed. :param cavity: a :class:`ccdc.cavity.Cavity` instance :param comparison_method: a member of :class:`ccdc.cavity.Cavity.ComparisonMethod`, either Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON, Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON or Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON :param settings: a :class:`ccdc.cavity.Cavity.Settings` instance :returns: a list of tuples of comparison score and cavity identifier, sorted by comparison score, starting with the highest similarity, or alternatively returns a generator in the case where no comparisons are performed ''' if settings is None: where_clause = '' extra_tables = [] limit = '' offset = '' max_nodes = 36000 verbose = False logfile = False histogram_reference_points = None else: where_clause = CavityDatabase._make_settings_sql(settings) if where_clause: extra_tables = ['cav_info'] else: extra_tables = [] if settings.max_hit_structures: limit = settings.max_hit_structures else: limit = '' if settings.start: offset = settings.start else: offset = '' if settings.max_product_graph_size: max_nodes = settings.max_product_graph_size verbose = settings.verbose logfile = settings.logfile histogram_reference_points = settings.histogram_reference_points if cavity is None: return self._constraint_only_search(extra_tables, where_clause, limit, offset) else: if logfile: with open('logfile.csv', 'wb') as logfile: logwriter = csv.writer(logfile, delimiter=',') logwriter.writerow(['Score', 'Cavity identifier']) if comparison_method == Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON: return self._cavity_histograms_comparison_search( cavity.cavity_distance_histograms(histogram_reference_points), extra_tables, where_clause, limit, offset, verbose, logfile) else: return self._cavity_graph_comparison_search( cavity, comparison_method, extra_tables, where_clause, limit, offset, max_nodes, verbose, logfile)