Cavity examples

Compare cavities in proteins using histograms

This example allows you to detect and compare cavities from PDB files stored in a user-defined directory. The cavity histograms comparison uses feature distance histograms and the similarity scores are written out to a .csv file. Note that cavity detection is the rate-determining step and therefore it is best practice to store the detected cavities as XML files or in a cavity database for future reuse and analysis.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_compare.py - compare the cavities in all proteins (PDB files) in a certain directory.

    Usage: python cavity_compare.py pdb_directory
'''
#####################################################################

import sys
import argparse
import os

from ccdc.cavity import Cavity
from glob import glob

#####################################################################


def parse_args():
    usage_txt = ('Detect cavities from PDB files stored in a user defined directory'
                 ' and perform an all-by-all comparison of the detected cavities.')
    parser = argparse.ArgumentParser(
        description=usage_txt,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-d', '--directory',
                        help='Directory of PDB files to find cavities in.',
                        required=True)
    parser.add_argument('-m', '--maximum-incomplete-residues-per-chain',type=int,
                        help='Number of incomplete residues that are permissible when generating cavities',
                        default=100) # High default - presumably we want all cavities included unless they are really poor otherwise they would not be in the directory

    return parser.parse_args()


def main():
    # Get command line argument:
    args = parse_args()
    pdb_dir = args.directory

    # Collect PDB files
    pdbs = glob(os.path.join(pdb_dir, '*.pdb'))

    # At least two PDB files are needed to perform comparisons
    if len(pdbs) < 2:
        print('Error: At least two PDB files are required. Not enough found in %s' % pdb_dir)
        sys.exit(1)

    print(' -> Detecting the cavities for all PDB structures in %s' % pdb_dir)
    cavs = []
    for pdb in pdbs:
        cavs += Cavity.from_pdb_file(pdb, maximum_incomplete_residues_per_chain = args.maximum_incomplete_residues_per_chain)

    # Generate CSV output
    with open('similarities.csv', 'w') as f:
        # header:
        cav_ids = [''] + [cav.identifier for cav in cavs]
        f.write(','.join(cav_ids) + '\n')

        # rows:
        for i in range(0, len(cavs)):
            cells = [cavs[i].identifier]
            for j in range(0, len(cavs)):
                score = cavs[i].compare(
                    cavs[j],
                    comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
                cells.append(str(round(score, 3)))
            f.write(','.join(cells) + '\n')

        print(' -> Similarity matrix written to similarities.csv')

#####################################################################


if __name__ == '__main__':
    main()

Overlay pairs of proteins based on their contained cavities

This example allows you to detect and compare cavities from pairs of PDB files. The script reads and detects cavities in 2 protein structures then overlays pairs of cavities. The resultant transformation matrices are applied to the input proteins and new output files are written that contain the superimposed protein with the transformation applied.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_pair_view.py - generate cavity overlays for all cavities in a  pair of protein structures and then generate mol2 files of the proteins in the overlaid configuration. Also writes a pairwise similarity matrix for the cavities

    Usage: python cavity_pair_view.py <pdb file 1> <pdb file 2>
'''
#####################################################################

import sys
import argparse
import os

from ccdc.cavity import Cavity
from ccdc.protein import Protein
from ccdc.io import MoleculeWriter

from glob import glob

#####################################################################


def parse_args():
    usage_txt = ('Detect cavities from a pair of PDB files and then overlays them based on the best match')
    parser = argparse.ArgumentParser(
        description=usage_txt,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('filenames', metavar='<filename>',type=str, nargs='+',
                        help='Filenames of PDB files to find cavities in.')
    return parser.parse_args()


def main():
    # Get command line argument:
    args = parse_args()


    # At least two PDB files are needed to perform comparisons
    if len(args.filenames) != 2:
        print('Error: You must specify exactly two PDB filenames')
        sys.exit(1)

    print(' -> Detecting the cavities for all PDB structures in %s and %s' % ( args.filenames[0],args.filenames[1] ))


    file_lookup = {}

    first_cavs = Cavity.from_pdb_file(args.filenames[0])
    second_cavs =  Cavity.from_pdb_file(args.filenames[1])

    for cav in first_cavs:
        file_lookup[cav.identifier] = args.filenames[0]

    for cav in second_cavs:
        file_lookup[cav.identifier] = args.filenames[1]

    # Generate CSV output
    with open('similarities.csv', 'w') as f:
        # header:
        cav_ids = [''] + [cav.identifier for cav in second_cavs]
        f.write(','.join(cav_ids) + '\n')

        for first_cav in first_cavs:
            cells = [first_cav.identifier]
            for second_cav in second_cavs:

                comparison_object = first_cav.compare(
                    second_cav,
                    comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)

                score = comparison_object.score
                cells.append(str(round(score, 3)))
                filename = first_cav.identifier + "_" + second_cav.identifier + ".mol2"

                m = Protein.from_file(file_lookup[second_cav.identifier])
                m.transform(comparison_object.transformation_matrix)
                print(' -> Writing %s - a file containing PDB entry %s - superimposition using cavity %s in %s onto cavity %s' % ( filename,m.identifier,second_cav.identifier,m.identifier, first_cav.identifier))

                er = MoleculeWriter(filename)
                er.write(m)

            f.write(','.join(cells) + '\n')

        print(' -> Similarity matrix written to similarities.csv')

#####################################################################


if __name__ == '__main__':
    main()

Compare XML cavity files

This script allows you to compare cavity information stored in XML cavity files. You can switch between a cavity histograms comparison, a fast cavity graph comparison and a cavity graph comparison, using the appropriate parameter in the command line argument. The script furthermore provides a more elaborate example on how to make use of Python’s argument parser.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_compare_args.py - compare a cavity against a set of other
        cavities using command line arguments.

    Usage: python cavity_compare_args.py -d C:/path/to/xml_cavities
                                         -q query_cavity
                                         -c compare_cavities
                                         -m method
'''
#####################################################################

import argparse
from ccdc.cavity import Cavity
import os

#####################################################################


def parse_args():
    usage_txt = 'Compare a cavity against a set of other cavities using command line arguments.'
    parser = argparse.ArgumentParser(description=usage_txt,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-m', '--method', default='GRAPH',
                        choices=['HISTOGRAMS', 'FAST_GRAPH', 'GRAPH'],
                        help='Comparison method to use.')
    parser.add_argument('-q', '--query',
                        help='Query cavity name (e.g. "pdb3jw3.1" for cavity_pdb3jw3.1.xml).',
                        required=True)
    parser.add_argument('-c', '--comparisons', nargs='+', required=True,
                        help='Cavity names with which to compare, separated by spaces.')
    parser.add_argument('-d', '--directory',
                        help='Directory of XML cavity files.')
    return parser.parse_args()


def main():
    # Get command line arguments:
    args = parse_args()
    xml_dir = args.directory
    query = args.query
    method = args.method
    comps = args.comparisons

    # Load the query cavity:
    query_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % query))

    # Do the comparisons:
    if method == 'HISTOGRAMS':
        # Create histograms for cavity histograms comparison
        _ = query_cav.cavity_distance_histograms()
        print('=== Cavity histogram comparison ===')
        print('Query\tComparison\tScore')
        for comp in comps:
            try:
                comp_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % comp))
                score = query_cav.compare(
                    comp_cav,
                    comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
                print('\t'.join([query, comp, str(score)]))
            except RuntimeError as err:
                print('\t'.join([query, comp, 'Error: %s' % err]))
    elif method == 'FAST_GRAPH':
        print('=== Fast graph comparison ===')
        print('Query\tComparison\tScore')
        for comp in comps:
            try:
                comp_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % comp))
                score = query_cav.compare(
                    comp_cav,
                    comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON).score
                print('\t'.join([query, comp, str(score)]))
            except RuntimeError as err:
                print('\t'.join([query, comp, 'Error: %s' % err]))
    elif method == 'GRAPH':
        print('=== Graph comparison ===')
        print('Query\tComparison\tScore')
        for comp in comps:
            try:
                comp_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % comp))
                score = query_cav.compare(
                    comp_cav,
                    comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON).score
                print('\t'.join([query, comp, str(score)]))
            except RuntimeError as err:
                print('\t'.join([query, comp, 'Error: %s' % err]))

#####################################################################


if __name__ == '__main__':
    main()

Compare cavities using multiple cores

This example shows how to use multiple cores to speed up the comparison of large numbers of cavities.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-02-02: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_compare_db_multiple_cores.py - Pairwise comparison of cavities
        taken from a database using multiple cores.

    Usage: python cavity_compare_db_multiple_cores.py -n n_processes
                                                      -q query_cavity
                                                      -v volume_min
                                                      -w volume_max
'''
#####################################################################

import argparse
from multiprocessing import Pool, get_context
import os
from ccdc.cavity import *

#####################################################################

process_db = None


def initializer(initargs):
    """  Create a database connection for each process. """
    global process_db
    process_db = CavityDatabase(initargs)


def comparison(cavities):
    """ Compare two cavities in a tuple and output the scoring. """
    query_cav = process_db.get_cavity_by_name(cavities[0])
    comp_cav = process_db.get_cavity_by_name(cavities[1])
    try:
        result = query_cav.compare(
            comp_cav,
            comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
        print('%s\t%s' % ('\t'.join(cavities[:2]), result.score))
    except RuntimeError as exc:
        print('%s\tComparison unsuccessful: %s' % ('\t'.join(cavities[:2]), str(exc)))


def mp_handler(comparisons_list, n_processes, db_path):
    """ Create a pool of worker processes and distribute the cavity pairs between them. """
    with get_context('spawn').Pool(n_processes, initializer, initargs=(db_path,)) as p:
        p.map(comparison, comparisons_list)


def parse_args():
    usage_txt = 'Pairwise comparison of cavities taken from a database using multiple cores.'
    parser = argparse.ArgumentParser(
        description=usage_txt,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-n', '--n_processes',
        help='Number of parallel processes',
        default=2
    )
    parser.add_argument(
        '-q', '--query',
        help='Query cavity name',
        required=True
    )
    parser.add_argument(
        '-v', '--volume_min',
        help='Minimum volume for target cavities search',
        required=True
    )
    parser.add_argument(
        '-w', '--volume_max',
        help='Maximum volume for target cavities search',
        required=True
    )
    parser.add_argument(
        '-d', '--db_path',
        help='Cavity database to use as source data.',
        default=os.path.join(CavityDatabase.drugbank_database_dir(), 'drugbank_cavities.sqlite')
    )
    return parser.parse_args()


def main():
    args = parse_args()
    n_processes = int(args.n_processes)
    query = args.query
    volume_min = float(args.volume_min)
    volume_max = float(args.volume_max)
    db_path = args.db_path

    if not os.path.exists(db_path):
        print('Cavity database %s does not exist!' % db_path)
        exit(1)

    print('Comparing cavity %s with a volume range of %s-%s in database %s using %s processes.' %
          (query, volume_min, volume_max, db_path, n_processes))

    # Create database connection for initial search
    db = CavityDatabase(db_path)

    query_cav = db.get_cavity_by_name(query)
    if query_cav is None:
        raise RuntimeError('Query cavity {0} not found in database'.format(query))

    # Define database search settings
    settings = CavityDatabase.Settings()
    settings.volume_range = [volume_min, volume_max]

    # Search using settings
    comparisons = db.search(settings=settings)

    # Make list of comparison cavities fulfilling the search criteria
    comparisons_list = []
    for c in comparisons:
        comparisons_list.append((query, c.identifier))

    print('Number of comparisons to be performed: %s' % len(comparisons_list))
    print('=== Cavity Graph Comparison ===')
    print('Query\tComparison\tScore')

    mp_handler(comparisons_list, n_processes, db_path=db_path)

#####################################################################


if __name__ == '__main__':
    import ccdc.utilities
    ccdc.utilities._fix_multiprocessing_on_macos()
    main()

Compare cavities stored in an SQLite database

Cavity comparison using an SQLite database is faster than processing individual XML files. Refer to this example to set up a comparison using SQLite input.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_compare_db_pairwise.py - Pairwise comparison of cavities taken from a database.

    Usage: python cavity_compare_db_pairwise.py -q query_cavity
                                                -c comparison1 comparison2 comparison3
'''
#####################################################################

import argparse
from ccdc.cavity import *

#####################################################################


def parse_args():
    usage_txt = 'Pairwise comparison of cavities taken from a database.'
    parser = argparse.ArgumentParser(description=usage_txt,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-c', '--comparisons', nargs='+',
                        default=['pdb3dar.1', 'pdb3f8s.1', 'pdb2hha.1', 'pdb21bi.2'],
                        help='Names of cavities in the database to compare against.')
    parser.add_argument('-q', '--query', default='pdb21bi.1',
                        help='The name of the cavity to use as a reference.')
    parser.add_argument('-d', '--database',
                        default=os.path.join(CavityDatabase.drugbank_database_dir(),
                                             'drugbank_cavities.sqlite'),
                        help='Database file to use a source for cavities.')
    return parser.parse_args()


def main():
    args = parse_args()
    db = CavityDatabase(args.database)

    # Load the query cavity:
    query_cav = db.get_cavity_by_name(args.query)

    print('=== Cavity comparison ===')
    print('Query\tComparison\tScore')

    for comp in args.comparisons:
        comp_cav = db.get_cavity_by_name(comp)
        try:
            result = query_cav.compare(
                comp_cav,
                comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
            # Output the comparison result:
            print('\t'.join([args.query, comp, str(result.score)]))
        except RuntimeError as e:
            print('\t'.join([args.query, comp, 'Failed: %s' % e.message]))

#####################################################################


if __name__ == '__main__':
    main()

A script to compile an SQLite database from your own data is provided as ccdc/utilities/create_cavity_database/create_cavity_database.py.

Download and extract the PDB header for a cavity

This example shows how to download a PDB file from the wwPDB for a cavity and extract header information using Biopython.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-03-01: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_fetch_pdb_header.py - Retrieve pdb file header information for a cavity
                                 from wwwPDB using Biopython

    Usage: python cavity_fetch_pdb_header.py --query [cavity]
                                             --database [database_path]
'''
#####################################################################

import argparse
from Bio import PDB
from ccdc.cavity import *
from ccdc.utilities import output_file, to_utf8
import os
import requests

import tempfile

#####################################################################


def parse_args():
    """Parse command-line arguments."""
    usage = 'Get pdb file header information for a cavity from the wwwPDB using Biopython.'
    parser = argparse.ArgumentParser(description=usage,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-q', '--query', default='pdb3k2h.8',
                        help='The name of the cavity to use as a reference.')
    parser.add_argument('-d', '--database',
                        default=os.path.join(CavityDatabase.drugbank_database_dir(),
                                             'drugbank_cavities.sqlite'),
                        help='Database file to use a source for cavities.')
    return parser.parse_args()


def fetch_from_pdb(pdb_id, filename):
    """Try to retrieve the given structure from the RCSB web database.

    :param pdb_id: (:obj:`str`) The PDB structure identifier to retrieve.
    :param filename: (:obj:`str`) The file name to write the structure to.

    :raises: requests.HTTPError if an error occurs trying to download the structure.
    """

    # Try to retrieve the structure from the online database.
    pdb_req = requests.get('https://files.rcsb.org/download/%s.pdb' % pdb_id)
    # Raise a requests.HTTPError if any HTTP errors have occurred.
    pdb_req.raise_for_status()

    # Write the structure returned from the PDB to the given file name
    with output_file(filename) as pdb_file:
        pdb_file.write(to_utf8(pdb_req.text))


def extract_pdb_id(cavity_id):
    """Convert Relibase chain id to CSD Python API chain id"""
    pdb_id = cavity_id.split('.')[0]
    pdb_id = pdb_id.replace('pdb', '')
    return pdb_id


def safe_key_value(a_dict, key):
    """Return a value corresponding to the specified header key."""
    value = a_dict.get(key, '')
    return value.strip() if isinstance(value, str) else value


class Header(object):
    def __init__(self, structure):
        self.header = structure.header
        self.structure_id = structure.id.upper()
        self.resolution = safe_key_value(self.header, 'resolution')
        self.structure_method = safe_key_value(self.header, 'structure_method')
        self.deposition_date = safe_key_value(self.header, 'deposition_date')
        self.pdb_title = safe_key_value(self.header, 'name').capitalize()
        self.pdb_class = safe_key_value(self.header, 'head').capitalize()
        self.compound_dict = safe_key_value(self.header, 'compound')
        self.source_dict = safe_key_value(self.header, 'source')
        try:
            self.pdb_chain_list = list({chain.id.upper() for chain in structure.get_chains()})
        except Exception:
            self.pdb_chain_list = []

    def __getitem__(self, key):
        return self.header[key]


def main():
    args = parse_args()
    db = CavityDatabase(args.database)

    # Load the query cavity
    query_cavity = db.get_cavity_by_name(args.query)
    if query_cavity is None:
        raise Exception('Failed to find {} in database {}!'.format(args.query, args.database))
    pdb_id = extract_pdb_id(query_cavity.identifier)

    # load the pdb file for the protein from wwwPDB
    output_dir = tempfile.mkdtemp()
    pdb_file = os.path.join(output_dir, '{}.pdb'.format(pdb_id))
    fetch_from_pdb(pdb_id, pdb_file)

    # strict parser, will assign altloc
    struct_parser = PDB.PDBParser(PERMISSIVE=False, get_header=True)
    pdb_structure = struct_parser.get_structure(pdb_id, pdb_file)
    pdb_header = Header(pdb_structure)

    print("Identifier: {}".format(pdb_id))
    print("Title: {}".format(pdb_header.pdb_title))
    print("Class: {}".format(pdb_header.pdb_class))
    print("Resolution: {}".format(pdb_header.resolution))
    print("Structure method: {}".format(pdb_header.structure_method))
    print("Deposition date: {}".format(pdb_header.deposition_date))
    print("Compound dictionary: {}".format(pdb_header.compound_dict))
    print("Source dictionary: {}".format(pdb_header.source_dict))
    print("Chain list: {}".format(pdb_header.pdb_chain_list))


#####################################################################

if __name__ == '__main__':
    main()

Find the most buried cavity area using k-means clustering and use it for subcavity comparisons

This script allows you to locate the most buried area of a cavity using a k-means clustering algorithm and compare it against other cavities. The value of k giving the number of clusters and the minimum size of the clusters can be adjusted within the script.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-01-15: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_most_buried.py - use pseudocenter depth values and k-means clustering
    to detect the most buried area of a cavity.

    Usage: python cavity_most_buried.py -x c:/path/to/input/xml/directory
                                        -i c:/path/to/input/directory/reference_file.xml
                                        [-v c:/path/to/visualisation/output/directory]
'''
#####################################################################

import argparse
from ccdc.cavity import *
import glob
from math import sqrt
import os
import random

#####################################################################


def parse_args():

    usage_txt = ('Use pseudocenter depth values and k-means clustering to detect',
                 ' the most buried area of a cavity.')
    parser = argparse.ArgumentParser(description=usage_txt,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '-x', '--xml_directory',
        help='Directory for input XML files.',
        required=True
    )
    parser.add_argument(
        '-i', '--input_file',
        help='Reference cavity XML file.',
        required=True)
    parser.add_argument(
        '-v', '--visualisation_directory',
        help='Output directory for PyMol visualisation file.',
        default=None
    )
    parser.add_argument(
        '-s', '--min_subcavity_size',
        help='The minimum size of the buried subcavity (number of pseudocenters).',
        type=int, default=15
    )
    parser.add_argument(
        '-d', '--min_depth',
        help=('Minimum depth value of the pseudocenters to be used for subcavity detection.',
              ' This is in the interval [0,7] where 7 means most buried.'),
        type=float, default=7.0)
    parser.add_argument(
        '-k', '--k_value',
        help='The number of clusters to be detected.',
        type=int, default=3)
    return parser.parse_args()


def k_means_clustering(features, min_depth, num_clusters):

    def euclidean_dist(p, q):
        return sqrt(sum([pow(p[i] - q[i], 2) for i in range(3)]))

    def get_buried_indices(features, min_depth):
        indices = []
        for i in range(len(features)):
            if features[i].burial >= min_depth:
                indices.append(i)
        return indices

    def get_centroid(cluster):
        x = y = z = 0.0
        for feature in cluster:
            x += feature.coordinates[0]
            y += feature.coordinates[1]
            z += feature.coordinates[2]
        n = len(cluster)
        return (x / n, y / n, z / n)

    buried_indices = get_buried_indices(features, min_depth)

    # Return set of empty clusters if not enough points with the specified
    # depth were found:
    if len(buried_indices) < num_clusters:
        return [[] for i in range(num_clusters)]

    clusters = []

    # Initialisation: generate 'num_clusters' clusters
    # each of which contains one random feature.
    used_positions = []
    for i in range(num_clusters):
        new_cluster = []
        while True:
            pos = random.randint(0, len(buried_indices) - 1)
            if pos not in used_positions:
                new_cluster.append(features[buried_indices[pos]])
                used_positions.append(pos)
                break
        clusters.append(new_cluster)

    # Iteration: re-generate clusters until convergence.
    while True:
        cluster_centers = [get_centroid(c) for c in clusters]
        cluster_sizes = [len(c) for c in clusters]
        clusters = [[] for i in range(num_clusters)]   # clear previous clusters

        for bi in buried_indices:
            closest_cluster_center_index = -1
            closest_distance = 10000.0
            for ci in range(len(cluster_centers)):
                d = euclidean_dist(features[bi].coordinates, cluster_centers[ci])
                if d < closest_distance:
                    closest_cluster_center_index = ci
                    closest_distance = d
            clusters[closest_cluster_center_index].append(features[bi])

        new_cluster_sizes = [len(c) for c in clusters]
        if new_cluster_sizes == cluster_sizes:
            break

    return clusters


def visualise_points(cluster, scale=0.50, color=(255, 120, 50)):
    serial = str(random.randint(0, 10000))
    spheres = []
    for f in cluster:
        p = f.coordinates
        spheres.append('7.0, %f, %f, %f, %f' % (p[0], p[1], p[2], scale))
    out = []
    out.append('points_' + serial + ' = [' + ', '.join(spheres) + ']')
    out.append('cmd.load_cgo(points_' + serial + ', \'points_' + serial + '\', 1)')
    out.append('cmd.set_color(\'color_' + serial + '\', [%i, %i, %i])' % color)
    out.append('cmd.color(\'color_' + serial + '\', \'points_' + serial + '\')')
    return os.linesep.join(out)


def main():
    args = parse_args()

    # Load a previously detected cavity:
    cav = Cavity.from_xml_file(args.input_file)

    min_subcavity_size = args.min_subcavity_size

    # Define the initial minimum depth value of the pseudocenters that should be used for
    # the subcavity detection. This is in the interval [0,7] where 7 means most buried.
    min_depth = args.min_depth

    # Define the number of clusters to be detected
    k_value = args.k_value

    # Perform a k-means clustering with a decreasing depth threshold until a cluster was
    # found that is large enough:
    clusters = []
    while min_depth >= 1.0:
        print('Running k-means clustering with min_depth %s' % min_depth)

        clusters = k_means_clustering(cav.features, min_depth, k_value)

        print('Cluster sizes: %s' % [len(c) for c in clusters])
        if max([len(c) for c in clusters]) >= min_subcavity_size:
            break
        min_depth -= 0.5
    print('Detected subcavity has size %s' % max([len(c) for c in clusters]))

    # Write a PyMol visualisation file for the detected clusters if an output
    # path is given
    if args.visualisation_directory is not None:
        out_file_name = os.path.join(args.visualisation_directory, 'cav_clusters.py')
        with open(out_file_name, 'w') as out_file:
            out_file.write('from pymol.cgo import *%s' % os.linesep)
            out_file.write('from pymol import cmd%s' % os.linesep)
            for i in range(len(clusters)):
                color = (80 + 80 * i, 50, 50)
                out_file.write(visualise_points(clusters[i], 0.50, color))
                out_file.write(os.linesep)

    # Find the index of the largest detected cluster
    max_cluster_index = max_cluster_size = -1
    for i in range(len(clusters)):
        if len(clusters[i]) > max_cluster_size:
            max_cluster_index = i
            max_cluster_size = len(clusters[i])

    # Create a subcavity from the largest cluster
    subcav = cav.subcavity(clusters[max_cluster_index])
    # Cavities to be compared with the deeply buried subcavity
    comp_cavs = glob.glob(os.path.join(args.xml_directory, '*.xml'))
    print('%s==========%s' % (os.linesep, os.linesep))
    print('Comparison\tScore')
    for cav_xml in comp_cavs:
        c = Cavity.from_xml_file(cav_xml)
        try:
            comp = subcav.compare(
                c, comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
            score = comp.score
        except RuntimeError as exc:
            score = 'Failed: %s' % str(exc)
        print('\t'.join([c.identifier, str(score)]))

#####################################################################


if __name__ == '__main__':
    main()

Evaluate enrichment in cavity comparison

Screen a cavity against an entire database and obtain a list of all comparison cavities ordered by their degree of similarity. In the example script the cavity pdb3k2h.8, which contains the ligand LYA (Pemetrexed), is used as query and all cavities that accommodate the same ligand are being defined as true positives. Also, the ROC AUC of the enrichment rates is calculated.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_retrieval.py - Screen a cavity against an entire database and
    calculate the ROC AUC of the enrichment rates of cavities binding the same
    ligand.

    Usage: python cavity_retrieval.py
'''
#####################################################################

import argparse
from ccdc.cavity import *
import os

#####################################################################


def parse_args():
    usage_txt = ('Screen a cavity against an entire database and calculate',
                 ' the ROC AUC of the enrichment rates of cavities',
                 ' binding the same ligand.')
    parser = argparse.ArgumentParser(description=usage_txt,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-q', '--query', default='pdb3k22.2',
                        help='The name of the cavity to use as a reference.')
    parser.add_argument('-l', '--ligand', default='LYA',
                        help='The identifier of the ligand within the cavity to define the pocket.')
    parser.add_argument('-d', '--database',
                        default=os.path.join(CavityDatabase.drugbank_database_dir(),
                                             'drugbank_cavities.sqlite'),
                        help='Database file to use a source for cavities.')
    return parser.parse_args()


def get_roc_coords(hits_positions, num_results):
    """ Calculates the coordinates of a ROC plot
        using the obtained positions of true positives and the total
        number of results in the list.
    """
    coords = []
    pos = 0
    for i in range(len(hits_positions)):
        while pos < hits_positions[i]:
            pos += 1
        x = float(pos - i) / (num_results - len(hits_positions))
        y = float(i) / len(hits_positions)
        coords.append((x, y))
    coords.append((1.0, 1.0))
    return coords


def get_auc(coords):
    """ Calculates the area under curve (AUC) for a given set of ROC coordinates """
    auc = 0.0
    for i in range(1, len(coords)):
        auc += (coords[i][0] - coords[i - 1][0]) * coords[i][1]
    return auc


def main():
    args = parse_args()

    # Load the cavity database and query cavity
    db = CavityDatabase(args.database)
    query_cav = db.get_cavity_by_name(args.query)

    if query_cav is None:
        raise RuntimeError(f'Cavity {args.query} not found in database {args.database}')

    # Collect all cavities from the database that bind the target ligand
    hits = db.get_cavity_identifiers_by_ligand(args.ligand)

    # Compare the query cavity against all entries of the database using
    # the cavity histograms comparison method:
    results = db.search(query_cav, Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)

    # Collect and print the positions of the true positives in the result list:
    print('Hit\tPos\tDistance')
    hits_positions = []

    for i in range(len(results) - 1):
        cav_id = results[i][1]

        # Don't output the comparison of the query cavity with itself
        if cav_id == args.query:
            pass

        # Output any other results
        elif cav_id in hits:
            hits_positions.append(i)
            print('\t'.join([cav_id, str(i), str(results[i][0])]))

    # Calculate the ROC coordinates and the corresponding AUC:
    roc = get_roc_coords(hits_positions, len(results))
    auc = get_auc(roc)
    print('%sAUC: %s' % (os.linesep, auc))

#####################################################################


if __name__ == '__main__':
    main()

Create a subcavity and use it for comparisons

This example shows how to generate a subcavity on the basis of a regular cavity and use it for a database screening afterwards. We are using a Thrombin structure (PDB code: 5af9) as the starting point and are interested in creating a new cavity that only comprises the S1 pocket of that enzyme. In order to identify the features that make up the subcavity, we employ the methoxy-benzamide fragment of the bound ligand, as we know that it points directly into the S1 subpocket. The newly created subcavity is then compared against all the other cavities in a database by making use the cavity graph comparison method. The cavity histograms comparison method is not recommended for subcavity comparisons.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2016-12-13: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_subcavity.py - Create a subcavity based on a ligand fragment and
    compare it against a database.

    Usage: python cavity_subcavity.py
'''
#####################################################################

import argparse
from ccdc.cavity import *
import os
import sys

#####################################################################


def parse_args():
    usage_txt = ('Create a subcavity based on a ligand fragment and ',
                 'compare it against a database')
    parser = argparse.ArgumentParser(description=usage_txt,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--pdb_file', default='5af9.pdb',
                        help='The pdb file to work from.')
    parser.add_argument('-l', '--ligand', default='SJR',
                        help='The identifier of the ligand to define the subcavity from.')
    parser.add_argument('-f', '--fragment_labels', nargs='+',
                        default=['C', 'O', 'C1', 'C12', 'C11', 'C2', 'C3', 'C4'],
                        help='The atoms defining the subcavity fragment within the ligand.')
    parser.add_argument('-r', '--subcavity_radius', type=float, default=6.0,
                        help='The radius around the fragment atoms defining the subcavity volume.')
    parser.add_argument('-s', '--max_hit_structures', type=int, default=50,
                        help='The maximum number of structures to compare against.')
    parser.add_argument('-m', '--max_results', type=int, default=10,
                        help='The maximum number of results to output.')
    parser.add_argument('-d', '--database',
                        default=os.path.join(CavityDatabase.drugbank_database_dir(),
                                             'drugbank_cavities.sqlite'),
                        help='Database file to use a source for cavities.')
    return parser.parse_args()


def main():
    args = parse_args()
    cavs = Cavity.from_pdb_file(args.pdb_file)
    pdb = Protein.from_file(args.pdb_file)
    db = CavityDatabase(args.database)

    # Get the ligand matching the given identifier
    ligand = None
    for lig in pdb.ligands:
        if args.ligand in lig.identifier:
            ligand = lig
            break

    # Get the cavity that hosts the ligand:
    query_cav = None
    for cav in cavs:
        for lig in cav.ligand_identifiers:
            if args.ligand in lig:
                query_cav = cav
                break

    if ligand is None:
        print('Could not find the ligand %s in the PDB structure.' % args.ligand)
        sys.exit(1)
    if query_cav is None:
        print('Could not find the cavity hosting the ligand %s.' % args.ligand)
        sys.exit(1)

    # Get the ligand atoms that make up the defined fragment
    atoms = [a for a in ligand.atoms if a.label in args.fragment_labels]

    # Create the subcavity as the volume within the given radius around the fragment atoms
    subcav = query_cav.subcavity(query_cav.features_by_atom_distance(atoms, args.subcavity_radius))

    # Apply settings that limits the maximum number of cavities found in the search
    settings = CavityDatabase.Settings()
    settings.max_hit_structures = args.max_hit_structures
    settings.verbose = True

    # Find matches for the subcavity using the fast cavity graph comparison method
    results = db.search(subcav, settings=settings,
                        comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON)

    # Print the 10 most similar cavities
    print('%sCavity\tScore' % os.linesep)
    for result in results[:args.max_results]:
        print('%s\t%s' % (result[1], result[0]))

#####################################################################


if __name__ == '__main__':
    main()

Perform subcavity comparisons and filter out results from structures homologous to the query

This examples demonstrates the creation of a subcavity using a specified radius around a fragment of a ligand. This is then used in fast cavity graph comparisons against a cavity database and the results of this search are filtered on the basis of a FASTA sequence search. All cavities from proteins with a chain that has statistically significant sequence similarity to the chain associated with the query ligand, as judged by an expectation value below a certain threshold (default 0.001), are removed from the results. Cavity identifiers and scores are written out to a CSV file.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-03-21: created by the Cambridge Crystallographic Data Centre
#
'''
    cavity_subcavity_compare_filter.py - generate a subcavity of a certain
    radius around a specified ligand and perform comparisons against a
    database, filtering out results for proteins within the same homologous
    family using FASTA and Biopython. Expectation (E()) values are used in filtering
    the results, which depend on the size of the database searched. For protein
    searches, sequences with E()-values < 0.001 for searches of a 10,000 entry database
    are almost always homologous. Here, expectation values are calculated for each chain
    separately, and if any of these fall below the specified expectation threshold, all
    cavities associated with this PDB will be excluded from the results. Expectation
    values printed in the output files are the lowest found for the corresponding PDB
    structure, rather than being particularly associated with that cavity. The printed
    sequence similarity is that associated with the chain that led to the lowest expectation
    value.
'''
#####################################################################

import os
import argparse
import requests
import tempfile
import shutil
import errno

from subprocess import call
from collections import Counter
from time import sleep

from Bio import SeqIO, PDB, SearchIO

from ccdc.utilities import _temporary_copy, output_file, to_utf8
from ccdc.protein import Protein
from ccdc.cavity import Cavity, CavityDatabase


class Runner(argparse.ArgumentParser):
    '''Define and parse arguments, and run the comparisons according to the arguments.'''

    def __init__(self):
        '''Defines and parses the arguments.'''
        super(self.__class__, self).__init__(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)

        cavity_dir = CavityDatabase.drugbank_database_dir()
        if cavity_dir is not None:
            default_db = os.path.join(cavity_dir, 'drugbank_cavities.sqlite')
            have_default_db = os.path.exists(default_db)
        else:
            have_default_db = False
        if have_default_db:
            self.add_argument(
                '-d', '--database', default=default_db,
                help='The cavity database to search [%s]' % default_db
            )
        else:
            self.add_argument(
                '-d', '--database', required=True,
                help='The cavity database to search. See create_cavity_database.py if you need to build one'
            )
        self.add_argument(
            '-q',
            '--query_pdb_id',
            help='PDB identifier for the query protein, e.g. 3a7h',
            required=True
        )
        self.add_argument(
            '-l',
            '--ligand_id',
            help='Ligand identifier with residue number for the cavity of interest, e.g. ATP400',
            required=True
        )
        self.add_argument(
            '-c',
            '--ligand_chain_id',
            help='Chain identifier for ligand in the cavity of interest, e.g. B',
            default=None
        )
        self.add_argument(
            '-a',
            '--ligand_atoms', nargs='+',
            help='Ligand atoms used in defining the binding site, '
                 'e.g. N3 C2 N1 C6 C5 N7 C8 N9 C4',
            default=None
        )
        self.add_argument(
            '-r',
            '--radius_around_ligand',
            help='Radius around ligand used in defining the binding site [6.0]',
            type=float,
            default=6.0
        )
        self.add_argument(
            '-m',
            '--max_hit_structures',
            help='Maximum number of structures returned in the database search',
            type=int,
            default=-1
        )
        group = self.add_mutually_exclusive_group()
        group.add_argument(
            '--pdb_mirror', nargs='?',
            help='PDB mirror file path template, e.g. /local/mirror/pub/pdb/data/structures/all/pdb/pdb{}s.ent.gz',
            default=None
        )
        group.add_argument(
            '--pdb_url', nargs='?',
            help='PDB URL file template',
            default='https://files.rcsb.org/download/{}.pdb'
        )
        if 'FASTA_SEQUENCE_SEARCH_TOOL' in os.environ:
            self.add_argument(
                '-f',
                '--fasta_location',
                help='Path to FASTA executable',
                default=os.environ['FASTA_SEQUENCE_SEARCH_TOOL']
            )
        else:
            self.add_argument(
                '-f',
                '--fasta_location',
                help='Path to FASTA executable',
                required=True
            )
        self.add_argument(
            '-e',
            '--expectation_threshold',
            help='Threshold for the FASTA expectation value, below which structures will be considered homologous [0.001]',
            type=float,
            default=0.001
        )
        self.add_argument(
            '-s',
            '--score-filter',
            help='Threshold below which comparison scores will not be written to the output file',
            type=float,
            default=0.0
        )
        self.add_argument(
            '-o',
            '--results_output_file',
            help='Name of output file for results',
            default='output.csv'
        )
        self.add_argument(
            '-x',
            '--excluded_results_output_file',
            help='Name of output file for excluded results',
            default='excluded_output.csv'
        )

        self.args = self.parse_args()
        self.temp_dir = tempfile.mkdtemp()

        self.query_sequence_file = 'query.fasta'
        open(self.query_sequence_file, 'w').close()
        self.targets_sequence_file = 'targets.fasta'
        open(self.targets_sequence_file, 'w').close()
        self.fasta_output_file = 'output.fasta'

        if not os.path.exists(self.args.database):
            raise ValueError('The specified database {} does not exist'.format(self.args.database))

        self.db = CavityDatabase(self.args.database)
        self.settings = CavityDatabase.Settings()
        self.query_pdb_file = 'pdb{}.pdb'.format(self.args.query_pdb_id)
        self.query_protein = self.load_query_protein()
        self.query_cavities = Cavity.from_pdb_file(self.query_pdb_file)
        self.ligand_chain_id = self.args.ligand_chain_id

    @staticmethod
    def safe_key_value(a_dict, key):
        '''Return a value corresponding to the specified header key.'''
        value = a_dict.get(key, '')
        return value.strip() if isinstance(value, str) else value

    def get_pdb_class(self, pdb_id, pdb_file):
        struct_parser = PDB.PDBParser(PERMISSIVE=False, get_header=True)  # strict parser, will assign altloc
        pdb_structure = struct_parser.get_structure(pdb_id, pdb_file)
        classification = self.safe_key_value(pdb_structure.header, 'head').capitalize()
        return classification

    def load_query_protein(self):
        self.fetch_pdb(self.args.query_pdb_id, self.query_pdb_file)
        if not os.path.exists(self.query_pdb_file):
            raise RuntimeError('Unable to fetch file {} for query PDB {}'.format(self.query_pdb_file, self.args.query_pdb_id))
        protein = Protein.from_file(self.query_pdb_file)
        return protein

    def fetch_from_pdb(self, pdb_id, filename):
        """Try to retrieve the given structure from the RCSB web database.

        :param pdb_id: (:obj:`str`) The PDB structure identifier to retrieve.
        :param filename: (:obj:`str`) The file name to write the structure to.

        :raises: requests.HTTPError if an error occurs trying to download the structure.
        """

        # Try to retrieve the structure from the online database.
        pdb_req = requests.get(self.args.pdb_url.format(pdb_id))
        # Raise a requests.HTTPError if any HTTP errors have occurred.
        pdb_req.raise_for_status()

        # Write the structure returned from the PDB to the given file name
        with output_file(filename) as pdb_file:
            pdb_file.write(to_utf8(pdb_req.text))

    def fetch_pdb_from_mirror(self, pdb_id, filename):
        full_path_input_file = self.args.pdb_mirror.format(pdb_id)
        with _temporary_copy(full_path_input_file) as pdb_file:
            shutil.copy2(pdb_file, filename)

    def fetch_pdb(self, pdb_id, filename):
        if self.args.pdb_mirror is not None:
            self.fetch_pdb_from_mirror(pdb_id, filename)
        else:
            for attempt in range(15):
                try:
                    self.fetch_from_pdb(pdb_id, filename)
                except:
                    sleep(1)
                else:
                    break
            else:
                print('Unable to fetch PDB {} from URL {}'.format(pdb_id, self.args.pdb_url.format(pdb_id)))

    def create_subcavity_around_ligand(self):
        '''Return the subcavity of a specified radius around the query ligand.'''
        # Get the ligand that contains the ligand identifier
        query_ligand = None
        for ligand in self.query_protein.ligands:
            chain_id_name = ligand.identifier.split(':')
            if self.args.ligand_id == chain_id_name[-1]:
                if len(chain_id_name) == 1 or self.ligand_chain_id is None or self.ligand_chain_id == chain_id_name[0]:
                    query_ligand = ligand
                    # Set chain ID for selected ligand if none was specified
                    if len(chain_id_name) == 2 and self.ligand_chain_id is None:
                        self.ligand_chain_id = chain_id_name[0]
                    break

        if query_ligand is None:
            raise IndexError('Ligand not found for ligand identifier {}'.format(self.args.ligand_id))
        print('Selected ligand ' + query_ligand.identifier)

        # Get the cavity that hosts the ligand
        query_cav = None
        for cav in self.query_cavities:
            for lig in cav.ligand_identifiers:
                chain_id_name = lig.split(':')
                if chain_id_name[-1] in self.args.ligand_id:
                    if len(chain_id_name) == 1 or self.ligand_chain_id == chain_id_name[0]:
                        query_cav = cav
                        break

        if query_cav is None:
            raise RuntimeError('Could not find the cavity hosting the ligand.')

        print('Selected cavity ' + query_cav.identifier + ' containing ligand ')
        [print(cav) for cav in query_cav.ligand_identifiers]

        # Get the ligand atoms that make up the desired fragment of the ligand
        # around which the subcavity will be defined
        if not self.args.ligand_atoms:
            fragment_labels = [str(a.label) for a in query_ligand.atoms]
        else:
            fragment_labels = self.args.ligand_atoms
        atoms = [a for a in query_ligand.atoms if a.label in fragment_labels]

        # Create the subcavity by using the pseudocenters within a specified radius of the fragment atoms
        subcavity = query_cav.subcavity(query_cav.features_by_atom_distance(atoms, self.args.radius_around_ligand))
        return subcavity

    @staticmethod
    def get_subcavity_chain(subcavity):
        '''Return the ID for the chain associated with the majority of subcavity pseudocenters.'''
        feature_chain_ids = []
        for feature in subcavity.features:
            feature_chain_ids.append(feature.chain.identifier)
        count = Counter(feature_chain_ids)
        chain_id = str(count.most_common(1)[0][0])
        return chain_id

    @staticmethod
    def write_sequence_file(pdb_file, sequence_file, chain_id=None):
        '''Write sequence file for FASTA.'''
        sequences = []
        records = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
        for record in records:
            if not record.seq:
                continue
            if ':' in record.id:
                record_chain_id = record.id.split(':')[1]
            else:
                # Handle the case where Biopython cannot determine the PDB ID by getting this from the file name
                record_chain_id = record.id
                record.id = pdb_file[-8:-4].upper() + ':' + record_chain_id
            if chain_id is not None:
                if record_chain_id == chain_id:
                    sequences.append(record)
                    break
            else:
                sequences.append(record)

        if not sequences:
            raise IndexError('No chain sequences found.')

        with open(sequence_file, 'a') as output_handle:
            for sequence in sequences:
                SeqIO.write(sequence, output_handle, 'fasta')

    def find_expectation_values(self):
        '''Run FASTA and return a dictionary associating a PDB identifier with a tuple of the lowest
        expectation value found during FASTA sequence alignment and the associated sequence similarity.'''
        max_evalue = 0
        for record in SeqIO.parse("query.fasta", "fasta"):
            max_evalue += 1
        for record in SeqIO.parse("targets.fasta", "fasta"):
            max_evalue += 1

        args = ['-q', '-m', '10', '-E', str(max_evalue),
                self.query_sequence_file,
                self.targets_sequence_file]
        print('Running FASTA with arguments ' + str(args) + ' and writing the results to ' + self.fasta_output_file)
        with open(self.fasta_output_file, 'w') as out:
            program = self.args.fasta_location
            call([program] + args, stdout=out)

        pdb_dictionary = {}
        results = SearchIO.read('output.fasta', "fasta-m10")
        for target in results:
            target_pdb_id = target.id.split(':')[0]
            if target_pdb_id.lower() in pdb_dictionary:
                if target[0].evalue < pdb_dictionary[target_pdb_id.lower()][0]:
                    pdb_dictionary[target_pdb_id.lower()] = (target[0].evalue, target[0].pos_pct)
            else:
                pdb_dictionary[target_pdb_id.lower()] = (target[0].evalue, target[0].pos_pct)

        return pdb_dictionary

    def _extract_pdb_code(self, entry_id):
        pdb_code = entry_id[3:7] if entry_id[:3] == 'pdb' else entry_id[:4]
        return pdb_code

    def write_targets_sequence_file(self, results, temp_dir):
        '''Write FASTA sequence file for the PDB structures returned from the database search
        and return the classification of each PDB.'''
        header_classes = []
        target_pdbs = []
        for i in range(len(results)):
            target_pdb_id = self._extract_pdb_code(results[i][1])
            target_pdb_file = os.path.join(temp_dir, 'pdb{}.pdb'.format(target_pdb_id))
            self.fetch_pdb(target_pdb_id, target_pdb_file)
            if not os.path.exists(target_pdb_file):
                continue
            try:
                header_classes.append(self.get_pdb_class(target_pdb_id, target_pdb_file))
            except PDB.PDBExceptions.PDBConstructionException as e:
                print('Unable to determine PDB classification: {}'.format(e))
                header_classes.append('')
            if target_pdb_id.lower() not in target_pdbs:
                self.write_sequence_file(target_pdb_file, self.targets_sequence_file)
                target_pdbs.append(target_pdb_id.lower())
            os.remove(target_pdb_file)
        return header_classes

    def filter_homologous_structures(self, header_classes, results):
        '''Return two lists of results (non-homologous and homologous with query structure).'''
        pdb_dictionary = self.find_expectation_values()
        filtered_results = []
        homologous_results = []
        for result, header_class in zip(results, header_classes):
            expectation_value, similarity = pdb_dictionary[self._extract_pdb_code(result[1]).lower()]
            if expectation_value >= self.args.expectation_threshold:
                filtered_results.append((result[1], result[0], header_class, expectation_value, similarity))
            else:
                homologous_results.append((result[1], result[0], header_class, expectation_value, similarity))
        return filtered_results, homologous_results

    def write_results_file(self, subcavity, results, output_file):
        with open(output_file, 'w') as output:
            output.write('Identifier, Comparison score, Classification, No. pseudocenters, Lowest expectation, Associated similarity\n')
            # Results for subcavity self-self comparison
            output.write(', '.join((str(subcavity.identifier) + ' subcavity', str(subcavity.compare(subcavity).score),
                                    str(self.get_pdb_class(self.args.query_pdb_id, self.query_pdb_file)),
                                    str(len(subcavity.features)), '-1', '-1')) + '\n')
            for i in range(len(results)):
                if results[i][1] >= self.args.score_filter:
                    # Filtered database comparison results
                    output.write(', '.join((results[i][0], str(results[i][1]), results[i][2],
                                            str(len(self.db.get_cavity_by_name(results[i][0]).features)),
                                            str(results[i][3]), str(results[i][4]))) + '\n')

    def run(self):
        '''Run comparisons of the subcavity against the database and filter out structures homologous to the query.'''
        atoms = ''
        if self.args.ligand_atoms is not None:
            atoms = 'atoms {} of '.format(self.args.ligand_atoms)
        chain = ''
        if self.ligand_chain_id is not None:
            chain = self.ligand_chain_id + ':'
        print(u'Creating subcavity of radius {} Angstroms around {}'
              'the {}{} ligand for PDB structure {}'
              .format(self.args.radius_around_ligand,
                      atoms,
                      chain,
                      self.args.ligand_id,
                      self.query_protein.identifier).encode('utf-8'))

        subcavity = self.create_subcavity_around_ligand()
        print('The query subcavity has ' + str(len(subcavity.features)) + ' pseudocenters')

        chain_id = self.get_subcavity_chain(subcavity)
        self.write_sequence_file(self.query_pdb_file, self.query_sequence_file, chain_id)

        self.settings.max_hit_structures = self.args.max_hit_structures
        self.settings.verbose = True
        # Compare the subcavity against the database using the fast cavity graph comparison method
        results = self.db.search(subcavity,
                                 settings=self.settings,
                                 comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON)

        # Make temporary directory for fetching PDB targets for writing sequence file
        try:
            temp_dir = tempfile.mkdtemp()
            header_classes = self.write_targets_sequence_file(results, temp_dir)
        finally:
            try:
                shutil.rmtree(temp_dir)
            except OSError as exc:
                if exc.errno != errno.ENOENT:
                    raise

        filtered_results, homologous_results = self.filter_homologous_structures(header_classes, results)

        # Write cavity identifiers, comparison scores and PDB classifications to a file (sorted by score)
        print('Writing filtered results to {}'.format(self.args.results_output_file))
        self.write_results_file(subcavity, filtered_results, self.args.results_output_file)
        print('Writing excluded results to {}'.format(self.args.excluded_results_output_file))
        self.write_results_file(subcavity, homologous_results, self.args.excluded_results_output_file)

########################################################################################

if __name__ == '__main__':
    Runner().run()

Perform a k-leave-one-out cross-validation

This example shows how to perform a leave-one-out cross-validation (LOOCV) on two classes of binding sites using a k-nearest-neighbor classifier. In the example script, the LOOCV is carried out for k = 1, 3, 5, 7, 9. The two classes of binding sites are Imatinib binders (also known as Gleecev/Glivec) and Pemetrexed binders (also known as Alimta). Both are cytostatics used in cancer treatment. However, Imatinib is a tyrosine kinase inhibitor while Pemetrexed inhibits the folate-dependent enzymes thymidylate synthase, dihydrofolate reductase and glycinamide ribonucleotide formyltransferase. The LOOCV is a method to evaluate how well two classes of binding sites can be separated by making use of the results of a cavity comparison method.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
    k-leave-one-out_cross-validation.py - Generate a matrix of distances by
    using two classes of binding sites and run a k leave-one-out cross-validation
    to determine classification success. First, an all-against-all similarity
    matrix is generated. This matrix is subsequently subjected to a
    leave-one-out cross-validation by using a k-nearest-neighbor method as
    classifier and the success rates are printed for five different values of
    k (1, 3, 5, 7 ,9).

    Usage: python k-leave-one-out_cross-validation.py
'''
#####################################################################

import sys
import argparse

from ccdc.cavity import *

#####################################################################

# Perform a k-leave-one-out cross-validation using a similarity matrix, a value
# for k, and an index that defines the boundary between the first and the
# second class of cavities.
def kloocv(scores, boundary, k):
    def get_lowest_positions(values, n, exclude):
        positions = []
        while len(positions) < n:
            lowest = sys.maxsize
            lowest_pos = -1
            for i in range(len(values)):
                if values[i] < lowest and i not in positions and i != exclude:
                    lowest = values[i]
                    lowest_pos = i
            positions.append(lowest_pos)
        return positions

    total = 0
    correct = 0
    for i in range(len(scores)):
        line = list(scores[i])
        lowests = get_lowest_positions(line, k, i)
        votes_for_class_b = sum(v > boundary for v in lowests)
        votes_for_class_a = (k - votes_for_class_b)
        total += 1
        if i > boundary:
            if votes_for_class_b > votes_for_class_a:
                correct += 1
        else:
            if votes_for_class_a > votes_for_class_b:
                correct += 1
    return (100.0 * correct / total)

def main(cavity_database):
    db = CavityDatabase(cavity_database)
    # Collect the cavities of the two classes to be used:
    sti_cavities = db.get_cavities_by_ligand("STI")  # cavities binding Imatinib (Gleevec/Glivec)
    lya_cavities = db.get_cavities_by_ligand("LYA")  # cavities binding Pemetrexed

    print("Found", len(sti_cavities), "cavities binding STI.")
    print("Found", len(lya_cavities), "cavities binding LYA.")

    cavities = sti_cavities + lya_cavities

    # Build scoring matrix
    scores = []
    for i in range(len(cavities)):
        scores.append([])
        for j in range(len(cavities)):
            scores[i].append(0.0)

    # Insert similarity scores
    print("Performing the comparisons")
    c = 0
    start = time.time()
    for i in range(len(cavities)):
        for j in range(i + 1, len(cavities)):
            d = cavities[i].compare(cavities[j], comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
            scores[i][j] = d
            scores[j][i] = d
            c += 1
    elapsed = round(time.time() - start, 3)
    print("  -> ran", c, "comparisons in", elapsed, "sec.")

    # Run k leave-one-out cross-validation
    print("---")
    print("k leave-one-out cross-validation results:")
    boundary = len(sti_cavities) - 1
    for k in range(1, 11, 2):
        success = kloocv(scores, boundary, k)
        print("Success rate for k =", k, ":", round(success, 1), "%")

#####################################################################

if __name__ == '__main__':

    drugbank_cavity_database = \
        os.path.join(CavityDatabase.drugbank_database_dir(), 'drugbank_cavities.sqlite')
    parser = argparse.ArgumentParser(description='Cavity K Leave One Out Cross Validation.')
    parser.add_argument(
        '-d',
        '--cavity_database',
        help='Cavity sqlite database for testing.',
        default=drugbank_cavity_database)
    args = parser.parse_args()
    main(args.cavity_database)