Cavity examples¶
Compare cavities in proteins using histograms¶
This example allows you to detect and compare cavities from PDB files stored in a user-defined directory. The cavity histograms comparison uses feature distance histograms, and the resulting similarity scores are written to a .csv file. Note that cavity detection is the rate-determining step, so it is best practice to store the detected cavities as XML files or in a cavity database for future reuse and analysis.
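Before the full script below, the core calls are summarised in this minimal sketch (the file names are placeholders and each file is assumed to yield at least one cavity):

from ccdc.cavity import Cavity

# Detect cavities in two hypothetical PDB files and compare the first
# cavity of each using the cavity histograms comparison.
cavities_a = Cavity.from_pdb_file('protein_a.pdb')
cavities_b = Cavity.from_pdb_file('protein_b.pdb')
score = cavities_a[0].compare(
    cavities_b[0],
    comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
print(round(score, 3))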
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_compare.py - compare the cavities in all proteins (PDB files) in a certain directory.
Usage: python cavity_compare.py pdb_directory
'''
#####################################################################
import sys
import argparse
import os
from ccdc.cavity import Cavity
from glob import glob
#####################################################################
def parse_args():
usage_txt = ('Detect cavities from PDB files stored in a user defined directory'
' and perform an all-by-all comparison of the detected cavities.')
parser = argparse.ArgumentParser(
description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-d', '--directory',
help='Directory of PDB files to find cavities in.',
required=True)
parser.add_argument('-m', '--maximum-incomplete-residues-per-chain',type=int,
help='Number of incomplete residues that are permissible when generating cavities',
default=100) # High default - presumably we want all cavities included unless they are really poor otherwise they would not be in the directory
return parser.parse_args()
def main():
# Get command line argument:
args = parse_args()
pdb_dir = args.directory
# Collect PDB files
pdbs = glob(os.path.join(pdb_dir, '*.pdb'))
# At least two PDB files are needed to perform comparisons
if len(pdbs) < 2:
print('Error: At least two PDB files are required. Not enough found in %s' % pdb_dir)
sys.exit(1)
print(' -> Detecting the cavities for all PDB structures in %s' % pdb_dir)
cavs = []
for pdb in pdbs:
cavs += Cavity.from_pdb_file(pdb, maximum_incomplete_residues_per_chain = args.maximum_incomplete_residues_per_chain)
# Generate CSV output
with open('similarities.csv', 'w') as f:
# header:
cav_ids = [''] + [cav.identifier for cav in cavs]
f.write(','.join(cav_ids) + '\n')
# rows:
for i in range(0, len(cavs)):
cells = [cavs[i].identifier]
for j in range(0, len(cavs)):
score = cavs[i].compare(
cavs[j],
comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
cells.append(str(round(score, 3)))
f.write(','.join(cells) + '\n')
print(' -> Similarity matrix written to similarities.csv')
#####################################################################
if __name__ == '__main__':
main()
Overlay pairs of proteins based on their contained cavities¶
This example allows you to detect and compare cavities from a pair of PDB files. The script detects the cavities in two protein structures and then overlays pairs of cavities. The resulting transformation matrices are applied to the input proteins, and new output files containing the superimposed proteins are written.
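Extracted as a minimal sketch (the file names are placeholders and each file is assumed to contain at least one detectable cavity), the overlay step looks like this:

from ccdc.cavity import Cavity
from ccdc.io import MoleculeWriter
from ccdc.protein import Protein

cav_a = Cavity.from_pdb_file('first.pdb')[0]
cav_b = Cavity.from_pdb_file('second.pdb')[0]
protein_b = Protein.from_file('second.pdb')

# The cavity graph comparison returns both a score and a transformation matrix;
# applying the matrix superimposes the second protein onto the first.
comparison = cav_a.compare(
    cav_b, comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
protein_b.transform(comparison.transformation_matrix)
with MoleculeWriter('overlay.mol2') as writer:
    writer.write(protein_b)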
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_pair_view.py - generate cavity overlays for all cavities in a pair of protein structures and then generate mol2 files of the proteins in the overlaid configuration. Also writes a pairwise similarity matrix for the cavities
Usage: python cavity_pair_view.py <pdb file 1> <pdb file 2>
'''
#####################################################################
import sys
import argparse
import os
from ccdc.cavity import Cavity
from ccdc.protein import Protein
from ccdc.io import MoleculeWriter
#####################################################################
def parse_args():
usage_txt = ('Detect cavities from a pair of PDB files and then overlay the proteins based on each cavity match')
parser = argparse.ArgumentParser(
description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('filenames', metavar='<filename>',type=str, nargs='+',
help='Filenames of PDB files to find cavities in.')
return parser.parse_args()
def main():
# Get command line argument:
args = parse_args()
# At least two PDB files are needed to perform comparisons
if len(args.filenames) != 2:
print('Error: You must specify exactly two PDB filenames')
sys.exit(1)
print(' -> Detecting the cavities for all PDB structures in %s and %s' % ( args.filenames[0],args.filenames[1] ))
file_lookup = {}
first_cavs = Cavity.from_pdb_file(args.filenames[0])
second_cavs = Cavity.from_pdb_file(args.filenames[1])
for cav in first_cavs:
file_lookup[cav.identifier] = args.filenames[0]
for cav in second_cavs:
file_lookup[cav.identifier] = args.filenames[1]
# Generate CSV output
with open('similarities.csv', 'w') as f:
# header:
cav_ids = [''] + [cav.identifier for cav in second_cavs]
f.write(','.join(cav_ids) + '\n')
for first_cav in first_cavs:
cells = [first_cav.identifier]
for second_cav in second_cavs:
comparison_object = first_cav.compare(
second_cav,
comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
score = comparison_object.score
cells.append(str(round(score, 3)))
filename = first_cav.identifier + "_" + second_cav.identifier + ".mol2"
m = Protein.from_file(file_lookup[second_cav.identifier])
m.transform(comparison_object.transformation_matrix)
print(' -> Writing %s - a file containing PDB entry %s - superimposition using cavity %s in %s onto cavity %s' % ( filename,m.identifier,second_cav.identifier,m.identifier, first_cav.identifier))
with MoleculeWriter(filename) as writer:
    writer.write(m)
f.write(','.join(cells) + '\n')
print(' -> Similarity matrix written to similarities.csv')
#####################################################################
if __name__ == '__main__':
main()
Compare XML cavity files¶
This script allows you to compare cavity information stored in XML cavity files. You can switch between a cavity histograms comparison, a fast cavity graph comparison and a cavity graph comparison using the appropriate command-line argument. The script also provides a more elaborate example of how to use Python's argument parser.
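The script below selects the comparison method with an if/elif chain; an equivalent, more compact alternative (a sketch only, assuming query_cav, comp_cav and args as defined in the script) is to map the command-line choice onto the enumeration value:

method_map = {
    'HISTOGRAMS': Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON,
    'FAST_GRAPH': Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON,
    'GRAPH': Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON,
}
result = query_cav.compare(comp_cav, comparison_method=method_map[args.method])
# The histograms method returns a plain score; the graph methods return an
# object whose score attribute holds the value.
score = result if args.method == 'HISTOGRAMS' else result.score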
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_compare_args.py - compare a cavity against a set of other
cavities using command line arguments.
Usage: python cavity_compare_args.py -d C:/path/to/xml_cavities
-q query_cavity
-c compare_cavities
-m method
'''
#####################################################################
import argparse
from ccdc.cavity import Cavity
import os
#####################################################################
def parse_args():
usage_txt = 'Compare a cavity against a set of other cavities using command line arguments.'
parser = argparse.ArgumentParser(description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-m', '--method', default='GRAPH',
choices=['HISTOGRAMS', 'FAST_GRAPH', 'GRAPH'],
help='Comparison method to use.')
parser.add_argument('-q', '--query',
help='Query cavity name (e.g. "pdb3jw3.1" for cavity_pdb3jw3.1.xml).',
required=True)
parser.add_argument('-c', '--comparisons', nargs='+', required=True,
help='Cavity names with which to compare, separated by spaces.')
parser.add_argument('-d', '--directory', default='.', help='Directory of XML cavity files.')
return parser.parse_args()
def main():
# Get command line arguments:
args = parse_args()
xml_dir = args.directory
query = args.query
method = args.method
comps = args.comparisons
# Load the query cavity:
query_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % query))
# Do the comparisons:
if method == 'HISTOGRAMS':
# Create histograms for cavity histograms comparison
_ = query_cav.cavity_distance_histograms()
print('=== Cavity histogram comparison ===')
print('Query\tComparison\tScore')
for comp in comps:
try:
comp_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % comp))
score = query_cav.compare(
comp_cav,
comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
print('\t'.join([query, comp, str(score)]))
except RuntimeError as err:
print('\t'.join([query, comp, 'Error: %s' % err]))
elif method == 'FAST_GRAPH':
print('=== Fast graph comparison ===')
print('Query\tComparison\tScore')
for comp in comps:
try:
comp_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % comp))
score = query_cav.compare(
comp_cav,
comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON).score
print('\t'.join([query, comp, str(score)]))
except RuntimeError as err:
print('\t'.join([query, comp, 'Error: %s' % err]))
elif method == 'GRAPH':
print('=== Graph comparison ===')
print('Query\tComparison\tScore')
for comp in comps:
try:
comp_cav = Cavity.from_xml_file(os.path.join(xml_dir, 'cavity_%s.xml' % comp))
score = query_cav.compare(
comp_cav,
comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON).score
print('\t'.join([query, comp, str(score)]))
except RuntimeError as err:
print('\t'.join([query, comp, 'Error: %s' % err]))
#####################################################################
if __name__ == '__main__':
main()
Compare cavities using multiple cores¶
This example shows how to use multiple cores to speed up the comparison of large numbers of cavities.
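The key idea in the script below is to give every worker process its own database connection through a Pool initializer, rather than sharing a single connection between processes. A stripped-down sketch of that pattern (the database path and cavity names are placeholders):

from multiprocessing import get_context
from ccdc.cavity import Cavity, CavityDatabase

process_db = None

def initializer(db_path):
    # Runs once in every worker process and stores a per-process connection.
    global process_db
    process_db = CavityDatabase(db_path)

def compare_pair(pair):
    first = process_db.get_cavity_by_name(pair[0])
    second = process_db.get_cavity_by_name(pair[1])
    return first.compare(
        second, comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON).score

if __name__ == '__main__':
    pairs = [('pdbfirst.1', 'pdbsecond.1')]  # placeholder cavity names
    with get_context('spawn').Pool(2, initializer, initargs=('cavities.sqlite',)) as pool:
        print(pool.map(compare_pair, pairs))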
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-02-02: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_compare_db_multiple_cores.py - Pairwise comparison of cavities
taken from a database using multiple cores.
Usage: python cavity_compare_db_multiple_cores.py -n n_processes
-q query_cavity
-v volume_min
-w volume_max
'''
#####################################################################
import argparse
from multiprocessing import get_context
import os
from ccdc.cavity import *
#####################################################################
process_db = None
def initializer(initargs):
""" Create a database connection for each process. """
global process_db
process_db = CavityDatabase(initargs)
def comparison(cavities):
""" Compare two cavities in a tuple and output the scoring. """
query_cav = process_db.get_cavity_by_name(cavities[0])
comp_cav = process_db.get_cavity_by_name(cavities[1])
try:
result = query_cav.compare(
comp_cav,
comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
print('%s\t%s' % ('\t'.join(cavities[:2]), result.score))
except RuntimeError as exc:
print('%s\tComparison unsuccessful: %s' % ('\t'.join(cavities[:2]), str(exc)))
def mp_handler(comparisons_list, n_processes, db_path):
""" Create a pool of worker processes and distribute the cavity pairs between them. """
with get_context('spawn').Pool(n_processes, initializer, initargs=(db_path,)) as p:
p.map(comparison, comparisons_list)
def parse_args():
usage_txt = 'Pairwise comparison of cavities taken from a database using multiple cores.'
parser = argparse.ArgumentParser(
description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
'-n', '--n_processes',
help='Number of parallel processes',
default=2
)
parser.add_argument(
'-q', '--query',
help='Query cavity name',
required=True
)
parser.add_argument(
'-v', '--volume_min',
help='Minimum volume for target cavities search',
required=True
)
parser.add_argument(
'-w', '--volume_max',
help='Maximum volume for target cavities search',
required=True
)
parser.add_argument(
'-d', '--db_path',
help='Cavity database to use as source data.',
default=os.path.join(CavityDatabase.drugbank_database_dir(), 'drugbank_cavities.sqlite')
)
return parser.parse_args()
def main():
args = parse_args()
n_processes = int(args.n_processes)
query = args.query
volume_min = float(args.volume_min)
volume_max = float(args.volume_max)
db_path = args.db_path
if not os.path.exists(db_path):
print('Cavity database %s does not exist!' % db_path)
exit(1)
print('Comparing cavity %s with a volume range of %s-%s in database %s using %s processes.' %
(query, volume_min, volume_max, db_path, n_processes))
# Create database connection for initial search
db = CavityDatabase(db_path)
query_cav = db.get_cavity_by_name(query)
if query_cav is None:
raise RuntimeError('Query cavity {0} not found in database'.format(query))
# Define database search settings
settings = CavityDatabase.Settings()
settings.volume_range = [volume_min, volume_max]
# Search using settings
comparisons = db.search(settings=settings)
# Make list of comparison cavities fulfilling the search criteria
comparisons_list = []
for c in comparisons:
comparisons_list.append((query, c.identifier))
print('Number of comparisons to be performed: %s' % len(comparisons_list))
print('=== Cavity Graph Comparison ===')
print('Query\tComparison\tScore')
mp_handler(comparisons_list, n_processes, db_path=db_path)
#####################################################################
if __name__ == '__main__':
import ccdc.utilities
ccdc.utilities._fix_multiprocessing_on_macos()
main()
Compare cavities stored in an SQLite database¶
Cavity comparison using an SQLite database is faster than processing individual XML files. Refer to this example to set up a comparison using SQLite input.
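Opening a database and retrieving a cavity by name is a single call each; a minimal sketch using the DrugBank database that the script below uses as its default (the cavity name is taken from the script's defaults):

import os
from ccdc.cavity import CavityDatabase

db_path = os.path.join(CavityDatabase.drugbank_database_dir(), 'drugbank_cavities.sqlite')
db = CavityDatabase(db_path)
cavity = db.get_cavity_by_name('pdb21bi.1')
print(cavity.identifier, len(cavity.features))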
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_compare_db_pairwise.py - Pairwise comparison of cavities taken from a database.
Usage: python cavity_compare_db_pairwise.py -q query_cavity
-c comparison1 comparison2 comparison3
'''
#####################################################################
import argparse
import os
from ccdc.cavity import *
#####################################################################
def parse_args():
usage_txt = 'Pairwise comparison of cavities taken from a database.'
parser = argparse.ArgumentParser(description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-c', '--comparisons', nargs='+',
default=['pdb3dar.1', 'pdb3f8s.1', 'pdb2hha.1', 'pdb21bi.2'],
help='Names of cavities in the database to compare against.')
parser.add_argument('-q', '--query', default='pdb21bi.1',
help='The name of the cavity to use as a reference.')
parser.add_argument('-d', '--database',
default=os.path.join(CavityDatabase.drugbank_database_dir(),
'drugbank_cavities.sqlite'),
help='Database file to use as a source for cavities.')
return parser.parse_args()
def main():
args = parse_args()
db = CavityDatabase(args.database)
# Load the query cavity:
query_cav = db.get_cavity_by_name(args.query)
print('=== Cavity comparison ===')
print('Query\tComparison\tScore')
for comp in args.comparisons:
comp_cav = db.get_cavity_by_name(comp)
try:
result = query_cav.compare(
comp_cav,
comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
# Output the comparison result:
print('\t'.join([args.query, comp, str(result.score)]))
except RuntimeError as e:
print('\t'.join([args.query, comp, 'Failed: %s' % e]))
#####################################################################
if __name__ == '__main__':
main()
A script to compile an SQLite database from your own data is provided as ccdc/utilities/create_cavity_database/create_cavity_database.py.
Download and extract the PDB header for a cavity¶
This example shows how to download a PDB file from the wwPDB for a cavity and extract header information using Biopython.
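The Biopython side of this example amounts to parsing the downloaded file with header parsing enabled; a minimal sketch (the identifier and file name are placeholders):

from Bio import PDB

parser = PDB.PDBParser(PERMISSIVE=False, get_header=True)  # strict parser, header retained
structure = parser.get_structure('3k2h', '3k2h.pdb')
header = structure.header  # a plain dictionary
print(header.get('name'), header.get('resolution'), header.get('deposition_date'))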
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-03-01: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_fetch_pdb_header.py - Retrieve pdb file header information for a cavity
from the wwPDB using Biopython
Usage: python cavity_fetch_pdb_header.py --query [cavity]
--database [database_path]
'''
#####################################################################
import argparse
from Bio import PDB
from ccdc.cavity import *
from ccdc.utilities import output_file, to_utf8
import os
import requests
import tempfile
#####################################################################
def parse_args():
"""Parse command-line arguments."""
usage = 'Get pdb file header information for a cavity from the wwPDB using Biopython.'
parser = argparse.ArgumentParser(description=usage,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-q', '--query', default='pdb3k2h.8',
help='The name of the cavity to use as a reference.')
parser.add_argument('-d', '--database',
default=os.path.join(CavityDatabase.drugbank_database_dir(),
'drugbank_cavities.sqlite'),
help='Database file to use as a source for cavities.')
return parser.parse_args()
def fetch_from_pdb(pdb_id, filename):
"""Try to retrieve the given structure from the RCSB web database.
:param pdb_id: (:obj:`str`) The PDB structure identifier to retrieve.
:param filename: (:obj:`str`) The file name to write the structure to.
:raises: requests.HTTPError if an error occurs trying to download the structure.
"""
# Try to retrieve the structure from the online database.
pdb_req = requests.get('https://files.rcsb.org/download/%s.pdb' % pdb_id)
# Raise a requests.HTTPError if any HTTP errors have occurred.
pdb_req.raise_for_status()
# Write the structure returned from the PDB to the given file name
with output_file(filename) as pdb_file:
pdb_file.write(to_utf8(pdb_req.text))
def extract_pdb_id(cavity_id):
"""Convert Relibase chain id to CSD Python API chain id"""
pdb_id = cavity_id.split('.')[0]
pdb_id = pdb_id.replace('pdb', '')
return pdb_id
def safe_key_value(a_dict, key):
"""Return a value corresponding to the specified header key."""
value = a_dict.get(key, '')
return value.strip() if isinstance(value, str) else value
class Header(object):
def __init__(self, structure):
self.header = structure.header
self.structure_id = structure.id.upper()
self.resolution = safe_key_value(self.header, 'resolution')
self.structure_method = safe_key_value(self.header, 'structure_method')
self.deposition_date = safe_key_value(self.header, 'deposition_date')
self.pdb_title = safe_key_value(self.header, 'name').capitalize()
self.pdb_class = safe_key_value(self.header, 'head').capitalize()
self.compound_dict = safe_key_value(self.header, 'compound')
self.source_dict = safe_key_value(self.header, 'source')
try:
self.pdb_chain_list = list({chain.id.upper() for chain in structure.get_chains()})
except Exception:
self.pdb_chain_list = []
def __getitem__(self, key):
return self.header[key]
def main():
args = parse_args()
db = CavityDatabase(args.database)
# Load the query cavity
query_cavity = db.get_cavity_by_name(args.query)
if query_cavity is None:
raise Exception('Failed to find {} in database {}!'.format(args.query, args.database))
pdb_id = extract_pdb_id(query_cavity.identifier)
# Fetch the PDB file for the protein from the wwPDB
output_dir = tempfile.mkdtemp()
pdb_file = os.path.join(output_dir, '{}.pdb'.format(pdb_id))
fetch_from_pdb(pdb_id, pdb_file)
# strict parser, will assign altloc
struct_parser = PDB.PDBParser(PERMISSIVE=False, get_header=True)
pdb_structure = struct_parser.get_structure(pdb_id, pdb_file)
pdb_header = Header(pdb_structure)
print("Identifier: {}".format(pdb_id))
print("Title: {}".format(pdb_header.pdb_title))
print("Class: {}".format(pdb_header.pdb_class))
print("Resolution: {}".format(pdb_header.resolution))
print("Structure method: {}".format(pdb_header.structure_method))
print("Deposition date: {}".format(pdb_header.deposition_date))
print("Compound dictionary: {}".format(pdb_header.compound_dict))
print("Source dictionary: {}".format(pdb_header.source_dict))
print("Chain list: {}".format(pdb_header.pdb_chain_list))
#####################################################################
if __name__ == '__main__':
main()
Find the most buried cavity area using k-means clustering and use it for subcavity comparisons¶
This script allows you to locate the most buried area of a cavity using a k-means clustering algorithm and compare it against other cavities. The number of clusters (k) and the minimum size of the buried subcavity can be adjusted via command-line arguments.
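The script below implements its own k-means loop over the buried pseudocenters. Purely as an illustrative alternative (assuming numpy and scikit-learn are installed, and cav and min_depth defined as in the script), the same clustering could be delegated to scikit-learn:

import numpy as np
from sklearn.cluster import KMeans

# Cluster only the well-buried pseudocenters, as the script does.
buried = [f for f in cav.features if f.burial >= min_depth]
coords = np.array([list(f.coordinates) for f in buried])
labels = KMeans(n_clusters=3, n_init=10).fit_predict(coords)
largest = max(set(labels), key=list(labels).count)
subcavity_features = [f for f, label in zip(buried, labels) if label == largest]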
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-01-15: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_most_buried.py - use pseudocenter depth values and k-means clustering
to detect the most buried area of a cavity.
Usage: python cavity_most_buried.py -x c:/path/to/input/xml/directory
-i c:/path/to/input/directory/reference_file.xml
[-v c:/path/to/visualisation/output/directory]
'''
#####################################################################
import argparse
from ccdc.cavity import *
import glob
from math import sqrt
import os
import random
#####################################################################
def parse_args():
usage_txt = 'Use pseudocenter depth values and k-means clustering to detect the most buried area of a cavity.'
parser = argparse.ArgumentParser(description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
'-x', '--xml_directory',
help='Directory for input XML files.',
required=True
)
parser.add_argument(
'-i', '--input_file',
help='Reference cavity XML file.',
required=True)
parser.add_argument(
'-v', '--visualisation_directory',
help='Output directory for PyMol visualisation file.',
default=None
)
parser.add_argument(
'-s', '--min_subcavity_size',
help='The minimum size of the buried subcavity (number of pseudocenters).',
type=int, default=15
)
parser.add_argument(
'-d', '--min_depth',
help='Minimum depth value of the pseudocenters to be used for subcavity detection. This is in the interval [0,7] where 7 means most buried.',
type=float, default=7.0)
parser.add_argument(
'-k', '--k_value',
help='The number of clusters to be detected.',
type=int, default=3)
return parser.parse_args()
def k_means_clustering(features, min_depth, num_clusters):
def euclidean_dist(p, q):
return sqrt(sum([pow(p[i] - q[i], 2) for i in range(3)]))
def get_buried_indices(features, min_depth):
indices = []
for i in range(len(features)):
if features[i].burial >= min_depth:
indices.append(i)
return indices
def get_centroid(cluster):
x = y = z = 0.0
for feature in cluster:
x += feature.coordinates[0]
y += feature.coordinates[1]
z += feature.coordinates[2]
n = max(len(cluster), 1)
return (x / n, y / n, z / n)
buried_indices = get_buried_indices(features, min_depth)
# Return set of empty clusters if not enough points with the specified
# depth were found:
if len(buried_indices) < num_clusters:
return [[] for i in range(num_clusters)]
clusters = []
# Initialisation: generate 'num_clusters' clusters
# each of which contains one random feature.
used_positions = []
for i in range(num_clusters):
new_cluster = []
while True:
pos = random.randint(0, len(buried_indices) - 1)
if pos not in used_positions:
new_cluster.append(features[buried_indices[pos]])
used_positions.append(pos)
break
clusters.append(new_cluster)
# Iteration: re-generate clusters until convergence.
while True:
cluster_centers = [get_centroid(c) for c in clusters]
cluster_sizes = [len(c) for c in clusters]
clusters = [[] for i in range(num_clusters)] # clear previous clusters
for bi in buried_indices:
closest_cluster_center_index = -1
closest_distance = 10000.0
for ci in range(len(cluster_centers)):
d = euclidean_dist(features[bi].coordinates, cluster_centers[ci])
if d < closest_distance:
closest_cluster_center_index = ci
closest_distance = d
clusters[closest_cluster_center_index].append(features[bi])
new_cluster_sizes = [len(c) for c in clusters]
if new_cluster_sizes == cluster_sizes:
break
return clusters
def visualise_points(cluster, scale=0.50, color=(255, 120, 50)):
serial = str(random.randint(0, 10000))
spheres = []
for f in cluster:
p = f.coordinates
spheres.append('7.0, %f, %f, %f, %f' % (p[0], p[1], p[2], scale))
out = []
out.append('points_' + serial + ' = [' + ', '.join(spheres) + ']')
out.append('cmd.load_cgo(points_' + serial + ', \'points_' + serial + '\', 1)')
out.append('cmd.set_color(\'color_' + serial + '\', [%i, %i, %i])' % color)
out.append('cmd.color(\'color_' + serial + '\', \'points_' + serial + '\')')
return os.linesep.join(out)
def main():
args = parse_args()
# Load a previously detected cavity:
cav = Cavity.from_xml_file(args.input_file)
min_subcavity_size = args.min_subcavity_size
# Define the initial minimum depth value of the pseudocenters that should be used for
# the subcavity detection. This is in the interval [0,7] where 7 means most buried.
min_depth = args.min_depth
# Define the number of clusters to be detected
k_value = args.k_value
# Perform a k-means clustering with a decreasing depth threshold until a cluster was
# found that is large enough:
clusters = []
while min_depth >= 1.0:
print('Running k-means clustering with min_depth %s' % min_depth)
clusters = k_means_clustering(cav.features, min_depth, k_value)
print('Cluster sizes: %s' % [len(c) for c in clusters])
if max([len(c) for c in clusters]) >= min_subcavity_size:
break
min_depth -= 0.5
print('Detected subcavity has size %s' % max([len(c) for c in clusters]))
# Write a PyMol visualisation file for the detected clusters if an output
# path is given
if args.visualisation_directory is not None:
out_file_name = os.path.join(args.visualisation_directory, 'cav_clusters.py')
with open(out_file_name, 'w') as out_file:
out_file.write('from pymol.cgo import *%s' % os.linesep)
out_file.write('from pymol import cmd%s' % os.linesep)
for i in range(len(clusters)):
color = (80 + 80 * i, 50, 50)
out_file.write(visualise_points(clusters[i], 0.50, color))
out_file.write(os.linesep)
# Find the index of the largest detected cluster
max_cluster_index = max_cluster_size = -1
for i in range(len(clusters)):
if len(clusters[i]) > max_cluster_size:
max_cluster_index = i
max_cluster_size = len(clusters[i])
# Create a subcavity from the largest cluster
subcav = cav.subcavity(clusters[max_cluster_index])
# Cavities to be compared with the deeply buried subcavity
comp_cavs = glob.glob(os.path.join(args.xml_directory, '*.xml'))
print('%s==========%s' % (os.linesep, os.linesep))
print('Comparison\tScore')
for cav_xml in comp_cavs:
c = Cavity.from_xml_file(cav_xml)
try:
comp = subcav.compare(
c, comparison_method=Cavity.ComparisonMethod.CAVITY_GRAPH_COMPARISON)
score = comp.score
except RuntimeError as exc:
score = 'Failed: %s' % str(exc)
print('\t'.join([c.identifier, str(score)]))
#####################################################################
if __name__ == '__main__':
main()
Evaluate enrichment in cavity comparison¶
Screen a cavity against an entire database and obtain a list of all comparison cavities ordered by their degree of similarity. In the example script, the cavity pdb3k2h.8, which contains the ligand LYA (Pemetrexed), is used as the query, and all cavities that accommodate the same ligand are defined as true positives. The ROC AUC of the enrichment rates is also calculated.
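As a quick numerical check of the ROC helpers defined in the script below (illustrative values only): if three true positives are found at result positions 0, 1 and 4 in a list of ten results, get_roc_coords([0, 1, 4], 10) yields the points (0, 0), (0, 1/3), (2/7, 2/3) and (1, 1), and get_auc then sums the step areas to (2/7)(2/3) + (5/7)(1) = 19/21 ≈ 0.90.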
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_retrieval.py - Screen a cavity against an entire database and
calculate the ROC AUC of the enrichment rates of cavities binding the same
ligand.
Usage: python cavity_retrieval.py
'''
#####################################################################
import argparse
from ccdc.cavity import *
import os
#####################################################################
def parse_args():
usage_txt = 'Screen a cavity against an entire database and calculate the ROC AUC of the enrichment rates of cavities binding the same ligand.'
parser = argparse.ArgumentParser(description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-q', '--query', default='pdb3k22.2',
help='The name of the cavity to use as a reference.')
parser.add_argument('-l', '--ligand', default='LYA',
help='The identifier of the ligand within the cavity to define the pocket.')
parser.add_argument('-d', '--database',
default=os.path.join(CavityDatabase.drugbank_database_dir(),
'drugbank_cavities.sqlite'),
help='Database file to use as a source for cavities.')
return parser.parse_args()
def get_roc_coords(hits_positions, num_results):
""" Calculates the coordinates of a ROC plot
using the obtained positions of true positives and the total
number of results in the list.
"""
coords = []
pos = 0
for i in range(len(hits_positions)):
while pos < hits_positions[i]:
pos += 1
x = float(pos - i) / (num_results - len(hits_positions))
y = float(i) / len(hits_positions)
coords.append((x, y))
coords.append((1.0, 1.0))
return coords
def get_auc(coords):
""" Calculates the area under curve (AUC) for a given set of ROC coordinates """
auc = 0.0
for i in range(1, len(coords)):
auc += (coords[i][0] - coords[i - 1][0]) * coords[i][1]
return auc
def main():
args = parse_args()
# Load the cavity database and query cavity
db = CavityDatabase(args.database)
query_cav = db.get_cavity_by_name(args.query)
if query_cav is None:
raise RuntimeError(f'Cavity {args.query} not found in database {args.database}')
# Collect all cavities from the database that bind the target ligand
hits = db.get_cavity_identifiers_by_ligand(args.ligand)
# Compare the query cavity against all entries of the database using
# the cavity histograms comparison method:
results = db.search(query_cav, Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
# Collect and print the positions of the true positives in the result list:
print('Hit\tPos\tDistance')
hits_positions = []
for i in range(len(results) - 1):
cav_id = results[i][1]
# Don't output the comparison of the query cavity with itself
if cav_id == args.query:
pass
# Output any other results
elif cav_id in hits:
hits_positions.append(i)
print('\t'.join([cav_id, str(i), str(results[i][0])]))
# Calculate the ROC coordinates and the corresponding AUC:
roc = get_roc_coords(hits_positions, len(results))
auc = get_auc(roc)
print('%sAUC: %s' % (os.linesep, auc))
#####################################################################
if __name__ == '__main__':
main()
Create a subcavity and use it for comparisons¶
This example shows how to generate a subcavity on the basis of a regular cavity and then use it for a database screening. We use a Thrombin structure (PDB code: 5af9) as the starting point and are interested in creating a new cavity that comprises only the S1 pocket of that enzyme. To identify the features that make up the subcavity, we employ the methoxy-benzamide fragment of the bound ligand, as we know that it points directly into the S1 subpocket. The newly created subcavity is then compared against all the other cavities in a database using the fast cavity graph comparison method. The cavity histograms comparison method is not recommended for subcavity comparisons.
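The subcavity construction itself reduces to two calls, shown here in isolation (the atom labels and radius mirror the script's defaults; cavity and ligand are assumed to be loaded as in the script below):

fragment_labels = ['C', 'O', 'C1', 'C12', 'C11', 'C2', 'C3', 'C4']
atoms = [a for a in ligand.atoms if a.label in fragment_labels]
# Pseudocenters within 6 A of the fragment atoms define the new subcavity.
subcavity = cavity.subcavity(cavity.features_by_atom_distance(atoms, 6.0))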
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2016-12-13: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_subcavity.py - Create a subcavity based on a ligand fragment and
compare it against a database.
Usage: python cavity_subcavity.py
'''
#####################################################################
import argparse
from ccdc.cavity import *
from ccdc.protein import Protein
import os
import sys
#####################################################################
def parse_args():
usage_txt = 'Create a subcavity based on a ligand fragment and compare it against a database.'
parser = argparse.ArgumentParser(description=usage_txt,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-p', '--pdb_file', default='5af9.pdb',
help='The pdb file to work from.')
parser.add_argument('-l', '--ligand', default='SJR',
help='The identifier of the ligand to define the subcavity from.')
parser.add_argument('-f', '--fragment_labels', nargs='+',
default=['C', 'O', 'C1', 'C12', 'C11', 'C2', 'C3', 'C4'],
help='The atoms defining the subcavity fragment within the ligand.')
parser.add_argument('-r', '--subcavity_radius', type=float, default=6.0,
help='The radius around the fragment atoms defining the subcavity volume.')
parser.add_argument('-s', '--max_hit_structures', type=int, default=50,
help='The maximum number of structures to compare against.')
parser.add_argument('-m', '--max_results', type=int, default=10,
help='The maximum number of results to output.')
parser.add_argument('-d', '--database',
default=os.path.join(CavityDatabase.drugbank_database_dir(),
'drugbank_cavities.sqlite'),
help='Database file to use as a source for cavities.')
return parser.parse_args()
def main():
args = parse_args()
cavs = Cavity.from_pdb_file(args.pdb_file)
pdb = Protein.from_file(args.pdb_file)
db = CavityDatabase(args.database)
# Get the ligand matching the given identifier
ligand = None
for lig in pdb.ligands:
if args.ligand in lig.identifier:
ligand = lig
break
# Get the cavity that hosts the ligand:
query_cav = None
for cav in cavs:
for lig in cav.ligand_identifiers:
if args.ligand in lig:
query_cav = cav
break
if ligand is None:
print('Could not find the ligand %s in the PDB structure.' % args.ligand)
sys.exit(1)
if query_cav is None:
print('Could not find the cavity hosting the ligand %s.' % args.ligand)
sys.exit(1)
# Get the ligand atoms that make up the defined fragment
atoms = [a for a in ligand.atoms if a.label in args.fragment_labels]
# Create the subcavity as the volume within the given radius around the fragment atoms
subcav = query_cav.subcavity(query_cav.features_by_atom_distance(atoms, args.subcavity_radius))
# Apply settings that limit the maximum number of cavities found in the search
settings = CavityDatabase.Settings()
settings.max_hit_structures = args.max_hit_structures
settings.verbose = True
# Find matches for the subcavity using the fast cavity graph comparison method
results = db.search(subcav, settings=settings,
comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON)
# Print the most similar cavities (up to --max_results)
print('%sCavity\tScore' % os.linesep)
for result in results[:args.max_results]:
print('%s\t%s' % (result[1], result[0]))
#####################################################################
if __name__ == '__main__':
main()
Perform subcavity comparisons and filter out results from structures homologous to the query¶
This example demonstrates the creation of a subcavity using a specified radius around a fragment of a ligand. The subcavity is then used in fast cavity graph comparisons against a cavity database, and the results of this search are filtered on the basis of a FASTA sequence search. All cavities from proteins with a chain that has statistically significant sequence similarity to the chain associated with the query ligand, as judged by an expectation value below a certain threshold (default 0.001), are removed from the results. Cavity identifiers and scores are written out to a CSV file.
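The sequence side of the workflow relies on Biopython reading SEQRES records directly from a PDB file; a minimal sketch of extracting per-chain sequences for FASTA input (the file name is a placeholder):

from Bio import SeqIO

# One record per chain, taken from the SEQRES records of the PDB file.
for record in SeqIO.parse('pdb3a7h.pdb', 'pdb-seqres'):
    print(record.id, len(record.seq))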
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-03-21: created by the Cambridge Crystallographic Data Centre
#
'''
cavity_subcavity_compare_filter.py - generate a subcavity of a certain
radius around a specified ligand and perform comparisons against a
database, filtering out results for proteins within the same homologous
family using FASTA and Biopython. Expectation (E()) values are used in filtering
the results, which depend on the size of the database searched. For protein
searches, sequences with E()-values < 0.001 for searches of a 10,000 entry database
are almost always homologous. Here, expectation values are calculated for each chain
separately, and if any of these fall below the specified expectation threshold, all
cavities associated with this PDB will be excluded from the results. Expectation
values printed in the output files are the lowest found for the corresponding PDB
structure, rather than being particularly associated with that cavity. The printed
sequence similarity is that associated with the chain that led to the lowest expectation
value.
'''
#####################################################################
import os
import argparse
import requests
import tempfile
import shutil
import errno
from subprocess import call
from collections import Counter
from time import sleep
from Bio import SeqIO, PDB, SearchIO
from ccdc.utilities import _temporary_copy, output_file, to_utf8
from ccdc.protein import Protein
from ccdc.cavity import Cavity, CavityDatabase
class Runner(argparse.ArgumentParser):
'''Define and parse arguments, and run the comparisons according to the arguments.'''
def __init__(self):
'''Defines and parses the arguments.'''
super().__init__(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
cavity_dir = CavityDatabase.drugbank_database_dir()
if cavity_dir is not None:
default_db = os.path.join(cavity_dir, 'drugbank_cavities.sqlite')
have_default_db = os.path.exists(default_db)
else:
have_default_db = False
if have_default_db:
self.add_argument(
'-d', '--database', default=default_db,
help='The cavity database to search [%s]' % default_db
)
else:
self.add_argument(
'-d', '--database', required=True,
help='The cavity database to search. See create_cavity_database.py if you need to build one'
)
self.add_argument(
'-q',
'--query_pdb_id',
help='PDB identifier for the query protein, e.g. 3a7h',
required=True
)
self.add_argument(
'-l',
'--ligand_id',
help='Ligand identifier with residue number for the cavity of interest, e.g. ATP400',
required=True
)
self.add_argument(
'-c',
'--ligand_chain_id',
help='Chain identifier for ligand in the cavity of interest, e.g. B',
default=None
)
self.add_argument(
'-a',
'--ligand_atoms', nargs='+',
help='Ligand atoms used in defining the binding site, '
'e.g. N3 C2 N1 C6 C5 N7 C8 N9 C4',
default=None
)
self.add_argument(
'-r',
'--radius_around_ligand',
help='Radius around ligand used in defining the binding site [6.0]',
type=float,
default=6.0
)
self.add_argument(
'-m',
'--max_hit_structures',
help='Maximum number of structures returned in the database search',
type=int,
default=-1
)
group = self.add_mutually_exclusive_group()
group.add_argument(
'--pdb_mirror', nargs='?',
help='PDB mirror file path template, e.g. /local/mirror/pub/pdb/data/structures/all/pdb/pdb{}s.ent.gz',
default=None
)
group.add_argument(
'--pdb_url', nargs='?',
help='PDB URL file template',
default='https://files.rcsb.org/download/{}.pdb'
)
if 'FASTA_SEQUENCE_SEARCH_TOOL' in os.environ:
self.add_argument(
'-f',
'--fasta_location',
help='Path to FASTA executable',
default=os.environ['FASTA_SEQUENCE_SEARCH_TOOL']
)
else:
self.add_argument(
'-f',
'--fasta_location',
help='Path to FASTA executable',
required=True
)
self.add_argument(
'-e',
'--expectation_threshold',
help='Threshold for the FASTA expectation value, below which structures will be considered homologous [0.001]',
type=float,
default=0.001
)
self.add_argument(
'-s',
'--score-filter',
help='Threshold below which comparison scores will not be written to the output file',
type=float,
default=0.0
)
self.add_argument(
'-o',
'--results_output_file',
help='Name of output file for results',
default='output.csv'
)
self.add_argument(
'-x',
'--excluded_results_output_file',
help='Name of output file for excluded results',
default='excluded_output.csv'
)
self.args = self.parse_args()
self.temp_dir = tempfile.mkdtemp()
self.query_sequence_file = 'query.fasta'
open(self.query_sequence_file, 'w').close()
self.targets_sequence_file = 'targets.fasta'
open(self.targets_sequence_file, 'w').close()
self.fasta_output_file = 'output.fasta'
if not os.path.exists(self.args.database):
raise ValueError('The specified database {} does not exist'.format(self.args.database))
self.db = CavityDatabase(self.args.database)
self.settings = CavityDatabase.Settings()
self.query_pdb_file = 'pdb{}.pdb'.format(self.args.query_pdb_id)
self.query_protein = self.load_query_protein()
self.query_cavities = Cavity.from_pdb_file(self.query_pdb_file)
self.ligand_chain_id = self.args.ligand_chain_id
@staticmethod
def safe_key_value(a_dict, key):
'''Return a value corresponding to the specified header key.'''
value = a_dict.get(key, '')
return value.strip() if isinstance(value, str) else value
def get_pdb_class(self, pdb_id, pdb_file):
struct_parser = PDB.PDBParser(PERMISSIVE=False, get_header=True) # strict parser, will assign altloc
pdb_structure = struct_parser.get_structure(pdb_id, pdb_file)
classification = self.safe_key_value(pdb_structure.header, 'head').capitalize()
return classification
def load_query_protein(self):
self.fetch_pdb(self.args.query_pdb_id, self.query_pdb_file)
if not os.path.exists(self.query_pdb_file):
raise RuntimeError('Unable to fetch file {} for query PDB {}'.format(self.query_pdb_file, self.args.query_pdb_id))
protein = Protein.from_file(self.query_pdb_file)
return protein
def fetch_from_pdb(self, pdb_id, filename):
"""Try to retrieve the given structure from the RCSB web database.
:param pdb_id: (:obj:`str`) The PDB structure identifier to retrieve.
:param filename: (:obj:`str`) The file name to write the structure to.
:raises: requests.HTTPError if an error occurs trying to download the structure.
"""
# Try to retrieve the structure from the online database.
pdb_req = requests.get(self.args.pdb_url.format(pdb_id))
# Raise a requests.HTTPError if any HTTP errors have occurred.
pdb_req.raise_for_status()
# Write the structure returned from the PDB to the given file name
with output_file(filename) as pdb_file:
pdb_file.write(to_utf8(pdb_req.text))
def fetch_pdb_from_mirror(self, pdb_id, filename):
full_path_input_file = self.args.pdb_mirror.format(pdb_id)
with _temporary_copy(full_path_input_file) as pdb_file:
shutil.copy2(pdb_file, filename)
def fetch_pdb(self, pdb_id, filename):
if self.args.pdb_mirror is not None:
self.fetch_pdb_from_mirror(pdb_id, filename)
else:
for attempt in range(15):
try:
self.fetch_from_pdb(pdb_id, filename)
except Exception:  # retry on any error, e.g. a transient network failure
sleep(1)
else:
break
else:
print('Unable to fetch PDB {} from URL {}'.format(pdb_id, self.args.pdb_url.format(pdb_id)))
def create_subcavity_around_ligand(self):
'''Return the subcavity of a specified radius around the query ligand.'''
# Get the ligand that contains the ligand identifier
query_ligand = None
for ligand in self.query_protein.ligands:
chain_id_name = ligand.identifier.split(':')
if self.args.ligand_id == chain_id_name[-1]:
if len(chain_id_name) == 1 or self.ligand_chain_id is None or self.ligand_chain_id == chain_id_name[0]:
query_ligand = ligand
# Set chain ID for selected ligand if none was specified
if len(chain_id_name) == 2 and self.ligand_chain_id is None:
self.ligand_chain_id = chain_id_name[0]
break
if query_ligand is None:
raise IndexError('Ligand not found for ligand identifier {}'.format(self.args.ligand_id))
print('Selected ligand ' + query_ligand.identifier)
# Get the cavity that hosts the ligand
query_cav = None
for cav in self.query_cavities:
for lig in cav.ligand_identifiers:
chain_id_name = lig.split(':')
if chain_id_name[-1] in self.args.ligand_id:
if len(chain_id_name) == 1 or self.ligand_chain_id == chain_id_name[0]:
query_cav = cav
break
if query_cav is None:
raise RuntimeError('Could not find the cavity hosting the ligand.')
print('Selected cavity ' + query_cav.identifier + ' containing ligands:')
for ligand_id in query_cav.ligand_identifiers:
    print(ligand_id)
# Get the ligand atoms that make up the desired fragment of the ligand
# around which the subcavity will be defined
if not self.args.ligand_atoms:
fragment_labels = [str(a.label) for a in query_ligand.atoms]
else:
fragment_labels = self.args.ligand_atoms
atoms = [a for a in query_ligand.atoms if a.label in fragment_labels]
# Create the subcavity by using the pseudocenters within a specified radius of the fragment atoms
subcavity = query_cav.subcavity(query_cav.features_by_atom_distance(atoms, self.args.radius_around_ligand))
return subcavity
@staticmethod
def get_subcavity_chain(subcavity):
'''Return the ID for the chain associated with the majority of subcavity pseudocenters.'''
feature_chain_ids = []
for feature in subcavity.features:
feature_chain_ids.append(feature.chain.identifier)
count = Counter(feature_chain_ids)
chain_id = str(count.most_common(1)[0][0])
return chain_id
@staticmethod
def write_sequence_file(pdb_file, sequence_file, chain_id=None):
'''Write sequence file for FASTA.'''
sequences = []
records = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
for record in records:
if not record.seq:
continue
if ':' in record.id:
record_chain_id = record.id.split(':')[1]
else:
# Handle the case where Biopython cannot determine the PDB ID by getting this from the file name
record_chain_id = record.id
record.id = pdb_file[-8:-4].upper() + ':' + record_chain_id
if chain_id is not None:
if record_chain_id == chain_id:
sequences.append(record)
break
else:
sequences.append(record)
if not sequences:
raise IndexError('No chain sequences found.')
with open(sequence_file, 'a') as output_handle:
for sequence in sequences:
SeqIO.write(sequence, output_handle, 'fasta')
def find_expectation_values(self):
'''Run FASTA and return a dictionary associating a PDB identifier with a tuple of the lowest
expectation value found during FASTA sequence alignment and the associated sequence similarity.'''
max_evalue = 0
for record in SeqIO.parse("query.fasta", "fasta"):
max_evalue += 1
for record in SeqIO.parse("targets.fasta", "fasta"):
max_evalue += 1
args = ['-q', '-m', '10', '-E', str(max_evalue),
self.query_sequence_file,
self.targets_sequence_file]
print('Running FASTA with arguments ' + str(args) + ' and writing the results to ' + self.fasta_output_file)
with open(self.fasta_output_file, 'w') as out:
program = self.args.fasta_location
call([program] + args, stdout=out)
pdb_dictionary = {}
results = SearchIO.read(self.fasta_output_file, "fasta-m10")
for target in results:
target_pdb_id = target.id.split(':')[0]
if target_pdb_id.lower() in pdb_dictionary:
if target[0].evalue < pdb_dictionary[target_pdb_id.lower()][0]:
pdb_dictionary[target_pdb_id.lower()] = (target[0].evalue, target[0].pos_pct)
else:
pdb_dictionary[target_pdb_id.lower()] = (target[0].evalue, target[0].pos_pct)
return pdb_dictionary
def _extract_pdb_code(self, entry_id):
pdb_code = entry_id[3:7] if entry_id[:3] == 'pdb' else entry_id[:4]
return pdb_code
def write_targets_sequence_file(self, results, temp_dir):
'''Write FASTA sequence file for the PDB structures returned from the database search
and return the classification of each PDB.'''
header_classes = []
target_pdbs = []
for i in range(len(results)):
target_pdb_id = self._extract_pdb_code(results[i][1])
target_pdb_file = os.path.join(temp_dir, 'pdb{}.pdb'.format(target_pdb_id))
self.fetch_pdb(target_pdb_id, target_pdb_file)
if not os.path.exists(target_pdb_file):
continue
try:
header_classes.append(self.get_pdb_class(target_pdb_id, target_pdb_file))
except PDB.PDBExceptions.PDBConstructionException as e:
print('Unable to determine PDB classification: {}'.format(e))
header_classes.append('')
if target_pdb_id.lower() not in target_pdbs:
self.write_sequence_file(target_pdb_file, self.targets_sequence_file)
target_pdbs.append(target_pdb_id.lower())
os.remove(target_pdb_file)
return header_classes
def filter_homologous_structures(self, header_classes, results):
'''Return two lists of results (non-homologous and homologous with query structure).'''
pdb_dictionary = self.find_expectation_values()
filtered_results = []
homologous_results = []
for result, header_class in zip(results, header_classes):
expectation_value, similarity = pdb_dictionary[self._extract_pdb_code(result[1]).lower()]
if expectation_value >= self.args.expectation_threshold:
filtered_results.append((result[1], result[0], header_class, expectation_value, similarity))
else:
homologous_results.append((result[1], result[0], header_class, expectation_value, similarity))
return filtered_results, homologous_results
def write_results_file(self, subcavity, results, output_file):
with open(output_file, 'w') as output:
output.write('Identifier, Comparison score, Classification, No. pseudocenters, Lowest expectation, Associated similarity\n')
# Results for subcavity self-self comparison
output.write(', '.join((str(subcavity.identifier) + ' subcavity', str(subcavity.compare(subcavity).score),
str(self.get_pdb_class(self.args.query_pdb_id, self.query_pdb_file)),
str(len(subcavity.features)), '-1', '-1')) + '\n')
for i in range(len(results)):
if results[i][1] >= self.args.score_filter:
# Filtered database comparison results
output.write(', '.join((results[i][0], str(results[i][1]), results[i][2],
str(len(self.db.get_cavity_by_name(results[i][0]).features)),
str(results[i][3]), str(results[i][4]))) + '\n')
def run(self):
'''Run comparisons of the subcavity against the database and filter out structures homologous to the query.'''
atoms = ''
if self.args.ligand_atoms is not None:
atoms = 'atoms {} of '.format(self.args.ligand_atoms)
chain = ''
if self.ligand_chain_id is not None:
chain = self.ligand_chain_id + ':'
print('Creating subcavity of radius {} Angstroms around {}'
      'the {}{} ligand for PDB structure {}'.format(
          self.args.radius_around_ligand,
          atoms,
          chain,
          self.args.ligand_id,
          self.query_protein.identifier))
subcavity = self.create_subcavity_around_ligand()
print('The query subcavity has ' + str(len(subcavity.features)) + ' pseudocenters')
chain_id = self.get_subcavity_chain(subcavity)
self.write_sequence_file(self.query_pdb_file, self.query_sequence_file, chain_id)
self.settings.max_hit_structures = self.args.max_hit_structures
self.settings.verbose = True
# Compare the subcavity against the database using the fast cavity graph comparison method
results = self.db.search(subcavity,
settings=self.settings,
comparison_method=Cavity.ComparisonMethod.FAST_CAVITY_GRAPH_COMPARISON)
# Make temporary directory for fetching PDB targets for writing sequence file
try:
temp_dir = tempfile.mkdtemp()
header_classes = self.write_targets_sequence_file(results, temp_dir)
finally:
try:
shutil.rmtree(temp_dir)
except OSError as exc:
if exc.errno != errno.ENOENT:
raise
filtered_results, homologous_results = self.filter_homologous_structures(header_classes, results)
# Write cavity identifiers, comparison scores and PDB classifications to a file (sorted by score)
print('Writing filtered results to {}'.format(self.args.results_output_file))
self.write_results_file(subcavity, filtered_results, self.args.results_output_file)
print('Writing excluded results to {}'.format(self.args.excluded_results_output_file))
self.write_results_file(subcavity, homologous_results, self.args.excluded_results_output_file)
########################################################################################
if __name__ == '__main__':
Runner().run()
Perform a k-leave-one-out cross-validation¶
This example shows how to perform a leave-one-out cross-validation (LOOCV) on two classes of binding sites using a k-nearest-neighbor classifier. In the example script, the LOOCV is carried out for k = 1, 3, 5, 7 and 9. The two classes of binding sites are Imatinib binders (also known as Gleevec/Glivec) and Pemetrexed binders (also known as Alimta). Both are cytostatics used in cancer treatment; however, Imatinib is a tyrosine kinase inhibitor, while Pemetrexed inhibits the folate-dependent enzymes thymidylate synthase, dihydrofolate reductase and glycinamide ribonucleotide formyltransferase. LOOCV is a method for evaluating how well two classes of binding sites can be separated using the results of a cavity comparison method.
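As a concrete illustration of the voting step (made-up numbers): with five STI cavities occupying matrix positions 0-4 (so boundary = 4) and k = 3, a left-out LYA cavity whose three nearest neighbours sit at positions 2, 7 and 9 receives two votes for the LYA class (the positions above the boundary) and one for the STI class, and is therefore classified correctly.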
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-02-25: created by the Cambridge Crystallographic Data Centre
#
'''
k-leave-one-out_cross-validation.py - Generate a matrix of distances by
using two classes of binding sites and run a k leave-one-out cross-validation
to determine classification success. First, an all-against-all similarity
matrix is generated. This matrix is subsequently subjected to a
leave-one-out cross-validation by using a k-nearest-neighbor method as
classifier and the success rates are printed for five different values of
k (1, 3, 5, 7, 9).
Usage: python k-leave-one-out_cross-validation.py
'''
#####################################################################
import sys
import argparse
import os
import time
from ccdc.cavity import *
#####################################################################
# Perform a k-leave-one-out cross-validation using a similarity matrix, a value
# for k, and an index that defines the boundary between the first and the
# second class of cavities.
def kloocv(scores, boundary, k):
def get_lowest_positions(values, n, exclude):
positions = []
while len(positions) < n:
lowest = sys.maxsize
lowest_pos = -1
for i in range(len(values)):
if values[i] < lowest and i not in positions and i != exclude:
lowest = values[i]
lowest_pos = i
positions.append(lowest_pos)
return positions
total = 0
correct = 0
for i in range(len(scores)):
line = list(scores[i])
lowests = get_lowest_positions(line, k, i)
votes_for_class_b = sum(v > boundary for v in lowests)
votes_for_class_a = (k - votes_for_class_b)
total += 1
if i > boundary:
if votes_for_class_b > votes_for_class_a:
correct += 1
else:
if votes_for_class_a > votes_for_class_b:
correct += 1
return (100.0 * correct / total)
def main(cavity_database):
db = CavityDatabase(cavity_database)
# Collect the cavities of the two classes to be used:
sti_cavities = db.get_cavities_by_ligand("STI") # cavities binding Imatinib (Gleevec/Glivec)
lya_cavities = db.get_cavities_by_ligand("LYA") # cavities binding Pemetrexed
print("Found", len(sti_cavities), "cavities binding STI.")
print("Found", len(lya_cavities), "cavities binding LYA.")
cavities = sti_cavities + lya_cavities
# Build scoring matrix
scores = []
for i in range(len(cavities)):
scores.append([])
for j in range(len(cavities)):
scores[i].append(0.0)
# Insert similarity scores
print("Performing the comparisons")
c = 0
start = time.time()
for i in range(len(cavities)):
for j in range(i + 1, len(cavities)):
d = cavities[i].compare(cavities[j], comparison_method=Cavity.ComparisonMethod.CAVITY_HISTOGRAMS_COMPARISON)
scores[i][j] = d
scores[j][i] = d
c += 1
elapsed = round(time.time() - start, 3)
print(" -> ran", c, "comparisons in", elapsed, "sec.")
# Run k leave-one-out cross-validation
print("---")
print("k leave-one-out cross-validation results:")
boundary = len(sti_cavities) - 1
for k in range(1, 11, 2):
success = kloocv(scores, boundary, k)
print("Success rate for k =", k, ":", round(success, 1), "%")
#####################################################################
if __name__ == '__main__':
drugbank_cavity_database = \
os.path.join(CavityDatabase.drugbank_database_dir(), 'drugbank_cavities.sqlite')
parser = argparse.ArgumentParser(description='Cavity K Leave One Out Cross Validation.')
parser.add_argument(
'-d',
'--cavity_database',
help='Cavity sqlite database for testing.',
default=drugbank_cavity_database)
args = parser.parse_args()
main(args.cavity_database)