Docking examples¶
Note
The docking features are available only to CSD-Discovery and CSD-Enterprise users.
Docking and Rescoring¶
This example shows how a docking or rescoring may be performed from a protein and a set of ligands. It is not intended to be a production-quality protein-ligand docking, but to exemplify the technology.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
######################################################################
'''
docking_and_rescoring.py - read protein, perform minimal clean-up, define
docking or rescoring and run the docker.
Note that any realistic docking would require a great deal of thought
to prepare the protein properly. This example shows the minimum
that needs to be done to define a docking.
The script exemplifies docking and rescoring.
'''
###########################################################################
import argparse
import tempfile
import os
from ccdc.io import EntryWriter, MoleculeReader
from ccdc.protein import Protein
from ccdc.docking import Docker
class Runner(argparse.ArgumentParser):
'''Parses arguments, runs the docking.'''
def __init__(self):
'''Defines the arguments.'''
super(self.__class__, self).__init__(description=__doc__)
self.add_argument(
'protein_file_name',
help='The protein file name'
)
self.add_argument(
'-l', '--ligand-file-name',
help='Ligand file. If not supplied the substrate ligand will be docked. If rescoring only, this file should contain the docked ligands.'
)
self.add_argument(
'-rl', '--reference-ligand-file-name',
help='Reference ligand file.',
required=True
)
self.add_argument(
'-s', '--speed',
default='very fast',
choices=('very fast', 'fast', 'medium', 'slow', 'very slow'),
help='How thorough the docking attempts should be. The values correspond to 10%%, 25%%, 50%%, 75%% and 100%% autoscaling of docking parameters.'
)
self.add_argument(
'-n', '--ndocks', default=10, type=int,
help='Number of docking attempts to run.'
)
self.add_argument(
'-d', '--directory',
help='Output directory for the docking. Default is a temporary directory.'
)
self.add_argument(
'-f', '--fitness-function', choices=('plp', 'asp', 'chemscore', 'goldscore'),
help='Fitness function to use (default None).'
)
self.add_argument(
'-r', '--rescore-function', choices=('plp', 'asp', 'chemscore', 'goldscore'),
help='Rescore function to use (default None)'
)
self.args = self.parse_args()
self.docker = None
self.settings = None
self.ref_ligand = None
self.protein = None
def run(self):
'''Run the docking.'''
self.create_docker()
protein_ligands = self.prepare_protein()
self.prepare_ligands(protein_ligands)
self.prepare_binding_site()
self.dock()
def is_rescore(self):
return self.args.rescore_function and not self.args.fitness_function
def create_docker(self):
'''Create a docker.'''
self.docker = Docker()
self.settings = self.docker.settings
if self.args.directory:
if not os.path.isdir(self.args.directory):
os.makedirs(self.args.directory)
else:
self.args.directory = tempfile.mkdtemp()
self.settings.output_directory = self.args.directory
if self.is_rescore():
self.settings.output_file = 'rescored_ligands.mol2'
else:
self.settings.output_file = 'docked_ligands.mol2'
self.settings.output_format = 'mol2'
self.settings.autoscale = {
'very fast': 10,
'fast': 25,
'medium': 50,
'slow': 75,
'very slow': 100
}[self.args.speed]
self.settings.fitness_function = self.args.fitness_function
if self.args.rescore_function:
self.settings.rescore_function = self.args.rescore_function
if self.args.reference_ligand_file_name:
self.settings.reference_ligand_file = self.args.reference_ligand_file_name
self.ref_ligand = MoleculeReader(self.args.reference_ligand_file_name)[0]
self.settings.reference_ligand_file = self.args.reference_ligand_file_name
def prepare_binding_site(self):
'''Define the binding site from the reference ligand.'''
ligand_cog = self.ref_ligand.centre_of_geometry()
self.settings.binding_site = self.settings.BindingSiteFromPoint(
None, ligand_cog, 12.0
)
def prepare_protein(self):
'''Prepare the protein and remove any ligands.'''
self.protein = Protein.from_file(self.args.protein_file_name)
self.protein.remove_all_waters()
self.protein.remove_unknown_atoms()
self.protein.add_hydrogens()
ligands = self.protein.ligands
for l in ligands:
self.protein.remove_ligand(l.identifier)
protein_file_name = os.path.join(
self.settings.output_directory, 'clean_%s.mol2' % self.protein.identifier
)
with EntryWriter(protein_file_name) as writer:
writer.write(self.protein)
self.settings.add_protein_file(protein_file_name)
return ligands
def prepare_ligands(self, protein_ligands):
'''Write an appropriate ligand file to the docking directory, if required.'''
if not self.args.ligand_file_name:
if protein_ligands:
self.args.ligand_file_name = os.path.join(
self.settings.output_directory,
'protein_ligands.mol2'
)
with EntryWriter(self.args.ligand_file_name) as writer:
for ligand in protein_ligands:
writer.write(ligand)
else:
raise RuntimeError('No ligand file given and no ligands in the protein.')
self.settings.add_ligand_file(self.args.ligand_file_name, self.args.ndocks)
def dock(self):
'''Do the docking.'''
self.docker.dock()
print('Docking completed. Results are in %s' % self.settings.output_directory)
###########################################################################
if __name__ == '__main__':
runner = Runner()
runner.run()
Interactive docking¶
This example shows how the results of a docking may be used to inform further dockings in an iterative fashion.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
'''
similarity_docking.py - search the CSD for similar structures to a known
ligand, dock them, saving structures which score better than the known ligand.
This exemplifies using an interactive docker.
'''
######################################################################
import sys
import os
import argparse
from ccdc.io import EntryReader, EntryWriter
from ccdc.search import SimilaritySearch
from ccdc.docking import Docker
from ccdc.entry import Entry
########################################################################################
class SimilarityDocking(argparse.ArgumentParser):
'''Defines arguments and runs the job.'''
def __init__(self):
'''Defines the arguments.'''
super(self.__class__, self).__init__(description=__doc__)
# Input and output options
self.add_argument(
'conf_file_name',
help='The configuration file to use.'
)
self.add_argument(
'-l', '--ligand-file-name', default=None,
help='Ligand to use in the docking'
)
self.add_argument(
'-o', '--output-directory', default=os.path.abspath('output'),
help='Directory in which results will be stored.'
)
# Similarity search parameters
self.add_argument(
'-m', '--max_hit_structures', default=None, type=int,
help='The maximum number of CSD structures to try and dock (default: unlimited).'
)
# Docking parameters
self.add_argument(
'-f', '--fitness-function', default=None,
help='The fitness function to use.'
)
self.add_argument(
'-n', '--ndocks', default=10, type=int,
help='The number of docking attempts to make.'
)
self.add_argument(
'-s', '--speed', default=None,
choices=('very fast', 'fast', 'medium', 'slow', 'very slow'),
help=('How thorough the docking attempts should be. ',
'Values correspond to 10%%, 25%%, 50%%, 75%% and 100%% ',
'auto-scaling of docking parameters.')
)
def get_arguments(self):
'''Extract and process the arguments.'''
self.args = self.parse_args()
self.settings = Docker.Settings.from_file(self.args.conf_file_name)
if self.args.output_directory:
self.settings.output_directory = self.args.output_directory
if self.args.fitness_function is not None:
self.settings.fitness_function = self.args.fitness_function
if self.args.speed is not None:
self.settings.autoscale = {
'very fast': 10,
'fast': 25,
'medium': 50,
'slow': 75,
'very slow': 100
}[self.args.speed]
self.ligand_info = self.settings.ligand_files
if self.args.ligand_file_name is None:
self.args.ligand_file_name = self.ligand_info[0].file_name
self.settings.clear_ligand_files()
self.settings.set_hostname(ndocks=self.args.ndocks)
self.docker = Docker(settings=self.settings)
if not os.path.isdir(self.settings.output_directory):
os.makedirs(self.settings.output_directory)
self.winner_writer = EntryWriter(
os.path.join(self.settings.output_directory, 'winners.mol2')
)
self.better_writer = EntryWriter(
os.path.join(self.settings.output_directory, 'better.mol2')
)
def run(self):
'''Runs the job.'''
self.get_arguments()
self.session = self.docker.dock(
os.path.join(self.settings.output_directory, 'settings.conf'),
'interactive'
)
self.run_substrate()
self.done = set()
dock_index = 1
while True:
hits = self.search()
if not hits:
break
for h in hits:
self.done.add(h.identifier)
mol = h.molecule
if mol.all_atoms_have_sites:
mol.assign_bond_types()
mol.add_hydrogens()
entry = Entry.from_molecule(mol)
poses = self.session.dock(entry)
dock_index += 1
if poses:
score, pose = self.best(poses)
for inx, p in enumerate(poses):
with EntryWriter(
os.path.join(
self.settings.output_directory,
'gold_soln_CSD_API_m%d_%d.mol2' % (dock_index, inx + 1)
)
) as w:
w.write(p)
if score > self.best_score:
print('%-8s: %.3f Winner!' % (h.identifier, score))
self.best_score, self.best_pose = score, pose
self.winner_writer.write(self.best_pose)
else:
print('%-8s: %.3f' % (h.identifier, score))
if score > self.substrate_score:
self.better_writer.write(pose)
else:
print('%-8s: Failed to dock' % h.identifier)
else:
print('%-8s: No 3d structure' % h.identifier)
def search(self):
'''Similarity search the csd.'''
threshold = 1.0
hits = []
mol = self.best_pose.molecule
mol.standardise_aromatic_bonds()
while not hits and round(threshold, 6) > 0.5:
# initially just search for CSD structures similar to the ligand
threshold -= 0.1
simsearcher = SimilaritySearch(mol, threshold)
simsearcher.settings.only_organic = True
simsearcher.settings.has_3d_coordinates = True
hits = [h for h in simsearcher.search(max_hit_structures=self.args.max_hit_structures)
if h.identifier not in self.done]
print('Found %s structures similar to %s with threshold %.2f.' %
(len(hits), mol.identifier, threshold))
if hits:
# then further specify the search by finding structures
# similar to individual components of hits
def match_components(h):
if len(h.molecule.components) > 1:
for m in h.molecule.components:
m.identifier = h.identifier
hh = simsearcher.search(m)
if hh:
return hh[0]
else:
return None
# raise RuntimeError('Similarity search failed to match any component')
else:
return h
hits = [x for x in [match_components(h) for h in hits] if x is not None]
print('%d new hits with threshold %.2f against %s' %
(len(hits), threshold, mol.identifier))
return hits
else:
print('No hits with threshold %.2f against %s' % (threshold, mol.identifier))
def run_substrate(self):
'''Runs the substrate, for the comparison.'''
self.substrate = EntryReader(self.args.ligand_file_name)[0]
self.substrate.identifier = self.substrate.identifier.replace(':', '_')
poses = self.session.dock(self.substrate)
if not poses:
print('FAILED TO DOCK SUBSTRATE')
sys.exit()
self.best_score, self.best_pose = self.best(poses)
self.substrate_score = self.best_score
for inx, p in enumerate(poses):
outfile_path = os.path.join(self.settings.output_directory,
'gold_soln_CSD_API_m1_%d.mol2' % (inx + 1))
with EntryWriter(outfile_path) as output_file:
output_file.write(p)
print('%-9s: %.3f' % ('Substrate', self.best_score))
def best(self, poses):
'''Extracts best score and best structure from the docked poses.'''
fitness_tag = dict(
plp='Gold.PLP.Fitness',
chemscore='Gold.Chemscore.Fitness',
goldscore='Gold.Goldscore.Fitness',
asp='Gold.ASP.Fitness'
)[self.settings.fitness_function]
scores = [
float(e.attributes[fitness_tag]) for e in poses
]
best_score = max(scores)
best_pose = poses[scores.index(best_score)]
return best_score, best_pose
########################################################################################
if __name__ == '__main__':
similarity_docking = SimilarityDocking()
similarity_docking.run()
Testing fitness functions¶
This example shows how the performance of docking and rescoring functions can be systematically evaulated, how the pool method can be employed to parallize docking runs and the pandas module can be used to help with statistical analysis
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-02-10: created by the Cambridge Crystallographic Data Centre
#
#
'''
This simple example script uses docking to evaluate different scoring functions
Some of the analysis code depends on the numpy package and the pandas package
being installed.
Presentation of the results depends on the package matplotlib.
.. seealso:: <https://www.numpy.org/> for more information on numpy.
.. seealso:: <https://pandas.pydata.org/> for more information on pandas.
.. seealso:: <https://matplotlib.org/> for more information on matplotlib.
'''
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import argparse
import os
import multiprocessing
from multiprocessing import Pool
import re
import numpy as np
import pandas as pd
from ccdc.docking import Docker
from ccdc.io import MoleculeReader, EntryReader
from ccdc.protein import Protein
def run_gold(params):
(protein, ligand, reference, ndocks,
fitness_function, rescore_function, no_run, base_out_dir, max_results) = params
protein_file = os.path.abspath(protein)
ligand_file = os.path.abspath(ligand)
native_ligand_file = os.path.abspath(reference)
docker = Docker()
settings = docker.settings
protein = Protein.from_file(protein_file)
ligand = MoleculeReader(ligand_file)[0]
native_ligand = MoleculeReader(native_ligand_file)[0]
settings.add_protein_file(protein_file)
settings.add_ligand_file(ligand_file, ndocks)
settings.reference_ligand_file = reference
settings.binding_site = settings.BindingSiteFromLigand(protein, native_ligand, 6.0)
settings.diverse_solutions = (True, 3, 1.0)
# The max_results parameter can be passed in to speed up runs for testing purposes.
# However this WILL reduce the usefulness of the results as the set of structures to compare
# is reduced to the given number.
if max_results is not None:
settings.early_termination = (True, max_results, 1.0)
else:
settings.early_termination = False
settings.fitness_function = fitness_function
rescore_label = ''
if rescore_function != 'None':
settings.rescore_function = rescore_function
rescore_label = "score_" + str(rescore_function)
else:
rescore_function = fitness_function
output_dir = os.path.abspath(get_docking_directory(base_out_dir,
fitness_function, rescore_function, no_run))
settings.output_directory = output_dir
if not os.path.exists(output_dir):
os.makedirs(output_dir)
settings.output_file = os.path.join(
output_dir, get_docking_file(fitness_function, rescore_function, no_run))
docker.dock(file_name=os.path.join(output_dir,
get_conf_file(fitness_function, rescore_function, no_run)))
def rescore_gold(params):
(reference, fitness_function, rescore_function, no_run, base_out_dir) = params
in_path = get_docking_directory(base_out_dir, fitness_function, fitness_function, no_run)
protein_file = os.path.join(in_path, 'gold_protein.mol2')
ligand_file = os.path.join(in_path,
get_docking_file(fitness_function, fitness_function, no_run))
# protein settings for docking
docker = Docker()
settings = docker.settings
protein = Protein.from_file(protein_file)
settings.add_protein_file(os.path.join(protein_file))
settings.add_ligand_file(ligand_file, 1)
# define the binding site using the reference ligand
native_ligand_file = os.path.abspath(reference)
native_ligand = MoleculeReader(native_ligand_file)[0]
settings.binding_site = settings.BindingSiteFromLigand(protein, native_ligand, 6.)
settings.reference_ligand_file = native_ligand_file
settings.fitness_function = None
settings.rescore_function = rescore_function
output_dir = get_docking_directory(base_out_dir, fitness_function, rescore_function, no_run)
settings.output_directory = output_dir
if not os.path.exists(output_dir):
os.makedirs(output_dir)
settings.output_file = os.path.join(output_dir, get_docking_file(
fitness_function, rescore_function, no_run))
docker.dock(file_name=os.path.join(get_docking_directory(base_out_dir, fitness_function,
rescore_function, no_run),
get_conf_file(fitness_function, rescore_function, no_run)))
def get_conf_file(fitness_function, scoring_function, no_run):
return '%s_score_%s_run_%s.conf' % (fitness_function, scoring_function, no_run)
def get_docking_directory(base_dir, fitness_function, scoring_function, no_run):
return os.path.abspath(os.path.join(base_dir,
'fitness_%s' % fitness_function,
'score_%s' % scoring_function,
'run_%s' % no_run))
def get_docking_file(fitness_function, scoring_function, no_run):
return 'poses_%s_score_%s_run_%s.mol2' % (fitness_function, scoring_function, no_run)
def get_path(base_dir, fitness_function, scoring_function, no_run):
return os.path.join(
get_docking_directory(base_dir, fitness_function, scoring_function, no_run),
get_docking_file(fitness_function, scoring_function, no_run))
def get_scores(ndocks, nruns, output_directory):
SF = ('goldscore', 'plp', 'chemscore', 'asp')
re_RMSD = re.compile(r'\w*\.Reference\.RMSD')
re_FITNESS = re.compile(r'.+\..+\.Fitness')
df = {}
pose_metrics = {}
for run in range(1, int(ndocks) + 1):
for dockfunc in SF:
# dockmols = {}
dock_entries = {}
poselist = {}
for sf in SF:
this_path = get_path(output_directory, dockfunc, sf, run)
# dockmols[sf] = MoleculeReader(this_path)
dock_entries[sf] = EntryReader(this_path)
for i in range(0, int(nruns)):
table = {}
for sf in SF:
table['Docking_function'] = sf
d = dock_entries[sf][i]
# m = dockmols[sf][i]
for key, value in d.attributes.items():
test = re_RMSD.match(key)
if test:
table[key] = value
test_fitness = re_FITNESS.match(key)
if test_fitness:
table[key] = value
series = pd.Series(table)
poselist[i] = series
df = pd.DataFrame(poselist)
pose_metrics[dockfunc, run] = df.T
# Next, let's work out if any correct poses were found (sampling success)
# and if so, if the scoring or rescoring functions were able to detect them
# as the top solution
# We create a dictionary for each docking/rescoring/run combination,
# add them to a list and convert them to a dataframe
RMSD_threshold = 2.0
data = []
for dockfunc in ['asp', 'chemscore', 'goldscore', 'plp']:
for run in range(1, int(ndocks) + 1):
df = pose_metrics[dockfunc, run]
for scorefunc in ['Gold.ASP.Fitness', 'Gold.Chemscore.Fitness',
'Gold.Goldscore.Fitness', 'Gold.PLP.Fitness', 'Gold.Reference.RMSD']:
dockings = {}
dockings['dockfunc'] = dockfunc
dockings['run'] = run
dockings['scorefunc'] = scorefunc
dockings['worked'] = 0
flag = False
if scorefunc == 'Gold.Reference.RMSD':
flag = True
if (float(df.sort_values(by=scorefunc,
ascending=flag).iloc[0]['Gold.Reference.RMSD']) <
float(RMSD_threshold)):
dockings['worked'] = 1
data.append(dockings)
success_list = pd.DataFrame(data)
# Now we can average the success obtained by a combination of docking
# and scoring function using a pivot table
mean_success = pd.pivot_table(success_list, index=['dockfunc', 'scorefunc'], values=[
'worked'], aggfunc=np.mean)
mean_success.to_csv(os.path.join(output_directory, 'mean_sucess.csv'))
# Last thing to do is to illustrate our data in a visual diagram
plot_data = mean_success.unstack()
dockfunc = plot_data.index.get_level_values(0)
scorefunc = [str(a).split('.')[1].lower() for a in plot_data.columns.get_level_values(1)]
scorefunc = [s.replace('reference', 'rmsd') for s in scorefunc]
plt.yticks(np.arange(0.5, len(dockfunc), 1), dockfunc)
plt.xticks(np.arange(0.5, len(scorefunc), 1), scorefunc)
plt.pcolor(plot_data, cmap=matplotlib.cm.Blues)
plt.title('GOLD docking performance')
plt.xlabel('scoring function')
plt.ylabel('docking function')
plt.tight_layout()
plt.savefig(os.path.join(output_directory, 'statistics.png'))
plt.show()
def parse_args():
parser = argparse.ArgumentParser(
description='Run dockings to evaluate different fitness functions.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('-p', '--protein', required=True, dest='protein',
help='The prepared protein .mol2 file')
parser.add_argument('-l', '--ligand', required=True, dest='ligand',
help='The prepared ligand .mol2 file')
parser.add_argument('-r', '--reference', required=True, dest='reference',
help='The reference ligand to define the active site')
parser.add_argument('-n', '--ndocks', type=int, default=10, dest='ndocks',
help='The number of dockings to attempt per ligand')
parser.add_argument('-t', '--ntests', type=int, default=5, dest='ntests',
help='The number of parallel test runs to attempt')
parser.add_argument('-d', '--output_directory', default=os.path.abspath('output'),
help='The directory to store output files in.')
parser.add_argument('-c', '--num_cores', type=int, default=multiprocessing.cpu_count(),
help='How many docking processes to run in parallel.')
parser.add_argument('-m', '--max_results', type=int, default=None,
help='Number of results after which to end docking for testing purposes.')
return parser.parse_args()
if __name__ == '__main__':
import ccdc.utilities
ccdc.utilities._fix_multiprocessing_on_macos()
args = parse_args()
SF = ('goldscore', 'plp', 'chemscore', 'asp')
with Pool(args.num_cores) as p:
# generate parameters for docking runs and distribute them among CPUs
params = [(args.protein, args.ligand, args.reference, int(args.ndocks),
scoring_function, 'None', run, os.path.abspath(args.output_directory),
args.max_results)
for scoring_function in SF
for run in range(1, int(args.ntests) + 1)]
p.map(run_gold, params)
# generate parameters for rescoring runs and distribute them among CPUs
rescore_params = [(args.reference, scoring_function, rescoring_function,
run, os.path.abspath(args.output_directory))
for scoring_function in SF
for rescoring_function in SF
for run in range(1, int(args.ntests) + 1)
if scoring_function != rescoring_function]
p.map(rescore_gold, rescore_params)
# Actually run the dockings
get_scores(args.ntests, args.ndocks, os.path.abspath(args.output_directory))
Ensemble docking¶
This example shows again how docking and rescoring can be evaluated. It also shows how to systematically set a protein h-bond constraints looking atom distance search lookup.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-03-03: created by the Cambridge Crystallographic Data Centre
#
"""
This script exemplifies docking, constraints and the use of MolecularDescriptors.AtomDistanceSearch.
This script runs GOLD in batch mode, example usage:
python ensemble_docking_constraints.py
--proteins 1TVO_protein.mol2 2OJG_protein.mol2 4ZZN_protein.mol2
3I5Z_protein.mol2 4FV9_protein.mol2
--out_dir results --cavity cavity_ligands.mol2
--ligand FRZ_ideal.sdf --PHBconstraint 1TVO A MET108 H
"""
import argparse
from multiprocessing import Pool
import os
import glob
import multiprocessing
from ccdc.descriptors import MolecularDescriptors
from ccdc.docking import Docker
from ccdc.io import MoleculeReader
from ccdc.molecule import Molecule
from ccdc.protein import Protein
def parse_args():
usage = ('Run GOLD in batch mode on multiple proteins.')
parser = argparse.ArgumentParser(description=usage,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--proteins', dest='proteins', nargs='+', required=True,
help='protein(s) used for docking')
parser.add_argument(
'--PHBconstraint', dest='PHBconstraint', nargs=4, required=True,
help='HBond constraint (protein ID, chain, residue, atom label, e.g. "1TVO A MET108 H")')
parser.add_argument('--cavity', dest='cavity', required=True,
help='one or more ligand files for cavity definition')
parser.add_argument('--ligand', dest='ligand', required=True,
help='one or more ligands for docking')
parser.add_argument('--out_dir', dest='odir', default='output',
help='output directory for docking results')
parser.add_argument('-c', '--num_cores', type=int, default=multiprocessing.cpu_count(),
help='How many docking processes to run in parallel.')
parser.add_argument('-m', '--max_results', type=int, default=None,
help='How many solutions to calculate before stopping the docking.')
parser.add_argument('-r', '--ndocks', type=int, default=2,
help='How many docking runs to perform for each function combination.')
return parser.parse_args()
def define_docker(proteins, cavity, ligands, constraint):
merge_cavity = Molecule()
with MoleculeReader(cavity) as cavity_reader:
for c in cavity_reader:
merge_cavity.add_molecule(c)
settings = Docker.Settings()
merge_protein = Molecule()
# creating a merged protein from multiple proteins
# and adding proteins individually to the ensemble
for protein in proteins:
settings.add_protein_file(os.path.abspath(protein))
protein = Protein.from_file(os.path.abspath(protein))
merge_protein.add_molecule(protein)
# defining the ligand
settings.add_ligand_file(os.path.abspath(ligands))
# defining the binding site
settings.binding_site = settings.BindingSiteFromLigand(merge_protein, merge_cavity, 6.0)
# sorting out protein HBond constraints
for protein in settings.proteins:
# find the protein specified in the constraint
if str(protein.identifier) == str(constraint[0]):
chain = str(constraint[1].upper())
residue = str(constraint[2].upper())
label = str(constraint[3].upper())
atom = [a for a in protein['%s:%s' % (chain, residue)].atoms if a.label == label]
c = settings.ProteinHBondConstraint(atom)
settings.add_constraint(c)
for otherprot in [x for x in settings.proteins if x != protein]:
searcher = MolecularDescriptors.AtomDistanceSearch(otherprot)
near_atoms = searcher.atoms_within_range(atom[0].coordinates, 1.5)
for n in near_atoms:
if n.label == atom[0].label:
matched_atom_index = n.index
ec = settings.ProteinHBondConstraint([otherprot.atoms[matched_atom_index]])
settings.add_constraint(ec)
new_docker = Docker()
new_docker.settings = settings
return new_docker
def run_gold(params):
(proteins, cavity, ligands, PHBconstraint, outdir, fitness_function,
rescore_function, no_run, max_results) = params
docker = define_docker(proteins, cavity, ligands, PHBconstraint)
settings = docker.settings
settings.clear_ligand_files()
settings.add_ligand_file(os.path.abspath(ligands), 10)
settings.fitness_function = fitness_function
if rescore_function != 'None':
settings.rescore_function = rescore_function
rescore_label = 'score_%s' % rescore_function
else:
rescore_label = 'score_%s' % fitness_function
out_path = os.path.join(outdir, 'fitness_%s' % fitness_function,
rescore_label, 'run_%s' % no_run)
settings.output_directory = out_path
if not os.path.exists(out_path):
os.makedirs(out_path)
# The max_results parameter can be passed in to speed up runs for testing purposes.
# However this WILL reduce the usefulness of the results as the set of structures to compare
# is reduced to the given number.
if max_results is not None:
settings.early_termination = (True, max_results, 1.0)
else:
settings.early_termination = False
new_docker = Docker()
new_docker.settings = settings
new_docker.dock(file_name='%s_%s_run_%s.conf' %
(fitness_function, rescore_label, no_run))
def rescore_gold(params):
# expand parameters to local variables for usability
(proteins, cavity, ligands, constraint, odir, fitness_function,
rescore_function, no_run) = params
docker = define_docker(proteins, cavity, ligands, constraint)
settings = docker.settings
settings.clear_ligand_files()
in_path = os.path.join(odir,
'fitness_%s' % fitness_function,
'score_%s' % fitness_function,
'run_%s' % no_run)
out_path = os.path.join(odir,
'fitness_%s' % fitness_function,
'score_%s' % rescore_function,
'run_%s' % no_run)
settings.output_directory = out_path
if not os.path.exists(out_path):
os.makedirs(out_path)
poses = glob.glob(os.path.join(in_path, 'gold_soln*'))
for pose in poses:
settings.add_ligand_file(os.path.join(pose), 1)
settings.fitness_function = None
settings.rescore_function = rescore_function
settings.output_directory = out_path
new_docker = Docker()
new_docker.settings = settings
new_docker.dock(file_name='%s_score_%s_run_%s.conf' %
(fitness_function, rescore_function, no_run))
if __name__ == '__main__':
import ccdc.utilities
ccdc.utilities._fix_multiprocessing_on_macos()
args = parse_args()
SF = ('goldscore', 'plp', 'chemscore', 'asp')
NO = range(1, args.ndocks + 1)
params = [(args.proteins, args.cavity, args.ligand, args.PHBconstraint, args.odir,
scoring_function, 'None', run, args.max_results)
for scoring_function in SF for run in NO]
with Pool(args.num_cores) as p:
p.map(run_gold, params)
rescore_params = [(args.proteins, args.cavity, args.ligand, args.PHBconstraint,
args.odir, scoring_function, rescoring_function, run)
for scoring_function in SF for rescoring_function in SF
for run in NO if scoring_function != rescoring_function]
p.map(rescore_gold, rescore_params)
Searching docked structures¶
This example shows how to write a csdsql format database from docking results, to enable faster substructure searching, and how to associate the GOLD docking tags with the substructure search hits found.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
##############################################################################
'''
searching_docking_results.py - how to perform substructure searching on a multi-mol2 file.
This example shows how to create a .csdsql database from a multi-mol2 file
to get fast substructure searching, then to associate the GOLD docking scores with the hits.
The .csdsql database will be written to the output directory of the docking, if it doesn't already exist, or to
a directory optionally specified on the command line.
One or more SMARTS patterns may be provided to search, and the resulting mean fitness scores calculated and printed.
'''
##############################################################################
import os
import argparse
import collections
from ccdc import io, docking, search
##############################################################################
class Runner(argparse.ArgumentParser):
def __init__(self):
super(self.__class__, self).__init__(description=__doc__)
self.add_argument(
'conf_file',
help='Docking configuration file'
)
self.add_argument(
'SMARTS', nargs='*',
help='SMARTS patterns to search for in the database'
)
self.add_argument(
'-o', '--output-directory', default=None,
help='Alternative directory for the csdsql file'
)
self.args = self.parse_args()
def run(self):
settings = docking.Docker.Settings.from_file(self.args.conf_file)
docker = docking.Docker(settings=settings)
results = docker.results
if self.args.output_directory is None:
output_dir = settings.output_directory
else:
output_dir = self.args.output_directory
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_base, _ = os.path.splitext(os.path.basename(settings.output_file))
if not output_base:
csdsql_file = os.path.join(output_dir, 'docking_results.csdsql')
else:
csdsql_file = os.path.join(output_dir, output_base + '.csdsql')
if not os.path.exists(csdsql_file):
print('Writing:', csdsql_file, 'from', settings.output_file)
# Ensure that identifiers are unique
identifier_suffix = collections.defaultdict(int)
with io.EntryWriter(csdsql_file) as writer:
for i, ligand in enumerate(results.ligands):
identifier_suffix[ligand.identifier] += 1
ligand.identifier = '%s_%d' % (ligand.identifier, identifier_suffix[ligand.identifier])
writer.write(ligand)
db = io.EntryReader(csdsql_file)
ligands = results.ligands
longest = max(len(s) for s in self.args.SMARTS)
# Need an index by new identifier
index = {
e.identifier : i for i, e in enumerate(db)
}
for smarts in self.args.SMARTS:
searcher = search.SubstructureSearch()
searcher.add_substructure(search.SMARTSSubstructure(smarts))
hits = searcher.search(database=db, max_hits_per_structure=1)
fitness = 0.0
for h in hits:
ligand = ligands[index[h.identifier]]
fitness += ligand.fitness()
if hits:
fitness /= len(hits)
print('%*s: %4d hits %.2f average fitness' % (longest, smarts, len(hits), fitness))
##############################################################################
if __name__ == '__main__':
runner = Runner()
runner.run()