Docking examples

Note

The docking features are available only to CSD-Discovery and CSD-Enterprise users.

Docking and Rescoring

This example shows how a docking or rescoring may be performed from a protein and a set of ligands. It is not intended to be a production-quality protein-ligand docking, but to exemplify the technology.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
######################################################################
'''
docking_and_rescoring.py - read protein, perform minimal clean-up, define
         docking or rescoring and run the docker.

Note that any realistic docking would require a great deal of thought
to prepare the protein properly.  This example shows the minimum
that needs to be done to define a docking.

The script exemplifies docking and rescoring.
'''
###########################################################################

import argparse
import tempfile
import os

from ccdc.io import EntryWriter, MoleculeReader
from ccdc.protein import Protein
from ccdc.docking import Docker


class Runner(argparse.ArgumentParser):
    '''Parses arguments, runs the docking.'''
    def __init__(self):
        '''Defines the arguments.'''
        super(self.__class__, self).__init__(description=__doc__)
        self.add_argument(
            'protein_file_name',
            help='The protein file name'
        )
        self.add_argument(
            '-l', '--ligand-file-name',
            help='Ligand file. If not supplied the substrate ligand will be docked. If rescoring only, this file should contain the docked ligands.'
        )
        self.add_argument(
            '-rl', '--reference-ligand-file-name',
            help='Reference ligand file.',
            required=True
        )
        self.add_argument(
            '-s', '--speed',
            default='very fast',
            choices=('very fast', 'fast', 'medium', 'slow', 'very slow'),
            help='How thorough the docking attempts should be. The values correspond to 10%%, 25%%, 50%%, 75%% and 100%% autoscaling of docking parameters.'
        )
        self.add_argument(
            '-n', '--ndocks', default=10, type=int,
            help='Number of docking attempts to run.'
        )
        self.add_argument(
            '-d', '--directory',
            help='Output directory for the docking. Default is a temporary directory.'
        )
        self.add_argument(
            '-f', '--fitness-function', choices=('plp', 'asp', 'chemscore', 'goldscore'),
            help='Fitness function to use (default None).'
        )
        self.add_argument(
            '-r', '--rescore-function', choices=('plp', 'asp', 'chemscore', 'goldscore'),
            help='Rescore function to use (default None)'
        )

        self.args = self.parse_args()

        self.docker = None
        self.settings = None
        self.ref_ligand = None
        self.protein = None

    def run(self):
        '''Run the docking.'''
        self.create_docker()
        protein_ligands = self.prepare_protein()
        self.prepare_ligands(protein_ligands)
        self.prepare_binding_site()
        self.dock()

    def is_rescore(self):
        return self.args.rescore_function and not self.args.fitness_function

    def create_docker(self):
        '''Create a docker.'''
        self.docker = Docker()
        self.settings = self.docker.settings
        if self.args.directory:
            if not os.path.isdir(self.args.directory):
                os.makedirs(self.args.directory)
        else:
            self.args.directory = tempfile.mkdtemp()

        self.settings.output_directory = self.args.directory
        if self.is_rescore():
            self.settings.output_file = 'rescored_ligands.mol2'
        else:
            self.settings.output_file = 'docked_ligands.mol2'

        self.settings.output_format = 'mol2'
        self.settings.autoscale = {
            'very fast': 10,
            'fast': 25,
            'medium': 50,
            'slow': 75,
            'very slow': 100
        }[self.args.speed]

        self.settings.fitness_function = self.args.fitness_function
        if self.args.rescore_function:
            self.settings.rescore_function = self.args.rescore_function

        if self.args.reference_ligand_file_name:
            self.settings.reference_ligand_file = self.args.reference_ligand_file_name

        self.ref_ligand = MoleculeReader(self.args.reference_ligand_file_name)[0]
        self.settings.reference_ligand_file = self.args.reference_ligand_file_name

    def prepare_binding_site(self):
        '''Define the binding site from the reference ligand.'''
        ligand_cog = self.ref_ligand.centre_of_geometry()
        self.settings.binding_site = self.settings.BindingSiteFromPoint(
            None, ligand_cog, 12.0
        )

    def prepare_protein(self):
        '''Prepare the protein and remove any ligands.'''
        self.protein = Protein.from_file(self.args.protein_file_name)
        self.protein.remove_all_waters()
        self.protein.remove_unknown_atoms()
        self.protein.add_hydrogens()

        ligands = self.protein.ligands

        for l in ligands:
            self.protein.remove_ligand(l.identifier)

        protein_file_name = os.path.join(
            self.settings.output_directory, 'clean_%s.mol2' % self.protein.identifier
        )

        with EntryWriter(protein_file_name) as writer:
            writer.write(self.protein)
        self.settings.add_protein_file(protein_file_name)

        return ligands

    def prepare_ligands(self, protein_ligands):
        '''Write an appropriate ligand file to the docking directory, if required.'''
        if not self.args.ligand_file_name:
            if protein_ligands:
                self.args.ligand_file_name = os.path.join(
                    self.settings.output_directory,
                    'protein_ligands.mol2'
                )
                with EntryWriter(self.args.ligand_file_name) as writer:
                    for ligand in protein_ligands:
                        writer.write(ligand)
            else:
                raise RuntimeError('No ligand file given and no ligands in the protein.')

        self.settings.add_ligand_file(self.args.ligand_file_name, self.args.ndocks)

    def dock(self):
        '''Do the docking.'''
        self.docker.dock()
        print('Docking completed. Results are in %s' % self.settings.output_directory)

###########################################################################

if __name__ == '__main__':
    runner = Runner()
    runner.run()

Interactive docking

This example shows how the results of a docking may be used to inform further dockings in an iterative fashion.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

'''
    similarity_docking.py - search the CSD for similar structures to a known
    ligand, dock them, saving structures which score better than the known ligand.

    This exemplifies using an interactive docker.
'''
######################################################################

import sys
import os
import argparse

from ccdc.io import EntryReader, EntryWriter
from ccdc.search import SimilaritySearch
from ccdc.docking import Docker
from ccdc.entry import Entry

########################################################################################


class SimilarityDocking(argparse.ArgumentParser):
    '''Defines arguments and runs the job.'''

    def __init__(self):
        '''Defines the arguments.'''
        super(self.__class__, self).__init__(description=__doc__)

        # Input and output options
        self.add_argument(
            'conf_file_name',
            help='The configuration file to use.'
        )
        self.add_argument(
            '-l', '--ligand-file-name', default=None,
            help='Ligand to use in the docking'
        )
        self.add_argument(
            '-o', '--output-directory', default=os.path.abspath('output'),
            help='Directory in which results will be stored.'
        )

        # Similarity search parameters
        self.add_argument(
            '-m', '--max_hit_structures', default=None, type=int,
            help='The maximum number of CSD structures to try and dock (default: unlimited).'
        )

        # Docking parameters
        self.add_argument(
            '-f', '--fitness-function', default=None,
            help='The fitness function to use.'
        )
        self.add_argument(
            '-n', '--ndocks', default=10, type=int,
            help='The number of docking attempts to make.'
        )
        self.add_argument(
            '-s', '--speed', default=None,
            choices=('very fast', 'fast', 'medium', 'slow', 'very slow'),
            help=('How thorough the docking attempts should be. ',
                  'Values correspond to 10%%, 25%%, 50%%, 75%% and 100%% ',
                  'auto-scaling of docking parameters.')
        )

    def get_arguments(self):
        '''Extract and process the arguments.'''
        self.args = self.parse_args()
        self.settings = Docker.Settings.from_file(self.args.conf_file_name)
        if self.args.output_directory:
            self.settings.output_directory = self.args.output_directory
        if self.args.fitness_function is not None:
            self.settings.fitness_function = self.args.fitness_function
        if self.args.speed is not None:
            self.settings.autoscale = {
                'very fast': 10,
                'fast': 25,
                'medium': 50,
                'slow': 75,
                'very slow': 100
            }[self.args.speed]
        self.ligand_info = self.settings.ligand_files
        if self.args.ligand_file_name is None:
            self.args.ligand_file_name = self.ligand_info[0].file_name
        self.settings.clear_ligand_files()
        self.settings.set_hostname(ndocks=self.args.ndocks)
        self.docker = Docker(settings=self.settings)
        if not os.path.isdir(self.settings.output_directory):
            os.makedirs(self.settings.output_directory)
        self.winner_writer = EntryWriter(
            os.path.join(self.settings.output_directory, 'winners.mol2')
        )
        self.better_writer = EntryWriter(
            os.path.join(self.settings.output_directory, 'better.mol2')
        )

    def run(self):
        '''Runs the job.'''
        self.get_arguments()
        self.session = self.docker.dock(
            os.path.join(self.settings.output_directory, 'settings.conf'),
            'interactive'
        )
        self.run_substrate()
        self.done = set()
        dock_index = 1
        while True:
            hits = self.search()
            if not hits:
                break
            for h in hits:
                self.done.add(h.identifier)
                mol = h.molecule
                if mol.all_atoms_have_sites:
                    mol.assign_bond_types()
                    mol.add_hydrogens()
                    entry = Entry.from_molecule(mol)
                    poses = self.session.dock(entry)
                    dock_index += 1
                    if poses:
                        score, pose = self.best(poses)
                        for inx, p in enumerate(poses):
                            with EntryWriter(
                                os.path.join(
                                    self.settings.output_directory,
                                    'gold_soln_CSD_API_m%d_%d.mol2' % (dock_index, inx + 1)
                                )
                            ) as w:
                                w.write(p)
                        if score > self.best_score:
                            print('%-8s: %.3f Winner!' % (h.identifier, score))
                            self.best_score, self.best_pose = score, pose
                            self.winner_writer.write(self.best_pose)
                        else:
                            print('%-8s: %.3f' % (h.identifier, score))
                            if score > self.substrate_score:
                                self.better_writer.write(pose)
                    else:
                        print('%-8s: Failed to dock' % h.identifier)
                else:
                    print('%-8s: No 3d structure' % h.identifier)

    def search(self):
        '''Similarity search the csd.'''
        threshold = 1.0
        hits = []
        mol = self.best_pose.molecule
        mol.standardise_aromatic_bonds()
        while not hits and round(threshold, 6) > 0.5:
            # initially just search for CSD structures similar to the ligand
            threshold -= 0.1
            simsearcher = SimilaritySearch(mol, threshold)
            simsearcher.settings.only_organic = True
            simsearcher.settings.has_3d_coordinates = True
            hits = [h for h in simsearcher.search(max_hit_structures=self.args.max_hit_structures)
                    if h.identifier not in self.done]
            print('Found %s structures similar to %s with threshold %.2f.' %
                  (len(hits), mol.identifier, threshold))

            if hits:
                # then further specify the search by finding structures
                # similar to individual components of hits
                def match_components(h):
                    if len(h.molecule.components) > 1:
                        for m in h.molecule.components:
                            m.identifier = h.identifier
                            hh = simsearcher.search(m)
                            if hh:
                                return hh[0]
                        else:
                            return None
                            # raise RuntimeError('Similarity search failed to match any component')
                    else:
                        return h

                hits = [x for x in [match_components(h) for h in hits] if x is not None]
                print('%d new hits with threshold %.2f against %s' %
                      (len(hits), threshold, mol.identifier))
                return hits
            else:
                print('No hits with threshold %.2f against %s' % (threshold, mol.identifier))

    def run_substrate(self):
        '''Runs the substrate, for the comparison.'''
        self.substrate = EntryReader(self.args.ligand_file_name)[0]
        self.substrate.identifier = self.substrate.identifier.replace(':', '_')
        poses = self.session.dock(self.substrate)

        if not poses:
            print('FAILED TO DOCK SUBSTRATE')
            sys.exit()
        self.best_score, self.best_pose = self.best(poses)
        self.substrate_score = self.best_score
        for inx, p in enumerate(poses):
            outfile_path = os.path.join(self.settings.output_directory,
                                        'gold_soln_CSD_API_m1_%d.mol2' % (inx + 1))
            with EntryWriter(outfile_path) as output_file:
                output_file.write(p)
        print('%-9s: %.3f' % ('Substrate', self.best_score))

    def best(self, poses):
        '''Extracts best score and best structure from the docked poses.'''
        fitness_tag = dict(
            plp='Gold.PLP.Fitness',
            chemscore='Gold.Chemscore.Fitness',
            goldscore='Gold.Goldscore.Fitness',
            asp='Gold.ASP.Fitness'
        )[self.settings.fitness_function]
        scores = [
            float(e.attributes[fitness_tag]) for e in poses
        ]
        best_score = max(scores)
        best_pose = poses[scores.index(best_score)]
        return best_score, best_pose

########################################################################################


if __name__ == '__main__':
    similarity_docking = SimilarityDocking()
    similarity_docking.run()

Testing fitness functions

This example shows how the performance of docking and rescoring functions can be systematically evaulated, how the pool method can be employed to parallize docking runs and the pandas module can be used to help with statistical analysis

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-02-10: created by the Cambridge Crystallographic Data Centre
#
#

'''
This simple example script uses docking to evaluate different scoring functions

Some of the analysis code depends on the numpy package and the pandas package
being installed.
Presentation of the results depends on the package matplotlib.

.. seealso:: <https://www.numpy.org/> for more information on numpy.
.. seealso:: <https://pandas.pydata.org/> for more information on pandas.
.. seealso:: <https://matplotlib.org/> for more information on matplotlib.
'''
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

import argparse
import os
import multiprocessing
from multiprocessing import Pool
import re
import numpy as np
import pandas as pd
from ccdc.docking import Docker
from ccdc.io import MoleculeReader, EntryReader
from ccdc.protein import Protein


def run_gold(params):
    (protein, ligand, reference, ndocks,
     fitness_function, rescore_function, no_run, base_out_dir, max_results) = params

    protein_file = os.path.abspath(protein)
    ligand_file = os.path.abspath(ligand)
    native_ligand_file = os.path.abspath(reference)

    docker = Docker()
    settings = docker.settings
    protein = Protein.from_file(protein_file)

    ligand = MoleculeReader(ligand_file)[0]

    native_ligand = MoleculeReader(native_ligand_file)[0]
    settings.add_protein_file(protein_file)
    settings.add_ligand_file(ligand_file, ndocks)
    settings.reference_ligand_file = reference
    settings.binding_site = settings.BindingSiteFromLigand(protein, native_ligand, 6.0)

    settings.diverse_solutions = (True, 3, 1.0)
    # The max_results parameter can be passed in to speed up runs for testing purposes.
    # However this WILL reduce the usefulness of the results as the set of structures to compare
    # is reduced to the given number.
    if max_results is not None:
        settings.early_termination = (True, max_results, 1.0)
    else:
        settings.early_termination = False

    settings.fitness_function = fitness_function
    rescore_label = ''
    if rescore_function != 'None':
        settings.rescore_function = rescore_function
        rescore_label = "score_" + str(rescore_function)
    else:
        rescore_function = fitness_function

    output_dir = os.path.abspath(get_docking_directory(base_out_dir,
                                                       fitness_function, rescore_function, no_run))
    settings.output_directory = output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    settings.output_file = os.path.join(
        output_dir, get_docking_file(fitness_function, rescore_function, no_run))
    docker.dock(file_name=os.path.join(output_dir,
                                       get_conf_file(fitness_function, rescore_function, no_run)))


def rescore_gold(params):
    (reference, fitness_function, rescore_function, no_run, base_out_dir) = params
    in_path = get_docking_directory(base_out_dir, fitness_function, fitness_function, no_run)

    protein_file = os.path.join(in_path, 'gold_protein.mol2')
    ligand_file = os.path.join(in_path,
                               get_docking_file(fitness_function, fitness_function, no_run))

    # protein settings for docking
    docker = Docker()
    settings = docker.settings
    protein = Protein.from_file(protein_file)
    settings.add_protein_file(os.path.join(protein_file))
    settings.add_ligand_file(ligand_file, 1)

    # define the binding site using the reference ligand
    native_ligand_file = os.path.abspath(reference)
    native_ligand = MoleculeReader(native_ligand_file)[0]
    settings.binding_site = settings.BindingSiteFromLigand(protein, native_ligand, 6.)


    settings.reference_ligand_file = native_ligand_file


    settings.fitness_function = None
    settings.rescore_function = rescore_function

    output_dir = get_docking_directory(base_out_dir, fitness_function, rescore_function, no_run)
    settings.output_directory = output_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    settings.output_file = os.path.join(output_dir, get_docking_file(
        fitness_function, rescore_function, no_run))
    docker.dock(file_name=os.path.join(get_docking_directory(base_out_dir, fitness_function,
                                                             rescore_function, no_run),
                                       get_conf_file(fitness_function, rescore_function, no_run)))


def get_conf_file(fitness_function, scoring_function, no_run):
    return '%s_score_%s_run_%s.conf' % (fitness_function, scoring_function, no_run)


def get_docking_directory(base_dir, fitness_function, scoring_function, no_run):
    return os.path.abspath(os.path.join(base_dir,
                                        'fitness_%s' % fitness_function,
                                        'score_%s' % scoring_function,
                                        'run_%s' % no_run))


def get_docking_file(fitness_function, scoring_function, no_run):
    return 'poses_%s_score_%s_run_%s.mol2' % (fitness_function, scoring_function, no_run)


def get_path(base_dir, fitness_function, scoring_function, no_run):
    return os.path.join(
        get_docking_directory(base_dir, fitness_function, scoring_function, no_run),
        get_docking_file(fitness_function, scoring_function, no_run))


def get_scores(ndocks, nruns, output_directory):
    SF = ('goldscore', 'plp', 'chemscore', 'asp')
    re_RMSD = re.compile(r'\w*\.Reference\.RMSD')
    re_FITNESS = re.compile(r'.+\..+\.Fitness')
    df = {}
    pose_metrics = {}
    for run in range(1, int(ndocks) + 1):
        for dockfunc in SF:
            # dockmols = {}
            dock_entries = {}
            poselist = {}
            for sf in SF:
                this_path = get_path(output_directory, dockfunc, sf, run)
                # dockmols[sf] = MoleculeReader(this_path)
                dock_entries[sf] = EntryReader(this_path)
            for i in range(0, int(nruns)):
                table = {}
                for sf in SF:
                    table['Docking_function'] = sf
                    d = dock_entries[sf][i]
                    # m = dockmols[sf][i]
                    for key, value in d.attributes.items():
                        test = re_RMSD.match(key)
                        if test:
                            table[key] = value
                        test_fitness = re_FITNESS.match(key)
                        if test_fitness:
                            table[key] = value
                series = pd.Series(table)
                poselist[i] = series
            df = pd.DataFrame(poselist)
            pose_metrics[dockfunc, run] = df.T

    # Next, let's work out if any correct poses were found (sampling success)
    # and if so, if the scoring or rescoring functions were able to detect them
    # as the top solution
    # We create a dictionary for each docking/rescoring/run combination,
    # add them to a list and convert them to a dataframe
    RMSD_threshold = 2.0
    data = []
    for dockfunc in ['asp', 'chemscore', 'goldscore', 'plp']:
        for run in range(1, int(ndocks) + 1):
            df = pose_metrics[dockfunc, run]
            for scorefunc in ['Gold.ASP.Fitness', 'Gold.Chemscore.Fitness',
                              'Gold.Goldscore.Fitness', 'Gold.PLP.Fitness', 'Gold.Reference.RMSD']:
                dockings = {}
                dockings['dockfunc'] = dockfunc
                dockings['run'] = run
                dockings['scorefunc'] = scorefunc
                dockings['worked'] = 0
                flag = False
                if scorefunc == 'Gold.Reference.RMSD':
                    flag = True
                if (float(df.sort_values(by=scorefunc,
                                         ascending=flag).iloc[0]['Gold.Reference.RMSD']) <
                        float(RMSD_threshold)):
                    dockings['worked'] = 1
                data.append(dockings)
    success_list = pd.DataFrame(data)

    # Now we can average the success obtained by a combination of docking
    # and scoring function using a pivot table
    mean_success = pd.pivot_table(success_list, index=['dockfunc', 'scorefunc'], values=[
                                  'worked'], aggfunc=np.mean)
    mean_success.to_csv(os.path.join(output_directory, 'mean_sucess.csv'))

    # Last thing to do is to illustrate our data in a visual diagram
    plot_data = mean_success.unstack()
    dockfunc = plot_data.index.get_level_values(0)
    scorefunc = [str(a).split('.')[1].lower() for a in plot_data.columns.get_level_values(1)]
    scorefunc = [s.replace('reference', 'rmsd') for s in scorefunc]
    plt.yticks(np.arange(0.5, len(dockfunc), 1), dockfunc)
    plt.xticks(np.arange(0.5, len(scorefunc), 1), scorefunc)
    plt.pcolor(plot_data, cmap=matplotlib.cm.Blues)
    plt.title('GOLD docking performance')
    plt.xlabel('scoring function')
    plt.ylabel('docking function')
    plt.tight_layout()
    plt.savefig(os.path.join(output_directory, 'statistics.png'))
    plt.show()


def parse_args():
    parser = argparse.ArgumentParser(
        description='Run dockings to evaluate different fitness functions.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--protein', required=True, dest='protein',
                        help='The prepared protein .mol2 file')
    parser.add_argument('-l', '--ligand', required=True, dest='ligand',
                        help='The prepared ligand .mol2 file')
    parser.add_argument('-r', '--reference', required=True, dest='reference',
                        help='The reference ligand to define the active site')
    parser.add_argument('-n', '--ndocks', type=int, default=10, dest='ndocks',
                        help='The number of dockings to attempt per ligand')
    parser.add_argument('-t', '--ntests', type=int, default=5, dest='ntests',
                        help='The number of parallel test runs to attempt')
    parser.add_argument('-d', '--output_directory', default=os.path.abspath('output'),
                        help='The directory to store output files in.')
    parser.add_argument('-c', '--num_cores', type=int, default=multiprocessing.cpu_count(),
                        help='How many docking processes to run in parallel.')
    parser.add_argument('-m', '--max_results', type=int, default=None,
                        help='Number of results after which to end docking for testing purposes.')
    return parser.parse_args()


if __name__ == '__main__':
    import ccdc.utilities
    ccdc.utilities._fix_multiprocessing_on_macos()
    args = parse_args()

    SF = ('goldscore', 'plp', 'chemscore', 'asp')

    with Pool(args.num_cores) as p:
        # generate parameters for docking runs and distribute them among CPUs
        params = [(args.protein, args.ligand, args.reference, int(args.ndocks),
                scoring_function, 'None', run, os.path.abspath(args.output_directory),
                args.max_results)
                for scoring_function in SF
                for run in range(1, int(args.ntests) + 1)]
        p.map(run_gold, params)

        # generate parameters for rescoring runs and distribute them among CPUs
        rescore_params = [(args.reference, scoring_function, rescoring_function,
                        run, os.path.abspath(args.output_directory))
                        for scoring_function in SF
                        for rescoring_function in SF
                        for run in range(1, int(args.ntests) + 1)
                        if scoring_function != rescoring_function]
        p.map(rescore_gold, rescore_params)

    # Actually run the dockings
    get_scores(args.ntests, args.ndocks, os.path.abspath(args.output_directory))

Ensemble docking

This example shows again how docking and rescoring can be evaluated. It also shows how to systematically set a protein h-bond constraints looking atom distance search lookup.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-03-03: created by the Cambridge Crystallographic Data Centre
#
"""
This script exemplifies docking, constraints and the use of MolecularDescriptors.AtomDistanceSearch.

This script runs GOLD in batch mode, example usage:

python ensemble_docking_constraints.py
    --proteins 1TVO_protein.mol2 2OJG_protein.mol2 4ZZN_protein.mol2
               3I5Z_protein.mol2 4FV9_protein.mol2
    --out_dir results --cavity cavity_ligands.mol2
    --ligand FRZ_ideal.sdf  --PHBconstraint 1TVO A MET108 H
"""

import argparse
from multiprocessing import Pool
import os
import glob
import multiprocessing

from ccdc.descriptors import MolecularDescriptors
from ccdc.docking import Docker
from ccdc.io import MoleculeReader
from ccdc.molecule import Molecule
from ccdc.protein import Protein


def parse_args():
    usage = ('Run GOLD in batch mode on multiple proteins.')
    parser = argparse.ArgumentParser(description=usage,
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--proteins', dest='proteins', nargs='+', required=True,
                        help='protein(s) used for docking')
    parser.add_argument(
        '--PHBconstraint', dest='PHBconstraint', nargs=4, required=True,
        help='HBond constraint (protein ID, chain, residue, atom label, e.g. "1TVO A MET108 H")')
    parser.add_argument('--cavity', dest='cavity', required=True,
                        help='one or more ligand files for cavity definition')
    parser.add_argument('--ligand', dest='ligand', required=True,
                        help='one or more ligands for docking')
    parser.add_argument('--out_dir', dest='odir', default='output',
                        help='output directory for docking results')
    parser.add_argument('-c', '--num_cores', type=int, default=multiprocessing.cpu_count(),
                        help='How many docking processes to run in parallel.')
    parser.add_argument('-m', '--max_results', type=int, default=None,
                        help='How many solutions to calculate before stopping the docking.')
    parser.add_argument('-r', '--ndocks', type=int, default=2,
                        help='How many docking runs to perform for each function combination.')
    return parser.parse_args()


def define_docker(proteins, cavity, ligands, constraint):
    merge_cavity = Molecule()
    with MoleculeReader(cavity) as cavity_reader:
        for c in cavity_reader:
            merge_cavity.add_molecule(c)
    settings = Docker.Settings()
    merge_protein = Molecule()

    # creating a merged protein from multiple proteins
    # and adding proteins individually to the ensemble
    for protein in proteins:
        settings.add_protein_file(os.path.abspath(protein))
        protein = Protein.from_file(os.path.abspath(protein))
        merge_protein.add_molecule(protein)

    # defining the ligand
    settings.add_ligand_file(os.path.abspath(ligands))

    # defining the binding site
    settings.binding_site = settings.BindingSiteFromLigand(merge_protein, merge_cavity, 6.0)

    # sorting out protein HBond constraints
    for protein in settings.proteins:

        # find the protein specified in the constraint
        if str(protein.identifier) == str(constraint[0]):
            chain = str(constraint[1].upper())
            residue = str(constraint[2].upper())
            label = str(constraint[3].upper())
            atom = [a for a in protein['%s:%s' % (chain, residue)].atoms if a.label == label]
            c = settings.ProteinHBondConstraint(atom)
            settings.add_constraint(c)
            for otherprot in [x for x in settings.proteins if x != protein]:
                searcher = MolecularDescriptors.AtomDistanceSearch(otherprot)
                near_atoms = searcher.atoms_within_range(atom[0].coordinates, 1.5)
                for n in near_atoms:
                    if n.label == atom[0].label:
                        matched_atom_index = n.index
                        ec = settings.ProteinHBondConstraint([otherprot.atoms[matched_atom_index]])
                        settings.add_constraint(ec)

    new_docker = Docker()
    new_docker.settings = settings
    return new_docker


def run_gold(params):
    (proteins, cavity, ligands, PHBconstraint, outdir, fitness_function,
        rescore_function, no_run, max_results) = params
    docker = define_docker(proteins, cavity, ligands, PHBconstraint)
    settings = docker.settings
    settings.clear_ligand_files()
    settings.add_ligand_file(os.path.abspath(ligands), 10)
    settings.fitness_function = fitness_function
    if rescore_function != 'None':
        settings.rescore_function = rescore_function
        rescore_label = 'score_%s' % rescore_function
    else:
        rescore_label = 'score_%s' % fitness_function
    out_path = os.path.join(outdir, 'fitness_%s' % fitness_function,
                            rescore_label, 'run_%s' % no_run)
    settings.output_directory = out_path
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    # The max_results parameter can be passed in to speed up runs for testing purposes.
    # However this WILL reduce the usefulness of the results as the set of structures to compare
    # is reduced to the given number.
    if max_results is not None:
        settings.early_termination = (True, max_results, 1.0)
    else:
        settings.early_termination = False

    new_docker = Docker()
    new_docker.settings = settings
    new_docker.dock(file_name='%s_%s_run_%s.conf' %
                    (fitness_function, rescore_label, no_run))


def rescore_gold(params):
    # expand parameters to local variables for usability
    (proteins, cavity, ligands, constraint, odir, fitness_function,
        rescore_function, no_run) = params
    docker = define_docker(proteins, cavity, ligands, constraint)
    settings = docker.settings
    settings.clear_ligand_files()
    in_path = os.path.join(odir,
                           'fitness_%s' % fitness_function,
                           'score_%s' % fitness_function,
                           'run_%s' % no_run)
    out_path = os.path.join(odir,
                            'fitness_%s' % fitness_function,
                            'score_%s' % rescore_function,
                            'run_%s' % no_run)
    settings.output_directory = out_path
    if not os.path.exists(out_path):
        os.makedirs(out_path)
    poses = glob.glob(os.path.join(in_path, 'gold_soln*'))
    for pose in poses:
        settings.add_ligand_file(os.path.join(pose), 1)
    settings.fitness_function = None
    settings.rescore_function = rescore_function
    settings.output_directory = out_path
    new_docker = Docker()
    new_docker.settings = settings
    new_docker.dock(file_name='%s_score_%s_run_%s.conf' %
                    (fitness_function, rescore_function, no_run))


if __name__ == '__main__':
    import ccdc.utilities
    ccdc.utilities._fix_multiprocessing_on_macos()
    args = parse_args()
    SF = ('goldscore', 'plp', 'chemscore', 'asp')
    NO = range(1, args.ndocks + 1)
    params = [(args.proteins, args.cavity, args.ligand, args.PHBconstraint, args.odir,
               scoring_function, 'None', run, args.max_results)
              for scoring_function in SF for run in NO]
    with Pool(args.num_cores) as p:
        p.map(run_gold, params)
        rescore_params = [(args.proteins, args.cavity, args.ligand, args.PHBconstraint,
                        args.odir, scoring_function, rescoring_function, run)
                        for scoring_function in SF for rescoring_function in SF
                        for run in NO if scoring_function != rescoring_function]
        p.map(rescore_gold, rescore_params)

Searching docked structures

This example shows how to write a csdsql format database from docking results, to enable faster substructure searching, and how to associate the GOLD docking tags with the substructure search hits found.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
##############################################################################
'''
    searching_docking_results.py    -   how to perform substructure searching on a multi-mol2 file.

    This example shows how to create a .csdsql database from a multi-mol2 file
    to get fast substructure searching, then to associate the GOLD docking scores with the hits.
    The .csdsql database will be written to the output directory of the docking, if it doesn't already exist, or to
    a directory optionally specified on the command line.
    One or more SMARTS patterns may be provided to search, and the resulting mean fitness scores calculated and printed.
'''
##############################################################################

import os
import argparse
import collections

from ccdc import io, docking, search

##############################################################################

class Runner(argparse.ArgumentParser):
    def __init__(self):
        super(self.__class__, self).__init__(description=__doc__)
        self.add_argument(
            'conf_file',
            help='Docking configuration file'
        )
        self.add_argument(
            'SMARTS', nargs='*',
            help='SMARTS patterns to search for in the database'
        )
        self.add_argument(
            '-o', '--output-directory', default=None,
            help='Alternative directory for the csdsql file'
        )
        self.args = self.parse_args()

    def run(self):
        settings = docking.Docker.Settings.from_file(self.args.conf_file)
        docker = docking.Docker(settings=settings)
        results = docker.results
        if self.args.output_directory is None:
            output_dir = settings.output_directory
        else:
            output_dir = self.args.output_directory
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)

        output_base, _ = os.path.splitext(os.path.basename(settings.output_file))
        if not output_base:
            csdsql_file = os.path.join(output_dir, 'docking_results.csdsql')
        else:
            csdsql_file = os.path.join(output_dir, output_base + '.csdsql')

        if not os.path.exists(csdsql_file):
            print('Writing:', csdsql_file, 'from', settings.output_file)
            # Ensure that identifiers are unique
            identifier_suffix = collections.defaultdict(int)
            with io.EntryWriter(csdsql_file) as writer:
                for i, ligand in enumerate(results.ligands):
                    identifier_suffix[ligand.identifier] += 1
                    ligand.identifier = '%s_%d' % (ligand.identifier, identifier_suffix[ligand.identifier])
                    writer.write(ligand)

        db = io.EntryReader(csdsql_file)
        ligands = results.ligands
        longest = max(len(s) for s in self.args.SMARTS)
        # Need an index by new identifier
        index = {
            e.identifier : i for i, e in enumerate(db)
        }
        for smarts in self.args.SMARTS:
            searcher = search.SubstructureSearch()
            searcher.add_substructure(search.SMARTSSubstructure(smarts))
            hits = searcher.search(database=db, max_hits_per_structure=1)
            fitness = 0.0
            for h in hits:
                ligand = ligands[index[h.identifier]]
                fitness += ligand.fitness()
            if hits:
                fitness /= len(hits)
            print('%*s: %4d hits %.2f average fitness' % (longest, smarts, len(hits), fitness))

##############################################################################

if __name__ == '__main__':
    runner = Runner()
    runner.run()