Search examples

Torsion angle preferences in drug-like molecules

Note that this script makes use of functionality from the cookbook utility module.

#!/usr/bin/env python
# This script can be used for any purpose without limitation subject to the
# conditions at
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
Script to generate torsion angle histograms for common med. chem. fragments.

This script will create a report containing torsion histograms for ten common
medicinal chemistry fragments in an output directory.

The script was inspired by and the SMARTS pattern taken from the supplementary
material of:

"Torsion Angle Preferences in Druglike Chemical Space: A Comprehensive Guide"
Scharfer C.; Schulz-Gasch T.; Ehrilich H.-C.; Guba W.; Rarey M.; Stahl M.,
J. Med. Chem. (2013) 56, 2016-2028

import argparse
import os
import sys

from utilities import Histogram
from import SMARTSSubstructure, SubstructureSearch

def main():
    # Define arguments and parse them
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-m', '--max-hits', type=int,
                        help='Maximum number of entries to use (0=unlimited) [100]')

    parser.add_argument('-o', '--output-directory',
                        help='Output directory [med_chem_torsions]')

    args = parser.parse_args()
    if args.max_hits == 0:
        args.max_hits = sys.maxsize

    # Extract file names and create directory if necessary
    if not os.path.exists(args.output_directory):

    # Top ten SMARTS patterns
    patts = [
    # Do the search, and draw the histograms
    hist_filenames = []
    for index, pattern in enumerate(patts):
        substructure = SMARTSSubstructure(pattern)
        searcher = SubstructureSearch()
            0, substructure.label_to_atom_index(1),
            0, substructure.label_to_atom_index(2),
            0, substructure.label_to_atom_index(3),
            0, substructure.label_to_atom_index(4))
        hits =,
        hist_fn = os.path.join(args.output_directory, 'torsion_%02d.png' % index)
        hist = Histogram(title=pattern, xlabel='Torsion', ylabel='Count',
        torsions = [abs(hit.measurements['Torsion'])
                    for hit in hits if hit.measurements['Torsion'] is not None]
        hist.add_plot(torsions, color='blue', bins=18)
        del hist

    # Write the html
    with open(os.path.join(args.output_directory, 'index.html'), 'w') as out_file:
        out_file.write('<html><head><title>Torsion angle preferences</title></head>\n')
        out_file.write('<body><h1>Druglike chemical space torsion angle preferences</h1>\n')
        for hist_fn in hist_filenames:
            out_file.write('<img src="%s"/>\n' % os.path.basename(hist_fn))

if __name__ == '__main__':

Similarity search with maximum common substructure highlighted

This example performs a similarity search, then generates diagrams of the matching structures showing the maximum common substructure if the probe molecule and the hit molecule.

#!/usr/bin/env python
# This script can be used for any purpose without limitation subject to the
# conditions at
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
# 2015-06-17: created by the Cambridge Crystallographic Data Centre

"""  -   perform a similarity search, then
    format a web page containing the common substructures of the original
    structure and the returned similar structures.

import argparse
import os
import sys

from import MoleculeReader
from import SimilaritySearch
from ccdc.descriptors import MolecularDescriptors
from ccdc.diagram import DiagramGenerator
import ccdc

class Runner(argparse.ArgumentParser):
    """Defines arguments, reads command line, performs the search and formats the report."""

    def __init__(self):
        """Defines arguments."""
        super(self.__class__, self).__init__(description=__doc__)
            'refcodes', nargs='+',
            help='CSD refcodes or files from which to read molecules.'
            '-t', '--threshold', type=float, default=0.9,
            help='Similarity threshold sought'
            '-m', '--maximum', type=int, default=ccdc.maxint32,
            help='Maximum number of structures to retrieve [all]'
            '-o', '--output', default='results.html',
            help='Where to write the HTML report [report.html]'

    def run(self):
        """Runs the task."""
        self.args = self.parse_args()
        csd = MoleculeReader('csd')
        self.mcss = MolecularDescriptors.MaximumCommonSubstructure()
        self.generator = DiagramGenerator()
        self.generator.settings.return_type = 'SVG'
        self.generator.settings.highlight_color = '#00C0C0'
        self.generator.settings.shrink_symbols = False
        self.output = open(self.args.output, 'w')
        self.output.write('<html><head><title>Similarity Report</title></head><body>\n')
        for r in self.args.refcodes:
            if os.path.exists(r):
                for m in MoleculeReader(r):

    def run_one(self, mol):
        """Runs one search and report generation."""
        search = SimilaritySearch(mol, self.args.threshold)
        hits =

        def one_hit(hit):
            """Format a single hit."""
            atoms, bonds =, hit.molecule)
            mol_atoms = list(zip(*atoms))[0]
            hit_atoms = list(zip(*atoms))[1]
            image1 = self.generator.image(mol, highlight_atoms=mol_atoms)
            image2 = self.generator.image(hit.molecule, highlight_atoms=hit_atoms)
            return '\n'.join([
                '<tr><th align="center">%s</th><th align="center">%s</th></tr>'
                % (mol.identifier, hit.identifier),
                % (image1.replace('Qt SVG Document', 'Diagram for %s' % mol.identifier),
                   image2.replace('Qt SVG Document', 'Diagram for %s' % hit.identifier))])

        text = [
            '<h2>Similarity to %s with threshold %.2f</h2>\n'
            % (mol.identifier, self.args.threshold),
            '\n'.join(one_hit(h) for h in hits if mol.identifier != h.identifier),

if __name__ == '__main__':

Hit processor example

This example shows how to use to analyse search hits as they are found during a search.

#!/usr/bin/env python
# This script can be used for any purpose without limitation subject to the
# conditions at
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
# 2017-08-31: created by the Cambridge Crystallographic Data Centre
"""    -   example of a search using a HitProcessor

    This example searches the CSD for instances of a given SMARTS pattern,
    collecting counts of matching conformations.
    At the end it will format a brief HTML report the refcode of the first
    structure containing a match with a unique conformation, and the number
    of matches sharing this conformation.
    Arguments may specify the RMSD tolerance to consider two
    conformations the same and a maximum number of structures to inspect.

import argparse
import operator

from ccdc import search, io
from ccdc.descriptors import MolecularDescriptors

csd = io.EntryReader('csd')

class Result(object):
    """Records all matching hits."""

    def __init__(self, hit):
        """First instance of this conformation in the CSD."""
        self.identifier = hit.identifier
        self.count = 1
        self.mol = hit.molecule
        self.atoms = hit.match_atoms()

    def matches(self, hit, tolerance):
        """Test whether a hit matches this conformation."""
        rmsd = MolecularDescriptors.rmsd(self.mol, hit.molecule, atoms=list(zip(
            self.atoms, hit.match_atoms())), overlay=True)
        return rmsd < tolerance

class Analysis(search.SubstructureSearch.HitProcessor):
    """Deals with each hit as it is found by the SubstructureSearch."""

    def __init__(self, tolerance, max_structures):
        self.tolerance = tolerance
        self.max_structures = max_structures
        self.results = []
        self.last_identifier = csd.identifier(min(len(csd) - 1, self.max_structures))

    def add_hit(self, hit):
        """Called from the searcher."""
        if self.last_identifier < hit.identifier:
        if len(self.results) != 0:
            for r in self.results:
                if r.matches(hit, self.tolerance):
                    r.count += 1

class Runner(argparse.ArgumentParser):
    """Defines arguments, runs the job."""

    def __init__(self):
        super(self.__class__, self).__init__(description=__doc__)
            'smarts', help='SMARTS pattern to analyse'
            '-t', '--title', help='Title for the report [smarts]'
            '-o', '--output-file', help='Output file name [smarts.html]'
            '-r', '--rmsd-tolerance', default=0.1, type=float,
            help='Maximum RMSD to be considered the same conformation'
            '-m', '--max-structures', default=len(csd), type=int,
            help='Maximum number of structures to process'

    def run(self):
        """Runs the job."""

        # process command-line arguments
        self.args = self.parse_args()
        if self.args.title is None:
            self.args.title = self.args.smarts
        if self.args.output_file is None:
            self.args.output_file = '%s.html' % self.args.title

        # run a search
        searcher = search.SubstructureSearch()
        analysis = Analysis(self.args.rmsd_tolerance, self.args.max_structures)
        analysis.results.sort(key=operator.attrgetter('count'), reverse=True)

        # output HTML
        with open(self.args.output_file, 'w') as writer:
            writer.write('<html>\n<head>\n<title>%s</title>\n</head>\n<body>\n' % self.args.title)
            writer.write('<h2>%s</h2>\n' % self.args.title)
            writer.write('<p>CSD matches for %s' % self.args.smarts)
            writer.write('<table border="1">\n')
            writer.write('<tr><th>Identifier</th><th># Matches</th></tr>\n')
            for r in analysis.results:
                writer.write('<tr><td>%s</td><td align="right">%d</td></tr>\n' %
                             (r.identifier, r.count))

if __name__ == '__main__':