Search examples¶

Torsion angle preferences in drug-like molecules¶

Note that this script makes use of functionality from the cookbook utility module.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
"""
Script to generate torsion angle histograms for common med. chem. fragments.

This script will create a report containing torsion histograms for ten common
medicinal chemistry fragments in an output directory.

The script was inspired by and the SMARTS pattern taken from the supplementary
material of:

"Torsion Angle Preferences in Druglike Chemical Space: A Comprehensive Guide"
Scharfer C.; Schulz-Gasch T.; Ehrilich H.-C.; Guba W.; Rarey M.; Stahl M.,
J. Med. Chem. (2013) 56, 2016-2028
"""

import argparse
import os
import sys

from utilities import Histogram
from ccdc.search import SMARTSSubstructure, SubstructureSearch


def main():
    # Define arguments and parse them
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-m', '--max-hits', type=int,
                        default=100,
                        help='Maximum number of entries to use (0=unlimited) [100]')

    parser.add_argument('-o', '--output-directory',
                        default='med_chem_torsions',
                        help='Output directory [med_chem_torsions]')

    args = parser.parse_args()
    if args.max_hits == 0:
        args.max_hits = sys.maxsize

    # Extract file names and create directory if necessary
    if not os.path.exists(args.output_directory):
        os.mkdir(args.output_directory)

    # Top ten SMARTS patterns
    patts = [
        '[*:1]~[CX3:2]!@[NX3:3]~[*:4]',
        '[*:1]~[NX3:2]!@[NX2:3]~[*:4]',
        '[*:1]~[NX3:2]!@[NX3:3]~[*:4]',
        '[cH1:1]~[a:2]([cH1])!@[a:3]([s,o,n:4])([cH0])',
        '[*:1]~[cX3:2]!@[NX2:3]~[*:4]',
        '[*:1]~[CX4:2]!@[NX2:3]~[*:4]',
        '[cH1:1]~[a:2]([cH1])!@[a:3]([cH0])[cH0:4]',
        '[*:1]~[OX2:2]!@[P:3]~[*:4]',
        '[*:1]~[CX4:2]!@[P:3]~[*:4]',
        '[*:1]~[CX4:2]!@[CX3:3]~[*:4]',
    ]
    # Do the search, and draw the histograms
    hist_filenames = []
    for index, pattern in enumerate(patts):
        substructure = SMARTSSubstructure(pattern)
        searcher = SubstructureSearch()
        searcher.add_substructure(substructure)
        searcher.add_torsion_angle_measurement(
            'Torsion',
            0, substructure.label_to_atom_index(1),
            0, substructure.label_to_atom_index(2),
            0, substructure.label_to_atom_index(3),
            0, substructure.label_to_atom_index(4))
        hits = searcher.search(max_hit_structures=args.max_hits,
                               max_hits_per_structure=1)
        hist_fn = os.path.join(args.output_directory, 'torsion_%02d.png' % index)
        hist_filenames.append(hist_fn)
        hist = Histogram(title=pattern, xlabel='Torsion', ylabel='Count',
                         file_name=hist_fn)
        torsions = [abs(hit.measurements['Torsion'])
                    for hit in hits if hit.measurements['Torsion'] is not None]
        hist.add_plot(torsions, color='blue', bins=18)
        hist.write()
        del hist

    # Write the html
    with open(os.path.join(args.output_directory, 'index.html'), 'w') as out_file:
        out_file.write('<html><head><title>Torsion angle preferences</title></head>\n')
        out_file.write('<body><h1>Druglike chemical space torsion angle preferences</h1>\n')
        for hist_fn in hist_filenames:
            out_file.write('<img src="%s"/>\n' % os.path.basename(hist_fn))
        out_file.write('</body></html>\n')


if __name__ == '__main__':
    main()

Similarity search with maximum common substructure highlighted¶

This example performs a similarity search, then generates diagrams of the matching structures showing the maximum common substructure if the probe molecule and the hit molecule.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

"""
    maximum_common_substructure.py  -   perform a similarity search, then
    format a web page containing the common substructures of the original
    structure and the returned similar structures.
"""

import argparse
import os
import sys

from ccdc.io import MoleculeReader
from ccdc.search import SimilaritySearch
from ccdc.descriptors import MolecularDescriptors
from ccdc.diagram import DiagramGenerator
import ccdc

class Runner(argparse.ArgumentParser):
    """Defines arguments, reads command line, performs the search and formats the report."""

    def __init__(self):
        """Defines arguments."""
        super(self.__class__, self).__init__(description=__doc__)
        self.add_argument(
            'refcodes', nargs='+',
            help='CSD refcodes or files from which to read molecules.'
        )
        self.add_argument(
            '-t', '--threshold', type=float, default=0.9,
            help='Similarity threshold sought'
        )
        self.add_argument(
            '-m', '--maximum', type=int, default=ccdc.maxint32,
            help='Maximum number of structures to retrieve [all]'
        )
        self.add_argument(
            '-o', '--output', default='results.html',
            help='Where to write the HTML report [report.html]'
        )

    def run(self):
        """Runs the task."""
        self.args = self.parse_args()
        csd = MoleculeReader('csd')
        self.mcss = MolecularDescriptors.MaximumCommonSubstructure()
        self.generator = DiagramGenerator()
        self.generator.settings.return_type = 'SVG'
        self.generator.settings.highlight_color = '#00C0C0'
        self.generator.settings.shrink_symbols = False
        self.output = open(self.args.output, 'w')
        self.output.write('<html><head><title>Similarity Report</title></head><body>\n')
        for r in self.args.refcodes:
            if os.path.exists(r):
                for m in MoleculeReader(r):
                    self.run_one(m)
            else:
                self.run_one(csd.molecule(r))
        self.output.write('</body></html>\n')
        self.output.close()

    def run_one(self, mol):
        """Runs one search and report generation."""
        search = SimilaritySearch(mol, self.args.threshold)
        hits = search.search(max_hit_structures=self.args.maximum)

        def one_hit(hit):
            """Format a single hit."""
            atoms, bonds = self.mcss.search(mol, hit.molecule)
            mol_atoms = list(zip(*atoms))[0]
            hit_atoms = list(zip(*atoms))[1]
            image1 = self.generator.image(mol, highlight_atoms=mol_atoms)
            image2 = self.generator.image(hit.molecule, highlight_atoms=hit_atoms)
            return '\n'.join([
                '<tr><th align="center">%s</th><th align="center">%s</th></tr>'
                % (mol.identifier, hit.identifier),
                '<tr><td>%s</td><td>%s</td></tr>'
                % (image1.replace('Qt SVG Document', 'Diagram for %s' % mol.identifier),
                   image2.replace('Qt SVG Document', 'Diagram for %s' % hit.identifier))])

        text = [
            '<h2>Similarity to %s with threshold %.2f</h2>\n'
            % (mol.identifier, self.args.threshold),
            '<table>',
            '\n'.join(one_hit(h) for h in hits if mol.identifier != h.identifier),
            '</table>'
        ]
        self.output.write('\n'.join(text))


if __name__ == '__main__':
    Runner().run()

Hit processor example¶

This example shows how to use ccdc.search.SubstructureSearch.HitProcessor to analyse search hits as they are found during a search.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-08-31: created by the Cambridge Crystallographic Data Centre
#
"""
    hit_processor.py    -   example of a search using a HitProcessor

    This example searches the CSD for instances of a given SMARTS pattern,
    collecting counts of matching conformations.
    At the end it will format a brief HTML report the refcode of the first
    structure containing a match with a unique conformation, and the number
    of matches sharing this conformation.
    Arguments may specify the RMSD tolerance to consider two
    conformations the same and a maximum number of structures to inspect.
"""

import argparse
import operator

from ccdc import search, io
from ccdc.descriptors import MolecularDescriptors

csd = io.EntryReader('csd')


class Result(object):
    """Records all matching hits."""

    def __init__(self, hit):
        """First instance of this conformation in the CSD."""
        self.identifier = hit.identifier
        self.count = 1
        self.mol = hit.molecule
        self.atoms = hit.match_atoms()

    def matches(self, hit, tolerance):
        """Test whether a hit matches this conformation."""
        rmsd = MolecularDescriptors.rmsd(self.mol, hit.molecule, atoms=list(zip(
            self.atoms, hit.match_atoms())), overlay=True)
        return rmsd < tolerance


class Analysis(search.SubstructureSearch.HitProcessor):
    """Deals with each hit as it is found by the SubstructureSearch."""

    def __init__(self, tolerance, max_structures):
        self.tolerance = tolerance
        self.max_structures = max_structures
        self.results = []
        self.last_identifier = csd.identifier(min(len(csd) - 1, self.max_structures))

    def add_hit(self, hit):
        """Called from the searcher."""
        if self.last_identifier < hit.identifier:
            self.cancel()
        if len(self.results) != 0:
            for r in self.results:
                if r.matches(hit, self.tolerance):
                    r.count += 1
                    return
        self.results.append(Result(hit))


class Runner(argparse.ArgumentParser):
    """Defines arguments, runs the job."""

    def __init__(self):
        super(self.__class__, self).__init__(description=__doc__)
        self.add_argument(
            'smarts', help='SMARTS pattern to analyse'
        )
        self.add_argument(
            '-t', '--title', help='Title for the report [smarts]'
        )
        self.add_argument(
            '-o', '--output-file', help='Output file name [smarts.html]'
        )
        self.add_argument(
            '-r', '--rmsd-tolerance', default=0.1, type=float,
            help='Maximum RMSD to be considered the same conformation'
        )
        self.add_argument(
            '-m', '--max-structures', default=len(csd), type=int,
            help='Maximum number of structures to process'
        )

    def run(self):
        """Runs the job."""

        # process command-line arguments
        self.args = self.parse_args()
        if self.args.title is None:
            self.args.title = self.args.smarts
        if self.args.output_file is None:
            self.args.output_file = '%s.html' % self.args.title

        # run a search
        searcher = search.SubstructureSearch()
        searcher.add_substructure(search.SMARTSSubstructure(self.args.smarts))
        analysis = Analysis(self.args.rmsd_tolerance, self.args.max_structures)
        analysis.search(searcher)
        analysis.results.sort(key=operator.attrgetter('count'), reverse=True)

        # output HTML
        with open(self.args.output_file, 'w') as writer:
            writer.write('<html>\n<head>\n<title>%s</title>\n</head>\n<body>\n' % self.args.title)
            writer.write('<h2>%s</h2>\n' % self.args.title)
            writer.write('<p>CSD matches for %s' % self.args.smarts)
            writer.write('<table border="1">\n')
            writer.write('<tr><th>Identifier</th><th># Matches</th></tr>\n')
            for r in analysis.results:
                writer.write('<tr><td>%s</td><td align="right">%d</td></tr>\n' %
                             (r.identifier, r.count))
            writer.write('</table>\n')
            writer.write('</body>\n</html>')


if __name__ == '__main__':
    Runner().run()

Protein-ligand search¶

This example shows how to perform protein-ligand searching on an sqlmol2 format database.

#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2019-04-26: created by the Cambridge Crystallographic Data Centre
#
"""
    protein_ligand_search.py    -   examples of protein ligand searches.

    All arguments may be specified by an unambiguous prefix of its long form.

    Each of the components of a protein-ligand complex may have one or more SMARTS
    patterns defined by:
        -l --ligand
        -p --protein
        -c --cofactor
        -w --water
        -m --metal
        --nucleotide
    These may be repeated.  Each separate argument adds the appropriate substructure
    to the SubstructureSearch. The SMARTS definition should be enclosed in single
    quotes to prevent command shell expansion of special characters. Substructure
    indices for use in geometric definitions and measurements will be in the above
    order, i.e. ligand substructures start at 0, protein substructures at the length
    of ligand substructures and so on.

    Geometric objects may be defined by:
        -P --plane
        -V --vector
        -C --centroid
    These will be followed by a quoted string of the form
    '(sub_index, at_index) (sub_index, at_index) ...' or the name of a centroid.
    Geometric objects will be defined sequentially, labelled PLANE1, VEC1, CENT1 etc.
    and added to the SubstructureSearch.

    Measurements and constraints may be defined by:
        -d --distances
        -a --angles
        -t --torsions
        --vector-angles
        --plane-angles
        --point-plane-distances
        --vector-plane-angles
    These will be followed by a quoted string containing the appropriate number of
    (sub, at) pairs or references to geometric objects, optionally followed by the
    constraint range.  If the constraint range is not present the appropriate
    measurement will be added to the SubstructureSearch, otherwise the appropriate
    constraint will be added.  The constraint range will be a pair (min, max)
    where min and max are floats.

    The maximum number of hits and the maximum number of hits per structure may be
    specified by:
        -m --max-hit-structures
        -M --max-hits-per-structure

    An output directory may be specified by:
        -o --output-directory

    The prefix of the output files may be specified by:
        --prefix

    A location of an csdsqlx file may be specified by:
        -D --database

    A feature database for further searches will be written if
        -f --feature-database
    is present in the arguments.

    Into the specified output directory will be written a .csv file containing the
    labels of matched atoms and values for the defined measurements and constraints.
    Also a .mol2 file containing the hit structures will be written.

    An example:
    python protein_ligand_search.py -D /path/to/testdb.csdsqlx \ # A small database for testing
    -o /tmp/results/ \                                           # Where to put the output
    -l "C-NH" \                                                  # Ligand SMARTS
    -p "c(~o)~o" \                                               # Protein SMARTS
    -d "(0, 0) (1, 1) (-5, 0)" \                                 # Distance constraint
    --dis "(0, 0) (1, 2)" \                                      # Distance measurement
    -P "(1, 0) (1, 1) (1, 2)" "(0, 0) (0, 1) (0, 2)" \           # A pair of planes
    --plane-angle "PLANE1 PLANE2 (0, 25)" \                      # Plane-angle constraint
    --pre test                                                   # Prefix of output files
"""

import os
import argparse
import collections
import csv
import string
import io as sys_io
import codecs
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from ccdc import search, io, utilities, diagram, pharmacophore
from ccdc.descriptors import MolecularDescriptors


class ParseString(argparse.Action):
    '''Stores the string argument into the appropriate place in the args namespace.'''

    def __call__(self, parser, namespace, values, option_string):
        '''Works out where to put the concatenated value.'''
        vals = [''.join(l) for l in values]
        if getattr(namespace, self.dest) is None:
            setattr(namespace, self.dest, vals)
        else:
            getattr(namespace, self.dest).extend(vals)


class Runner(argparse.ArgumentParser):
    '''Sets up and runs the job.'''

    def __init__(self):
        """ Defines and parses the arguments to the job. """
        super(self.__class__, self).__init__(description='Perform a protein ligand search.',
                                             formatter_class=argparse.ArgumentDefaultsHelpFormatter)

        smarts_args = self.add_argument_group(title='SMARTS search patterns')
        smarts_args.add_argument(
            '-l', '--ligand-smarts',
            nargs='*', type=list, default=[], action=ParseString,
            help='SMARTS patterns to be searched for in a ligand.'
        )
        smarts_args.add_argument(
            '-p', '--protein-smarts',
            nargs='*', type=list, default=[], action=ParseString,
            help='SMARTS patterns to be searched for in a protein.'
        )
        smarts_args.add_argument(
            '-c', '--cofactor-smarts',
            nargs='*', type=list, default=[], action=ParseString,
            help='SMARTS patterns to be searched for in a cofactor.'
        )
        smarts_args.add_argument(
            '-w', '--water-smarts',
            nargs='*', type=list, default=[], action=ParseString,
            help='SMARTS patterns to be searched for in a water.'
        )
        smarts_args.add_argument(
            '-m', '--metal-smarts',
            nargs='*', type=list, default=[], action=ParseString,
            help='SMARTS patterns to be searched for in a metal.'
        )
        smarts_args.add_argument(
            '--nucleotide-smarts',
            nargs='*', type=list, default=[], action=ParseString,
            help='SMARTS patterns to be searched for in a nucleotide.'
        )

        geometry_args = self.add_argument_group(
            title='Geometric objects',
            description='Any points for parameters in this group can be given either as ' +
            '(substructure_index, atom_index) or the name of a Centroid, e.g. CENT1.')
        geometry_args.add_argument(
            '-P', '--planes',
            nargs='*', type=list, default=[], action=ParseString,
            help='Planes as three points, e.g. "(0, 0) (0, 1) (0, 2)" or "CENT1 (0, 1) (0, 2)".'
        )
        geometry_args.add_argument(
            '-C', '--centroids',
            nargs='*', type=list, default=[], action=ParseString,
            help='Centroids as one point, e.g. "(1, 0)".'
        )
        geometry_args.add_argument(
            '-V', '--vectors',
            nargs='*', type=list, default=[], action=ParseString,
            help='Vectors as two points, e.g. "(0, 0) (0, 1)" or "(1, 0) CENT2".'
        )

        constraints_args = self.add_argument_group(
            title='Measurements and Constraints',
            description='Any points for parameters in this group can be given either as ' +
            '(substructure_index, atom_index) or the name of a Centroid, e.g. CENT1. ' +
            'Each parameter ends in an additional optional range to make a constraint; ' +
            'if no range is given a measurement will be made instead.')
        constraints_args.add_argument(
            '-d', '--distances',
            nargs='*', type=list, default=[], action=ParseString,
            help='Distances as two points followed by a distance range for constraints; ' +
                 'e.g. "(0, 0) (1, 0)" or (0, 0) CENT1 (10, 30)".'
        )
        constraints_args.add_argument(
            '-a', '--angles',
            nargs='*', type=list, default=[], action=ParseString,
            help='Angles as three points followed by an angle range for constraints; ' +
                 'e.g. "(0, 0) (1, 0) (0, 1)" or "(0, 0) (1, 0) CENT1 (0, 25)".'
        )
        constraints_args.add_argument(
            '-t', '--torsions',
            nargs='*', type=list, default=[], action=ParseString,
            help='Torsion as four points, followed by an angle range for constraints; ' +
                 'e.g. "(1, 0) (1, 1) (0, 1) (0, 0)" or "(1, 0) CENT1 (0, 1) (0, 0) (0, 25)".'
        )
        constraints_args.add_argument(
            '--vector-angles',
            nargs='*', type=list, default=[], action=ParseString,
            help='Vector angles as two vectors, followed by an angle range for constraints; ' +
                 'e.g. "VEC1 VEC2" or "VEC1 VEC2 (0, 25)".'
        )
        constraints_args.add_argument(
            '--plane-angles',
            nargs='*', type=list, default=[], action=ParseString,
            help='Plane angles as two planes, followed by an angle range for constraints; ' +
                 'e.g. "PLANE1 PLANE2" or "PLANE1 PLANE2 (0, 25)".'
        )
        constraints_args.add_argument(
            '--point-plane-distances',
            nargs='*', type=list, default=[], action=ParseString,
            help='Point-plane distances as a point and a plane + a dist. range for constraints; ' +
                 'e.g. "(0, 1) PLANE1" or "CENT1 PLANE1 (0, 25)".'
        )
        constraints_args.add_argument(
            '--vector-plane-angles',
            nargs='*', type=list, default=[], action=ParseString,
            help='Vector-plane angles as a vector and a plane + an angle range for constraints; ' +
                 'e.g. "VEC1 PLANE1" or "VEC1 PLANE1 (10, 30)".'
        )

        hits_args = self.add_argument_group(title='Result limitations')
        hits_args.add_argument(
            '-n', '--max-hit-structures', default=0, type=int,
            help='Maximum number of structures to find.'
        )
        hits_args.add_argument(
            '-N', '--max-hits-per-structure', default=0, type=int,
            help='Maximum number of hits per structure.'
        )

        output_args = self.add_argument_group(title='Output files')
        output_args.add_argument(
            '-o', '--output-directory', default='./',
            help='Where to write output files.'
        )
        output_args.add_argument(
            '--prefix', default='hits',
            help='Prefix for the output files.'
        )
        output_args.add_argument(
            '-f', '--feature-database', default=False, action='store_true',
            help='Whether or not to write a feature database from the hits.'
        )

        database_args = self.add_argument_group(title='Databases')
        database_args.add_argument(
            '-D', '--database',
            default=os.path.join(os.path.dirname(
                pharmacophore.Pharmacophore.default_feature_database_location()),
                'pdb_crossminer.csdsqlx'),
            help='Location of the csdsqlx database to search.'
        )

        self.args = self.parse_args()
        if not os.path.exists(self.args.output_directory):
            os.makedirs(self.args.output_directory)

    def _parse_point(self, s):
        """Extract a name or a pair of indices from the string."""
        if s[0] in string.ascii_letters:
            if ' ' in s:
                x = s[:s.index(' ')]
                s = s[s.index(' ') + 1:].strip()
                return x, s
            else:
                x = s
                s = ''
                return x, s
        elif s[0] == '(':
            part1 = s[1:s.index(',')].strip()
            part2 = s[s.index(',') + 1:s.index(')')].strip()
            s = s[s.index(')') + 1:].strip()
            try:
                p1 = int(part1)
                p2 = int(part2)
            except TypeError:
                p1 = float(part1)
                p2 = float(part2)
            return (p1, p2), s
        else:
            self.error('Malformed argument %s' % s)

    def _parse_geom(self, label, d, required, obj=None):
        """Extracts a geometric object, a constraint or a measurement and adds to the query."""
        # print('Parsing', label, d, obj)
        s = d.strip()
        comps = []
        while s:
            try:
                g, s = self._parse_point(s)
            except ValueError as exc:
                raise ValueError('Cannot parse parameter %s %s: %s' % (label, s, exc))
            comps.append(g)

        if obj == 'plane':
            self.query.add_plane(label, *comps)
        elif obj == 'centroid':
            self.query.add_centroid(label, *comps)
        elif obj == 'vector':
            self.query.add_vector(label, *comps)
        elif obj is not None:
            raise NotImplementedError('Parameter type %s not implemented!' % obj)
        elif len(comps) == required + 1:
            rng = comps.pop()
            if label.startswith('DIST'):
                self.query.add_distance_constraint(
                    label, comps[0], comps[1], rng, vdw_corrected=True, type='any')
            elif label.startswith('ANG'):
                self.query.add_angle_constraint(label, comps[0], comps[1], comps[2], rng)
            elif label.startswith('TOR'):
                self.query.add_torsion_angle_constraint(
                    label, comps[0], comps[1], comps[2], comps[3], rng)
            elif label.startswith('VA'):
                self.query.add_vector_angle_constraint(label, comps[0], comps[1], rng)
            elif label.startswith('PA'):
                self.query.add_plane_angle_constraint(label, comps[0], comps[1], rng)
            elif label.startswith('PP'):
                self.query.add_point_plane_distance_constraint(label, comps[0], comps[1], rng)
            elif label.startswith('VP'):
                self.query.add_vector_plane_angle_constraint(label, comps[0], comps[1], rng)
        elif len(comps) == required:
            if label.startswith('DIST'):
                self.query.add_distance_measurement(label, comps[0], comps[1])
            elif label.startswith('ANG'):
                self.query.add_angle_measurement(label, comps[0], comps[1], comps[2])
            elif label.startswith('TOR'):
                self.query.add_torsion_angle_measurement(
                    label, comps[0], comps[1], comps[2], comps[3])
            elif label.startswith('VA'):
                self.query.add_vector_angle_measurement(label, comps[0], comps[1])
            elif label.startswith('PA'):
                self.query.add_plane_angle_measurement(label, comps[0], comps[1])
            elif label.startswith('PP'):
                self.query.add_point_plane_distance_measurement(label, comps[0], comps[1])
            elif label.startswith('VP'):
                self.query.add_vector_plane_angle_measurement(label, comps[0], comps[1])
        else:
            raise NotImplementedError('Cannot parse parameter "%s".' % d)

    def _make_substructures(self, smarts, tag):
        """Makes substructures from the SMARTS definitions."""
        all_labels = []
        for i, s in enumerate(smarts):
            _inx_dict = collections.defaultdict(int)
            sub = search.SMARTSSubstructure(s)
            self.query.add_substructure(sub)
            labels = []
            for a in sub.atoms:
                _inx_dict[self._atom_tag(a)] += 1
                labels.append('%s_%s_%d' % (tag, self._atom_tag(a), _inx_dict[self._atom_tag(a)]))
                a.add_protein_atom_type_constraint(tag)
            all_labels.append(labels)
        return all_labels

    def build_query(self):
        """Builds the whole query."""
        self.query = search.SubstructureSearch()
        self.query.settings.max_hit_structures = self.args.max_hit_structures
        self.query.settings.max_hits_per_structure = self.args.max_hits_per_structure
        self.atom_tags = []
        self.atom_tags.extend(self._make_substructures(self.args.ligand_smarts, 'Ligand'))
        self.atom_tags.extend(self._make_substructures(self.args.protein_smarts, 'Amino'))
        self.atom_tags.extend(self._make_substructures(self.args.cofactor_smarts, 'Cofactor'))
        self.atom_tags.extend(self._make_substructures(self.args.water_smarts, 'Water'))
        self.atom_tags.extend(self._make_substructures(self.args.metal_smarts, 'Metal'))
        self.atom_tags.extend(self._make_substructures(self.args.nucleotide_smarts, 'Nucleotide'))
        for i, d in enumerate(self.args.planes):
            self._parse_geom('PLANE%d' % (i + 1), d, -1, obj='plane')
        for i, d in enumerate(self.args.centroids):
            self._parse_geom('CENT%d' % (i + 1), d, -1, obj='centroid')
        for i, d in enumerate(self.args.vectors):
            self._parse_geom('VEC%d' % (i + 1), d, 2, obj='vector')
        for i, d in enumerate(self.args.distances):
            self._parse_geom('DIST%d' % (i + 1), d, 2)
        for i, d in enumerate(self.args.angles):
            self._parse_geom('ANG%d' % (i + 1), d, 3)
        for i, d in enumerate(self.args.torsions):
            self._parse_geom('TOR%d' % (i + 1), d, 4)
        for i, d in enumerate(self.args.vector_angles):
            self._parse_geom('VA%d' % (i + 1), d, 2)
        for i, d in enumerate(self.args.plane_angles):
            self._parse_geom('PA%d' % (i + 1), d, 2)
        for i, d in enumerate(self.args.point_plane_distances):
            self._parse_geom('PP%d' % (i + 1), d, 2)
        for i, d in enumerate(self.args.vector_plane_angles):
            self._parse_geom('VP%d' % (i + 1), d, 2)

    @staticmethod
    def _atom_tag(a):
        """Makes a string representation of a substructure atom."""
        s = str(a)
        return s[len('QueryAtom('):s.rindex(')')]

    def get_field_names(self):
        """Column names for the csv file."""
        names = ['Identifier']
        for tags in self.atom_tags:
            names.extend(tags)
        for c in self.query.constraints.keys():
            names.append(c)
        for c in self.query.contacts.keys():
            names.append(c)
        for m in self.query.measurements.keys():
            names.append(m)
        for g in self.query.geometric_objects.keys():
            names.append(g)
        return names

    class HitProcessor(search.SubstructureSearch.HitProcessor):
        """Processes hits as they are found."""

        def __init__(self, query, atom_tags, csv_file, mol2_file, html_writer, unique_writer):
            self.query = query
            self.atom_tags = atom_tags
            self.csv_file = csv_file
            self.mol2_file = mol2_file
            self.html_file = html_writer
            self.unique_writer = unique_writer
            self.superimpose_onto = None
            self.hit_ct = 0
            self.last_id = None
            self.measurements = collections.defaultdict(list)
            self.hits_per_structure = 0

        def add_hit(self, hit):
            """Adds a hit to the csv and mol2 files."""
            print(hit.identifier)
            if self.superimpose_onto is None:
                self.superimpose_onto = hit.molecule
                self.superimpose_atoms = hit.match_atoms()
                mol = hit.molecule
            else:
                mol, rmsd, _, _ = MolecularDescriptors.overlay_rmsds_and_transformation(
                    self.superimpose_onto, hit.molecule,
                    atoms=zip(self.superimpose_atoms, hit.match_atoms()),
                    with_symmetry=False)
            if self.query.settings.max_hit_structures and\
                    self.hit_ct >= self.query.settings.max_hit_structures:
                self.cancel()
                return
            if self.last_id != hit.identifier:
                self.hit_ct += 1
                self.last_id = hit.identifier
                self.unique_writer.write(mol)
                self.hits_per_structure = 1
            else:
                self.hits_per_structure += 1
                if self.query.settings.max_hits_per_structure and\
                        self.query.settings.max_hits_per_structure >= self.hits_per_structure:
                    return
            self.mol2_file.write(mol)

            for k, v in hit.measurements.items():
                self.measurements[k].append(v)
            for k, v in hit.constraints.items():
                self.measurements[k].append(v)
            d = {
                'Identifier': hit.identifier,
            }
            for sub, tags in zip(hit.match_substructures(), self.atom_tags):
                d.update({
                    t: '%s:%s:%s' % (a.chain_label, a.residue_label, a.label)
                    for a, t in zip(sub.atoms, tags)
                })
            for k, v in hit.measurements.items():
                d[k] = round(v, 3)
            for k, v in hit.constraints.items():
                d[k] = round(v, 3)
            for k, v in hit.geometric_objects.items():
                d[k] = str(v)
            self.csv_file.writerow(d)

    def smarts_table(self, smarts, tag):
        diagram_gen = diagram.DiagramGenerator()
        diagram_gen.settings.image_width = diagram_gen.image_height = 100
        diagram_gen.settings.return_type = 'SVG'

        def smarts_diag(s):
            sub = search.SMARTSSubstructure(s)
            img = diagram_gen.image(sub)
            return img.replace('Qt SVG Document', 'Diagram for %s' % s)

        table = [
            '<table border="1">',
            '<tr><th colspan="%d">%s smarts</th></tr>' % (len(smarts), tag),
            '<tr>',
            ' '.join('<td>%s</td>' % s for s in smarts),
            '</tr>',
            '<tr>',
            '\n'.join('<td>%s</td>' % smarts_diag(s) for s in smarts),
            '</tr>',
            '</table>',
        ]
        return '\n'.join(table)

    def geometric_object_table(self, objects, tag, name):
        table = [
            '<table border="1">',
            '<tr><th colspan="2">%s</th></tr>' % tag,
            '\n'.join('<tr><td>%s%d</td><td>%s</td></tr>' % (name, i + 1, obj)
                      for i, obj in enumerate(objects)),
            '</table>'
        ]
        return '\n'.join(table)

    def one_histogram(self, tag, data):
        plt.hist(data)
        plt.title(tag)
        svg = sys_io.StringIO()
        plt.savefig(svg, format='svg')
        plt.close('all')
        return svg.getvalue()

    def write_html_body(self, hp, html):
        lines = [
            '<html><head>',
            '<title>%s %s</title>' % (self.args.prefix, 'Search'),
            '</head>',
            '<body>',
            '<h1>%s %s</h1>' % (self.args.prefix, 'Search'),
        ]
        html.write('\n'.join(lines))
        html.write(self.smarts_table(self.args.ligand_smarts, 'Ligand'))
        html.write(self.smarts_table(self.args.protein_smarts, 'Protein'))
        html.write(self.smarts_table(self.args.cofactor_smarts, 'Cofactor'))
        html.write(self.smarts_table(self.args.water_smarts, 'Water'))
        html.write(self.smarts_table(self.args.metal_smarts, 'Metal'))
        html.write(self.smarts_table(self.args.nucleotide_smarts, 'Nucleotide'))
        html.write(self.geometric_object_table(self.args.planes, 'Planes', 'PLANE'))
        html.write(self.geometric_object_table(self.args.centroids, 'Centroids', 'CENT'))
        html.write(self.geometric_object_table(self.args.vectors, 'Vectors', 'VEC'))
        html.write(self.geometric_object_table(self.args.distances, 'Distances', 'DIST'))
        html.write(self.geometric_object_table(self.args.angles, 'Angles', 'ANG'))
        html.write(self.geometric_object_table(
            self.args.torsions, 'torsions', 'TOR'))
        html.write(self.geometric_object_table(self.args.vector_angles, 'Vector Angles', 'VA'))
        html.write(self.geometric_object_table(self.args.plane_angles, 'Plane Angles', 'PA'))
        html.write(self.geometric_object_table(
            self.args.point_plane_distances, 'Point Plane Distances', 'PP'))
        html.write(self.geometric_object_table(
            self.args.vector_plane_angles, 'Vector Plane Angles', 'VP'))
        lines = [
            '</body>',
            '</html>',
        ]
        html.write('\n'.join(lines))
        for i, _ in enumerate(self.args.distances):
            label = 'DIST%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        for i, _ in enumerate(self.args.angles):
            label = 'ANG%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        for i, _ in enumerate(self.args.torsions):
            label = 'TOR%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        for i, _ in enumerate(self.args.vector_angles):
            label = 'VA%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        for i, _ in enumerate(self.args.plane_angles):
            label = 'PA%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        for i, _ in enumerate(self.args.point_plane_distances):
            label = 'PP%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        for i, _ in enumerate(self.args.vector_plane_angles):
            label = 'VP%d' % (i + 1)
            lines.append(self.one_histogram(label, hp.measurements[label]))
        html.write('\n'.join(lines))

    def run(self):
        """Runs with provided arguments."""
        self.database = io.EntryReader(self.args.database)
        self.build_query()
        field_names = self.get_field_names()
        unique_file_name = os.path.join(self.args.output_directory,
                                        'unique_%s.mol2' % self.args.prefix)
        if os.path.exists(unique_file_name):
            os.unlink(unique_file_name)
        with open(os.path.join(self.args.output_directory, '%s.csv' % self.args.prefix), 'w') as w,\
                io.MoleculeWriter(os.path.join(self.args.output_directory,
                                               '%s.mol2' % self.args.prefix)) as mol_writer, \
                codecs.open(os.path.join(self.args.output_directory, '%s.html' % self.args.prefix), 'w', encoding="utf-8")\
                as html_writer, \
                io.MoleculeWriter(unique_file_name) as unique_writer:
            csv_file = csv.DictWriter(w, field_names)
            csv_file.writeheader()
            hp = self.HitProcessor(self.query, self.atom_tags, csv_file,
                                   mol_writer, html_writer, unique_writer)
            hp.search(self.query, self.database)
            self.write_html_body(hp, html_writer)

        if self.args.feature_database:
            prot_info = pharmacophore.Pharmacophore.FeatureDatabase.DatabaseInfo(
                os.path.join(self.args.output_directory, 'unique_%s.mol2' %
                             self.args.prefix), 0, utilities.Colour(255, 0, 128, 292)
            )
            prot_sd = pharmacophore.Pharmacophore.FeatureDatabase.Creator.StructureDatabase(
                prot_info, use_crystal_symmetry=False,
                structure_database_path=os.path.join(self.args.output_directory,
                                                     '%s.csdsqlx' % self.args.prefix)
            )
            creator = pharmacophore.Pharmacophore.FeatureDatabase.Creator()
            db = creator.create((prot_sd,))
            file_name = os.path.join(self.args.output_directory, '%s.feat' % self.args.prefix)
            if os.path.exists(file_name):
                os.unlink(file_name)
            db.write(file_name)


if __name__ == '__main__':
    runner = Runner()
    runner.run()