Geometry analysis examples

Generating a HTML report

Note that this script makes use of functionality from the cookbook utility module.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

"""
Generate a HTML report along with histograms for any unusual geometric
features.

This code uses matplotlib (https://matplotlib.org/)  for result presentation
which may need installing before use.

For more help use the -h argument.

"""

import argparse
import os
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plot

from ccdc.io import MoleculeReader
from ccdc.conformer import GeometryAnalyser
from utilities import write_html_table, sanitise_file_name


def write_histogram(file_name, data, ref_val, settings):
    """Write a histogram to a file.

    This function is called by the functions:
      - write_bond_distance_histogram
      - write_valence_angle_histogram
      - write_torsion_angle_histogram
      - write_ring_rmsd_histogram

    """
    # Work out where the histogram bins should be placed on the histogram.
    xs = [settings['low'] + i * settings['bin_width'] for i in range(len(data))]

    # Set up the basic plot.
    fig = plot.figure()
    ax = fig.add_subplot(111)
    _ = ax.bar(xs, data, settings['bin_width'], color='g')
    ax.set_xlim(settings['low'], settings['high'])
    ax.set_xlabel(settings['xlabel'])
    ax.set_ylabel(settings['ylabel'])
    ax.set_title(settings['title'])

    # Add the reference value line.
    ax.add_line(plot.Line2D((ref_val, ref_val), plot.ylim(), color='r'))

    # Add the text describing the query value next to the reference line.
    # Finding a good position for this text requires a little bit of
    # manipulation. If the reference value is to the right of the middle of
    # the plot we place the text on the left hand side of the reference line.
    # Otherwise on the right hand side. We also create some relative offsets
    # so that the text does not cross the frame of the plot or the reference
    # line.
    ref_val_str = 'Value in query: %.2f' % ref_val
    middle = (settings['low'] + settings['high']) / 2.0
    ymin, ymax = plot.ylim()
    xoffset = (settings['high'] - settings['low']) / 100
    yoffset = (ymin - ymax) / 80
    ref_y = ymax + yoffset
    if ref_val > middle:
        ref_x = ref_val - xoffset
        ha = 'right'
    else:
        ref_x = ref_val + xoffset
        ha = 'left'
    ax.text(ref_x, ref_y, ref_val_str, color='r', ha=ha, va='top')
    plot.savefig(file_name)
    plot.close(fig)


def write_bond_distance_histogram(bond, file_name):
    """Write bond distance histogram to a file."""
    settings = {}
    settings['low'] = min(bond.value, bond.minimum) - 0.1
    settings['high'] = max(bond.value, bond.maximum) + 0.1
    diff = settings['high'] - settings['low']
    settings['bin_width'] = diff / 40.0
    settings['title'] = 'Bond lengths for %s' % bond.fragment_label
    settings['xlabel'] = 'Bond length'
    settings['ylabel'] = 'Number of hits'
    data = bond.histogram(settings['bin_width'],
                          settings['low'],
                          settings['high'])
    write_histogram(file_name, data, bond.value, settings)


def write_valence_angle_histogram(angle, file_name):
    """Write bond angle histogram to a file."""
    settings = {}
    settings['low'] = min(angle.value, angle.minimum) - 1
    settings['high'] = max(angle.value, angle.maximum) + 1
    diff = settings['high'] - settings['low']
    settings['bin_width'] = diff / 40.0
    settings['title'] = 'Valence angles for %s' % angle.fragment_label
    settings['xlabel'] = 'Valence angle'
    settings['ylabel'] = 'Number of hits'
    data = angle.histogram(settings['bin_width'],
                           settings['low'],
                           settings['high'])
    write_histogram(file_name, data, angle.value, settings)


def write_torsion_angle_histogram(torsion, file_name):
    """Write torsion angle histogram to a file."""
    settings = {}
    settings['low'] = 0
    settings['high'] = 180
    settings['bin_width'] = 5.0
    settings['title'] = 'Torsion angles for %s' % torsion.fragment_label
    settings['xlabel'] = 'Torsion angle'
    settings['ylabel'] = 'Number of hits'
    data = torsion.histogram(settings['bin_width'],
                             settings['low'],
                             settings['high'])
    write_histogram(file_name, data, abs(torsion.value), settings)


def write_ring_rmsd_histogram(ring, file_name):
    """Write ring RMSD histogram to a file."""
    settings = {}
    settings['low'] = -1.0
    settings['high'] = ring.maximum
    diff = settings['high'] - settings['low']
    settings['bin_width'] = diff / 40.0
    settings['title'] = 'Ring analysis for %s' % ring.fragment_label
    settings['xlabel'] = 'RMSD of torsions from query ring'
    settings['ylabel'] = 'Number of hits'
    data = ring.histogram(settings['bin_width'],
                          settings['low'],
                          settings['high'])
    write_histogram(file_name, data, 0.0, settings)


HEADERS = (
    'Fragment',
    'Classification',
    'Value',
    'Count',
    'Mean',
    'Median',
    'Z Score',
    'Dmin',
)

TOR_HEADERS = (
    'Fragment',
    'Classification',
    'Value',
    'Count',
    'Mean',
    'Median',
    'Local density',
    'Dmin',
)

RING_HEADERS = (
    'Fragment',
    'Classification',
    'Value',
    'Count',
    'Mean',
    'Median',
    'Dmin',
)

def get_row(frag):
    """Return table data row."""
    if frag.type == 'torsion':
        z_score = frag.local_density
    elif frag.type == 'ring':
        return (
            frag.fragment_label,
            frag.classification,
            '%.3f' % frag.value,
            '%i' % frag.nhits,
            '%.3f' % frag.mean,
            '%.3f' % frag.median,
            '%.3f' % frag.d_min,
        )
    else:
        z_score = frag.z_score
    return (
        frag.fragment_label,
        frag.classification,
        '%.3f' % frag.value,
        '%i' % frag.nhits,
        '%.3f' % frag.mean,
        '%.3f' % frag.median,
        '%.3f' % z_score,
        '%.3f' % frag.d_min,
    )


def write_histograms(frag_list, out_dir, mol_name, report):
    """Write histogram figures and add image links to report."""

    # sanitise molecule name
    mol_name = sanitise_file_name(mol_name)

    for frag in frag_list:
        suffix = frag.fragment_label
        file_name = '%s_%s_%s.png' % (mol_name, frag.type, suffix)
        file_path = os.path.join(out_dir, file_name)
        if frag.nhits < 2:
            print(frag.type, suffix, frag.classification, frag.nhits, end=' ')
            print('in distribution, so not writing a histogram')
            continue
        if frag.type == 'bond':
            write_bond_distance_histogram(frag, file_path)
        elif frag.type == 'angle':
            write_valence_angle_histogram(frag, file_path)
        elif frag.type == 'torsion':
            write_torsion_angle_histogram(frag, file_path)
        elif frag.type == 'ring':
            write_ring_rmsd_histogram(frag, file_path)

        for line in ['<h3> %s for %s </h3>' % (frag.classification, suffix),
                     '<img src="%s" />' % file_name]:
            print(line, file=report)


def write_report(mol, out_dir):
    """Write histograms and HTML report."""
    file_name = '%s_report.html' % (sanitise_file_name(mol.identifier))

    with open(os.path.join(out_dir, file_name), 'w') as report:
        for line in ['<html>', '<head>', '<title>Geometry report for %s</title>' % mol.identifier,
                     '</head>', '<body>', '<h1>Geometry report for %s</h1>\n' % mol.identifier]:
            print(line, file=report)

        print('<h2>Bond distances</h2>', file=report)
        write_html_table(HEADERS,
                         [get_row(bond) for bond in mol.analysed_bonds],
                         stream=report,
                         border='1px', id='bond')
        write_histograms([b for b in mol.analysed_bonds if b.unusual],
                         out_dir,
                         mol.identifier,
                         report)

        print('<h2>Valence angles</h2>', file=report)
        write_html_table(HEADERS,
                         [get_row(angle) for angle in mol.analysed_angles],
                         stream=report,
                         border='1px', id='angle')
        write_histograms([a for a in mol.analysed_angles if a.unusual],
                         out_dir,
                         mol.identifier,
                         report)

        print('<h2>Torsion angles</h2>', file=report)
        write_html_table(TOR_HEADERS,
                         [get_row(torsion) for torsion in mol.analysed_torsions],
                         stream=report,
                         border='1px', id='torsion')
        write_histograms([t for t in mol.analysed_torsions if t.unusual],
                         out_dir,
                         mol.identifier,
                         report)

        # print('<h3>Ring RMSDs</h3>', file=report)
        # write_html_table(RING_HEADERS,
        #                [get_row(ring) for ring in mol.analysed_rings],
        #                stream=report,
        #                border=1)
        # write_histograms(mol.analysed_rings.unusual(), out_dir, mol.identifier, report)

        print('</body></html>', file=report)


def main(file_name, out_dir):
    reader = MoleculeReader(file_name)
    engine = GeometryAnalyser()
    engine.settings.summary()
    for mol in reader:
        mol = engine.analyse_molecule(mol)
        write_report(mol, out_dir)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input_file', help='Input file of structures to generate reports for.')
    parser.add_argument('output_directory', help='Directory to write HTML reports to.')
    args = parser.parse_args()

    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)
    main(args.input_file, args.output_directory)

Extracting the CSD hits of the analysis distributions

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

"""
Analyse the torsion angles of an input molecule and for any unusual torsions
write out the CSD entries that make up the analysis distribution.

This script will take an input molecule in Mol2 or SDF file format and perform
a geometry analysis on it. For every unusual torsion it will then write out a
multi-mol2 or SDF file containing the CSD entries that make up the analysis
distribution for that particular torsion. Furthermore, for each unusual
torsion a csv file is written to the output directory. This csv file contains
information on the atoms matched in the hit CSD entries.

For more help use the -h argument.

"""

import argparse
from ccdc.io import MoleculeReader, MoleculeWriter
from ccdc.conformer import GeometryAnalyser
import os.path

CSV_HEADER = '"Refcode","AtId1","AtId2","AtId3","AtId4","Torsion value"\n'


def create_analyser():
    "Return geometry analysis engine with the bond, angle and ring analysis turned off."
    #   Create a geometry analysis instance
    engine = GeometryAnalyser()

    #   Turn off unwanted analyses
    #   Separate instances in settings for each geometry query
    engine.settings.bond.analyse = False
    engine.settings.angle.analyse = False
    engine.settings.ring.analyse = False
    return engine


def main(input_file, output_directory, output_file_format):
    "Analyse the molecule and write out the hits for the unusual torsions."

    engine = create_analyser()

    #   Iterate over molecules in the input file.
    mol_reader = MoleculeReader(input_file)

    for mol in mol_reader:

        print('Analysing all torsions in %s...' % mol.identifier)
        all_torsions = engine.analyse_molecule(mol).analysed_torsions
        unusual_torsions = [tor for tor in all_torsions if tor.unusual]
        print('Found %s unusual torsions.' % len(unusual_torsions))

        # Iterate over the unusual torsions
        for torsion in unusual_torsions:
            # Limit the number of hits printed for a faster demo
            max_hits = min(len(torsion.hits), 100)
        
            # Write the CSD entries that make up the distribution to a
            # file in the output directory.
            mol_fname = '%s_%s_hits.%s' %\
                (mol.identifier, torsion.fragment_label, output_file_format)
            mol_fpath = os.path.join(output_directory, mol_fname)
            with MoleculeWriter(mol_fpath) as mol_writer:
                print('Writing hit structures for %s to %s.' %
                      (torsion.fragment_label, mol_fpath))
                for hit in torsion.hits[:max_hits]:
                    mol_writer.write(hit.molecule)

            # Write out the atom ids of the atoms matched in the hits to a csv
            # file in the output directory.
            csv_fname = '%s_%s_hits.csv' % (mol.identifier, torsion.fragment_label)
            csv_fpath = os.path.join(output_directory, csv_fname)
            print('Writing hit details to %s...' % csv_fpath)
            with open(csv_fpath, 'w') as csv_file:
                csv_file.write(CSV_HEADER)
                for hit in torsion.hits[:max_hits]:
                    atomId1, atomId2, atomId3, atomId4 = hit.atom_indices
                    line = '"%s",%i,%i,%i,%i,%.2f\n' %\
                        (hit.refcode, atomId1, atomId2, atomId3, atomId4, hit.torsion_angle)
                    csv_file.write(line)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('input_file', help='A file of molecules to analyse.')
    parser.add_argument('output_directory', help='The directory to write output files to.')
    parser.add_argument('-o', '--output_file_format', choices=['mol2', 'sdf'], default='sdf',
                        help='What file format to write the output structures as.')
    args = parser.parse_args()

    # Do some sanity checking of the input arguments.
    if not os.path.isfile(args.input_file):
        parser.error('%s does not exist.' % args.input_file)
    if not os.path.isdir(args.output_directory):
        os.makedirs(args.output_directory)

    # Run the main script.
    main(args.input_file, args.output_directory, args.output_file_format)

Filtering a set of conformers

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

"""
Filter a set of conformers of a single structure using GeometryAnalyser.

The filtering is based on the idea of minimising the number of unusual
torsions.

The default definition of "unusual" is any torsion angle which has a GeometryAnalyser
local density value of less than 10%. That is less than 10% of the CSD
observed torsions lie within +/- 10 degrees of the input torsion angle. This
cut-off can be adjusted using the "LOCAL_DENSITY" argument.

The default minimum number of CSD hits for a torsion to be considered at all
is 15. This cut-off can be adjusted using the "MIN_OBS" argument.

For more help use the -h argument.

"""

import argparse
import ccdc.conformer
import ccdc.io
import os
import sys


def create_analyser():
    """Create a GeometryAnalyser engine to analyse the conformers."""
    engine = ccdc.conformer.GeometryAnalyser()

    # We are only interested doing a analysis of the torsions.
    engine.settings.bond.analyse = False
    engine.settings.angle.analyse = False
    engine.settings.ring.analyse = False

    # Also we do not want to use generalisation.
    engine.settings.generalisation = False
    # Only the organic subset, i.e. exclude organometallic
    engine.settings.organometallic_filter = 'Organic'
    return engine


def main(infile, outfile, local_density_cutoff, min_obs):
    """The main program for filtering a set of conformers.

    :param input: Input molecule file
    :param outfile: Output molecule file
    :param local_density_cutoff: Local density cutoff for defining an unusual torsion
    :param min_obs: Minimum number of observations for a torsion to be considered
    """
    engine = create_analyser()

    # Set the current best (minimum number of unusual torsions) to a large
    # number.
    best_so_far = sys.maxsize

    # Open a file to write molecules to.
    with ccdc.io.MoleculeReader(infile) as mol_reader:
        for mol in mol_reader:

            # Do the analysis.
            checked_mol = engine.analyse_molecule(mol)

            # Find the number of unusual torsions according to the input
            # criteria.
            unusual_torsions = [t for t in checked_mol.analysed_torsions
                                if (t.local_density < local_density_cutoff and t.nhits > min_obs)]
            num_unusual_torsions = len(unusual_torsions)

            # Should the conformer be kept or discarded.
            if num_unusual_torsions > best_so_far or num_unusual_torsions == 0:
                continue
            elif num_unusual_torsions == best_so_far:
                with ccdc.io.MoleculeWriter(outfile) as out:
                    out.write(checked_mol)
            elif num_unusual_torsions < best_so_far:
                # We have found something better, which means we have to
                # discard all the conformers written out already. We therefore
                # close the output writer and open it again.
                if os.path.exists(outfile):
                    os.remove(outfile)
                with ccdc.io.MoleculeWriter(outfile) as out:
                    out.write(checked_mol)

                # Make sure that we update the variable that we are using for
                # the comparison.
                best_so_far = num_unusual_torsions


if __name__ == '__main__':
    # Set up the command line parser
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('infile', type=str, help='Input molecule file')
    parser.add_argument('outfile', type=str, help='Output molecule file')
    parser.add_argument('-l', '--local_density', type=float, default=10.0,
                        help='Local density cutoff for defining an unusual torsion')
    parser.add_argument('-m', '--min_obs', type=int, default=15,
                        help='Minimum number of observations for a torsion to be considered')

    # Parse the command line arguments
    args = parser.parse_args()

    # Run the conformer filtering
    main(args.infile, args.outfile, args.local_density, args.min_obs)