Entry examples

Create indexes of useful information for subsets of CSD entries

Note that this script makes use of functionality from the cookbook utility module.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

'''
Provide information on a set of structures in the CSD.

This script takes as input a GCD file (a text file with CSD refcodes) and
writes out the identifier, author(s), literature reference, formula, compound
name and compound synonym(s). The output can be formatted as csv or html.

If a missing entry is found, it is logged to a stderr file.

'''

import sys
import os
import csv
import html
import argparse
import codecs

from ccdc.io import EntryReader


# Path of the log file that receives warnings about identifiers that could not be read.
STDERR_FILE = __file__ + '.stderr.txt'


class Writer(object):
    '''Write a table of details for the entries listed in an identifier file.

    All the work is done by the constructor: it writes a header, one row per
    entry and a footer to the output stream, in either csv or html format.
    Identifiers whose entry cannot be read are listed in STDERR_FILE.
    '''

    def __init__(self, infile, out, format='csv'):
        '''Read the entries named in infile and write the table to out.

        :param infile: path of the identifier file, handed to EntryReader.
        :param out: writable text stream that receives the table.
        :param format: 'csv' or 'html'; selects the ``<format>_header``,
            ``<format>_line`` and ``<format>_footer`` methods.

        Exits the program with status 1 if the reader raises RuntimeError
        while opening infile.
        '''
        try:
            self.rdr = EntryReader(infile, format='identifiers')
        except RuntimeError:
            print('Failed to read input file %s!' % infile)
            sys.exit(1)

        self.out = out
        getattr(self, format + '_header')()
        # Iterate with index so we can catch missing entry
        missing_entries = []
        identifiers = None
        for i in range(len(self.rdr)):
            try:
                getattr(self, format + '_line')(self.rdr[i])
            except RuntimeError:
                # Load the identifier list once, on the first failure, rather
                # than reopening (and leaking) the file for every missing entry.
                if identifiers is None:
                    with open(infile) as fh:
                        identifiers = fh.read().splitlines()
                missing_entries.append(identifiers[i])
        getattr(self, format + '_footer')()

        if missing_entries:
            with open(STDERR_FILE, 'w') as fh:
                for entry in missing_entries:
                    fh.write(f'WARNING: Entry {entry} not found\n')

    def _fields(self, e):
        '''Return the six column values for entry e, each converted to str.'''
        cit = e.publication
        reference = ' '.join(
            str(x) for x in (cit.journal.name, cit.year, cit.volume, cit.first_page)
        )
        return [
            str(value) for value in (
                e.identifier,
                cit.authors,
                reference,
                e.formula,
                e.chemical_name,
                ' '.join(e.synonyms),
            )
        ]

    def csv_header(self):
        '''Write the (unquoted) csv column titles.'''
        data = ','.join([
            'Identifier',
            'Author',
            'Literature Ref',
            'Formula',
            'Compound Name',
            'Synonym'
        ])
        self.out.write(str(data))
        self.out.write('\n')

    def csv_footer(self):
        '''The csv format has no footer.'''
        pass

    def csv_line(self, e):
        '''Write one csv row for entry e, with every field quoted.'''
        # csv.writer doubles any quote character inside a field, which
        # hand-built '"%s"' quoting did not do.
        row_writer = csv.writer(self.out, quoting=csv.QUOTE_ALL, lineterminator='\n')
        row_writer.writerow(self._fields(e))

    def html_header(self):
        '''Open the html table and write the column titles.'''
        self.out.write('''
<TABLE border=1>
<TR>
    <TH>Identifier</TH>
    <TH>Author</TH>
    <TH>Literature Ref</TH>
    <TH>Formula</TH>
    <TH>Compound Name</TH>
    <TH>Synonym</TH>
</TR>
''')

    def html_footer(self):
        '''Close the html table.'''
        self.out.write('</TABLE>\n')

    def html_line(self, e):
        '''Write one html table row for entry e, escaping each cell.'''
        # _fields() has already applied str(), so html.escape cannot fail on
        # a non-string value such as None.
        cells = ''.join('<TD>%s</TD>' % html.escape(value) for value in self._fields(e))
        self.out.write('<TR>%s</TR>\n' % cells)


class Arguments(argparse.ArgumentParser):
    '''Command-line parser for the script; parsed values end up in ``self.args``.'''

    def __init__(self):
        '''Declare the options and immediately parse the command line.'''
        argparse.ArgumentParser.__init__(self, description=__doc__)
        # (flags, keyword arguments) for each option, in declaration order.
        option_table = (
            (('input_file',),
             {'help': 'Location of a GCD file of required refcodes'}),
            (('-o', '--output'),
             {'default': 'stdout', 'help': 'output file [stdout]'}),
            (('-f', '--format'),
             {'default': 'csv', 'choices': ['csv', 'html'], 'help': 'output format [csv]'}),
        )
        for flags, keywords in option_table:
            self.add_argument(*flags, **keywords)
        self.args = self.parse_args()


class OsLineEndDialect(csv.excel):
    """Variant of the ``csv.excel`` dialect that terminates each row with
    ``os.linesep``, the line separator of the platform the script runs on.
    """
    lineterminator = os.linesep


if __name__ == '__main__':
    args = Arguments()
    # Output handling does not depend on the format: write to stdout, or to a
    # UTF-8 file that is always closed afterwards. sys.stdout is never closed.
    if args.args.output == 'stdout':
        Writer(args.args.input_file, sys.stdout, args.args.format)
    else:
        with codecs.open(args.args.output, 'w', encoding='utf-8') as out:
            Writer(args.args.input_file, out, args.args.format)

Filter the CSD using a ccdc.search.Search.Settings instance

This script will search for given numbers of acceptor atoms and donor atoms or donatable protons. There are arguments to control which CSD entries are acceptable.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

'''
    filter_csd.py - finds molecules in the CSD subject to various criteria

    Only provide the arguments you feel strongly about - the others will have
    defaults which mean they won't have an effect
    Options are -d (or --donors or any prefix), -p or --protons, -a or --acceptors.
    Also -o or --output and -m or --maximum
    Options may be in the form 2,7 to select anything within the range.
    The default for maximum hits is to search the whole CSD.

'''
####################################################################################################

import sys
import argparse

from ccdc.io import EntryReader, MoleculeWriter
from ccdc.search import Search


class ParseRange(argparse.Action):
    """ argparse Action to parse arguments in 'x,y' format into a list.

    A single number 'x' is stored as [x, x]; a pair 'x,y' is stored as [x, y].
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        # nargs is accepted but not forwarded to the base class.
        super(ParseRange, self).__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        """Validate values and store the [min, max] list on the namespace.

        :raises ValueError: if a part is not an integer, if there are more
            than two parts, if min exceeds max or if a value is negative.
        """
        try:
            result = [int(x.strip()) for x in values.split(',')]
        except ValueError:
            raise ValueError('Invalid number in parameter %s: %s' % (option_string, values))

        if len(result) == 1:
            # A single number selects exactly that count.
            result = [result[0], result[0]]
        elif len(result) != 2:
            # Anything longer cannot be unpacked as (min, max) by the consumer.
            raise ValueError('Expected a single value or a min,max pair for %s: %s' %
                             (option_string, values))

        if result[0] > result[1]:
            raise ValueError('Cannot filter %s by reverse range %s' %
                             (option_string, values))

        for value in result:
            if value < 0:
                raise ValueError('%s Range %s cannot be negative.' % (option_string, values))

        setattr(namespace, self.dest, result)


class Runner(argparse.ArgumentParser):
    '''Fishes out arguments, runs the search.

    The constructor declares and parses the command-line options and builds
    the Search.Settings instance used to pre-filter entries; run() iterates
    over the CSD and writes every matching molecule to the output file.
    '''

    def __init__(self):
        '''Declare the options, parse the command line and build the settings.'''
        # Name the class explicitly: super(self.__class__, self) would recurse
        # without end if this class were ever subclassed.
        super(Runner, self).__init__(description=__doc__)
        self.add_argument(
            '-d', '--donors', default=[0, 1000], action=ParseRange,
            help='number of donor atoms required (may be a range separated by a comma)'
        )
        self.add_argument(
            '-a', '--acceptors', default=[0, 1000], action=ParseRange,
            help='Number of acceptor atoms required (may be a range separated by a comma)'
        )
        self.add_argument(
            '-p', '--protons', default=[0, 1000], action=ParseRange,
            help='Number of donatable protons required (may be a range separated by a comma)'
        )
        self.add_argument(
            '-o', '--output', default='results.gcd',
            help='output file [results.gcd]'
        )
        self.add_argument(
            '-m', '--maximum', default=0, type=int,
            help='Maximum number of structures to find [all]'
        )

        # The flags below are store_false switches: each defaults to True
        # (the restriction is applied) and giving the flag lifts it.
        self.add_argument(
            '-R', '--r_factor', '--r-factor', default=5.0, type=float,
            help='Maximum acceptable R-factor [5.0]'
        )
        self.add_argument(
            '-D', '--disorder', default=True, action='store_false',
            help='Whether disordered structures are acceptable [No]'
        )
        self.add_argument(
            '-E', '--errors', default=True, action='store_false',
            help='Whether structures with errors are acceptable [No]'
        )
        self.add_argument(
            '-M', '--organometallic', default=True, action='store_false',
            help='Whether organometallic structures are acceptable [No]'
        )
        self.add_argument(
            '-P', '--polymeric', default=True, action='store_false',
            help='Whether polymeric structures are acceptable [No]'
        )
        self.add_argument(
            '-T', '--two_d', '--two-d', default=True, action='store_false',
            help='Whether 2D structures are acceptable [No]'
        )

        self.args = self.parse_args()
        # NOTE(review): all three range options default to the non-empty list
        # [0, 1000], so this guard does not trigger with the defaults.
        if not self.args.protons and not self.args.donors and not self.args.acceptors:
            print('Error: Please specify protons, donors and/or acceptors options.')
            sys.exit()

        self.settings = Search.Settings()
        self.settings.only_organic = self.args.organometallic
        self.settings.no_disorder = self.args.disorder
        self.settings.no_errors = self.args.errors
        # The -P option was parsed but previously never applied.
        self.settings.not_polymeric = self.args.polymeric
        self.settings.has_3d_coordinates = self.args.two_d
        self.settings.max_r_factor = self.args.r_factor
        self.settings.max_hit_structures = self.args.maximum  # Set this to 0 if you want everything

    def run(self):
        '''Run the search.

        Entries rejected by the settings, or whose molecule cannot be read,
        are skipped. A molecule is written when its donor, donatable-proton
        and acceptor counts all lie within the requested ranges.
        '''
        csd = EntryReader('csd')
        min_d, max_d = self.args.donors
        min_p, max_p = self.args.protons
        min_a, max_a = self.args.acceptors
        ct = 0

        with MoleculeWriter(self.args.output) as writer:
            for i, e in enumerate(csd):
                # Progress report every 10000 entries examined.
                if i and i % 10000 == 0:
                    print('%d hits from %6d...' % (ct, i))
                if not self.settings.test(e):
                    continue
                try:
                    mol = e.molecule
                except RuntimeError:
                    continue
                donors = [a for a in mol.atoms if a.is_donor]
                acceptors = [a for a in mol.atoms if a.is_acceptor]
                # Donatable protons: hydrogen atoms bonded to a donor atom.
                protons = [a for d in donors for a in d.neighbours if a.atomic_number == 1]
                if ((min_d <= len(donors) <= max_d) and
                        (min_p <= len(protons) <= max_p) and
                        (min_a <= len(acceptors) <= max_a)):
                    writer.write(mol)
                    print('Matched %s' % e.identifier)
                    ct += 1
                    # A maximum of 0 means no limit on the number of hits.
                    if self.settings.max_hit_structures and ct >= self.settings.max_hit_structures:
                        break


if __name__ == '__main__':
    # Parse the command line, then carry out the search.
    Runner().run()

Use Python's string formatting method to generate an HTML report

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#

'''
    simple_report.py - format basic information about a CSD structure in HTML

    This example shows how the python string
    <a href="https://docs.python.org/3/library/string.html#string-formatting">format method</a>
    may be used to generate an HTML report, with an almost complete
    separation of presentation (the HTML) and logic (the Python source).

    Of course more complicated substitutions may be performed with a
    fully-fledged templating system, such as <a href="www.makotemplates.org">mako</a>,
    <a href="jinja2.pocoo.org">jinja2</a>, but the simplicity of this
    is appealing.

    Any "{" and "}" in the user's template file, must be replaced with "{{"
    and "}}" for correct substitution.
'''

import argparse
import codecs
import os

from ccdc.io import EntryReader
from ccdc.diagram import DiagramGenerator


class Runner(argparse.ArgumentParser):
    '''Reads arguments and writes the report.'''

    def __init__(self):
        '''Defines arguments.'''
        # Name the class explicitly: super(self.__class__, self) would recurse
        # without end if this class were ever subclassed.
        super(Runner, self).__init__(description=__doc__)
        self.add_argument(
            'refcodes', nargs='+',
            help='Refcodes for which a report should be generated.'
        )
        self.add_argument(
            '-o', '--output-directory', type=str, default='.',
            help='Directory to which to write the reports [.]'
        )

    def run(self):
        '''Writes a report for each refcode provided.

        The template is read from simple_report_template.html in the
        directory that contains this script.
        '''
        self.args = self.parse_args()
        self.csd = EntryReader('csd')
        self.generator = DiagramGenerator()
        self.generator.settings.return_type = 'SVG'
        template_file_name = os.path.join(
            os.path.dirname(__file__), 'simple_report_template.html'
        )
        # Read the template inside a with-block so the file handle is closed.
        with codecs.open(template_file_name, encoding='utf-8') as template_file:
            self.template = template_file.read()
        for refcode in self.args.refcodes:
            self.run_one(refcode)

    def run_one(self, refcode):
        '''Writes one report.

        The entry is looked up by the upper-cased refcode; the report is
        written to <refcode>.html in the output directory, using the refcode
        exactly as given on the command line.
        '''
        entry = self.csd.entry(refcode.upper())
        mol = entry.molecule
        atoms = mol.atoms
        bonds = mol.bonds
        # Replace the generator's default SVG title with one naming the entry.
        img = self.generator.image(mol).replace('Qt SVG Document',
                                                'Diagram for %s' % refcode.upper())
        doi = entry.publication.doi
        if doi is None:
            doi = '&nbsp;'
        else:
            doi = '<a href="https://doi.org/%s">%s</a>' % (doi, doi)

        report_path = os.path.join(self.args.output_directory, refcode + '.html')
        with codecs.open(report_path, 'w', encoding='utf-8') as report_file:
            report = self.template.format(
                entry=entry,
                molecule=mol,
                image=img,
                doi=doi,
                synonyms='; '.join(entry.synonyms),
                counts=dict(
                    natoms=len(atoms),
                    ndonors=len([a for a in atoms if a.is_donor]),
                    nacceptors=len([a for a in atoms if a.is_acceptor]),
                    nrot_bonds=len([b for b in bonds if b.is_rotatable]),
                ),
            )
            report_file.write(report)


if __name__ == '__main__':
    # Build the parser, then generate a report for each requested refcode.
    report_runner = Runner()
    report_runner.run()

Here is the template used in the script: simple_report_template.html.