Pharmacophore Examples

Creating a pharmacophore model

This example creates a pharmacophore model from an overlay of ligand files. It detects features common to many of the structures in the overlay set, and generates consensus features from them. Which feature definitions to use, and a threshold for the consensus may be specified on the command line.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-09-25: created by the Cambridge Crystallographic Data Centre
#
'''
    create_overlay_pharmacophore.py -   create a pharmacophore from an overlay of molecules.

    A molecule file of the overlaid molecules is required. Supported file format: mol2, pdb, sdf.

    Example command:
        python create_overlay_pharmacophore.py -o output_directory overlay_file.mol2
'''
##############################################################################################################

import os
import argparse
import collections

from ccdc import io, utilities
from ccdc.descriptors import GeometricDescriptors

from ccdc.pharmacophore import Pharmacophore

##############################################################################################################

class Runner(argparse.ArgumentParser):
    def __init__(self):
        super(self.__class__, self).__init__(description=__doc__,
                                             formatter_class=argparse.RawTextHelpFormatter)
        self.add_argument(
            'overlay_file',
            help='Molecule file of overlaid molecules'
        )
        self.add_argument(
            '-o', '--output-directory', default='.',
            help='Where output will be stored'
        )
        self.add_argument(
            '-f', '--feature_definitions', '--features', nargs='*',
            help='Feature definitions to be used.  Default is the standard set of feature definitions.'
        )
        self.add_argument(
            '-t', '--threshold', type=float, default=0.0,
            help='Threshold at which features will be defined'
        )
        self.args = self.parse_args()

    def run(self):
        if not os.path.exists(self.args.output_directory):
            os.makedirs(self.args.output_directory)

        Pharmacophore.read_feature_definitions()
        self.crystals = list(io.CrystalReader(self.args.overlay_file))
        if self.args.threshold <= 0.0:
            self.args.threshold = (len(self.crystals))/2.0
        if self.args.feature_definitions:
            self.feature_definitions = [v for k, v in Pharmacophore.feature_definitions.items() if k in self.args.feature_definitions]
        else:
            self.feature_definitions = [
                fd for fd in Pharmacophore.feature_definitions.values()
                if fd.identifier != 'exit_vector' and
                fd.identifier != 'heavy_atom' and
                fd.identifier != 'hydrophobe'
            ]

        complete_set_of_features = []
        for fd in self.feature_definitions:
            detected = [fd.detect_features(c) for c in self.crystals]
            all_feats = [f for l in detected for f in l]
            if not all_feats:
                    continue
            minx = min(f.spheres[0].centre.x() for f in all_feats)
            miny = min(f.spheres[0].centre.y() for f in all_feats)
            minz = min(f.spheres[0].centre.z() for f in all_feats)
            maxx = max(f.spheres[0].centre.x() for f in all_feats)
            maxy = max(f.spheres[0].centre.y() for f in all_feats)
            maxz = max(f.spheres[0].centre.z() for f in all_feats)
            g = utilities.Grid((minx-1., miny-1., minz-1.), (maxx+1, maxy+1, maxz+1), 0.2)

            spheres = []
            for f in all_feats:
                if f.spheres[0] in spheres:
                    g.set_sphere(f.spheres[0].centre, f.spheres[0].radius, 0)
                else:
                    spheres.append(f.spheres[0])
                    g.set_sphere(f.spheres[0].centre, f.spheres[0].radius, 1)

            islands = g.islands(self.args.threshold)
            print('Feature: %s, max value %.2f, n_features %d' %
                (fd.identifier, g.extrema[1], len(islands))
            )
            for island in islands:
                # how do I make a feature from an island?  Location of highest value
                indices = island.indices_at_value(island.extrema[1])
                centre = indices[0]
                org = island.bounding_box[0]
                centre = tuple(org[i] + island.spacing*centre[i] for i in range(3))
                radius = 1.0
                # Any other spheres?
                if len(all_feats[0].spheres) > 1:
                    # Pick all features which contain centre
                    feat_dists ={}
                    for f in all_feats:
                        dist, feat = (GeometricDescriptors.point_distance(f.spheres[0].centre, centre), f)
                        if dist in feat_dists:
                            feat_dists[dist].append(feat)
                        else:
                            feat_dists[dist] = [feat]

                        feat_dists = collections.OrderedDict(sorted(feat_dists.items()))
                        shortest_distance = list(feat_dists.keys())[0]

                    if len(feat_dists[shortest_distance]) > 1:
                        new_feat = [
                            Pharmacophore.Feature(fd, GeometricDescriptors.Sphere(centre, radius),
                                                  feat_dists[shortest_distance][i].spheres[1])
                            for i in range(len(feat_dists[shortest_distance]))]
                    else:
                        new_feat = [
                            Pharmacophore.Feature(fd, GeometricDescriptors.Sphere(centre, radius),
                                                  feat_dists[shortest_distance][0].spheres[1])]
                else:
                    new_feat = [
                        Pharmacophore.Feature(fd, GeometricDescriptors.Sphere(centre, radius))]

                complete_set_of_features.extend(new_feat)
            model = Pharmacophore.Query(complete_set_of_features)

            model.write(os.path.join(self.args.output_directory, 'model.cm'))

##############################################################################################################

if __name__ == '__main__':
    r = Runner()
    r.run()

Creating a feature database

This example creates a feature database either from a protein binding site information or from the CSD or from a mixture of them.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-04-05: created by the Cambridge Crystallographic Data Centre
#
"""
create_feature_database.py - Example script to create a Pharmacophore Feature Database.
"""

import argparse
import os
import csv
import tempfile
import shutil

from ccdc.io import EntryReader
from ccdc.pharmacophore import Pharmacophore
from ccdc.utilities import Colour

####################################################################################################################

# If you are creating a Structure database from the CSD set DatabaseInfo.file_name to a list of CSD
# database file names.  If you want to include any available CSD updates include them in the list.
# Alternatively, set DatabaseInfo.file_name to the string 'csd' rather than a list and all CSD updates
# will be automatically included.
csd_path = EntryReader('CSD').file_name
if not isinstance(csd_path, (list, tuple)):
    csd_path = [csd_path]
if len(csd_path) > 1:
    csd_path = [csd_path[0]] # Do not use any CSD updates if available

def parse_args():
    '''Define and parse the arguments to the script.'''
    parser = argparse.ArgumentParser(
        description='Create a new CSD-CrossMiner Pharmacophore Feature Database.'
    )

    parser.add_argument(
        '-i',
        '--input_structures',
        help='Input protein structural files (a directory of mol2 files [])',
        default=None
    )
    parser.add_argument(
        '--input_dna_structures',
        help='Input nucleic acid structural files (a directory of mol2 files [])',
        default=None
    )

    parser.add_argument(
        '-a',
        '--annotations',
        help='Path to the annotations CSV file for the set of input structures',
        default=None
    )
    parser.add_argument(
        '--dna_annotations',
        help='Path to the annotations CSV file for the set of nucleic acid input structures',
        default=None
    )
    parser.add_argument(
        '-o',
        '--output_feature_database',
        help='Output filename for the generated pharmacophore feature database [out.feat]',
        default='out.feat'
    )
    parser.add_argument(
        '-f',
        '--feature_definitions',
        help='Directory containing the set of feature definition files',
        default=None
    )
    parser.add_argument(
        '-m',
        '--protein_database',
        help='Filename of intermediate protein structure database [out.csdsqlx]',
        default='out.csdsqlx'
    )
    parser.add_argument(
        '--dna_database',
        help='Filename of intermediate nucleic acid structure database [out_dna.csdsqlx]',
        default='out_dna.csdsqlx'
    )
    parser.add_argument(
        '-c',
        '--use_csd',
        help='Whether to include all entries from the current CSD [False]',
        choices=['True', 'False'],
        default='False'
    )
    parser.add_argument(
        '-C',
        '--maximum_csd_structures',
        help='Number of entries to include from the current CSD',
        type=int,
        default=0
    )
    parser.add_argument(
        '-A',
        '--csd_annotations',
        help='Path to the CSD annotations CSV file for CSD structures',
        default=None
    )
    parser.add_argument(
        '-t',
        '--number_of_threads',
        help='Number of threads to use for feature database creation',
        type=int,
        default=None
    )

    return parser.parse_args()


def load_annotations(feature_database_path, annotation_csv, do_write=True):
    """This uses a CSV file to annotate the feature database.

    The csv file should list the annotation names in the first row, and the identifiers
    should be given in the first column. *e.g.*,

        id, first, second
        AABHTZ, one, two
        AACANI10, three, four
        AACANI11, five, once I caught a fish alive,
        AACFAZ, seven, eight
    """
    feature_database = Pharmacophore.FeatureDatabase.from_file(feature_database_path)
    with open(annotation_csv, 'r') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            identifier = row['identifier'].strip()
            del row['identifier']
            d = {
                k: v.strip() for k, v in row.items()
            }
            feature_database.annotate(identifier, **d)

    if do_write:
        tempdir = tempfile.mkdtemp()
        print("Writing feature database size: %d to file: %s" %
              (len(feature_database), feature_database_path))
        feature_database.write(os.path.join(tempdir, os.path.basename(feature_database_path)))
        shutil.copyfile(os.path.join(tempdir, os.path.basename(feature_database_path)),
                        feature_database_path)


def create_databases(
    input_file_path, feature_definitions_path, protein_database_path, feature_db_path,
    include_csd=False, maximum_csd_structures=None, number_of_threads=None,
    input_dna_file_path=None, dna_database_path=None
):
    settings = Pharmacophore.FeatureDatabase.Creator.Settings(
        feature_definition_directory=feature_definitions_path, n_threads=number_of_threads
    )
    Pharmacophore.read_feature_definitions(feature_definitions_path)
    feature_defs = list(Pharmacophore.feature_definitions.values())
    creator = Pharmacophore.FeatureDatabase.Creator(settings)

    databases = []

    # If you already have a structure database, you can set database_path to point to it
    # and omit the input_file_path argument

    def add_mol2_input_database(mol2_path, database_path, colour, databases):
        if os.path.exists(database_path):
            print('Using existing structure database %s' % database_path)
            if mol2_path is not None:
                print('--- WARNING: loose mol2 files in %s will be ignored in favour of this!' %
                      mol2_path)
        elif mol2_path is not None:
            print('Using loose mol2 files in %s' % mol2_path)

        if mol2_path is not None or os.path.exists(database_path):
            db_info = Pharmacophore.FeatureDatabase.DatabaseInfo(
                mol2_path,
                0,
                colour
            )
            structure_database = creator.StructureDatabase(
                db_info,
                use_crystal_symmetry=False,
                structure_database_path=database_path,
            )
            databases.append(structure_database)

    add_mol2_input_database(input_file_path, protein_database_path,
                            Colour(255, 170, 0, 255), databases)
    add_mol2_input_database(input_dna_file_path, dna_database_path,
                            Colour(170, 255, 0, 255), databases)

    #  CSD feature database
    if include_csd:
        db_info = Pharmacophore.FeatureDatabase.DatabaseInfo(
            csd_path, maximum_csd_structures, (255, 255, 0, 255))
        csd_database = creator.StructureDatabase(
            db_info,
            use_crystal_symmetry=True,
            structure_filters=creator.StructureDatabase.default_csd_filters()
        )
        databases.append(csd_database)

    print('Writing feature database to: ' + feature_db_path)
    feature_database = creator.create(databases, feature_defs)
    feature_database.write(feature_db_path)

    return creator


def main():
    args = parse_args()
    if args.input_structures is None and args.protein_database is None \
        and args.input_dna_structures is None and args.dna_database is None \
        and args.use_csd == 'False':
        raise RuntimeError(
            'At least one of input_structures, protein_database, input_dna_structures, dna_database and use_csd must be specified')

    creator = create_databases(
        args.input_structures,
        args.feature_definitions,
        args.protein_database,
        args.output_feature_database,
        include_csd=args.use_csd == 'True',
        maximum_csd_structures=args.maximum_csd_structures,
        number_of_threads=args.number_of_threads,
        input_dna_file_path=args.input_dna_structures,
        dna_database_path=args.dna_database
    )

    if args.annotations:
        load_annotations(args.output_feature_database, args.annotations)

    if args.dna_annotations:
        load_annotations(args.output_feature_database, args.dna_annotations)

    if args.csd_annotations:
        load_annotations(args.output_feature_database, args.csd_annotations)

    print('CSD-CrossMiner Feature Database generation is complete!')


if __name__ == "__main__":
    main()

Creating a feature database from generated conformers

This example takes a number of molecule files, generates conformers for each structure found within them, then creates a feature database from the conformer files. Structures in the database are annotated with the file from which they came and some data pertaining to the conformer generation.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-09-25: created by the Cambridge Crystallographic Data Centre
#
'''
    create_conformer_database.py  -   creates a feature database from conformers of structures.
'''
###############################################################################################

import sys
import os
import argparse
import collections

from ccdc import io, conformer
from ccdc.pharmacophore import Pharmacophore
from ccdc.utilities import Colour

###############################################################################################

class Runner(argparse.ArgumentParser):
    '''Parses arguments and runs the creation.'''
    def __init__(self):
        super(self.__class__, self).__init__(description=__doc__)
        self.add_argument(
            'mol_files', nargs='+',
            help='molecule file(s) from which to generate conformers'
        )
        self.add_argument(
            '-o', '--output-directory',
            help='where to store the feature database and generated mol2 file'
        )
        self.add_argument(
            '-m', '--max-structures', type=int, default=0,
            help='Maximum number of structures for each file for faster testing'
        )
        self.add_argument(
            '-p', '--prefix', default='',
            help='prefix for output files'
        )
        self.args = self.parse_args()

    def run(self):
        '''Creates the database.'''
        colours = [
            Colour(0, 255, 0, 255),
            Colour(255, 0, 0, 255),
            Colour(0, 0, 255, 255),
            Colour(255, 255, 0, 255),
            Colour(255, 0, 255, 255),
            Colour(0, 255, 255, 255),
            Colour(255, 255, 255, 255),
        ]
        suffix_dict = collections.defaultdict(int)
        conf_gen = conformer.ConformerGenerator()
        conf_gen.settings.max_conformers = 25
        if not os.path.isdir(self.args.output_directory):
            os.makedirs(self.args.output_directory)
        feature_db_name = os.path.join(self.args.output_directory, '%sconformers.feat' % self.args.prefix)
        if os.path.exists(feature_db_name):
            os.unlink(feature_db_name)
        structure_dbs = []
        # Generate conformers
        annotations = dict()
        max_input = sys.maxsize if self.args.max_structures == 0 else self.args.max_structures
        for i, mol_file in enumerate(self.args.mol_files):
            print('Reading %s' % mol_file)
            base, _ = os.path.splitext(os.path.basename(mol_file))
            conf_file = os.path.join(self.args.output_directory, self.args.prefix + base + '_conformers.mol2')
            if os.path.exists(conf_file):
                print('Skipping conformer generation as %s already exists' % conf_file)
            else:
                with io.MoleculeReader(mol_file) as reader, io.MoleculeWriter(conf_file) as writer:
                    for j, m in enumerate(reader):
                        if j > max_input:
                            break
                        suffix_dict[m.identifier] += 1
                        suffix = suffix_dict[m.identifier]
                        confs = conf_gen.generate(m)

                        # If conformer generation fails, ConformerGenerator returns None rather
                        # than []. Trying to iterate over this will crash the script, so we skip
                        # further steps for the structure at this point.
                        if confs is None:
                            print('WARNING: Conformer generation failed for structure %s in %s!' %
                                  (m.identifier, mol_file))
                            continue

                        for k, c in enumerate(confs):
                            m = c.molecule
                            m.identifier = '%s_%02d_%03d' % (m.identifier, suffix, k)
                            writer.write(m)
                            annotations[m.identifier] = {
                                'input_file' : mol_file,
                                'rmsd' : c.rmsd(),
                                'probability' : c.probability,
                                'normalised_score': c.normalised_score
                            }
            db_info = Pharmacophore.FeatureDatabase.DatabaseInfo(conf_file, 0, colours[i%len(colours)])
            struct_db = Pharmacophore.FeatureDatabase.Creator.StructureDatabase(
                db_info, use_crystal_symmetry=False, structure_database_path=conf_file.replace('mol2', 'sqlmol2')
            )
            structure_dbs.append(struct_db)

        # Now to create the database
        creator = Pharmacophore.FeatureDatabase.Creator()
        feature_db = creator.create(structure_dbs)
        # Now add some annotations
        for ident, attribs in annotations.items():
            feature_db.entry(ident).annotations.update(attribs)
        # Finally, write the database
        feature_db.write(feature_db_name)


###############################################################################################

if __name__ == '__main__':
    runner = Runner()
    runner.run()

Creating a feature database from docking results

This example creates a feature database from a GOLD docking run. The structures of the database will be the complexes of ligand and protein found by the docking, and so will work nicely with ensemble docking. The entries in the database will be annotated with the tags of the docked files placed by GOLD.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2018-09-25: created by the Cambridge Crystallographic Data Centre
#
'''
    create_docking_database.py  -   creates a feature database from GOLD docking results.
'''
###############################################################################################

import os
import argparse

from ccdc import io, docking, entry
from ccdc.pharmacophore import Pharmacophore
from ccdc.utilities import Colour

###############################################################################################

class Runner(argparse.ArgumentParser):
    '''Parses arguments and runs the creation.'''
    def __init__(self):
        super(self.__class__, self).__init__(description=__doc__)
        self.add_argument(
            'conf_file',
            help='GOLD conf file for the docking'
        )
        self.add_argument(
            '-o', '--output-directory',
            help='where to store the feature database and generated mol2 file'
        )

        self.args = self.parse_args()

    def run(self):
        '''Creates the database.'''
        if not os.path.isdir(self.args.output_directory):
            os.makedirs(self.args.output_directory)
        base_name, _ = os.path.splitext(os.path.basename(self.args.conf_file))
        settings = docking.Docker.Settings.from_file(self.args.conf_file)
        docker = docking.Docker(settings)
        results = docker.results
        # Create the mol2 file
        mol2_file = os.path.join(self.args.output_directory, base_name + '.mol2')
        feature_db_name = mol2_file.replace('mol2', 'feat')
        if os.path.exists(feature_db_name):
            os.unlink(feature_db_name)
        annotations = dict()
        with io.EntryWriter(mol2_file) as writer:
            for ligand in results.ligands:
                comp = results.make_complex(ligand)
                # Remove lone pairs from docking solutions
                comp.remove_unknown_atoms()
                comp.identifier = ligand.identifier
                ent = entry.Entry.from_molecule(comp, **ligand.attributes)
                annotations[ligand.identifier] = ligand.attributes
                writer.write(ent)
        # Now create the database
        db_info = Pharmacophore.FeatureDatabase.DatabaseInfo(mol2_file, 0, Colour(255, 0, 0, 255))
        struct_db = Pharmacophore.FeatureDatabase.Creator.StructureDatabase(
            db_info, use_crystal_symmetry=False, structure_database_path=mol2_file.replace('mol2', 'sqlmol2')
        )
        creator = Pharmacophore.FeatureDatabase.Creator()
        feature_db = creator.create((struct_db,))
        # Now add some annotations
        for ident, attribs in annotations.items():
            feature_db.entry(ident).annotations.update(attribs)
        # Finally, write the database
        feature_db.write(feature_db_name)


###############################################################################################

if __name__ == '__main__':
    runner = Runner()
    runner.run()

Extract annotations defined in a feature database

This writes out all annotations stored in a specified feature database.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-04-07: created by the Cambridge Crystallographic Data Centre
#
"""
extract_annotations_from_feature_database.py - Write out all annotations stored in a Pharmacophore Feature Database.
"""

import argparse
import os
import csv

from ccdc.pharmacophore import Pharmacophore


def parse_args():
    parser = argparse.ArgumentParser(
        description='Write all annotations stored in a Pharmacophore Feature Database to file.')

    parser.add_argument(
        'input_feature_database',
        help='Input Feature Database'
    )
    parser.add_argument(
        'output_annotations_file',
        nargs='?',
        help='Output filename for the generated file of annotations in CSV format [out.csv]',
        default='out.csv'
    )
    parser.add_argument(
        '-I',
        '--output_identifiers_file',
        help='Filename for output identifier list in GCD format [None]',
        type=str,
        default=None
    )

    return parser.parse_args()


def write_identifiers(feature_database, identifiers_out_path):
    print('Writing identifiers to: ' + identifiers_out_path)
    if os.path.exists(identifiers_out_path):
        os.remove(identifiers_out_path)   

    with open(identifiers_out_path, 'w') as writer:
        writer.writelines(['%s\n' % identifier for identifier in feature_database.identifiers])

 
def write_annotations(feature_database, annotations_out_path):
    print('Writing annotations to: ' + annotations_out_path)
    if os.path.exists(annotations_out_path):
        os.remove(annotations_out_path)
        
    keys = list(feature_database.annotation_keys)
    print('The available annotations are:')
    print('\n '.join(keys))

    with open(annotations_out_path, 'w') as writer:
        csv_writer = csv.DictWriter(writer, fieldnames=keys)
        csv_writer.writeheader()
        for identifier in feature_database.identifiers:
            entry = feature_database.entry(identifier)
            annos = entry.annotations
            csv_writer.writerow(annos)


def main():
    args = parse_args()
    
    feature_database = Pharmacophore.FeatureDatabase.from_file(args.input_feature_database)

    if args.output_identifiers_file is not None:
        write_identifiers(feature_database, args.output_identifiers_file)

    write_annotations(feature_database, args.output_annotations_file)

    print('Done!')

if __name__ == "__main__":
    main()

Creating an annotation file for the CSD

This script may be used to generate a CSV file containing annotations for CSD structures to be stored in a specified feature database.

#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at http://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-06-30: created by the Cambridge Crystallographic Data Centre
#
"""
extract_csd_annotations.py - extract annotations for CSD-derived entries.
"""

import argparse
import os

from ccdc.io import EntryReader
from ccdc.search import Search

from ccdc import __version__ as csd_pyapi_version


def parse_arguments():
    parser = argparse.ArgumentParser(
        description='Script to extract annotations for CSD-derived entries',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('-o', '--output',
                        help='Output csv file [./csd_annotations.csv]',
                        default='./csd_annotations.csv')
    parser.add_argument('-m', '--maximum_csd_structures',
                        help='Number of entries to include from the current CSD',
                        type=int, default=None)

    return parser.parse_args()


def write_annotations(annotation_csv_path, maximum_csd_structures=None):
    number_csd_structures = 0
    with open(annotation_csv_path, 'w') as writer:
        writer.write('identifier,CSD Refcode,formula,r factor\n')

        # define search parameters for which entries should be included
        settings = Search.Settings()
        settings.not_polymeric = True
        settings.no_disorder = 'Non-hydrogen'
        settings.has_3d_coordinates = True
        settings.must_have_elements = []
        settings.max_hit_structures = 0
        settings.only_organic = False
        settings.max_r_factor = 10.0
        settings.no_errors = False
        settings.no_powder = False
        settings.no_ions = False
        settings.must_not_have_elements = ['He', 'Be', 'Ne', 'Al', 'Si', 'Ar', 'Sc', 'Ti', 'V',
                                           'Cr', 'Ga', 'Ge', 'As', 'Se', 'Kr', 'Rb', 'Y', 'Zr',
                                           'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In',
                                           'Sn', 'Sb', 'Te', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr',
                                           'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er',
                                           'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir',
                                           'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'Rn', 'Fr',
                                           'Ra', 'Ac', 'Th', 'Pa', 'U']
        settings.no_metals = False
        settings.only_organometallic = False

        for entry in EntryReader('CSD'):

            # filter out entries that don't match the search parameters
            if not settings.test(entry):
                continue

            # filter out entries that don't have a crystal property
            try:
                _ = entry.crystal
            except RuntimeError:
                continue

            # write row to CSV
            refcode = entry.identifier
            formula = entry.crystal.formula
            r_factor = entry.r_factor
            writer.write('{},{},{},{}\n'.format(
                refcode,
                refcode,
                formula if formula else '',
                r_factor if r_factor else ''))

            # exit if the limit is reached
            number_csd_structures = number_csd_structures + 1
            if maximum_csd_structures is not None\
                    and number_csd_structures >= maximum_csd_structures:
                break


def extract_annotations(annotations_out_path, maximum_csd_structures):
    print('Writing annotations to: ' + annotations_out_path)
    if os.path.exists(annotations_out_path):
        os.remove(annotations_out_path)
    write_annotations(annotations_out_path, maximum_csd_structures)


if __name__ == '__main__':
    args = parse_arguments()
    extract_annotations(args.output, args.maximum_csd_structures)