Crystal examples¶
Generating molecular assemblies¶
These examples take a crystal and generate a molecular representation based on various criteria.
An assembly based on distance¶
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
"""
Write out heaviest component and molecular shell for a set of CSD structures.
This script takes an input a GCD file and an output directory. For each CSD
entry the script then identifies the heaviest component and writes it to the
output directory. The molecules in the molecular shell around the heaviest
component are then identified and written out to a separate file in the output
directory.
For more help use the -h argument.
"""
import argparse
import os
from ccdc import io
##############################################################################
def main(gcd_file, out_dir):
"""
Run the main program.
"""
crystal_reader = io.CrystalReader(gcd_file, format='identifiers')
for crystal in crystal_reader:
# Get the atoms from the heaviest component in the crystal. These will
# be used as the selection around which we create the molecular shell.
heaviest_component = crystal.molecule.heaviest_component
selection = heaviest_component.atoms
# Write out the central molecule
out_file_name = '%s.mol2' % crystal.molecule.identifier
out_file_path = os.path.join(out_dir, out_file_name)
with io.MoleculeWriter(out_file_path) as mol_writer:
mol_writer.write(heaviest_component)
# Generate the molecular shell around the atoms in the selection.
mol_shell = crystal.molecular_shell(distance_type="VdW",
distance_range=(0.0, 0.5),
atom_selection=selection)
# Write out the molecular shell
out_file_name = '%s_shell.mol2' % crystal.molecule.identifier
out_file_path = os.path.join(out_dir, out_file_name)
with io.MoleculeWriter(out_file_path) as mol_writer:
mol_writer.write(mol_shell)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('gcd_file', type=str, help="Input GCD file")
parser.add_argument('out_dir', type=str, help="Output directory")
args = parser.parse_args()
if not os.path.isdir(args.out_dir):
os.mkdir(args.out_dir)
if not os.path.isfile(args.gcd_file):
parser.error('%s is not a file.' % args.gcd_file)
main(args.gcd_file, args.out_dir)
An assembly based on graph sets¶
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2016-03-23: created by the Cambridge Crystallographic Data Centre
#
'''
graph_set_closure.py - expand a crystal such that all HBonds of the crystal's graph sets
are included.
The graph sets of the crystal will be calculated, then the crystal expanded to include
all of the hydrogen bonds making up the graph sets.
'''
############################################################################
import os
import argparse
from ccdc.descriptors import CrystalDescriptors
from ccdc.io import CrystalReader, MoleculeWriter
from ccdc.molecule import Molecule
############################################################################
class Runner(argparse.ArgumentParser):
'''Does the whole job.'''
def __init__(self):
'''Defines arguments and parses them.'''
super(self.__class__, self).__init__(
description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
)
self.add_argument(
'refcode_or_files', nargs='+',
help='Refcode or crystal file name.'
)
self.add_argument(
'-o', '--output-file', '--output_file',
help='Output molecule file. If not given the molecule will be written to stdout in mol2 format.'
)
self.args = self.parse_args()
@property
def csd(self):
'''Lazily create a CSD reader if necessary.'''
if not hasattr(self, '_csd'):
self._csd = CrystalReader('csd')
return self._csd
@property
def searcher(self):
'''Lazily create a GraphSetSearch instance.'''
if not hasattr(self, '_searcher'):
self._searcher = CrystalDescriptors.GraphSetSearch()
return self._searcher
def expand(self, crystal):
'''Expand the crystal by the symmetry operators of the GraphSet HBonds.'''
gsets = self.searcher.search(crystal)
symms = {
s
for gs in gsets
for hb in gs.hbonds
for s in hb.symmetry_operators
if s
}
mols = [
crystal.symmetric_molecule(s)
for s in sorted(symms)
]
mol = Molecule(crystal.identifier)
for m in mols:
mol.add_molecule(m)
return mol
def run(self):
'''Runs over all refcodes and files.'''
if self.args.output_file:
self.writer = MoleculeWriter(self.args.output_file)
else:
self.writer = MoleculeWriter('stdout', format='mol2')
for r_or_f in self.args.refcode_or_files:
if os.path.exists(r_or_f):
reader = CrystalReader(r_or_f)
for c in reader:
self.writer.write(self.expand(c))
else:
c = self.csd.crystal(r_or_f)
self.writer.write(self.expand(c))
self.writer.close()
############################################################################
if __name__ == '__main__':
Runner().run()
Other molecular expansions¶
This example constructs various molecular expansions of a crystal based on packing, slicing, hydrogen bond and contact networks, and polymer expansion:
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-07-17: created by the Cambridge Crystallographic Data Centre
#
"""
crystal_expansion.py - show the various molecular expansions of a crystal.
This script creates various multi-molecular structures from a crystal.
The structures will be written to a multi-mol2 file.
"""
import argparse
from ccdc import io
from ccdc.descriptors import MolecularDescriptors
import os
class Runner(argparse.ArgumentParser):
"""Defines arguments, parses the command line and runs the job."""
def __init__(self):
"""Defines the arguments."""
super(self.__class__, self).__init__(description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
self.add_argument(
'refcodes', nargs='+', help='CSD Refcodes to expand.'
)
self.add_argument(
'-o', '--output-directory', '--output_directory', default='.',
help='Optional output directory'
)
def run(self):
"""Runs the task."""
args = self.parse_args()
if not os.path.exists(args.output_directory):
os.makedirs(args.output_directory)
csd = io.EntryReader('csd')
for refcode in args.refcodes:
crystal = csd.crystal(refcode)
molecular_shell = crystal.molecular_shell(distance_range=(0.0, 7.0))
molecular_shell.identifier = '%s_molecular_shell' % refcode
packing_shell = crystal.packing_shell(30)
packing_shell.identifier = '%s_packing_shell' % refcode
hbond_network = crystal.hbond_network(repetitions=3)
hbond_network.identifier = '%s_hbond_network' % refcode
contact_network = crystal.contact_network(repetitions=2)
contact_network.identifier = '%s_contact_network' % refcode
packing = crystal.packing(((-1, -1, -1), (0.5, 0.5, 0.5)))
packing.identifier = '%s_packing' % refcode
slicing = crystal.slicing(
MolecularDescriptors.atom_plane(*[a for a in crystal.molecule.heavy_atoms])
)
slicing.identifier = '%s_slicing' % refcode
polymer_expansion = crystal.polymer_expansion(repetitions=12)
polymer_expansion.identifier = '%s_polymer' % refcode
output_file = os.path.join(args.output_directory, '%s.mol2' % refcode)
with io.MoleculeWriter(output_file) as writer:
writer.write(molecular_shell)
writer.write(packing_shell)
writer.write(hbond_network)
writer.write(contact_network)
writer.write(packing)
writer.write(slicing)
writer.write(polymer_expansion)
if __name__ == '__main__':
Runner().run()
Analysis of HBond dimensionality¶
This example uses the hbond_network expansions and a principle axes aligned molecule to determine in how many dimensions a crystal’s HBonds are expanding.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-09-04: created by the Cambridge Crystallographic Data Centre
#
'''
hbond_dimensionality.py - establish the dimensionality of a crystal's hydrogen bonded network.
Expands the hbond network of a crystal, and sees which dimenions are growing. For example:
ACSALA (aspirin) shows no dimension grows, i.e the network forms a ring
YIGPIO02 (Ritonavir) grows in one dimension
HXACAN (Paracetamol) grows in two dimensiont
DMANTL08 (Mannitol) grows in all three dimensions.
'''
###########################################################################
import os
import argparse
from ccdc import io
from ccdc.descriptors import MolecularDescriptors
csd = io.EntryReader('csd')
###########################################################################
class Runner(argparse.ArgumentParser):
'''Runs the calculation.'''
def __init__(self):
super(self.__class__, self).__init__(description=__doc__)
self.add_argument('crystals', nargs='+',
help='Refcodes or crystal files to analyese'
)
self.args = self.parse_args()
def run(self):
print('Identifier\tNetwork dimensionality')
for a in self.args.crystals:
if not os.path.exists(a):
c = csd.crystal(a)
self.analyse(c)
else:
for c in io.CrystalReader(a):
self.analyse(c)
def analyse(self, crystal):
'''Calculates the dimensionality of the crystal.'''
m = crystal.molecule
siteless = not m.all_atoms_have_sites
if siteless:
m.add_hydrogens()
crystal.molecule = m
shells = [
crystal.hbond_network(repetitions=2), crystal.hbond_network(repetitions=5)
]
boxes = [MolecularDescriptors.PrincipleAxesAlignedBox(s) for s in shells]
vecs = [
[b.x_vector.length, b.y_vector.length, b.z_vector.length]
for b in boxes
]
ratios = [vecs[-1][i]/vecs[0][i] for i in range(3)]
thresh = 1.1
ndims = sum(r > thresh for r in ratios)
#print(ratios)
#print(ndims)
print('%8s\t%d' % (crystal.identifier, ndims))
###########################################################################
if __name__ == '__main__':
Runner().run()
Descriptor HTML report¶
Note that this script makes use of functionality from the cookbook utility module.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
"""
Generate a HTML table of molecular and crystallographic properties for
structures in the CSD.
This includes, amongst more generic ones, point group analysis descriptors.
For more help use the -h argument.
"""
import sys
import argparse
import os.path
import collections
import codecs
this_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(1, this_dir)
import utilities
sys.path.remove(this_dir)
from ccdc import io
from ccdc.descriptors import MolecularDescriptors
##############################################################################
# The headers for the HTML table
HEADERS = (
'REFCODE',
'Identifier',
'Hydrate',
'Formula',
'Space Gp. Symbol',
'Z Prime',
'Unique Chem. Units',
'Mol. Wt.',
'Donors',
'Acceptors',
'Rotatable bonds',
'Point Gp. Symbol',
'Point Gp. Order',
'Point Gp. Description',
'Atom counts',
)
##############################################################################
def get_num_unique_structures(molecule):
"Return the number of unique structures in a molecule."
d = collections.defaultdict(int)
for m in molecule.components:
smi = m.smiles
if smi is None:
smi = 'Unknown bond in SMILES'
d[smi] += 1
return len(d)
def get_atom_counts(molecule):
"Return a list of strings of atom counts."
atom_cts = collections.defaultdict(int)
for atom in molecule.atoms:
atom_cts[atom.atomic_symbol] += 1
l = sorted([(v, k) for k, v in atom_cts.items()])
l.reverse()
return ['%s: %d' % (k, v) for v, k in l]
##############################################################################
def get_row_data(entry):
"Return a tuple of molecular and crystallographic properties."
# Extract the molecule (which may have several components) and the heaviest
# component from the crystal structure. The heaviest component is the
# molecule of interest.
crystal = entry.crystal
mol = crystal.molecule
mol_of_interest = mol.heaviest_component
# Find out how many unique molecules are present in the crystal structure
num_unique_structures = get_num_unique_structures(mol)
# Ensure all hydrogen atoms are present, for donor calculations
mol_of_interest.add_hydrogens()
# Counts of molecular properties
donors = len([at for at in mol_of_interest.atoms if at.is_donor])
accs = len([at for at in mol_of_interest.atoms if at.is_acceptor])
rots = len([b for b in mol_of_interest.bonds if b.is_rotatable])
# Collect a list of atoms present (which could be taken from the formula)
atom_counts = get_atom_counts(mol_of_interest)
# Another datum requested
hydrate = 'hydrate' in entry.chemical_name.lower()
# Molecular weight (before stripping hydrogen atoms!)
mol_wt = mol_of_interest.molecular_weight
# Strip hydrogens for point group calculations
mol_of_interest.remove_hydrogens()
# Returns a triple (order, symbol, description)
pt_group_info = MolecularDescriptors.point_group_analysis(mol_of_interest)
# Now we have all the information
return (
mol.identifier,
entry.chemical_name,
hydrate,
entry.formula,
crystal.spacegroup_symbol,
crystal.z_prime,
num_unique_structures,
mol_wt,
donors,
accs,
rots,
pt_group_info[0],
pt_group_info[1],
pt_group_info[2],
' '.join(atom_counts)
)
def main(gcd_file, out_file):
"Run the main program."
# Collect the data of interest in a list.
rows = []
with io.EntryReader(gcd_file, format='identifiers') as entry_reader:
for entry in entry_reader:
rows.append(get_row_data(entry))
# Write out the report as a HTML file.
with codecs.open(out_file, 'w', encoding='utf8') as writer:
writer.write('\n'.join([
'<HTML>',
'<HEAD>',
'<TITLE>Point group analysis</TITLE>',
'</HEAD>',
'<BODY>',
'<H1>Point group analysis</H1>\n'
]))
utilities.write_html_table(HEADERS, rows, stream=writer, border=1)
writer.write('</BODY></HTML>\n')
if __name__ == '__main__':
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('gcd_file', type=str, help="Input GCD file")
parser.add_argument('out_file', type=str, help="Output HTML file")
args = parser.parse_args()
if not os.path.isfile(args.gcd_file):
parser.error('%s is not a file.' % args.gcd_file)
main(args.gcd_file, args.out_file)
Analysing crystal morphologies¶
This example creates BFDH morphologies of CSD structures and analyses the aspect ratios of them to predict whether they will from needles, plates, laths or blocks. For example, for the polymorphs of the anti-inflammatory XOCXUI01 and XOCXUI02:
Classification of XOCXUI01 and XOCXUI02
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
''' morphology_analysis.py - compare the habits of two crystal structures.
Determines the aspect ratios of the BFDH morphologies of the two crystals
and plots them on a chart, showing which class of habit they are likely to
possess.
'''
import argparse
import os
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from ccdc.morphology import BFDHMorphology
from ccdc.io import EntryReader
arg_parser = argparse.ArgumentParser(description=__doc__)
arg_parser.add_argument(
'refcodes', nargs='+', help='Refcodes of structures to analyse'
)
arg_parser.add_argument(
'-o', '--output', default=os.path.join('.', 'morphology_aspects.png'), help='Output chart PNG filename.'
)
args = arg_parser.parse_args()
# Set up plot
fig = plt.figure(figsize=(10., 10.))
ax = plt.axes()
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_xlabel('M:L Ratio', fontsize=16)
ax.set_ylabel('S:M Ratio', fontsize=16)
ax.annotate('Needle', (0.23, 0.95), fontsize=16)
ax.annotate('Block', (0.73, 0.95), fontsize=16)
ax.annotate('Lath', (0.23, 0.45), fontsize=16)
ax.annotate('Plate', (0.73, 0.45), fontsize=16)
plt.plot([0.5, 0.5], [0, 1], c='black', linewidth=2)
plt.plot([0, 1], [0.5, 0.5], c='black', linewidth=2)
ptc = fig.patch
ptc.set_facecolor('white')
def make_plot(asp, colour, refcode):
# This function plots the ratios
y, x = get_ratios(asp)
plt.scatter(x, y, c=colour, s=40)
ax.annotate('{}'.format(refcode), (x + 0.007, y - 0.007), fontsize=10)
def get_ratios(aspect):
# This function returns the side length ratios small:medium and medium:large
s_m = aspect[0] / aspect[1] # Small to medium side lengths ratio
m_l = aspect[1] / aspect[2] # Medium to long side lengths ratio
return s_m, m_l
def get_form(aspect):
# In this function we designate a shape from the aspect ratio
s_m, m_l = get_ratios(aspect)
if s_m < 0.5:
if m_l < 0.5:
return 'Lath'
else:
return 'Plate'
elif m_l < 0.5:
return 'Needle'
else:
return 'Block'
if __name__ == '__main__':
# Read structures from the CSD
reader = EntryReader('CSD')
if not os.path.exists(os.path.dirname(args.output)):
os.makedirs(os.path.dirname(args.output))
# Read in two crystal polymorphs of the drug Nabumetone
for refcode in args.refcodes:
crystal = reader.crystal(refcode)
morphology = BFDHMorphology(crystal)
bounding_box = morphology.oriented_bounding_box
# By calculating the bounging box of the morphologies, we can get the aspect ratios
aspect = [
bounding_box.minor_length, bounding_box.median_length, bounding_box.major_length
]
# The aspect ratios can be used to designate a shape for the crystal
shape = get_form(aspect)
print('{}: {}'.format(refcode, shape))
# This can be plotted
make_plot(aspect, 'red', refcode)
# Using the plotting from Morphology.py we can also overlay the morphologies in 3D
plt.savefig(args.output)
Charting crystal morphologies¶
This example draws an image of the morphology and tables of plane angles and of face relative areas of morphologies of one or more crystals of the CSD. An example for HXACAN is in ../images/hxacan_morphology.html
.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
'''morphology_plot.py - make an image of a morphology.
'''
import argparse
import codecs
import itertools
import io as sys_io
import os
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from ccdc.io import EntryReader
from ccdc.morphology import BFDHMorphology
from ccdc.descriptors import GeometricDescriptors
# Visualisation functions
def wireframe(face, ax):
# Plot wireframe of morphology in matplotlib
for n, edge in enumerate(face.edges):
Axes3D.plot(ax,
[coord[0] for coord in edge],
[coord[1] for coord in edge],
[coord[2] for coord in edge],
c='black',
linewidth=1.5)
def skin(face, colour, ax):
# Mesh the facets for matplotlib
for edge in face.edges:
vertices = [(edge[0], edge[1], face.centre_of_geometry)]
Axes3D.add_collection3d(ax, Poly3DCollection(
vertices, color=colour, linewidth=0, alpha=0.1))
def add_labels(morphology, face, ax):
# Add labels to facets
ax.text(face.centre_of_geometry[0], face.centre_of_geometry[1], face.centre_of_geometry[2],
''' {} {}'''.format(face.miller_indices.hkl,
round(morphology.relative_area(face.miller_indices), 3)),
color='black')
ax.scatter(face.centre_of_geometry[0],
face.centre_of_geometry[1],
face.centre_of_geometry[2],
s=10,
color='black')
def generate_plot(morphology, colour):
# Plot the morphology in matplotlib
# Set up Axes
fig = plt.figure(figsize=(10., 10.)) # 3D graph instance
ax = fig.add_subplot(111, projection='3d') # 3D Axes
ax.set_xlim3d(-20, 20)
ax.set_ylim3d(-20, 20)
ax.set_zlim3d(-20, 20)
ax.set_xlabel('X axis')
ax.set_ylabel('Y axis')
ax.set_zlabel('Z axis')
ax.grid(False)
plt.axis('off')
for facet in morphology.facets:
wireframe(facet, ax)
skin(facet, colour, ax)
add_labels(morphology, facet, ax)
return ax, plt
# Analysis functions
def compare_angles(morphology, format='html'):
# Print a table of every combination of facets and their dihedral angle
indices = {}
for f in morphology.facets:
indices[f.miller_indices.hkl] = GeometricDescriptors.Plane.from_points(*f.coordinates)
# Every pair of facets to compare
combinations = list(itertools.combinations([k for k in indices], 2))
if format == 'html':
return '<table><tr><th>Plane1</th><th>Plane2</th></th><th>Angle</th>%s</table>' % (
'\n'.join('<tr><td>%s</td><td>%s</td><td align="right">%.2f</td></tr>' %
(c[0], c[1], 180 - (indices[c[0]].plane_angle(indices[c[1]])))
for c in combinations)
)
for combination in combinations:
angle = 180 - (indices[combination[0]]).plane_angle(indices[combination[1]])
print('planes {} & {}: angle = {}'.format(combination[0], combination[1], angle))
def print_areas(morphology, format='html'):
'''Print a table of every face and its relative area.'''
if format == 'html':
return '<table><tr><th>Indices</th><th>Area</th></tr>%s</table>' % (
'\n'.join('<tr><td>%s</td><td align="right">%.2f</td></tr>' %
(face.miller_indices.hkl, morphology.relative_area(face.miller_indices))
for face in morphology.facets
)
)
for face in morphology.facets:
print('{}: relative area = {}'.format(
face.miller_indices.hkl, round(morphology.relative_area(face.miller_indices), 3)))
def run_all(morphology):
# Run all of the functions in this script
ax, plt = generate_plot(morphology, (0, 0, 1, 0.1))
compare_angles(morphology)
print_areas(morphology)
return ax, plt
if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
arg_parser.add_argument(
'refcodes', nargs='+', help='refcodes of CSD structures to plot'
)
arg_parser.add_argument(
'-o', '--output',
default=os.path.join(os.getcwd(), 'morphologies.html'),
help='Full path of the HTML file for the output report.'
)
args = arg_parser.parse_args()
reader = EntryReader('CSD') # Read structures from the CSD
if not os.path.exists(os.path.dirname(args.output)):
os.makedirs(os.path.dirname(args.output))
lines = [
'<style>',
'.box { margin:0; }',
'.left { float: left; width: 50%; overflow: hidden; }',
'.middle { float: left; width: 25%; overflow: hidden; }',
'.right { float: left; width: 20%; overflow: hidden; }',
'</style>',
'']
for refcode in args.refcodes:
cryst = reader.crystal(refcode) # Open the crystal from CSD entry SUCACB03
morphology = BFDHMorphology(cryst) # Create a morphology object
ax, plt = generate_plot(morphology, (0, 0, 1, 0.1))
svg = sys_io.BytesIO()
plt.savefig(svg, format='svg')
lines.append('<div class="box">')
lines.append('<div class="left">%s</div>' % svg.getvalue().decode('utf-8'))
lines.append('<div class="middle">%s</div>' % compare_angles(morphology, format='html'))
lines.append('<div class="right">%s</div>' % print_areas(morphology, format='html'))
lines.append('</div>')
with codecs.open(args.output, 'w', encoding='utf-8') as writer:
for line in lines:
writer.write('%s%s' % (line, os.linesep))
Functional groups of a crystal’s HBonds¶
This writes a csv file containing the hydrogen bonding atoms of crystals with the functional groups of the donor and acceptor atoms.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
'''
functional_groups_of_hbonds.py - produce a spreadsheet of functional groups of hbonds
'''
################################################################################
import sys
import csv
import argparse
from ccdc.io import CrystalReader
from ccdc.descriptors import CrystalDescriptors
################################################################################
class Runner(argparse.ArgumentParser):
'''Defines and parses arguments, runs the job.'''
def __init__(self):
super(self.__class__, self).__init__(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
self.add_argument(
'gcd_file',
help='GCD file of crystal structures to analyse'
)
self.add_argument(
'-o', '--output-file', '--output_file', default=None,
help='Output csv file [stdout]'
)
self.args = self.parse_args()
def run(self):
'''Runs the job.'''
calculator = CrystalDescriptors.HBondCoordination()
reader = CrystalReader(self.args.gcd_file)
if self.args.output_file:
f = open(self.args.output_file, 'w')
else:
f = sys.stdout
headers = [
'Identifier', 'Donor residue', 'Donor functional group', 'Donor atom',
'Acceptor residue', 'Acceptor functional group', 'Acceptor atom'
]
writer = csv.writer(f)
writer.writerow(headers)
def get_residue(crystal, label):
for i, c in enumerate(crystal.molecule.components):
try:
c.atom(label)
return 'RES%d' % (i+1)
except RuntimeError:
pass
raise RuntimeError('Ambiguous atom label %s' % label)
for c in reader:
if c.has_disorder:
continue
predictions = calculator.predict(c)
for hb in c.hbonds():
if hb.atoms[1] in hb.atoms[0].neighbours:
don_at, acc_at = hb.atoms[0], hb.atoms[-1]
else:
don_at, acc_at = hb.atoms[-1], hb.atoms[0]
don, acc = predictions.functional_groups_of_hbond(hb)
if don is None:
don = '%s Nothing Unidentified group Nothing' % don_at.label
if acc is None:
acc = '%s Nothing Unidentified group Nothing' % acc_at.label
don_bits = don.split()
acc_bits = acc.split()
row = [
c.identifier,
get_residue(c, don_bits[0]),
' '.join(don_bits[2:-1]),
don_bits[0],
get_residue(c, acc_bits[0]),
' '.join(acc_bits[2:-1]),
acc_bits[0]
]
writer.writerow(row)
################################################################################
if __name__ == '__main__':
Runner().run()
A simple spreadsheet of hydrogen bond coordination likelihoods¶
This example iterates over a GCD file (or any other crystal file) creating a csv file of hbond coordination likelihoods.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
'''
hbond_coordination_prediction_spreadsheet.py - calculate hbond coordination predictions for a set of crystals
This makes a spreadsheet of hbond coordination predictions containing columns indicating predicted probability and whether or not that number of hbonds was observed.
'''
#############################################################################################
import argparse
from ccdc.io import CrystalReader
from ccdc.descriptors import CrystalDescriptors
from ccdc.utilities import CSVWriter
#############################################################################################
class Runner(argparse.ArgumentParser):
'''Defines arguments, parses arguments and runs the analysis.'''
def __init__(self):
super(self.__class__, self).__init__(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
self.add_argument(
'gcd_file', type=str,
help='GCD file of crystal structures to analyse'
)
self.add_argument(
'-o', '--output-file', '--output_file', default=None,
help='Output csv file [stdout]'
)
self.add_argument(
'-m', '--model-path', '--model_path', default=None,
help='Path to a directory of coordination models'
)
self.args = self.parse_args()
def run(self):
settings = CrystalDescriptors.HBondCoordination.Settings()
if self.args.model_path is not None:
settings.coordination_models_path = self.args.model_path
calculator = CrystalDescriptors.HBondCoordination(settings=settings)
reader = CrystalReader(self.args.gcd_file)
outfile = self.args.output_file if self.args.output_file else None
headers = ['Identifier', 'Molecule', 'Atom', 'Functional group', 'D/A']
for i in range(7):
headers.extend(['%s %d likelihood' % ('>=' if i == 6 else '=', i), 'Observed'])
with CSVWriter(file_name=outfile, header=headers) as writer:
for c in reader:
if c.has_disorder:
continue
predictions = calculator.predict(c)
for r in predictions.to_csv(separator=None):
writer.write_row(r)
#############################################################################################
if __name__ == '__main__':
Runner().run()
Exploring hydrogen bond coordination models¶
This example builds a coordination model as a tree of possible coordinations for the donors and acceptors of a crystal. This is only one of many possible models that can be constructed from the API.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
"""
hbond_coordination_tree.py - create a tree of possible hbond coordinations with attached probabilities.
The tree enumerates each possible coordination for each donor atom. Subtrees from this enumerates each possible coordination for each acceptor atom consistent with the donations previously defined. For example in the subtree where the first donor donates once and the second donor donates once, the sum of the acceptor atoms' coordination must sum to two.
For convenience, the created tree will be returned, so invoking this interactively allows one to explore the tree data structure.
"""
import argparse
import os
from ccdc.descriptors import CrystalDescriptors
from ccdc import io
from ccdc.diagram import DiagramGenerator
from ccdc.utilities import CSVWriter, HTMLReport, html_table
csd = io.EntryReader('csd')
class Node(object):
"""A node in a hydrogen bond propensity tree.
:param label: (:obj:`str`) A label for the node
:param is_donor: (:obj:`boolean`) Whether the node is a donor.
:param coordination: (:obj:`float`) This node's coordination score.
:param probability: (:obj:`float`) The hydrogen bond probability of this node.
:param cumulative_probability: (:obj:`float`) The cumulative hydrogen bond probability of this
node and all the nodes on the path to it.
:param is_observed: (:obj:`boolean`) Whether the node is on the observed coordination path.
:param current_donations: (:obj:`list`) The nodes which are donors for this node.
:param remaining_donors: (:obj:`list`) The nodes which are donors for this node but have not
yet been factored into the cumulative probability.
:param remaining_acceptors: (:obj:`list`) Remaining acceptors for this node.
:param predictions: (:obj:`list`) The probability predictions for this node.
"""
def __init__(self, label, is_donor, coordination, probability, cumulative_probability,
is_observed, current_donations, remaining_donors, remaining_acceptors, predictions):
"""Holds all the info for a decision tree node."""
self.label = label
self.is_donor = is_donor
self.coordination = coordination
self.probability = probability
self.cumulative_probability = cumulative_probability * self.probability
self.is_observed = is_observed
self.remaining_donors = remaining_donors
self.remaining_acceptors = remaining_acceptors
self.current_donations = current_donations
self.predictions = predictions
self.children = []
self._expand()
def _expand(self):
"""Expands child nodes."""
if self.remaining_donors:
preds = self.predictions.predictions_for_label(self.remaining_donors[0])
self.children = [
Node(self.remaining_donors[0], True, p, preds[1][p], self.cumulative_probability,
p == preds[0], p + self.current_donations,
self.remaining_donors[1:], self.remaining_acceptors, self.predictions)
for p in preds[1]
if preds[1][p] != 0.0 and preds[1][p]*self.cumulative_probability > 0.0
]
elif self.remaining_acceptors:
preds = self.predictions.predictions_for_label(self.remaining_acceptors[0], 'acceptor')
self.children = [
Node(self.remaining_acceptors[0], False, p, preds[1][p],
self.cumulative_probability, p == preds[0], self.current_donations - p,
self.remaining_donors, self.remaining_acceptors[1:], self.predictions)
for p in preds[1]
if preds[1][p] != 0.0 and p <= self.current_donations
and preds[1][p] != 0.0 and preds[1][p] * self.cumulative_probability > 0.0
]
@property
def is_leaf(self):
""":obj:`boolean`: Whether or not this node is a leaf node."""
return self.children == []
@property
def observed_path(self):
""":obj:`list` of :obj:`Node`: All nodes corresponding to the crystal's observed coordination."""
if self.is_leaf:
if self.is_observed:
return [self]
else:
for k in self.children:
if k.is_observed:
return [self] + k.observed_path
print(self.label, self.is_observed, self.coordination)
print([(k.is_observed, k.label, k.coordination) for k in self.children])
raise RuntimeError('No observed child')
@property
def leaves(self):
""":obj:`list` of :obj:`Node`: All the leaf nodes in the tree."""
if self.is_leaf:
yield self
else:
for k in self.children:
for l in k.leaves:
yield l
def walk(self):
"""Walks over the tree yielding all nodes.
:returns: (:obj:`Node`) Yields all the nodes in the tree.
"""
yield self
for k in self.children:
for x in k.walk():
yield x
def _set_depth(self, d):
"""Recursively sets the depth of the tree.
:param d: (:obj:`int`) The current depth of the tree.
"""
self.depth = d
for k in self.children:
k._set_depth(d+1)
def _prune(self):
"""Remove paths where the leaf has still got unaccounted for donations."""
for k in self.children:
k._prune()
to_go = [(i, k) for i, k in enumerate(self.children) if k.is_leaf and k.current_donations]
to_go.sort(reverse=True)
for i, k in to_go:
self.children.pop(i)
def _accumulate_das(self, dons, accs):
"""Sets in the node the accumulated coordinations.
This will be attributes, dons and accs, contaiming labels of the donor or acceptor with a
multiplicity given by the coordination.
:param dons: (:obj:`list`) A list of donor atoms.
:param accs: (:obj:`list`) A list of acceptor atoms.
"""
if self.is_donor:
dons = dons + [self.label] * self.coordination
else:
accs = accs + [self.label] * self.coordination
if self.is_leaf:
self.donors = dons
self.acceptors = accs
else:
for k in self.children:
k._accumulate_das(dons, accs)
def _set_cumulative_prob(self, prob):
"""There are various different ways of setting the cumulative probability.
Here we have chosen to ignore the probability associated with coordination scores of 0.
:param prob: (:obj:`float`) The probability to factor in to the total.
"""
# Ignore 0-coordinated acceptors
if not self.is_donor and self.coordination == 0:
self.cumulative_probability = prob
else:
self.cumulative_probability = self.probability * prob
for k in self.children:
k._set_cumulative_prob(self.cumulative_probability)
def _set_full_cumulative_prob(self, prob):
"""Sets the full cumulative probability including the 0-coordination probabilities.
:param prob: (:obj:`float`) The probability to factor in to the total.
"""
self.full_cumulative_probability = self.probability * prob
for k in self.children:
k._set_full_cumulative_prob(self.full_cumulative_probability)
def _accumulate_coordinations(self, accumulated):
"""Accumulates the coordination in a list.
This is to ensure leaf nodes have the complete list of coordinations contributing.
:param accumulated: (:obj:`list`) The coordinations to add to the total.
"""
self.accumulated_coordination = accumulated + [self.coordination]
for k in self.children:
k._accumulate_coordinations(self.accumulated_coordination)
class RootNode(Node):
"""The root node of the tree.
:param donors: (:obj:`list`) The donor atoms.
:param acceptors: (:obj:`list`) The acceptor atoms.
:param predictions: (:obj:`list`) The probability predictions.
:param children: (:obj:`Node`) The child nodes of the root.
"""
def __init__(self, donors, acceptors, predictions, children):
self.label = 'ROOT'
self.donors = donors
self.acceptors = acceptors
self.is_donor = None
self.coordination = 0
self.probability = 1.0
self.cumulative_probability = 1.0
self.is_observed = True
self.remaining_donors = donors
self.remaining_acceptors = acceptors
self.current_donations = 0
self.predictions = predictions
self.children = children
for k in self.children:
k._expand()
self._prune()
self._set_depth(0)
self._accumulate_das([], [])
self._set_cumulative_prob(1.0)
self._set_full_cumulative_prob(1.0)
for k in self.children:
k._accumulate_coordinations([])
@staticmethod
def make_tree(crystal):
"""Makes the tree from the crystal.
:param crystal: (:obj:`ccdc.crystal.Crystal`) A CSD Python API Crystal object.
:returns: (:obj:`RootNode`) A Hydrogen Bond Propensity tree for the crystal.
"""
coordination_calc = CrystalDescriptors.HBondCoordination()
predicts = coordination_calc.predict(crystal)
dons = [d.label for d in coordination_calc.donor_atoms]
accs = [a.label for a in coordination_calc.acceptor_atoms]
preds = predicts.predictions_for_label(dons[0])
return RootNode(dons, accs, predicts, [
Node(dons[0], True, p, preds[1][p], 1.0, p == preds[0], p, dons[1:], accs, predicts)
for p in preds[1] if preds[1][p] != 0.0
])
class Runner(argparse.ArgumentParser):
"""Handle command-line arguments and generate the reports."""
def __init__(self):
"""Set up the argument parser."""
super(self.__class__, self).__init__(description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
self.add_argument('refcodes', nargs='+', help='CSD Refcodes or files')
self.add_argument('-o', '--output',
help='The output file to use. HTML or CSV files can be written.')
self.args = self.parse_args()
self.output_format = None
""":obj:`str` An identifier for the output format to write."""
self.output_file = None
""":obj:`str` The full file name of the output file."""
self.output_dir = None
""":obj:`str` The directory to hold output files."""
self.diagram_generator = None
""":obj:`ccdc.diagram.DiagramGenerator` A diagram generator."""
if not self.args.output:
self.output_format = 'stdout'
else:
if not (self.args.output.endswith('html') or self.args.output.endswith('csv')):
raise Exception('Cannot recognise format for output file %s! .html or .csv are valid.'
% self.args.output)
self.output_file = os.path.abspath(self.args.output)
self.output_dir = os.path.dirname(self.output_file)
# Make sure the output directory exists
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
if self.output_file:
self.diagram_generator = DiagramGenerator()
self.diagram_generator.settings.return_type = 'SVG'
self.output = None
if self.args.output.endswith('html'):
self.output_format = 'html'
elif self.args.output.endswith('csv'):
self.output_format = 'csv'
def run(self):
"""Run the reporting on all specified crystals.
:returns: (:obj:`RootNode`) A Hydrogen Bond Propensity tree for the last crystal processed.
"""
for refcode_or_file in self.args.refcodes:
if os.path.isfile(refcode_or_file):
for crystal in io.CrystalReader(refcode_or_file):
tree = self.run_one(crystal)
else:
tree = self.run_one(csd.crystal(refcode_or_file))
# Write the HTML footer for HTML reports
if self.output_format == 'html':
with self.get_output_writer() as report:
report.write_footer()
return tree
def get_output_writer(self, identifier=''):
"""Get an output writer for the given identifer.
This will reopen and close the files after each iteration so problems any structure
don't result in all the data so far being lost.
:param identifier: (:obj:`str`) The identifier of the structure to write for.
:returns: (:obj:`file`) A context handler file-like object that can write output,
appropriate to the output format specified.
"""
if self.output_format == 'html':
return HTMLReport(file_name=self.output_file,
report_title='Hydrogen Bond Propensity Report',
append=True)
elif self.output_format == 'csv':
out_path = os.path.dirname(self.output_file)
basename, ext = os.path.splitext(os.path.basename(self.output_file))
output_file_path = os.path.join(out_path, '%s_%s%s' % (basename, identifier, ext))
return CSVWriter(file_name=output_file_path)
elif self.output_format == 'stdout':
return CSVWriter(stdout=True)
def run_one(self, crystal):
"""Calculates the tree for one crystal and writes results to output.
:param crystal: (:obj:`ccdc.crystal.Crystal`) A CSD Python API Crystal object.
:returns: (:obj:`RootNode`) A Hydrogen Bond Propensity tree.
"""
tree = RootNode.make_tree(crystal)
if self.output_format == 'html':
self.write_html(tree, crystal)
else:
self.write_csv(tree, crystal)
return tree
@staticmethod
def csv_headers(tree):
"""Generate CSV headers for the given tree.
:param tree: (:obj:`RootNode`) A Hydrogen Bond Propensity tree.
:returns: (:obj:`list`) A list of CSV column headers for the given tree.
"""
headers = ['Identifier']
headers += ['D %s' % tree.donors[i] for i in range(len(tree.donors))]
headers += ['A %s' % tree.acceptors[i] for i in range(len(tree.acceptors))]
headers += ['Probability', 'Full Probability', 'Observed']
return headers
def write_csv(self, tree, crystal):
"""Write tree data to CSV.
:param tree: (:obj:`RootNode`) A Hydrogen Bond Propensity tree.
:param crystal: (:obj:`ccdc.crystal.Crystal`) A CSD Python API Crystal object.
"""
observed_path = tree.observed_path
leaves = [(n.cumulative_probability, n) for n in tree.leaves]
leaves.sort(reverse=True)
with self.get_output_writer(crystal.identifier) as writer:
writer.write_header(self.csv_headers(tree))
for probability, leaf in leaves:
row = [crystal.identifier]
row += list(leaf.accumulated_coordination)
row += [probability, leaf.full_cumulative_probability, leaf == observed_path[-1]]
writer.write_row(row)
@staticmethod
def propensity_data(leaves, observed_path):
"""Generate rows of HTML output.
:param leaves: (:obj:`list`) A list of (:obj:`float`, :obj:`Node`) tuples of hydrogen bond
probabilities and the leaves of the probability tree they correspond to.
:param observed_path: (:obj:`list`) A list of :obj:`Node`s representing the observed
coordination of the crystal.
:returns: (:obj:`list`) A list of the propensity data for each leaf passed in.
"""
for leaf_data in leaves:
leaf = leaf_data[1]
probability = leaf_data[0]
yield [str(i) for i in leaf.accumulated_coordination] +\
['%.4f' % probability, '%.4f' % leaf.full_cumulative_probability,
(leaf == observed_path[-1])]
def write_html(self, tree, crystal):
"""Writes the given crystal's report to HTML.
:param tree: (:obj:`RootNode`) A Hydrogen Bond Propensity tree.
:param crystal: (:obj:`ccdc.crystal.Crystal`) A CSD Python API Crystal object.
"""
# Sort the data by reverse probability
leaves = [(n.cumulative_probability, n) for n in tree.leaves]
leaves.sort(reverse=True)
# Generate a diagram
diagram = self.diagram_generator.image(crystal).replace(
'Qt SVG Document', 'Diagram for %s' % crystal.identifier)
with self.get_output_writer() as report:
report.write_section_header(headline=crystal.identifier)
report.write_figure_data(diagram, caption='Diagram for %s' % crystal.identifier,
figure_id='diagram_%s' % crystal.identifier)
table_header = ['D %s' % x for x in tree.donors] +\
['A %s' % a for a in tree.acceptors] +\
['Probability', 'Full Probability', 'Observed']
report.write(html_table(data=self.propensity_data(leaves,
observed_path=tree.observed_path),
header=table_header,
table_id='data_%s' % crystal.identifier,
caption='Propensity data for %s' % crystal.identifier))
if __name__ == '__main__':
t = Runner().run()
Charting hydrogen bond propensities¶
This example creates a chart of hydrogen bond propensity groupings, plotting hydrogen bond score against hbond coordination score. Where there are structures to the right of and lower than the observed structure, there is a greater chance of the existence of polymorphic structures, for example:
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
'''
hydrogen_bond_propensities.py - performs an hbp calculation and produces a chart of propensity vs coordination score
'''
################################################################################
import argparse
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import ccdc.io
from ccdc.descriptors import CrystalDescriptors
################################################################################
def run(refcode):
'''Runs the job.'''
hbp = CrystalDescriptors.HBondPropensities()
reader = ccdc.io.CrystalReader('csd')
crystal = reader.crystal(refcode)
print('Hydrogen Bond Propensity calculation for %s\n' % refcode)
hbp.set_target(crystal)
hbp.match_fitting_data(count=300)
hbp.analyse_fitting_data()
hbp.perform_regression()
propensities = hbp.calculate_propensities()
groups = hbp.generate_hbond_groupings()
observed_groups = hbp.target_hbond_grouping()
make_chart(groups, observed_groups, crystal.identifier)
def make_chart(groups, observed_groups, identifier):
'''Makes a chart of the hbond groupings, plotting hbond score against coordination score.'''
figure = plt.scatter([group.hbond_score for group in groups],
[group.coordination_score for group in groups],
s=50, label='Putative Structures')
ax2 = plt.scatter(observed_groups.hbond_score,
observed_groups.coordination_score,
c='red', marker='*', s=250, label='Observed Structure')
plt.origin = 'upper'
plt.xlim(0, 1.0)
plt.ylim(-1.0, 0)
plt.legend(scatterpoints=1)
ax = plt.gca()
ax.xaxis.tick_top()
ax.yaxis.tick_left()
ax.set_xlabel('Mean H-Bond Propensity')
ax.set_ylabel('Mean H-Bond Coordination')
ax.xaxis.set_label_position('top')
fig_name = '%s_standard_chart' % identifier
plt.savefig('%s.png' % fig_name, format='png')
plt.savefig('%s.svg' % fig_name, format='svg')
################################################################################
if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('refcode', help='Refcode of structure to be analysed', type=str)
args = parser.parse_args()
run(args.refcode)