Search examples¶
Torsion angle preferences in drug-like molecules¶
Note that this script makes use of functionality from the cookbook utility module.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
"""
Script to generate torsion angle histograms for common med. chem. fragments.
This script will create a report containing torsion histograms for ten common
medicinal chemistry fragments in an output directory.
The script was inspired by and the SMARTS pattern taken from the supplementary
material of:
"Torsion Angle Preferences in Druglike Chemical Space: A Comprehensive Guide"
Scharfer C.; Schulz-Gasch T.; Ehrilich H.-C.; Guba W.; Rarey M.; Stahl M.,
J. Med. Chem. (2013) 56, 2016-2028
"""
import argparse
import os
import sys
from utilities import Histogram
from ccdc.search import SMARTSSubstructure, SubstructureSearch
def main():
# Define arguments and parse them
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('-m', '--max-hits', type=int,
default=100,
help='Maximum number of entries to use (0=unlimited) [100]')
parser.add_argument('-o', '--output-directory',
default='med_chem_torsions',
help='Output directory [med_chem_torsions]')
args = parser.parse_args()
if args.max_hits == 0:
args.max_hits = sys.maxsize
# Extract file names and create directory if necessary
if not os.path.exists(args.output_directory):
os.mkdir(args.output_directory)
# Top ten SMARTS patterns
patts = [
'[*:1]~[CX3:2]!@[NX3:3]~[*:4]',
'[*:1]~[NX3:2]!@[NX2:3]~[*:4]',
'[*:1]~[NX3:2]!@[NX3:3]~[*:4]',
'[cH1:1]~[a:2]([cH1])!@[a:3]([s,o,n:4])([cH0])',
'[*:1]~[cX3:2]!@[NX2:3]~[*:4]',
'[*:1]~[CX4:2]!@[NX2:3]~[*:4]',
'[cH1:1]~[a:2]([cH1])!@[a:3]([cH0])[cH0:4]',
'[*:1]~[OX2:2]!@[P:3]~[*:4]',
'[*:1]~[CX4:2]!@[P:3]~[*:4]',
'[*:1]~[CX4:2]!@[CX3:3]~[*:4]',
]
# Do the search, and draw the histograms
hist_filenames = []
for index, pattern in enumerate(patts):
substructure = SMARTSSubstructure(pattern)
searcher = SubstructureSearch()
searcher.add_substructure(substructure)
searcher.add_torsion_angle_measurement(
'Torsion',
0, substructure.label_to_atom_index(1),
0, substructure.label_to_atom_index(2),
0, substructure.label_to_atom_index(3),
0, substructure.label_to_atom_index(4))
hits = searcher.search(max_hit_structures=args.max_hits,
max_hits_per_structure=1)
hist_fn = os.path.join(args.output_directory, 'torsion_%02d.png' % index)
hist_filenames.append(hist_fn)
hist = Histogram(title=pattern, xlabel='Torsion', ylabel='Count',
file_name=hist_fn)
torsions = [abs(hit.measurements['Torsion'])
for hit in hits if hit.measurements['Torsion'] is not None]
hist.add_plot(torsions, color='blue', bins=18)
hist.write()
del hist
# Write the html
with open(os.path.join(args.output_directory, 'index.html'), 'w') as out_file:
out_file.write('<html><head><title>Torsion angle preferences</title></head>\n')
out_file.write('<body><h1>Druglike chemical space torsion angle preferences</h1>\n')
for hist_fn in hist_filenames:
out_file.write('<img src="%s"/>\n' % os.path.basename(hist_fn))
out_file.write('</body></html>\n')
if __name__ == '__main__':
main()
Similarity search with maximum common substructure highlighted¶
This example performs a similarity search, then generates diagrams of the matching structures showing the maximum common substructure if the probe molecule and the hit molecule.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2015-06-17: created by the Cambridge Crystallographic Data Centre
#
"""
maximum_common_substructure.py - perform a similarity search, then
format a web page containing the common substructures of the original
structure and the returned similar structures.
"""
import argparse
import os
import sys
from ccdc.io import MoleculeReader
from ccdc.search import SimilaritySearch
from ccdc.descriptors import MolecularDescriptors
from ccdc.diagram import DiagramGenerator
import ccdc
class Runner(argparse.ArgumentParser):
"""Defines arguments, reads command line, performs the search and formats the report."""
def __init__(self):
"""Defines arguments."""
super(self.__class__, self).__init__(description=__doc__)
self.add_argument(
'refcodes', nargs='+',
help='CSD refcodes or files from which to read molecules.'
)
self.add_argument(
'-t', '--threshold', type=float, default=0.9,
help='Similarity threshold sought'
)
self.add_argument(
'-m', '--maximum', type=int, default=ccdc.maxint32,
help='Maximum number of structures to retrieve [all]'
)
self.add_argument(
'-o', '--output', default='results.html',
help='Where to write the HTML report [report.html]'
)
def run(self):
"""Runs the task."""
self.args = self.parse_args()
csd = MoleculeReader('csd')
self.mcss = MolecularDescriptors.MaximumCommonSubstructure()
self.generator = DiagramGenerator()
self.generator.settings.return_type = 'SVG'
self.generator.settings.highlight_color = '#00C0C0'
self.generator.settings.shrink_symbols = False
self.output = open(self.args.output, 'w')
self.output.write('<html><head><title>Similarity Report</title></head><body>\n')
for r in self.args.refcodes:
if os.path.exists(r):
for m in MoleculeReader(r):
self.run_one(m)
else:
self.run_one(csd.molecule(r))
self.output.write('</body></html>\n')
self.output.close()
def run_one(self, mol):
"""Runs one search and report generation."""
search = SimilaritySearch(mol, self.args.threshold)
hits = search.search(max_hit_structures=self.args.maximum)
def one_hit(hit):
"""Format a single hit."""
atoms, bonds = self.mcss.search(mol, hit.molecule)
mol_atoms = list(zip(*atoms))[0]
hit_atoms = list(zip(*atoms))[1]
image1 = self.generator.image(mol, highlight_atoms=mol_atoms)
image2 = self.generator.image(hit.molecule, highlight_atoms=hit_atoms)
return '\n'.join([
'<tr><th align="center">%s</th><th align="center">%s</th></tr>'
% (mol.identifier, hit.identifier),
'<tr><td>%s</td><td>%s</td></tr>'
% (image1.replace('Qt SVG Document', 'Diagram for %s' % mol.identifier),
image2.replace('Qt SVG Document', 'Diagram for %s' % hit.identifier))])
text = [
'<h2>Similarity to %s with threshold %.2f</h2>\n'
% (mol.identifier, self.args.threshold),
'<table>',
'\n'.join(one_hit(h) for h in hits if mol.identifier != h.identifier),
'</table>'
]
self.output.write('\n'.join(text))
if __name__ == '__main__':
Runner().run()
Hit processor example¶
This example shows how to use ccdc.search.SubstructureSearch.HitProcessor
to analyse search hits as they are found during a search.
#!/usr/bin/env python
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2017-08-31: created by the Cambridge Crystallographic Data Centre
#
"""
hit_processor.py - example of a search using a HitProcessor
This example searches the CSD for instances of a given SMARTS pattern,
collecting counts of matching conformations.
At the end it will format a brief HTML report the refcode of the first
structure containing a match with a unique conformation, and the number
of matches sharing this conformation.
Arguments may specify the RMSD tolerance to consider two
conformations the same and a maximum number of structures to inspect.
"""
import argparse
import operator
from ccdc import search, io
from ccdc.descriptors import MolecularDescriptors
csd = io.EntryReader('csd')
class Result(object):
"""Records all matching hits."""
def __init__(self, hit):
"""First instance of this conformation in the CSD."""
self.identifier = hit.identifier
self.count = 1
self.mol = hit.molecule
self.atoms = hit.match_atoms()
def matches(self, hit, tolerance):
"""Test whether a hit matches this conformation."""
rmsd = MolecularDescriptors.rmsd(self.mol, hit.molecule, atoms=list(zip(
self.atoms, hit.match_atoms())), overlay=True)
return rmsd < tolerance
class Analysis(search.SubstructureSearch.HitProcessor):
"""Deals with each hit as it is found by the SubstructureSearch."""
def __init__(self, tolerance, max_structures):
self.tolerance = tolerance
self.max_structures = max_structures
self.results = []
self.last_identifier = csd.identifier(min(len(csd) - 1, self.max_structures))
def add_hit(self, hit):
"""Called from the searcher."""
if self.last_identifier < hit.identifier:
self.cancel()
if len(self.results) != 0:
for r in self.results:
if r.matches(hit, self.tolerance):
r.count += 1
return
self.results.append(Result(hit))
class Runner(argparse.ArgumentParser):
"""Defines arguments, runs the job."""
def __init__(self):
super(self.__class__, self).__init__(description=__doc__)
self.add_argument(
'smarts', help='SMARTS pattern to analyse'
)
self.add_argument(
'-t', '--title', help='Title for the report [smarts]'
)
self.add_argument(
'-o', '--output-file', help='Output file name [smarts.html]'
)
self.add_argument(
'-r', '--rmsd-tolerance', default=0.1, type=float,
help='Maximum RMSD to be considered the same conformation'
)
self.add_argument(
'-m', '--max-structures', default=len(csd), type=int,
help='Maximum number of structures to process'
)
def run(self):
"""Runs the job."""
# process command-line arguments
self.args = self.parse_args()
if self.args.title is None:
self.args.title = self.args.smarts
if self.args.output_file is None:
self.args.output_file = '%s.html' % self.args.title
# run a search
searcher = search.SubstructureSearch()
searcher.add_substructure(search.SMARTSSubstructure(self.args.smarts))
analysis = Analysis(self.args.rmsd_tolerance, self.args.max_structures)
analysis.search(searcher)
analysis.results.sort(key=operator.attrgetter('count'), reverse=True)
# output HTML
with open(self.args.output_file, 'w') as writer:
writer.write('<html>\n<head>\n<title>%s</title>\n</head>\n<body>\n' % self.args.title)
writer.write('<h2>%s</h2>\n' % self.args.title)
writer.write('<p>CSD matches for %s' % self.args.smarts)
writer.write('<table border="1">\n')
writer.write('<tr><th>Identifier</th><th># Matches</th></tr>\n')
for r in analysis.results:
writer.write('<tr><td>%s</td><td align="right">%d</td></tr>\n' %
(r.identifier, r.count))
writer.write('</table>\n')
writer.write('</body>\n</html>')
if __name__ == '__main__':
Runner().run()
Protein-ligand search¶
This example shows how to perform protein-ligand searching on an sqlmol2 format database.
#
# This script can be used for any purpose without limitation subject to the
# conditions at https://www.ccdc.cam.ac.uk/Community/Pages/Licences/v2.aspx
#
# This permission notice and the following statement of attribution must be
# included in all copies or substantial portions of this script.
#
# 2019-04-26: created by the Cambridge Crystallographic Data Centre
#
"""
protein_ligand_search.py - examples of protein ligand searches.
All arguments may be specified by an unambiguous prefix of its long form.
Each of the components of a protein-ligand complex may have one or more SMARTS
patterns defined by:
-l --ligand
-p --protein
-c --cofactor
-w --water
-m --metal
--nucleotide
These may be repeated. Each separate argument adds the appropriate substructure
to the SubstructureSearch. The SMARTS definition should be enclosed in single
quotes to prevent command shell expansion of special characters. Substructure
indices for use in geometric definitions and measurements will be in the above
order, i.e. ligand substructures start at 0, protein substructures at the length
of ligand substructures and so on.
Geometric objects may be defined by:
-P --plane
-V --vector
-C --centroid
These will be followed by a quoted string of the form
'(sub_index, at_index) (sub_index, at_index) ...' or the name of a centroid.
Geometric objects will be defined sequentially, labelled PLANE1, VEC1, CENT1 etc.
and added to the SubstructureSearch.
Measurements and constraints may be defined by:
-d --distances
-a --angles
-t --torsions
--vector-angles
--plane-angles
--point-plane-distances
--vector-plane-angles
These will be followed by a quoted string containing the appropriate number of
(sub, at) pairs or references to geometric objects, optionally followed by the
constraint range. If the constraint range is not present the appropriate
measurement will be added to the SubstructureSearch, otherwise the appropriate
constraint will be added. The constraint range will be a pair (min, max)
where min and max are floats.
The maximum number of hits and the maximum number of hits per structure may be
specified by:
-m --max-hit-structures
-M --max-hits-per-structure
An output directory may be specified by:
-o --output-directory
The prefix of the output files may be specified by:
--prefix
A location of an csdsqlx file may be specified by:
-D --database
A feature database for further searches will be written if
-f --feature-database
is present in the arguments.
Into the specified output directory will be written a .csv file containing the
labels of matched atoms and values for the defined measurements and constraints.
Also a .mol2 file containing the hit structures will be written.
An example:
python protein_ligand_search.py -D /path/to/testdb.csdsqlx \ # A small database for testing
-o /tmp/results/ \ # Where to put the output
-l "C-NH" \ # Ligand SMARTS
-p "c(~o)~o" \ # Protein SMARTS
-d "(0, 0) (1, 1) (-5, 0)" \ # Distance constraint
--dis "(0, 0) (1, 2)" \ # Distance measurement
-P "(1, 0) (1, 1) (1, 2)" "(0, 0) (0, 1) (0, 2)" \ # A pair of planes
--plane-angle "PLANE1 PLANE2 (0, 25)" \ # Plane-angle constraint
--pre test # Prefix of output files
"""
import os
import argparse
import collections
import csv
import string
import io as sys_io
import codecs
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from ccdc import search, io, utilities, diagram, pharmacophore
from ccdc.descriptors import MolecularDescriptors
class ParseString(argparse.Action):
'''Stores the string argument into the appropriate place in the args namespace.'''
def __call__(self, parser, namespace, values, option_string):
'''Works out where to put the concatenated value.'''
vals = [''.join(l) for l in values]
if getattr(namespace, self.dest) is None:
setattr(namespace, self.dest, vals)
else:
getattr(namespace, self.dest).extend(vals)
class Runner(argparse.ArgumentParser):
'''Sets up and runs the job.'''
def __init__(self):
""" Defines and parses the arguments to the job. """
super(self.__class__, self).__init__(description='Perform a protein ligand search.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
smarts_args = self.add_argument_group(title='SMARTS search patterns')
smarts_args.add_argument(
'-l', '--ligand-smarts',
nargs='*', type=list, default=[], action=ParseString,
help='SMARTS patterns to be searched for in a ligand.'
)
smarts_args.add_argument(
'-p', '--protein-smarts',
nargs='*', type=list, default=[], action=ParseString,
help='SMARTS patterns to be searched for in a protein.'
)
smarts_args.add_argument(
'-c', '--cofactor-smarts',
nargs='*', type=list, default=[], action=ParseString,
help='SMARTS patterns to be searched for in a cofactor.'
)
smarts_args.add_argument(
'-w', '--water-smarts',
nargs='*', type=list, default=[], action=ParseString,
help='SMARTS patterns to be searched for in a water.'
)
smarts_args.add_argument(
'-m', '--metal-smarts',
nargs='*', type=list, default=[], action=ParseString,
help='SMARTS patterns to be searched for in a metal.'
)
smarts_args.add_argument(
'--nucleotide-smarts',
nargs='*', type=list, default=[], action=ParseString,
help='SMARTS patterns to be searched for in a nucleotide.'
)
geometry_args = self.add_argument_group(
title='Geometric objects',
description='Any points for parameters in this group can be given either as ' +
'(substructure_index, atom_index) or the name of a Centroid, e.g. CENT1.')
geometry_args.add_argument(
'-P', '--planes',
nargs='*', type=list, default=[], action=ParseString,
help='Planes as three points, e.g. "(0, 0) (0, 1) (0, 2)" or "CENT1 (0, 1) (0, 2)".'
)
geometry_args.add_argument(
'-C', '--centroids',
nargs='*', type=list, default=[], action=ParseString,
help='Centroids as one point, e.g. "(1, 0)".'
)
geometry_args.add_argument(
'-V', '--vectors',
nargs='*', type=list, default=[], action=ParseString,
help='Vectors as two points, e.g. "(0, 0) (0, 1)" or "(1, 0) CENT2".'
)
constraints_args = self.add_argument_group(
title='Measurements and Constraints',
description='Any points for parameters in this group can be given either as ' +
'(substructure_index, atom_index) or the name of a Centroid, e.g. CENT1. ' +
'Each parameter ends in an additional optional range to make a constraint; ' +
'if no range is given a measurement will be made instead.')
constraints_args.add_argument(
'-d', '--distances',
nargs='*', type=list, default=[], action=ParseString,
help='Distances as two points followed by a distance range for constraints; ' +
'e.g. "(0, 0) (1, 0)" or (0, 0) CENT1 (10, 30)".'
)
constraints_args.add_argument(
'-a', '--angles',
nargs='*', type=list, default=[], action=ParseString,
help='Angles as three points followed by an angle range for constraints; ' +
'e.g. "(0, 0) (1, 0) (0, 1)" or "(0, 0) (1, 0) CENT1 (0, 25)".'
)
constraints_args.add_argument(
'-t', '--torsions',
nargs='*', type=list, default=[], action=ParseString,
help='Torsion as four points, followed by an angle range for constraints; ' +
'e.g. "(1, 0) (1, 1) (0, 1) (0, 0)" or "(1, 0) CENT1 (0, 1) (0, 0) (0, 25)".'
)
constraints_args.add_argument(
'--vector-angles',
nargs='*', type=list, default=[], action=ParseString,
help='Vector angles as two vectors, followed by an angle range for constraints; ' +
'e.g. "VEC1 VEC2" or "VEC1 VEC2 (0, 25)".'
)
constraints_args.add_argument(
'--plane-angles',
nargs='*', type=list, default=[], action=ParseString,
help='Plane angles as two planes, followed by an angle range for constraints; ' +
'e.g. "PLANE1 PLANE2" or "PLANE1 PLANE2 (0, 25)".'
)
constraints_args.add_argument(
'--point-plane-distances',
nargs='*', type=list, default=[], action=ParseString,
help='Point-plane distances as a point and a plane + a dist. range for constraints; ' +
'e.g. "(0, 1) PLANE1" or "CENT1 PLANE1 (0, 25)".'
)
constraints_args.add_argument(
'--vector-plane-angles',
nargs='*', type=list, default=[], action=ParseString,
help='Vector-plane angles as a vector and a plane + an angle range for constraints; ' +
'e.g. "VEC1 PLANE1" or "VEC1 PLANE1 (10, 30)".'
)
hits_args = self.add_argument_group(title='Result limitations')
hits_args.add_argument(
'-n', '--max-hit-structures', default=0, type=int,
help='Maximum number of structures to find.'
)
hits_args.add_argument(
'-N', '--max-hits-per-structure', default=0, type=int,
help='Maximum number of hits per structure.'
)
output_args = self.add_argument_group(title='Output files')
output_args.add_argument(
'-o', '--output-directory', default='./',
help='Where to write output files.'
)
output_args.add_argument(
'--prefix', default='hits',
help='Prefix for the output files.'
)
output_args.add_argument(
'-f', '--feature-database', default=False, action='store_true',
help='Whether or not to write a feature database from the hits.'
)
database_args = self.add_argument_group(title='Databases')
database_args.add_argument(
'-D', '--database',
default=os.path.join(os.path.dirname(
pharmacophore.Pharmacophore.default_feature_database_location()),
'pdb_crossminer.csdsqlx'),
help='Location of the csdsqlx database to search.'
)
self.args = self.parse_args()
if not os.path.exists(self.args.output_directory):
os.makedirs(self.args.output_directory)
def _parse_point(self, s):
"""Extract a name or a pair of indices from the string."""
if s[0] in string.ascii_letters:
if ' ' in s:
x = s[:s.index(' ')]
s = s[s.index(' ') + 1:].strip()
return x, s
else:
x = s
s = ''
return x, s
elif s[0] == '(':
part1 = s[1:s.index(',')].strip()
part2 = s[s.index(',') + 1:s.index(')')].strip()
s = s[s.index(')') + 1:].strip()
try:
p1 = int(part1)
p2 = int(part2)
except TypeError:
p1 = float(part1)
p2 = float(part2)
return (p1, p2), s
else:
self.error('Malformed argument %s' % s)
def _parse_geom(self, label, d, required, obj=None):
"""Extracts a geometric object, a constraint or a measurement and adds to the query."""
# print('Parsing', label, d, obj)
s = d.strip()
comps = []
while s:
try:
g, s = self._parse_point(s)
except ValueError as exc:
raise ValueError('Cannot parse parameter %s %s: %s' % (label, s, exc))
comps.append(g)
if obj == 'plane':
self.query.add_plane(label, *comps)
elif obj == 'centroid':
self.query.add_centroid(label, *comps)
elif obj == 'vector':
self.query.add_vector(label, *comps)
elif obj is not None:
raise NotImplementedError('Parameter type %s not implemented!' % obj)
elif len(comps) == required + 1:
rng = comps.pop()
if label.startswith('DIST'):
self.query.add_distance_constraint(
label, comps[0], comps[1], rng, vdw_corrected=True, type='any')
elif label.startswith('ANG'):
self.query.add_angle_constraint(label, comps[0], comps[1], comps[2], rng)
elif label.startswith('TOR'):
self.query.add_torsion_angle_constraint(
label, comps[0], comps[1], comps[2], comps[3], rng)
elif label.startswith('VA'):
self.query.add_vector_angle_constraint(label, comps[0], comps[1], rng)
elif label.startswith('PA'):
self.query.add_plane_angle_constraint(label, comps[0], comps[1], rng)
elif label.startswith('PP'):
self.query.add_point_plane_distance_constraint(label, comps[0], comps[1], rng)
elif label.startswith('VP'):
self.query.add_vector_plane_angle_constraint(label, comps[0], comps[1], rng)
elif len(comps) == required:
if label.startswith('DIST'):
self.query.add_distance_measurement(label, comps[0], comps[1])
elif label.startswith('ANG'):
self.query.add_angle_measurement(label, comps[0], comps[1], comps[2])
elif label.startswith('TOR'):
self.query.add_torsion_angle_measurement(
label, comps[0], comps[1], comps[2], comps[3])
elif label.startswith('VA'):
self.query.add_vector_angle_measurement(label, comps[0], comps[1])
elif label.startswith('PA'):
self.query.add_plane_angle_measurement(label, comps[0], comps[1])
elif label.startswith('PP'):
self.query.add_point_plane_distance_measurement(label, comps[0], comps[1])
elif label.startswith('VP'):
self.query.add_vector_plane_angle_measurement(label, comps[0], comps[1])
else:
raise NotImplementedError('Cannot parse parameter "%s".' % d)
def _make_substructures(self, smarts, tag):
"""Makes substructures from the SMARTS definitions."""
all_labels = []
for i, s in enumerate(smarts):
_inx_dict = collections.defaultdict(int)
sub = search.SMARTSSubstructure(s)
self.query.add_substructure(sub)
labels = []
for a in sub.atoms:
_inx_dict[self._atom_tag(a)] += 1
labels.append('%s_%s_%d' % (tag, self._atom_tag(a), _inx_dict[self._atom_tag(a)]))
a.add_protein_atom_type_constraint(tag)
all_labels.append(labels)
return all_labels
def build_query(self):
"""Builds the whole query."""
self.query = search.SubstructureSearch()
self.query.settings.max_hit_structures = self.args.max_hit_structures
self.query.settings.max_hits_per_structure = self.args.max_hits_per_structure
self.atom_tags = []
self.atom_tags.extend(self._make_substructures(self.args.ligand_smarts, 'Ligand'))
self.atom_tags.extend(self._make_substructures(self.args.protein_smarts, 'Amino'))
self.atom_tags.extend(self._make_substructures(self.args.cofactor_smarts, 'Cofactor'))
self.atom_tags.extend(self._make_substructures(self.args.water_smarts, 'Water'))
self.atom_tags.extend(self._make_substructures(self.args.metal_smarts, 'Metal'))
self.atom_tags.extend(self._make_substructures(self.args.nucleotide_smarts, 'Nucleotide'))
for i, d in enumerate(self.args.planes):
self._parse_geom('PLANE%d' % (i + 1), d, -1, obj='plane')
for i, d in enumerate(self.args.centroids):
self._parse_geom('CENT%d' % (i + 1), d, -1, obj='centroid')
for i, d in enumerate(self.args.vectors):
self._parse_geom('VEC%d' % (i + 1), d, 2, obj='vector')
for i, d in enumerate(self.args.distances):
self._parse_geom('DIST%d' % (i + 1), d, 2)
for i, d in enumerate(self.args.angles):
self._parse_geom('ANG%d' % (i + 1), d, 3)
for i, d in enumerate(self.args.torsions):
self._parse_geom('TOR%d' % (i + 1), d, 4)
for i, d in enumerate(self.args.vector_angles):
self._parse_geom('VA%d' % (i + 1), d, 2)
for i, d in enumerate(self.args.plane_angles):
self._parse_geom('PA%d' % (i + 1), d, 2)
for i, d in enumerate(self.args.point_plane_distances):
self._parse_geom('PP%d' % (i + 1), d, 2)
for i, d in enumerate(self.args.vector_plane_angles):
self._parse_geom('VP%d' % (i + 1), d, 2)
@staticmethod
def _atom_tag(a):
"""Makes a string representation of a substructure atom."""
s = str(a)
return s[len('QueryAtom('):s.rindex(')')]
def get_field_names(self):
"""Column names for the csv file."""
names = ['Identifier']
for tags in self.atom_tags:
names.extend(tags)
for c in self.query.constraints.keys():
names.append(c)
for c in self.query.contacts.keys():
names.append(c)
for m in self.query.measurements.keys():
names.append(m)
for g in self.query.geometric_objects.keys():
names.append(g)
return names
class HitProcessor(search.SubstructureSearch.HitProcessor):
"""Processes hits as they are found."""
def __init__(self, query, atom_tags, csv_file, mol2_file, html_writer, unique_writer):
self.query = query
self.atom_tags = atom_tags
self.csv_file = csv_file
self.mol2_file = mol2_file
self.html_file = html_writer
self.unique_writer = unique_writer
self.superimpose_onto = None
self.hit_ct = 0
self.last_id = None
self.measurements = collections.defaultdict(list)
self.hits_per_structure = 0
def add_hit(self, hit):
"""Adds a hit to the csv and mol2 files."""
print(hit.identifier)
if self.superimpose_onto is None:
self.superimpose_onto = hit.molecule
self.superimpose_atoms = hit.match_atoms()
mol = hit.molecule
else:
mol, rmsd, _, _ = MolecularDescriptors.overlay_rmsds_and_transformation(
self.superimpose_onto, hit.molecule,
atoms=zip(self.superimpose_atoms, hit.match_atoms()),
with_symmetry=False)
if self.query.settings.max_hit_structures and\
self.hit_ct >= self.query.settings.max_hit_structures:
self.cancel()
return
if self.last_id != hit.identifier:
self.hit_ct += 1
self.last_id = hit.identifier
self.unique_writer.write(mol)
self.hits_per_structure = 1
else:
self.hits_per_structure += 1
if self.query.settings.max_hits_per_structure and\
self.query.settings.max_hits_per_structure >= self.hits_per_structure:
return
self.mol2_file.write(mol)
for k, v in hit.measurements.items():
self.measurements[k].append(v)
for k, v in hit.constraints.items():
self.measurements[k].append(v)
d = {
'Identifier': hit.identifier,
}
for sub, tags in zip(hit.match_substructures(), self.atom_tags):
d.update({
t: '%s:%s:%s' % (a.chain_label, a.residue_label, a.label)
for a, t in zip(sub.atoms, tags)
})
for k, v in hit.measurements.items():
d[k] = round(v, 3)
for k, v in hit.constraints.items():
d[k] = round(v, 3)
for k, v in hit.geometric_objects.items():
d[k] = str(v)
self.csv_file.writerow(d)
def smarts_table(self, smarts, tag):
diagram_gen = diagram.DiagramGenerator()
diagram_gen.settings.image_width = diagram_gen.image_height = 100
diagram_gen.settings.return_type = 'SVG'
def smarts_diag(s):
sub = search.SMARTSSubstructure(s)
img = diagram_gen.image(sub)
return img.replace('Qt SVG Document', 'Diagram for %s' % s)
table = [
'<table border="1">',
'<tr><th colspan="%d">%s smarts</th></tr>' % (len(smarts), tag),
'<tr>',
' '.join('<td>%s</td>' % s for s in smarts),
'</tr>',
'<tr>',
'\n'.join('<td>%s</td>' % smarts_diag(s) for s in smarts),
'</tr>',
'</table>',
]
return '\n'.join(table)
def geometric_object_table(self, objects, tag, name):
table = [
'<table border="1">',
'<tr><th colspan="2">%s</th></tr>' % tag,
'\n'.join('<tr><td>%s%d</td><td>%s</td></tr>' % (name, i + 1, obj)
for i, obj in enumerate(objects)),
'</table>'
]
return '\n'.join(table)
def one_histogram(self, tag, data):
plt.hist(data)
plt.title(tag)
svg = sys_io.StringIO()
plt.savefig(svg, format='svg')
plt.close('all')
return svg.getvalue()
def write_html_body(self, hp, html):
lines = [
'<html><head>',
'<title>%s %s</title>' % (self.args.prefix, 'Search'),
'</head>',
'<body>',
'<h1>%s %s</h1>' % (self.args.prefix, 'Search'),
]
html.write('\n'.join(lines))
html.write(self.smarts_table(self.args.ligand_smarts, 'Ligand'))
html.write(self.smarts_table(self.args.protein_smarts, 'Protein'))
html.write(self.smarts_table(self.args.cofactor_smarts, 'Cofactor'))
html.write(self.smarts_table(self.args.water_smarts, 'Water'))
html.write(self.smarts_table(self.args.metal_smarts, 'Metal'))
html.write(self.smarts_table(self.args.nucleotide_smarts, 'Nucleotide'))
html.write(self.geometric_object_table(self.args.planes, 'Planes', 'PLANE'))
html.write(self.geometric_object_table(self.args.centroids, 'Centroids', 'CENT'))
html.write(self.geometric_object_table(self.args.vectors, 'Vectors', 'VEC'))
html.write(self.geometric_object_table(self.args.distances, 'Distances', 'DIST'))
html.write(self.geometric_object_table(self.args.angles, 'Angles', 'ANG'))
html.write(self.geometric_object_table(
self.args.torsions, 'torsions', 'TOR'))
html.write(self.geometric_object_table(self.args.vector_angles, 'Vector Angles', 'VA'))
html.write(self.geometric_object_table(self.args.plane_angles, 'Plane Angles', 'PA'))
html.write(self.geometric_object_table(
self.args.point_plane_distances, 'Point Plane Distances', 'PP'))
html.write(self.geometric_object_table(
self.args.vector_plane_angles, 'Vector Plane Angles', 'VP'))
lines = [
'</body>',
'</html>',
]
html.write('\n'.join(lines))
for i, _ in enumerate(self.args.distances):
label = 'DIST%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
for i, _ in enumerate(self.args.angles):
label = 'ANG%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
for i, _ in enumerate(self.args.torsions):
label = 'TOR%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
for i, _ in enumerate(self.args.vector_angles):
label = 'VA%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
for i, _ in enumerate(self.args.plane_angles):
label = 'PA%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
for i, _ in enumerate(self.args.point_plane_distances):
label = 'PP%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
for i, _ in enumerate(self.args.vector_plane_angles):
label = 'VP%d' % (i + 1)
lines.append(self.one_histogram(label, hp.measurements[label]))
html.write('\n'.join(lines))
def run(self):
"""Runs with provided arguments."""
self.database = io.EntryReader(self.args.database)
self.build_query()
field_names = self.get_field_names()
unique_file_name = os.path.join(self.args.output_directory,
'unique_%s.mol2' % self.args.prefix)
if os.path.exists(unique_file_name):
os.unlink(unique_file_name)
with open(os.path.join(self.args.output_directory, '%s.csv' % self.args.prefix), 'w') as w,\
io.MoleculeWriter(os.path.join(self.args.output_directory,
'%s.mol2' % self.args.prefix)) as mol_writer, \
codecs.open(os.path.join(self.args.output_directory, '%s.html' % self.args.prefix), 'w', encoding="utf-8")\
as html_writer, \
io.MoleculeWriter(unique_file_name) as unique_writer:
csv_file = csv.DictWriter(w, field_names)
csv_file.writeheader()
hp = self.HitProcessor(self.query, self.atom_tags, csv_file,
mol_writer, html_writer, unique_writer)
hp.search(self.query, self.database)
self.write_html_body(hp, html_writer)
if self.args.feature_database:
prot_info = pharmacophore.Pharmacophore.FeatureDatabase.DatabaseInfo(
os.path.join(self.args.output_directory, 'unique_%s.mol2' %
self.args.prefix), 0, utilities.Colour(255, 0, 128, 292)
)
prot_sd = pharmacophore.Pharmacophore.FeatureDatabase.Creator.StructureDatabase(
prot_info, use_crystal_symmetry=False,
structure_database_path=os.path.join(self.args.output_directory,
'%s.csdsqlx' % self.args.prefix)
)
creator = pharmacophore.Pharmacophore.FeatureDatabase.Creator()
db = creator.create((prot_sd,))
file_name = os.path.join(self.args.output_directory, '%s.feat' % self.args.prefix)
if os.path.exists(file_name):
os.unlink(file_name)
db.write(file_name)
if __name__ == '__main__':
runner = Runner()
runner.run()